// ccc ("Usage: ccc [FILES]"): turns the tree-sitter leaf nodes of each input
// file into a stream of bits, packs the bits into bytes and tries to shrink
// the result further with liblzma.
//
// NOTE(review): this copy of the file appears to have lost its line breaks and
// every `<...>` span (include names, template arguments, and any code that sat
// between a `<` and a later `>`). Several definitions below are therefore
// incomplete as shown. The notes here describe only what is still visible.
//
// Bit patterns (CCC_*)
//   Fixed bit sequences appended to the output. The 3-bit *_HEAD patterns are
//   followed by an index written most-significant bit first:
//     generate_c_keyword      CCC_C_KEYYORD_HEAD      + 6 index bits
//     generate_delimiter      CCC_DELIMITER_HEAD      + 4 index bits
//     generate_other_grammar  CCC_OTHER_GRAMMAR_HEAD  + 4 index bits
//     generate_miscellaneous  CCC_MISCELLANEOUS_HEAD  + 5 index bits
//     generate_rec            CCC_REC_TABLE_REF_HEAD  + index bits (see below)
//   For c_keywords, delimiter and rec_list the index is the position found
//   with find()+distance() in process_file_nodes; presumably the same holds
//   for other_grammer and miscellaneous (those branches are not visible).
//   NOTE(review): CCC_SPACE is {0,1,1} followed by 1,0,0,1 (= 9), while " " is
//   entry 8 of `delimiter` and entry 9 is "{}" - confirm which one is intended.
//
// CCC_ADD_COMPOMENT(vec,tail)
//   Copies `tail` into a temporary, then appends it to the end of `vec`.
//
// insert(root,str,id)
//   Walks `str` one char at a time, creating missing child nodes, and stores
//   `id` in token_id of the last node reached. Nodes are created with `new`;
//   no matching delete is visible in this file.
//
// get_all_nodes(node,source_code,rec_map,file)
//   For a node without children: appends it to all_tokens[file], slices its
//   text out of source_code by start/end byte, and increments rec_map[text]
//   for the listed node types (and for a primitive_type whose text is not in
//   c_keywords). A comment node sets rec_map[text] to 2 instead of counting.
//   For a node with children it loops over them; the loop body is not visible.
//
// byte_to_bits / ascii_to_bits
//   Return the low 8 / low 7 bits of `c`, most-significant bit first.
//
// generate_rec(index,total_recs)
//   `bits` ends up as the number of right shifts needed to bring total_recs
//   to zero; the loop then writes bits `bits` down to 0 of `index`, i.e.
//   bits+1 bits in total.
//   NOTE(review): that is one bit more than the bit width of total_recs -
//   confirm this is what the decoder expects.
//
// generate_string_content(str)
//   If any char compares greater than 127: CCC_STRING_UTF8, 8 bits per char,
//   then CCC_STRING_END_UTF8 (8 zero bits). Otherwise: CCC_STRING_ASCII,
//   7 bits per char, then CCC_STRING_END_ASCII (7 zero bits).
//   NOTE(review): `c` is a plain `char`; where char is signed, `c>127` can
//   never be true and the UTF-8 branch is unreachable - confirm.
//   NOTE(review): the terminator is all zero bits, so a zero char inside
//   `str` would presumably look like the end of the string - confirm.
//
// process_file_nodes(nodes,code,rec_list)
//   Emits one code per node, chosen by ts_node_type():
//     - preprocessor directive node types map to their CCC_PREPROCESSOR_* code;
//     - string-like node types are looked up in rec_list with a linear find():
//       a hit emits generate_rec, a miss emits the literal text, and an empty
//       text emits the delimiter entry for "";
//     - primitive_type is emitted as a c_keywords index when found there,
//       otherwise handled like the string-like types;
//     - "(" / "[" / "{" directly followed by the matching closer are merged
//       into the "()" / "[]" / "{}" delimiter and the closer is skipped (i++);
//     - in the quote branch (its condition is not fully visible) a following
//       `"` node emits the delimiter entry for "" and is skipped, otherwise
//       CCC_QUOTE is emitted;
//     - anything else is reported as an unknown node type.
//   The collected bits are then packed most-significant bit first into bytes;
//   a final partial byte is kept with its unused low bits left at zero.
//
// construct_rec_table(files_content,files_names)
//   Only the tail is visible: entries whose key is at least 3 characters long
//   and that pass a count test (its left-hand side is not visible) are
//   appended to rec_list.
//
// main
//   Prints the usage text when argc<2, reads every file in binary mode via
//   istreambuf_iterator, calls construct_rec_table, then for each file sizes
//   an output buffer of payload + payload/3 + 128 bytes and initialises
//   lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64). The raw payload is stored
//   when a size test against original_size holds (left side not visible,
//   presumably compressed_size>=original_size), otherwise the compressed one.
//   Everything after the first `//` on the last line is commented-out code
//   for building and writing the archive.
#include #include #include #include #include #include #include #include #include #include #include #include using namespace std; namespace fs=filesystem; const vector CCC_C_KEYYORD_HEAD {0,0,0}; const vector CCC_SPACE {0,1,1,1,0,0,1}; const vector CCC_PREPROCESSOR_CONDITIONAL_IF {0,0,1,0,0,0}; const vector CCC_PREPROCESSOR_CONDITIONAL_IFDEF {0,0,1,0,0,1}; const vector CCC_PREPROCESSOR_CONDITIONAL_IFNDEF {0,0,1,0,1,0}; const vector CCC_PREPROCESSOR_CONDITIONAL_ELSE {0,0,1,0,1,1}; const vector CCC_PREPROCESSOR_CONDITIONAL_ELIF {0,0,1,1,0,0}; const vector CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF {0,0,1,1,0,1}; const vector CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF {0,0,1,1,1,0}; const vector CCC_PREPROCESSOR_CONDITIONAL_ENDIF {0,0,1,1,1,1}; const vector CCC_PREPROCESSOR_OTHER_DEFINE {0,1,0,0,0,0}; const vector CCC_PREPROCESSOR_OTHER_UNDEF {0,1,0,0,0,1}; const vector CCC_PREPROCESSOR_OTHER_INCLUDE {0,1,0,0,1,0}; const vector CCC_PREPROCESSOR_OTHER_ERROR {0,1,0,0,1,1}; const vector CCC_PREPROCESSOR_OTHER_WARNING {0,1,0,1,0,0}; const vector CCC_PREPROCESSOR_OTHER_PRAGMA {0,1,0,1,0,1}; const vector CCC_PREPROCESSOR_OTHER_LINE {0,1,0,1,1,0}; const vector CCC_QUOTE {0,1,0,1,1,1}; const vector CCC_DELIMITER_HEAD {0,1,1}; const vector CCC_OTHER_GRAMMAR_HEAD {1,0,0}; const vector CCC_MISCELLANEOUS_HEAD {1,0,1}; const vector CCC_REC_TABLE_REF_HEAD {1,1,0}; const vector CCC_STRING_ASCII {1,1,1,0}; const vector CCC_STRING_UTF8 {1,1,1,1}; const vector CCC_STRING_END_ASCII {0,0,0,0,0,0,0}; const vector CCC_STRING_END_UTF8 {0,0,0,0,0,0,0,0}; #define CCC_ADD_COMPOMENT(vec,tail) \ do { \ auto tmp=tail; \ vec.insert(vec.end(),tmp.begin(),tmp.end()); \ } while (0) const vector delimiter={ "\n", "\t", "{", "}", "(", ")", "[", "]", " ", "{}", "()", "[]", "", ";", ",", "." 
}; const vector other_grammer={ "!", "%", "'", "*", "+", "-", "/", ":", "<", ">", "=", "?", "^", "|", "&", "~" }; const vector miscellaneous={ "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "++", "--", "<<", ">>", "==", "!=", "<=", ">=", "->", "...", "||", "&&", "NULL", "size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t", "int8_t", "int16_t", "int32_t", "int64_t" }; const vector c_keywords={ "alignas", "alignof", "auto", "bool", "break", "case", "char", "const", "constexpr", "continue", "default", "do", "double", "else", "enum", "extern", "false", "float", "for", "goto", "if", "inline", "int", "long", "nullptr", "register", "restrict", "return", "short", "signed", "sizeof", "static", "static_assert", "struct", "switch", "thread_local", "true", "typedef", "typeof", "typeof_unequal", "union", "unsigned", "void", "volatile", "while", "__asm__", "__attribute__" }; struct symbol { string name; int score; }; struct node { map children; int token_id=-1; }; void insert(node* root,string str,int id) { node* curr=root; for (char c:str) { if (curr->children.find(c)==curr->children.end()) { curr->children[c]=new node(); } curr=curr->children[c]; } curr->token_id=id; } struct processed_file { string path; uint32_t payload_size; vector payload; bool is_payload_compressed; }; map> all_tokens; map rec_map; vector rec_list; bool debug=false; void get_all_nodes(TSNode node,const string &source_code,map &rec_map,const string& file) { if (ts_node_child_count(node)==0) { all_tokens[file].push_back(node); string text=source_code.substr(ts_node_start_byte(node),ts_node_end_byte(node)-ts_node_start_byte(node)); if (string(ts_node_type(node))=="string_content" || string(ts_node_type(node))=="system_lib_string" || string(ts_node_type(node))=="identifier" || string(ts_node_type(node))=="number_literal" || string(ts_node_type(node))=="type_identifier" || string(ts_node_type(node))=="field_identifier" || string(ts_node_type(node))=="escape_sequence" || 
string(ts_node_type(node))=="statement_identifier") { rec_map[text]++; } if (string(ts_node_type(node))=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) { rec_map[text]++; } if (string(ts_node_type(node))=="comment") { rec_map[text]=2; } } else { uint32_t child_count=ts_node_child_count(node); for (uint32_t i=0;i byte_to_bits(unsigned char c) { vector out; for (int i=7;i>=0;i--) { bool enabled=(c>>i)&0x01; out.push_back(enabled); } return out; } vector ascii_to_bits(unsigned char c) { vector out; for (int i=6;i>=0;i--) { bool enabled=(c>>i)&0x01; out.push_back(enabled); } return out; } vector generate_c_keyword(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_C_KEYYORD_HEAD); for (int i=5;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_rec(size_t index,size_t total_recs) { vector out; size_t bits=0; while (total_recs) { total_recs>>=1; ++bits; } CCC_ADD_COMPOMENT(out,CCC_REC_TABLE_REF_HEAD); for (int i=bits;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_delimiter(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_DELIMITER_HEAD); for (int i=3;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_other_grammar(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_OTHER_GRAMMAR_HEAD); for (int i=3;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_miscellaneous(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_MISCELLANEOUS_HEAD); for (int i=4;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_string_content(string str) { vector out; bool is_utf8=false; for (auto c:str) { if (c>127) { is_utf8=true; break; } } if (is_utf8) { CCC_ADD_COMPOMENT(out,CCC_STRING_UTF8); for (auto c:str) { CCC_ADD_COMPOMENT(out,byte_to_bits(c)); } CCC_ADD_COMPOMENT(out,CCC_STRING_END_UTF8); } else { 
CCC_ADD_COMPOMENT(out,CCC_STRING_ASCII); for (auto c:str) { CCC_ADD_COMPOMENT(out,ascii_to_bits(c)); } CCC_ADD_COMPOMENT(out,CCC_STRING_END_ASCII); } return out; } void print_debug(string text) { if (debug==true) { cout< process_file_nodes(vector *nodes,string code,vector &rec_list) { vector out; for (int i=0;isize();i++) { string type=string(ts_node_type(nodes->at(i))); if (type=="#if") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IF); print_debug("if"); } else if (type=="#ifdef") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFDEF); print_debug("ifdef"); } else if (type=="#ifndef") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFNDEF); print_debug("ifndef"); } else if (type=="#else") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELSE); print_debug("else"); } else if (type=="#elif") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIF); print_debug("elif"); } else if (type=="#elifdef") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF); print_debug("elifdef"); } else if (type=="#elifndef") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF); print_debug("elifndef"); } else if (type=="#endif") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ENDIF); print_debug("endif"); } else if (type=="#define") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_DEFINE); print_debug("define"); } else if (type=="#undef") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_UNDEF); print_debug("undef"); } else if (type=="#include") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_INCLUDE); print_debug("include"); } else if (type=="#error") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_ERROR); print_debug("error"); } else if (type=="#warning") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_WARNING); print_debug("warning"); } else if (type=="#pragma") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_PRAGMA); print_debug("pragma"); } else if (type=="#line") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_LINE); print_debug("line"); } else 
if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") { string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { if (!text.empty()) { string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); CCC_ADD_COMPOMENT(out,generate_string_content(text)); print_debug("string ("+type+"): "+text); } else { auto it=find(delimiter.begin(),delimiter.end(),""); size_t index=distance(delimiter.begin(),it); CCC_ADD_COMPOMENT(out,generate_delimiter(index)); print_debug("delimiter for empty string"); } } else { size_t index=distance(rec_list.begin(),it); CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size())); print_debug("rec_table for string ("+type+"): "+text); } } else if (type=="primitive_type") { string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); auto it=find(c_keywords.begin(),c_keywords.end(),text); if (it!=c_keywords.end()) { size_t index=distance(c_keywords.begin(),it); CCC_ADD_COMPOMENT(out,generate_c_keyword(index)); print_debug("primitive_type: "+text); } else { auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { if (!text.empty()) { CCC_ADD_COMPOMENT(out,generate_string_content(text)); print_debug("string ("+type+"): "+text); } else { cout<<"Error: provided primitive is empty: "<size()) { if (string(ts_node_type(nodes->at(i+1)))==")") { text="()"; i++; } else { text="("; } } else if (type=="[" && i+1size()) { if (string(ts_node_type(nodes->at(i+1)))=="]") { text="[]"; i++; } else { text="["; } } else if (type=="{" && i+1size()) { if 
(string(ts_node_type(nodes->at(i+1)))=="}") { text="{}"; i++; } else { text="{"; } } else { text=type; } auto it=find(delimiter.begin(),delimiter.end(),text); if (it!=delimiter.end()) { size_t index=distance(delimiter.begin(),it); CCC_ADD_COMPOMENT(out,generate_delimiter(index)); print_debug("delimiter: "+text); } else { cout<<"Error: unknow delimiter, that shouldn't happen: "<at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { cout<<"Error: comment in reccurences map not found: "<size()) { if (string(ts_node_type(nodes->at(i+1)))=="\"") { auto it=find(delimiter.begin(),delimiter.end(),""); size_t index=distance(delimiter.begin(),it); CCC_ADD_COMPOMENT(out,generate_delimiter(index)); print_debug("double quotes mark, inserting delimiter for empty string"); i++; } else { CCC_ADD_COMPOMENT(out,CCC_QUOTE); print_debug("single quote mark"); } } } else { string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); cout<<"Error: unknow node type: "< payload_bytes; unsigned char current=0; size_t bit_index=0; for (bool b:out) { current|=(b<<(7-bit_index)); bit_index++; if (bit_index==8) { payload_bytes.push_back(current); current=0; bit_index=0; } } if (bit_index!=0) { payload_bytes.push_back(current); } return payload_bytes; } void construct_rec_table(vector &files_content,vector files_names) { for (int i=0;i=2 and s.first.size()>=3) { rec_list.push_back(s.first); } } } int main(int argc,char **argv) { if (argc<2) { cout<<"Usage: ccc [FILES]"< files; for (int i=1;i files_content; for (auto f:files) { ifstream file(f,ios::binary); if (!file) { cout<<"Error: couldn't open provided file."<(file)),istreambuf_iterator()); files_content.push_back(code); } construct_rec_table(files_content,files); vector files_archive; for (int i=0;i payload_compressed; 
payload_compressed.resize(payload_bytes.size()+payload_bytes.size()/3+128); lzma_stream strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { cout<<"Error: couldn't initialize LZMA compressor for file: "<=original_size) { pfile.is_payload_compressed=false; pfile.payload=payload_bytes; pfile.payload_size=original_size; } else { pfile.is_payload_compressed=true; pfile.payload=payload_compressed; pfile.payload_size=compressed_size; } cout< rec_table; // for (int i=0;i rec_table_compressed; // rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128); // lzma_stream strm=LZMA_STREAM_INIT; // if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { // cout<<"Error: couldn't initialize LZMA compressor."< out; // out.push_back('C'); // out.push_back(compressed_size>=original_size?'C':'c'); // vector payload_bytes; // unsigned char current=0; // size_t bit_index=0; // for (bool b:payload) { // current|=(b<<(7-bit_index)); // bit_index++; // if (bit_index==8) { // payload_bytes.push_back(current); // current=0; // bit_index=0; // } // } // if (bit_index!=0) { // payload_bytes.push_back(current); // } // vector payload_compressed; // payload_compressed.resize(payload_bytes.size()+payload_bytes.size()/3+128); // strm=LZMA_STREAM_INIT; // if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { // cout<<"Error: couldn't initialize LZMA compressor."<=original_size1?'C':'c'); // if (compressed_size>=original_size) { // CCC_ADD_COMPOMENT(out,rec_table); // } else { // CCC_ADD_COMPOMENT(out,rec_table_compressed); // } // if (compressed_size1>=original_size1) { // CCC_ADD_COMPOMENT(out,payload_bytes); // } else { // CCC_ADD_COMPOMENT(out,payload_compressed); // } // ofstream fileout("test.ccc",ios::binary); // if (!fileout) { // cout<<"Error: couldn't open output file."<(out.data()),out.size()); // fileout.close(); // return 0; }