// Overview (comment added in review; every code token below is unchanged).
//
// NOTE(review): this text appears to have lost every span that originally sat
// between a '<' and a later '>': the #include targets, all template arguments
// of `vector` / `map`, and several stretches of code (the child loop of
// get_all_nodes, most branches of process_all_nodes, the head of the final
// entry routine). It will not compile as shown; restore it from the original
// file before changing behaviour.
//
// What the visible code does:
//  * CCC_* constants: fixed 0/1 sequences; several are appended to the output
//    bit vector as heads/markers (element type not visible - presumably bool,
//    TODO confirm).
//  * CCC_ADD_COMPOMENT(vec,tail): copies `tail` into a temporary and appends
//    its elements to the end of `vec`.
//  * delimiter and other_grammer (16 entries each), miscellaneous (32) and
//    c_keywords (45) are lookup tables. generate_delimiter,
//    generate_other_grammar, generate_miscellaneous and generate_c_keyword
//    emit a 3-bit head followed by the table index as 4, 4, 5 and 6 bits
//    respectively, most significant bit first.
//  * insert(): walks a tree of `node`s keyed by the characters of `str`,
//    creating missing children with `new`, and stores `id` in the last node's
//    token_id. No matching `delete` is visible in this text.
//  * get_all_nodes(): for a node with zero children, appends it to the global
//    all_tokens and takes its text from source_code by byte range. The text's
//    counter in rec_map is incremented when the node type is string_content,
//    system_lib_string, identifier, number_literal, type_identifier,
//    field_identifier, escape_sequence or statement_identifier, or when it is
//    a primitive_type whose text is not listed in c_keywords. The handling of
//    nodes that do have children is missing from this text.
//  * byte_to_bits() / ascii_to_bits(): the low 8 / low 7 bits of `c`, most
//    significant bit first.
//  * generate_rec(): emits CCC_REC_TABLE_REF_HEAD, then `index` from bit
//    position `bits` down to 0, where `bits` is the number of right shifts
//    needed to bring total_recs to zero - so bits+1 bits are written.
//    NOTE(review): that is one bit more than the width of total_recs; confirm
//    that the decoder expects this.
//  * generate_string_content(): uses the 8-bits-per-char form if any char
//    compares >127, otherwise 7 bits per char; each form ends with an
//    all-zero terminator of the same width.
//    NOTE(review): the loop variable is a plain `char`. Where char is signed
//    the >127 test can never be true, so bytes >=0x80 would go through
//    ascii_to_bits and lose their top bit - verify on the target platform.
//  * process_all_nodes(): turns each collected leaf into bits according to its
//    node type and logs what it emitted to cout. From the visible parts: an
//    opening "(", "[" or "{" directly followed by its closing token is merged
//    into the "()", "[]" or "{}" delimiter entry and the closer is skipped;
//    when the next token's type is a double quote, the empty-string delimiter
//    entry is emitted (the enclosing condition is not visible).
//  * Final routine (its header is not visible): parses `code` with the
//    tree-sitter C grammar, collects leaves through get_all_nodes, and keeps
//    in rec_list every rec_map key counted at least twice whose length is
//    3..256. Output bit layout: 24 fixed bits (01000011 three times, i.e.
//    bytes 0x43 0x43 0x43), rec_list.size() as 64 bits, then for each entry a
//    size field (width not visible) followed by its characters at 8 bits
//    each, then the payload returned by process_all_nodes. Bits are packed
//    most-significant-first into bytes, the last byte is zero-padded, and the
//    result is presumably written to the stream opened on filepath+".ccc"
//    (the write call itself is truncated).
#include #include #include #include #include #include #include #include #include #include #include using namespace std; namespace fs=filesystem; const vector CCC_C_KEYYORD_HEAD {0,0,0}; const vector CCC_SPACE {0,1,1,1,0,0,1}; const vector CCC_PREPROCESSOR_CONDITIONAL_IF {0,0,1,0,0,0}; const vector CCC_PREPROCESSOR_CONDITIONAL_IFDEF {0,0,1,0,0,1}; const vector CCC_PREPROCESSOR_CONDITIONAL_IFNDEF {0,0,1,0,1,0}; const vector CCC_PREPROCESSOR_CONDITIONAL_ELSE {0,0,1,0,1,1}; const vector CCC_PREPROCESSOR_CONDITIONAL_ELIF {0,0,1,1,0,0}; const vector CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF {0,0,1,1,0,1}; const vector CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF {0,0,1,1,1,0}; const vector CCC_PREPROCESSOR_CONDITIONAL_ENDIF {0,0,1,1,1,1}; const vector CCC_PREPROCESSOR_OTHER_DEFINE {0,1,0,0,0,0}; const vector CCC_PREPROCESSOR_OTHER_UNDEF {0,1,0,0,0,1}; const vector CCC_PREPROCESSOR_OTHER_INCLUDE {0,1,0,0,1,0}; const vector CCC_PREPROCESSOR_OTHER_ERROR {0,1,0,0,1,1}; const vector CCC_PREPROCESSOR_OTHER_WARNING {0,1,0,1,0,0}; const vector CCC_PREPROCESSOR_OTHER_PRAGMA {0,1,0,1,0,1}; const vector CCC_PREPROCESSOR_OTHER_LINE {0,1,0,1,1,0}; const vector CCC_QUOTE {0,1,0,1,1,1}; const vector CCC_DELIMITER_HEAD {0,1,1}; const vector CCC_OTHER_GRAMMAR_HEAD {1,0,0}; const vector CCC_MISCELLANEOUS_HEAD {1,0,1}; const vector CCC_REC_TABLE_REF_HEAD {1,1,0}; const vector CCC_STRING_ASCII {1,1,1,0}; const vector CCC_STRING_UTF8 {1,1,1,1}; const vector CCC_STRING_END_ASCII {0,0,0,0,0,0,0}; const vector CCC_STRING_END_UTF8 {0,0,0,0,0,0,0,0}; #define CCC_ADD_COMPOMENT(vec,tail) \ do { \ auto tmp=tail; \ vec.insert(vec.end(),tmp.begin(),tmp.end()); \ } while (0) const vector delimiter={ "\n", "\t", "{", "}", "(", ")", "[", "]", " ", "{}", "()", "[]", "", ";", ",", "." 
}; const vector other_grammer={ "!", "%", "'", "*", "+", "-", "/", ":", "<", ">", "=", "?", "^", "|", "&", "~" }; const vector miscellaneous={ "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "++", "--", "<<", ">>", "==", "!=", "<=", ">=", "->", "...", "||", "&&", "NULL", "size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t", "int8_t", "int16_t", "int32_t", "int64_t" }; const vector c_keywords={ "alignas", "alignof", "auto", "bool", "break", "case", "char", "const", "constexpr", "continue", "default", "do", "double", "else", "enum", "extern", "false", "float", "for", "goto", "if", "inline", "int", "long", "nullptr", "register", "restrict", "return", "short", "signed", "sizeof", "static", "static_assert", "struct", "switch", "thread_local", "true", "typedef", "typeof", "typeof_unequal", "union", "unsigned", "void", "volatile", "while" }; struct symbol { string name; int score; }; struct node { map children; int token_id=-1; }; void insert(node* root,string str,int id) { node* curr=root; for (char c:str) { if (curr->children.find(c)==curr->children.end()) { curr->children[c]=new node(); } curr=curr->children[c]; } curr->token_id=id; } vector all_tokens; void get_all_nodes(TSNode node,const string &source_code,map &rec_map) { if (ts_node_child_count(node)==0) { all_tokens.push_back(node); string text=source_code.substr(ts_node_start_byte(node),ts_node_end_byte(node)-ts_node_start_byte(node)); if (string(ts_node_type(node))=="string_content" || string(ts_node_type(node))=="system_lib_string" || string(ts_node_type(node))=="identifier" || string(ts_node_type(node))=="number_literal" || string(ts_node_type(node))=="type_identifier" || string(ts_node_type(node))=="field_identifier" || string(ts_node_type(node))=="escape_sequence" || string(ts_node_type(node))=="statement_identifier") { rec_map[text]++; } if (string(ts_node_type(node))=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) { rec_map[text]++; } } else { 
uint32_t child_count=ts_node_child_count(node); for (uint32_t i=0;i byte_to_bits(unsigned char c) { vector out; for (int i=7;i>=0;i--) { bool enabled=(c>>i)&0x01; out.push_back(enabled); } return out; } vector ascii_to_bits(unsigned char c) { vector out; for (int i=6;i>=0;i--) { bool enabled=(c>>i)&0x01; out.push_back(enabled); } return out; } vector generate_c_keyword(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_C_KEYYORD_HEAD); for (int i=5;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_rec(size_t index,size_t total_recs) { vector out; size_t bits=0; while (total_recs) { total_recs>>=1; ++bits; } CCC_ADD_COMPOMENT(out,CCC_REC_TABLE_REF_HEAD); for (int i=bits;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_delimiter(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_DELIMITER_HEAD); for (int i=3;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_other_grammar(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_OTHER_GRAMMAR_HEAD); for (int i=3;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_miscellaneous(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_MISCELLANEOUS_HEAD); for (int i=4;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_string_content(string str) { vector out; bool is_utf8=false; for (auto c:str) { if (c>127) { is_utf8=true; break; } } if (is_utf8) { CCC_ADD_COMPOMENT(out,CCC_STRING_UTF8); for (auto c:str) { CCC_ADD_COMPOMENT(out,byte_to_bits(c)); } CCC_ADD_COMPOMENT(out,CCC_STRING_END_UTF8); } else { CCC_ADD_COMPOMENT(out,CCC_STRING_ASCII); for (auto c:str) { CCC_ADD_COMPOMENT(out,ascii_to_bits(c)); } CCC_ADD_COMPOMENT(out,CCC_STRING_END_ASCII); } return out; } vector process_all_nodes(vector *nodes,string code,vector &rec_list) { vector out; for (int i=0;isize();i++) { string 
type=string(ts_node_type(nodes->at(i))); if (type=="#if") { CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IF); cout<<"if"<at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { if (!text.empty()) { string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); CCC_ADD_COMPOMENT(out,generate_string_content(text)); cout<<"string ("<at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); auto it=find(c_keywords.begin(),c_keywords.end(),text); if (it!=c_keywords.end()) { size_t index=distance(c_keywords.begin(),it); CCC_ADD_COMPOMENT(out,generate_c_keyword(index)); cout<<"primitive_type: "<size()) { if (string(ts_node_type(nodes->at(i+1)))==")") { text="()"; i++; } else { text="("; } } else if (type=="[" && i+1size()) { if (string(ts_node_type(nodes->at(i+1)))=="]") { text="[]"; i++; } else { text="["; } } else if (type=="{" && i+1size()) { if (string(ts_node_type(nodes->at(i+1)))=="}") { text="{}"; i++; } else { text="{"; } } else { text=type; } auto it=find(delimiter.begin(),delimiter.end(),text); if (it!=delimiter.end()) { size_t index=distance(delimiter.begin(),it); CCC_ADD_COMPOMENT(out,generate_delimiter(index)); cout<<"delimiter: "<size()) { if (string(ts_node_type(nodes->at(i+1)))=="\"") { auto it=find(delimiter.begin(),delimiter.end(),""); size_t index=distance(delimiter.begin(),it); CCC_ADD_COMPOMENT(out,generate_delimiter(index)); cout<<"double quotes mark, inserting delimiter for empty string"<at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); cout<<"unknow node type: "<"<(file)),istreambuf_iterator()); TSParser *parser=ts_parser_new(); ts_parser_set_language(parser,tree_sitter_c()); TSTree *tree=ts_parser_parse_string(parser,nullptr,code.c_str(),code.size()); TSNode root=ts_tree_root_node(tree); map rec_map; vector rec_list; get_all_nodes(root,code,rec_map); 
for (auto s:rec_map) { if (s.second>=2 and s.first.size()>=3 && s.first.size()<=256) { rec_list.push_back(s.first); } } auto payload=process_all_nodes(&all_tokens,code,rec_list); vector out={0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,1}; for (int i=63;i>=0;i--) { bool enabled=(rec_list.size()>>i)&0x01; out.push_back(enabled); } for (int i=0;i=0;i--) { bool enabled=(size>>i)&0x01; out.push_back(enabled); } for (auto c:rec_list[i]) { for (int i=7;i>=0;i--) { bool enabled=(c>>i)&0x01; out.push_back(enabled); } } } CCC_ADD_COMPOMENT(out,payload); vector outbytes; unsigned char current=0; int bit_index=0; for (bool b:out) { current|=(b<<(7-bit_index)); bit_index++; if (bit_index==8) { outbytes.push_back(current); current=0; bit_index=0; } } if (bit_index!=0) { outbytes.push_back(current); } ofstream fileout(filepath+".ccc",ios::binary); if (!fileout) { cout<<"Error: couldn't open output file."<(outbytes.data()),outbytes.size()); fileout.close(); cout<<"Reccurences map entry count: "<