// File-scope declarations for the CCC encoder, all on one physical line below.
// NOTE(review): this text appears to have lost every angle-bracketed span: the
// #include directives have no targets and vector/map have no template
// arguments. Restore them from the original file before building.
//
// Bit prefixes that tag each kind of emitted token:
//   CCC_DELIMITER_0_HEAD   = 0
//   CCC_DELIMITER_1_HEAD   = 1,0
//   CCC_C_KEYWORD_HEAD     = 1,1,0,0
//   CCC_MISCELANEOUS_HEAD  = 1,1,0,1
//   CCC_STRING_INLINE_HEAD = 1,1,1,0
//   CCC_REC_TABLE_REF_HEAD = 1,1,1,1
// None of these prefixes is a prefix of another. CCC_STRING_INLINE_END is eight
// zero bits.
//
// CCC_ADD_COMPOMENT(vec,tail): evaluates tail once into a local copy and appends
// all of its elements to the end of vec (wrapped in do/while(0) so it behaves as
// a single statement).
//
// Lookup tables (the position of an entry is the index that gets encoded):
//   delimiter0    - 8 single-character delimiters
//   delimiter1    - 4 entries: the three empty bracket pairs and ";"
//   miscellaneous - 48 operators plus NULL, size_t and fixed-width integer names
//   c_keywords    - 63 preprocessor directives and C keywords
//
// struct symbol: a name with an integer score.
// struct header: declared under #pragma pack(push,1), so it has no padding;
//   three signature bytes, one flags byte, then three size_t fields.
//   NOTE(review): size_t width is platform-dependent, so this layout is too -
//   confirm whether archives are meant to be portable across platforms.
//
// Globals: all_tokens (indexed by file name in get_all_nodes), rec_map (text to
// count) and rec_list (texts that may be referenced by index).
#include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; namespace fs=filesystem; const vector CCC_DELIMITER_0_HEAD={0}; const vector CCC_DELIMITER_1_HEAD={1,0}; const vector CCC_C_KEYWORD_HEAD={1,1,0,0}; const vector CCC_MISCELANEOUS_HEAD={1,1,0,1}; const vector CCC_STRING_INLINE_HEAD={1,1,1,0}; const vector CCC_REC_TABLE_REF_HEAD={1,1,1,1}; const vector CCC_STRING_INLINE_END={0,0,0,0,0,0,0,0}; #define CCC_ADD_COMPOMENT(vec,tail) \ do { \ auto tmp=tail; \ vec.insert(vec.end(),tmp.begin(),tmp.end()); \ } while (0) const vector delimiter0={ "{", "}", "(", ")", "[", "]", ",", "." }; const vector delimiter1={ "{}", "()", "[]", ";" }; const vector miscellaneous={ "!", "%", "'", "*", "+", "-", "/", ":", "<", ">", "=", "?", "^", "|", "&", "~", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "++", "--", "<<", ">>", "==", "!=", "<=", ">=", "->", "...", "||", "&&", "NULL", "size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t", "int8_t", "int16_t", "int32_t", "int64_t" }; const vector c_keywords={ "#if", "#ifdef", "#ifndef", "#else", "#elif", "#elifdef", "#elifndef", "#endif", "#define", "#undef", "#include", "#error", "#warning", "#pragma", "#line", "alignas", "alignof", "auto", "bool", "break", "case", "char", "const", "constexpr", "continue", "default", "do", "double", "else", "enum", "extern", "false", "float", "for", "goto", "if", "inline", "int", "long", "nullptr", "register", "restrict", "return", "short", "signed", "sizeof", "static", "static_assert", "struct", "switch", "thread_local", "true", "typedef", "typeof", "typeof_unequal", "union", "unsigned", "void", "volatile", "while", "__asm__", "__attribute__", "defined", }; struct symbol { string name; int score; }; #pragma pack(push,1) struct header { uint8_t sig[3]; uint8_t flags; size_t size_rec_table; size_t entry_count; size_t size_payload; }; #pragma pack(pop) map> all_tokens; map rec_map; vector rec_list; 
// NOTE(review): this text appears to have lost every angle-bracketed span
// (template arguments, and any code lying between a '<' and a later '>').
// Several bodies below are therefore truncated; the comments describe only what
// is still visible. Restore the original text before building.
//
// debug: when true, print_debug produces output.
//
// get_all_nodes(node, source_code, rec_map, file):
//   For a node without children: appends the node to all_tokens[file] and takes
//   its text as source_code.substr(start_byte, end_byte - start_byte).
//   - Increments rec_map[text] for node types string_content, system_lib_string,
//     identifier, number_literal, type_identifier, field_identifier,
//     escape_sequence and statement_identifier.
//   - Increments rec_map[text] for primitive_type only when the text is not in
//     c_keywords.
//   - For comment nodes, assigns rec_map[text]=2 (an assignment, not an
//     increment).
//   For a node with children: loops over child_count children; the loop body is
//   missing from this text.
//
// byte_to_bits(c): returns the 8 bits of c, most significant bit first.
//
// generate_c_keyword(index): CCC_C_KEYWORD_HEAD followed by the low 6 bits of
//   index, most significant first.
//
// generate_rec(index, total_recs): computes bits as the number of right shifts
//   needed to bring total_recs to zero, then emits CCC_REC_TABLE_REF_HEAD
//   followed by index bits from position bits down to 0, i.e. bits+1 bits.
//   NOTE(review): that is one bit more than the bit length of total_recs -
//   confirm the decoder expects the same width.
//
// generate_delimiter0(index): CCC_DELIMITER_0_HEAD followed by 3 index bits.
// generate_delimiter1(index): CCC_DELIMITER_1_HEAD followed by 2 index bits.
//
// The trailing return type at the end of this line belongs to
// generate_miscellaneous, which continues on the next line.
bool debug=false; void get_all_nodes(TSNode node,const string &source_code,map &rec_map,const string& file) { if (ts_node_child_count(node)==0) { all_tokens[file].push_back(node); string text=source_code.substr(ts_node_start_byte(node),ts_node_end_byte(node)-ts_node_start_byte(node)); if (string(ts_node_type(node))=="string_content" || string(ts_node_type(node))=="system_lib_string" || string(ts_node_type(node))=="identifier" || string(ts_node_type(node))=="number_literal" || string(ts_node_type(node))=="type_identifier" || string(ts_node_type(node))=="field_identifier" || string(ts_node_type(node))=="escape_sequence" || string(ts_node_type(node))=="statement_identifier") { rec_map[text]++; } if (string(ts_node_type(node))=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) { rec_map[text]++; } if (string(ts_node_type(node))=="comment") { rec_map[text]=2; } } else { uint32_t child_count=ts_node_child_count(node); for (uint32_t i=0;i byte_to_bits(unsigned char c) { vector out; for (int i=7;i>=0;i--) { bool enabled=(c>>i)&0x01; out.push_back(enabled); } return out; } vector generate_c_keyword(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_C_KEYWORD_HEAD); for (int i=5;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_rec(size_t index,size_t total_recs) { vector out; size_t bits=0; while (total_recs) { total_recs>>=1; ++bits; } CCC_ADD_COMPOMENT(out,CCC_REC_TABLE_REF_HEAD); for (int i=bits;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_delimiter0(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_DELIMITER_0_HEAD); for (int i=2;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_delimiter1(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_DELIMITER_1_HEAD); for (int i=1;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector 
// generate_miscellaneous(index): CCC_MISCELANEOUS_HEAD followed by the low 6
//   bits of index, most significant first.
//
// generate_string_content(str): CCC_STRING_INLINE_HEAD, then 8 bits per byte of
//   str (via byte_to_bits), then CCC_STRING_INLINE_END (eight zero bits).
//   NOTE(review): a 0x00 byte inside str yields the same eight zero bits as the
//   terminator - confirm such input cannot reach this function.
//
// print_debug(text): acts only when debug is true; the rest of its body is
//   missing from this text.
//
// process_file_nodes(nodes, code, rec_list): walks the nodes in order and
//   appends the encoding of each one to a local bit vector.
//   - For string_content, system_lib_string, identifier, number_literal,
//     field_identifier, preproc_arg, escape_sequence, character and
//     statement_identifier: if the text is not in rec_list it is emitted inline
//     with generate_string_content, otherwise as generate_rec(position in
//     rec_list, rec_list.size()). The inner re-declaration of text recomputes
//     the same substring as the outer one.
//   - For primitive_type and type_identifier: c_keywords is searched first
//     (generate_c_keyword), then rec_list; a text found in neither is emitted
//     inline only when it is non-empty (continues on the next line).
generate_miscellaneous(size_t index) { vector out; CCC_ADD_COMPOMENT(out,CCC_MISCELANEOUS_HEAD); for (int i=5;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_string_content(string str) { vector out; CCC_ADD_COMPOMENT(out,CCC_STRING_INLINE_HEAD); for (auto c:str) { CCC_ADD_COMPOMENT(out,byte_to_bits(c)); } CCC_ADD_COMPOMENT(out,CCC_STRING_INLINE_END); return out; } void print_debug(string text) { if (debug==true) { cout< process_file_nodes(vector *nodes,string code,vector &rec_list) { vector out; for (int i=0;isize();i++) { string type=string(ts_node_type(nodes->at(i))); string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") { auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); CCC_ADD_COMPOMENT(out,generate_string_content(text)); print_debug("string ("+type+"): "+text); } else { size_t index=distance(rec_list.begin(),it); CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size())); print_debug("rec_table for string ("+type+"): "+text); } } else if (type=="primitive_type" || type=="type_identifier") { string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); auto it=find(c_keywords.begin(),c_keywords.end(),text); if (it!=c_keywords.end()) { size_t index=distance(c_keywords.begin(),it); CCC_ADD_COMPOMENT(out,generate_c_keyword(index)); print_debug("type found in c keyword: "+text); } else { auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { if (!text.empty()) { 
// process_file_nodes, continued. Text is missing after the empty-primitive
// warning, so the branches between it and the bracket handling are not visible.
//   - Bracket pairing: when an opening "(", "[" or "{" is immediately followed
//     by its closing token, the pair is encoded as one token and the closing
//     node is skipped (i++); otherwise the opening token is encoded alone. Any
//     other type is used as the token text unchanged.
//   - The token is looked up in delimiter0 first (generate_delimiter0). If it is
//     not there and is neither "{}" nor a double quote, delimiter1 is searched
//     (generate_delimiter1); a miss prints an error. What happens for "{}" and
//     the double quote is not visible here.
//   - Bit packing: the bits are packed into bytes most significant bit first; a
//     final partial byte is pushed with its unused low bits left at zero. The
//     packed bytes are returned.
//
// construct_rec_table(files_content, files_names): most of the body is missing.
//   The visible tail appends s.first to rec_list when a comparison ending in
//   ">=2" holds and s.first.size()>=3.
//
// main: the visible parts collect file names starting at argv[1], read every
//   file in binary mode (printing an error when a file cannot be opened), call
//   construct_rec_table, and size the compression output buffer as
//   files_archive.size()+files_archive.size()/3+128 bytes (continues on the
//   next line).
CCC_ADD_COMPOMENT(out,generate_string_content(text)); print_debug("string for type ("+type+"): "+text); } else { cout<<"Warning: provided primitive is empty: "<size()) { if (string(ts_node_type(nodes->at(i+1)))==")") { insert="()"; i++; } else { insert="("; } } else if (type=="[" && i+1size()) { if (string(ts_node_type(nodes->at(i+1)))=="]") { insert="[]"; i++; } else { insert="["; } } else if (type=="{" && i+1size()) { if (string(ts_node_type(nodes->at(i+1)))=="}") { insert="{}"; i++; } else { insert="{"; } } else { insert=type; } auto it=find(delimiter0.begin(),delimiter0.end(),insert); if (it!=delimiter0.end()) { size_t index=distance(delimiter0.begin(),it); CCC_ADD_COMPOMENT(out,generate_delimiter0(index)); print_debug("delimiter 0: "+insert); } else { if (insert!="{}" && insert!="\"") { auto it=find(delimiter1.begin(),delimiter1.end(),insert); if (it!=delimiter1.end()) { size_t index=distance(delimiter1.begin(),it); CCC_ADD_COMPOMENT(out,generate_delimiter1(index)); print_debug("delimiter 1: "+insert); } else { cout<<"Error: unknow delimiter, that shouldn't happen: "< payload_bytes; unsigned char current=0; size_t bit_index=0; for (bool b:out) { current|=(b<<(7-bit_index)); bit_index++; if (bit_index==8) { payload_bytes.push_back(current); current=0; bit_index=0; } } if (bit_index!=0) { payload_bytes.push_back(current); } return payload_bytes; } void construct_rec_table(vector &files_content,vector files_names) { for (int i=0;i=2 and s.first.size()>=3) { rec_list.push_back(s.first); } } } int main(int argc,char **argv) { cout< files; for (int i=1;i files_content; for (auto f:files) { ifstream file(f,ios::binary); if (!file) { cout<<"Error: couldn't open provided file."<(file)),istreambuf_iterator()); files_content.push_back(code); } construct_rec_table(files_content,files); vector files_archive; vector payloads_size; vector payloads_start; for (int i=0;i payload_compressed; payload_compressed.resize(files_archive.size()+files_archive.size()/3+128); lzma_stream 
// main, continued. Three sections are handled in the same way: the file
// archive, the recurrences table and the files table. For each one:
//   - strm is reset to LZMA_STREAM_INIT and lzma_easy_encoder is initialised
//     with preset 9 and a CRC64 check; failure prints a section-specific error.
//   - The encode calls themselves are missing from this text.
//   - One flag bit per section (0b001 archive, 0b010 recurrences table, 0b100
//     files table) is cleared and the section size set to original_size when a
//     comparison ending in ">=original_size" holds; otherwise the bit is set and
//     the size is compressed_size.
//   NOTE(review): no lzma_end call is visible before strm is re-initialised -
//   check the original for stream cleanup.
// The header is then filled with the signature bytes 'C','C','C', the flags,
// the payload and recurrences-table sizes, and entry_count=files.size(). The
// assembled buffer is written to fileout, which is closed before returning 0.
strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { cout<<"Error: couldn't initialize LZMA compressor for file archive."<=original_size) { flags&= ~(0b00000001); payload_total_size=original_size; } else { flags|=0b00000001; payload_total_size=compressed_size; } vector rec_table; for (int i=0;i rec_table_compressed; rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128); strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { cout<<"Error: couldn't initialize LZMA compressor for reccurences table."<=original_size) { flags&= ~(0b00000010); rec_table_total_size=original_size; } else { flags|=0b00000010; rec_table_total_size=compressed_size; } vector files_table; for (int i=0;i files_table_compressed; files_table_compressed.resize(files_table.size()+files_table.size()/3+128); strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { cout<<"Error: couldn't initialize LZMA compressor for files table."<=original_size) { flags&= ~(0b00000100); files_table_total_size=original_size; } else { flags|=0b00000100; files_table_total_size=compressed_size; } header head; head.sig[0]='C'; head.sig[1]='C'; head.sig[2]='C'; head.flags=flags; head.size_payload=payload_total_size; head.size_rec_table=rec_table_total_size; head.entry_count=files.size(); vector out; for (int i=0;i(out.data()),out.size()); fileout.close(); return 0; }