/*
 * NOTE(review): this text appears to have lost its original line breaks and
 * every angle-bracketed span (the #include targets, template arguments such as
 * the element type of `vector` or the target of `static_cast`, and parts of
 * `<` and `<<` expressions). Confirm against the pristine file before
 * building. The comments in this file describe only what is still visible.
 *
 * The next line holds:
 *  - the include list (targets not visible), `using namespace std`, and the
 *    alias `fs` for `filesystem`;
 *  - the bit prefixes that the generate_* functions put in front of each
 *    encoded component: 0 (delimiter 0), 1,0 (delimiter 1), 1,1,0,0 (C
 *    keyword), 1,1,0,1 (miscellaneous), 1,1,1,0 (inline string) and 1,1,1,1
 *    (rec-table reference);
 *  - CCC_STRING_INLINE_END: eight zero bits appended after the characters of
 *    an inline string by generate_string_content;
 *  - CCC_ADD_COMPONENT(vec,tail): copies `tail` into a local `tmp` and appends
 *    it to `vec`;
 *  - CCC_ADD_COMPONENT_ALIGNED(vec,tail): type-checks both arguments with
 *    static_assert, appends every element of `tail` to `vec`, then pads `vec`
 *    with `false` until its size is a multiple of 8.
 *    NOTE(review): `tail` is expanded textually more than once (in
 *    `tail.size()` and in the range-for), so an argument that is a function
 *    call is evaluated each time - confirm this is acceptable;
 *  - XXH3HasherString: functor returning XXH3_64bits over the bytes of the
 *    string it is given;
 *  - the start of `delimiter0`, the list of single-character delimiters.
 */
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; namespace fs=filesystem; const vector CCC_DELIMITER_0_HEAD={0}; const vector CCC_DELIMITER_1_HEAD={1,0}; const vector CCC_C_KEYWORD_HEAD={1,1,0,0}; const vector CCC_MISCELANEOUS_HEAD={1,1,0,1}; const vector CCC_STRING_INLINE_HEAD={1,1,1,0}; const vector CCC_REC_TABLE_REF_HEAD={1,1,1,1}; const vector CCC_STRING_INLINE_END={0,0,0,0,0,0,0,0}; #define CCC_ADD_COMPONENT(vec,tail) \ do { \ auto tmp=tail; \ vec.insert(vec.end(),tmp.begin(),tmp.end()); \ } while (0) #define CCC_ADD_COMPONENT_ALIGNED(vec,tail) \ do { \ static_assert(is_same_v>,"vec must be vector"); \ static_assert(is_same_v>,"tail must be vector"); \ vec.reserve(vec.size()+tail.size()+8); \ for (auto b:tail) vec.push_back(b); \ size_t rem=vec.size()%8; \ if (rem!=0) { \ vec.insert(vec.end(),8-rem,false); \ } \ } while (0) struct XXH3HasherString { size_t operator()(const std::string& s) const { return static_cast(XXH3_64bits(s.data(),s.size())); } }; const vector delimiter0={ "{", "}", "(", ")", "[", "]", ",", "." 
/*
 * End of `delimiter0`, followed by the other fixed vocabularies:
 *  - delimiter1: the empty pairs "{}", "()", "[]" and ";";
 *  - miscellaneous: operators, plus NULL, size_t and the fixed-width integer
 *    type names;
 *  - c_keywords: preprocessor directives, language keywords, __asm__,
 *    __attribute__ and `defined`.
 * Presumably the *_lookup maps declared below map each entry to its position
 * in these lists; their construction is not visible - TODO confirm.
 *
 * Data types:
 *  - symbol: a name and an int score;
 *  - header: declared between #pragma pack(push,1) and #pragma pack(pop);
 *    a 3-byte signature, a flags byte and three size_t fields
 *    (size_rec_table, entry_count, size_payload);
 *  - node: a 16-bit type id with 32-bit start and end offsets.
 *
 * File-level state: the type-name/id maps (optimized_type_u16_list, type_map),
 * per-file token lists (all_tokens), the recurrence structures (rec_map,
 * rec_list, rec_lookup), the four vocabulary lookup maps, the `debug` and
 * `fail_on_warning` flags, and the `next_type_id` counter.
 *
 * Start of get_all_nodes(node, source_code, rec_map, index): for a node with
 * no children it takes the node's text (source_code between the node's start
 * and end byte) and its type name.
 */
}; const vector delimiter1={ "{}", "()", "[]", ";" }; const vector miscellaneous={ "!", "%", "'", "*", "+", "-", "/", ":", "<", ">", "=", "?", "^", "|", "&", "~", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "++", "--", "<<", ">>", "==", "!=", "<=", ">=", "->", "...", "||", "&&", "NULL", "size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t", "int8_t", "int16_t", "int32_t", "int64_t" }; const vector c_keywords={ "#if", "#ifdef", "#ifndef", "#else", "#elif", "#elifdef", "#elifndef", "#endif", "#define", "#undef", "#include", "#error", "#warning", "#pragma", "#line", "alignas", "alignof", "auto", "bool", "break", "case", "char", "const", "constexpr", "continue", "default", "do", "double", "else", "enum", "extern", "false", "float", "for", "goto", "if", "inline", "int", "long", "nullptr", "register", "restrict", "return", "short", "signed", "sizeof", "static", "static_assert", "struct", "switch", "thread_local", "true", "typedef", "typeof", "typeof_unequal", "union", "unsigned", "void", "volatile", "while", "__asm__", "__attribute__", "defined", }; struct symbol { string name; int score; }; #pragma pack(push,1) struct header { uint8_t sig[3]; uint8_t flags; size_t size_rec_table; size_t entry_count; size_t size_payload; }; #pragma pack(pop) struct node { uint16_t type; uint32_t start; uint32_t end; }; unordered_map optimized_type_u16_list; unordered_map type_map; vector> all_tokens; map rec_map; vector rec_list; unordered_map rec_lookup; unordered_map c_keyword_lookup; unordered_map miscelaneous_lookup; unordered_map delimiter0_lookup; unordered_map delimiter1_lookup; bool debug=false; bool fail_on_warning=false; static uint16_t next_type_id; void get_all_nodes(TSNode node,const string &source_code,map &rec_map,size_t index) { if (ts_node_child_count(node)==0) { string text=source_code.substr(ts_node_start_byte(node),ts_node_end_byte(node)-ts_node_start_byte(node)); string type=string(ts_node_type(node)); if 
/*
 * get_all_nodes, leaf branch continued:
 *  - a type name not yet in optimized_type_u16_list receives the current
 *    next_type_id, is recorded in type_map under that id, and the counter is
 *    incremented;
 *  - the leaf is appended to all_tokens[index] as {type id, start byte,
 *    end byte};
 *  - rec_map[text] is incremented for the listed type names, and for
 *    primitive_type text that is not found in c_keywords;
 *  - for a comment, rec_map[text] is set to 2 instead of being incremented.
 * The non-leaf branch reads the child count and starts a loop over the
 * children; the loop body is not visible here.
 *
 * Bit-level encoders, all writing the most significant bit first:
 *  - byte_to_bits(c): the 8 bits of c;
 *  - generate_c_keyword(index): CCC_C_KEYWORD_HEAD + bits 5..0 of index;
 *  - generate_rec(index,total_recs): CCC_REC_TABLE_REF_HEAD + bits `bits`..0
 *    of index, where `bits` counts the right shifts needed to bring
 *    total_recs to zero.
 *    NOTE(review): the loop runs from `bits` down to 0 inclusive, so bits+1
 *    bits are written - confirm the reader expects that width;
 *  - generate_delimiter0(index): CCC_DELIMITER_0_HEAD + bits 2..0 of index;
 *  - generate_delimiter1(index): CCC_DELIMITER_1_HEAD + bits 1..0 of index;
 *  - generate_miscellaneous(index): CCC_MISCELANEOUS_HEAD + bits 5..0 of
 *    index.
 */
(optimized_type_u16_list.find(type)==optimized_type_u16_list.end()) { optimized_type_u16_list[type]=next_type_id; type_map[optimized_type_u16_list.at(type)]=type; next_type_id++; } all_tokens[index].push_back({.type=optimized_type_u16_list[type],.start=ts_node_start_byte(node),.end=ts_node_end_byte(node)}); if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="escape_sequence" || type=="statement_identifier") { rec_map[text]++; } if (type=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) { rec_map[text]++; } if (type=="comment") { rec_map[text]=2; } } else { uint32_t child_count=ts_node_child_count(node); for (uint32_t i=0;i byte_to_bits(unsigned char c) { vector out; for (int i=7;i>=0;i--) { bool enabled=(c>>i)&0x01; out.push_back(enabled); } return out; } vector generate_c_keyword(size_t index) { vector out; CCC_ADD_COMPONENT(out,CCC_C_KEYWORD_HEAD); for (int i=5;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_rec(size_t index,size_t total_recs) { vector out; size_t bits=0; while (total_recs) { total_recs>>=1; ++bits; } CCC_ADD_COMPONENT(out,CCC_REC_TABLE_REF_HEAD); for (int i=bits;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_delimiter0(size_t index) { vector out; CCC_ADD_COMPONENT(out,CCC_DELIMITER_0_HEAD); for (int i=2;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_delimiter1(size_t index) { vector out; CCC_ADD_COMPONENT(out,CCC_DELIMITER_1_HEAD); for (int i=1;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector generate_miscellaneous(size_t index) { vector out; CCC_ADD_COMPONENT(out,CCC_MISCELANEOUS_HEAD); for (int i=5;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } vector 
/*
 * generate_string_content(str): CCC_STRING_INLINE_HEAD, then 8 bits per
 * character of str (via byte_to_bits), then CCC_STRING_INLINE_END.
 *
 * print_debug(text): acts only when the global `debug` flag is true; the
 * output statement itself is not fully visible.
 *
 * process_file_nodes(nodes, code): walks the node list. For each node it
 * resolves the type name through type_map and the text through
 * code.substr(start, end-start), then appends one component to `out` with
 * CCC_ADD_COMPONENT_ALIGNED (so `out` is padded to a multiple of 8 bits after
 * each component):
 *  - string-like and identifier node types: a rec-table reference when the
 *    text is found in rec_lookup, otherwise an inline string;
 *  - primitive_type / type_identifier: a C keyword code when the text is in
 *    c_keyword_lookup; otherwise a rec-table reference or, failing that, an
 *    inline string (empty text prints a warning instead);
 *  - delimiters and the double-quote node: an opening "(", "[" or "{" whose
 *    next node is the matching closer is merged into "()", "[]" or "{}", and
 *    the closer is skipped with i++.
 */
generate_string_content(string str) { vector out; CCC_ADD_COMPONENT(out,CCC_STRING_INLINE_HEAD); for (auto c:str) { CCC_ADD_COMPONENT(out,byte_to_bits(c)); } CCC_ADD_COMPONENT(out,CCC_STRING_INLINE_END); return out; } void print_debug(string text) { if (debug==true) { cout< process_file_nodes(vector *nodes,string code) { vector out; for (int i=0;isize();i++) { node n=nodes->at(i); string type=type_map[n.type]; string text=code.substr(n.start,n.end-n.start); if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") { auto it=rec_lookup.find(text); if (it==rec_lookup.end()) { CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text)); print_debug("string ("+type+"): "+text); } else { size_t index=it->second; CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); print_debug("rec_table for string ("+type+"): "+text); } } else if (type=="primitive_type" || type=="type_identifier") { auto it=c_keyword_lookup.find(text); if (it!=c_keyword_lookup.end()) { size_t index=it->second; CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index)); print_debug("type found in c keyword: "+text); } else { auto it=rec_lookup.find(text); if (it==rec_lookup.end()) { if (!text.empty()) { CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text)); print_debug("string for type ("+type+"): "+text); } else { cout<<"Warning: type node is empty: "<second; CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); print_debug("rec_table for string for type ("+type+"): "+text); } } } else if (delimiter0_lookup.find(type)!=delimiter0_lookup.end() || delimiter1_lookup.find(type)!=delimiter1_lookup.end() || type=="\"") { string insert; if (type=="(" && i+1size()) { if (type_map[nodes->at(i+1).type]==")") { insert="()"; i++; } else { insert="("; } } else if (type=="[" && i+1size()) { if 
/*
 * process_file_nodes, delimiter handling continued: when no pair is merged,
 * `insert` is the opening delimiter alone or the node's type name. `insert`
 * is looked up in delimiter0_lookup first and emitted with
 * generate_delimiter0 when found. Otherwise delimiter1_lookup is used; in two
 * of the visible delimiter1 branches the code is followed by one extra bit
 * (0 in one branch, 1 in the other) before the alignment padding. Presumably
 * that extra bit distinguishes the "{}" and double-quote cases excluded by
 * the `insert!=` test, but the selecting conditions are not fully visible -
 * TODO confirm.
 *
 * C keyword nodes are emitted with generate_c_keyword; when the lookup fails
 * the text falls back to a rec-table reference or an inline string. Lookups
 * that fail unexpectedly print a "Warning: ..." message.
 */
(type_map[nodes->at(i+1).type]=="]") { insert="[]"; i++; } else { insert="["; } } else if (type=="{" && i+1size()) { if (type_map[nodes->at(i+1).type]=="}") { insert="{}"; i++; } else { insert="{"; } } else { insert=type; } auto it=delimiter0_lookup.find(insert); if (it!=delimiter0_lookup.end()) { size_t index=it->second; CCC_ADD_COMPONENT_ALIGNED(out,generate_delimiter0(index)); print_debug("delimiter 0: "+insert); } else { if (insert!="{}" && insert!="\"") { auto it=delimiter1_lookup.find(insert); if (it!=delimiter1_lookup.end()) { size_t index=it->second; CCC_ADD_COMPONENT_ALIGNED(out,generate_delimiter1(index)); print_debug("delimiter 1: "+insert); } else { cout<<"Warning: unknow delimiter, that shouldn't happen: "<second; CCC_ADD_COMPONENT(out,generate_delimiter1(index)); vector temp={0}; CCC_ADD_COMPONENT_ALIGNED(out,temp); print_debug("delimiter 1: "+insert); } else { cout<<"Warning: unknow delimiter, that shouldn't happen: "<second; CCC_ADD_COMPONENT(out,generate_delimiter1(index)); vector temp={1}; CCC_ADD_COMPONENT_ALIGNED(out,temp); print_debug("delimiter 1: "+insert); } else { cout<<"Warning: unknow delimiter, that shouldn't happen: "<second; CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index)); print_debug("c keyword: "+type); } else { cout<<"Warning: unknow C keyword, that shouldn't happen: "<second; CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index)); print_debug("c keyword: "+type); } else { auto it=rec_lookup.find(text); if (it==rec_lookup.end()) { if (!text.empty()) { CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text)); print_debug("string for c keyword ("+type+"): "+text); } else { cout<<"Warning: C keyword is empty: "<second; CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); print_debug("rec_table for string for c keyword ("+type+"): "+text); } } } } else if (miscelaneous_lookup.find(type)!=miscelaneous_lookup.end()) { auto it=miscelaneous_lookup.find(type); if (it!=miscelaneous_lookup.end()) { size_t 
/*
 * This comment covers the remaining two physical lines; no comment is placed
 * between them because that line break falls inside a string literal.
 *
 * process_file_nodes, end:
 *  - miscellaneous tokens are emitted with generate_miscellaneous;
 *  - a branch logged as "rec_table for comment" emits a rec-table reference;
 *  - any other node type falls back to a rec-table reference or an inline
 *    string of its text (empty text prints a warning).
 *    NOTE(review): this last fallback searches rec_lookup by `type`, whereas
 *    the other branches search by `text` - confirm which is intended.
 *  Finally the bit vector is packed into bytes, the first bit of each group
 *  going to the most significant position; a trailing group of fewer than 8
 *  bits is pushed as it stands.
 *
 * construct_rec_table(files_content, files_names): only partly visible.
 * Entries `s` are pushed to rec_list when s.first.size() is at least 3 and a
 * second `>=2` test passes (its left-hand side is not visible; presumably the
 * occurrence count - TODO confirm).
 *
 * main: only partly visible. What can be seen:
 *  - input files are read whole into files_content;
 *  - three buffers are built: files_archive, rec_table and files_table;
 *  - each gets an output buffer of size + size/3 + 128 bytes and an encoder
 *    initialised with lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64); a failed
 *    initialisation prints an "Error: ..." message;
 *  - for each buffer one flag bit records the outcome: bit 0 for the file
 *    archive, bit 1 for the rec table, bit 2 for the files table. The bit is
 *    cleared and original_size kept when the `>=original_size` test holds
 *    (left-hand side not visible, presumably compressed_size); otherwise the
 *    bit is set and compressed_size is used;
 *  - the header gets the signature 'C','C','C', the flags, size_payload,
 *    size_rec_table and entry_count=files.size();
 *  - the assembled `out` buffer is written through `fileout`, which is then
 *    closed, and "Finished !" is printed.
 */
index=it->second; CCC_ADD_COMPONENT_ALIGNED(out,generate_miscellaneous(index)); print_debug("miscellaneous: "+type); } else { cout<<"Warning: unknow miscellaneous, that shouldn't happen: "<second; CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); print_debug("rec_table for comment"); } } else { auto it=rec_lookup.find(type); if (it==rec_lookup.end()) { if (!text.empty()) { CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text)); print_debug("string for unknow node ("+type+"): "+text); } else { cout<<"Warning: unknow node is empty: "<second; CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); print_debug("rec_table for string for unknow node ("+type+"): "+text); } } } vector payload_bytes; unsigned char current=0; size_t bit_index=0; for (bool b:out) { current|=(b<<(7-bit_index)); bit_index++; if (bit_index==8) { payload_bytes.push_back(current); current=0; bit_index=0; } } if (bit_index!=0) { payload_bytes.push_back(current); } return payload_bytes; } void construct_rec_table(vector &files_content,vector files_names) { for (int i=0;i=2 and s.first.size()>=3) { rec_list.push_back(s.first); } } for (int i=0;i files; for (int i=1;i files_content; for (int i=0;i(file)),istreambuf_iterator()); files_content.push_back(code); cout< files_archive; vector payloads_size; vector payloads_start; for (int i=0;i payload_compressed; payload_compressed.resize(files_archive.size()+files_archive.size()/3+128); lzma_stream strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { cout<<"Error: couldn't initialize LZMA compressor for file archive."<=original_size) { flags&= ~(0b00000001); payload_total_size=original_size; } else { flags|=0b00000001; payload_total_size=compressed_size; } vector rec_table; for (int i=0;i rec_table_compressed; rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128); strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { cout<<"Error: couldn't 
initialize LZMA compressor for reccurences table."<=original_size) { flags&= ~(0b00000010); rec_table_total_size=original_size; } else { flags|=0b00000010; rec_table_total_size=compressed_size; } vector files_table; for (int i=0;i files_table_compressed; files_table_compressed.resize(files_table.size()+files_table.size()/3+128); strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { cout<<"Error: couldn't initialize LZMA compressor for files table."<=original_size) { flags&= ~(0b00000100); files_table_total_size=original_size; } else { flags|=0b00000100; files_table_total_size=compressed_size; } header head; head.sig[0]='C'; head.sig[1]='C'; head.sig[2]='C'; head.flags=flags; head.size_payload=payload_total_size; head.size_rec_table=rec_table_total_size; head.entry_count=files.size(); vector out; for (int i=0;i(out.data()),out.size()); fileout.close(); cout<<"Finished !"<