// ---------------------------------------------------------------------------
// CCC archiver, encoder side (overview written from what this text shows).
//
// C source files are parsed with tree-sitter (ts_parser_new / tree_sitter_c),
// their leaf nodes are turned into a byte-aligned token stream, and main()
// assembles a payload, a recurrence table and a files table, each passed
// through LZMA, behind a packed `header` whose signature bytes are 'C','C','C'.
//
// NOTE(review): this copy looks like it went through a filter that removed
// every angle-bracket span. The #include directives have no header names,
// containers have no template arguments (`vector out;`, `static_cast(...)`),
// and several statements and function headers stop mid-expression (for
// example `for (uint32_t i=0;i useless_type_u16_map;`). It will not compile
// as-is; restore the original text before changing any logic.
//
// Token heads (prefix codes):
//   0     CCC_DELIMITER_0_HEAD    - generate_delimiter0 writes 1 head bit + 3 index bits
//   10    CCC_DELIMITER_1_HEAD    - generate_delimiter1 writes 2 head bits + 2 index bits
//   1100  CCC_C_KEYWORD_HEAD      - writer (generate_c_keyword) is not visible here
//   1101  CCC_MISCELANEOUS_HEAD   - generate_miscellaneous writes 4 head bits + 6 index bits
//   1110  CCC_STRING_INLINE_HEAD  - generate_string_content writes 4 head bits, then the
//                                   text (loop body missing; presumably terminated with
//                                   CCC_STRING_INLINE_END - confirm against the original)
//   1111  CCC_REC_TABLE_REF_HEAD  - generate_rec writes 4 head bits + `bits` index bits,
//                                   where `bits` comes from a shift loop that is only
//                                   partly visible here
// Every visible generate_* helper calls align() before and after writing, so
// each token starts on a byte boundary.
//
// bit_streamer
//   write_bits(value,count) emits the low `count` bits of `value`, highest bit
//   first, filling each output byte from bit 7 downwards; a byte is pushed to
//   `out` as soon as eight bits have accumulated.
//   align() pushes a partially filled byte (unwritten low bits remain 0).
//   extract_buffer() calls align() and then moves `out` into the return value.
//
// Tables and layout
//   delimiter0 / delimiter1 / miscellaneous / c_keywords list the fixed token
//   texts; the matching *_lookup maps are searched during encoding and yield
//   the index that is written (presumably the position in the table - the code
//   that fills the maps is not visible here).
//   `header` is declared under #pragma pack(push,1).
//   NOTE(review): its size fields are size_t, so the struct layout depends on
//   the platform's size_t width.
//
// iterate_all_nodes_loop_call(settings,current_node,mode)
//   Acts on leaf nodes (ts_node_child_count == 0):
//     REC_MAP - increments thread_local_rec_map[text] for string_content,
//               system_lib_string, identifier, number_literal, type_identifier,
//               field_identifier, escape_sequence and statement_identifier
//               leaves, and for primitive_type leaves whose text is not in
//               c_keywords; comment text is stored with the value 2.
//     PARSING - gives each node type name a uint16 id (thread_local_type_u16_map
//               and the reverse thread_local_type_map) and appends
//               {type,start,end} to thread_local_node_list.
//   The branch for nodes with children is cut off in this copy.
//
// Recurrence-map worker (its header is missing in this copy)
//   Pops file entries from rec_map_files_queue under rec_map_queue_mutex,
//   reads and parses each file, walks it in REC_MAP mode, then pushes the entry
//   onto encoding_files_queue under encoding_queue_mutex. Calls malloc_trim(0)
//   every 20 files when enable_malloc_trim is true.
//
// process_file_nodes_loop_call (header missing in this copy)
//   Dispatches on the node type string. Where visible, the pattern is: look the
//   text up in a fixed table or in rec_lookup; if found, emit the matching
//   token; otherwise emit the text inline with generate_string_content; empty
//   text produces print_warning + fail_if_warning.
//   The "{}" and "\"" cases share one encoding: both look up "{}" in
//   delimiter1_lookup and write head + index followed by one extra bit
//   (0 for "{}", 1 for a double quote).
//   NOTE(review): the final fallback branch searches rec_lookup with `type`
//   while every other branch searches with the node text - confirm intent.
//
// run_thread_encoding(thread_num)
//   Pops file entries from encoding_files_queue under encoding_queue_mutex,
//   opens ".temp_ccc/temp_ccc_"+index+".bin", parses the file, walks it in
//   PARSING mode, encodes the node list into a fresh bit_streamer and writes
//   the extracted buffer to the temp file. Per-file containers are released
//   after each file; malloc_trim(0) runs every 10 files when enabled.
//   NOTE(review): `std::move(bitstream.extract_buffer())` moves a value that
//   is already a temporary.
//
// main (large parts missing in this copy)
//   - Merges the per-thread recurrence maps into global_rec_map (a std::map,
//     so it is iterated in key order); strings with count >= 2 and
//     size >= 3 are appended to rec_list and indexed in rec_lookup.
//   - Re-queues the files sorted by size, largest first, before encoding.
//   - Reads the temp files back into final_payloads and removes ".temp_ccc".
//   - Sets up lzma_mt options for the payload (CRC64 check, block size
//     max(8 MiB, size/threads)) and uses lzma_easy_encoder preset 9 for the
//     recurrence table and the files table.
//     NOTE(review): thread::hardware_concurrency() is used as a divisor; the
//     standard allows it to return 0 - guard before dividing.
//   - flags bit 0 / bit 1 / bit 2 mark the payload / recurrence table / files
//     table; each bit is cleared and the original size recorded when the
//     comparison against original_size takes the first branch (the left-hand
//     side of that comparison is missing here; it appears to be the
//     compressed size).
// ---------------------------------------------------------------------------
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; namespace fs=filesystem; const uint64_t CCC_DELIMITER_0_HEAD=0b0; const uint64_t CCC_DELIMITER_1_HEAD=0b10; const uint64_t CCC_C_KEYWORD_HEAD=0b1100; const uint64_t CCC_MISCELANEOUS_HEAD=0b1101; const uint64_t CCC_STRING_INLINE_HEAD=0b1110; const uint64_t CCC_REC_TABLE_REF_HEAD=0b1111; const uint64_t CCC_STRING_INLINE_END=0b00000000; #define CCC_ADD_COMPONENT(vec,tail) \ do { \ auto tmp=tail; \ vec.insert(vec.end(),tmp.begin(),tmp.end()); \ } while (0) struct XXH3HasherString { size_t operator()(const std::string& s) const { return static_cast(XXH3_64bits(s.data(),s.size())); } }; class bit_streamer { private: vector out; uint8_t current_byte=0; uint8_t bit_pos=0; public: size_t index; bit_streamer(size_t index) { out.reserve(1024*1024); this->index=index; } size_t get_size() { return out.size(); } void write_bits(uint64_t value,uint8_t count) { for (int i=count-1;i>=0;--i) { if ((value>>i) & 1) { current_byte|=(1<<(7-bit_pos)); } bit_pos++; if (bit_pos==8) { out.push_back(current_byte); current_byte=0; bit_pos=0; } } } void align() { if (bit_pos>0) { out.push_back(current_byte); current_byte=0; bit_pos=0; } } const vector& get_out() const { return out; } vector extract_buffer() { align(); return std::move(out); } }; const vector delimiter0={ "{", "}", "(", ")", "[", "]", ",", "." 
}; const vector delimiter1={ "{}", "()", "[]", ";" }; const vector miscellaneous={ "!", "%", "'", "*", "+", "-", "/", ":", "<", ">", "=", "?", "^", "|", "&", "~", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "++", "--", "<<", ">>", "==", "!=", "<=", ">=", "->", "...", "||", "&&", "NULL", "size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t", "int8_t", "int16_t", "int32_t", "int64_t" }; const vector c_keywords={ "#if", "#ifdef", "#ifndef", "#else", "#elif", "#elifdef", "#elifndef", "#endif", "#define", "#undef", "#include", "#error", "#warning", "#pragma", "#line", "alignas", "alignof", "auto", "bool", "break", "case", "char", "const", "constexpr", "continue", "default", "do", "double", "else", "enum", "extern", "false", "float", "for", "goto", "if", "inline", "int", "long", "nullptr", "register", "restrict", "return", "short", "signed", "sizeof", "static", "static_assert", "struct", "switch", "thread_local", "true", "typedef", "typeof", "typeof_unequal", "union", "unsigned", "void", "volatile", "while", "__asm__", "__attribute__", "defined", }; #pragma pack(push,1) struct header { uint8_t sig[3]; uint8_t flags; size_t size_rec_table; size_t entry_count; size_t size_payload; }; #pragma pack(pop) struct node { uint16_t type; uint32_t start; uint32_t end; }; struct file_entry { string name; size_t size; size_t index; }; struct thread_iterate_input_loop_call { string &source_code; vector &thread_local_node_list; unordered_map& thread_local_type_map; unordered_map& thread_local_type_u16_map; uint16_t thread_local_next_type_id; unordered_map thread_local_rec_map; }; struct thread_rec_map_result { unordered_map thread_local_rec_map; }; struct thread_encoding_input_loop_call { string &source_code; vector &node_list; unordered_map& thread_local_type_map; bit_streamer& thread_local_bit_stream; }; enum iterating_mode { REC_MAP, PARSING }; queue rec_map_files_queue; mutex rec_map_queue_mutex; queue encoding_files_queue; mutex encoding_queue_mutex; vector 
rec_list; unordered_map> rec_lookup; unordered_map> c_keyword_lookup; unordered_map> miscelaneous_lookup; unordered_map> delimiter0_lookup; unordered_map> delimiter1_lookup; bool show_warning=false; bool fail_on_warning=false; bool enable_malloc_trim=true; void iterate_all_nodes_loop_call(thread_iterate_input_loop_call &settings,TSNode current_node,iterating_mode mode) { if (ts_node_child_count(current_node)==0) { uint32_t start=ts_node_start_byte(current_node); uint32_t end=ts_node_end_byte(current_node); string_view text{settings.source_code.data()+start,end-start}; string type=string(ts_node_type(current_node)); if (mode==iterating_mode::REC_MAP) { if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="escape_sequence" || type=="statement_identifier") { settings.thread_local_rec_map[string(text)]++; } if (type=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) { settings.thread_local_rec_map[string(text)]++; } if (type=="comment") { settings.thread_local_rec_map[string(text)]=2; } } else if (mode==iterating_mode::PARSING) { if (settings.thread_local_type_u16_map.find(type)==settings.thread_local_type_u16_map.end()) { settings.thread_local_type_u16_map[type]=settings.thread_local_next_type_id; settings.thread_local_type_map[settings.thread_local_type_u16_map.at(type)]=type; settings.thread_local_next_type_id++; } settings.thread_local_node_list.push_back({.type=settings.thread_local_type_u16_map[type],.start=start,.end=end}); } } else { uint32_t child_count=ts_node_child_count(current_node); for (uint32_t i=0;i useless_type_u16_map; vector useless_node_vector; unordered_map useless_type_map; TSParser *parser=ts_parser_new(); ts_parser_set_language(parser,tree_sitter_c()); int counter=0; while (true) { file_entry f; { lock_guard lock(rec_map_queue_mutex); if (rec_map_files_queue.empty()) break; 
f=std::move(rec_map_files_queue.front()); rec_map_files_queue.pop(); } ifstream file(f.name); if (!file) { cout<<"Error: couldn't open "<(file)),istreambuf_iterator()); file.close(); thread_iterate_input_loop_call loop_settings { .source_code=code, .thread_local_node_list=useless_node_vector, .thread_local_type_map=useless_type_map, .thread_local_type_u16_map=useless_type_u16_map, .thread_local_next_type_id=0, .thread_local_rec_map=res.thread_local_rec_map }; TSTree *tree=ts_parser_parse_string(parser,nullptr,code.c_str(),code.size()); TSNode root=ts_tree_root_node(tree); loop_settings.source_code=code; iterate_all_nodes_loop_call(loop_settings,root,iterating_mode::REC_MAP); string().swap(code); ts_tree_delete(tree); { lock_guard lock(encoding_queue_mutex); encoding_files_queue.push(std::move(f)); } if (++counter%20==0 && enable_malloc_trim) malloc_trim(0); } ts_parser_delete(parser); auto end=chrono::high_resolution_clock::now(); auto ms=chrono::duration_cast(end-start).count(); cout<<"Recccurences map thread number "<>=1; ++bits; } bitstream.align(); bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4); bitstream.write_bits(index,bits); bitstream.align(); return; } void generate_delimiter0(bit_streamer& bitstream,size_t index) { bitstream.align(); bitstream.write_bits(CCC_DELIMITER_0_HEAD,1); bitstream.write_bits(index,3); bitstream.align(); return; } void generate_delimiter1(bit_streamer& bitstream,size_t index) { bitstream.align(); bitstream.write_bits(CCC_DELIMITER_1_HEAD,2); bitstream.write_bits(index,2); bitstream.align(); return; } void generate_miscellaneous(bit_streamer& bitstream,size_t index) { bitstream.align(); bitstream.write_bits(CCC_MISCELANEOUS_HEAD,4); bitstream.write_bits(index,6); bitstream.align(); } void generate_string_content(bit_streamer& bitstream,const char *text,size_t text_len) { bitstream.align(); bitstream.write_bits(CCC_STRING_INLINE_HEAD,4); for (int i=0;isecond; generate_rec(out,index,rec_list.size()); } } else if 
(type=="primitive_type" || type=="type_identifier") { auto it=c_keyword_lookup.find(string(text)); if (it!=c_keyword_lookup.end()) { size_t index=it->second; generate_c_keyword(out,index); } else { auto it=rec_lookup.find(string(text)); if (it==rec_lookup.end()) { if (!text.empty()) { generate_string_content(out,text.data(),text.size()); } else { print_warning("Warning: type node is empty: "+string(text)); fail_if_warning(); } } else { size_t index=it->second; generate_rec(out,index,rec_list.size()); } } } else if (delimiter0_lookup.find(type)!=delimiter0_lookup.end() || delimiter1_lookup.find(type)!=delimiter1_lookup.end() || type=="\"") { string insert; if (type=="(" && i+1second; generate_delimiter0(out,index); } else { if (insert!="{}" && insert!="\"") { auto it=delimiter1_lookup.find(insert); if (it!=delimiter1_lookup.end()) { size_t index=it->second; generate_delimiter1(out,index); } else { print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert); fail_if_warning(); } } else { if (insert=="{}") { auto it=delimiter1_lookup.find("{}"); if (it!=delimiter1_lookup.end()) { size_t index=it->second; out.align(); out.write_bits(CCC_DELIMITER_1_HEAD,2); out.write_bits(index,2); out.write_bits(0b0,1); out.align(); } else { print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert); fail_if_warning(); } } else if (insert=="\"") { auto it=delimiter1_lookup.find("{}"); if (it!=delimiter1_lookup.end()) { size_t index=it->second; out.align(); out.write_bits(CCC_DELIMITER_1_HEAD,2); out.write_bits(index,2); out.write_bits(0b1,1); out.align(); } else { print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert); fail_if_warning(); } } else { print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert); fail_if_warning(); } } } } else if (c_keyword_lookup.find(type)!=c_keyword_lookup.end() || type=="preproc_directive") { if (type!="preproc_directive") { auto it=c_keyword_lookup.find(type); if 
(it!=c_keyword_lookup.end()) { size_t index=it->second; generate_c_keyword(out,index); } else { print_warning("Warning: unknow C keyword, that shouldn't happen: "+type+" "+string(text)); fail_if_warning(); } } else { auto it=c_keyword_lookup.find(string(text)); if (it!=c_keyword_lookup.end()) { size_t index=it->second; generate_c_keyword(out,index); } else { auto it=rec_lookup.find(string(text)); if (it==rec_lookup.end()) { if (!text.empty()) { generate_string_content(out,text.data(),text.size()); } else { print_warning("Warning: C keyword is empty: "+string(text)); fail_if_warning(); } } else { size_t index=it->second; generate_rec(out,index,rec_list.size()); } } } } else if (miscelaneous_lookup.find(type)!=miscelaneous_lookup.end()) { auto it=miscelaneous_lookup.find(type); if (it!=miscelaneous_lookup.end()) { size_t index=it->second; generate_miscellaneous(out,index); } else { print_warning("Warning: unknow miscellaneous, that shouldn't happen: "+type); fail_if_warning(); } } else if (type=="comment") { auto it=rec_lookup.find(string(text)); if (it==rec_lookup.end()) { if (!text.empty()) { generate_string_content(out,text.data(),text.size()); } else { print_warning("Warning: comment is empty: "+string(text)); fail_if_warning(); } } else { size_t index=it->second; generate_rec(out,index,rec_list.size()); } } else { auto it=rec_lookup.find(type); if (it==rec_lookup.end()) { if (!text.empty()) { generate_string_content(out,text.data(),text.size()); } else { print_warning("Warning: unknow node is empty: "+string(text)); fail_if_warning(); } } else { size_t index=it->second; generate_rec(out,index,rec_list.size()); } } } out.align(); return; } void run_thread_encoding(size_t thread_num) { auto start=chrono::high_resolution_clock::now(); unordered_map useless_rec_map; unordered_map thread_local_type_map; unordered_map thread_local_type_u16_map; vector thread_local_node_list; TSParser *parser=ts_parser_new(); ts_parser_set_language(parser,tree_sitter_c()); int 
counter=0; while (true) { bit_streamer bitstream(0); file_entry f; { lock_guard lock(encoding_queue_mutex); if (encoding_files_queue.empty()) break; f=std::move(encoding_files_queue.front()); encoding_files_queue.pop(); } bitstream.index=f.index; ofstream temp_thread_out(".temp_ccc/temp_ccc_"+to_string(f.index)+".bin",ios::binary); if (!temp_thread_out) { cout<<"Error: couldn't open .temp_ccc/temp_ccc_"<(file)), istreambuf_iterator()); file.close(); thread_iterate_input_loop_call iterate_loop_settings { .source_code=code, .thread_local_node_list=thread_local_node_list, .thread_local_type_map=thread_local_type_map, .thread_local_type_u16_map=thread_local_type_u16_map, .thread_local_next_type_id=0, .thread_local_rec_map=useless_rec_map }; thread_encoding_input_loop_call encoding_loop_settings { .source_code=code, .node_list=thread_local_node_list, .thread_local_type_map=thread_local_type_map, .thread_local_bit_stream=bitstream }; TSTree *tree=ts_parser_parse_string(parser,nullptr,code.c_str(),code.size()); TSNode root=ts_tree_root_node(tree); iterate_loop_settings.source_code=code; iterate_all_nodes_loop_call(iterate_loop_settings,root,iterating_mode::PARSING); ts_tree_delete(tree); encoding_loop_settings.source_code=code; process_file_nodes_loop_call(encoding_loop_settings); auto payload=std::move(bitstream.extract_buffer()); temp_thread_out.write(reinterpret_cast(payload.data()),payload.size()); temp_thread_out.close(); payload.clear(); payload.shrink_to_fit(); vector().swap(thread_local_node_list); thread_local_type_map.clear(); thread_local_type_u16_map.clear(); string().swap(code); if (++counter%10==0 && enable_malloc_trim) malloc_trim(0); } ts_parser_delete(parser); auto end=chrono::high_resolution_clock::now(); auto ms=chrono::duration_cast(end-start).count(); cout<<"Parsing/encoding thread number "< files; for (int i=1;i> rec_map_futures; for (size_t i=0;i all_rec_map_results; map global_rec_map; for (auto& fut:rec_map_futures) { 
all_rec_map_results.push_back(fut.get()); for (auto const& [str,count]:all_rec_map_results.back().thread_local_rec_map) { global_rec_map[str]+=count; } } for (auto const& [str,count]:global_rec_map) { if (count>=2 && str.size()>=3) { rec_list.push_back(str); rec_lookup[str]=rec_list.size()-1; } } global_rec_map.clear(); vector encoding_files_vec; while (!encoding_files_queue.empty()) { encoding_files_vec.push_back(std::move(encoding_files_queue.front())); encoding_files_queue.pop(); } sort(encoding_files_vec.begin(),encoding_files_vec.end(),[](const file_entry& a,const file_entry& b) { return a.size>b.size; }); for (auto& f:encoding_files_vec) { encoding_files_queue.push(std::move(f)); } vector> encoding_futures; for (size_t i=0;i global_payloads_start; size_t current_offset=0; for (int i=0;i final_payloads; final_payloads.resize(current_offset); size_t offset=0; for (int i=0;i(final_payloads.data()+offset),current_file_size); offset+=current_file_size; ccc_file.close(); } fs::remove_all(".temp_ccc"); vector payload_compressed; payload_compressed.resize(final_payloads.size()+final_payloads.size()/3+128); lzma_mt mt_options={}; mt_options.flags=0; mt_options.threads=thread::hardware_concurrency(); mt_options.block_size=max((size_t)8*1024*1024,final_payloads.size()/mt_options.threads); mt_options.timeout=0; mt_options.filters=nullptr; mt_options.check=LZMA_CHECK_CRC64; lzma_options_lzma opt_lzma; if (lzma_lzma_preset(&opt_lzma,compression_ratio)) { cout<<"Error: couldn't initialize LZMA compressor for files archive."<(end-start).count(); if (ret!=LZMA_STREAM_END) { cout<<"Error: couldn't compress files archive."<=original_size) { flags&= ~(0b00000001); payload_total_size=original_size; } else { flags|=0b00000001; payload_total_size=compressed_size; } vector rec_table; for (int i=0;i rec_table_compressed; rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128); strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { 
cout<<"Error: couldn't initialize LZMA compressor for reccurences table."<=original_size) { flags&= ~(0b00000010); rec_table_total_size=original_size; } else { flags|=0b00000010; rec_table_total_size=compressed_size; } vector files_table; for (int i=0;i files_table_compressed; files_table_compressed.resize(files_table.size()+files_table.size()/3+128); strm=LZMA_STREAM_INIT; if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { cout<<"Error: couldn't initialize LZMA compressor for files table."<=original_size) { flags&= ~(0b00000100); files_table_total_size=original_size; } else { flags|=0b00000100; files_table_total_size=compressed_size; } header head; head.sig[0]='C'; head.sig[1]='C'; head.sig[2]='C'; head.flags=flags; head.size_payload=payload_total_size; head.size_rec_table=rec_table_total_size; head.entry_count=files.size(); vector out; for (int i=0;i(out.data()),out.size()); fileout.close(); cout<<"Finished !"<