commit f6d1f74a45ee4b1518c50686a1bb985fee7abaf0 Author: lolo859 Date: Fri Feb 6 17:56:22 2026 +0100 final new version diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b2a7546 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +ccc diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..6d71709 --- /dev/null +++ b/build.sh @@ -0,0 +1 @@ +g++ ccc.cpp -o ccc -ltree-sitter -ltree-sitter-c -llzma -Ofast -march=native diff --git a/ccc.cpp b/ccc.cpp new file mode 100644 index 0000000..ab8d528 --- /dev/null +++ b/ccc.cpp @@ -0,0 +1,740 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; +namespace fs=filesystem; +const vector CCC_DELIMITER_0_HEAD={0}; +const vector CCC_DELIMITER_1_HEAD={1,0}; +const vector CCC_C_KEYWORD_HEAD={1,1,0,0}; +const vector CCC_MISCELANEOUS_HEAD={1,1,0,1}; +const vector CCC_STRING_INLINE_HEAD={1,1,1,0}; +const vector CCC_REC_TABLE_REF_HEAD={1,1,1,1}; +const vector CCC_STRING_INLINE_END={0,0,0,0,0,0,0,0}; +#define CCC_ADD_COMPONENT(vec,tail) \ + do { \ + auto tmp=tail; \ + vec.insert(vec.end(),tmp.begin(),tmp.end()); \ + } while (0) +#define CCC_ADD_COMPONENT_ALIGNED(vec,tail) \ + do { \ + static_assert(is_same_v>,"vec must be vector"); \ + static_assert(is_same_v>,"tail must be vector"); \ + vec.reserve(vec.size()+tail.size()+8); \ + for (auto b:tail) vec.push_back(b); \ + size_t rem=vec.size()%8; \ + if (rem!=0) { \ + vec.insert(vec.end(),8-rem,false); \ + } \ + } while (0) +struct XXH3HasherString { + size_t operator()(const std::string& s) const { + return static_cast(XXH3_64bits(s.data(),s.size())); + } +}; +const vector delimiter0={ + "{", + "}", + "(", + ")", + "[", + "]", + ",", + "." +}; +const vector delimiter1={ + "{}", + "()", + "[]", + ";" +}; +const vector miscellaneous={ + "!", + "%", + "'", + "*", + "+", + "-", + "/", + ":", + "<", + ">", + "=", + "?", + "^", + "|", + "&", + "~", + "+=", + "-=", + "*=", + "/=", + "%=", + "&=", + "|=", + "^=", + "<<=", + ">>=", + "++", + "--", + "<<", + ">>", + "==", + "!=", + "<=", + ">=", + "->", + "...", + "||", + "&&", + "NULL", + "size_t", + "uint8_t", + "uint16_t", + "uint32_t", + "uint64_t", + "int8_t", + "int16_t", + "int32_t", + "int64_t" +}; +const vector c_keywords={ + "#if", + "#ifdef", + "#ifndef", + "#else", + "#elif", + "#elifdef", + "#elifndef", + "#endif", + "#define", + "#undef", + "#include", + "#error", + "#warning", + "#pragma", + "#line", + "alignas", + "alignof", + "auto", + "bool", + "break", + "case", + "char", + "const", + "constexpr", + "continue", + "default", + "do", + "double", + "else", + "enum", + "extern", + "false", + "float", + "for", + "goto", + "if", + "inline", + "int", + "long", + "nullptr", + "register", + "restrict", + "return", + "short", + "signed", + "sizeof", + "static", + "static_assert", + "struct", + "switch", + "thread_local", + "true", + "typedef", + "typeof", + "typeof_unequal", + "union", + "unsigned", + "void", + "volatile", + "while", + "__asm__", + "__attribute__", + "defined", +}; +struct symbol { + string name; + int score; +}; +#pragma pack(push,1) +struct header { + uint8_t sig[3]; + uint8_t flags; + size_t size_rec_table; + size_t entry_count; + size_t size_payload; +}; +#pragma pack(pop) +struct node { + uint16_t type; + uint32_t start; + uint32_t end; +}; +unordered_map optimized_type_u16_list; +unordered_map type_map; +vector> all_tokens; +map rec_map; +vector rec_list; +unordered_map rec_lookup; +unordered_map c_keyword_lookup; +unordered_map miscelaneous_lookup; +unordered_map delimiter0_lookup; +unordered_map delimiter1_lookup; +bool debug=false; +bool fail_on_warning=false; +static uint16_t next_type_id; +void get_all_nodes(TSNode node,const string &source_code,map &rec_map,size_t index) { + if (ts_node_child_count(node)==0) { + string text=source_code.substr(ts_node_start_byte(node),ts_node_end_byte(node)-ts_node_start_byte(node)); + string type=string(ts_node_type(node)); + if (optimized_type_u16_list.find(type)==optimized_type_u16_list.end()) { + optimized_type_u16_list[type]=next_type_id; + type_map[optimized_type_u16_list.at(type)]=type; + next_type_id++; + } + all_tokens[index].push_back({.type=optimized_type_u16_list[type],.start=ts_node_start_byte(node),.end=ts_node_end_byte(node)}); + if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="escape_sequence" || type=="statement_identifier") { + rec_map[text]++; + } + if (type=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) { + rec_map[text]++; + } + if (type=="comment") { + rec_map[text]=2; + } + } else { + uint32_t child_count=ts_node_child_count(node); + for (uint32_t i=0;i byte_to_bits(unsigned char c) { + vector out; + for (int i=7;i>=0;i--) { + bool enabled=(c>>i)&0x01; + out.push_back(enabled); + } + return out; +} +vector generate_c_keyword(size_t index) { + vector out; + CCC_ADD_COMPONENT(out,CCC_C_KEYWORD_HEAD); + for (int i=5;i>=0;i--) { + bool enabled=(index>>i)&0x01; + out.push_back(enabled); + } + return out; +} +vector generate_rec(size_t index,size_t total_recs) { + vector out; + size_t bits=0; + while (total_recs) { + total_recs>>=1; + ++bits; + } + CCC_ADD_COMPONENT(out,CCC_REC_TABLE_REF_HEAD); + for (int i=bits;i>=0;i--) { + bool enabled=(index>>i)&0x01; + out.push_back(enabled); + } + return out; +} +vector generate_delimiter0(size_t index) { + vector out; + CCC_ADD_COMPONENT(out,CCC_DELIMITER_0_HEAD); + for (int i=2;i>=0;i--) { + bool enabled=(index>>i)&0x01; + out.push_back(enabled); + } + return out; +} +vector generate_delimiter1(size_t index) { + vector out; + CCC_ADD_COMPONENT(out,CCC_DELIMITER_1_HEAD); + for (int i=1;i>=0;i--) { + bool enabled=(index>>i)&0x01; + out.push_back(enabled); + } + return out; +} +vector generate_miscellaneous(size_t index) { + vector out; + CCC_ADD_COMPONENT(out,CCC_MISCELANEOUS_HEAD); + for (int i=5;i>=0;i--) { + bool enabled=(index>>i)&0x01; + out.push_back(enabled); + } + return out; +} +vector generate_string_content(string str) { + vector out; + CCC_ADD_COMPONENT(out,CCC_STRING_INLINE_HEAD); + for (auto c:str) { + CCC_ADD_COMPONENT(out,byte_to_bits(c)); + } + CCC_ADD_COMPONENT(out,CCC_STRING_INLINE_END); + return out; +} +void print_debug(string text) { + if (debug==true) { + cout< process_file_nodes(vector *nodes,string code) { + vector out; + for (int i=0;isize();i++) { + node n=nodes->at(i); + string type=type_map[n.type]; + string text=code.substr(n.start,n.end-n.start); + if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") { + auto it=rec_lookup.find(text); + if (it==rec_lookup.end()) { + CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text)); + print_debug("string ("+type+"): "+text); + } else { + size_t index=it->second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); + print_debug("rec_table for string ("+type+"): "+text); + } + } else if (type=="primitive_type" || type=="type_identifier") { + auto it=c_keyword_lookup.find(text); + if (it!=c_keyword_lookup.end()) { + size_t index=it->second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index)); + print_debug("type found in c keyword: "+text); + } else { + auto it=rec_lookup.find(text); + if (it==rec_lookup.end()) { + if (!text.empty()) { + CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text)); + print_debug("string for type ("+type+"): "+text); + } else { + cout<<"Warning: type node is empty: "<second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); + print_debug("rec_table for string for type ("+type+"): "+text); + } + } + } else if (delimiter0_lookup.find(type)!=delimiter0_lookup.end() || delimiter1_lookup.find(type)!=delimiter1_lookup.end() || type=="\"") { + string insert; + if (type=="(" && i+1size()) { + if (type_map[nodes->at(i+1).type]==")") { + insert="()"; + i++; + } else { + insert="("; + } + } else if (type=="[" && i+1size()) { + if (type_map[nodes->at(i+1).type]=="]") { + insert="[]"; + i++; + } else { + insert="["; + } + } else if (type=="{" && i+1size()) { + if (type_map[nodes->at(i+1).type]=="}") { + insert="{}"; + i++; + } else { + insert="{"; + } + } else { + insert=type; + } + auto it=delimiter0_lookup.find(insert); + if (it!=delimiter0_lookup.end()) { + size_t index=it->second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_delimiter0(index)); + print_debug("delimiter 0: "+insert); + } else { + if (insert!="{}" && insert!="\"") { + auto it=delimiter1_lookup.find(insert); + if (it!=delimiter1_lookup.end()) { + size_t index=it->second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_delimiter1(index)); + print_debug("delimiter 1: "+insert); + } else { + cout<<"Warning: unknow delimiter, that shouldn't happen: "<second; + CCC_ADD_COMPONENT(out,generate_delimiter1(index)); + vector temp={0}; + CCC_ADD_COMPONENT_ALIGNED(out,temp); + print_debug("delimiter 1: "+insert); + } else { + cout<<"Warning: unknow delimiter, that shouldn't happen: "<second; + CCC_ADD_COMPONENT(out,generate_delimiter1(index)); + vector temp={1}; + CCC_ADD_COMPONENT_ALIGNED(out,temp); + print_debug("delimiter 1: "+insert); + } else { + cout<<"Warning: unknow delimiter, that shouldn't happen: "<second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index)); + print_debug("c keyword: "+type); + } else { + cout<<"Warning: unknow C keyword, that shouldn't happen: "<second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index)); + print_debug("c keyword: "+type); + } else { + auto it=rec_lookup.find(text); + if (it==rec_lookup.end()) { + if (!text.empty()) { + CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text)); + print_debug("string for c keyword ("+type+"): "+text); + } else { + cout<<"Warning: C keyword is empty: "<second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); + print_debug("rec_table for string for c keyword ("+type+"): "+text); + } + } + } + } else if (miscelaneous_lookup.find(type)!=miscelaneous_lookup.end()) { + auto it=miscelaneous_lookup.find(type); + if (it!=miscelaneous_lookup.end()) { + size_t index=it->second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_miscellaneous(index)); + print_debug("miscellaneous: "+type); + } else { + cout<<"Warning: unknow miscellaneous, that shouldn't happen: "<second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); + print_debug("rec_table for comment"); + } + } else { + auto it=rec_lookup.find(type); + if (it==rec_lookup.end()) { + if (!text.empty()) { + CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text)); + print_debug("string for unknow node ("+type+"): "+text); + } else { + cout<<"Warning: unknow node is empty: "<second; + CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size())); + print_debug("rec_table for string for unknow node ("+type+"): "+text); + } + } + } + vector payload_bytes; + unsigned char current=0; + size_t bit_index=0; + for (bool b:out) { + current|=(b<<(7-bit_index)); + bit_index++; + if (bit_index==8) { + payload_bytes.push_back(current); + current=0; + bit_index=0; + } + } + if (bit_index!=0) { + payload_bytes.push_back(current); + } + return payload_bytes; +} +void construct_rec_table(vector &files_content,vector files_names) { + for (int i=0;i=2 and s.first.size()>=3) { + rec_list.push_back(s.first); + } + } + for (int i=0;i files; + for (int i=1;i files_content; + for (int i=0;i(file)),istreambuf_iterator()); + files_content.push_back(code); + cout< files_archive; + vector payloads_size; + vector payloads_start; + for (int i=0;i payload_compressed; + payload_compressed.resize(files_archive.size()+files_archive.size()/3+128); + lzma_stream strm=LZMA_STREAM_INIT; + if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { + cout<<"Error: couldn't initialize LZMA compressor for file archive."<=original_size) { + flags&= ~(0b00000001); + payload_total_size=original_size; + } else { + flags|=0b00000001; + payload_total_size=compressed_size; + } + vector rec_table; + for (int i=0;i rec_table_compressed; + rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128); + strm=LZMA_STREAM_INIT; + if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { + cout<<"Error: couldn't initialize LZMA compressor for reccurences table."<=original_size) { + flags&= ~(0b00000010); + rec_table_total_size=original_size; + } else { + flags|=0b00000010; + rec_table_total_size=compressed_size; + } + vector files_table; + for (int i=0;i files_table_compressed; + files_table_compressed.resize(files_table.size()+files_table.size()/3+128); + strm=LZMA_STREAM_INIT; + if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) { + cout<<"Error: couldn't initialize LZMA compressor for files table."<=original_size) { + flags&= ~(0b00000100); + files_table_total_size=original_size; + } else { + flags|=0b00000100; + files_table_total_size=compressed_size; + } + header head; + head.sig[0]='C'; + head.sig[1]='C'; + head.sig[2]='C'; + head.flags=flags; + head.size_payload=payload_total_size; + head.size_rec_table=rec_table_total_size; + head.entry_count=files.size(); + vector out; + for (int i=0;i(out.data()),out.size()); + fileout.close(); + cout<<"Finished !"<