diff --git a/build.sh b/build.sh index 145bd29..6d71709 100755 --- a/build.sh +++ b/build.sh @@ -1 +1 @@ -g++ ccc.cpp -o ccc -ltree-sitter -ltree-sitter-c -llzma +g++ ccc.cpp -o ccc -ltree-sitter -ltree-sitter-c -llzma -Ofast -march=native diff --git a/ccc.cpp b/ccc.cpp index a6fa247..29a94b0 100644 --- a/ccc.cpp +++ b/ccc.cpp @@ -14,56 +14,35 @@ #include using namespace std; namespace fs=filesystem; -const vector CCC_C_KEYYORD_HEAD {0,0,0}; -const vector CCC_SPACE {0,1,1,1,0,0,1}; -const vector CCC_PREPROCESSOR_CONDITIONAL_IF {0,0,1,0,0,0}; -const vector CCC_PREPROCESSOR_CONDITIONAL_IFDEF {0,0,1,0,0,1}; -const vector CCC_PREPROCESSOR_CONDITIONAL_IFNDEF {0,0,1,0,1,0}; -const vector CCC_PREPROCESSOR_CONDITIONAL_ELSE {0,0,1,0,1,1}; -const vector CCC_PREPROCESSOR_CONDITIONAL_ELIF {0,0,1,1,0,0}; -const vector CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF {0,0,1,1,0,1}; -const vector CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF {0,0,1,1,1,0}; -const vector CCC_PREPROCESSOR_CONDITIONAL_ENDIF {0,0,1,1,1,1}; -const vector CCC_PREPROCESSOR_OTHER_DEFINE {0,1,0,0,0,0}; -const vector CCC_PREPROCESSOR_OTHER_UNDEF {0,1,0,0,0,1}; -const vector CCC_PREPROCESSOR_OTHER_INCLUDE {0,1,0,0,1,0}; -const vector CCC_PREPROCESSOR_OTHER_ERROR {0,1,0,0,1,1}; -const vector CCC_PREPROCESSOR_OTHER_WARNING {0,1,0,1,0,0}; -const vector CCC_PREPROCESSOR_OTHER_PRAGMA {0,1,0,1,0,1}; -const vector CCC_PREPROCESSOR_OTHER_LINE {0,1,0,1,1,0}; -const vector CCC_QUOTE {0,1,0,1,1,1}; -const vector CCC_DELIMITER_HEAD {0,1,1}; -const vector CCC_OTHER_GRAMMAR_HEAD {1,0,0}; -const vector CCC_MISCELLANEOUS_HEAD {1,0,1}; -const vector CCC_REC_TABLE_REF_HEAD {1,1,0}; -const vector CCC_STRING_ASCII {1,1,1,0}; -const vector CCC_STRING_UTF8 {1,1,1,1}; -const vector CCC_STRING_END_ASCII {0,0,0,0,0,0,0}; -const vector CCC_STRING_END_UTF8 {0,0,0,0,0,0,0,0}; +const vector CCC_DELIMITER_0_HEAD={0}; +const vector CCC_DELIMITER_1_HEAD={1,0}; +const vector CCC_C_KEYWORD_HEAD={1,1,0,0}; +const vector CCC_MISCELANEOUS_HEAD={1,1,0,1}; +const vector CCC_STRING_INLINE_HEAD={1,1,1,0}; +const vector CCC_REC_TABLE_REF_HEAD={1,1,1,1}; +const vector CCC_STRING_INLINE_END={0,0,0,0,0,0,0,0}; #define CCC_ADD_COMPOMENT(vec,tail) \ do { \ auto tmp=tail; \ vec.insert(vec.end(),tmp.begin(),tmp.end()); \ } while (0) -const vector delimiter={ - "\n", - "\t", +const vector delimiter0={ "{", "}", "(", ")", "[", "]", - " ", - "{}", - "()", - "[]", - "", - ";", ",", "." }; -const vector other_grammer={ +const vector delimiter1={ + "{}", + "()", + "[]", + ";" +}; +const vector miscellaneous={ "!", "%", "'", @@ -79,9 +58,7 @@ const vector other_grammer={ "^", "|", "&", - "~" -}; -const vector miscellaneous={ + "~", "+=", "-=", "*=", @@ -116,6 +93,21 @@ const vector miscellaneous={ "int64_t" }; const vector c_keywords={ + "#if", + "#ifdef", + "#ifndef", + "#else", + "#elif", + "#elifdef", + "#elifndef", + "#endif", + "#define", + "#undef", + "#include", + "#error", + "#warning", + "#pragma", + "#line", "alignas", "alignof", "auto", @@ -162,7 +154,8 @@ const vector c_keywords={ "volatile", "while", "__asm__", - "__attribute__" + "__attribute__", + "defined", }; struct symbol { string name; @@ -210,17 +203,9 @@ vector byte_to_bits(unsigned char c) { } return out; } -vector ascii_to_bits(unsigned char c) { - vector out; - for (int i=6;i>=0;i--) { - bool enabled=(c>>i)&0x01; - out.push_back(enabled); - } - return out; -} vector generate_c_keyword(size_t index) { vector out; - CCC_ADD_COMPOMENT(out,CCC_C_KEYYORD_HEAD); + CCC_ADD_COMPOMENT(out,CCC_C_KEYWORD_HEAD); for (int i=5;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); @@ -241,19 +226,19 @@ vector generate_rec(size_t index,size_t total_recs) { } return out; } -vector generate_delimiter(size_t index) { +vector generate_delimiter0(size_t index) { vector out; - CCC_ADD_COMPOMENT(out,CCC_DELIMITER_HEAD); - for (int i=3;i>=0;i--) { + CCC_ADD_COMPOMENT(out,CCC_DELIMITER_0_HEAD); + for (int i=2;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } return out; } -vector generate_other_grammar(size_t index) { +vector generate_delimiter1(size_t index) { vector out; - CCC_ADD_COMPOMENT(out,CCC_OTHER_GRAMMAR_HEAD); - for (int i=3;i>=0;i--) { + CCC_ADD_COMPOMENT(out,CCC_DELIMITER_1_HEAD); + for (int i=1;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } @@ -261,8 +246,8 @@ vector generate_other_grammar(size_t index) { } vector generate_miscellaneous(size_t index) { vector out; - CCC_ADD_COMPOMENT(out,CCC_MISCELLANEOUS_HEAD); - for (int i=4;i>=0;i--) { + CCC_ADD_COMPOMENT(out,CCC_MISCELANEOUS_HEAD); + for (int i=5;i>=0;i--) { bool enabled=(index>>i)&0x01; out.push_back(enabled); } @@ -270,26 +255,11 @@ vector generate_miscellaneous(size_t index) { } vector generate_string_content(string str) { vector out; - bool is_utf8=false; + CCC_ADD_COMPOMENT(out,CCC_STRING_INLINE_HEAD); for (auto c:str) { - if (c>127) { - is_utf8=true; - break; - } - } - if (is_utf8) { - CCC_ADD_COMPOMENT(out,CCC_STRING_UTF8); - for (auto c:str) { - CCC_ADD_COMPOMENT(out,byte_to_bits(c)); - } - CCC_ADD_COMPOMENT(out,CCC_STRING_END_UTF8); - } else { - CCC_ADD_COMPOMENT(out,CCC_STRING_ASCII); - for (auto c:str) { - CCC_ADD_COMPOMENT(out,ascii_to_bits(c)); - } - CCC_ADD_COMPOMENT(out,CCC_STRING_END_ASCII); + CCC_ADD_COMPOMENT(out,byte_to_bits(c)); } + CCC_ADD_COMPOMENT(out,CCC_STRING_INLINE_END); return out; } void print_debug(string text) { @@ -301,148 +271,143 @@ vector process_file_nodes(vector *nodes,string code,vecto vector out; for (int i=0;isize();i++) { string type=string(ts_node_type(nodes->at(i))); - string supertext=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); - if (type=="#if") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IF); - print_debug("if"); - } else if (type=="#ifdef") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFDEF); - print_debug("ifdef"); - } else if (type=="#ifndef") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFNDEF); - print_debug("ifndef"); - } else if (type=="#else") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELSE); - print_debug("else"); - } else if (type=="#elif") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIF); - print_debug("elif"); - } else if (type=="#elifdef") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF); - print_debug("elifdef"); - } else if (type=="#elifndef") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF); - print_debug("elifndef"); - } else if (type=="#endif") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ENDIF); - print_debug("endif"); - } else if (type=="#define") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_DEFINE); - print_debug("define"); - } else if (type=="#undef") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_UNDEF); - print_debug("undef"); - } else if (type=="#include") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_INCLUDE); - print_debug("include"); - } else if (type=="#error") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_ERROR); - print_debug("error"); - } else if (type=="#warning") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_WARNING); - print_debug("warning"); - } else if (type=="#pragma" || (type=="preproc_directive" && supertext=="#pragma")) { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_PRAGMA); - print_debug("pragma"); - } else if (type=="#line") { - CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_LINE); - print_debug("line"); - } else if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") { - string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); + string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); + if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") { auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { - if (!text.empty()) { - string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); - CCC_ADD_COMPOMENT(out,generate_string_content(text)); - print_debug("string ("+type+"): "+text); - } else { - auto it=find(delimiter.begin(),delimiter.end(),""); - size_t index=distance(delimiter.begin(),it); - CCC_ADD_COMPOMENT(out,generate_delimiter(index)); - print_debug("delimiter for empty string"); - } + string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); + CCC_ADD_COMPOMENT(out,generate_string_content(text)); + print_debug("string ("+type+"): "+text); } else { size_t index=distance(rec_list.begin(),it); CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size())); print_debug("rec_table for string ("+type+"): "+text); } - } else if (type=="primitive_type") { + } else if (type=="primitive_type" || type=="type_identifier") { string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); auto it=find(c_keywords.begin(),c_keywords.end(),text); if (it!=c_keywords.end()) { size_t index=distance(c_keywords.begin(),it); CCC_ADD_COMPOMENT(out,generate_c_keyword(index)); - print_debug("primitive_type: "+text); + print_debug("type found in c keyword: "+text); } else { auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { if (!text.empty()) { CCC_ADD_COMPOMENT(out,generate_string_content(text)); - print_debug("string ("+type+"): "+text); + print_debug("string for type ("+type+"): "+text); } else { - cout<<"Error: provided primitive is empty: "<size()) { if (string(ts_node_type(nodes->at(i+1)))==")") { - text="()"; + insert="()"; i++; } else { - text="("; + insert="("; } } else if (type=="[" && i+1size()) { if (string(ts_node_type(nodes->at(i+1)))=="]") { - text="[]"; + insert="[]"; i++; } else { - text="["; + insert="["; } } else if (type=="{" && i+1size()) { if (string(ts_node_type(nodes->at(i+1)))=="}") { - text="{}"; + insert="{}"; i++; } else { - text="{"; + insert="{"; } } else { - text=type; + insert=type; } - auto it=find(delimiter.begin(),delimiter.end(),text); - if (it!=delimiter.end()) { - size_t index=distance(delimiter.begin(),it); - CCC_ADD_COMPOMENT(out,generate_delimiter(index)); - print_debug("delimiter: "+text); + auto it=find(delimiter0.begin(),delimiter0.end(),insert); + if (it!=delimiter0.end()) { + size_t index=distance(delimiter0.begin(),it); + CCC_ADD_COMPOMENT(out,generate_delimiter0(index)); + print_debug("delimiter 0: "+insert); } else { - cout<<"Error: unknow delimiter, that shouldn't happen: "< process_file_nodes(vector *nodes,string code,vecto print_debug("miscellaneous: "+type); } else { cout<<"Error: unknow miscellaneous, that shouldn't happen: "<at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); auto it=find(rec_list.begin(),rec_list.end(),text); if (it==rec_list.end()) { - cout<<"Error: comment in reccurences map not found: "<size()) { - if (string(ts_node_type(nodes->at(i+1)))=="\"") { - auto it=find(delimiter.begin(),delimiter.end(),""); - size_t index=distance(delimiter.begin(),it); - CCC_ADD_COMPOMENT(out,generate_delimiter(index)); - print_debug("double quotes mark, inserting delimiter for empty string"); - i++; - } else { - CCC_ADD_COMPOMENT(out,CCC_QUOTE); - print_debug("single quote mark"); - } - } } else { - string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i))); - cout<<"Error: unknow node type: "< payload_bytes; @@ -517,6 +487,7 @@ void construct_rec_table(vector &files_content,vector files_name } } int main(int argc,char **argv) { + cout< typedef static unsigned char HEY; // hello -// hello -// hello -// hello -// hello -// hello int main() { - hello[]="hello"; + char hello[]="hello"; HEY res=8; if (res!=9) { printf(hello); diff --git a/linux_sources.tar.xz b/linux_sources.tar.xz new file mode 100644 index 0000000..7ab91d5 Binary files /dev/null and b/linux_sources.tar.xz differ diff --git a/test.ccc b/test.ccc index d33b2bf..f5128a8 100644 Binary files a/test.ccc and b/test.ccc differ diff --git a/test.py b/test.py new file mode 100644 index 0000000..902bfb2 --- /dev/null +++ b/test.py @@ -0,0 +1,69 @@ +import os +import subprocess +import time + +def get_source_files(root_dir): + """Récupère les fichiers et calcule la taille totale.""" + source_files = [] + total_size = 0 + for root, _, files in os.walk(root_dir): + for file in files: + if file.endswith(('.c', '.h')) and len(source_files)<10000: + path = os.path.join(root, file) + source_files.append(path) + total_size += os.path.getsize(path) + return source_files, total_size + +def main(): + target_dir = "linux" + if not os.path.exists(target_dir): + print(f"Erreur: Le dossier {target_dir} n'existe pas.") + return + + print(f"--- Analyse de {target_dir} ---") + files, total_raw_size = get_source_files(target_dir) + raw_mo = total_raw_size / (1024 * 1024) + print(f"Fichiers trouvés : {len(files)}") + print(f"Taille totale brute : {raw_mo:.2f} Mo") + + # 1. Compression avec TAR + print("\n--- Lancement de TAR -cJf (XZ) ---") + start_tar = time.time() + tar_cmd = ["tar", "-cJf", "linux_sources.tar.xz", "--files-from=-"] + process_tar = subprocess.Popen(tar_cmd, stdin=subprocess.PIPE) + process_tar.communicate(input="\n".join(files).encode()) + end_tar = time.time() + + # 2. Compression avec CCC + print("\n--- Lancement de CCC (Output temps réel) ---") + print("-" * 40) + start_ccc = time.time() + try: + # On laisse stdout et stderr par défaut pour voir l'output de CCC + subprocess.run(["./ccc"] + files, check=True) + except subprocess.CalledProcessError as e: + print(f"\nErreur fatale CCC : {e}") + except OSError as e: + print(f"\nErreur système (trop de fichiers ?) : {e}") + return + end_ccc = time.time() + print("-" * 40) + + # 3. Calculs finaux + print("\n" + "="*40) + print(f" RÉSULTATS (Source: {raw_mo:.2f} Mo)") + print("="*40) + + for name, filename in [("TAR.XZ", "linux_sources.tar.xz"), ("CCC", "test.ccc")]: + if os.path.exists(filename): + size_mo = os.path.getsize(filename) / (1024 * 1024) + ratio = (size_mo / raw_mo) * 100 + print(f"{name:10} : {size_mo:8.2f} Mo ({ratio:5.2f}% du total)") + else: + print(f"{name:10} : Non généré") + + print(f"\nTemps TAR : {end_tar - start_tar:.2f}s") + print(f"Temps CCC : {end_ccc - start_ccc:.2f}s") + +if __name__ == "__main__": + main()