diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b2a7546 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +ccc diff --git a/ccc.cpp b/ccc.cpp index dbfc340..c8f8cbe 100644 --- a/ccc.cpp +++ b/ccc.cpp @@ -1,8 +1,15 @@ +#include +#include #include +#include +#include #include #include #include +#include +#include using namespace std; +namespace fs=filesystem; const vector tokens={ "alignas", "alignof", @@ -51,118 +58,6 @@ const vector tokens={ "while", "#include <", "#include \"", - "ifdef", - "elifdef", - "elifndef", - "define", - "undef", - "pragma", - "endif", - "+=", - "-=", - "*=", - "/=", - "%=", - "&=", - "|=", - "^=", - "<<=", - ">>=", - "++", - "--", - "<<", - ">>", - "==", - "!=", - "<=", - ">=", - "->", - ";\n", - ", ", - ") {\n", - "unsigned int", - "unsigned long", - "unsigned char", - "unsigned short", - "long long", - "unsigned long long", - "signed char", - "long double", - "static int", - "static unsigned", - "static const", - "static char*", - "const char*", - "const int", - "extern int", - "volatile int", - "char*", - "char *", - "void*", - "void *", - "int *", - "int*", - "char**", - "char **", - "struct ", - "typedef struct", - "typedef enum", - "enum ", - "uint8", - "uint16", - "uint32", - "uint64", - "int8", - "int16", - "int32", - "int64", - "size_t", - "NULL", - ".h>", - ".h\"", - "int i=0;", - "int i = 0;", - "()", - "(", - ")", - "[]", - "[", - "]", - "{}", - "{", - "}", - "<", - ">", - "=", - "!", - "+", - "+", - "*", - "/", - "%", - "&", - "|", - "^", - "~", - ".", - ":", - ";", - "?", - " ", - "\n", - "\t", - "\n ", - "\n\t", - " ", - "uint8_t", - "uint16_t", - "uint32_t", - "uint64_t", - "int8_t", - "int16_t", - "int32_t", - "int64_t", - "int main(", "#ifdef", "#elifdef", "#elifndef", @@ -170,25 +65,191 @@ const vector tokens={ "#undef", "#pragma", "#endif", - "printf(", - "malloc(", - "free(", - ");", - ");\n", - ");\n\t", - ");\n ", - "if (", - "while (", - "} else if (", - "} else {", - "for (", - "switch (", - "&&", - "||", - " && ", - " || ", + "unsigned int", + "unsigned char", + "unsigned long", + "unsigned short", + "long long", + "unsigned long long", + "signed char", + "long gouble", + "const char*", + "typedef struct" }; -int main() { - cout< children; + int token_id=-1; +}; +void insert(node* root,string str,int id) { + node* curr=root; + for (char c:str) { + if (curr->children.find(c)==curr->children.end()) { + curr->children[c]=new node(); + } + curr=curr->children[c]; + } + curr->token_id=id; +} +string clean_code(string source) { + string clean; + bool in_string=false; + bool in_comment_single=false; + bool in_comment_multi=false; + for (size_t i=0;i"<(file)),istreambuf_iterator()); + string cleanc=clean_code(code); + map counts; + regex symbol_regex("[a-zA-Z_][a-zA-Z0-9_]*"); + auto words_begin=sregex_iterator(cleanc.begin(),cleanc.end(),symbol_regex); + auto words_end=sregex_iterator(); + for (sregex_iterator i=words_begin;i!=words_end;i++) { + string match=i->str(); + if (match.length()>2) { + counts[match]++; + } + } + vector leaderboard; + for (auto const& [name,count]:counts) { + leaderboard.push_back({name,(int)((name.length()-1)*count-(name.length()+1))}); + } + sort(leaderboard.begin(),leaderboard.end(),[](const symbol& a,const symbol& b) { + return a.score>b.score; + }); + vector top64; + for (int i=0;i<64 && i output; + output.push_back('C'); + output.push_back('C'); + output.push_back('C'); + output.push_back((unsigned char)top64.size()); + for (const auto& s:top64) { + output.push_back((unsigned char)s.name.length()); + for (char c:s.name) output.push_back(c); + } + for (size_t i=0;ichildren.count(c)) { + curr=curr->children[c]; + if (curr->token_id!=-1) { + best_id=curr->token_id; + best_len=(j-i)+1; + } + } else { + break; + } + } + if (best_id!=-1) { + output.push_back((unsigned char)best_id); + i+=best_len; + } else { + unsigned char c=(unsigned char)code[i]; + if (c<128) { + output.push_back(c); + i++; + } else { + uint32_t codepoint=0; + int bytes_to_skip=0; + if ((c & 0xE0)==0xC0) { + if (i+10) { + if (bytes_to_skip==2) { + output.push_back((unsigned char)(codepoint>>8)); + output.push_back((unsigned char)(codepoint)); + } else if (bytes_to_skip==3) { + output.push_back((unsigned char)(codepoint>>8)); + output.push_back((unsigned char)(codepoint)); + } else if (bytes_to_skip==4) { + output.push_back((unsigned char)(codepoint>>16)); + output.push_back((unsigned char)(codepoint>>8)); + output.push_back((unsigned char)(codepoint)); + } + i+=bytes_to_skip; + } else { + output.push_back(c); + i++; + } + } + } + } return 0; }