// Source-code tokenizer/compressor (single translation unit).
//
// NOTE(review): this text appears to have lost every span that sat between a
// '<' and a later '>' (include targets, template arguments, several loop
// conditions, most of clean_code and the opening of the driver function).
// It will not compile as shown; restore it from the original before editing.
// The notes below describe only what is still visible.
//
// tokens      - table of string literals: C keywords, preprocessor directive
//               prefixes and multi-word type phrases.
//               NOTE(review): "long gouble" looks like a typo for
//               "long double" - confirm and fix in the literal.
// symbol      - an identifier name paired with an integer score.
// node        - prefix-tree node: `children` is indexed by a char and holds
//               node pointers (see insert); token_id is -1 unless a string
//               inserted so far ends at this node.
// insert      - walks `str` one char at a time from `root`, creating missing
//               child nodes with `new`, and stores `id` on the final node.
//               NOTE(review): no matching delete is visible for these nodes.
// clean_code  - declares an output string and three state flags (in string,
//               in single-line comment, in multi-line comment); the rest of
//               the body is not visible here.
//
// Driver code (function header not visible; presumably main - confirm):
//   1. Reads a stream `file` into the string `code` and passes it through
//      clean_code to get `cleanc`.
//   2. Counts every match of [a-zA-Z_][a-zA-Z0-9_]* in `cleanc` that is longer
//      than 2 characters.
//   3. Scores each name as (length-1)*count-(length+1) and sorts the
//      leaderboard by descending score; the following loop is bounded by
//      i<64 and fills `top64` (remainder of that loop is not visible).
//   4. Starts `output` with three 'C' bytes, one byte holding top64.size(),
//      then for each entry one length byte followed by the name's characters.
//   5. Scans `code` (not `cleanc`): follows prefix-tree children char by
//      char, remembering the id and length of the longest entry matched so
//      far. On a match it emits the id as one byte and advances by the matched
//      length. Otherwise bytes below 128 are copied through unchanged, and
//      bytes >= 128 go through a multi-byte decode whose details are only
//      partly visible: when bytes_to_skip is 2 or 3 the low 16 bits of
//      `codepoint` are emitted as two bytes, when it is 4 the low 24 bits as
//      three bytes; if no sequence was recognised the raw byte is copied.
//   NOTE(review): nothing visible writes `output` to a file or stream before
//   `return 0` - confirm whether that step is missing or was never written.
#include #include #include #include #include #include #include #include #include #include using namespace std; namespace fs=filesystem; const vector tokens={ "alignas", "alignof", "auto", "bool", "break", "case", "char", "const", "constexpr", "continue", "default", "do", "double", "else", "enum", "extern", "false", "float", "for", "goto", "if", "inline", "int", "long", "nullptr", "register", "restrict", "return", "short", "signed", "sizeof", "static", "static_assert", "struct", "switch", "thread_local", "true", "typedef", "typeof", "typeof_unqual", "union", "unsigned", "void", "volatile", "while", "#include <", "#include \"", "#ifdef", "#elifdef", "#elifndef", "#define", "#undef", "#pragma", "#endif", "unsigned int", "unsigned char", "unsigned long", "unsigned short", "long long", "unsigned long long", "signed char", "long gouble", "const char*", "typedef struct" }; struct symbol { string name; int score; }; struct node { map children; int token_id=-1; }; void insert(node* root,string str,int id) { node* curr=root; for (char c:str) { if (curr->children.find(c)==curr->children.end()) { curr->children[c]=new node(); } curr=curr->children[c]; } curr->token_id=id; } string clean_code(string source) { string clean; bool in_string=false; bool in_comment_single=false; bool in_comment_multi=false; for (size_t i=0;i"<(file)),istreambuf_iterator()); string cleanc=clean_code(code); map counts; regex symbol_regex("[a-zA-Z_][a-zA-Z0-9_]*"); auto words_begin=sregex_iterator(cleanc.begin(),cleanc.end(),symbol_regex); auto words_end=sregex_iterator(); for (sregex_iterator i=words_begin;i!=words_end;i++) { string match=i->str(); if (match.length()>2) { counts[match]++; } } vector leaderboard; for (auto const& [name,count]:counts) { leaderboard.push_back({name,(int)((name.length()-1)*count-(name.length()+1))}); } sort(leaderboard.begin(),leaderboard.end(),[](const symbol& a,const symbol& b) { return a.score>b.score; }); vector top64; for (int i=0;i<64 && i output; 
output.push_back('C'); output.push_back('C'); output.push_back('C'); output.push_back((unsigned char)top64.size()); for (const auto& s:top64) { output.push_back((unsigned char)s.name.length()); for (char c:s.name) output.push_back(c); } for (size_t i=0;ichildren.count(c)) { curr=curr->children[c]; if (curr->token_id!=-1) { best_id=curr->token_id; best_len=(j-i)+1; } } else { break; } } if (best_id!=-1) { output.push_back((unsigned char)best_id); i+=best_len; } else { unsigned char c=(unsigned char)code[i]; if (c<128) { output.push_back(c); i++; } else { uint32_t codepoint=0; int bytes_to_skip=0; if ((c & 0xE0)==0xC0) { if (i+10) { if (bytes_to_skip==2) { output.push_back((unsigned char)(codepoint>>8)); output.push_back((unsigned char)(codepoint)); } else if (bytes_to_skip==3) { output.push_back((unsigned char)(codepoint>>8)); output.push_back((unsigned char)(codepoint)); } else if (bytes_to_skip==4) { output.push_back((unsigned char)(codepoint>>16)); output.push_back((unsigned char)(codepoint>>8)); output.push_back((unsigned char)(codepoint)); } i+=bytes_to_skip; } else { output.push_back(c); i++; } } } } return 0; }