second commit

This commit is contained in:
2026-02-03 16:28:35 +01:00
parent 9c2d4090b3
commit 3001621fcd
2 changed files with 193 additions and 131 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
ccc

323
ccc.cpp
View File

@@ -1,8 +1,15 @@
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <regex>
#include <string>
#include <vector>
using namespace std; using namespace std;
namespace fs=filesystem;
const vector<string> tokens={ const vector<string> tokens={
"alignas", "alignas",
"alignof", "alignof",
@@ -51,118 +58,6 @@ const vector<string> tokens={
"while", "while",
"#include <", "#include <",
"#include \"", "#include \"",
"ifdef",
"elifdef",
"elifndef",
"define",
"undef",
"pragma",
"endif",
"+=",
"-=",
"*=",
"/=",
"%=",
"&=",
"|=",
"^=",
"<<=",
">>=",
"++",
"--",
"<<",
">>",
"==",
"!=",
"<=",
">=",
"->",
";\n",
", ",
") {\n",
"unsigned int",
"unsigned long",
"unsigned char",
"unsigned short",
"long long",
"unsigned long long",
"signed char",
"long double",
"static int",
"static unsigned",
"static const",
"static char*",
"const char*",
"const int",
"extern int",
"volatile int",
"char*",
"char *",
"void*",
"void *",
"int *",
"int*",
"char**",
"char **",
"struct ",
"typedef struct",
"typedef enum",
"enum ",
"uint8",
"uint16",
"uint32",
"uint64",
"int8",
"int16",
"int32",
"int64",
"size_t",
"NULL",
".h>",
".h\"",
"int i=0;",
"int i = 0;",
"()",
"(",
")",
"[]",
"[",
"]",
"{}",
"{",
"}",
"<",
">",
"=",
"!",
"+",
"+",
"*",
"/",
"%",
"&",
"|",
"^",
"~",
".",
":",
";",
"?",
" ",
"\n",
"\t",
"\n ",
"\n\t",
" ",
"uint8_t",
"uint16_t",
"uint32_t",
"uint64_t",
"int8_t",
"int16_t",
"int32_t",
"int64_t",
"int main(",
"#ifdef", "#ifdef",
"#elifdef", "#elifdef",
"#elifndef", "#elifndef",
@@ -170,25 +65,191 @@ const vector<string> tokens={
"#undef", "#undef",
"#pragma", "#pragma",
"#endif", "#endif",
"printf(", "unsigned int",
"malloc(", "unsigned char",
"free(", "unsigned long",
");", "unsigned short",
");\n", "long long",
");\n\t", "unsigned long long",
");\n ", "signed char",
"if (", "long gouble",
"while (", "const char*",
"} else if (", "typedef struct"
"} else {",
"for (",
"switch (",
"&&",
"||",
" && ",
" || ",
}; };
// One identifier found in the input file together with its ranking value.
// Kept as a plain aggregate so callers can build it with `{name, score}`.
struct symbol {
    std::string name;  // identifier text exactly as it was matched
    int score=0;       // ranking value assigned by the caller; larger sorts first
};
// Byte-wise trie node. Each edge is labelled with one byte of a token; a node
// whose token_id is not -1 marks the end of a complete token.
//
// A node owns every child reachable through `children` (they are allocated by
// insert() below) and frees them when it is destroyed. Copying is disabled so
// that two nodes can never believe they own the same children.
struct node {
    std::map<unsigned char,node*> children;
    int token_id=-1;  // -1 means "no token ends here"

    node()=default;
    node(const node&)=delete;
    node& operator=(const node&)=delete;
    ~node() {
        for (auto& entry:children) {
            delete entry.second;
        }
    }
};

// Registers `str` in the trie rooted at `root` and tags its final node with
// `id`. Nodes missing along the path are created on demand; inserting the same
// string again simply overwrites the stored id. An empty string tags the root.
//
// root: non-null root of the trie (not checked).
// str:  byte sequence to register; bytes are treated as unsigned values.
// id:   value stored in the terminal node's token_id.
void insert(node* root,const std::string& str,int id) {
    node* curr=root;
    for (const char c:str) {
        // operator[] yields the existing child or a fresh null slot, so one
        // lookup covers both the "present" and the "missing" case.
        node*& slot=curr->children[static_cast<unsigned char>(c)];
        if (slot==nullptr) {
            slot=new node();
        }
        curr=slot;
    }
    curr->token_id=id;
}
// Returns a copy of `source` with comments and string literals stripped, so
// that text inside them is not mistaken for code.
//
//  - `// ...` comments are dropped up to, but not including, the newline.
//  - `/* ... */` comments are replaced by a single space, so the tokens on
//    either side stay separate (`foo/*c*/bar` must not become `foobar`).
//  - "..." string literals are dropped entirely, quotes included; a backslash
//    inside a string escapes the following character.
//  - '...' character literals are copied through verbatim, but their content
//    is not interpreted, so `'"'` or `'/'` cannot open a string or a comment.
//    A character literal is considered finished at the closing quote or at the
//    end of the line, which bounds the damage of a stray apostrophe.
//
// Unterminated comments and strings simply run to the end of the input.
std::string clean_code(const std::string& source) {
    enum class state { code, line_comment, block_comment, string_lit, char_lit };

    const std::size_t n=source.size();
    std::string clean;
    clean.reserve(n);
    state mode=state::code;

    for (std::size_t i=0;i<n;++i) {
        const char c=source[i];
        // '\0' stands in for "no next character"; it is only ever compared
        // against '/' and '*', so a real NUL byte behaves the same way.
        const char next=(i+1<n)?source[i+1]:'\0';

        switch (mode) {
        case state::block_comment:
            if (c=='*' && next=='/') {
                mode=state::code;
                ++i;  // consume the '/'
            }
            break;

        case state::line_comment:
            if (c=='\n') {
                mode=state::code;
                clean+=c;  // the newline itself is not part of the comment
            }
            break;

        case state::string_lit:
            if (c=='\\') {
                ++i;  // skip the escaped character
            } else if (c=='"') {
                mode=state::code;
            }
            break;

        case state::char_lit:
            clean+=c;
            if (c=='\\') {
                if (i+1<n) {
                    clean+=source[++i];  // keep the escaped character as-is
                }
            } else if (c=='\'' || c=='\n') {
                mode=state::code;
            }
            break;

        case state::code:
            if (c=='/' && next=='/') {
                mode=state::line_comment;
                ++i;
            } else if (c=='/' && next=='*') {
                mode=state::block_comment;
                ++i;
                clean+=' ';  // a comment separates tokens like whitespace does
            } else if (c=='"') {
                mode=state::string_lit;
            } else {
                clean+=c;
                if (c=='\'') {
                    mode=state::char_lit;
                }
            }
            break;
        }
    }
    return clean;
}
int main(int argc,char **argv) {
if (argc!=2) {
cout<<"Usage: ccc <c file>"<<endl;
return -1;
}
string filepath=string(argv[1]);
if (!fs::exists(filepath)) {
cout<<"Error: provided file doesn't exist."<<endl;
return -1;
}
ifstream file(filepath,ios::binary);
if (!file) {
cout<<"Error: couldn't open provided file."<<endl;
return -1;
}
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
string cleanc=clean_code(code);
map<string,int> counts;
regex symbol_regex("[a-zA-Z_][a-zA-Z0-9_]*");
auto words_begin=sregex_iterator(cleanc.begin(),cleanc.end(),symbol_regex);
auto words_end=sregex_iterator();
for (sregex_iterator i=words_begin;i!=words_end;i++) {
string match=i->str();
if (match.length()>2) {
counts[match]++;
}
}
vector<symbol> leaderboard;
for (auto const& [name,count]:counts) {
leaderboard.push_back({name,(int)((name.length()-1)*count-(name.length()+1))});
}
sort(leaderboard.begin(),leaderboard.end(),[](const symbol& a,const symbol& b) {
return a.score>b.score;
});
vector<symbol> top64;
for (int i=0;i<64 && i<leaderboard.size();i++) {
if (!(leaderboard[i].score<=0)) top64.push_back(leaderboard[i]);
}
node root_node;
for (int i=0;i<tokens.size() && i<64;i++) {
insert(&root_node,tokens[i],0x80+i);
}
for (int i=0;i<top64.size();i++) {
insert(&root_node,top64[i].name,0xC0+i);
}
vector<unsigned char> output;
output.push_back('C');
output.push_back('C');
output.push_back('C');
output.push_back((unsigned char)top64.size());
for (const auto& s:top64) {
output.push_back((unsigned char)s.name.length());
for (char c:s.name) output.push_back(c);
}
for (size_t i=0;i<code.size();) {
node* curr=&root_node;
int best_id=-1;
size_t best_len=0;
for (size_t j=i;j<code.size();j++) {
unsigned char c=(unsigned char)code[j];
if (curr->children.count(c)) {
curr=curr->children[c];
if (curr->token_id!=-1) {
best_id=curr->token_id;
best_len=(j-i)+1;
}
} else {
break;
}
}
if (best_id!=-1) {
output.push_back((unsigned char)best_id);
i+=best_len;
} else {
unsigned char c=(unsigned char)code[i];
if (c<128) {
output.push_back(c);
i++;
} else {
uint32_t codepoint=0;
int bytes_to_skip=0;
if ((c & 0xE0)==0xC0) {
if (i+1<code.size()) {
codepoint=((code[i]&0x1F)<<6) | (code[i+1]&0x3F);
output.push_back(12);
bytes_to_skip=2;
}
} else if ((c & 0xF0)==0xE0) {
if (i+2<code.size()) {
codepoint=((code[i]&0x0F)<<12) | ((code[i+1]&0x3F)<<6) | (code[i+2]&0x3F);
output.push_back(13);
bytes_to_skip=3;
}
} else if ((c & 0xF8)==0xF0) {
if (i+3<code.size()) {
codepoint=((code[i]&0x07)<<18) | ((code[i+1]&0x3F)<<12) | ((code[i+2]&0x3F)<<6) | (code[i+3]&0x3F);
output.push_back(14);
bytes_to_skip=4;
}
}
if (bytes_to_skip>0) {
if (bytes_to_skip==2) {
output.push_back((unsigned char)(codepoint>>8));
output.push_back((unsigned char)(codepoint));
} else if (bytes_to_skip==3) {
output.push_back((unsigned char)(codepoint>>8));
output.push_back((unsigned char)(codepoint));
} else if (bytes_to_skip==4) {
output.push_back((unsigned char)(codepoint>>16));
output.push_back((unsigned char)(codepoint>>8));
output.push_back((unsigned char)(codepoint));
}
i+=bytes_to_skip;
} else {
output.push_back(c);
i++;
}
}
}
}
return 0; return 0;
} }