256 lines
6.1 KiB
C++
256 lines
6.1 KiB
C++
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <regex>
#include <string>
#include <vector>
|
|
using namespace std;
|
|
namespace fs=filesystem;
|
|
// Built-in token table shared by every compressed file.
//
// main() inserts entry i into the matching trie with id 0x80 + i (first 64
// entries only), so the ORDER and COUNT of this list are part of the byte
// format: never reorder or insert in the middle.
// NOTE(review): any decoder presumably carries a copy of this table; keep the
// two in sync (entry 61 was previously misspelled "long gouble").
const std::vector<std::string> tokens = {
    // 0-44: keywords
    "alignas",
    "alignof",
    "auto",
    "bool",
    "break",
    "case",
    "char",
    "const",
    "constexpr",
    "continue",
    "default",
    "do",
    "double",
    "else",
    "enum",
    "extern",
    "false",
    "float",
    "for",
    "goto",
    "if",
    "inline",
    "int",
    "long",
    "nullptr",
    "register",
    "restrict",
    "return",
    "short",
    "signed",
    "sizeof",
    "static",
    "static_assert",
    "struct",
    "switch",
    "thread_local",
    "true",
    "typedef",
    "typeof",
    "typeof_unqual",
    "union",
    "unsigned",
    "void",
    "volatile",
    "while",
    // 45-53: preprocessor directives
    "#include <",
    "#include \"",
    "#ifdef",
    "#elifdef",
    "#elifndef",
    "#define",
    "#undef",
    "#pragma",
    "#endif",
    // 54-63: common multi-word sequences
    "unsigned int",
    "unsigned char",
    "unsigned long",
    "unsigned short",
    "long long",
    "unsigned long long",
    "signed char",
    "long double",
    "const char*",
    "typedef struct"
};
|
|
// A candidate entry for the per-file dictionary.
// Kept as a plain aggregate so it can be brace-initialised as {name, score}.
struct symbol {
    std::string name;  // identifier text as it appears in the source
    int score;         // ranking value; higher sorts first
};
|
|
// One node of the byte-keyed trie used for longest-prefix token matching.
//
// A node owns every child it points to: children are heap-allocated with
// `new` (see insert()) and are released recursively by the destructor, so
// destroying the root frees the whole trie.
struct node {
    std::map<unsigned char, node*> children;  // next byte -> owned child node
    int token_id = -1;                        // id of the token ending here, -1 if none

    node() = default;

    // Copying would leave two nodes owning the same children (double delete).
    node(const node&) = delete;
    node& operator=(const node&) = delete;

    ~node() {
        for (auto& entry : children) {
            delete entry.second;  // recursion depth is bounded by the longest inserted key
        }
    }
};
|
|
// Adds `str` to the trie rooted at `root`, creating any missing nodes along
// the way, and tags the node reached after the last byte with `id`.
// An empty `str` tags `root` itself. `root` must not be null.
void insert(node* root, std::string str, int id) {
    node* cursor = root;
    for (const char ch : str) {
        // The map is keyed by unsigned char, so normalise the byte first.
        const unsigned char key = static_cast<unsigned char>(ch);
        auto slot = cursor->children.find(key);
        if (slot == cursor->children.end()) {
            slot = cursor->children.emplace(key, new node()).first;
        }
        cursor = slot->second;
    }
    cursor->token_id = id;
}
|
|
// Returns `source` with comments, string literals and character literals
// removed, so that only real code is left for identifier counting.
//
// - A block comment, a string literal or a character literal is replaced by a
//   single space, so the code on either side is never glued into one
//   identifier (`int/**/x` stays two words, as it does for a C compiler).
// - A line comment is dropped up to, but not including, its terminating
//   newline; the newline itself is kept.
// - Inside string/character literals a backslash escapes the next character.
// - Character literals are tracked so that a quote inside one (`'"'`) does not
//   open a phantom string. A character literal also ends at a newline, which
//   limits the damage from a stray apostrophe such as a digit separator.
std::string clean_code(std::string source) {
    enum class State { Code, LineComment, BlockComment, String, CharLiteral };

    std::string clean;
    clean.reserve(source.size());

    State state = State::Code;
    const std::size_t n = source.size();

    for (std::size_t i = 0; i < n; ++i) {
        const char c = source[i];
        switch (state) {
        case State::BlockComment:
            if (c == '*' && i + 1 < n && source[i + 1] == '/') {
                state = State::Code;
                ++i;  // consume the '/'
            }
            break;

        case State::LineComment:
            if (c == '\n') {
                state = State::Code;
                clean += c;
            }
            break;

        case State::String:
            if (c == '\\') {
                ++i;  // skip the escaped character
            } else if (c == '"') {
                state = State::Code;
            }
            break;

        case State::CharLiteral:
            if (c == '\\') {
                ++i;  // skip the escaped character
            } else if (c == '\'') {
                state = State::Code;
            } else if (c == '\n') {
                // Unterminated literal: resynchronise at the line break.
                state = State::Code;
                clean += c;
            }
            break;

        case State::Code:
            if (c == '/' && i + 1 < n && source[i + 1] == '/') {
                state = State::LineComment;
                ++i;  // consume the second '/'
            } else if (c == '/' && i + 1 < n && source[i + 1] == '*') {
                state = State::BlockComment;
                ++i;  // consume the '*' so that "/*/" does not close the comment
                clean += ' ';
            } else if (c == '"') {
                state = State::String;
                clean += ' ';
            } else if (c == '\'') {
                state = State::CharLiteral;
                clean += ' ';
            } else {
                clean += c;
            }
            break;
        }
    }
    return clean;
}
|
|
int main(int argc,char **argv) {
|
|
if (argc!=2) {
|
|
cout<<"Usage: ccc <c file>"<<endl;
|
|
return -1;
|
|
}
|
|
string filepath=string(argv[1]);
|
|
if (!fs::exists(filepath)) {
|
|
cout<<"Error: provided file doesn't exist."<<endl;
|
|
return -1;
|
|
}
|
|
ifstream file(filepath,ios::binary);
|
|
if (!file) {
|
|
cout<<"Error: couldn't open provided file."<<endl;
|
|
return -1;
|
|
}
|
|
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
|
|
string cleanc=clean_code(code);
|
|
map<string,int> counts;
|
|
regex symbol_regex("[a-zA-Z_][a-zA-Z0-9_]*");
|
|
auto words_begin=sregex_iterator(cleanc.begin(),cleanc.end(),symbol_regex);
|
|
auto words_end=sregex_iterator();
|
|
for (sregex_iterator i=words_begin;i!=words_end;i++) {
|
|
string match=i->str();
|
|
if (match.length()>2) {
|
|
counts[match]++;
|
|
}
|
|
}
|
|
vector<symbol> leaderboard;
|
|
for (auto const& [name,count]:counts) {
|
|
leaderboard.push_back({name,(int)((name.length()-1)*count-(name.length()+1))});
|
|
}
|
|
sort(leaderboard.begin(),leaderboard.end(),[](const symbol& a,const symbol& b) {
|
|
return a.score>b.score;
|
|
});
|
|
vector<symbol> top64;
|
|
for (int i=0;i<64 && i<leaderboard.size();i++) {
|
|
if (!(leaderboard[i].score<=0)) top64.push_back(leaderboard[i]);
|
|
}
|
|
node root_node;
|
|
for (int i=0;i<tokens.size() && i<64;i++) {
|
|
insert(&root_node,tokens[i],0x80+i);
|
|
}
|
|
for (int i=0;i<top64.size();i++) {
|
|
insert(&root_node,top64[i].name,0xC0+i);
|
|
}
|
|
vector<unsigned char> output;
|
|
output.push_back('C');
|
|
output.push_back('C');
|
|
output.push_back('C');
|
|
output.push_back((unsigned char)top64.size());
|
|
for (const auto& s:top64) {
|
|
output.push_back((unsigned char)s.name.length());
|
|
for (char c:s.name) output.push_back(c);
|
|
}
|
|
for (size_t i=0;i<code.size();) {
|
|
node* curr=&root_node;
|
|
int best_id=-1;
|
|
size_t best_len=0;
|
|
for (size_t j=i;j<code.size();j++) {
|
|
unsigned char c=(unsigned char)code[j];
|
|
if (curr->children.count(c)) {
|
|
curr=curr->children[c];
|
|
if (curr->token_id!=-1) {
|
|
best_id=curr->token_id;
|
|
best_len=(j-i)+1;
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if (best_id!=-1) {
|
|
output.push_back((unsigned char)best_id);
|
|
i+=best_len;
|
|
} else {
|
|
unsigned char c=(unsigned char)code[i];
|
|
if (c<128) {
|
|
output.push_back(c);
|
|
i++;
|
|
} else {
|
|
uint32_t codepoint=0;
|
|
int bytes_to_skip=0;
|
|
if ((c & 0xE0)==0xC0) {
|
|
if (i+1<code.size()) {
|
|
codepoint=((code[i]&0x1F)<<6) | (code[i+1]&0x3F);
|
|
output.push_back(12);
|
|
bytes_to_skip=2;
|
|
}
|
|
} else if ((c & 0xF0)==0xE0) {
|
|
if (i+2<code.size()) {
|
|
codepoint=((code[i]&0x0F)<<12) | ((code[i+1]&0x3F)<<6) | (code[i+2]&0x3F);
|
|
output.push_back(13);
|
|
bytes_to_skip=3;
|
|
}
|
|
} else if ((c & 0xF8)==0xF0) {
|
|
if (i+3<code.size()) {
|
|
codepoint=((code[i]&0x07)<<18) | ((code[i+1]&0x3F)<<12) | ((code[i+2]&0x3F)<<6) | (code[i+3]&0x3F);
|
|
output.push_back(14);
|
|
bytes_to_skip=4;
|
|
}
|
|
}
|
|
if (bytes_to_skip>0) {
|
|
if (bytes_to_skip==2) {
|
|
output.push_back((unsigned char)(codepoint>>8));
|
|
output.push_back((unsigned char)(codepoint));
|
|
} else if (bytes_to_skip==3) {
|
|
output.push_back((unsigned char)(codepoint>>8));
|
|
output.push_back((unsigned char)(codepoint));
|
|
} else if (bytes_to_skip==4) {
|
|
output.push_back((unsigned char)(codepoint>>16));
|
|
output.push_back((unsigned char)(codepoint>>8));
|
|
output.push_back((unsigned char)(codepoint));
|
|
}
|
|
i+=bytes_to_skip;
|
|
} else {
|
|
output.push_back(c);
|
|
i++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|