// File: ccc/ccc.cpp (C++, 259 lines, 6.2 KiB)

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <regex>
#include <string>
#include <vector>
using namespace std;
namespace fs=filesystem;
const vector<string> tokens={
"alignas",
"alignof",
"auto",
"bool",
"break",
"case",
"char",
"const",
"constexpr",
"continue",
"default",
"do",
"double",
"else",
"enum",
"extern",
"false",
"float",
"for",
"goto",
"if",
"inline",
"int",
"long",
"nullptr",
"register",
"restrict",
"return",
"short",
"signed",
"sizeof",
"static",
"static_assert",
"struct",
"switch",
"thread_local",
"true",
"typedef",
"typeof",
"typeof_unqual",
"union",
"unsigned",
"void",
"volatile",
"while",
"#include <",
"#include \"",
"#ifdef",
"#elifdef",
"#elifndef",
"#define",
"#undef",
"#pragma",
"#endif",
"unsigned int",
"unsigned char",
"unsigned long",
"unsigned short",
"long long",
"unsigned long long",
"signed char",
"long gouble",
"const char*",
"typedef struct"
};
struct symbol {
string name;
int score;
};
struct node {
map<unsigned char,node*> children;
int token_id=-1;
};
void insert(node* root,string str,int id) {
node* curr=root;
for (char c:str) {
if (curr->children.find(c)==curr->children.end()) {
curr->children[c]=new node();
}
curr=curr->children[c];
}
curr->token_id=id;
}
string clean_code(string source) {
string clean;
bool in_string=false;
bool in_comment_single=false;
bool in_comment_multi=false;
for (size_t i=0;i<source.size();++i) {
if (in_comment_multi) {
if (source[i]=='*' && i+1<source.size() && source[i+1]=='/') {
in_comment_multi=false;
i++;
}
continue;
}
if (in_comment_single) {
if (source[i]=='\n') {
in_comment_single=false;
} else {
continue;
}
}
if (in_string) {
if (source[i]=='\\') {i++;continue;}
if (source[i]=='"') in_string=false;
continue;
}
if (source[i]=='/' && i+1<source.size()) {
if (source[i+1]=='/') {in_comment_single=true;i++;continue;}
if (source[i+1]=='*') {in_comment_multi=true;i++;continue;}
}
if (source[i]=='"') {
in_string=true;
continue;
}
clean+=source[i];
}
return clean;
}
int main(int argc,char **argv) {
if (argc!=2) {
cout<<"Usage: ccc <c file>"<<endl;
return -1;
}
string filepath=string(argv[1]);
if (!fs::exists(filepath)) {
cout<<"Error: provided file doesn't exist."<<endl;
return -1;
}
ifstream file(filepath,ios::binary);
if (!file) {
cout<<"Error: couldn't open provided file."<<endl;
return -1;
}
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
string cleanc=clean_code(code);
map<string,int> counts;
regex symbol_regex("[a-zA-Z_][a-zA-Z0-9_]*");
auto words_begin=sregex_iterator(cleanc.begin(),cleanc.end(),symbol_regex);
auto words_end=sregex_iterator();
for (sregex_iterator i=words_begin;i!=words_end;i++) {
string match=i->str();
if (match.length()>2) {
counts[match]++;
}
}
vector<symbol> leaderboard;
for (auto const& [name,count]:counts) {
leaderboard.push_back({name,(int)((name.length()-1)*count-(name.length()+1))});
}
sort(leaderboard.begin(),leaderboard.end(),[](const symbol& a,const symbol& b) {
return a.score>b.score;
});
vector<symbol> top64;
for (int i=0;i<64 && i<leaderboard.size();i++) {
if (!(leaderboard[i].score<=0)) top64.push_back(leaderboard[i]);
}
node root_node;
for (int i=0;i<tokens.size() && i<64;i++) {
insert(&root_node,tokens[i],0x80+i);
}
for (int i=0;i<top64.size();i++) {
insert(&root_node,top64[i].name,0xC0+i);
}
vector<unsigned char> output;
output.push_back('C');
output.push_back('C');
output.push_back('C');
output.push_back((unsigned char)top64.size());
for (const auto& s:top64) {
output.push_back((unsigned char)s.name.length());
for (char c:s.name) output.push_back(c);
}
for (size_t i=0;i<code.size();) {
node* curr=&root_node;
int best_id=-1;
size_t best_len=0;
for (size_t j=i;j<code.size();j++) {
unsigned char c=(unsigned char)code[j];
if (curr->children.count(c)) {
curr=curr->children[c];
if (curr->token_id!=-1) {
best_id=curr->token_id;
best_len=(j-i)+1;
}
} else {
break;
}
}
if (best_id!=-1) {
output.push_back((unsigned char)best_id);
i+=best_len;
} else {
unsigned char c=(unsigned char)code[i];
if (c<128) {
output.push_back(c);
i++;
} else {
uint32_t codepoint=0;
int bytes_to_skip=0;
if ((c & 0xE0)==0xC0) {
if (i+1<code.size()) {
codepoint=((code[i]&0x1F)<<6) | (code[i+1]&0x3F);
output.push_back(12);
bytes_to_skip=2;
}
} else if ((c & 0xF0)==0xE0) {
if (i+2<code.size()) {
codepoint=((code[i]&0x0F)<<12) | ((code[i+1]&0x3F)<<6) | (code[i+2]&0x3F);
output.push_back(13);
bytes_to_skip=3;
}
} else if ((c & 0xF8)==0xF0) {
if (i+3<code.size()) {
codepoint=((code[i]&0x07)<<18) | ((code[i+1]&0x3F)<<12) | ((code[i+2]&0x3F)<<6) | (code[i+3]&0x3F);
output.push_back(14);
bytes_to_skip=4;
}
}
if (bytes_to_skip>0) {
if (bytes_to_skip==2) {
output.push_back((unsigned char)(codepoint>>8));
output.push_back((unsigned char)(codepoint));
} else if (bytes_to_skip==3) {
output.push_back((unsigned char)(codepoint>>8));
output.push_back((unsigned char)(codepoint));
} else if (bytes_to_skip==4) {
output.push_back((unsigned char)(codepoint>>16));
output.push_back((unsigned char)(codepoint>>8));
output.push_back((unsigned char)(codepoint));
}
i+=bytes_to_skip;
} else {
output.push_back(c);
i++;
}
}
}
}
ofstream out_file(filepath+".ccomp",ios::binary);
out_file.write(reinterpret_cast<const char*>(output.data()),output.size());
out_file.close();
return 0;
}