diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b2a7546
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+ccc
diff --git a/ccc.cpp b/ccc.cpp
index dbfc340..c8f8cbe 100644
--- a/ccc.cpp
+++ b/ccc.cpp
@@ -1,8 +1,15 @@
+#include <cstddef>
+#include <cstring>
 #include <iostream>
+#include <filesystem>
+#include <fstream>
 #include <string>
 #include <map>
 #include <vector>
+#include <iterator>
+#include <regex>
 using namespace std;
+namespace fs=filesystem;
 const vector<string> tokens={
   "alignas",
   "alignof",
@@ -51,118 +58,6 @@ const vector<string> tokens={
   "while",
   "#include <",
   "#include \"",
-  "ifdef",
-  "elifdef",
-  "elifndef",
-  "define",
-  "undef",
-  "pragma",
-  "endif",
-  "+=",
-  "-=",
-  "*=",
-  "/=",
-  "%=",
-  "&=",
-  "|=",
-  "^=",
-  "<<=",
-  ">>=",
-  "++",
-  "--",
-  "<<",
-  ">>",
-  "==",
-  "!=",
-  "<=",
-  ">=",
-  "->",
-  ";\n",
-  ", ",
-  ") {\n",
-  "unsigned int",
-  "unsigned long",
-  "unsigned char",
-  "unsigned short",
-  "long long",
-  "unsigned long long",
-  "signed char",
-  "long double",
-  "static int",
-  "static unsigned",
-  "static const",
-  "static char*",
-  "const char*",
-  "const int",
-  "extern int",
-  "volatile int",
-  "char*",
-  "char *",
-  "void*",
-  "void *",
-  "int *",
-  "int*",
-  "char**",
-  "char **",
-  "struct ",
-  "typedef struct",
-  "typedef enum",
-  "enum ",
-  "uint8",
-  "uint16",
-  "uint32",
-  "uint64",
-  "int8",
-  "int16",
-  "int32",
-  "int64",
-  "size_t",
-  "NULL",
-  ".h>",
-  ".h\"",
-  "int i=0;",
-  "int i = 0;",
-  "()",
-  "(",
-  ")",
-  "[]",
-  "[",
-  "]",
-  "{}",
-  "{",
-  "}",
-  "<",
-  ">",
-  "=",
-  "!",
-  "+",
-  "+",
-  "*",
-  "/",
-  "%",
-  "&",
-  "|",
-  "^",
-  "~",
-  ".",
-  ":",
-  ";",
-  "?",
-  " ",
-  "\n",
-  "\t",
-  "\n    ",
-  "\n\t",
-  "    ",
-  "uint8_t",
-  "uint16_t",
-  "uint32_t",
-  "uint64_t",
-  "int8_t",
-  "int16_t",
-  "int32_t",
-  "int64_t",
-  "int main(",
   "#ifdef",
   "#elifdef",
   "#elifndef",
@@ -170,25 +65,191 @@ const vector<string> tokens={
   "#undef",
   "#pragma",
   "#endif",
-  "printf(",
-  "malloc(",
-  "free(",
-  ");",
-  ");\n",
-  ");\n\t",
-  ");\n    ",
-  "if (",
-  "while (",
-  "} else if (",
-  "} else {",
-  "for (",
-  "switch (",
-  "&&",
-  "||",
-  " && ",
-  " || ",
+  "unsigned int",
+  "unsigned char",
+  "unsigned long",
+  "unsigned short",
+  "long long",
+  "unsigned long long",
+  "signed char",
+  "long double",
+  "const char*",
+  "typedef struct"
 };
-int main() {
-  cout<<tokens.size()<<endl;
+struct symbol {  // One candidate identifier for the per-file dictionary built in main.
+  string name;  // identifier text as matched by the regex in main
+  int score;  // ranking value computed in main from name length and occurrence count; higher sorts first
+};
+struct node {  // One trie node; insert() links one node per byte of a token.
+  map<unsigned char,node*> children;  // next node for each following byte; nodes are allocated with new in insert()
+  int token_id=-1;  // -1 means no token ends at this node; otherwise the id passed to insert()
+};
+// Adds `str` to the trie under `root`, one node per byte, and tags the final node with `id`.
+void insert(node* root,string str,int id) {
+  node* cursor=root;
+  for (char ch:str) {
+    node*& slot=cursor->children[ch];  // operator[] value-initializes a missing entry to nullptr
+    if (slot==nullptr) slot=new node();
+    cursor=slot;
+  }
+  cursor->token_id=id;
+}
+// Returns `source` with //-comments, /* */-comments and "string literals" removed.
+// All other bytes, including the newline that ends a //-comment, are copied through.
+// Character literals are copied verbatim so a quote inside one (e.g. '"') cannot open a string.
+string clean_code(string source) {
+  string clean;
+  bool in_string=false,in_comment_single=false,in_comment_multi=false;
+  for (size_t i=0;i<source.size();++i) {
+    if (in_comment_multi) {
+      if (source[i]=='*' && i+1<source.size() && source[i+1]=='/') {
+        in_comment_multi=false;
+        i++;
+      }
+      continue;
+    }
+    if (in_comment_single) {
+      if (source[i]!='\n') continue;
+      in_comment_single=false;  // the newline itself falls through and is kept
+    }
+    if (in_string) {
+      if (source[i]=='\\') {i++;continue;}  // skip the escaped character
+      if (source[i]=='"') in_string=false;
+      continue;
+    }
+    if (source[i]=='/' && i+1<source.size()) {
+      if (source[i+1]=='/') {in_comment_single=true;i++;continue;}
+      if (source[i+1]=='*') {in_comment_multi=true;i++;continue;}
+    }
+    if (source[i]=='"') {in_string=true;continue;}
+    if (source[i]=='\'') {
+      size_t end=i+1;  // find the closing quote on this line, stepping over escapes
+      while (end<source.size() && source[end]!='\'' && source[end]!='\n') end+=(source[end]=='\\')?2:1;
+      if (end<source.size() && source[end]=='\'') {clean.append(source,i,end-i+1);i=end;continue;}
+    }  // an apostrophe with no closing quote on its line is copied like any other byte
+    clean+=source[i];
+  }
+  return clean;
+}
+int main(int argc,char **argv) {  // Reads one source file, picks up to 64 frequent identifiers, and encodes the file into an in-memory byte vector.
+  if (argc!=2) {
+    cout<<"Usage: ccc <c file>"<<endl;
+    return -1;
+  }
+  string filepath=string(argv[1]);
+  if (!fs::exists(filepath)) {
+    cout<<"Error: provided file doesn't exist."<<endl;
+    return -1;
+  }
+  ifstream file(filepath,ios::binary);
+  if (!file) {
+    cout<<"Error: couldn't open provided file."<<endl;
+    return -1;
+  }
+  string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());  // whole file as raw bytes
+  string cleanc=clean_code(code);  // copy without comments and string literals; used only for identifier counting
+  map<string,int> counts;
+  regex symbol_regex("[a-zA-Z_][a-zA-Z0-9_]*");
+  auto words_begin=sregex_iterator(cleanc.begin(),cleanc.end(),symbol_regex);
+  auto words_end=sregex_iterator();
+  for (sregex_iterator i=words_begin;i!=words_end;i++) {
+    string match=i->str();
+    if (match.length()>2) {  // identifiers of one or two characters are not counted
+      counts[match]++;
+    }
+  }
+  vector<symbol> leaderboard;
+  for (auto const& [name,count]:counts) {
+    leaderboard.push_back({name,(int)((name.length()-1)*count-(name.length()+1))});  // score=(len-1)*count-(len+1); presumably bytes saved minus dictionary cost - TODO confirm
+  }
+  sort(leaderboard.begin(),leaderboard.end(),[](const symbol& a,const symbol& b) {  // NOTE(review): <algorithm> is not among the file's includes
+    return a.score>b.score;  // highest score first
+  });
+  vector<symbol> top64;
+  for (int i=0;i<64 && i<leaderboard.size();i++) {  // only the first 64 leaderboard entries are considered
+    if (!(leaderboard[i].score<=0)) top64.push_back(leaderboard[i]);  // keep strictly positive scores
+  }
+  node root_node;
+  for (int i=0;i<tokens.size() && i<64;i++) {
+    insert(&root_node,tokens[i],0x80+i);  // built-in tokens: ids 0x80..0xBF, first 64 entries of tokens only
+  }
+  for (int i=0;i<top64.size();i++) {
+    insert(&root_node,top64[i].name,0xC0+i);  // per-file identifiers: ids 0xC0..0xFF
+  }
+  vector<unsigned char> output;
+  output.push_back('C');  // header starts with the three bytes "CCC"
+  output.push_back('C');
+  output.push_back('C');
+  output.push_back((unsigned char)top64.size());  // number of dictionary entries that follow
+  for (const auto& s:top64) {
+    output.push_back((unsigned char)s.name.length());  // one length byte; NOTE(review): a name longer than 255 bytes would be truncated here
+    for (char c:s.name) output.push_back(c);
+  }
+  for (size_t i=0;i<code.size();) {  // encoding runs over the original bytes in code, not over cleanc
+    node* curr=&root_node;
+    int best_id=-1;
+    size_t best_len=0;
+    for (size_t j=i;j<code.size();j++) {  // walk the trie from position i as far as the input matches
+      unsigned char c=(unsigned char)code[j];
+      if (curr->children.count(c)) {
+        curr=curr->children[c];
+        if (curr->token_id!=-1) {  // remember the longest token seen so far on this path
+          best_id=curr->token_id;
+          best_len=(j-i)+1;
+        }
+      } else {
+        break;
+      }
+    }
+    if (best_id!=-1) {
+      output.push_back((unsigned char)best_id);  // matched token is replaced by its single id byte
+      i+=best_len;
+    } else {
+      unsigned char c=(unsigned char)code[i];
+      if (c<128) {
+        output.push_back(c);  // bytes below 128 are copied unchanged
+        i++;
+      } else {
+        uint32_t codepoint=0;  // NOTE(review): <cstdint> is not among the file's includes
+        int bytes_to_skip=0;  // stays 0 when the byte is not a usable UTF-8 lead byte or the sequence is cut off by end of input
+        if ((c & 0xE0)==0xC0) {  // lead byte of a 2-byte sequence
+          if (i+1<code.size()) {
+            codepoint=((code[i]&0x1F)<<6) | (code[i+1]&0x3F);
+            output.push_back(12);  // marker byte for a 2-byte sequence; NOTE(review): 12/13/14 can also be emitted by the c<128 branch - confirm the decoder can tell them apart
+            bytes_to_skip=2;
+          }
+        } else if ((c & 0xF0)==0xE0) {  // lead byte of a 3-byte sequence
+          if (i+2<code.size()) {
+            codepoint=((code[i]&0x0F)<<12) | ((code[i+1]&0x3F)<<6) | (code[i+2]&0x3F);
+            output.push_back(13);  // marker byte for a 3-byte sequence
+            bytes_to_skip=3;
+          }
+        } else if ((c & 0xF8)==0xF0) {  // lead byte of a 4-byte sequence
+          if (i+3<code.size()) {
+            codepoint=((code[i]&0x07)<<18) | ((code[i+1]&0x3F)<<12) | ((code[i+2]&0x3F)<<6) | (code[i+3]&0x3F);
+            output.push_back(14);  // marker byte for a 4-byte sequence
+            bytes_to_skip=4;
+          }
+        }
+        if (bytes_to_skip>0) {  // the marker is followed by the code point, most significant byte first
+          if (bytes_to_skip==2) {
+            output.push_back((unsigned char)(codepoint>>8));
+            output.push_back((unsigned char)(codepoint));
+          } else if (bytes_to_skip==3) {
+            output.push_back((unsigned char)(codepoint>>8));
+            output.push_back((unsigned char)(codepoint));
+          } else if (bytes_to_skip==4) {
+            output.push_back((unsigned char)(codepoint>>16));
+            output.push_back((unsigned char)(codepoint>>8));
+            output.push_back((unsigned char)(codepoint));
+          }
+          i+=bytes_to_skip;
+        } else {
+          output.push_back(c);  // undecodable byte >=128 is copied unchanged; NOTE(review): this overlaps the token id range 0x80..0xFF
+          i++;
+        }
+      }
+    }
+  }  // NOTE(review): output is not written to a file or stream anywhere in the visible code
   return 0;
 }