new horizon
This commit is contained in:
600
ccc.cpp
600
ccc.cpp
@@ -1,16 +1,118 @@
|
|||||||
#include <cstddef>
|
|
||||||
#include <cstring>
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <stdint.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
#include <regex>
|
#include <algorithm>
|
||||||
|
#include <tree_sitter/api.h>
|
||||||
|
#include <tree_sitter/tree-sitter-c.h>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
namespace fs=filesystem;
|
namespace fs=filesystem;
|
||||||
const vector<string> tokens={
|
// Bit patterns of the CCC output stream, most significant bit first.
// The *_HEAD values are 3-bit prefixes; the generate_* helpers append a
// fixed-width table index after them.

// C keyword reference (head only; generate_c_keyword appends 6 index bits).
const std::vector<bool> CCC_C_KEYYORD_HEAD = {0,0,0};
// Not referenced by the code visible in this file section.
const std::vector<bool> CCC_SPACE = {0,1,1,1,0,0,1};

// Conditional preprocessor directives: complete 6-bit codes.
const std::vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IF       = {0,0,1,0,0,0};
const std::vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IFDEF    = {0,0,1,0,0,1};
const std::vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IFNDEF   = {0,0,1,0,1,0};
const std::vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELSE     = {0,0,1,0,1,1};
const std::vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIF     = {0,0,1,1,0,0};
const std::vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF  = {0,0,1,1,0,1};
const std::vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF = {0,0,1,1,1,0};
const std::vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ENDIF    = {0,0,1,1,1,1};

// Remaining preprocessor directives and the lone double quote: 6-bit codes.
const std::vector<bool> CCC_PREPROCESSOR_OTHER_DEFINE  = {0,1,0,0,0,0};
const std::vector<bool> CCC_PREPROCESSOR_OTHER_UNDEF   = {0,1,0,0,0,1};
const std::vector<bool> CCC_PREPROCESSOR_OTHER_INCLUDE = {0,1,0,0,1,0};
const std::vector<bool> CCC_PREPROCESSOR_OTHER_ERROR   = {0,1,0,0,1,1};
const std::vector<bool> CCC_PREPROCESSOR_OTHER_WARNING = {0,1,0,1,0,0};
const std::vector<bool> CCC_PREPROCESSOR_OTHER_PRAGMA  = {0,1,0,1,0,1};
const std::vector<bool> CCC_PREPROCESSOR_OTHER_LINE    = {0,1,0,1,1,0};
const std::vector<bool> CCC_QUOTE                      = {0,1,0,1,1,1};

// Heads for the table-indexed token families.
const std::vector<bool> CCC_DELIMITER_HEAD     = {0,1,1};
const std::vector<bool> CCC_OTHER_GRAMMAR_HEAD = {1,0,0};
const std::vector<bool> CCC_MISCELLANEOUS_HEAD = {1,0,1};
const std::vector<bool> CCC_REC_TABLE_REF_HEAD = {1,1,0};

// Inline string literals: a 4-bit opener, then 7-bit (ASCII) or 8-bit (UTF-8)
// characters, closed by an all-zero character of the same width.
const std::vector<bool> CCC_STRING_ASCII     = {1,1,1,0};
const std::vector<bool> CCC_STRING_UTF8      = {1,1,1,1};
const std::vector<bool> CCC_STRING_END_ASCII = {0,0,0,0,0,0,0};
const std::vector<bool> CCC_STRING_END_UTF8  = {0,0,0,0,0,0,0,0};
|
||||||
|
// Appends every element of `tail` to the end of `vec`.
//   vec  - destination container (evaluated more than once; pass a plain lvalue).
//   tail - source range; may be a temporary such as a function result.
// `tail` is copied into a scratch variable first, so a temporary is evaluated
// exactly once and appending a vector to itself stays well defined.
// Both arguments are parenthesized, and the scratch variable has a reserved-looking
// name so it cannot capture a caller variable (the previous name `tmp` silently
// broke CCC_ADD_COMPOMENT(tmp,x)).
#define CCC_ADD_COMPOMENT(vec,tail) \
    do { \
        auto ccc_add_tail_copy_=(tail); \
        (vec).insert((vec).end(),ccc_add_tail_copy_.begin(),ccc_add_tail_copy_.end()); \
    } while (0)
|
||||||
|
// Delimiter table. The position of an entry is the index that
// generate_delimiter() writes after CCC_DELIMITER_HEAD, so the order is part
// of the output format. The empty string stands for an empty string literal.
const std::vector<std::string> delimiter={
    "\n", "\t", "{",  "}",    //  0 -  3
    "(",  ")",  "[",  "]",    //  4 -  7
    " ",  "{}", "()", "[]",   //  8 - 11
    "",   ";",  ",",  "."     // 12 - 15
};
|
||||||
|
// Single-character operator table. The position of an entry is the index that
// generate_other_grammar() writes after CCC_OTHER_GRAMMAR_HEAD, so the order
// is part of the output format.
const std::vector<std::string> other_grammer={
    "!", "%", "'", "*",   //  0 -  3
    "+", "-", "/", ":",   //  4 -  7
    "<", ">", "=", "?",   //  8 - 11
    "^", "|", "&", "~"    // 12 - 15
};
|
||||||
|
// Multi-character operators and common library names. The position of an
// entry is the index that generate_miscellaneous() writes after
// CCC_MISCELLANEOUS_HEAD, so the order is part of the output format.
const std::vector<std::string> miscellaneous={
    // compound assignment (0 - 9)
    "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=",
    // increment, shift, comparison, member access, ellipsis, logical (10 - 21)
    "++", "--", "<<", ">>", "==", "!=", "<=", ">=", "->", "...", "||", "&&",
    // frequently used names (22 - 31)
    "NULL", "size_t",
    "uint8_t", "uint16_t", "uint32_t", "uint64_t",
    "int8_t", "int16_t", "int32_t", "int64_t"
};
|
||||||
|
const vector<string> c_keywords={
|
||||||
"alignas",
|
"alignas",
|
||||||
"alignof",
|
"alignof",
|
||||||
"auto",
|
"auto",
|
||||||
@@ -50,31 +152,12 @@ const vector<string> tokens={
|
|||||||
"true",
|
"true",
|
||||||
"typedef",
|
"typedef",
|
||||||
"typeof",
|
"typeof",
|
||||||
"typeof_unqual",
|
"typeof_unequal",
|
||||||
"union",
|
"union",
|
||||||
"unsigned",
|
"unsigned",
|
||||||
"void",
|
"void",
|
||||||
"volatile",
|
"volatile",
|
||||||
"while",
|
"while"
|
||||||
"#include <",
|
|
||||||
"#include \"",
|
|
||||||
"#ifdef",
|
|
||||||
"#elifdef",
|
|
||||||
"#elifndef",
|
|
||||||
"#define",
|
|
||||||
"#undef",
|
|
||||||
"#pragma",
|
|
||||||
"#endif",
|
|
||||||
"unsigned int",
|
|
||||||
"unsigned char",
|
|
||||||
"unsigned long",
|
|
||||||
"unsigned short",
|
|
||||||
"long long",
|
|
||||||
"unsigned long long",
|
|
||||||
"signed char",
|
|
||||||
"long gouble",
|
|
||||||
"const char*",
|
|
||||||
"typedef struct"
|
|
||||||
};
|
};
|
||||||
struct symbol {
|
struct symbol {
|
||||||
string name;
|
string name;
|
||||||
@@ -94,42 +177,294 @@ void insert(node* root,string str,int id) {
|
|||||||
}
|
}
|
||||||
curr->token_id=id;
|
curr->token_id=id;
|
||||||
}
|
}
|
||||||
string clean_code(string source) {
|
vector<TSNode> all_tokens;
|
||||||
string clean;
|
void get_all_nodes(TSNode node,const string &source_code,map<string,int> &rec_map) {
|
||||||
bool in_string=false;
|
if (ts_node_child_count(node)==0) {
|
||||||
bool in_comment_single=false;
|
all_tokens.push_back(node);
|
||||||
bool in_comment_multi=false;
|
string text=source_code.substr(ts_node_start_byte(node),ts_node_end_byte(node)-ts_node_start_byte(node));
|
||||||
for (size_t i=0;i<source.size();++i) {
|
if (string(ts_node_type(node))=="string_content" || string(ts_node_type(node))=="system_lib_string" || string(ts_node_type(node))=="identifier" || string(ts_node_type(node))=="number_literal" || string(ts_node_type(node))=="type_identifier" || string(ts_node_type(node))=="field_identifier" || string(ts_node_type(node))=="escape_sequence" || string(ts_node_type(node))=="statement_identifier") {
|
||||||
if (in_comment_multi) {
|
rec_map[text]++;
|
||||||
if (source[i]=='*' && i+1<source.size() && source[i+1]=='/') {
|
|
||||||
in_comment_multi=false;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
if (in_comment_single) {
|
if (string(ts_node_type(node))=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) {
|
||||||
if (source[i]=='\n') {
|
rec_map[text]++;
|
||||||
in_comment_single=false;
|
|
||||||
} else {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (in_string) {
|
} else {
|
||||||
if (source[i]=='\\') {i++;continue;}
|
uint32_t child_count=ts_node_child_count(node);
|
||||||
if (source[i]=='"') in_string=false;
|
for (uint32_t i=0;i<child_count;++i) {
|
||||||
continue;
|
TSNode child=ts_node_child(node,i);
|
||||||
|
get_all_nodes(child,source_code,rec_map);
|
||||||
}
|
}
|
||||||
if (source[i]=='/' && i+1<source.size()) {
|
|
||||||
if (source[i+1]=='/') {in_comment_single=true;i++;continue;}
|
|
||||||
if (source[i+1]=='*') {in_comment_multi=true;i++;continue;}
|
|
||||||
}
|
|
||||||
if (source[i]=='"') {
|
|
||||||
in_string=true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
clean+=source[i];
|
|
||||||
}
|
}
|
||||||
return clean;
|
}
|
||||||
|
// Expands a byte into its 8 bits, most significant bit first.
//   c - byte to expand.
// Returns a vector of exactly 8 booleans.
std::vector<bool> byte_to_bits(unsigned char c) {
    std::vector<bool> bits;
    for (unsigned mask=0x80u;mask!=0;mask>>=1) {
        bits.push_back((c&mask)!=0);
    }
    return bits;
}
|
||||||
|
// Expands a character into its low 7 bits, most significant of those first.
//   c - character to expand; bit 7 is ignored.
// Returns a vector of exactly 7 booleans.
std::vector<bool> ascii_to_bits(unsigned char c) {
    std::vector<bool> bits;
    for (unsigned mask=0x40u;mask!=0;mask>>=1) {
        bits.push_back((c&mask)!=0);
    }
    return bits;
}
|
||||||
|
vector<bool> generate_c_keyword(size_t index) {
|
||||||
|
vector<bool> out;
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_C_KEYYORD_HEAD);
|
||||||
|
for (int i=5;i>=0;i--) {
|
||||||
|
bool enabled=(index>>i)&0x01;
|
||||||
|
out.push_back(enabled);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
vector<bool> generate_rec(size_t index,size_t total_recs) {
|
||||||
|
vector<bool> out;
|
||||||
|
size_t bits=0;
|
||||||
|
while (total_recs) {
|
||||||
|
total_recs>>=1;
|
||||||
|
++bits;
|
||||||
|
}
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_REC_TABLE_REF_HEAD);
|
||||||
|
for (int i=bits;i>=0;i--) {
|
||||||
|
bool enabled=(index>>i)&0x01;
|
||||||
|
out.push_back(enabled);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
vector<bool> generate_delimiter(size_t index) {
|
||||||
|
vector<bool> out;
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_DELIMITER_HEAD);
|
||||||
|
for (int i=3;i>=0;i--) {
|
||||||
|
bool enabled=(index>>i)&0x01;
|
||||||
|
out.push_back(enabled);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
vector<bool> generate_other_grammar(size_t index) {
|
||||||
|
vector<bool> out;
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_OTHER_GRAMMAR_HEAD);
|
||||||
|
for (int i=3;i>=0;i--) {
|
||||||
|
bool enabled=(index>>i)&0x01;
|
||||||
|
out.push_back(enabled);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
vector<bool> generate_miscellaneous(size_t index) {
|
||||||
|
vector<bool> out;
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_MISCELLANEOUS_HEAD);
|
||||||
|
for (int i=4;i>=0;i--) {
|
||||||
|
bool enabled=(index>>i)&0x01;
|
||||||
|
out.push_back(enabled);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
vector<bool> generate_string_content(string str) {
|
||||||
|
vector<bool> out;
|
||||||
|
bool is_utf8=false;
|
||||||
|
for (auto c:str) {
|
||||||
|
if (c>127) {
|
||||||
|
is_utf8=true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (is_utf8) {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_STRING_UTF8);
|
||||||
|
for (auto c:str) {
|
||||||
|
CCC_ADD_COMPOMENT(out,byte_to_bits(c));
|
||||||
|
}
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_STRING_END_UTF8);
|
||||||
|
} else {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_STRING_ASCII);
|
||||||
|
for (auto c:str) {
|
||||||
|
CCC_ADD_COMPOMENT(out,ascii_to_bits(c));
|
||||||
|
}
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_STRING_END_ASCII);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
vector<bool> process_all_nodes(vector<TSNode> *nodes,string code,vector<string> &rec_list) {
|
||||||
|
vector<bool> out;
|
||||||
|
for (int i=0;i<nodes->size();i++) {
|
||||||
|
string type=string(ts_node_type(nodes->at(i)));
|
||||||
|
if (type=="#if") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IF);
|
||||||
|
cout<<"if"<<endl;
|
||||||
|
} else if (type=="#ifdef") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFDEF);
|
||||||
|
cout<<"ifdef"<<endl;
|
||||||
|
} else if (type=="#ifndef") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFNDEF);
|
||||||
|
cout<<"ifndef"<<endl;
|
||||||
|
} else if (type=="#else") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELSE);
|
||||||
|
cout<<"else"<<endl;
|
||||||
|
} else if (type=="#elif") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIF);
|
||||||
|
cout<<"elif"<<endl;
|
||||||
|
} else if (type=="#elifdef") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF);
|
||||||
|
cout<<"elifdef"<<endl;
|
||||||
|
} else if (type=="#elifndef") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF);
|
||||||
|
cout<<"elifndef"<<endl;
|
||||||
|
} else if (type=="#endif") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ENDIF);
|
||||||
|
cout<<"endif"<<endl;
|
||||||
|
} else if (type=="#define") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_DEFINE);
|
||||||
|
cout<<"define"<<endl;
|
||||||
|
} else if (type=="#undef") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_UNDEF);
|
||||||
|
cout<<"undef"<<endl;
|
||||||
|
} else if (type=="#include") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_INCLUDE);
|
||||||
|
cout<<"include"<<endl;
|
||||||
|
} else if (type=="#error") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_ERROR);
|
||||||
|
cout<<"error"<<endl;
|
||||||
|
} else if (type=="#warning") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_WARNING);
|
||||||
|
cout<<"warning"<<endl;
|
||||||
|
} else if (type=="#pragma") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_PRAGMA);
|
||||||
|
cout<<"pragma"<<endl;
|
||||||
|
} else if (type=="#line") {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_LINE);
|
||||||
|
cout<<"line"<<endl;
|
||||||
|
} else if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") {
|
||||||
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
||||||
|
auto it=find(rec_list.begin(),rec_list.end(),text);
|
||||||
|
if (it==rec_list.end()) {
|
||||||
|
if (!text.empty()) {
|
||||||
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_string_content(text));
|
||||||
|
cout<<"string ("<<type<<"): "<<text<<endl;
|
||||||
|
} else {
|
||||||
|
auto it=find(delimiter.begin(),delimiter.end(),"");
|
||||||
|
size_t index=distance(delimiter.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
|
||||||
|
cout<<"delimiter for empty string"<<endl;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
size_t index=distance(rec_list.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
|
||||||
|
cout<<"rec_table for string ("<<type<<"): "<<text<<endl;
|
||||||
|
}
|
||||||
|
} else if (type=="primitive_type") {
|
||||||
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
||||||
|
auto it=find(c_keywords.begin(),c_keywords.end(),text);
|
||||||
|
if (it!=c_keywords.end()) {
|
||||||
|
size_t index=distance(c_keywords.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
|
||||||
|
cout<<"primitive_type: "<<text<<endl;
|
||||||
|
} else {
|
||||||
|
auto it=find(rec_list.begin(),rec_list.end(),text);
|
||||||
|
if (it==rec_list.end()) {
|
||||||
|
if (!text.empty()) {
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_string_content(text));
|
||||||
|
cout<<"string ("<<type<<"): "<<text<<endl;
|
||||||
|
} else {
|
||||||
|
cout<<"Error: provided primitive is empty: "<<text;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
size_t index=distance(rec_list.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
|
||||||
|
cout<<"rec_table for string ("<<type<<"): "<<text<<endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (find(delimiter.begin(),delimiter.end(),type)!=delimiter.end()) {
|
||||||
|
string text;
|
||||||
|
if (type=="(" && i+1<nodes->size()) {
|
||||||
|
if (string(ts_node_type(nodes->at(i+1)))==")") {
|
||||||
|
text="()";
|
||||||
|
i++;
|
||||||
|
} else {
|
||||||
|
text="(";
|
||||||
|
}
|
||||||
|
} else if (type=="[" && i+1<nodes->size()) {
|
||||||
|
if (string(ts_node_type(nodes->at(i+1)))=="]") {
|
||||||
|
text="[]";
|
||||||
|
i++;
|
||||||
|
} else {
|
||||||
|
text="[";
|
||||||
|
}
|
||||||
|
} else if (type=="{" && i+1<nodes->size()) {
|
||||||
|
if (string(ts_node_type(nodes->at(i+1)))=="}") {
|
||||||
|
text="{}";
|
||||||
|
i++;
|
||||||
|
} else {
|
||||||
|
text="{";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
text=type;
|
||||||
|
}
|
||||||
|
auto it=find(delimiter.begin(),delimiter.end(),text);
|
||||||
|
if (it!=delimiter.end()) {
|
||||||
|
size_t index=distance(delimiter.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
|
||||||
|
cout<<"delimiter: "<<text<<endl;
|
||||||
|
} else {
|
||||||
|
cout<<"Error: unknow delimiter, that shouldn't happen: "<<text<<endl;;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
} else if (find(other_grammer.begin(),other_grammer.end(),type)!=other_grammer.end()) {
|
||||||
|
auto it=find(other_grammer.begin(),other_grammer.end(),type);
|
||||||
|
if (it!=other_grammer.end()) {
|
||||||
|
size_t index=distance(other_grammer.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_other_grammar(index));
|
||||||
|
cout<<"other grammar: "<<type<<endl;
|
||||||
|
} else {
|
||||||
|
cout<<"Error: unknow other grammar symbol, that shouldn't happen: "<<type<<endl;;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
} else if (find(c_keywords.begin(),c_keywords.end(),type)!=c_keywords.end()) {
|
||||||
|
auto it=find(c_keywords.begin(),c_keywords.end(),type);
|
||||||
|
if (it!=c_keywords.end()) {
|
||||||
|
size_t index=distance(c_keywords.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
|
||||||
|
cout<<"c keyword: "<<type<<endl;
|
||||||
|
} else {
|
||||||
|
cout<<"Error: unknow C keyword, that shouldn't happen: "<<type<<endl;;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
} else if (find(miscellaneous.begin(),miscellaneous.end(),type)!=miscellaneous.end()) {
|
||||||
|
auto it=find(miscellaneous.begin(),miscellaneous.end(),type);
|
||||||
|
if (it!=miscellaneous.end()) {
|
||||||
|
size_t index=distance(miscellaneous.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_miscellaneous(index));
|
||||||
|
cout<<"miscellaneous: "<<type<<endl;
|
||||||
|
} else {
|
||||||
|
cout<<"Error: unknow miscellaneous, that shouldn't happen: "<<type<<endl;;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
} else if (type=="\"") {
|
||||||
|
if (i+1<nodes->size()) {
|
||||||
|
if (string(ts_node_type(nodes->at(i+1)))=="\"") {
|
||||||
|
auto it=find(delimiter.begin(),delimiter.end(),"");
|
||||||
|
size_t index=distance(delimiter.begin(),it);
|
||||||
|
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
|
||||||
|
cout<<"double quotes mark, inserting delimiter for empty string"<<endl;
|
||||||
|
i++;
|
||||||
|
} else {
|
||||||
|
CCC_ADD_COMPOMENT(out,CCC_QUOTE);
|
||||||
|
cout<<"single quote mark"<<endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (type=="comment") {
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
||||||
|
cout<<"unknow node type: "<<type<<endl;
|
||||||
|
cout<<"unknow node text: "<<text<<endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out;
|
||||||
}
|
}
|
||||||
int main(int argc,char **argv) {
|
int main(int argc,char **argv) {
|
||||||
if (argc!=2) {
|
if (argc!=2) {
|
||||||
@@ -147,109 +482,66 @@ int main(int argc,char **argv) {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
|
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
|
||||||
string cleanc=clean_code(code);
|
TSParser *parser=ts_parser_new();
|
||||||
map<string,int> counts;
|
ts_parser_set_language(parser,tree_sitter_c());
|
||||||
regex symbol_regex("[a-zA-Z_][a-zA-Z0-9_]*");
|
TSTree *tree=ts_parser_parse_string(parser,nullptr,code.c_str(),code.size());
|
||||||
auto words_begin=sregex_iterator(cleanc.begin(),cleanc.end(),symbol_regex);
|
TSNode root=ts_tree_root_node(tree);
|
||||||
auto words_end=sregex_iterator();
|
map<string,int> rec_map;
|
||||||
for (sregex_iterator i=words_begin;i!=words_end;i++) {
|
vector<string> rec_list;
|
||||||
string match=i->str();
|
get_all_nodes(root,code,rec_map);
|
||||||
if (match.length()>2) {
|
for (auto s:rec_map) {
|
||||||
counts[match]++;
|
if (s.second>=2 and s.first.size()>=3 && s.first.size()<=256) {
|
||||||
|
rec_list.push_back(s.first);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
vector<symbol> leaderboard;
|
auto payload=process_all_nodes(&all_tokens,code,rec_list);
|
||||||
for (auto const& [name,count]:counts) {
|
vector<bool> out={0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,1};
|
||||||
leaderboard.push_back({name,(int)((name.length()-1)*count-(name.length()+1))});
|
for (int i=63;i>=0;i--) {
|
||||||
|
bool enabled=(rec_list.size()>>i)&0x01;
|
||||||
|
out.push_back(enabled);
|
||||||
}
|
}
|
||||||
sort(leaderboard.begin(),leaderboard.end(),[](const symbol& a,const symbol& b) {
|
for (int i=0;i<rec_list.size();i++) {
|
||||||
return a.score>b.score;
|
uint8_t size=(uint8_t)rec_list[i].size();
|
||||||
});
|
for (int i=7;i>=0;i--) {
|
||||||
vector<symbol> top64;
|
bool enabled=(size>>i)&0x01;
|
||||||
for (int i=0;i<64 && i<leaderboard.size();i++) {
|
out.push_back(enabled);
|
||||||
if (!(leaderboard[i].score<=0)) top64.push_back(leaderboard[i]);
|
|
||||||
}
|
|
||||||
node root_node;
|
|
||||||
for (int i=0;i<tokens.size() && i<64;i++) {
|
|
||||||
insert(&root_node,tokens[i],0x80+i);
|
|
||||||
}
|
|
||||||
for (int i=0;i<top64.size();i++) {
|
|
||||||
insert(&root_node,top64[i].name,0xC0+i);
|
|
||||||
}
|
|
||||||
vector<unsigned char> output;
|
|
||||||
output.push_back('C');
|
|
||||||
output.push_back('C');
|
|
||||||
output.push_back('C');
|
|
||||||
output.push_back((unsigned char)top64.size());
|
|
||||||
for (const auto& s:top64) {
|
|
||||||
output.push_back((unsigned char)s.name.length());
|
|
||||||
for (char c:s.name) output.push_back(c);
|
|
||||||
}
|
|
||||||
for (size_t i=0;i<code.size();) {
|
|
||||||
node* curr=&root_node;
|
|
||||||
int best_id=-1;
|
|
||||||
size_t best_len=0;
|
|
||||||
for (size_t j=i;j<code.size();j++) {
|
|
||||||
unsigned char c=(unsigned char)code[j];
|
|
||||||
if (curr->children.count(c)) {
|
|
||||||
curr=curr->children[c];
|
|
||||||
if (curr->token_id!=-1) {
|
|
||||||
best_id=curr->token_id;
|
|
||||||
best_len=(j-i)+1;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (best_id!=-1) {
|
for (auto c:rec_list[i]) {
|
||||||
output.push_back((unsigned char)best_id);
|
for (int i=7;i>=0;i--) {
|
||||||
i+=best_len;
|
bool enabled=(c>>i)&0x01;
|
||||||
} else {
|
out.push_back(enabled);
|
||||||
unsigned char c=(unsigned char)code[i];
|
|
||||||
if (c<128) {
|
|
||||||
output.push_back(c);
|
|
||||||
i++;
|
|
||||||
} else {
|
|
||||||
uint32_t codepoint=0;
|
|
||||||
int bytes_to_skip=0;
|
|
||||||
if ((c & 0xE0)==0xC0) {
|
|
||||||
if (i+1<code.size()) {
|
|
||||||
codepoint=((code[i]&0x1F)<<6) | (code[i+1]&0x3F);
|
|
||||||
output.push_back(12);
|
|
||||||
bytes_to_skip=2;
|
|
||||||
}
|
|
||||||
} else if ((c & 0xF0)==0xE0) {
|
|
||||||
if (i+2<code.size()) {
|
|
||||||
codepoint=((code[i]&0x0F)<<12) | ((code[i+1]&0x3F)<<6) | (code[i+2]&0x3F);
|
|
||||||
output.push_back(13);
|
|
||||||
bytes_to_skip=3;
|
|
||||||
}
|
|
||||||
} else if ((c & 0xF8)==0xF0) {
|
|
||||||
if (i+3<code.size()) {
|
|
||||||
codepoint=((code[i]&0x07)<<18) | ((code[i+1]&0x3F)<<12) | ((code[i+2]&0x3F)<<6) | (code[i+3]&0x3F);
|
|
||||||
output.push_back(14);
|
|
||||||
bytes_to_skip=4;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (bytes_to_skip>0) {
|
|
||||||
if (bytes_to_skip==2) {
|
|
||||||
output.push_back((unsigned char)(codepoint>>8));
|
|
||||||
output.push_back((unsigned char)(codepoint));
|
|
||||||
} else if (bytes_to_skip==3) {
|
|
||||||
output.push_back((unsigned char)(codepoint>>8));
|
|
||||||
output.push_back((unsigned char)(codepoint));
|
|
||||||
} else if (bytes_to_skip==4) {
|
|
||||||
output.push_back((unsigned char)(codepoint>>16));
|
|
||||||
output.push_back((unsigned char)(codepoint>>8));
|
|
||||||
output.push_back((unsigned char)(codepoint));
|
|
||||||
}
|
|
||||||
i+=bytes_to_skip;
|
|
||||||
} else {
|
|
||||||
output.push_back(c);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
CCC_ADD_COMPOMENT(out,payload);
|
||||||
|
vector<unsigned char> outbytes;
|
||||||
|
unsigned char current=0;
|
||||||
|
int bit_index=0;
|
||||||
|
for (bool b:out) {
|
||||||
|
current|=(b<<(7-bit_index));
|
||||||
|
bit_index++;
|
||||||
|
if (bit_index==8) {
|
||||||
|
outbytes.push_back(current);
|
||||||
|
current=0;
|
||||||
|
bit_index=0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (bit_index!=0) {
|
||||||
|
outbytes.push_back(current);
|
||||||
|
}
|
||||||
|
ofstream fileout(filepath+".ccc",ios::binary);
|
||||||
|
if (!fileout) {
|
||||||
|
cout<<"Error: couldn't open output file."<<endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
fileout.write(reinterpret_cast<const char*>(outbytes.data()),outbytes.size());
|
||||||
|
fileout.close();
|
||||||
|
cout<<"Reccurences map entry count: "<<rec_list.size()<<endl;
|
||||||
|
size_t total_bytes=0;
|
||||||
|
for (int i=0;i<rec_list.size();i++) {
|
||||||
|
total_bytes++;
|
||||||
|
total_bytes+=rec_list[i].size();
|
||||||
|
}
|
||||||
|
cout<<"Total spaces taken by reccurences map in bytes: "<<total_bytes<<endl;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
BIN
ccc.cpp.ccc
Normal file
BIN
ccc.cpp.ccc
Normal file
Binary file not shown.
12
hello.c
Normal file
12
hello.c
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
typedef static unsigned char HEY;
|
||||||
|
int main() {
|
||||||
|
hello[]="hello";
|
||||||
|
HEY res=8;
|
||||||
|
if (res!=9) {
|
||||||
|
printf(hello);
|
||||||
|
}
|
||||||
|
unsigned char r=6;
|
||||||
|
return r;
|
||||||
|
}
|
||||||
BIN
hello.c.ccc
Normal file
BIN
hello.c.ccc
Normal file
Binary file not shown.
BIN
test.c.ccc
Normal file
BIN
test.c.ccc
Normal file
Binary file not shown.
Reference in New Issue
Block a user