improvements incoming

This commit is contained in:
2026-02-06 00:01:23 +01:00
parent f281c71f75
commit 51ae29a898
6 changed files with 236 additions and 201 deletions

359
ccc.cpp
View File

@@ -14,56 +14,35 @@
#include <lzma.h>
using namespace std;
namespace fs=filesystem;
const vector<bool> CCC_C_KEYYORD_HEAD {0,0,0};
const vector<bool> CCC_SPACE {0,1,1,1,0,0,1};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IF {0,0,1,0,0,0};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IFDEF {0,0,1,0,0,1};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IFNDEF {0,0,1,0,1,0};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELSE {0,0,1,0,1,1};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIF {0,0,1,1,0,0};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF {0,0,1,1,0,1};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF {0,0,1,1,1,0};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ENDIF {0,0,1,1,1,1};
const vector<bool> CCC_PREPROCESSOR_OTHER_DEFINE {0,1,0,0,0,0};
const vector<bool> CCC_PREPROCESSOR_OTHER_UNDEF {0,1,0,0,0,1};
const vector<bool> CCC_PREPROCESSOR_OTHER_INCLUDE {0,1,0,0,1,0};
const vector<bool> CCC_PREPROCESSOR_OTHER_ERROR {0,1,0,0,1,1};
const vector<bool> CCC_PREPROCESSOR_OTHER_WARNING {0,1,0,1,0,0};
const vector<bool> CCC_PREPROCESSOR_OTHER_PRAGMA {0,1,0,1,0,1};
const vector<bool> CCC_PREPROCESSOR_OTHER_LINE {0,1,0,1,1,0};
const vector<bool> CCC_QUOTE {0,1,0,1,1,1};
const vector<bool> CCC_DELIMITER_HEAD {0,1,1};
const vector<bool> CCC_OTHER_GRAMMAR_HEAD {1,0,0};
const vector<bool> CCC_MISCELLANEOUS_HEAD {1,0,1};
const vector<bool> CCC_REC_TABLE_REF_HEAD {1,1,0};
const vector<bool> CCC_STRING_ASCII {1,1,1,0};
const vector<bool> CCC_STRING_UTF8 {1,1,1,1};
const vector<bool> CCC_STRING_END_ASCII {0,0,0,0,0,0,0};
const vector<bool> CCC_STRING_END_UTF8 {0,0,0,0,0,0,0,0};
const vector<bool> CCC_DELIMITER_0_HEAD={0};
const vector<bool> CCC_DELIMITER_1_HEAD={1,0};
const vector<bool> CCC_C_KEYWORD_HEAD={1,1,0,0};
const vector<bool> CCC_MISCELANEOUS_HEAD={1,1,0,1};
const vector<bool> CCC_STRING_INLINE_HEAD={1,1,1,0};
const vector<bool> CCC_REC_TABLE_REF_HEAD={1,1,1,1};
const vector<bool> CCC_STRING_INLINE_END={0,0,0,0,0,0,0,0};
// Appends every element of `tail` to the end of the container `vec`.
//
// `tail` is first captured into a local with `auto`, so it is evaluated once
// and may be a named container, a function call returning a container by
// value, or a comma-free braced list such as {0} (deduced as an
// initializer_list). For that last form to keep working, `tail` is
// intentionally NOT wrapped in parentheses.
//
// The local uses a macro-specific name so that a caller variable that happens
// to be called `tmp` can be passed as `tail` without the expansion turning
// into a self-initialization. `vec` is parenthesized so any lvalue expression
// can be used; note that it is still evaluated twice.
#define CCC_ADD_COMPOMENT(vec,tail) \
    do { \
        auto ccc_add_component_tail_=tail; \
        (vec).insert((vec).end(),ccc_add_component_tail_.begin(),ccc_add_component_tail_.end()); \
    } while (0)
const vector<string> delimiter={
"\n",
"\t",
const vector<string> delimiter0={
"{",
"}",
"(",
")",
"[",
"]",
" ",
"{}",
"()",
"[]",
"",
";",
",",
"."
};
const vector<string> other_grammer={
const vector<string> delimiter1={
"{}",
"()",
"[]",
";"
};
const vector<string> miscellaneous={
"!",
"%",
"'",
@@ -79,9 +58,7 @@ const vector<string> other_grammer={
"^",
"|",
"&",
"~"
};
const vector<string> miscellaneous={
"~",
"+=",
"-=",
"*=",
@@ -116,6 +93,21 @@ const vector<string> miscellaneous={
"int64_t"
};
const vector<string> c_keywords={
"#if",
"#ifdef",
"#ifndef",
"#else",
"#elif",
"#elifdef",
"#elifndef",
"#endif",
"#define",
"#undef",
"#include",
"#error",
"#warning",
"#pragma",
"#line",
"alignas",
"alignof",
"auto",
@@ -162,7 +154,8 @@ const vector<string> c_keywords={
"volatile",
"while",
"__asm__",
"__attribute__"
"__attribute__",
"defined",
};
struct symbol {
string name;
@@ -210,17 +203,9 @@ vector<bool> byte_to_bits(unsigned char c) {
}
return out;
}
// Expands the low seven bits of `c` into a vector of bools, most significant
// of those seven bits first. The top (eighth) bit of `c` is ignored.
vector<bool> ascii_to_bits(unsigned char c) {
    vector<bool> bits;
    // Walk a single-bit mask from bit 6 down to bit 0.
    unsigned mask=0x40;
    while (mask!=0) {
        bits.push_back((c&mask)!=0);
        mask>>=1;
    }
    return bits;
}
vector<bool> generate_c_keyword(size_t index) {
vector<bool> out;
CCC_ADD_COMPOMENT(out,CCC_C_KEYYORD_HEAD);
CCC_ADD_COMPOMENT(out,CCC_C_KEYWORD_HEAD);
for (int i=5;i>=0;i--) {
bool enabled=(index>>i)&0x01;
out.push_back(enabled);
@@ -241,19 +226,19 @@ vector<bool> generate_rec(size_t index,size_t total_recs) {
}
return out;
}
vector<bool> generate_delimiter(size_t index) {
vector<bool> generate_delimiter0(size_t index) {
vector<bool> out;
CCC_ADD_COMPOMENT(out,CCC_DELIMITER_HEAD);
for (int i=3;i>=0;i--) {
CCC_ADD_COMPOMENT(out,CCC_DELIMITER_0_HEAD);
for (int i=2;i>=0;i--) {
bool enabled=(index>>i)&0x01;
out.push_back(enabled);
}
return out;
}
vector<bool> generate_other_grammar(size_t index) {
vector<bool> generate_delimiter1(size_t index) {
vector<bool> out;
CCC_ADD_COMPOMENT(out,CCC_OTHER_GRAMMAR_HEAD);
for (int i=3;i>=0;i--) {
CCC_ADD_COMPOMENT(out,CCC_DELIMITER_1_HEAD);
for (int i=1;i>=0;i--) {
bool enabled=(index>>i)&0x01;
out.push_back(enabled);
}
@@ -261,8 +246,8 @@ vector<bool> generate_other_grammar(size_t index) {
}
vector<bool> generate_miscellaneous(size_t index) {
vector<bool> out;
CCC_ADD_COMPOMENT(out,CCC_MISCELLANEOUS_HEAD);
for (int i=4;i>=0;i--) {
CCC_ADD_COMPOMENT(out,CCC_MISCELANEOUS_HEAD);
for (int i=5;i>=0;i--) {
bool enabled=(index>>i)&0x01;
out.push_back(enabled);
}
@@ -270,26 +255,11 @@ vector<bool> generate_miscellaneous(size_t index) {
}
vector<bool> generate_string_content(string str) {
vector<bool> out;
bool is_utf8=false;
CCC_ADD_COMPOMENT(out,CCC_STRING_INLINE_HEAD);
for (auto c:str) {
if (c>127) {
is_utf8=true;
break;
}
}
if (is_utf8) {
CCC_ADD_COMPOMENT(out,CCC_STRING_UTF8);
for (auto c:str) {
CCC_ADD_COMPOMENT(out,byte_to_bits(c));
}
CCC_ADD_COMPOMENT(out,CCC_STRING_END_UTF8);
} else {
CCC_ADD_COMPOMENT(out,CCC_STRING_ASCII);
for (auto c:str) {
CCC_ADD_COMPOMENT(out,ascii_to_bits(c));
}
CCC_ADD_COMPOMENT(out,CCC_STRING_END_ASCII);
CCC_ADD_COMPOMENT(out,byte_to_bits(c));
}
CCC_ADD_COMPOMENT(out,CCC_STRING_INLINE_END);
return out;
}
void print_debug(string text) {
@@ -301,148 +271,143 @@ vector<unsigned char> process_file_nodes(vector<TSNode> *nodes,string code,vecto
vector<bool> out;
for (int i=0;i<nodes->size();i++) {
string type=string(ts_node_type(nodes->at(i)));
string supertext=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
if (type=="#if") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IF);
print_debug("if");
} else if (type=="#ifdef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFDEF);
print_debug("ifdef");
} else if (type=="#ifndef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFNDEF);
print_debug("ifndef");
} else if (type=="#else") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELSE);
print_debug("else");
} else if (type=="#elif") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIF);
print_debug("elif");
} else if (type=="#elifdef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF);
print_debug("elifdef");
} else if (type=="#elifndef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF);
print_debug("elifndef");
} else if (type=="#endif") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ENDIF);
print_debug("endif");
} else if (type=="#define") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_DEFINE);
print_debug("define");
} else if (type=="#undef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_UNDEF);
print_debug("undef");
} else if (type=="#include") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_INCLUDE);
print_debug("include");
} else if (type=="#error") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_ERROR);
print_debug("error");
} else if (type=="#warning") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_WARNING);
print_debug("warning");
} else if (type=="#pragma" || (type=="preproc_directive" && supertext=="#pragma")) {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_PRAGMA);
print_debug("pragma");
} else if (type=="#line") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_LINE);
print_debug("line");
} else if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (!text.empty()) {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string ("+type+"): "+text);
} else {
auto it=find(delimiter.begin(),delimiter.end(),"");
size_t index=distance(delimiter.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
print_debug("delimiter for empty string");
}
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string ("+type+"): "+text);
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string ("+type+"): "+text);
}
} else if (type=="primitive_type") {
} else if (type=="primitive_type" || type=="type_identifier") {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
auto it=find(c_keywords.begin(),c_keywords.end(),text);
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
print_debug("primitive_type: "+text);
print_debug("type found in c keyword: "+text);
} else {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string ("+type+"): "+text);
print_debug("string for type ("+type+"): "+text);
} else {
cout<<"Error: provided primitive is empty: "<<text<<endl;;
exit(-1);
cout<<"Warning: provided primitive is empty: "<<text<<endl;
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string ("+type+"): "+text);
print_debug("rec_table for string for type ("+type+"): "+text);
}
}
} else if (find(delimiter.begin(),delimiter.end(),type)!=delimiter.end()) {
string text;
} else if (find(delimiter0.begin(),delimiter0.end(),type)!=delimiter0.end() || find(delimiter1.begin(),delimiter1.end(),type)!=delimiter1.end() || type=="\"") {
string insert;
if (type=="(" && i+1<nodes->size()) {
if (string(ts_node_type(nodes->at(i+1)))==")") {
text="()";
insert="()";
i++;
} else {
text="(";
insert="(";
}
} else if (type=="[" && i+1<nodes->size()) {
if (string(ts_node_type(nodes->at(i+1)))=="]") {
text="[]";
insert="[]";
i++;
} else {
text="[";
insert="[";
}
} else if (type=="{" && i+1<nodes->size()) {
if (string(ts_node_type(nodes->at(i+1)))=="}") {
text="{}";
insert="{}";
i++;
} else {
text="{";
insert="{";
}
} else {
text=type;
insert=type;
}
auto it=find(delimiter.begin(),delimiter.end(),text);
if (it!=delimiter.end()) {
size_t index=distance(delimiter.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
print_debug("delimiter: "+text);
auto it=find(delimiter0.begin(),delimiter0.end(),insert);
if (it!=delimiter0.end()) {
size_t index=distance(delimiter0.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter0(index));
print_debug("delimiter 0: "+insert);
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<text<<endl;;
exit(-1);
if (insert!="{}" && insert!="\"") {
auto it=find(delimiter1.begin(),delimiter1.end(),insert);
if (it!=delimiter1.end()) {
size_t index=distance(delimiter1.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter1(index));
print_debug("delimiter 1: "+insert);
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<insert<<endl;;
// exit(-1);
}
} else {
if (insert=="{}") {
auto it=find(delimiter1.begin(),delimiter1.end(),"{}");
if (it!=delimiter1.end()) {
size_t index=distance(delimiter1.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter1(index));
CCC_ADD_COMPOMENT(out,{0});
print_debug("delimiter 1: "+insert);
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<insert<<endl;;
// exit(-1);
}
} else if (insert=="\"") {
auto it=find(delimiter1.begin(),delimiter1.end(),"{}");
if (it!=delimiter1.end()) {
size_t index=distance(delimiter1.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter1(index));
CCC_ADD_COMPOMENT(out,{1});
print_debug("delimiter 1: "+insert);
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<insert<<endl;;
exit(-1);
}
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<insert<<endl;;
// exit(-1);
}
}
}
} else if (find(other_grammer.begin(),other_grammer.end(),type)!=other_grammer.end()) {
auto it=find(other_grammer.begin(),other_grammer.end(),type);
if (it!=other_grammer.end()) {
size_t index=distance(other_grammer.begin(),it);
CCC_ADD_COMPOMENT(out,generate_other_grammar(index));
print_debug("other grammar: "+type);
} else if (find(c_keywords.begin(),c_keywords.end(),type)!=c_keywords.end() || type=="preproc_directive") {
if (type!="preproc_directive") {
auto it=find(c_keywords.begin(),c_keywords.end(),type);
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
print_debug("c keyword: "+type);
} else {
cout<<"Error: unknow C keyword, that shouldn't happen: "<<type<<" "<<text<<endl;;
// exit(-1);
}
} else {
cout<<"Error: unknow other grammar symbol, that shouldn't happen: "<<type<<endl;;
exit(-1);
}
} else if (find(c_keywords.begin(),c_keywords.end(),type)!=c_keywords.end()) {
auto it=find(c_keywords.begin(),c_keywords.end(),type);
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
print_debug("c keyword: "+type);
} else {
cout<<"Error: unknow C keyword, that shouldn't happen: "<<type<<endl;;
exit(-1);
auto it=find(c_keywords.begin(),c_keywords.end(),text);
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
print_debug("c keyword: "+type);
} else {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string for c keyword ("+type+"): "+text);
} else {
cout<<"Warning: C keyword is empty: "<<text<<endl;
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string for c keyword ("+type+"): "+text);
}
}
}
} else if (find(miscellaneous.begin(),miscellaneous.end(),type)!=miscellaneous.end()) {
auto it=find(miscellaneous.begin(),miscellaneous.end(),type);
@@ -452,37 +417,42 @@ vector<unsigned char> process_file_nodes(vector<TSNode> *nodes,string code,vecto
print_debug("miscellaneous: "+type);
} else {
cout<<"Error: unknow miscellaneous, that shouldn't happen: "<<type<<endl;;
exit(-1);
// exit(-1);
}
} else if (type=="comment") {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
cout<<"Error: comment in reccurences map not found: "<<text<<endl;;
exit(-1);
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string for comment("+type+"): "+text);
} else {
cout<<"Warning: unknow node is empty: "<<text<<endl;
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string for comment ("+type+"): "+text);
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for comment");
}
} else if (type=="\"") {
if (i+1<nodes->size()) {
if (string(ts_node_type(nodes->at(i+1)))=="\"") {
auto it=find(delimiter.begin(),delimiter.end(),"");
size_t index=distance(delimiter.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
print_debug("double quotes mark, inserting delimiter for empty string");
i++;
} else {
CCC_ADD_COMPOMENT(out,CCC_QUOTE);
print_debug("single quote mark");
}
}
} else {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
cout<<"Error: unknow node type: "<<type<<endl;
cout<<"Error: unknow node text: "<<text<<endl;
exit(-1);
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string for unknow node ("+type+"): "+text);
} else {
cout<<"Warning: unknow node is empty: "<<text<<endl;
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string for unknow node ("+type+"): "+text);
}
}
}
vector<unsigned char> payload_bytes;
@@ -517,6 +487,7 @@ void construct_rec_table(vector<string> &files_content,vector<string> files_name
}
}
int main(int argc,char **argv) {
cout<<c_keywords.size()<<endl;
if (argc<2) {
cout<<"Usage: ccc [FILES]"<<endl;
return -1;