whole archive start

This commit is contained in:
2026-02-04 23:13:24 +01:00
parent dea8f2425e
commit f2640c70a5
9 changed files with 1377 additions and 105 deletions

344
ccc.cpp
View File

@@ -9,6 +9,7 @@
#include <algorithm>
#include <tree_sitter/api.h>
#include <tree_sitter/tree-sitter-c.h>
#include <lzma.h>
using namespace std;
namespace fs=filesystem;
const vector<bool> CCC_C_KEYYORD_HEAD {0,0,0};
@@ -157,7 +158,9 @@ const vector<string> c_keywords={
"unsigned",
"void",
"volatile",
"while"
"while",
"__asm__",
"__attribute__"
};
struct symbol {
string name;
@@ -177,10 +180,19 @@ void insert(node* root,string str,int id) {
}
curr->token_id=id;
}
vector<TSNode> all_tokens;
void get_all_nodes(TSNode node,const string &source_code,map<string,int> &rec_map) {
// Result of encoding one input file.
// Every member has a well-defined default, so a freshly declared
// processed_file can be inspected safely before its fields are assigned.
struct processed_file {
    // Path identifying the input file.
    string path;
    // Number of bytes stored in `payload`; 0 until a payload is assigned.
    uint32_t payload_size=0;
    // Encoded bytes of the file, either raw or compressed (see flag below).
    vector<unsigned char> payload;
    // true when `payload` holds the compressed form, false when it is raw.
    bool is_payload_compressed=false;
};
map<string,vector<TSNode>> all_tokens;
map<string,int> rec_map;
vector<string> rec_list;
bool debug=false;
void get_all_nodes(TSNode node,const string &source_code,map<string,int> &rec_map,const string& file) {
if (ts_node_child_count(node)==0) {
all_tokens.push_back(node);
all_tokens[file].push_back(node);
string text=source_code.substr(ts_node_start_byte(node),ts_node_end_byte(node)-ts_node_start_byte(node));
if (string(ts_node_type(node))=="string_content" || string(ts_node_type(node))=="system_lib_string" || string(ts_node_type(node))=="identifier" || string(ts_node_type(node))=="number_literal" || string(ts_node_type(node))=="type_identifier" || string(ts_node_type(node))=="field_identifier" || string(ts_node_type(node))=="escape_sequence" || string(ts_node_type(node))=="statement_identifier") {
rec_map[text]++;
@@ -188,11 +200,14 @@ void get_all_nodes(TSNode node,const string &source_code,map<string,int> &rec_ma
if (string(ts_node_type(node))=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) {
rec_map[text]++;
}
if (string(ts_node_type(node))=="comment") {
rec_map[text]=2;
}
} else {
uint32_t child_count=ts_node_child_count(node);
for (uint32_t i=0;i<child_count;++i) {
TSNode child=ts_node_child(node,i);
get_all_nodes(child,source_code,rec_map);
get_all_nodes(child,source_code,rec_map,file);
}
}
}
@@ -286,55 +301,60 @@ vector<bool> generate_string_content(string str) {
}
return out;
}
vector<bool> process_all_nodes(vector<TSNode> *nodes,string code,vector<string> &rec_list) {
// Writes `text` plus a line break to stdout when the global `debug` flag is
// set; does nothing otherwise.
void print_debug(string text) {
    if (!debug) {
        return;
    }
    cout<<text<<endl;
}
vector<unsigned char> process_file_nodes(vector<TSNode> *nodes,string code,vector<string> &rec_list) {
vector<bool> out;
for (int i=0;i<nodes->size();i++) {
string type=string(ts_node_type(nodes->at(i)));
if (type=="#if") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IF);
cout<<"if"<<endl;
print_debug("if");
} else if (type=="#ifdef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFDEF);
cout<<"ifdef"<<endl;
print_debug("ifdef");
} else if (type=="#ifndef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFNDEF);
cout<<"ifndef"<<endl;
print_debug("ifndef");
} else if (type=="#else") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELSE);
cout<<"else"<<endl;
print_debug("else");
} else if (type=="#elif") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIF);
cout<<"elif"<<endl;
print_debug("elif");
} else if (type=="#elifdef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF);
cout<<"elifdef"<<endl;
print_debug("elifdef");
} else if (type=="#elifndef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF);
cout<<"elifndef"<<endl;
print_debug("elifndef");
} else if (type=="#endif") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ENDIF);
cout<<"endif"<<endl;
print_debug("endif");
} else if (type=="#define") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_DEFINE);
cout<<"define"<<endl;
print_debug("define");
} else if (type=="#undef") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_UNDEF);
cout<<"undef"<<endl;
print_debug("undef");
} else if (type=="#include") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_INCLUDE);
cout<<"include"<<endl;
print_debug("include");
} else if (type=="#error") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_ERROR);
cout<<"error"<<endl;
print_debug("error");
} else if (type=="#warning") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_WARNING);
cout<<"warning"<<endl;
print_debug("warning");
} else if (type=="#pragma") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_PRAGMA);
cout<<"pragma"<<endl;
print_debug("pragma");
} else if (type=="#line") {
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_LINE);
cout<<"line"<<endl;
print_debug("line");
} else if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
auto it=find(rec_list.begin(),rec_list.end(),text);
@@ -342,17 +362,17 @@ vector<bool> process_all_nodes(vector<TSNode> *nodes,string code,vector<string>
if (!text.empty()) {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
CCC_ADD_COMPOMENT(out,generate_string_content(text));
cout<<"string ("<<type<<"): "<<text<<endl;
print_debug("string ("+type+"): "+text);
} else {
auto it=find(delimiter.begin(),delimiter.end(),"");
size_t index=distance(delimiter.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
cout<<"delimiter for empty string"<<endl;
print_debug("delimiter for empty string");
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
cout<<"rec_table for string ("<<type<<"): "<<text<<endl;
print_debug("rec_table for string ("+type+"): "+text);
}
} else if (type=="primitive_type") {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
@@ -360,21 +380,21 @@ vector<bool> process_all_nodes(vector<TSNode> *nodes,string code,vector<string>
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
cout<<"primitive_type: "<<text<<endl;
print_debug("primitive_type: "+text);
} else {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
cout<<"string ("<<type<<"): "<<text<<endl;
print_debug("string ("+type+"): "+text);
} else {
cout<<"Error: provided primitive is empty: "<<text;
cout<<"Error: provided primitive is empty: "<<text<<endl;;
exit(-1);
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
cout<<"rec_table for string ("<<type<<"): "<<text<<endl;
print_debug("rec_table for string ("+type+"): "+text);
}
}
} else if (find(delimiter.begin(),delimiter.end(),type)!=delimiter.end()) {
@@ -407,7 +427,7 @@ vector<bool> process_all_nodes(vector<TSNode> *nodes,string code,vector<string>
if (it!=delimiter.end()) {
size_t index=distance(delimiter.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
cout<<"delimiter: "<<text<<endl;
print_debug("delimiter: "+text);
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<text<<endl;;
exit(-1);
@@ -417,7 +437,7 @@ vector<bool> process_all_nodes(vector<TSNode> *nodes,string code,vector<string>
if (it!=other_grammer.end()) {
size_t index=distance(other_grammer.begin(),it);
CCC_ADD_COMPOMENT(out,generate_other_grammar(index));
cout<<"other grammar: "<<type<<endl;
print_debug("other grammar: "+type);
} else {
cout<<"Error: unknow other grammar symbol, that shouldn't happen: "<<type<<endl;;
exit(-1);
@@ -427,7 +447,7 @@ vector<bool> process_all_nodes(vector<TSNode> *nodes,string code,vector<string>
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
cout<<"c keyword: "<<type<<endl;
print_debug("c keyword: "+type);
} else {
cout<<"Error: unknow C keyword, that shouldn't happen: "<<type<<endl;;
exit(-1);
@@ -437,111 +457,227 @@ vector<bool> process_all_nodes(vector<TSNode> *nodes,string code,vector<string>
if (it!=miscellaneous.end()) {
size_t index=distance(miscellaneous.begin(),it);
CCC_ADD_COMPOMENT(out,generate_miscellaneous(index));
cout<<"miscellaneous: "<<type<<endl;
print_debug("miscellaneous: "+type);
} else {
cout<<"Error: unknow miscellaneous, that shouldn't happen: "<<type<<endl;;
exit(-1);
}
} else if (type=="comment") {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
cout<<"Error: comment in reccurences map not found: "<<text<<endl;;
exit(-1);
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for comment");
}
} else if (type=="\"") {
if (i+1<nodes->size()) {
if (string(ts_node_type(nodes->at(i+1)))=="\"") {
auto it=find(delimiter.begin(),delimiter.end(),"");
size_t index=distance(delimiter.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
cout<<"double quotes mark, inserting delimiter for empty string"<<endl;
print_debug("double quotes mark, inserting delimiter for empty string");
i++;
} else {
CCC_ADD_COMPOMENT(out,CCC_QUOTE);
cout<<"single quote mark"<<endl;
print_debug("single quote mark");
}
}
} else if (type=="comment") {
continue;
} else {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
cout<<"unknow node type: "<<type<<endl;
cout<<"unknow node text: "<<text<<endl;
cout<<"Error: unknow node type: "<<type<<endl;
cout<<"Error: unknow node text: "<<text<<endl;
exit(-1);
}
}
return out;
}
int main(int argc,char **argv) {
if (argc!=2) {
cout<<"Usage: ccc <c file>"<<endl;
return -1;
}
string filepath=string(argv[1]);
if (!fs::exists(filepath)) {
cout<<"Error: provided file doesn't exist."<<endl;
return -1;
}
ifstream file(filepath,ios::binary);
if (!file) {
cout<<"Error: couldn't open provided file."<<endl;
return -1;
}
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
TSParser *parser=ts_parser_new();
ts_parser_set_language(parser,tree_sitter_c());
TSTree *tree=ts_parser_parse_string(parser,nullptr,code.c_str(),code.size());
TSNode root=ts_tree_root_node(tree);
map<string,int> rec_map;
vector<string> rec_list;
get_all_nodes(root,code,rec_map);
for (auto s:rec_map) {
if (s.second>=2 and s.first.size()>=3 && s.first.size()<=256) {
rec_list.push_back(s.first);
}
}
auto payload=process_all_nodes(&all_tokens,code,rec_list);
vector<bool> out={0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,1};
for (int i=63;i>=0;i--) {
bool enabled=(rec_list.size()>>i)&0x01;
out.push_back(enabled);
}
for (int i=0;i<rec_list.size();i++) {
uint8_t size=(uint8_t)rec_list[i].size();
for (int i=7;i>=0;i--) {
bool enabled=(size>>i)&0x01;
out.push_back(enabled);
}
for (auto c:rec_list[i]) {
for (int i=7;i>=0;i--) {
bool enabled=(c>>i)&0x01;
out.push_back(enabled);
}
}
}
CCC_ADD_COMPOMENT(out,payload);
vector<unsigned char> outbytes;
vector<unsigned char> payload_bytes;
unsigned char current=0;
int bit_index=0;
size_t bit_index=0;
for (bool b:out) {
current|=(b<<(7-bit_index));
bit_index++;
if (bit_index==8) {
outbytes.push_back(current);
payload_bytes.push_back(current);
current=0;
bit_index=0;
}
}
if (bit_index!=0) {
outbytes.push_back(current);
payload_bytes.push_back(current);
}
ofstream fileout(filepath+".ccc",ios::binary);
if (!fileout) {
cout<<"Error: couldn't open output file."<<endl;
return payload_bytes;
}
void construct_rec_table(vector<string> &files_content,vector<string> files_names) {
for (int i=0;i<files_content.size();i++) {
TSParser *parser=ts_parser_new();
ts_parser_set_language(parser,tree_sitter_c());
TSTree *tree=ts_parser_parse_string(parser,nullptr,files_content[i].c_str(),files_content[i].size());
TSNode root=ts_tree_root_node(tree);
get_all_nodes(root,files_content[i],rec_map,files_names[i]);
}
for (auto s:rec_map) {
if (s.second>=2 and s.first.size()>=3) {
rec_list.push_back(s.first);
}
}
}
int main(int argc,char **argv) {
if (argc<2) {
cout<<"Usage: ccc [FILES]"<<endl;
return -1;
}
fileout.write(reinterpret_cast<const char*>(outbytes.data()),outbytes.size());
fileout.close();
cout<<"Reccurences map entry count: "<<rec_list.size()<<endl;
size_t total_bytes=0;
for (int i=0;i<rec_list.size();i++) {
total_bytes++;
total_bytes+=rec_list[i].size();
vector<string> files;
for (int i=1;i<argc;i++) {
string file=string(argv[i]);
if (file=="-v") {
debug=true;
continue;
}
if (!fs::exists(file)) {
cout<<"Error: file doesn't exist: "<<file<<endl;
return -1;
}
files.push_back(file);
}
cout<<"Total spaces taken by reccurences map in bytes: "<<total_bytes<<endl;
return 0;
vector<string> files_content;
for (auto f:files) {
ifstream file(f,ios::binary);
if (!file) {
cout<<"Error: couldn't open provided file."<<endl;
return -1;
}
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
files_content.push_back(code);
}
construct_rec_table(files_content,files);
vector<processed_file> files_archive;
for (int i=0;i<files_content.size();i++) {
processed_file pfile;
pfile.path=files[i];
auto payload_bytes=process_file_nodes(&(all_tokens.at(pfile.path)),files_content[i],rec_list);
vector<unsigned char> payload_compressed;
payload_compressed.resize(payload_bytes.size()+payload_bytes.size()/3+128);
lzma_stream strm=LZMA_STREAM_INIT;
if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
cout<<"Error: couldn't initialize LZMA compressor for file: "<<files[i]<<endl;
return -1;
}
strm.next_in=payload_bytes.data();
strm.avail_in=payload_bytes.size();
strm.next_out=payload_compressed.data();
strm.avail_out=payload_compressed.size();
auto ret=lzma_code(&strm,LZMA_FINISH);
if (ret!=LZMA_STREAM_END) {
cout<<"Error: couldn't compress payload for file: "<<files[i]<<endl;
return -1;
}
size_t compressed_size=payload_compressed.size()-strm.avail_out;
payload_compressed.resize(compressed_size);
size_t original_size=payload_bytes.size();
lzma_end(&strm);
if (compressed_size>=original_size) {
pfile.is_payload_compressed=false;
pfile.payload=payload_bytes;
pfile.payload_size=original_size;
} else {
pfile.is_payload_compressed=true;
pfile.payload=payload_compressed;
pfile.payload_size=compressed_size;
}
cout<<i+1<<" file(s) done on "<<files.size()<<": "<<files[i]<<endl;
}
exit(0);
// auto payload=process_all_nodes(&all_tokens[0],files_content[0],rec_list);
// vector<unsigned char> rec_table;
// for (int i=0;i<rec_list.size();i++) {
// for (auto c:rec_list[i]) {
// rec_table.push_back(c);
// }
// rec_table.push_back('\0');
// }
// vector<unsigned char> rec_table_compressed;
// rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128);
// lzma_stream strm=LZMA_STREAM_INIT;
// if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
// cout<<"Error: couldn't initialize LZMA compressor."<<endl;
// return -1;
// }
// strm.next_in=rec_table.data();
// strm.avail_in=rec_table.size();
// strm.next_out=rec_table_compressed.data();
// strm.avail_out=rec_table_compressed.size();
// lzma_ret ret=lzma_code(&strm,LZMA_FINISH);
// if (ret!=LZMA_STREAM_END) {
// cout<<"Error: couldn't compress reccurences table."<<endl;
// return -1;
// }
// size_t compressed_size=rec_table_compressed.size()-strm.avail_out;
// rec_table_compressed.resize(compressed_size);
// size_t original_size=rec_table.size();
// lzma_end(&strm);
// vector<unsigned char> out;
// out.push_back('C');
// out.push_back(compressed_size>=original_size?'C':'c');
// vector<unsigned char> payload_bytes;
// unsigned char current=0;
// size_t bit_index=0;
// for (bool b:payload) {
// current|=(b<<(7-bit_index));
// bit_index++;
// if (bit_index==8) {
// payload_bytes.push_back(current);
// current=0;
// bit_index=0;
// }
// }
// if (bit_index!=0) {
// payload_bytes.push_back(current);
// }
// vector<unsigned char> payload_compressed;
// payload_compressed.resize(payload_bytes.size()+payload_bytes.size()/3+128);
// strm=LZMA_STREAM_INIT;
// if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
// cout<<"Error: couldn't initialize LZMA compressor."<<endl;
// return -1;
// }
// strm.next_in=payload_bytes.data();
// strm.avail_in=payload_bytes.size();
// strm.next_out=payload_compressed.data();
// strm.avail_out=payload_compressed.size();
// ret=lzma_code(&strm,LZMA_FINISH);
// if (ret!=LZMA_STREAM_END) {
// cout<<"Error: couldn't compress reccurences table."<<endl;
// return -1;
// }
// size_t compressed_size1=payload_compressed.size()-strm.avail_out;
// payload_compressed.resize(compressed_size1);
// size_t original_size1=payload_bytes.size();
// lzma_end(&strm);
// out.push_back(compressed_size1>=original_size1?'C':'c');
// if (compressed_size>=original_size) {
// CCC_ADD_COMPOMENT(out,rec_table);
// } else {
// CCC_ADD_COMPOMENT(out,rec_table_compressed);
// }
// if (compressed_size1>=original_size1) {
// CCC_ADD_COMPOMENT(out,payload_bytes);
// } else {
// CCC_ADD_COMPOMENT(out,payload_compressed);
// }
// ofstream fileout("test.ccc",ios::binary);
// if (!fileout) {
// cout<<"Error: couldn't open output file."<<endl;
// return -1;
// }
// fileout.write(reinterpret_cast<const char*>(out.data()),out.size());
// fileout.close();
// return 0;
}