// File: ccc/ccc.cpp
// Last modified: 2026-02-06 00:01:23 +01:00
// Size: 672 lines, 20 KiB (C++)

#include <stdint.h>
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
#include <tree_sitter/api.h>
#include <tree_sitter/tree-sitter-c.h>
#include <lzma.h>
using namespace std;
namespace fs=filesystem;
// Bit prefixes that tag each token class in the encoded stream. No prefix is
// the beginning of another one, so the class can be recognised bit by bit.
// Tag for a delimiter0 entry; generate_delimiter0 appends a 3-bit index.
const vector<bool> CCC_DELIMITER_0_HEAD={0};
// Tag for a delimiter1 entry; generate_delimiter1 appends a 2-bit index.
const vector<bool> CCC_DELIMITER_1_HEAD={1,0};
// Tag for a c_keywords entry; generate_c_keyword appends a 6-bit index.
const vector<bool> CCC_C_KEYWORD_HEAD={1,1,0,0};
// Tag for a miscellaneous entry; generate_miscellaneous appends a 6-bit index.
const vector<bool> CCC_MISCELANEOUS_HEAD={1,1,0,1};
// Tag for text stored inline; generate_string_content appends the raw bytes
// followed by CCC_STRING_INLINE_END.
const vector<bool> CCC_STRING_INLINE_HEAD={1,1,1,0};
// Tag for a recurrence-table reference; generate_rec appends the entry index.
const vector<bool> CCC_REC_TABLE_REF_HEAD={1,1,1,1};
// Terminator of an inline string: eight zero bits, i.e. the encoding of a NUL byte.
const vector<bool> CCC_STRING_INLINE_END={0,0,0,0,0,0,0,0};
// Appends every element of `tail` to the end of `vec`.
// `tail` is first copied into a local, so it is evaluated exactly once and may
// be a temporary (a function result) or a braced list such as {0}.
// `vec` is named twice in the expansion, so it must be a plain lvalue without
// side effects.
#define CCC_ADD_COMPOMENT(vec,tail) \
do { \
auto tmp=tail; \
vec.insert(vec.end(),tmp.begin(),tmp.end()); \
} while (0)
// Delimiters encoded with CCC_DELIMITER_0_HEAD plus a 3-bit index.
// The position of an entry is its code, so the order is part of the format;
// the eight entries use up the whole 3-bit range.
const vector<string> delimiter0={
"{",
"}",
"(",
")",
"[",
"]",
",",
"."
};
// Delimiters encoded with CCC_DELIMITER_1_HEAD plus a 2-bit index.
// The bracket pairs stand for an opening bracket immediately followed by its
// closing partner. The code of "{}" is always followed by one extra bit in
// process_file_nodes: 0 means "{}", 1 means a double quote.
// The position of an entry is its code, so the order is part of the format.
const vector<string> delimiter1={
"{}",
"()",
"[]",
";"
};
// Operators, punctuation and a few common type names, encoded with
// CCC_MISCELANEOUS_HEAD plus a 6-bit index (48 of the 64 codes are in use).
// Entries are matched against the tree-sitter node type of a token.
// The position of an entry is its code, so the order is part of the format.
const vector<string> miscellaneous={
"!",
"%",
"'",
"*",
"+",
"-",
"/",
":",
"<",
">",
"=",
"?",
"^",
"|",
"&",
"~",
"+=",
"-=",
"*=",
"/=",
"%=",
"&=",
"|=",
"^=",
"<<=",
">>=",
"++",
"--",
"<<",
">>",
"==",
"!=",
"<=",
">=",
"->",
"...",
"||",
"&&",
"NULL",
"size_t",
"uint8_t",
"uint16_t",
"uint32_t",
"uint64_t",
"int8_t",
"int16_t",
"int32_t",
"int64_t"
};
// Preprocessor directives and C keywords, encoded with CCC_C_KEYWORD_HEAD plus
// a 6-bit index (63 of the 64 codes are in use, so only one more entry fits).
// The position of an entry is its code, so the order is part of the format.
const vector<string> c_keywords={
"#if",
"#ifdef",
"#ifndef",
"#else",
"#elif",
"#elifdef",
"#elifndef",
"#endif",
"#define",
"#undef",
"#include",
"#error",
"#warning",
"#pragma",
"#line",
"alignas",
"alignof",
"auto",
"bool",
"break",
"case",
"char",
"const",
"constexpr",
"continue",
"default",
"do",
"double",
"else",
"enum",
"extern",
"false",
"float",
"for",
"goto",
"if",
"inline",
"int",
"long",
"nullptr",
"register",
"restrict",
"return",
"short",
"signed",
"sizeof",
"static",
"static_assert",
"struct",
"switch",
"thread_local",
"true",
"typedef",
"typeof",
"typeof_unequal",
"union",
"unsigned",
"void",
"volatile",
"while",
"__asm__",
"__attribute__",
"defined",
};
// A name paired with an integer score.
// NOTE(review): nothing in the visible part of this file uses this type;
// confirm whether it is still needed.
struct symbol {
string name;
int score;
};
// Fixed-size header written byte-for-byte at the start of the output file.
// Packing is forced to 1 so no padding bytes end up in the file.
// NOTE(review): the size_t fields make the layout depend on the platform's
// size_t width and byte order.
#pragma pack(push,1)
struct header {
// Signature bytes; main() writes 'C','C','C'.
uint8_t sig[3];
// Bit 0: payload is LZMA-compressed, bit 1: recurrence table is
// LZMA-compressed, bit 2: files table is LZMA-compressed.
uint8_t flags;
// Stored size in bytes of the recurrence table section.
size_t size_rec_table;
// Number of files in the archive.
size_t entry_count;
// Stored size in bytes of the payload section.
// The stored size of the files table has no field of its own.
size_t size_payload;
};
#pragma pack(pop)
// Leaf nodes of every parsed file, keyed by the file name passed to get_all_nodes.
map<string,vector<TSNode>> all_tokens;
// Occurrence counter per token text, filled by get_all_nodes.
map<string,int> rec_map;
// Recurrence table: the texts from rec_map selected by construct_rec_table.
// A token's position in this list is the index written to the bit stream.
vector<string> rec_list;
// When true, print_debug writes its messages to stdout (enabled by "-v").
bool debug=false;
// Walks the syntax tree below `node` depth-first and handles every leaf:
// the leaf is appended to all_tokens[file] and, depending on its node type,
// its source text is counted in `rec_map`.
//   node        - subtree to visit
//   source_code - text the node byte offsets refer to
//   rec_map     - occurrence counter per token text (updated in place)
//   file        - key under which the leaves are stored in all_tokens
void get_all_nodes(TSNode node,const string &source_code,map<string,int> &rec_map,const string& file) {
    const uint32_t child_total=ts_node_child_count(node);
    if (child_total!=0) {
        // Inner node: only its children carry tokens.
        for (uint32_t idx=0;idx<child_total;++idx) {
            get_all_nodes(ts_node_child(node,idx),source_code,rec_map,file);
        }
        return;
    }
    all_tokens[file].push_back(node);
    const uint32_t first_byte=ts_node_start_byte(node);
    const string text=source_code.substr(first_byte,ts_node_end_byte(node)-first_byte);
    const string kind=ts_node_type(node);
    // Node types whose text is always counted.
    static const char *const counted_kinds[]={
        "string_content",
        "system_lib_string",
        "identifier",
        "number_literal",
        "type_identifier",
        "field_identifier",
        "escape_sequence",
        "statement_identifier"
    };
    const bool always_counted=any_of(begin(counted_kinds),end(counted_kinds),[&kind](const char *candidate) {
        return kind==candidate;
    });
    if (always_counted) {
        rec_map[text]++;
    }
    // Primitive types are counted only when they have no keyword code.
    if (kind=="primitive_type" && find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) {
        rec_map[text]++;
    }
    // A comment gets a count of 2 straight away, whatever it was before.
    if (kind=="comment") {
        rec_map[text]=2;
    }
}
// Splits one byte into its eight bits, most significant bit first.
vector<bool> byte_to_bits(unsigned char c) {
    vector<bool> bits(8);
    for (size_t pos=0;pos<8;++pos) {
        bits[pos]=((c>>(7-pos))&1u)!=0;
    }
    return bits;
}
// Builds the code of a C keyword: CCC_C_KEYWORD_HEAD followed by the low six
// bits of `index`, most significant bit first.
vector<bool> generate_c_keyword(size_t index) {
    vector<bool> bits(CCC_C_KEYWORD_HEAD);
    for (size_t mask=size_t(1)<<5;mask!=0;mask>>=1) {
        bits.push_back((index&mask)!=0);
    }
    return bits;
}
// Builds a recurrence-table reference: CCC_REC_TABLE_REF_HEAD followed by
// `index`, most significant bit first. The index width depends on the table
// size: with w = number of significant bits of `total_recs`, w+1 bits are
// written (bit w down to bit 0).
// NOTE(review): for any index < total_recs the first of those bits is always
// 0, so the field is wider than strictly needed. The width is kept as is
// because a reader of the format has to use the same one.
vector<bool> generate_rec(size_t index,size_t total_recs) {
    size_t width=0;
    for (size_t rest=total_recs;rest!=0;rest>>=1) {
        ++width;
    }
    vector<bool> bits(CCC_REC_TABLE_REF_HEAD);
    // Counts shift down from `width` to 0 inclusive.
    for (size_t shift=width+1;shift-->0;) {
        bits.push_back(((index>>shift)&1u)!=0);
    }
    return bits;
}
// Builds the code of a delimiter0 entry: CCC_DELIMITER_0_HEAD followed by the
// low three bits of `index`, most significant bit first.
vector<bool> generate_delimiter0(size_t index) {
    vector<bool> bits(CCC_DELIMITER_0_HEAD);
    for (size_t mask=size_t(1)<<2;mask!=0;mask>>=1) {
        bits.push_back((index&mask)!=0);
    }
    return bits;
}
// Builds the code of a delimiter1 entry: CCC_DELIMITER_1_HEAD followed by the
// low two bits of `index`, most significant bit first.
vector<bool> generate_delimiter1(size_t index) {
    vector<bool> bits(CCC_DELIMITER_1_HEAD);
    for (size_t mask=size_t(1)<<1;mask!=0;mask>>=1) {
        bits.push_back((index&mask)!=0);
    }
    return bits;
}
// Builds the code of a miscellaneous entry: CCC_MISCELANEOUS_HEAD followed by
// the low six bits of `index`, most significant bit first.
vector<bool> generate_miscellaneous(size_t index) {
    vector<bool> bits(CCC_MISCELANEOUS_HEAD);
    for (size_t mask=size_t(1)<<5;mask!=0;mask>>=1) {
        bits.push_back((index&mask)!=0);
    }
    return bits;
}
vector<bool> generate_string_content(string str) {
vector<bool> out;
CCC_ADD_COMPOMENT(out,CCC_STRING_INLINE_HEAD);
for (auto c:str) {
CCC_ADD_COMPOMENT(out,byte_to_bits(c));
}
CCC_ADD_COMPOMENT(out,CCC_STRING_INLINE_END);
return out;
}
// Writes `text` plus a newline to stdout, but only while the global `debug`
// flag is set; otherwise does nothing.
void print_debug(string text) {
    if (!debug) {
        return;
    }
    cout<<text<<endl;
}
vector<unsigned char> process_file_nodes(vector<TSNode> *nodes,string code,vector<string> &rec_list) {
vector<bool> out;
for (int i=0;i<nodes->size();i++) {
string type=string(ts_node_type(nodes->at(i)));
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string ("+type+"): "+text);
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string ("+type+"): "+text);
}
} else if (type=="primitive_type" || type=="type_identifier") {
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
auto it=find(c_keywords.begin(),c_keywords.end(),text);
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
print_debug("type found in c keyword: "+text);
} else {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string for type ("+type+"): "+text);
} else {
cout<<"Warning: provided primitive is empty: "<<text<<endl;
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string for type ("+type+"): "+text);
}
}
} else if (find(delimiter0.begin(),delimiter0.end(),type)!=delimiter0.end() || find(delimiter1.begin(),delimiter1.end(),type)!=delimiter1.end() || type=="\"") {
string insert;
if (type=="(" && i+1<nodes->size()) {
if (string(ts_node_type(nodes->at(i+1)))==")") {
insert="()";
i++;
} else {
insert="(";
}
} else if (type=="[" && i+1<nodes->size()) {
if (string(ts_node_type(nodes->at(i+1)))=="]") {
insert="[]";
i++;
} else {
insert="[";
}
} else if (type=="{" && i+1<nodes->size()) {
if (string(ts_node_type(nodes->at(i+1)))=="}") {
insert="{}";
i++;
} else {
insert="{";
}
} else {
insert=type;
}
auto it=find(delimiter0.begin(),delimiter0.end(),insert);
if (it!=delimiter0.end()) {
size_t index=distance(delimiter0.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter0(index));
print_debug("delimiter 0: "+insert);
} else {
if (insert!="{}" && insert!="\"") {
auto it=find(delimiter1.begin(),delimiter1.end(),insert);
if (it!=delimiter1.end()) {
size_t index=distance(delimiter1.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter1(index));
print_debug("delimiter 1: "+insert);
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<insert<<endl;;
// exit(-1);
}
} else {
if (insert=="{}") {
auto it=find(delimiter1.begin(),delimiter1.end(),"{}");
if (it!=delimiter1.end()) {
size_t index=distance(delimiter1.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter1(index));
CCC_ADD_COMPOMENT(out,{0});
print_debug("delimiter 1: "+insert);
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<insert<<endl;;
// exit(-1);
}
} else if (insert=="\"") {
auto it=find(delimiter1.begin(),delimiter1.end(),"{}");
if (it!=delimiter1.end()) {
size_t index=distance(delimiter1.begin(),it);
CCC_ADD_COMPOMENT(out,generate_delimiter1(index));
CCC_ADD_COMPOMENT(out,{1});
print_debug("delimiter 1: "+insert);
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<insert<<endl;;
exit(-1);
}
} else {
cout<<"Error: unknow delimiter, that shouldn't happen: "<<insert<<endl;;
// exit(-1);
}
}
}
} else if (find(c_keywords.begin(),c_keywords.end(),type)!=c_keywords.end() || type=="preproc_directive") {
if (type!="preproc_directive") {
auto it=find(c_keywords.begin(),c_keywords.end(),type);
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
print_debug("c keyword: "+type);
} else {
cout<<"Error: unknow C keyword, that shouldn't happen: "<<type<<" "<<text<<endl;;
// exit(-1);
}
} else {
auto it=find(c_keywords.begin(),c_keywords.end(),text);
if (it!=c_keywords.end()) {
size_t index=distance(c_keywords.begin(),it);
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
print_debug("c keyword: "+type);
} else {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string for c keyword ("+type+"): "+text);
} else {
cout<<"Warning: C keyword is empty: "<<text<<endl;
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string for c keyword ("+type+"): "+text);
}
}
}
} else if (find(miscellaneous.begin(),miscellaneous.end(),type)!=miscellaneous.end()) {
auto it=find(miscellaneous.begin(),miscellaneous.end(),type);
if (it!=miscellaneous.end()) {
size_t index=distance(miscellaneous.begin(),it);
CCC_ADD_COMPOMENT(out,generate_miscellaneous(index));
print_debug("miscellaneous: "+type);
} else {
cout<<"Error: unknow miscellaneous, that shouldn't happen: "<<type<<endl;;
// exit(-1);
}
} else if (type=="comment") {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string for comment("+type+"): "+text);
} else {
cout<<"Warning: unknow node is empty: "<<text<<endl;
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string for comment ("+type+"): "+text);
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for comment");
}
} else {
auto it=find(rec_list.begin(),rec_list.end(),text);
if (it==rec_list.end()) {
if (!text.empty()) {
CCC_ADD_COMPOMENT(out,generate_string_content(text));
print_debug("string for unknow node ("+type+"): "+text);
} else {
cout<<"Warning: unknow node is empty: "<<text<<endl;
}
} else {
size_t index=distance(rec_list.begin(),it);
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
print_debug("rec_table for string for unknow node ("+type+"): "+text);
}
}
}
vector<unsigned char> payload_bytes;
unsigned char current=0;
size_t bit_index=0;
for (bool b:out) {
current|=(b<<(7-bit_index));
bit_index++;
if (bit_index==8) {
payload_bytes.push_back(current);
current=0;
bit_index=0;
}
}
if (bit_index!=0) {
payload_bytes.push_back(current);
}
return payload_bytes;
}
void construct_rec_table(vector<string> &files_content,vector<string> files_names) {
for (int i=0;i<files_content.size();i++) {
TSParser *parser=ts_parser_new();
ts_parser_set_language(parser,tree_sitter_c());
TSTree *tree=ts_parser_parse_string(parser,nullptr,files_content[i].c_str(),files_content[i].size());
TSNode root=ts_tree_root_node(tree);
get_all_nodes(root,files_content[i],rec_map,files_names[i]);
}
for (auto s:rec_map) {
if (s.second>=2 and s.first.size()>=3) {
rec_list.push_back(s.first);
}
}
}
int main(int argc,char **argv) {
cout<<c_keywords.size()<<endl;
if (argc<2) {
cout<<"Usage: ccc [FILES]"<<endl;
return -1;
}
vector<string> files;
for (int i=1;i<argc;i++) {
string file=string(argv[i]);
if (file=="-v") {
debug=true;
continue;
}
if (!fs::exists(file)) {
cout<<"Error: file doesn't exist: "<<file<<endl;
return -1;
}
files.push_back(file);
}
vector<string> files_content;
for (auto f:files) {
ifstream file(f,ios::binary);
if (!file) {
cout<<"Error: couldn't open provided file."<<endl;
return -1;
}
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
files_content.push_back(code);
}
construct_rec_table(files_content,files);
vector<unsigned char> files_archive;
vector<size_t> payloads_size;
vector<size_t> payloads_start;
for (int i=0;i<files_content.size();i++) {
auto payload_bytes=process_file_nodes(&(all_tokens.at(files[i])),files_content[i],rec_list);
payloads_size.push_back(payload_bytes.size());
payloads_start.push_back(files_archive.size());
CCC_ADD_COMPOMENT(files_archive,payload_bytes);
cout<<i+1<<" file(s) done on "<<files.size()<<": "<<files[i]<<endl;
}
vector<unsigned char> payload_compressed;
payload_compressed.resize(files_archive.size()+files_archive.size()/3+128);
lzma_stream strm=LZMA_STREAM_INIT;
if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
cout<<"Error: couldn't initialize LZMA compressor for file archive."<<endl;
return -1;
}
strm.next_in=files_archive.data();
strm.avail_in=files_archive.size();
strm.next_out=payload_compressed.data();
strm.avail_out=payload_compressed.size();
auto ret=lzma_code(&strm,LZMA_FINISH);
if (ret!=LZMA_STREAM_END) {
cout<<"Error: couldn't compress file archive."<<endl;
return -1;
}
size_t payload_total_size;
size_t compressed_size=payload_compressed.size()-strm.avail_out;
payload_compressed.resize(compressed_size);
size_t original_size=files_archive.size();
lzma_end(&strm);
uint8_t flags=0;
if (compressed_size>=original_size) {
flags&= ~(0b00000001);
payload_total_size=original_size;
} else {
flags|=0b00000001;
payload_total_size=compressed_size;
}
vector<unsigned char> rec_table;
for (int i=0;i<rec_list.size();i++) {
for (auto c:rec_list[i]) {
rec_table.push_back(c);
}
rec_table.push_back('\0');
}
vector<unsigned char> rec_table_compressed;
rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128);
strm=LZMA_STREAM_INIT;
if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
cout<<"Error: couldn't initialize LZMA compressor for reccurences table."<<endl;
return -1;
}
strm.next_in=rec_table.data();
strm.avail_in=rec_table.size();
strm.next_out=rec_table_compressed.data();
strm.avail_out=rec_table_compressed.size();
ret=lzma_code(&strm,LZMA_FINISH);
if (ret!=LZMA_STREAM_END) {
cout<<"Error: couldn't compress reccurences table."<<endl;
return -1;
}
size_t rec_table_total_size;
compressed_size=rec_table_compressed.size()-strm.avail_out;
rec_table_compressed.resize(compressed_size);
original_size=rec_table.size();
lzma_end(&strm);
if (compressed_size>=original_size) {
flags&= ~(0b00000010);
rec_table_total_size=original_size;
} else {
flags|=0b00000010;
rec_table_total_size=compressed_size;
}
vector<unsigned char> files_table;
for (int i=0;i<files.size();i++) {
for (auto c:files[i]) {
files_table.push_back(c);
}
files_table.push_back('\0');
auto file_start=payloads_start[i];
for (int i=0;i<sizeof(size_t);++i) {
files_table.push_back(((uint8_t*)&file_start)[i]);
}
auto file_size=payloads_size[i];
for (int i=0;i<sizeof(size_t);++i) {
files_table.push_back(((uint8_t*)&file_size)[i]);
}
}
vector<unsigned char> files_table_compressed;
files_table_compressed.resize(files_table.size()+files_table.size()/3+128);
strm=LZMA_STREAM_INIT;
if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
cout<<"Error: couldn't initialize LZMA compressor for files table."<<endl;
return -1;
}
strm.next_in=files_table.data();
strm.avail_in=files_table.size();
strm.next_out=files_table_compressed.data();
strm.avail_out=files_table_compressed.size();
ret=lzma_code(&strm,LZMA_FINISH);
if (ret!=LZMA_STREAM_END) {
cout<<"Error: couldn't compress files table."<<endl;
return -1;
}
size_t files_table_total_size;
compressed_size=files_table_compressed.size()-strm.avail_out;
files_table_compressed.resize(compressed_size);
original_size=files_table.size();
lzma_end(&strm);
if (compressed_size>=original_size) {
flags&= ~(0b00000100);
files_table_total_size=original_size;
} else {
flags|=0b00000100;
files_table_total_size=compressed_size;
}
header head;
head.sig[0]='C';
head.sig[1]='C';
head.sig[2]='C';
head.flags=flags;
head.size_payload=payload_total_size;
head.size_rec_table=rec_table_total_size;
head.entry_count=files.size();
vector<unsigned char> out;
for (int i=0;i<sizeof(header);i++) {
out.push_back(((uint8_t*)&head)[i]);
}
if (flags & 0b00000010) {
CCC_ADD_COMPOMENT(out,rec_table_compressed);
} else {
CCC_ADD_COMPOMENT(out,rec_table);
}
if (flags & 0b00000100) {
CCC_ADD_COMPOMENT(out,files_table_compressed);
} else {
CCC_ADD_COMPOMENT(out,files_table);
}
if (flags & 0b00000001) {
CCC_ADD_COMPOMENT(out,payload_compressed);
} else {
CCC_ADD_COMPOMENT(out,files_archive);
}
ofstream fileout("test.ccc",ios::binary);
if (!fileout) {
cout<<"Error: couldn't open output file."<<endl;
return -1;
}
fileout.write(reinterpret_cast<const char*>(out.data()),out.size());
fileout.close();
return 0;
}