684 lines
21 KiB
C++
684 lines
21 KiB
C++
#include <iostream>
|
|
#include <filesystem>
|
|
#include <fstream>
|
|
#include <stdint.h>
|
|
#include <string>
|
|
#include <map>
|
|
#include <vector>
|
|
#include <iterator>
|
|
#include <algorithm>
|
|
#include <tree_sitter/api.h>
|
|
#include <tree_sitter/tree-sitter-c.h>
|
|
#include <lzma.h>
|
|
using namespace std;
|
|
namespace fs=filesystem;
|
|
// ---------------------------------------------------------------------------
// Bit patterns of the CCC token stream (first listed bit is emitted first).
// Every token starts with one of the heads below; the generate_* functions
// append a fixed- or variable-width index after the 3-bit heads.
// ---------------------------------------------------------------------------

// 3-bit head of a C keyword reference (see generate_c_keyword).
const vector<bool> CCC_C_KEYYORD_HEAD                    {0,0,0};

// NOTE(review): not referenced anywhere in this file. The pattern reads as the
// delimiter head {0,1,1} followed by 1001 (= 9), while " " sits at index 8 of
// `delimiter` — confirm the intended value before using it.
const vector<bool> CCC_SPACE                             {0,1,1,1,0,0,1};

// Conditional preprocessor directives: complete 6-bit codes starting with 0,0,1.
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IF       {0,0,1, 0,0,0};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IFDEF    {0,0,1, 0,0,1};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_IFNDEF   {0,0,1, 0,1,0};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELSE     {0,0,1, 0,1,1};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIF     {0,0,1, 1,0,0};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF  {0,0,1, 1,0,1};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF {0,0,1, 1,1,0};
const vector<bool> CCC_PREPROCESSOR_CONDITIONAL_ENDIF    {0,0,1, 1,1,1};

// Remaining preprocessor directives and the quote mark: complete 6-bit codes
// starting with 0,1,0.
const vector<bool> CCC_PREPROCESSOR_OTHER_DEFINE         {0,1,0, 0,0,0};
const vector<bool> CCC_PREPROCESSOR_OTHER_UNDEF          {0,1,0, 0,0,1};
const vector<bool> CCC_PREPROCESSOR_OTHER_INCLUDE        {0,1,0, 0,1,0};
const vector<bool> CCC_PREPROCESSOR_OTHER_ERROR          {0,1,0, 0,1,1};
const vector<bool> CCC_PREPROCESSOR_OTHER_WARNING        {0,1,0, 1,0,0};
const vector<bool> CCC_PREPROCESSOR_OTHER_PRAGMA         {0,1,0, 1,0,1};
const vector<bool> CCC_PREPROCESSOR_OTHER_LINE           {0,1,0, 1,1,0};
const vector<bool> CCC_QUOTE                             {0,1,0, 1,1,1};

// 3-bit heads of the table references (see generate_delimiter,
// generate_other_grammar, generate_miscellaneous and generate_rec).
const vector<bool> CCC_DELIMITER_HEAD                    {0,1,1};
const vector<bool> CCC_OTHER_GRAMMAR_HEAD                {1,0,0};
const vector<bool> CCC_MISCELLANEOUS_HEAD                {1,0,1};
const vector<bool> CCC_REC_TABLE_REF_HEAD                {1,1,0};

// Inline strings: 4-bit head, then 7 bits (ASCII) or 8 bits (UTF-8) per
// character, closed by an all-zero character of the same width
// (see generate_string_content).
const vector<bool> CCC_STRING_ASCII                      {1,1,1,0};
const vector<bool> CCC_STRING_UTF8                       {1,1,1,1};
const vector<bool> CCC_STRING_END_ASCII                  {0,0,0,0,0,0,0};
const vector<bool> CCC_STRING_END_UTF8                   {0,0,0,0,0,0,0,0};
|
|
// Appends every element of `tail` to the end of `vec`.
//
// `tail` is evaluated exactly once and copied into a local before inserting,
// so it may be a temporary (e.g. the result of a generate_* call) and may even
// be `vec` itself. Both arguments are parenthesized so arbitrary expressions
// can be passed, and the local uses a reserved-looking name so it cannot
// collide with a caller variable such as `tmp`.
#define CCC_ADD_COMPOMENT(vec,tail) \
    do { \
        auto ccc_add_component_tail_=(tail); \
        (vec).insert((vec).end(),ccc_add_component_tail_.begin(),ccc_add_component_tail_.end()); \
    } while (0)
|
|
// Delimiter table. A token's position in this list is the 4-bit value
// written by generate_delimiter(), so the order must not change.
// The empty string entry is what process_file_nodes() emits for an empty
// string token.
const vector<string> delimiter={
    /*  0 */ "\n", /*  1 */ "\t", /*  2 */ "{",  /*  3 */ "}",
    /*  4 */ "(",  /*  5 */ ")",  /*  6 */ "[",  /*  7 */ "]",
    /*  8 */ " ",  /*  9 */ "{}", /* 10 */ "()", /* 11 */ "[]",
    /* 12 */ "",   /* 13 */ ";",  /* 14 */ ",",  /* 15 */ "."
};
|
|
// Single-character operator table. A token's position in this list is the
// 4-bit value written by generate_other_grammar(), so the order must not
// change.
const vector<string> other_grammer={
    /*  0 */ "!", /*  1 */ "%", /*  2 */ "'", /*  3 */ "*",
    /*  4 */ "+", /*  5 */ "-", /*  6 */ "/", /*  7 */ ":",
    /*  8 */ "<", /*  9 */ ">", /* 10 */ "=", /* 11 */ "?",
    /* 12 */ "^", /* 13 */ "|", /* 14 */ "&", /* 15 */ "~"
};
|
|
// Multi-character operators and common type/macro names. A token's position
// in this list is the 5-bit value written by generate_miscellaneous(), so the
// order must not change.
const vector<string> miscellaneous={
    // compound assignment (0-9)
    "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=",
    // increment / decrement / shifts (10-13)
    "++", "--", "<<", ">>",
    // comparisons (14-17)
    "==", "!=", "<=", ">=",
    // member access, ellipsis, logical operators (18-21)
    "->", "...", "||", "&&",
    // frequently used identifiers (22-31)
    "NULL", "size_t",
    "uint8_t", "uint16_t", "uint32_t", "uint64_t",
    "int8_t", "int16_t", "int32_t", "int64_t"
};
|
|
// C keyword table. A keyword's position in this list is the 6-bit value
// written by generate_c_keyword(), so the order must not change.
//
// NOTE(review): "typeof_unequal" looks like a typo for the C23 keyword
// "typeof_unqual". It is left untouched here because the entry's index is
// part of the encoded format — confirm against the decoder's table before
// changing it.
const vector<string> c_keywords={
    /*  0 */ "alignas",  "alignof",  "auto",          "bool",
    /*  4 */ "break",    "case",     "char",          "const",
    /*  8 */ "constexpr","continue", "default",       "do",
    /* 12 */ "double",   "else",     "enum",          "extern",
    /* 16 */ "false",    "float",    "for",           "goto",
    /* 20 */ "if",       "inline",   "int",           "long",
    /* 24 */ "nullptr",  "register", "restrict",      "return",
    /* 28 */ "short",    "signed",   "sizeof",        "static",
    /* 32 */ "static_assert", "struct", "switch",     "thread_local",
    /* 36 */ "true",     "typedef",  "typeof",        "typeof_unequal",
    /* 40 */ "union",    "unsigned", "void",          "volatile",
    /* 44 */ "while",    "__asm__",  "__attribute__"
};
|
|
// A named symbol together with an integer score.
// Not referenced by any other code visible in this file.
struct symbol {
    string name;   // symbol text
    int    score;  // left uninitialized by default construction
};
|
|
// One node of the byte-keyed trie built by insert().
struct node {
    // Outgoing edges, keyed by the next byte of the stored string.
    // The pointers are raw and are allocated with `new` by insert().
    map<unsigned char,node*> children;

    // Id of the string that ends at this node, or -1 when none does.
    int token_id{-1};
};
|
|
// Stores `str` in the trie rooted at `root` and tags its final node with `id`.
//
// Walks the trie one byte at a time, allocating (with `new`) any child node
// that does not exist yet. Inserting the empty string tags `root` itself.
// An id already stored for the same string is overwritten.
void insert(node* root,string str,int id) {
    node* cursor=root;
    for (const char ch:str) {
        // try_emplace only adds the key when it is missing, so an existing
        // child is reused and a missing one is created with a single lookup.
        const auto [slot,created]=cursor->children.try_emplace(static_cast<unsigned char>(ch),nullptr);
        if (created) {
            slot->second=new node();
        }
        cursor=slot->second;
    }
    cursor->token_id=id;
}
|
|
// Result of encoding one input file (filled in by main()).
struct processed_file {
    string                path;                   // path of the input file as given on the command line
    uint32_t              payload_size;           // number of bytes stored in `payload`
    vector<unsigned char> payload;                // encoded token stream, LZMA-compressed or raw
    bool                  is_payload_compressed;  // true when `payload` holds the LZMA-compressed form
};
|
|
// ---- Program-wide state ----------------------------------------------------

// Leaf syntax nodes of every parsed file, keyed by file path and kept in the
// order get_all_nodes() visited them.
map<string,vector<TSNode>> all_tokens;

// Occurrence counter for token texts; filled through get_all_nodes().
map<string,int> rec_map;

// Texts selected from `rec_map` by construct_rec_table(); a token found here
// is encoded as an index into this list.
vector<string> rec_list;

// Enabled by the `-v` command-line flag; gates print_debug().
bool debug{false};
|
|
// Collects the leaves of the syntax tree below `node`, depth first.
//
// Every leaf (a node without children) is appended to all_tokens[file].
// In addition its text is counted in `rec_map`:
//   * literal/identifier-like leaves are incremented by one per occurrence;
//   * "primitive_type" leaves are counted only when their text is not in
//     c_keywords;
//   * "comment" leaves are set to 2 outright.
//
// source_code  full text of the file the tree was parsed from
// rec_map      occurrence counter to update
// file         key under which the leaves are stored in all_tokens
void get_all_nodes(TSNode node,const string &source_code,map<string,int> &rec_map,const string& file) {
    const uint32_t child_count=ts_node_child_count(node);

    // Inner node: recurse into every child and stop.
    if (child_count!=0) {
        for (uint32_t idx=0;idx<child_count;++idx) {
            get_all_nodes(ts_node_child(node,idx),source_code,rec_map,file);
        }
        return;
    }

    // Leaf node: remember it and update the occurrence counter.
    all_tokens[file].push_back(node);

    const uint32_t first_byte=ts_node_start_byte(node);
    const string text=source_code.substr(first_byte,ts_node_end_byte(node)-first_byte);
    const string kind=ts_node_type(node);

    static const char *const counted_kinds[]={
        "string_content",
        "system_lib_string",
        "identifier",
        "number_literal",
        "type_identifier",
        "field_identifier",
        "escape_sequence",
        "statement_identifier"
    };

    // The three cases are mutually exclusive because they test distinct kinds.
    if (find(std::begin(counted_kinds),std::end(counted_kinds),kind)!=std::end(counted_kinds)) {
        ++rec_map[text];
    } else if (kind=="primitive_type") {
        const bool is_keyword=find(c_keywords.begin(),c_keywords.end(),text)!=c_keywords.end();
        if (!is_keyword) {
            ++rec_map[text];
        }
    } else if (kind=="comment") {
        rec_map[text]=2;
    }
}
|
|
// Expands `c` into its 8 bits, most significant bit first.
vector<bool> byte_to_bits(unsigned char c) {
    vector<bool> bits;
    for (unsigned mask=0x80u;mask!=0;mask>>=1) {
        bits.push_back((c&mask)!=0);
    }
    return bits;
}
|
|
// Expands the low 7 bits of `c` into a bit list, most significant first.
// Bit 7 of `c` is ignored.
vector<bool> ascii_to_bits(unsigned char c) {
    vector<bool> bits;
    for (unsigned mask=0x40u;mask!=0;mask>>=1) {
        bits.push_back((c&mask)!=0);
    }
    return bits;
}
|
|
// Encodes a reference to c_keywords[index]:
// CCC_C_KEYYORD_HEAD followed by the low 6 bits of `index`, MSB first.
// Higher bits of `index` are not written.
vector<bool> generate_c_keyword(size_t index) {
    vector<bool> bits(CCC_C_KEYYORD_HEAD);
    for (size_t mask=size_t{1}<<5;mask!=0;mask>>=1) {
        bits.push_back((index&mask)!=0);
    }
    return bits;
}
|
|
// Encodes a reference to entry `index` of the recurrence table:
// CCC_REC_TABLE_REF_HEAD followed by `index`, MSB first.
//
// The index field is W+1 bits wide, where W is the number of significant bits
// of `total_recs` (W = 0 for total_recs == 0).
// NOTE(review): W bits would already be enough for any index < total_recs,
// so the extra leading bit looks like an off-by-one. It is preserved here
// because the width is part of the encoded format — confirm with the decoder
// before tightening it.
vector<bool> generate_rec(size_t index,size_t total_recs) {
    // W = bit width of total_recs.
    size_t width=0;
    for (size_t rest=total_recs;rest!=0;rest>>=1) {
        ++width;
    }

    vector<bool> bits(CCC_REC_TABLE_REF_HEAD);
    // Emit bit positions width, width-1, ..., 0.
    for (size_t shift=width+1;shift-->0;) {
        bits.push_back(((index>>shift)&0x01)!=0);
    }
    return bits;
}
|
|
// Encodes a reference to delimiter[index]:
// CCC_DELIMITER_HEAD followed by the low 4 bits of `index`, MSB first.
// Higher bits of `index` are not written.
vector<bool> generate_delimiter(size_t index) {
    vector<bool> bits(CCC_DELIMITER_HEAD);
    for (size_t mask=size_t{1}<<3;mask!=0;mask>>=1) {
        bits.push_back((index&mask)!=0);
    }
    return bits;
}
|
|
// Encodes a reference to other_grammer[index]:
// CCC_OTHER_GRAMMAR_HEAD followed by the low 4 bits of `index`, MSB first.
// Higher bits of `index` are not written.
vector<bool> generate_other_grammar(size_t index) {
    vector<bool> bits(CCC_OTHER_GRAMMAR_HEAD);
    for (size_t mask=size_t{1}<<3;mask!=0;mask>>=1) {
        bits.push_back((index&mask)!=0);
    }
    return bits;
}
|
|
// Encodes a reference to miscellaneous[index]:
// CCC_MISCELLANEOUS_HEAD followed by the low 5 bits of `index`, MSB first.
// Higher bits of `index` are not written.
vector<bool> generate_miscellaneous(size_t index) {
    vector<bool> bits(CCC_MISCELLANEOUS_HEAD);
    for (size_t mask=size_t{1}<<4;mask!=0;mask>>=1) {
        bits.push_back((index&mask)!=0);
    }
    return bits;
}
|
|
vector<bool> generate_string_content(string str) {
|
|
vector<bool> out;
|
|
bool is_utf8=false;
|
|
for (auto c:str) {
|
|
if (c>127) {
|
|
is_utf8=true;
|
|
break;
|
|
}
|
|
}
|
|
if (is_utf8) {
|
|
CCC_ADD_COMPOMENT(out,CCC_STRING_UTF8);
|
|
for (auto c:str) {
|
|
CCC_ADD_COMPOMENT(out,byte_to_bits(c));
|
|
}
|
|
CCC_ADD_COMPOMENT(out,CCC_STRING_END_UTF8);
|
|
} else {
|
|
CCC_ADD_COMPOMENT(out,CCC_STRING_ASCII);
|
|
for (auto c:str) {
|
|
CCC_ADD_COMPOMENT(out,ascii_to_bits(c));
|
|
}
|
|
CCC_ADD_COMPOMENT(out,CCC_STRING_END_ASCII);
|
|
}
|
|
return out;
|
|
}
|
|
void print_debug(string text) {
|
|
if (debug==true) {
|
|
cout<<text<<endl;
|
|
}
|
|
}
|
|
vector<unsigned char> process_file_nodes(vector<TSNode> *nodes,string code,vector<string> &rec_list) {
|
|
vector<bool> out;
|
|
for (int i=0;i<nodes->size();i++) {
|
|
string type=string(ts_node_type(nodes->at(i)));
|
|
if (type=="#if") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IF);
|
|
print_debug("if");
|
|
} else if (type=="#ifdef") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFDEF);
|
|
print_debug("ifdef");
|
|
} else if (type=="#ifndef") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_IFNDEF);
|
|
print_debug("ifndef");
|
|
} else if (type=="#else") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELSE);
|
|
print_debug("else");
|
|
} else if (type=="#elif") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIF);
|
|
print_debug("elif");
|
|
} else if (type=="#elifdef") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFDEF);
|
|
print_debug("elifdef");
|
|
} else if (type=="#elifndef") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ELIFNDEF);
|
|
print_debug("elifndef");
|
|
} else if (type=="#endif") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_CONDITIONAL_ENDIF);
|
|
print_debug("endif");
|
|
} else if (type=="#define") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_DEFINE);
|
|
print_debug("define");
|
|
} else if (type=="#undef") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_UNDEF);
|
|
print_debug("undef");
|
|
} else if (type=="#include") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_INCLUDE);
|
|
print_debug("include");
|
|
} else if (type=="#error") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_ERROR);
|
|
print_debug("error");
|
|
} else if (type=="#warning") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_WARNING);
|
|
print_debug("warning");
|
|
} else if (type=="#pragma") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_PRAGMA);
|
|
print_debug("pragma");
|
|
} else if (type=="#line") {
|
|
CCC_ADD_COMPOMENT(out,CCC_PREPROCESSOR_OTHER_LINE);
|
|
print_debug("line");
|
|
} else if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="type_identifier" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") {
|
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
|
auto it=find(rec_list.begin(),rec_list.end(),text);
|
|
if (it==rec_list.end()) {
|
|
if (!text.empty()) {
|
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
|
CCC_ADD_COMPOMENT(out,generate_string_content(text));
|
|
print_debug("string ("+type+"): "+text);
|
|
} else {
|
|
auto it=find(delimiter.begin(),delimiter.end(),"");
|
|
size_t index=distance(delimiter.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
|
|
print_debug("delimiter for empty string");
|
|
}
|
|
} else {
|
|
size_t index=distance(rec_list.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
|
|
print_debug("rec_table for string ("+type+"): "+text);
|
|
}
|
|
} else if (type=="primitive_type") {
|
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
|
auto it=find(c_keywords.begin(),c_keywords.end(),text);
|
|
if (it!=c_keywords.end()) {
|
|
size_t index=distance(c_keywords.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
|
|
print_debug("primitive_type: "+text);
|
|
} else {
|
|
auto it=find(rec_list.begin(),rec_list.end(),text);
|
|
if (it==rec_list.end()) {
|
|
if (!text.empty()) {
|
|
CCC_ADD_COMPOMENT(out,generate_string_content(text));
|
|
print_debug("string ("+type+"): "+text);
|
|
} else {
|
|
cout<<"Error: provided primitive is empty: "<<text<<endl;;
|
|
exit(-1);
|
|
}
|
|
} else {
|
|
size_t index=distance(rec_list.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
|
|
print_debug("rec_table for string ("+type+"): "+text);
|
|
}
|
|
}
|
|
} else if (find(delimiter.begin(),delimiter.end(),type)!=delimiter.end()) {
|
|
string text;
|
|
if (type=="(" && i+1<nodes->size()) {
|
|
if (string(ts_node_type(nodes->at(i+1)))==")") {
|
|
text="()";
|
|
i++;
|
|
} else {
|
|
text="(";
|
|
}
|
|
} else if (type=="[" && i+1<nodes->size()) {
|
|
if (string(ts_node_type(nodes->at(i+1)))=="]") {
|
|
text="[]";
|
|
i++;
|
|
} else {
|
|
text="[";
|
|
}
|
|
} else if (type=="{" && i+1<nodes->size()) {
|
|
if (string(ts_node_type(nodes->at(i+1)))=="}") {
|
|
text="{}";
|
|
i++;
|
|
} else {
|
|
text="{";
|
|
}
|
|
} else {
|
|
text=type;
|
|
}
|
|
auto it=find(delimiter.begin(),delimiter.end(),text);
|
|
if (it!=delimiter.end()) {
|
|
size_t index=distance(delimiter.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
|
|
print_debug("delimiter: "+text);
|
|
} else {
|
|
cout<<"Error: unknow delimiter, that shouldn't happen: "<<text<<endl;;
|
|
exit(-1);
|
|
}
|
|
} else if (find(other_grammer.begin(),other_grammer.end(),type)!=other_grammer.end()) {
|
|
auto it=find(other_grammer.begin(),other_grammer.end(),type);
|
|
if (it!=other_grammer.end()) {
|
|
size_t index=distance(other_grammer.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_other_grammar(index));
|
|
print_debug("other grammar: "+type);
|
|
} else {
|
|
cout<<"Error: unknow other grammar symbol, that shouldn't happen: "<<type<<endl;;
|
|
exit(-1);
|
|
}
|
|
} else if (find(c_keywords.begin(),c_keywords.end(),type)!=c_keywords.end()) {
|
|
auto it=find(c_keywords.begin(),c_keywords.end(),type);
|
|
if (it!=c_keywords.end()) {
|
|
size_t index=distance(c_keywords.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_c_keyword(index));
|
|
print_debug("c keyword: "+type);
|
|
} else {
|
|
cout<<"Error: unknow C keyword, that shouldn't happen: "<<type<<endl;;
|
|
exit(-1);
|
|
}
|
|
} else if (find(miscellaneous.begin(),miscellaneous.end(),type)!=miscellaneous.end()) {
|
|
auto it=find(miscellaneous.begin(),miscellaneous.end(),type);
|
|
if (it!=miscellaneous.end()) {
|
|
size_t index=distance(miscellaneous.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_miscellaneous(index));
|
|
print_debug("miscellaneous: "+type);
|
|
} else {
|
|
cout<<"Error: unknow miscellaneous, that shouldn't happen: "<<type<<endl;;
|
|
exit(-1);
|
|
}
|
|
} else if (type=="comment") {
|
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
|
auto it=find(rec_list.begin(),rec_list.end(),text);
|
|
if (it==rec_list.end()) {
|
|
cout<<"Error: comment in reccurences map not found: "<<text<<endl;;
|
|
exit(-1);
|
|
} else {
|
|
size_t index=distance(rec_list.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_rec(index,rec_list.size()));
|
|
print_debug("rec_table for comment");
|
|
}
|
|
} else if (type=="\"") {
|
|
if (i+1<nodes->size()) {
|
|
if (string(ts_node_type(nodes->at(i+1)))=="\"") {
|
|
auto it=find(delimiter.begin(),delimiter.end(),"");
|
|
size_t index=distance(delimiter.begin(),it);
|
|
CCC_ADD_COMPOMENT(out,generate_delimiter(index));
|
|
print_debug("double quotes mark, inserting delimiter for empty string");
|
|
i++;
|
|
} else {
|
|
CCC_ADD_COMPOMENT(out,CCC_QUOTE);
|
|
print_debug("single quote mark");
|
|
}
|
|
}
|
|
} else {
|
|
string text=code.substr(ts_node_start_byte(nodes->at(i)),ts_node_end_byte(nodes->at(i))-ts_node_start_byte(nodes->at(i)));
|
|
cout<<"Error: unknow node type: "<<type<<endl;
|
|
cout<<"Error: unknow node text: "<<text<<endl;
|
|
exit(-1);
|
|
}
|
|
}
|
|
vector<unsigned char> payload_bytes;
|
|
unsigned char current=0;
|
|
size_t bit_index=0;
|
|
for (bool b:out) {
|
|
current|=(b<<(7-bit_index));
|
|
bit_index++;
|
|
if (bit_index==8) {
|
|
payload_bytes.push_back(current);
|
|
current=0;
|
|
bit_index=0;
|
|
}
|
|
}
|
|
if (bit_index!=0) {
|
|
payload_bytes.push_back(current);
|
|
}
|
|
return payload_bytes;
|
|
}
|
|
void construct_rec_table(vector<string> &files_content,vector<string> files_names) {
|
|
for (int i=0;i<files_content.size();i++) {
|
|
TSParser *parser=ts_parser_new();
|
|
ts_parser_set_language(parser,tree_sitter_c());
|
|
TSTree *tree=ts_parser_parse_string(parser,nullptr,files_content[i].c_str(),files_content[i].size());
|
|
TSNode root=ts_tree_root_node(tree);
|
|
get_all_nodes(root,files_content[i],rec_map,files_names[i]);
|
|
}
|
|
for (auto s:rec_map) {
|
|
if (s.second>=2 and s.first.size()>=3) {
|
|
rec_list.push_back(s.first);
|
|
}
|
|
}
|
|
}
|
|
int main(int argc,char **argv) {
|
|
if (argc<2) {
|
|
cout<<"Usage: ccc [FILES]"<<endl;
|
|
return -1;
|
|
}
|
|
vector<string> files;
|
|
for (int i=1;i<argc;i++) {
|
|
string file=string(argv[i]);
|
|
if (file=="-v") {
|
|
debug=true;
|
|
continue;
|
|
}
|
|
if (!fs::exists(file)) {
|
|
cout<<"Error: file doesn't exist: "<<file<<endl;
|
|
return -1;
|
|
}
|
|
files.push_back(file);
|
|
}
|
|
vector<string> files_content;
|
|
for (auto f:files) {
|
|
ifstream file(f,ios::binary);
|
|
if (!file) {
|
|
cout<<"Error: couldn't open provided file."<<endl;
|
|
return -1;
|
|
}
|
|
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
|
|
files_content.push_back(code);
|
|
}
|
|
construct_rec_table(files_content,files);
|
|
vector<processed_file> files_archive;
|
|
for (int i=0;i<files_content.size();i++) {
|
|
processed_file pfile;
|
|
pfile.path=files[i];
|
|
auto payload_bytes=process_file_nodes(&(all_tokens.at(pfile.path)),files_content[i],rec_list);
|
|
vector<unsigned char> payload_compressed;
|
|
payload_compressed.resize(payload_bytes.size()+payload_bytes.size()/3+128);
|
|
lzma_stream strm=LZMA_STREAM_INIT;
|
|
if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
|
|
cout<<"Error: couldn't initialize LZMA compressor for file: "<<files[i]<<endl;
|
|
return -1;
|
|
}
|
|
strm.next_in=payload_bytes.data();
|
|
strm.avail_in=payload_bytes.size();
|
|
strm.next_out=payload_compressed.data();
|
|
strm.avail_out=payload_compressed.size();
|
|
auto ret=lzma_code(&strm,LZMA_FINISH);
|
|
if (ret!=LZMA_STREAM_END) {
|
|
cout<<"Error: couldn't compress payload for file: "<<files[i]<<endl;
|
|
return -1;
|
|
}
|
|
size_t compressed_size=payload_compressed.size()-strm.avail_out;
|
|
payload_compressed.resize(compressed_size);
|
|
size_t original_size=payload_bytes.size();
|
|
lzma_end(&strm);
|
|
if (compressed_size>=original_size) {
|
|
pfile.is_payload_compressed=false;
|
|
pfile.payload=payload_bytes;
|
|
pfile.payload_size=original_size;
|
|
} else {
|
|
pfile.is_payload_compressed=true;
|
|
pfile.payload=payload_compressed;
|
|
pfile.payload_size=compressed_size;
|
|
}
|
|
cout<<i+1<<" file(s) done on "<<files.size()<<": "<<files[i]<<endl;
|
|
}
|
|
exit(0);
|
|
|
|
|
|
|
|
|
|
|
|
// auto payload=process_all_nodes(&all_tokens[0],files_content[0],rec_list);
|
|
// vector<unsigned char> rec_table;
|
|
// for (int i=0;i<rec_list.size();i++) {
|
|
// for (auto c:rec_list[i]) {
|
|
// rec_table.push_back(c);
|
|
// }
|
|
// rec_table.push_back('\0');
|
|
// }
|
|
// vector<unsigned char> rec_table_compressed;
|
|
// rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128);
|
|
// lzma_stream strm=LZMA_STREAM_INIT;
|
|
// if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
|
|
// cout<<"Error: couldn't initialize LZMA compressor."<<endl;
|
|
// return -1;
|
|
// }
|
|
// strm.next_in=rec_table.data();
|
|
// strm.avail_in=rec_table.size();
|
|
// strm.next_out=rec_table_compressed.data();
|
|
// strm.avail_out=rec_table_compressed.size();
|
|
// lzma_ret ret=lzma_code(&strm,LZMA_FINISH);
|
|
// if (ret!=LZMA_STREAM_END) {
|
|
// cout<<"Error: couldn't compress reccurences table."<<endl;
|
|
// return -1;
|
|
// }
|
|
// size_t compressed_size=rec_table_compressed.size()-strm.avail_out;
|
|
// rec_table_compressed.resize(compressed_size);
|
|
// size_t original_size=rec_table.size();
|
|
// lzma_end(&strm);
|
|
// vector<unsigned char> out;
|
|
// out.push_back('C');
|
|
// out.push_back(compressed_size>=original_size?'C':'c');
|
|
// vector<unsigned char> payload_bytes;
|
|
// unsigned char current=0;
|
|
// size_t bit_index=0;
|
|
// for (bool b:payload) {
|
|
// current|=(b<<(7-bit_index));
|
|
// bit_index++;
|
|
// if (bit_index==8) {
|
|
// payload_bytes.push_back(current);
|
|
// current=0;
|
|
// bit_index=0;
|
|
// }
|
|
// }
|
|
// if (bit_index!=0) {
|
|
// payload_bytes.push_back(current);
|
|
// }
|
|
// vector<unsigned char> payload_compressed;
|
|
// payload_compressed.resize(payload_bytes.size()+payload_bytes.size()/3+128);
|
|
// strm=LZMA_STREAM_INIT;
|
|
// if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
|
|
// cout<<"Error: couldn't initialize LZMA compressor."<<endl;
|
|
// return -1;
|
|
// }
|
|
// strm.next_in=payload_bytes.data();
|
|
// strm.avail_in=payload_bytes.size();
|
|
// strm.next_out=payload_compressed.data();
|
|
// strm.avail_out=payload_compressed.size();
|
|
// ret=lzma_code(&strm,LZMA_FINISH);
|
|
// if (ret!=LZMA_STREAM_END) {
|
|
// cout<<"Error: couldn't compress reccurences table."<<endl;
|
|
// return -1;
|
|
// }
|
|
// size_t compressed_size1=payload_compressed.size()-strm.avail_out;
|
|
// payload_compressed.resize(compressed_size1);
|
|
// size_t original_size1=payload_bytes.size();
|
|
// lzma_end(&strm);
|
|
// out.push_back(compressed_size1>=original_size1?'C':'c');
|
|
// if (compressed_size>=original_size) {
|
|
// CCC_ADD_COMPOMENT(out,rec_table);
|
|
// } else {
|
|
// CCC_ADD_COMPOMENT(out,rec_table_compressed);
|
|
// }
|
|
// if (compressed_size1>=original_size1) {
|
|
// CCC_ADD_COMPOMENT(out,payload_bytes);
|
|
// } else {
|
|
// CCC_ADD_COMPOMENT(out,payload_compressed);
|
|
// }
|
|
// ofstream fileout("test.ccc",ios::binary);
|
|
// if (!fileout) {
|
|
// cout<<"Error: couldn't open output file."<<endl;
|
|
// return -1;
|
|
// }
|
|
// fileout.write(reinterpret_cast<const char*>(out.data()),out.size());
|
|
// fileout.close();
|
|
// return 0;
|
|
}
|