final stage
This commit is contained in:
6
build.sh
6
build.sh
@@ -1 +1,5 @@
|
||||
g++ ccc.cpp -o ccc -ltree-sitter -ltree-sitter-c -llzma -Ofast -march=native
|
||||
# git clone https://github.com/facebook/zstd.git
|
||||
# cd zstd
|
||||
# exit
|
||||
# make -j$(nproc) CXXFLAGS="-DZSTD_MULTITHREAD_SUPPORT -DZSTD_MULTITHREAD"
|
||||
g++ ccc.cpp -o ccc -ltree-sitter -ltree-sitter-c -lzstd -lxxhash -llzma -Ofast -march=native
|
||||
|
||||
708
ccc.cpp
708
ccc.cpp
@@ -1,49 +1,90 @@
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <exception>
|
||||
#include <iostream>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <string_view>
|
||||
#include <threads.h>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include <thread>
|
||||
#include <future>
|
||||
#include <queue>
|
||||
#include <chrono>
|
||||
#include <tree_sitter/api.h>
|
||||
#include <tree_sitter/tree-sitter-c.h>
|
||||
#include <lzma.h>
|
||||
#include <xxh3.h>
|
||||
#include <unordered_map>
|
||||
#include <malloc.h>
|
||||
using namespace std;
|
||||
namespace fs=filesystem;
|
||||
const vector<bool> CCC_DELIMITER_0_HEAD={0};
|
||||
const vector<bool> CCC_DELIMITER_1_HEAD={1,0};
|
||||
const vector<bool> CCC_C_KEYWORD_HEAD={1,1,0,0};
|
||||
const vector<bool> CCC_MISCELANEOUS_HEAD={1,1,0,1};
|
||||
const vector<bool> CCC_STRING_INLINE_HEAD={1,1,1,0};
|
||||
const vector<bool> CCC_REC_TABLE_REF_HEAD={1,1,1,1};
|
||||
const vector<bool> CCC_STRING_INLINE_END={0,0,0,0,0,0,0,0};
|
||||
// Token-class prefixes. Each generate_* helper writes one of these first, and
// passes the prefix width (1, 2 or 4 bits) to bit_streamer::write_bits itself.
// Read as bit strings they are 0, 10, 1100, 1101, 1110 and 1111.
constexpr uint64_t CCC_DELIMITER_0_HEAD   = 0b0;
constexpr uint64_t CCC_DELIMITER_1_HEAD   = 0b10;
constexpr uint64_t CCC_C_KEYWORD_HEAD     = 0b1100;
constexpr uint64_t CCC_MISCELANEOUS_HEAD  = 0b1101;
constexpr uint64_t CCC_STRING_INLINE_HEAD = 0b1110;
constexpr uint64_t CCC_REC_TABLE_REF_HEAD = 0b1111;
// Byte written after the characters of an inline string.
constexpr uint64_t CCC_STRING_INLINE_END  = 0b00000000;
|
||||
#define CCC_ADD_COMPONENT(vec,tail) \
|
||||
do { \
|
||||
auto tmp=tail; \
|
||||
vec.insert(vec.end(),tmp.begin(),tmp.end()); \
|
||||
} while (0)
|
||||
#define CCC_ADD_COMPONENT_ALIGNED(vec,tail) \
|
||||
do { \
|
||||
static_assert(is_same_v<decltype(vec),vector<bool>>,"vec must be vector<bool>"); \
|
||||
static_assert(is_same_v<decltype(tail),vector<bool>>,"tail must be vector<bool>"); \
|
||||
vec.reserve(vec.size()+tail.size()+8); \
|
||||
for (auto b:tail) vec.push_back(b); \
|
||||
size_t rem=vec.size()%8; \
|
||||
if (rem!=0) { \
|
||||
vec.insert(vec.end(),8-rem,false); \
|
||||
} \
|
||||
} while (0)
|
||||
struct XXH3HasherString {
|
||||
size_t operator()(const std::string& s) const {
|
||||
return static_cast<size_t>(XXH3_64bits(s.data(),s.size()));
|
||||
}
|
||||
};
|
||||
/// Bit writer that packs values most-significant-bit first into a growing
/// byte buffer.
class bit_streamer {
private:
    std::vector<uint8_t> out;   // bytes that are already complete
    uint8_t current_byte = 0;   // byte currently being filled
    uint8_t bit_pos = 0;        // number of bits already placed in current_byte (0..7)

public:
    size_t index;  // tag stored for the caller; never read by this class

    /// Creates an empty writer and pre-reserves 1 MiB of output capacity.
    bit_streamer(size_t index) : index(index) {
        out.reserve(1024 * 1024);
    }

    /// Number of complete bytes written so far (the partial byte is excluded).
    size_t get_size() const {
        return out.size();
    }

    /// Appends the low `count` bits of `value`, highest bit first.
    /// Bit positions at or above 64 are written as 0, so a `count` larger
    /// than 64 zero-extends instead of shifting out of range.
    void write_bits(uint64_t value, uint8_t count) {
        for (int i = count - 1; i >= 0; --i) {
            const bool bit = i < 64 && ((value >> i) & 1u) != 0;
            if (bit) {
                current_byte |= static_cast<uint8_t>(1u << (7 - bit_pos));
            }
            ++bit_pos;
            if (bit_pos == 8) {
                out.push_back(current_byte);
                current_byte = 0;
                bit_pos = 0;
            }
        }
    }

    /// Flushes a partially filled byte (padded with zero bits) so that the
    /// next write starts on a byte boundary. Does nothing when already aligned.
    void align() {
        if (bit_pos > 0) {
            out.push_back(current_byte);
            current_byte = 0;
            bit_pos = 0;
        }
    }

    /// Read-only view of the complete bytes written so far.
    const std::vector<uint8_t>& get_out() const {
        return out;
    }

    /// Aligns, then hands the whole buffer to the caller. The writer is left
    /// empty and can be reused.
    std::vector<uint8_t> extract_buffer() {
        align();
        std::vector<uint8_t> taken;
        taken.swap(out);  // guarantees `out` is empty afterwards
        return taken;
    }
};
|
||||
const vector<string> delimiter0={
|
||||
"{",
|
||||
"}",
|
||||
@@ -175,10 +216,6 @@ const vector<string> c_keywords={
|
||||
"__attribute__",
|
||||
"defined",
|
||||
};
|
||||
struct symbol {
|
||||
string name;
|
||||
int score;
|
||||
};
|
||||
#pragma pack(push,1)
|
||||
struct header {
|
||||
uint8_t sig[3];
|
||||
@@ -193,115 +230,182 @@ struct node {
|
||||
uint32_t start;
|
||||
uint32_t end;
|
||||
};
|
||||
unordered_map<string,uint16_t,XXH3HasherString> optimized_type_u16_list;
|
||||
unordered_map<uint16_t,string> type_map;
|
||||
vector<vector<node>> all_tokens;
|
||||
map<string,int> rec_map;
|
||||
/// One input file travelling through the work queues.
/// The numeric members are value-initialised so that a default-constructed
/// entry (`file_entry f;`) never holds indeterminate values.
struct file_entry {
    std::string name;   // path the worker threads open
    size_t size = 0;    // file size; used to order the encoding queue
    size_t index = 0;   // position in the input list; names the temporary output file
};
|
||||
struct thread_iterate_input_loop_call {
|
||||
string &source_code;
|
||||
vector<node> &thread_local_node_list;
|
||||
unordered_map<uint16_t,string>& thread_local_type_map;
|
||||
unordered_map<string,uint16_t,XXH3HasherString>& thread_local_type_u16_map;
|
||||
uint16_t thread_local_next_type_id;
|
||||
unordered_map<string,int,XXH3HasherString> thread_local_rec_map;
|
||||
};
|
||||
struct thread_rec_map_result {
|
||||
unordered_map<string,int,XXH3HasherString> thread_local_rec_map;
|
||||
};
|
||||
struct thread_encoding_input_loop_call {
|
||||
string &source_code;
|
||||
vector<node> &node_list;
|
||||
unordered_map<uint16_t,string>& thread_local_type_map;
|
||||
bit_streamer& thread_local_bit_stream;
|
||||
};
|
||||
/// Selects what iterate_all_nodes_loop_call does with each leaf it visits.
enum iterating_mode {
    REC_MAP,  // count leaf texts into the rec map
    PARSING,  // record leaves into the node list
};
|
||||
queue<file_entry> rec_map_files_queue;
|
||||
mutex rec_map_queue_mutex;
|
||||
queue<file_entry> encoding_files_queue;
|
||||
mutex encoding_queue_mutex;
|
||||
vector<string> rec_list;
|
||||
unordered_map<string,size_t,XXH3HasherString> rec_lookup;
|
||||
unordered_map<string,size_t,XXH3HasherString> c_keyword_lookup;
|
||||
unordered_map<string,size_t,XXH3HasherString> miscelaneous_lookup;
|
||||
unordered_map<string,size_t,XXH3HasherString> delimiter0_lookup;
|
||||
unordered_map<string,size_t,XXH3HasherString> delimiter1_lookup;
|
||||
bool debug=false;
|
||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> rec_lookup;
|
||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> c_keyword_lookup;
|
||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> miscelaneous_lookup;
|
||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> delimiter0_lookup;
|
||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> delimiter1_lookup;
|
||||
bool show_warning=false;
|
||||
bool fail_on_warning=false;
|
||||
static uint16_t next_type_id;
|
||||
void get_all_nodes(TSNode node,const string &source_code,map<string,int> &rec_map,size_t index) {
|
||||
if (ts_node_child_count(node)==0) {
|
||||
string text=source_code.substr(ts_node_start_byte(node),ts_node_end_byte(node)-ts_node_start_byte(node));
|
||||
string type=string(ts_node_type(node));
|
||||
if (optimized_type_u16_list.find(type)==optimized_type_u16_list.end()) {
|
||||
optimized_type_u16_list[type]=next_type_id;
|
||||
type_map[optimized_type_u16_list.at(type)]=type;
|
||||
next_type_id++;
|
||||
}
|
||||
all_tokens[index].push_back({.type=optimized_type_u16_list[type],.start=ts_node_start_byte(node),.end=ts_node_end_byte(node)});
|
||||
bool enable_malloc_trim=true;
|
||||
/// Recursive depth-first walk over the syntax tree rooted at `current_node`.
/// Only leaves (nodes without children) are acted upon.
///
/// REC_MAP: the leaf text is counted in `settings.thread_local_rec_map` for a
///          fixed set of node types and for primitive types that are not C
///          keywords. The count of a comment is set to 2 instead of incremented.
/// PARSING: a {type id, start byte, end byte} record is appended to
///          `settings.thread_local_node_list`. Type ids are handed out on first
///          sight from `settings.thread_local_next_type_id`.
///
/// The recursion depth equals the depth of the syntax tree.
void iterate_all_nodes_loop_call(thread_iterate_input_loop_call &settings,TSNode current_node,iterating_mode mode) {
    const uint32_t child_count=ts_node_child_count(current_node);
    if (child_count!=0) {
        for (uint32_t i=0;i<child_count;++i) {
            iterate_all_nodes_loop_call(settings,ts_node_child(current_node,i),mode);
        }
        return;
    }

    const uint32_t start=ts_node_start_byte(current_node);
    const uint32_t end=ts_node_end_byte(current_node);
    const std::string_view text{settings.source_code.data()+start,end-start};
    // View over the type name returned by tree-sitter: comparing it needs no allocation.
    const std::string_view type{ts_node_type(current_node)};

    if (mode==iterating_mode::REC_MAP) {
        static constexpr std::string_view counted_types[]={
            "string_content","system_lib_string","identifier","number_literal",
            "type_identifier","field_identifier","escape_sequence","statement_identifier",
        };
        if (std::find(std::begin(counted_types),std::end(counted_types),type)!=std::end(counted_types)) {
            ++settings.thread_local_rec_map[std::string(text)];
        }
        if (type=="primitive_type" && std::find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) {
            ++settings.thread_local_rec_map[std::string(text)];
        }
        if (type=="comment") {
            settings.thread_local_rec_map[std::string(text)]=2;
        }
    } else if (mode==iterating_mode::PARSING) {
        // One lookup both finds an existing id and registers a new one.
        const auto [slot,inserted]=settings.thread_local_type_u16_map.try_emplace(std::string(type),settings.thread_local_next_type_id);
        if (inserted) {
            settings.thread_local_type_map[slot->second]=slot->first;
            ++settings.thread_local_next_type_id;
        }
        settings.thread_local_node_list.push_back({.type=slot->second,.start=start,.end=end});
    }
}
|
||||
vector<bool> byte_to_bits(unsigned char c) {
|
||||
vector<bool> out;
|
||||
for (int i=7;i>=0;i--) {
|
||||
bool enabled=(c>>i)&0x01;
|
||||
out.push_back(enabled);
|
||||
/// Worker for the first pass. It pops files from `rec_map_files_queue` until
/// the queue is empty, parses each one as C, counts leaf texts (REC_MAP mode)
/// and then pushes the file onto `encoding_files_queue` for the second pass.
///
/// @param thread_num  number printed in the completion message
/// @return the occurrence counts gathered by this thread
///
/// Terminates the process with exit(-1) when the parser cannot be set up, or
/// when a file cannot be opened or parsed.
thread_rec_map_result run_thread_rec_map(size_t thread_num) {
    const auto started_at=std::chrono::high_resolution_clock::now();
    thread_rec_map_result res;

    // REC_MAP mode only writes to the rec map; these exist to satisfy the
    // reference members of the settings struct.
    std::unordered_map<std::string,uint16_t,XXH3HasherString> useless_type_u16_map;
    std::vector<node> useless_node_vector;
    std::unordered_map<uint16_t,std::string> useless_type_map;

    TSParser *parser=ts_parser_new();
    if (parser==nullptr || !ts_parser_set_language(parser,tree_sitter_c())) {
        std::cout<<"Error: couldn't set up the C parser"<<std::endl;
        exit(-1);
    }

    // One buffer and one settings object serve every file, so the counts
    // accumulate in a single map for the whole lifetime of the thread.
    std::string code;
    thread_iterate_input_loop_call loop_settings {
        .source_code=code,
        .thread_local_node_list=useless_node_vector,
        .thread_local_type_map=useless_type_map,
        .thread_local_type_u16_map=useless_type_u16_map,
        .thread_local_next_type_id=0,
        .thread_local_rec_map=res.thread_local_rec_map
    };

    int counter=0;
    while (true) {
        file_entry f;
        {
            std::lock_guard<std::mutex> lock(rec_map_queue_mutex);
            if (rec_map_files_queue.empty()) break;
            f=std::move(rec_map_files_queue.front());
            rec_map_files_queue.pop();
        }

        std::ifstream file(f.name);
        if (!file) {
            std::cout<<"Error: couldn't open "<<f.name<<std::endl;
            exit(-1);
        }
        file.seekg(0,std::ios::end);
        const std::streamoff file_len=file.tellg();
        file.seekg(0,std::ios::beg);
        // tellg() reports -1 on failure; never pass that to reserve().
        if (file_len>0) code.reserve(static_cast<size_t>(file_len));
        code.assign((std::istreambuf_iterator<char>(file)),std::istreambuf_iterator<char>());
        file.close();

        TSTree *tree=ts_parser_parse_string(parser,nullptr,code.c_str(),code.size());
        if (tree==nullptr) {
            std::cout<<"Error: couldn't parse "<<f.name<<std::endl;
            exit(-1);
        }
        iterate_all_nodes_loop_call(loop_settings,ts_tree_root_node(tree),iterating_mode::REC_MAP);
        ts_tree_delete(tree);
        std::string().swap(code);  // give the file's memory back before taking the next one

        {
            std::lock_guard<std::mutex> lock(encoding_queue_mutex);
            encoding_files_queue.push(std::move(f));
        }
        if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
    }
    ts_parser_delete(parser);

    // If the settings struct holds its own map rather than aliasing res's map,
    // the counts gathered above live only in that copy and must be handed over.
    if (&loop_settings.thread_local_rec_map!=&res.thread_local_rec_map) {
        res.thread_local_rec_map=std::move(loop_settings.thread_local_rec_map);
    }

    const auto finished_at=std::chrono::high_resolution_clock::now();
    const auto ms=std::chrono::duration_cast<std::chrono::milliseconds>(finished_at-started_at).count();
    std::cout<<"Recccurences map thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<std::endl;
    return res;
}
|
||||
vector<bool> generate_rec(size_t index,size_t total_recs) {
|
||||
vector<bool> out;
|
||||
/// Emits one C-keyword token. The stream is aligned to a byte boundary, then
/// the 4-bit CCC_C_KEYWORD_HEAD prefix and the low 6 bits of `index` are
/// written, and the stream is padded to the next byte boundary.
void generate_c_keyword(bit_streamer& bitstream,size_t index) {
    constexpr uint8_t head_width=4;
    constexpr uint8_t index_width=6;
    bitstream.align();
    bitstream.write_bits(CCC_C_KEYWORD_HEAD,head_width);
    bitstream.write_bits(index,index_width);
    bitstream.align();
}
|
||||
/// Emits a reference into the rec table. The stream is aligned, then the
/// 4-bit CCC_REC_TABLE_REF_HEAD prefix and `index` are written, and the stream
/// is padded to the next byte boundary. `index` is written with as many bits
/// as it takes to represent `total_recs` (zero bits when `total_recs` is 0).
void generate_rec(bit_streamer& bitstream,size_t index,size_t total_recs) {
    size_t index_width=0;
    for (size_t remaining=total_recs;remaining!=0;remaining>>=1) {
        ++index_width;
    }
    bitstream.align();
    bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
    bitstream.write_bits(index,static_cast<uint8_t>(index_width));
    bitstream.align();
}
|
||||
vector<bool> generate_delimiter0(size_t index) {
|
||||
vector<bool> out;
|
||||
CCC_ADD_COMPONENT(out,CCC_DELIMITER_0_HEAD);
|
||||
for (int i=2;i>=0;i--) {
|
||||
bool enabled=(index>>i)&0x01;
|
||||
out.push_back(enabled);
|
||||
}
|
||||
return out;
|
||||
/// Emits one token of delimiter class 0. The stream is aligned, then the
/// 1-bit CCC_DELIMITER_0_HEAD prefix and the low 3 bits of `index` are
/// written, and the stream is padded to the next byte boundary.
void generate_delimiter0(bit_streamer& bitstream,size_t index) {
    constexpr uint8_t head_width=1;
    constexpr uint8_t index_width=3;
    bitstream.align();
    bitstream.write_bits(CCC_DELIMITER_0_HEAD,head_width);
    bitstream.write_bits(index,index_width);
    bitstream.align();
}
|
||||
vector<bool> generate_delimiter1(size_t index) {
|
||||
vector<bool> out;
|
||||
CCC_ADD_COMPONENT(out,CCC_DELIMITER_1_HEAD);
|
||||
for (int i=1;i>=0;i--) {
|
||||
bool enabled=(index>>i)&0x01;
|
||||
out.push_back(enabled);
|
||||
}
|
||||
return out;
|
||||
/// Emits one token of delimiter class 1. The stream is aligned, then the
/// 2-bit CCC_DELIMITER_1_HEAD prefix and the low 2 bits of `index` are
/// written, and the stream is padded to the next byte boundary.
void generate_delimiter1(bit_streamer& bitstream,size_t index) {
    constexpr uint8_t head_width=2;
    constexpr uint8_t index_width=2;
    bitstream.align();
    bitstream.write_bits(CCC_DELIMITER_1_HEAD,head_width);
    bitstream.write_bits(index,index_width);
    bitstream.align();
}
|
||||
vector<bool> generate_miscellaneous(size_t index) {
|
||||
vector<bool> out;
|
||||
CCC_ADD_COMPONENT(out,CCC_MISCELANEOUS_HEAD);
|
||||
for (int i=5;i>=0;i--) {
|
||||
bool enabled=(index>>i)&0x01;
|
||||
out.push_back(enabled);
|
||||
}
|
||||
return out;
|
||||
/// Emits one miscellaneous token. The stream is aligned, then the 4-bit
/// CCC_MISCELANEOUS_HEAD prefix and the low 6 bits of `index` are written,
/// and the stream is padded to the next byte boundary.
void generate_miscellaneous(bit_streamer& bitstream,size_t index) {
    constexpr uint8_t head_width=4;
    constexpr uint8_t index_width=6;
    bitstream.align();
    bitstream.write_bits(CCC_MISCELANEOUS_HEAD,head_width);
    bitstream.write_bits(index,index_width);
    bitstream.align();
}
|
||||
vector<bool> generate_string_content(string str) {
|
||||
vector<bool> out;
|
||||
CCC_ADD_COMPONENT(out,CCC_STRING_INLINE_HEAD);
|
||||
for (auto c:str) {
|
||||
CCC_ADD_COMPONENT(out,byte_to_bits(c));
|
||||
/// Emits a string stored inline. The stream is aligned, then the 4-bit
/// CCC_STRING_INLINE_HEAD prefix, each of the `text_len` bytes of `text`
/// (8 bits each) and the 8-bit CCC_STRING_INLINE_END marker are written, and
/// the stream is padded to the next byte boundary.
///
/// NOTE(review): the end marker is a zero byte, so a `text` containing a NUL
/// byte presumably cannot be told apart from the terminator when decoding —
/// confirm against the decoder.
void generate_string_content(bit_streamer& bitstream,const char *text,size_t text_len) {
    bitstream.align();
    bitstream.write_bits(CCC_STRING_INLINE_HEAD,4);
    // size_t index: matches the type of text_len and cannot overflow before reaching it.
    for (size_t i=0;i<text_len;++i) {
        // Go through unsigned char so a byte >= 0x80 is not sign-extended first.
        bitstream.write_bits(static_cast<unsigned char>(text[i]),8);
    }
    bitstream.write_bits(CCC_STRING_INLINE_END,8);
    bitstream.align();
}
|
||||
void print_debug(string text) {
|
||||
if (debug==true) {
|
||||
void print_warning(string text) {
|
||||
if (show_warning==true) {
|
||||
cout<<text<<endl;
|
||||
}
|
||||
}
|
||||
@@ -310,62 +414,58 @@ void fail_if_warning() {
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
vector<unsigned char> process_file_nodes(vector<node> *nodes,string code) {
|
||||
vector<bool> out;
|
||||
for (int i=0;i<nodes->size();i++) {
|
||||
node n=nodes->at(i);
|
||||
string type=type_map[n.type];
|
||||
string text=code.substr(n.start,n.end-n.start);
|
||||
void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
||||
bit_streamer& out=settings.thread_local_bit_stream;
|
||||
for (int i=0;i<settings.node_list.size();i++) {
|
||||
node n=settings.node_list.at(i);
|
||||
string type=settings.thread_local_type_map[n.type];
|
||||
char temp[256];
|
||||
string_view text{settings.source_code.data()+n.start,n.end-n.start};
|
||||
if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") {
|
||||
auto it=rec_lookup.find(text);
|
||||
auto it=rec_lookup.find(string(text));
|
||||
if (it==rec_lookup.end()) {
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text));
|
||||
print_debug("string ("+type+"): "+text);
|
||||
generate_string_content(out,text.data(),text.size());
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size()));
|
||||
print_debug("rec_table for string ("+type+"): "+text);
|
||||
generate_rec(out,index,rec_list.size());
|
||||
}
|
||||
} else if (type=="primitive_type" || type=="type_identifier") {
|
||||
auto it=c_keyword_lookup.find(text);
|
||||
auto it=c_keyword_lookup.find(string(text));
|
||||
if (it!=c_keyword_lookup.end()) {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index));
|
||||
print_debug("type found in c keyword: "+text);
|
||||
generate_c_keyword(out,index);
|
||||
} else {
|
||||
auto it=rec_lookup.find(text);
|
||||
auto it=rec_lookup.find(string(text));
|
||||
if (it==rec_lookup.end()) {
|
||||
if (!text.empty()) {
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text));
|
||||
print_debug("string for type ("+type+"): "+text);
|
||||
generate_string_content(out,text.data(),text.size());
|
||||
} else {
|
||||
cout<<"Warning: type node is empty: "<<text<<endl;
|
||||
print_warning("Warning: type node is empty: "+string(text));
|
||||
fail_if_warning();
|
||||
}
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size()));
|
||||
print_debug("rec_table for string for type ("+type+"): "+text);
|
||||
generate_rec(out,index,rec_list.size());
|
||||
}
|
||||
}
|
||||
} else if (delimiter0_lookup.find(type)!=delimiter0_lookup.end() || delimiter1_lookup.find(type)!=delimiter1_lookup.end() || type=="\"") {
|
||||
string insert;
|
||||
if (type=="(" && i+1<nodes->size()) {
|
||||
if (type_map[nodes->at(i+1).type]==")") {
|
||||
if (type=="(" && i+1<settings.node_list.size()) {
|
||||
if (settings.thread_local_type_map[settings.node_list.at(i+1).type]==")") {
|
||||
insert="()";
|
||||
i++;
|
||||
} else {
|
||||
insert="(";
|
||||
}
|
||||
} else if (type=="[" && i+1<nodes->size()) {
|
||||
if (type_map[nodes->at(i+1).type]=="]") {
|
||||
} else if (type=="[" && i+1<settings.node_list.size()) {
|
||||
if (settings.thread_local_type_map[settings.node_list.at(i+1).type]=="]") {
|
||||
insert="[]";
|
||||
i++;
|
||||
} else {
|
||||
insert="[";
|
||||
}
|
||||
} else if (type=="{" && i+1<nodes->size()) {
|
||||
if (type_map[nodes->at(i+1).type]=="}") {
|
||||
} else if (type=="{" && i+1<settings.node_list.size()) {
|
||||
if (settings.thread_local_type_map[settings.node_list.at(i+1).type]=="}") {
|
||||
insert="{}";
|
||||
i++;
|
||||
} else {
|
||||
@@ -377,17 +477,15 @@ vector<unsigned char> process_file_nodes(vector<node> *nodes,string code) {
|
||||
auto it=delimiter0_lookup.find(insert);
|
||||
if (it!=delimiter0_lookup.end()) {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_delimiter0(index));
|
||||
print_debug("delimiter 0: "+insert);
|
||||
generate_delimiter0(out,index);
|
||||
} else {
|
||||
if (insert!="{}" && insert!="\"") {
|
||||
auto it=delimiter1_lookup.find(insert);
|
||||
if (it!=delimiter1_lookup.end()) {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_delimiter1(index));
|
||||
print_debug("delimiter 1: "+insert);
|
||||
generate_delimiter1(out,index);
|
||||
} else {
|
||||
cout<<"Warning: unknow delimiter, that shouldn't happen: "<<insert<<endl;
|
||||
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
|
||||
fail_if_warning();
|
||||
}
|
||||
} else {
|
||||
@@ -395,28 +493,30 @@ vector<unsigned char> process_file_nodes(vector<node> *nodes,string code) {
|
||||
auto it=delimiter1_lookup.find("{}");
|
||||
if (it!=delimiter1_lookup.end()) {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT(out,generate_delimiter1(index));
|
||||
vector<bool> temp={0};
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,temp);
|
||||
print_debug("delimiter 1: "+insert);
|
||||
out.align();
|
||||
out.write_bits(CCC_DELIMITER_1_HEAD,2);
|
||||
out.write_bits(index,2);
|
||||
out.write_bits(0b0,1);
|
||||
out.align();
|
||||
} else {
|
||||
cout<<"Warning: unknow delimiter, that shouldn't happen: "<<insert<<endl;
|
||||
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
|
||||
fail_if_warning();
|
||||
}
|
||||
} else if (insert=="\"") {
|
||||
auto it=delimiter1_lookup.find("{}");
|
||||
if (it!=delimiter1_lookup.end()) {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT(out,generate_delimiter1(index));
|
||||
vector<bool> temp={1};
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,temp);
|
||||
print_debug("delimiter 1: "+insert);
|
||||
out.align();
|
||||
out.write_bits(CCC_DELIMITER_1_HEAD,2);
|
||||
out.write_bits(index,2);
|
||||
out.write_bits(0b1,1);
|
||||
out.align();
|
||||
} else {
|
||||
cout<<"Warning: unknow delimiter, that shouldn't happen: "<<insert<<endl;
|
||||
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
|
||||
fail_if_warning();
|
||||
}
|
||||
} else {
|
||||
cout<<"Warning: unknow delimiter, that shouldn't happen: "<<insert<<endl;
|
||||
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
|
||||
fail_if_warning();
|
||||
}
|
||||
}
|
||||
@@ -426,32 +526,28 @@ vector<unsigned char> process_file_nodes(vector<node> *nodes,string code) {
|
||||
auto it=c_keyword_lookup.find(type);
|
||||
if (it!=c_keyword_lookup.end()) {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index));
|
||||
print_debug("c keyword: "+type);
|
||||
generate_c_keyword(out,index);
|
||||
} else {
|
||||
cout<<"Warning: unknow C keyword, that shouldn't happen: "<<type<<" "<<text<<endl;
|
||||
print_warning("Warning: unknow C keyword, that shouldn't happen: "+type+" "+string(text));
|
||||
fail_if_warning();
|
||||
}
|
||||
} else {
|
||||
auto it=c_keyword_lookup.find(text);
|
||||
auto it=c_keyword_lookup.find(string(text));
|
||||
if (it!=c_keyword_lookup.end()) {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_c_keyword(index));
|
||||
print_debug("c keyword: "+type);
|
||||
generate_c_keyword(out,index);
|
||||
} else {
|
||||
auto it=rec_lookup.find(text);
|
||||
auto it=rec_lookup.find(string(text));
|
||||
if (it==rec_lookup.end()) {
|
||||
if (!text.empty()) {
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text));
|
||||
print_debug("string for c keyword ("+type+"): "+text);
|
||||
generate_string_content(out,text.data(),text.size());
|
||||
} else {
|
||||
cout<<"Warning: C keyword is empty: "<<text<<endl;
|
||||
print_warning("Warning: C keyword is empty: "+string(text));
|
||||
fail_if_warning();
|
||||
}
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size()));
|
||||
print_debug("rec_table for string for c keyword ("+type+"): "+text);
|
||||
generate_rec(out,index,rec_list.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -459,80 +555,114 @@ vector<unsigned char> process_file_nodes(vector<node> *nodes,string code) {
|
||||
auto it=miscelaneous_lookup.find(type);
|
||||
if (it!=miscelaneous_lookup.end()) {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_miscellaneous(index));
|
||||
print_debug("miscellaneous: "+type);
|
||||
generate_miscellaneous(out,index);
|
||||
} else {
|
||||
cout<<"Warning: unknow miscellaneous, that shouldn't happen: "<<type<<endl;
|
||||
print_warning("Warning: unknow miscellaneous, that shouldn't happen: "+type);
|
||||
fail_if_warning();
|
||||
}
|
||||
} else if (type=="comment") {
|
||||
auto it=rec_lookup.find(text);
|
||||
auto it=rec_lookup.find(string(text));
|
||||
if (it==rec_lookup.end()) {
|
||||
if (!text.empty()) {
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text));
|
||||
print_debug("string for comment("+type+"): "+text);
|
||||
generate_string_content(out,text.data(),text.size());
|
||||
} else {
|
||||
cout<<"Warning: comment is empty: "<<text<<endl;
|
||||
print_warning("Warning: comment is empty: "+string(text));
|
||||
fail_if_warning();
|
||||
}
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size()));
|
||||
print_debug("rec_table for comment");
|
||||
generate_rec(out,index,rec_list.size());
|
||||
}
|
||||
} else {
|
||||
auto it=rec_lookup.find(type);
|
||||
if (it==rec_lookup.end()) {
|
||||
if (!text.empty()) {
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_string_content(text));
|
||||
print_debug("string for unknow node ("+type+"): "+text);
|
||||
generate_string_content(out,text.data(),text.size());
|
||||
} else {
|
||||
cout<<"Warning: unknow node is empty: "<<text<<endl;
|
||||
print_warning("Warning: unknow node is empty: "+string(text));
|
||||
fail_if_warning();
|
||||
}
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
CCC_ADD_COMPONENT_ALIGNED(out,generate_rec(index,rec_list.size()));
|
||||
print_debug("rec_table for string for unknow node ("+type+"): "+text);
|
||||
generate_rec(out,index,rec_list.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
vector<unsigned char> payload_bytes;
|
||||
unsigned char current=0;
|
||||
size_t bit_index=0;
|
||||
for (bool b:out) {
|
||||
current|=(b<<(7-bit_index));
|
||||
bit_index++;
|
||||
if (bit_index==8) {
|
||||
payload_bytes.push_back(current);
|
||||
current=0;
|
||||
bit_index=0;
|
||||
}
|
||||
}
|
||||
if (bit_index!=0) {
|
||||
payload_bytes.push_back(current);
|
||||
}
|
||||
return payload_bytes;
|
||||
out.align();
|
||||
return;
|
||||
}
|
||||
void construct_rec_table(vector<string> &files_content,vector<string> files_names) {
|
||||
for (int i=0;i<files_content.size();i++) {
|
||||
void run_thread_encoding(size_t thread_num) {
|
||||
auto start=chrono::high_resolution_clock::now();
|
||||
unordered_map<string,int,XXH3HasherString> useless_rec_map;
|
||||
unordered_map<uint16_t,string> thread_local_type_map;
|
||||
unordered_map<string,uint16_t,XXH3HasherString> thread_local_type_u16_map;
|
||||
vector<node> thread_local_node_list;
|
||||
TSParser *parser=ts_parser_new();
|
||||
ts_parser_set_language(parser,tree_sitter_c());
|
||||
TSTree *tree=ts_parser_parse_string(parser,nullptr,files_content[i].c_str(),files_content[i].size());
|
||||
int counter=0;
|
||||
while (true) {
|
||||
bit_streamer bitstream(0);
|
||||
file_entry f;
|
||||
{
|
||||
lock_guard<mutex> lock(encoding_queue_mutex);
|
||||
if (encoding_files_queue.empty()) break;
|
||||
f=std::move(encoding_files_queue.front());
|
||||
encoding_files_queue.pop();
|
||||
}
|
||||
bitstream.index=f.index;
|
||||
ofstream temp_thread_out(".temp_ccc/temp_ccc_"+to_string(f.index)+".bin",ios::binary);
|
||||
if (!temp_thread_out) {
|
||||
cout<<"Error: couldn't open .temp_ccc/temp_ccc_"<<to_string(f.index)<<".bin"<<endl;
|
||||
exit(-1);
|
||||
}
|
||||
ifstream file(f.name);
|
||||
if (!file) {
|
||||
cout<<"Error: couldn't open "<<f.name<<endl;
|
||||
exit(-1);
|
||||
}
|
||||
string code;
|
||||
file.seekg(0,ios::end);
|
||||
code.reserve(file.tellg());
|
||||
file.seekg(0,ios::beg);
|
||||
code.assign((istreambuf_iterator<char>(file)), istreambuf_iterator<char>());
|
||||
file.close();
|
||||
thread_iterate_input_loop_call iterate_loop_settings {
|
||||
.source_code=code,
|
||||
.thread_local_node_list=thread_local_node_list,
|
||||
.thread_local_type_map=thread_local_type_map,
|
||||
.thread_local_type_u16_map=thread_local_type_u16_map,
|
||||
.thread_local_next_type_id=0,
|
||||
.thread_local_rec_map=useless_rec_map
|
||||
};
|
||||
thread_encoding_input_loop_call encoding_loop_settings {
|
||||
.source_code=code,
|
||||
.node_list=thread_local_node_list,
|
||||
.thread_local_type_map=thread_local_type_map,
|
||||
.thread_local_bit_stream=bitstream
|
||||
};
|
||||
TSTree *tree=ts_parser_parse_string(parser,nullptr,code.c_str(),code.size());
|
||||
TSNode root=ts_tree_root_node(tree);
|
||||
get_all_nodes(root,files_content[i],rec_map,i);
|
||||
iterate_loop_settings.source_code=code;
|
||||
iterate_all_nodes_loop_call(iterate_loop_settings,root,iterating_mode::PARSING);
|
||||
ts_tree_delete(tree);
|
||||
encoding_loop_settings.source_code=code;
|
||||
process_file_nodes_loop_call(encoding_loop_settings);
|
||||
auto payload=std::move(bitstream.extract_buffer());
|
||||
temp_thread_out.write(reinterpret_cast<const char*>(payload.data()),payload.size());
|
||||
temp_thread_out.close();
|
||||
payload.clear();
|
||||
payload.shrink_to_fit();
|
||||
vector<node>().swap(thread_local_node_list);
|
||||
thread_local_type_map.clear();
|
||||
thread_local_type_u16_map.clear();
|
||||
string().swap(code);
|
||||
if (++counter%10==0 && enable_malloc_trim) malloc_trim(0);
|
||||
}
|
||||
ts_parser_delete(parser);
|
||||
cout<<i+1<<" file(s) parsed on "<<files_names.size()<<": "<<files_names[i]<<endl;
|
||||
}
|
||||
for (auto s:rec_map) {
|
||||
if (s.second>=2 and s.first.size()>=3) {
|
||||
rec_list.push_back(s.first);
|
||||
}
|
||||
}
|
||||
for (int i=0;i<rec_list.size();i++) {
|
||||
rec_lookup[rec_list[i]]=i;
|
||||
}
|
||||
auto end=chrono::high_resolution_clock::now();
|
||||
auto ms=chrono::duration_cast<chrono::milliseconds>(end-start).count();
|
||||
cout<<"Parsing/encoding thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<endl;
|
||||
return;
|
||||
}
|
||||
int main(int argc,char **argv) {
|
||||
for (int i=0;i<c_keywords.size();i++) {
|
||||
@@ -551,68 +681,163 @@ int main(int argc,char **argv) {
|
||||
cout<<"Usage: ccc [FILES]"<<endl;
|
||||
return -1;
|
||||
}
|
||||
size_t compression_ratio=6;
|
||||
vector<string> files;
|
||||
for (int i=1;i<argc;i++) {
|
||||
string file=string(argv[i]);
|
||||
if (file=="-v") {
|
||||
debug=true;
|
||||
if (file=="-W") {
|
||||
fail_on_warning=true;
|
||||
continue;
|
||||
}
|
||||
if (file=="-w") {
|
||||
fail_on_warning=true;
|
||||
show_warning=true;
|
||||
continue;
|
||||
}
|
||||
if (file.substr(0,2)=="-c" && file.size()==3) {
|
||||
try {
|
||||
compression_ratio=stoi(file.substr(2,1));
|
||||
continue;
|
||||
} catch (const exception& e) {
|
||||
cout<<"Error: invalid argument: "<<file<<endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (file=="-f") {
|
||||
enable_malloc_trim=false;
|
||||
continue;
|
||||
}
|
||||
if (file=="-h" || file=="--help") {
|
||||
cout<<"C Code Compressor v0.1"<<endl;
|
||||
cout<<"Usage: ccc [-hfwW] [FILES]"<<endl;
|
||||
cout<<"Options:"<<endl;
|
||||
cout<<" -h : show this help message"<<endl;
|
||||
cout<<" -f : enable fast mode, reduce the total compression time but does not release unused"<<endl;
|
||||
cout<<" unused heap memory back to the OS. Usage of this option can raise memory usage."<<endl;
|
||||
cout<<" -w : show warning messages. For example, when a unknown or empty node is detected."<<endl;
|
||||
cout<<" -W : crash on warning"<<endl;
|
||||
cout<<" -c0..9: set the compression ratio for LZMA multithreaded compression phase. Default is 6."<<endl;
|
||||
cout<<" Below level 6, CCC may not compress better than tar.xz."<<endl;
|
||||
cout<<" Warning: setting this higher than -c6 will seriously raise memory usage."<<endl;
|
||||
cout<<" For exemple, using -c9 more than double memory usage in comparison with"<<endl;
|
||||
cout<<" -c6 (which is the default). "<<endl;
|
||||
cout<<" Warning: usage of higher options than -c6 combined with -f is heavily not"<<endl;
|
||||
cout<<" recommended."<<endl;
|
||||
return 0;
|
||||
}
|
||||
if (!fs::exists(file)) {
|
||||
cout<<"Error: file doesn't exist: "<<file<<endl;
|
||||
return -1;
|
||||
}
|
||||
files.push_back(file);
|
||||
}
|
||||
vector<string> files_content;
|
||||
for (int i=0;i<files.size();i++) {
|
||||
ifstream file(files[i],ios::binary);
|
||||
if (!file) {
|
||||
cout<<"Error: couldn't open provided file."<<endl;
|
||||
return -1;
|
||||
file_entry f{files[i],fs::file_size(files[i])};
|
||||
f.index=i;
|
||||
rec_map_files_queue.push(std::move(f));
|
||||
}
|
||||
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
|
||||
files_content.push_back(code);
|
||||
cout<<i+1<<" file(s) readed on "<<files.size()<<": "<<files[i]<<endl;
|
||||
size_t nb_threads=thread::hardware_concurrency();
|
||||
size_t total_files=files.size();
|
||||
size_t files_per_thread=(total_files+nb_threads-1)/nb_threads;
|
||||
fs::create_directory(".temp_ccc");
|
||||
vector<future<thread_rec_map_result>> rec_map_futures;
|
||||
for (size_t i=0;i<nb_threads;++i) {
|
||||
rec_map_futures.push_back(async(launch::async,run_thread_rec_map,i+1));
|
||||
}
|
||||
all_tokens.resize(files_content.size());
|
||||
construct_rec_table(files_content,files);
|
||||
vector<unsigned char> files_archive;
|
||||
vector<size_t> payloads_size;
|
||||
vector<size_t> payloads_start;
|
||||
for (int i=0;i<files_content.size();i++) {
|
||||
auto payload_bytes=process_file_nodes(&(all_tokens[i]),files_content[i]);
|
||||
payloads_size.push_back(payload_bytes.size());
|
||||
payloads_start.push_back(files_archive.size());
|
||||
CCC_ADD_COMPONENT(files_archive,payload_bytes);
|
||||
cout<<i+1<<" file(s) encoded on "<<files.size()<<": "<<files[i]<<endl;
|
||||
vector<thread_rec_map_result> all_rec_map_results;
|
||||
map<string,int> global_rec_map;
|
||||
for (auto& fut:rec_map_futures) {
|
||||
all_rec_map_results.push_back(fut.get());
|
||||
for (auto const& [str,count]:all_rec_map_results.back().thread_local_rec_map) {
|
||||
global_rec_map[str]+=count;
|
||||
}
|
||||
}
|
||||
for (auto const& [str,count]:global_rec_map) {
|
||||
if (count>=2 && str.size()>=3) {
|
||||
rec_list.push_back(str);
|
||||
rec_lookup[str]=rec_list.size()-1;
|
||||
}
|
||||
}
|
||||
global_rec_map.clear();
|
||||
vector<file_entry> encoding_files_vec;
|
||||
while (!encoding_files_queue.empty()) {
|
||||
encoding_files_vec.push_back(std::move(encoding_files_queue.front()));
|
||||
encoding_files_queue.pop();
|
||||
}
|
||||
sort(encoding_files_vec.begin(),encoding_files_vec.end(),[](const file_entry& a,const file_entry& b) {
|
||||
return a.size>b.size;
|
||||
});
|
||||
for (auto& f:encoding_files_vec) {
|
||||
encoding_files_queue.push(std::move(f));
|
||||
}
|
||||
vector<future<void>> encoding_futures;
|
||||
for (size_t i=0;i<all_rec_map_results.size();++i) {
|
||||
encoding_futures.push_back(async(launch::async,run_thread_encoding,i+1));
|
||||
}
|
||||
all_rec_map_results.clear();
|
||||
for (auto& fut:encoding_futures) {
|
||||
fut.get();
|
||||
}
|
||||
vector<size_t> global_payloads_start;
|
||||
size_t current_offset=0;
|
||||
for (int i=0;i<files.size();i++) {
|
||||
global_payloads_start.push_back(current_offset);
|
||||
size_t size=fs::file_size(".temp_ccc/temp_ccc_"+to_string(i)+".bin");
|
||||
current_offset+=size;
|
||||
}
|
||||
vector<unsigned char> final_payloads;
|
||||
final_payloads.resize(current_offset);
|
||||
size_t offset=0;
|
||||
for (int i=0;i<files.size();i++) {
|
||||
string temp_name=".temp_ccc/temp_ccc_"+to_string(i)+".bin";
|
||||
ifstream ccc_file(temp_name,ios::binary);
|
||||
size_t current_file_size=fs::file_size(temp_name);
|
||||
ccc_file.read(reinterpret_cast<char*>(final_payloads.data()+offset),current_file_size);
|
||||
offset+=current_file_size;
|
||||
ccc_file.close();
|
||||
}
|
||||
fs::remove_all(".temp_ccc");
|
||||
vector<unsigned char> payload_compressed;
|
||||
payload_compressed.resize(files_archive.size()+files_archive.size()/3+128);
|
||||
lzma_stream strm=LZMA_STREAM_INIT;
|
||||
if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
|
||||
cout<<"Error: couldn't initialize LZMA compressor for file archive."<<endl;
|
||||
payload_compressed.resize(final_payloads.size()+final_payloads.size()/3+128);
|
||||
lzma_mt mt_options={};
|
||||
mt_options.flags=0;
|
||||
mt_options.threads=thread::hardware_concurrency();
|
||||
mt_options.block_size=max((size_t)8*1024*1024,final_payloads.size()/mt_options.threads);
|
||||
mt_options.timeout=0;
|
||||
mt_options.filters=nullptr;
|
||||
mt_options.check=LZMA_CHECK_CRC64;
|
||||
lzma_options_lzma opt_lzma;
|
||||
if (lzma_lzma_preset(&opt_lzma,compression_ratio)) {
|
||||
cout<<"Error: couldn't initialize LZMA compressor for files archive."<<endl;
|
||||
return -1;
|
||||
}
|
||||
strm.next_in=files_archive.data();
|
||||
strm.avail_in=files_archive.size();
|
||||
lzma_filter filters[2];
|
||||
filters[0].id=LZMA_FILTER_LZMA2;
|
||||
filters[0].options=&opt_lzma;
|
||||
filters[1].id=LZMA_VLI_UNKNOWN;
|
||||
mt_options.filters=filters;
|
||||
lzma_stream strm=LZMA_STREAM_INIT;
|
||||
auto ret=lzma_stream_encoder_mt(&strm,&mt_options);
|
||||
if (ret!=LZMA_OK) {
|
||||
cout<<"Error: couldn't initialize MT compressor for files archives."<<endl;
|
||||
return -1;
|
||||
}
|
||||
strm.next_in=final_payloads.data();
|
||||
strm.avail_in=final_payloads.size();
|
||||
strm.next_out=payload_compressed.data();
|
||||
strm.avail_out=payload_compressed.size();
|
||||
auto ret=lzma_code(&strm,LZMA_FINISH);
|
||||
auto start=chrono::high_resolution_clock::now();
|
||||
ret=lzma_code(&strm,LZMA_FINISH);
|
||||
auto end=chrono::high_resolution_clock::now();
|
||||
auto ns=chrono::duration_cast<chrono::nanoseconds>(end-start).count();
|
||||
if (ret!=LZMA_STREAM_END) {
|
||||
cout<<"Error: couldn't compress file archive."<<endl;
|
||||
cout<<"Error: couldn't compress files archive."<<endl;
|
||||
return -1;
|
||||
}
|
||||
cout<<"Compressed payloads."<<endl;
|
||||
size_t payload_total_size;
|
||||
size_t compressed_size=payload_compressed.size()-strm.avail_out;
|
||||
payload_compressed.resize(compressed_size);
|
||||
size_t original_size=files_archive.size();
|
||||
lzma_end(&strm);
|
||||
size_t original_size=final_payloads.size();
|
||||
uint8_t flags=0;
|
||||
if (compressed_size>=original_size) {
|
||||
flags&= ~(0b00000001);
|
||||
@@ -663,11 +888,16 @@ int main(int argc,char **argv) {
|
||||
files_table.push_back(c);
|
||||
}
|
||||
files_table.push_back('\0');
|
||||
auto file_start=payloads_start[i];
|
||||
auto file_start=global_payloads_start[i];
|
||||
for (int i=0;i<sizeof(size_t);++i) {
|
||||
files_table.push_back(((uint8_t*)&file_start)[i]);
|
||||
}
|
||||
auto file_size=payloads_size[i];
|
||||
size_t file_size;
|
||||
if (i==files.size()-1) {
|
||||
file_size=final_payloads.size()-global_payloads_start[i];
|
||||
} else {
|
||||
file_size=global_payloads_start[i+1]-global_payloads_start[i];
|
||||
}
|
||||
for (int i=0;i<sizeof(size_t);++i) {
|
||||
files_table.push_back(((uint8_t*)&file_size)[i]);
|
||||
}
|
||||
@@ -726,7 +956,7 @@ int main(int argc,char **argv) {
|
||||
if (flags & 0b00000001) {
|
||||
CCC_ADD_COMPONENT(out,payload_compressed);
|
||||
} else {
|
||||
CCC_ADD_COMPONENT(out,files_archive);
|
||||
CCC_ADD_COMPONENT(out,final_payloads);
|
||||
}
|
||||
ofstream fileout("test.ccc",ios::binary);
|
||||
if (!fileout) {
|
||||
|
||||
5
hello.c
Normal file
5
hello.c
Normal file
@@ -0,0 +1,5 @@
|
||||
#include <stdio.h>
|
||||
/* Minimal fixture program: writes the text "hello" (no trailing newline)
 * to standard output and exits with status 0. */
int main() {
    fputs("hello", stdout);
    return 0;
}
|
||||
57
test.py
57
test.py
@@ -3,17 +3,37 @@ import subprocess
|
||||
import time
|
||||
|
||||
def get_source_files(root_dir, max_files=30000, extensions=('.c', '.h')):
    """Collect source files under ``root_dir`` and compute their total size.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.
        max_files: Maximum number of paths to collect. Defaults to 30000.
        extensions: Filename suffix, or iterable of suffixes, a file must end
            with to be collected. Defaults to C sources and headers.

    Returns:
        A ``(source_files, total_size)`` tuple: the list of collected paths
        (in ``os.walk`` order) and the sum of their sizes in bytes as
        reported by ``os.path.getsize``.
    """
    # str.endswith only accepts a str or a tuple of str.
    suffixes = (extensions,) if isinstance(extensions, str) else tuple(extensions)
    source_files = []
    total_size = 0
    for root, _, files in os.walk(root_dir):
        for file in files:
            if len(source_files) >= max_files:
                # Cap reached: nothing more can be added, so stop walking
                # instead of scanning the rest of the tree.
                return source_files, total_size
            if file.endswith(suffixes):
                path = os.path.join(root, file)
                source_files.append(path)
                total_size += os.path.getsize(path)
    return source_files, total_size
|
||||
|
||||
|
||||
def monitor_process(proc, interval=0.05):
    """Poll a running process and return the largest ``VmHWM`` value seen.

    While ``proc.poll()`` returns ``None``, the ``VmHWM:`` line of
    ``/proc/<pid>/status`` is read and the largest value observed is kept.

    Args:
        proc: Object exposing ``pid`` and ``poll()`` (e.g. ``subprocess.Popen``).
        interval: Seconds to sleep between two samples. Defaults to 0.05.

    Returns:
        The largest ``VmHWM`` value read, as an int in the unit the status
        file reports (kB), or 0 if no sample could be taken.
    """
    max_rss = 0
    # The pid does not change while we poll, so build the path once.
    status_path = f"/proc/{proc.pid}/status"

    while proc.poll() is None:
        try:
            with open(status_path) as f:
                for line in f:
                    if line.startswith("VmHWM:"):
                        rss = int(line.split()[1])  # kB
                        if rss > max_rss:
                            max_rss = rss
                        break
        except (FileNotFoundError, ProcessLookupError):
            # The status file is gone (or the read reported "no such
            # process"): the process has ended, so stop sampling.
            break

        time.sleep(interval)

    return max_rss
|
||||
|
||||
|
||||
def main():
|
||||
target_dir = "linux"
|
||||
if not os.path.exists(target_dir):
|
||||
@@ -26,44 +46,53 @@ def main():
|
||||
print(f"Fichiers trouvés : {len(files)}")
|
||||
print(f"Taille totale brute : {raw_mo:.2f} Mo")
|
||||
|
||||
# 1. Compression avec TAR
|
||||
# 1. TAR
|
||||
print("\n--- Lancement de TAR -cJf (XZ) ---")
|
||||
start_tar = time.time()
|
||||
tar_cmd = ["tar", "-cJf", "linux_sources.tar.xz", "--files-from=-"]
|
||||
process_tar = subprocess.Popen(tar_cmd, stdin=subprocess.PIPE)
|
||||
process_tar.communicate(input="\n".join(files).encode())
|
||||
|
||||
process_tar.stdin.write("\n".join(files).encode())
|
||||
process_tar.stdin.close()
|
||||
|
||||
peak_tar = monitor_process(process_tar)
|
||||
process_tar.wait()
|
||||
end_tar = time.time()
|
||||
|
||||
# 2. Compression avec CCC
|
||||
# 2. CCC
|
||||
print("\n--- Lancement de CCC (Output temps réel) ---")
|
||||
print("-" * 40)
|
||||
start_ccc = time.time()
|
||||
try:
|
||||
# On laisse stdout et stderr par défaut pour voir l'output de CCC
|
||||
subprocess.run(["./ccc"] + files, check=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"\nErreur fatale CCC : {e}")
|
||||
process_ccc = subprocess.Popen(["./ccc"] + files)
|
||||
peak_ccc = monitor_process(process_ccc)
|
||||
process_ccc.wait()
|
||||
except OSError as e:
|
||||
print(f"\nErreur système (trop de fichiers ?) : {e}")
|
||||
print(f"\nErreur système : {e}")
|
||||
return
|
||||
|
||||
end_ccc = time.time()
|
||||
print("-" * 40)
|
||||
|
||||
# 3. Calculs finaux
|
||||
print("\n" + "="*40)
|
||||
# Résultats
|
||||
print("\n" + "=" * 40)
|
||||
print(f" RÉSULTATS (Source: {raw_mo:.2f} Mo)")
|
||||
print("="*40)
|
||||
print("=" * 40)
|
||||
|
||||
for name, filename in [("TAR.XZ", "linux_sources.tar.xz"), ("CCC", "test.ccc")]:
|
||||
if os.path.exists(filename):
|
||||
size_mo = os.path.getsize(filename) / (1024 * 1024)
|
||||
ratio = (size_mo / raw_mo) * 100
|
||||
print(f"{name:10} : {size_mo:8.2f} Mo ({ratio:5.2f}% du total)")
|
||||
print(f"{name:10} : {size_mo:8.2f} Mo ({ratio:5.2f}%)")
|
||||
else:
|
||||
print(f"{name:10} : Non généré")
|
||||
|
||||
print(f"\nTemps TAR : {end_tar - start_tar:.2f}s")
|
||||
print(f"Temps CCC : {end_ccc - start_ccc:.2f}s")
|
||||
|
||||
print(f"\nPic RAM TAR : {peak_tar / 1024:.2f} Mo")
|
||||
print(f"Pic RAM CCC : {peak_ccc / 1024:.2f} Mo")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user