ccc f
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,2 +1,4 @@
|
|||||||
ccc
|
ccc
|
||||||
linux/
|
linux/
|
||||||
|
*.tar.xz
|
||||||
|
*.ccc
|
||||||
|
|||||||
106
ccc.cpp
106
ccc.cpp
@@ -20,11 +20,14 @@
|
|||||||
#include <future>
|
#include <future>
|
||||||
#include <queue>
|
#include <queue>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
#include <atomic>
|
||||||
#include <tree_sitter/api.h>
|
#include <tree_sitter/api.h>
|
||||||
#include <tree_sitter/tree-sitter-c.h>
|
#include <tree_sitter/tree-sitter-c.h>
|
||||||
#include <lzma.h>
|
#include <lzma.h>
|
||||||
#include <xxh3.h>
|
#include <xxh3.h>
|
||||||
#include <malloc.h>
|
#include <malloc.h>
|
||||||
|
#include <sys/ioctl.h>
|
||||||
|
#include <unistd.h>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
namespace fs=filesystem;
|
namespace fs=filesystem;
|
||||||
const uint64_t CCC_DELIMITER_0_HEAD=0b0;
|
const uint64_t CCC_DELIMITER_0_HEAD=0b0;
|
||||||
@@ -44,6 +47,11 @@ struct XXH3HasherString {
|
|||||||
return static_cast<size_t>(XXH3_64bits(s.data(),s.size()));
|
return static_cast<size_t>(XXH3_64bits(s.data(),s.size()));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
// Returns the width, in columns, of the terminal attached to standard output.
//
// Falls back to 80 columns when the TIOCGWINSZ query fails (for example when
// stdout is redirected to a file or a pipe) or when the reported width is 0.
size_t get_terminal_width() {
	constexpr size_t fallback_columns=80;
	// Zero-initialised so no indeterminate value is ever read.
	struct winsize w{};
	// ioctl() does not fill `w` on failure, so its result has to be checked
	// before ws_col is trusted.
	if (ioctl(STDOUT_FILENO,TIOCGWINSZ,&w)==-1) {
		return fallback_columns;
	}
	return w.ws_col?w.ws_col:fallback_columns;
}
|
||||||
class bit_streamer {
|
class bit_streamer {
|
||||||
private:
|
private:
|
||||||
vector<uint8_t> out;
|
vector<uint8_t> out;
|
||||||
@@ -226,11 +234,13 @@ struct header {
|
|||||||
size_t size_payload;
|
size_t size_payload;
|
||||||
};
|
};
|
||||||
#pragma pack(pop)
|
#pragma pack(pop)
|
||||||
|
#pragma pack(push,1)
|
||||||
struct node {
|
struct node {
|
||||||
uint32_t type;
|
uint32_t type;
|
||||||
uint32_t start;
|
uint32_t start;
|
||||||
uint32_t end;
|
uint32_t end;
|
||||||
};
|
};
|
||||||
|
#pragma pack(pop)
|
||||||
struct file_entry {
|
struct file_entry {
|
||||||
string name;
|
string name;
|
||||||
string content;
|
string content;
|
||||||
@@ -240,10 +250,10 @@ struct file_entry {
|
|||||||
struct thread_iterate_input_loop_call {
|
struct thread_iterate_input_loop_call {
|
||||||
string &source_code;
|
string &source_code;
|
||||||
vector<node> &thread_local_node_list;
|
vector<node> &thread_local_node_list;
|
||||||
map<string,int>& thread_local_rec_map;
|
unordered_map<string,int,XXH3HasherString>& thread_local_rec_map;
|
||||||
};
|
};
|
||||||
struct thread_rec_map_result {
|
struct thread_rec_map_result {
|
||||||
map<string,int> thread_local_rec_map;
|
unordered_map<string,int,XXH3HasherString> thread_local_rec_map;
|
||||||
};
|
};
|
||||||
struct thread_encoding_input_loop_call {
|
struct thread_encoding_input_loop_call {
|
||||||
string &source_code;
|
string &source_code;
|
||||||
@@ -253,6 +263,13 @@ struct thread_encoding_input_loop_call {
|
|||||||
struct thread_encoding_result {
|
struct thread_encoding_result {
|
||||||
vector<bit_streamer> encoded_files;
|
vector<bit_streamer> encoded_files;
|
||||||
};
|
};
|
||||||
|
atomic<size_t> parsed_files=0;
|
||||||
|
atomic<size_t> encoded_files=0;
|
||||||
|
size_t total_files=0;
|
||||||
|
atomic<bool> parsing_done=false;
|
||||||
|
atomic<bool> encoding_started=false;
|
||||||
|
atomic<bool> encoding_done=false;
|
||||||
|
mutex ui_mutex;
|
||||||
queue<file_entry> rec_map_files_queue;
|
queue<file_entry> rec_map_files_queue;
|
||||||
mutex rec_map_queue_mutex;
|
mutex rec_map_queue_mutex;
|
||||||
queue<file_entry> encoding_files_queue;
|
queue<file_entry> encoding_files_queue;
|
||||||
@@ -271,6 +288,35 @@ bool show_warning=false;
|
|||||||
bool fail_on_warning=false;
|
bool fail_on_warning=false;
|
||||||
bool enable_malloc_trim=true;
|
bool enable_malloc_trim=true;
|
||||||
mutex type_alloc;
|
mutex type_alloc;
|
||||||
|
void print_progress_line(const string& label,size_t done,size_t total) {
|
||||||
|
size_t width=get_terminal_width();
|
||||||
|
size_t percent=(total==0)?0:(done*100/total);
|
||||||
|
string prefix=label+" "+to_string(done)+"/"+to_string(total)+" files";
|
||||||
|
size_t bar_width=width-prefix.size()-10;
|
||||||
|
if (bar_width>width) bar_width=10;
|
||||||
|
size_t filled=(total==0)?0:(done*bar_width/total);
|
||||||
|
string bar="[";
|
||||||
|
for (size_t i=0;i<bar_width;i++) {
|
||||||
|
bar+=(i<filled?'=':' ');
|
||||||
|
}
|
||||||
|
bar+="]";
|
||||||
|
cout<<"\r"<<prefix<<" "<<bar<<flush;
|
||||||
|
}
|
||||||
|
void ui_thread() {
|
||||||
|
lock_guard<mutex> lock(ui_mutex);
|
||||||
|
while (!parsing_done) {
|
||||||
|
print_progress_line("Parsing",parsed_files.load(),total_files);
|
||||||
|
this_thread::sleep_for(chrono::milliseconds(50));
|
||||||
|
}
|
||||||
|
cout<<"\n";
|
||||||
|
encoding_started=true;
|
||||||
|
while (!encoding_done) {
|
||||||
|
print_progress_line("Encoding",encoded_files.load(),total_files);
|
||||||
|
this_thread::sleep_for(chrono::milliseconds(50));
|
||||||
|
}
|
||||||
|
cout<<"\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
uint32_t get_id(const string& type) {
|
uint32_t get_id(const string& type) {
|
||||||
{
|
{
|
||||||
auto it=type_to_id.find(type);
|
auto it=type_to_id.find(type);
|
||||||
@@ -311,7 +357,6 @@ void iterate_all_nodes_loop_call(thread_iterate_input_loop_call &settings,TSNode
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
thread_rec_map_result run_thread_rec_map(size_t thread_num) {
|
thread_rec_map_result run_thread_rec_map(size_t thread_num) {
|
||||||
auto start=chrono::high_resolution_clock::now();
|
|
||||||
thread_rec_map_result res;
|
thread_rec_map_result res;
|
||||||
TSParser *parser=ts_parser_new();
|
TSParser *parser=ts_parser_new();
|
||||||
ts_parser_set_language(parser,tree_sitter_c());
|
ts_parser_set_language(parser,tree_sitter_c());
|
||||||
@@ -344,11 +389,9 @@ thread_rec_map_result run_thread_rec_map(size_t thread_num) {
|
|||||||
encoding_files_queue.push(std::move(f));
|
encoding_files_queue.push(std::move(f));
|
||||||
}
|
}
|
||||||
if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
|
if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
|
||||||
|
parsed_files++;
|
||||||
}
|
}
|
||||||
ts_parser_delete(parser);
|
ts_parser_delete(parser);
|
||||||
auto end=chrono::high_resolution_clock::now();
|
|
||||||
auto ms=chrono::duration_cast<chrono::milliseconds>(end-start).count();
|
|
||||||
cout<<"Recccurences map thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<endl;
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
void generate_c_keyword(bit_streamer& bitstream,size_t index) {
|
void generate_c_keyword(bit_streamer& bitstream,size_t index) {
|
||||||
@@ -358,17 +401,31 @@ void generate_c_keyword(bit_streamer& bitstream,size_t index) {
|
|||||||
bitstream.align();
|
bitstream.align();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
void generate_rec(bit_streamer& bitstream,size_t index,size_t total_recs) {
|
void generate_rec(bit_streamer& bitstream,size_t index) {
|
||||||
size_t bits=0;
|
if (index==0) {
|
||||||
while (total_recs) {
|
|
||||||
total_recs>>=1;
|
|
||||||
++bits;
|
|
||||||
}
|
|
||||||
bitstream.align();
|
bitstream.align();
|
||||||
bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
|
bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
|
||||||
bitstream.write_bits(index,bits);
|
bitstream.write_bits(0,1);
|
||||||
|
bitstream.write_bits(index,3);
|
||||||
bitstream.align();
|
bitstream.align();
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
size_t tmp=index+1;
|
||||||
|
size_t k=0;
|
||||||
|
size_t threshold=1ULL<<3;
|
||||||
|
while (tmp>=threshold) {
|
||||||
|
tmp>>=3;
|
||||||
|
++k;
|
||||||
|
}
|
||||||
|
size_t payload_bits=3*(k+1);
|
||||||
|
bitstream.align();
|
||||||
|
bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
|
||||||
|
for (size_t i=0;i<k;++i) {
|
||||||
|
bitstream.write_bits(1,1);
|
||||||
|
}
|
||||||
|
bitstream.write_bits(0,1);
|
||||||
|
bitstream.write_bits(index,payload_bits);
|
||||||
|
bitstream.align();
|
||||||
}
|
}
|
||||||
void generate_delimiter0(bit_streamer& bitstream,size_t index) {
|
void generate_delimiter0(bit_streamer& bitstream,size_t index) {
|
||||||
bitstream.align();
|
bitstream.align();
|
||||||
@@ -422,7 +479,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
|||||||
generate_string_content(out,text.data(),text.size());
|
generate_string_content(out,text.data(),text.size());
|
||||||
} else {
|
} else {
|
||||||
size_t index=it->second;
|
size_t index=it->second;
|
||||||
generate_rec(out,index,rec_list.size());
|
generate_rec(out,index);
|
||||||
}
|
}
|
||||||
} else if (type==ID_PRIMITIVE_TYPE || type==ID_TYPE_IDENTIFIER) {
|
} else if (type==ID_PRIMITIVE_TYPE || type==ID_TYPE_IDENTIFIER) {
|
||||||
auto it=c_keyword_lookup.find(string(text));
|
auto it=c_keyword_lookup.find(string(text));
|
||||||
@@ -440,7 +497,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
size_t index=it->second;
|
size_t index=it->second;
|
||||||
generate_rec(out,index,rec_list.size());
|
generate_rec(out,index);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (delimiter0_lookup.find(id_to_type[type])!=delimiter0_lookup.end() || delimiter1_lookup.find(id_to_type[type])!=delimiter1_lookup.end() || type==ID_QUOTE) {
|
} else if (delimiter0_lookup.find(id_to_type[type])!=delimiter0_lookup.end() || delimiter1_lookup.find(id_to_type[type])!=delimiter1_lookup.end() || type==ID_QUOTE) {
|
||||||
@@ -542,7 +599,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
size_t index=it->second;
|
size_t index=it->second;
|
||||||
generate_rec(out,index,rec_list.size());
|
generate_rec(out,index);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -566,7 +623,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
size_t index=it->second;
|
size_t index=it->second;
|
||||||
generate_rec(out,index,rec_list.size());
|
generate_rec(out,index);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto it=rec_lookup.find(id_to_type[type]);
|
auto it=rec_lookup.find(id_to_type[type]);
|
||||||
@@ -579,7 +636,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
size_t index=it->second;
|
size_t index=it->second;
|
||||||
generate_rec(out,index,rec_list.size());
|
generate_rec(out,index);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -587,11 +644,9 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
thread_encoding_result run_thread_encoding(size_t thread_num) {
|
thread_encoding_result run_thread_encoding(size_t thread_num) {
|
||||||
auto start=chrono::high_resolution_clock::now();
|
|
||||||
thread_encoding_result res;
|
thread_encoding_result res;
|
||||||
vector<bit_streamer> thread_local_encoded_files;
|
vector<bit_streamer> thread_local_encoded_files;
|
||||||
int counter=0;
|
int counter=0;
|
||||||
int max=0;
|
|
||||||
while (true) {
|
while (true) {
|
||||||
file_entry f;
|
file_entry f;
|
||||||
{
|
{
|
||||||
@@ -611,11 +666,9 @@ thread_encoding_result run_thread_encoding(size_t thread_num) {
|
|||||||
vector<node>().swap(encoding_loop_settings.node_list);
|
vector<node>().swap(encoding_loop_settings.node_list);
|
||||||
string().swap(f.content);
|
string().swap(f.content);
|
||||||
if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
|
if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
|
||||||
|
encoded_files++;
|
||||||
}
|
}
|
||||||
res.encoded_files=std::move(thread_local_encoded_files);
|
res.encoded_files=std::move(thread_local_encoded_files);
|
||||||
auto end=chrono::high_resolution_clock::now();
|
|
||||||
auto ms=chrono::duration_cast<chrono::milliseconds>(end-start).count();
|
|
||||||
cout<<"Parsing/encoding thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<endl;
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
int main(int argc,char **argv) {
|
int main(int argc,char **argv) {
|
||||||
@@ -716,7 +769,8 @@ int main(int argc,char **argv) {
|
|||||||
rec_map_files_queue.push(std::move(f));
|
rec_map_files_queue.push(std::move(f));
|
||||||
}
|
}
|
||||||
size_t nb_threads=thread::hardware_concurrency();
|
size_t nb_threads=thread::hardware_concurrency();
|
||||||
size_t total_files=files.size();
|
total_files=files.size();
|
||||||
|
thread ui(ui_thread);
|
||||||
vector<future<thread_rec_map_result>> rec_map_futures;
|
vector<future<thread_rec_map_result>> rec_map_futures;
|
||||||
for (size_t i=0;i<nb_threads;++i) {
|
for (size_t i=0;i<nb_threads;++i) {
|
||||||
rec_map_futures.push_back(async(launch::async,run_thread_rec_map,i+1));
|
rec_map_futures.push_back(async(launch::async,run_thread_rec_map,i+1));
|
||||||
@@ -729,6 +783,7 @@ int main(int argc,char **argv) {
|
|||||||
global_rec_map[str]+=count;
|
global_rec_map[str]+=count;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
parsing_done=true;
|
||||||
for (auto const& [str,count]:global_rec_map) {
|
for (auto const& [str,count]:global_rec_map) {
|
||||||
if (count>=2 && str.size()>=3) {
|
if (count>=2 && str.size()>=3) {
|
||||||
rec_list.push_back(str);
|
rec_list.push_back(str);
|
||||||
@@ -756,6 +811,7 @@ int main(int argc,char **argv) {
|
|||||||
for (auto& fut:encoding_futures) {
|
for (auto& fut:encoding_futures) {
|
||||||
all_encoding_results.push_back(fut.get());
|
all_encoding_results.push_back(fut.get());
|
||||||
}
|
}
|
||||||
|
encoding_done=true;
|
||||||
vector<bit_streamer> globals_bit_stream;
|
vector<bit_streamer> globals_bit_stream;
|
||||||
for (auto& res:all_encoding_results) {
|
for (auto& res:all_encoding_results) {
|
||||||
globals_bit_stream.insert(globals_bit_stream.end(),res.encoded_files.begin(),res.encoded_files.end());
|
globals_bit_stream.insert(globals_bit_stream.end(),res.encoded_files.begin(),res.encoded_files.end());
|
||||||
@@ -775,6 +831,8 @@ int main(int argc,char **argv) {
|
|||||||
final_payloads.insert(final_payloads.end(),encoded_file.begin(),encoded_file.end());
|
final_payloads.insert(final_payloads.end(),encoded_file.begin(),encoded_file.end());
|
||||||
current_offset+=encoded_file.size();
|
current_offset+=encoded_file.size();
|
||||||
}
|
}
|
||||||
|
ui.join();
|
||||||
|
lock_guard<mutex> lock(ui_mutex);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Payload compression
|
// Payload compression
|
||||||
|
|||||||
950
ccc.cpp.save
950
ccc.cpp.save
@@ -1,950 +0,0 @@
|
|||||||
#include <cstdint>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstring>
|
|
||||||
#include <exception>
|
|
||||||
#include <iostream>
|
|
||||||
#include <filesystem>
|
|
||||||
#include <fstream>
|
|
||||||
#include <mutex>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string>
|
|
||||||
#include <string_view>
|
|
||||||
#include <threads.h>
|
|
||||||
#include <vector>
|
|
||||||
#include <map>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <iterator>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <thread>
|
|
||||||
#include <future>
|
|
||||||
#include <queue>
|
|
||||||
#include <chrono>
|
|
||||||
#include <tree_sitter/api.h>
|
|
||||||
#include <tree_sitter/tree-sitter-c.h>
|
|
||||||
#include <lzma.h>
|
|
||||||
#include <xxh3.h>
|
|
||||||
#include <malloc.h>
|
|
||||||
using namespace std;
|
|
||||||
namespace fs=filesystem;
|
|
||||||
const uint64_t CCC_DELIMITER_0_HEAD=0b0;
|
|
||||||
const uint64_t CCC_DELIMITER_1_HEAD=0b10;
|
|
||||||
const uint64_t CCC_C_KEYWORD_HEAD=0b1100;
|
|
||||||
const uint64_t CCC_MISCELANEOUS_HEAD=0b1101;
|
|
||||||
const uint64_t CCC_STRING_INLINE_HEAD=0b1110;
|
|
||||||
const uint64_t CCC_REC_TABLE_REF_HEAD=0b1111;
|
|
||||||
const uint64_t CCC_STRING_INLINE_END=0b00000000;
|
|
||||||
#define CCC_ADD_COMPONENT(vec,tail) \
|
|
||||||
do { \
|
|
||||||
auto tmp=tail; \
|
|
||||||
vec.insert(vec.end(),tmp.begin(),tmp.end()); \
|
|
||||||
} while (0)
|
|
||||||
struct XXH3HasherString {
|
|
||||||
size_t operator()(const std::string& s) const {
|
|
||||||
return static_cast<size_t>(XXH3_64bits(s.data(),s.size()));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// MSB-first bit writer that accumulates its output in a byte vector.
//
// Bits are packed into a pending byte starting at its most significant bit.
// A byte is appended to the buffer once eight bits have been written, or
// earlier when align() is called (the unused low bits are then zero).
class bit_streamer {
private:
	std::vector<uint8_t> out;
	// Byte currently being filled and number of bits already placed in it.
	uint8_t current_byte=0;
	uint8_t bit_pos=0;

	// Appends the pending byte to the buffer and starts a fresh one.
	void flush_byte() {
		out.push_back(current_byte);
		current_byte=0;
		bit_pos=0;
	}
public:
	// Caller-assigned identifier carried alongside the stream.
	size_t index;

	bit_streamer(size_t index):index(index) {
		out.reserve(1024*1024);
	}

	// Number of complete bytes emitted so far (pending bits are not counted).
	size_t get_size() const {
		return out.size();
	}

	// Writes the `count` low-order bits of `value`, most significant first.
	// Positions at or above bit 64 do not exist in `value` and are written as
	// zero; shifting a 64-bit value by 64 or more would be undefined behaviour.
	void write_bits(uint64_t value,uint8_t count) {
		for (int i=count-1;i>=0;--i) {
			const bool bit_set=(i<64) && (((value>>i) & 1)!=0);
			if (bit_set) {
				current_byte|=static_cast<uint8_t>(1u<<(7-bit_pos));
			}
			if (++bit_pos==8) {
				flush_byte();
			}
		}
	}

	// Pads the pending byte with zero bits and emits it; no-op when already
	// on a byte boundary.
	void align() {
		if (bit_pos>0) {
			flush_byte();
		}
	}

	// Read-only view of the bytes emitted so far.
	const std::vector<uint8_t>& get_out() const {
		return out;
	}

	// Flushes pending bits and hands the buffer over to the caller. The
	// streamer is left with a well-defined empty buffer.
	std::vector<uint8_t> extract_buffer() {
		align();
		std::vector<uint8_t> buffer;
		buffer.swap(out);
		return buffer;
	}
};
|
|
||||||
const vector<string> delimiter0={
|
|
||||||
"{",
|
|
||||||
"}",
|
|
||||||
"(",
|
|
||||||
")",
|
|
||||||
"[",
|
|
||||||
"]",
|
|
||||||
",",
|
|
||||||
"."
|
|
||||||
};
|
|
||||||
const vector<string> delimiter1={
|
|
||||||
"{}",
|
|
||||||
"()",
|
|
||||||
"[]",
|
|
||||||
";"
|
|
||||||
};
|
|
||||||
const vector<string> miscellaneous={
|
|
||||||
"!",
|
|
||||||
"%",
|
|
||||||
"'",
|
|
||||||
"*",
|
|
||||||
"+",
|
|
||||||
"-",
|
|
||||||
"/",
|
|
||||||
":",
|
|
||||||
"<",
|
|
||||||
">",
|
|
||||||
"=",
|
|
||||||
"?",
|
|
||||||
"^",
|
|
||||||
"|",
|
|
||||||
"&",
|
|
||||||
"~",
|
|
||||||
"+=",
|
|
||||||
"-=",
|
|
||||||
"*=",
|
|
||||||
"/=",
|
|
||||||
"%=",
|
|
||||||
"&=",
|
|
||||||
"|=",
|
|
||||||
"^=",
|
|
||||||
"<<=",
|
|
||||||
">>=",
|
|
||||||
"++",
|
|
||||||
"--",
|
|
||||||
"<<",
|
|
||||||
">>",
|
|
||||||
"==",
|
|
||||||
"!=",
|
|
||||||
"<=",
|
|
||||||
">=",
|
|
||||||
"->",
|
|
||||||
"...",
|
|
||||||
"||",
|
|
||||||
"&&",
|
|
||||||
"NULL",
|
|
||||||
"size_t",
|
|
||||||
"uint8_t",
|
|
||||||
"uint16_t",
|
|
||||||
"uint32_t",
|
|
||||||
"uint64_t",
|
|
||||||
"int8_t",
|
|
||||||
"int16_t",
|
|
||||||
"int32_t",
|
|
||||||
"int64_t"
|
|
||||||
};
|
|
||||||
const vector<string> c_keywords={
|
|
||||||
"#if",
|
|
||||||
"#ifdef",
|
|
||||||
"#ifndef",
|
|
||||||
"#else",
|
|
||||||
"#elif",
|
|
||||||
"#elifdef",
|
|
||||||
"#elifndef",
|
|
||||||
"#endif",
|
|
||||||
"#define",
|
|
||||||
"#undef",
|
|
||||||
"#include",
|
|
||||||
"#error",
|
|
||||||
"#warning",
|
|
||||||
"#pragma",
|
|
||||||
"#line",
|
|
||||||
"alignas",
|
|
||||||
"alignof",
|
|
||||||
"auto",
|
|
||||||
"bool",
|
|
||||||
"break",
|
|
||||||
"case",
|
|
||||||
"char",
|
|
||||||
"const",
|
|
||||||
"constexpr",
|
|
||||||
"continue",
|
|
||||||
"default",
|
|
||||||
"do",
|
|
||||||
"double",
|
|
||||||
"else",
|
|
||||||
"enum",
|
|
||||||
"extern",
|
|
||||||
"false",
|
|
||||||
"float",
|
|
||||||
"for",
|
|
||||||
"goto",
|
|
||||||
"if",
|
|
||||||
"inline",
|
|
||||||
"int",
|
|
||||||
"long",
|
|
||||||
"nullptr",
|
|
||||||
"register",
|
|
||||||
"restrict",
|
|
||||||
"return",
|
|
||||||
"short",
|
|
||||||
"signed",
|
|
||||||
"sizeof",
|
|
||||||
"static",
|
|
||||||
"static_assert",
|
|
||||||
"struct",
|
|
||||||
"switch",
|
|
||||||
"thread_local",
|
|
||||||
"true",
|
|
||||||
"typedef",
|
|
||||||
"typeof",
|
|
||||||
"typeof_unequal",
|
|
||||||
"union",
|
|
||||||
"unsigned",
|
|
||||||
"void",
|
|
||||||
"volatile",
|
|
||||||
"while",
|
|
||||||
"__asm__",
|
|
||||||
"__attribute__",
|
|
||||||
"defined",
|
|
||||||
};
|
|
||||||
#pragma pack(push,1)
|
|
||||||
struct header {
|
|
||||||
uint8_t sig[3];
|
|
||||||
uint8_t flags;
|
|
||||||
size_t size_rec_table;
|
|
||||||
size_t entry_count;
|
|
||||||
size_t size_payload;
|
|
||||||
};
|
|
||||||
#pragma pack(pop)
|
|
||||||
struct node {
|
|
||||||
uint16_t type;
|
|
||||||
uint32_t start;
|
|
||||||
uint32_t end;
|
|
||||||
};
|
|
||||||
struct file_entry {
|
|
||||||
string name;
|
|
||||||
string content;
|
|
||||||
size_t size;
|
|
||||||
size_t index;
|
|
||||||
};
|
|
||||||
struct thread_iterate_input_loop_call {
|
|
||||||
string &source_code;
|
|
||||||
vector<node> &thread_local_node_list;
|
|
||||||
unordered_map<uint16_t,string>& thread_local_type_map;
|
|
||||||
unordered_map<string,uint16_t,XXH3HasherString>& thread_local_type_u16_map;
|
|
||||||
uint16_t thread_local_next_type_id;
|
|
||||||
map<string,int> thread_local_rec_map;
|
|
||||||
};
|
|
||||||
struct thread_rec_map_result {
|
|
||||||
map<string,int> thread_local_rec_map;
|
|
||||||
};
|
|
||||||
struct thread_encoding_input_loop_call {
|
|
||||||
string &source_code;
|
|
||||||
vector<node> &node_list;
|
|
||||||
unordered_map<uint16_t,string>& thread_local_type_map;
|
|
||||||
bit_streamer& thread_local_bit_stream;
|
|
||||||
};
|
|
||||||
struct thread_encoding_result {
|
|
||||||
vector<bit_streamer> encoded_files;
|
|
||||||
};
|
|
||||||
enum iterating_mode {
|
|
||||||
REC_MAP,
|
|
||||||
PARSING
|
|
||||||
};
|
|
||||||
queue<file_entry> rec_map_files_queue;
|
|
||||||
mutex rec_map_queue_mutex;
|
|
||||||
queue<file_entry> encoding_files_queue;
|
|
||||||
mutex encoding_queue_mutex;
|
|
||||||
vector<string> rec_list;
|
|
||||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> rec_lookup;
|
|
||||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> c_keyword_lookup;
|
|
||||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> miscelaneous_lookup;
|
|
||||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> delimiter0_lookup;
|
|
||||||
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> delimiter1_lookup;
|
|
||||||
bool show_warning=false;
|
|
||||||
bool fail_on_warning=false;
|
|
||||||
bool enable_malloc_trim=true;
|
|
||||||
// Depth-first walk over a tree-sitter syntax tree that processes leaf nodes.
//
// Inner nodes are only recursed into. For every leaf (a node without
// children) the action depends on `mode`:
//  - REC_MAP: updates settings.thread_local_rec_map, keyed by the leaf text.
//    Listed textual node types and primitive types that are not in c_keywords
//    are counted; comments are forced to a count of 2.
//  - PARSING: assigns a 16-bit id to each distinct node type on first sight
//    and appends {type id, start byte, end byte} to the node list.
void iterate_all_nodes_loop_call(thread_iterate_input_loop_call &settings,TSNode current_node,iterating_mode mode) {
	const uint32_t child_total=ts_node_child_count(current_node);
	if (child_total!=0) {
		for (uint32_t idx=0;idx<child_total;++idx) {
			iterate_all_nodes_loop_call(settings,ts_node_child(current_node,idx),mode);
		}
		return;
	}

	const uint32_t start=ts_node_start_byte(current_node);
	const uint32_t end=ts_node_end_byte(current_node);
	const std::string_view text{settings.source_code.data()+start,end-start};
	const std::string type(ts_node_type(current_node));

	switch (mode) {
	case iterating_mode::REC_MAP: {
		const bool counted_kind=
			type=="string_content" || type=="system_lib_string" || type=="identifier" ||
			type=="number_literal" || type=="type_identifier" || type=="field_identifier" ||
			type=="escape_sequence" || type=="statement_identifier";
		if (counted_kind) {
			++settings.thread_local_rec_map[std::string(text)];
		}
		if (type=="primitive_type" && std::find(c_keywords.begin(),c_keywords.end(),text)==c_keywords.end()) {
			++settings.thread_local_rec_map[std::string(text)];
		}
		if (type=="comment") {
			settings.thread_local_rec_map[std::string(text)]=2;
		}
		break;
	}
	case iterating_mode::PARSING: {
		// try_emplace only inserts when the type name is new.
		const auto [slot,is_new]=settings.thread_local_type_u16_map.try_emplace(type,settings.thread_local_next_type_id);
		if (is_new) {
			settings.thread_local_type_map[slot->second]=type;
			settings.thread_local_next_type_id++;
		}
		settings.thread_local_node_list.push_back({.type=slot->second,.start=start,.end=end});
		break;
	}
	}
}
|
|
||||||
// Worker for the recurrence-counting pass.
//
// Repeatedly pops a file from rec_map_files_queue, parses it with a
// tree-sitter C parser, walks the tree in REC_MAP mode, then forwards the file
// to encoding_files_queue. Returns once the input queue is empty.
//
// thread_num: number used only in the completion message.
// Returns the thread_rec_map_result built by this worker.
thread_rec_map_result run_thread_rec_map(size_t thread_num) {
	const auto start=std::chrono::high_resolution_clock::now();
	thread_rec_map_result res;

	// Containers required by thread_iterate_input_loop_call; REC_MAP mode
	// never writes to them.
	std::unordered_map<std::string,uint16_t,XXH3HasherString> useless_type_u16_map;
	std::vector<node> useless_node_vector;
	std::unordered_map<uint16_t,std::string> useless_type_map;

	TSParser *parser=ts_parser_new();
	ts_parser_set_language(parser,tree_sitter_c());
	int counter=0;

	while (true) {
		file_entry f;
		{
			std::lock_guard<std::mutex> lock(rec_map_queue_mutex);
			if (rec_map_files_queue.empty()) break;
			f=std::move(rec_map_files_queue.front());
			rec_map_files_queue.pop();
		}

		thread_iterate_input_loop_call loop_settings {
			.source_code=f.content,
			.thread_local_node_list=useless_node_vector,
			.thread_local_type_map=useless_type_map,
			.thread_local_type_u16_map=useless_type_u16_map,
			.thread_local_next_type_id=0,
			.thread_local_rec_map=res.thread_local_rec_map
		};

		TSTree *tree=ts_parser_parse_string(parser,nullptr,f.content.c_str(),static_cast<uint32_t>(f.content.size()));
		if (tree!=nullptr) {
			iterate_all_nodes_loop_call(loop_settings,ts_tree_root_node(tree),iterating_mode::REC_MAP);
			ts_tree_delete(tree);
		} else if (show_warning) {
			// The parser can return NULL (e.g. when no language could be
			// assigned); asking for the root of a NULL tree would crash.
			std::cout<<"Warning: could not parse file: "<<f.name<<std::endl;
		}

		// The file is forwarded even when it could not be parsed, so that it
		// is not silently dropped from the encoding stage.
		{
			std::lock_guard<std::mutex> lock(encoding_queue_mutex);
			encoding_files_queue.push(std::move(f));
		}
		if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
	}

	ts_parser_delete(parser);
	const auto end=std::chrono::high_resolution_clock::now();
	const auto ms=std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count();
	std::cout<<"Recccurences map thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<std::endl;
	return res;
}
|
|
||||||
// Emits a C-keyword token: the 4-bit CCC_C_KEYWORD_HEAD tag followed by a
// 6-bit index, byte-aligned before and after.
void generate_c_keyword(bit_streamer& bitstream,size_t index) {
	constexpr uint8_t head_width=4;
	constexpr uint8_t index_width=6;
	bitstream.align();
	bitstream.write_bits(CCC_C_KEYWORD_HEAD,head_width);
	bitstream.write_bits(index,index_width);
	bitstream.align();
}
|
|
||||||
// Emits a recurrence-table reference: the 4-bit CCC_REC_TABLE_REF_HEAD tag
// followed by `index`, written with as many bits as total_recs has
// significant bits. The token is byte-aligned before and after.
void generate_rec(bit_streamer& bitstream,size_t index,size_t total_recs) {
	// Count the significant bits of total_recs (0 for total_recs==0).
	size_t index_width=0;
	for (size_t remaining=total_recs;remaining!=0;remaining>>=1) {
		++index_width;
	}
	bitstream.align();
	bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
	bitstream.write_bits(index,index_width);
	bitstream.align();
}
|
|
||||||
// Emits a class-0 delimiter token: the 1-bit CCC_DELIMITER_0_HEAD tag followed
// by a 3-bit index, byte-aligned before and after.
void generate_delimiter0(bit_streamer& bitstream,size_t index) {
	constexpr uint8_t head_width=1;
	constexpr uint8_t index_width=3;
	bitstream.align();
	bitstream.write_bits(CCC_DELIMITER_0_HEAD,head_width);
	bitstream.write_bits(index,index_width);
	bitstream.align();
}
|
|
||||||
// Emits a class-1 delimiter token: the 2-bit CCC_DELIMITER_1_HEAD tag followed
// by a 2-bit index, byte-aligned before and after.
void generate_delimiter1(bit_streamer& bitstream,size_t index) {
	constexpr uint8_t head_width=2;
	constexpr uint8_t index_width=2;
	bitstream.align();
	bitstream.write_bits(CCC_DELIMITER_1_HEAD,head_width);
	bitstream.write_bits(index,index_width);
	bitstream.align();
}
|
|
||||||
// Emits a miscellaneous token: the 4-bit CCC_MISCELANEOUS_HEAD tag followed by
// a 6-bit index, byte-aligned before and after.
void generate_miscellaneous(bit_streamer& bitstream,size_t index) {
	constexpr uint8_t head_width=4;
	constexpr uint8_t index_width=6;
	bitstream.align();
	bitstream.write_bits(CCC_MISCELANEOUS_HEAD,head_width);
	bitstream.write_bits(index,index_width);
	bitstream.align();
}
|
|
||||||
// Emits an inline string token: the 4-bit CCC_STRING_INLINE_HEAD tag, the
// text_len bytes of `text` (8 bits each), then the 8-bit
// CCC_STRING_INLINE_END terminator. The token is byte-aligned before and after.
//
// NOTE(review): the terminator is a zero byte, so a zero byte inside `text`
// presumably cannot be told apart from the end marker when decoding — confirm
// against the decoder.
void generate_string_content(bit_streamer& bitstream,const char *text,size_t text_len) {
	bitstream.align();
	bitstream.write_bits(CCC_STRING_INLINE_HEAD,4);
	// size_t index: an int index would be compared against an unsigned length
	// and would overflow on inputs larger than INT_MAX.
	for (size_t i=0;i<text_len;++i) {
		// Go through unsigned char so bytes >= 0x80 are not sign-extended.
		bitstream.write_bits(static_cast<unsigned char>(text[i]),8);
	}
	bitstream.write_bits(CCC_STRING_INLINE_END,8);
	bitstream.align();
}
|
|
||||||
void print_warning(string text) {
|
|
||||||
if (show_warning==true) {
|
|
||||||
cout<<text<<endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void fail_if_warning() {
|
|
||||||
if (fail_on_warning) {
|
|
||||||
exit(-1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
|
||||||
bit_streamer& out=settings.thread_local_bit_stream;
|
|
||||||
for (int i=0;i<settings.node_list.size();i++) {
|
|
||||||
node n=settings.node_list.at(i);
|
|
||||||
string type=settings.thread_local_type_map[n.type];
|
|
||||||
char temp[256];
|
|
||||||
string_view text{settings.source_code.data()+n.start,n.end-n.start};
|
|
||||||
if (type=="string_content" || type=="system_lib_string" || type=="identifier" || type=="number_literal" || type=="field_identifier" || type=="preproc_arg" || type=="escape_sequence" || type=="character" || type=="statement_identifier") {
|
|
||||||
auto it=rec_lookup.find(string(text));
|
|
||||||
if (it==rec_lookup.end()) {
|
|
||||||
generate_string_content(out,text.data(),text.size());
|
|
||||||
} else {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_rec(out,index,rec_list.size());
|
|
||||||
}
|
|
||||||
} else if (type=="primitive_type" || type=="type_identifier") {
|
|
||||||
auto it=c_keyword_lookup.find(string(text));
|
|
||||||
if (it!=c_keyword_lookup.end()) {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_c_keyword(out,index);
|
|
||||||
} else {
|
|
||||||
auto it=rec_lookup.find(string(text));
|
|
||||||
if (it==rec_lookup.end()) {
|
|
||||||
if (!text.empty()) {
|
|
||||||
generate_string_content(out,text.data(),text.size());
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: type node is empty: "+string(text));
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_rec(out,index,rec_list.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (delimiter0_lookup.find(type)!=delimiter0_lookup.end() || delimiter1_lookup.find(type)!=delimiter1_lookup.end() || type=="\"") {
|
|
||||||
string insert;
|
|
||||||
if (type=="(" && i+1<settings.node_list.size()) {
|
|
||||||
if (settings.thread_local_type_map[settings.node_list.at(i+1).type]==")") {
|
|
||||||
insert="()";
|
|
||||||
i++;
|
|
||||||
} else {
|
|
||||||
insert="(";
|
|
||||||
}
|
|
||||||
} else if (type=="[" && i+1<settings.node_list.size()) {
|
|
||||||
if (settings.thread_local_type_map[settings.node_list.at(i+1).type]=="]") {
|
|
||||||
insert="[]";
|
|
||||||
i++;
|
|
||||||
} else {
|
|
||||||
insert="[";
|
|
||||||
}
|
|
||||||
} else if (type=="{" && i+1<settings.node_list.size()) {
|
|
||||||
if (settings.thread_local_type_map[settings.node_list.at(i+1).type]=="}") {
|
|
||||||
insert="{}";
|
|
||||||
i++;
|
|
||||||
} else {
|
|
||||||
insert="{";
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
insert=type;
|
|
||||||
}
|
|
||||||
auto it=delimiter0_lookup.find(insert);
|
|
||||||
if (it!=delimiter0_lookup.end()) {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_delimiter0(out,index);
|
|
||||||
} else {
|
|
||||||
if (insert!="{}" && insert!="\"") {
|
|
||||||
auto it=delimiter1_lookup.find(insert);
|
|
||||||
if (it!=delimiter1_lookup.end()) {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_delimiter1(out,index);
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (insert=="{}") {
|
|
||||||
auto it=delimiter1_lookup.find("{}");
|
|
||||||
if (it!=delimiter1_lookup.end()) {
|
|
||||||
size_t index=it->second;
|
|
||||||
out.align();
|
|
||||||
out.write_bits(CCC_DELIMITER_1_HEAD,2);
|
|
||||||
out.write_bits(index,2);
|
|
||||||
out.write_bits(0b0,1);
|
|
||||||
out.align();
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else if (insert=="\"") {
|
|
||||||
auto it=delimiter1_lookup.find("{}");
|
|
||||||
if (it!=delimiter1_lookup.end()) {
|
|
||||||
size_t index=it->second;
|
|
||||||
out.align();
|
|
||||||
out.write_bits(CCC_DELIMITER_1_HEAD,2);
|
|
||||||
out.write_bits(index,2);
|
|
||||||
out.write_bits(0b1,1);
|
|
||||||
out.align();
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (c_keyword_lookup.find(type)!=c_keyword_lookup.end() || type=="preproc_directive") {
|
|
||||||
if (type!="preproc_directive") {
|
|
||||||
auto it=c_keyword_lookup.find(type);
|
|
||||||
if (it!=c_keyword_lookup.end()) {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_c_keyword(out,index);
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: unknow C keyword, that shouldn't happen: "+type+" "+string(text));
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
auto it=c_keyword_lookup.find(string(text));
|
|
||||||
if (it!=c_keyword_lookup.end()) {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_c_keyword(out,index);
|
|
||||||
} else {
|
|
||||||
auto it=rec_lookup.find(string(text));
|
|
||||||
if (it==rec_lookup.end()) {
|
|
||||||
if (!text.empty()) {
|
|
||||||
generate_string_content(out,text.data(),text.size());
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: C keyword is empty: "+string(text));
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_rec(out,index,rec_list.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (miscelaneous_lookup.find(type)!=miscelaneous_lookup.end()) {
|
|
||||||
auto it=miscelaneous_lookup.find(type);
|
|
||||||
if (it!=miscelaneous_lookup.end()) {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_miscellaneous(out,index);
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: unknow miscellaneous, that shouldn't happen: "+type);
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else if (type=="comment") {
|
|
||||||
auto it=rec_lookup.find(string(text));
|
|
||||||
if (it==rec_lookup.end()) {
|
|
||||||
if (!text.empty()) {
|
|
||||||
generate_string_content(out,text.data(),text.size());
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: comment is empty: "+string(text));
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_rec(out,index,rec_list.size());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
auto it=rec_lookup.find(type);
|
|
||||||
if (it==rec_lookup.end()) {
|
|
||||||
if (!text.empty()) {
|
|
||||||
generate_string_content(out,text.data(),text.size());
|
|
||||||
} else {
|
|
||||||
print_warning("Warning: unknow node is empty: "+string(text));
|
|
||||||
fail_if_warning();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
size_t index=it->second;
|
|
||||||
generate_rec(out,index,rec_list.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out.align();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
/**
 * Worker body for the parsing/encoding phase.
 *
 * Repeatedly takes one file from the shared `encoding_files_queue` (under
 * `encoding_queue_mutex`), parses it with a tree-sitter C parser, collects its
 * nodes, and encodes them into a per-file `bit_streamer`. The loop ends once
 * the queue is empty.
 *
 * @param thread_num Number used only in the completion message.
 * @return Result whose `encoded_files` holds one bit stream per file handled
 *         by this worker, in the order the files were dequeued.
 */
thread_encoding_result run_thread_encoding(size_t thread_num) {
    const auto t_begin=chrono::high_resolution_clock::now();
    thread_encoding_result result;
    // Passed to the node iteration call as its rec map; never read back here.
    map<string,int> useless_rec_map;
    unordered_map<uint16_t,string> thread_local_type_map;
    unordered_map<string,uint16_t,XXH3HasherString> thread_local_type_u16_map;
    vector<node> thread_local_node_list;
    vector<bit_streamer> thread_local_encoded_files;
    TSParser *parser=ts_parser_new();
    // NOTE(review): the result of ts_parser_set_language is not checked.
    ts_parser_set_language(parser,tree_sitter_c());
    // Pops the next pending file while holding the queue mutex.
    // Returns false once the queue has been drained.
    auto take_next_file=[](file_entry& dst) {
        lock_guard<mutex> guard(encoding_queue_mutex);
        if (encoding_files_queue.empty()) return false;
        dst=std::move(encoding_files_queue.front());
        encoding_files_queue.pop();
        return true;
    };
    int processed=0;
    for (;;) {
        file_entry entry;
        if (!take_next_file(entry)) break;
        thread_iterate_input_loop_call iterate_loop_settings {
            .source_code=entry.content,
            .thread_local_node_list=thread_local_node_list,
            .thread_local_type_map=thread_local_type_map,
            .thread_local_type_u16_map=thread_local_type_u16_map,
            .thread_local_next_type_id=0,
            .thread_local_rec_map=useless_rec_map
        };
        // The output stream must exist before the encoding settings refer to it.
        thread_local_encoded_files.emplace_back(entry.index);
        thread_encoding_input_loop_call encoding_loop_settings {
            .source_code=entry.content,
            .node_list=thread_local_node_list,
            .thread_local_type_map=thread_local_type_map,
            .thread_local_bit_stream=thread_local_encoded_files.back()
        };
        // NOTE(review): the returned tree is used without a null check.
        TSTree *tree=ts_parser_parse_string(parser,nullptr,entry.content.c_str(),entry.content.size());
        TSNode root=ts_tree_root_node(tree);
        iterate_loop_settings.source_code=entry.content;
        iterate_all_nodes_loop_call(iterate_loop_settings,root,iterating_mode::PARSING);
        ts_tree_delete(tree);
        encoding_loop_settings.source_code=entry.content;
        process_file_nodes_loop_call(encoding_loop_settings);
        // Drop the per-file state; swapping with empty temporaries also gives up capacity.
        vector<node>().swap(thread_local_node_list);
        thread_local_type_u16_map.clear();
        thread_local_type_map.clear();
        string().swap(entry.content);
        ++processed;
        // Every 20 files, ask malloc to trim unless that has been disabled.
        if (processed%20==0 && enable_malloc_trim) malloc_trim(0);
    }
    ts_parser_delete(parser);
    result.encoded_files=std::move(thread_local_encoded_files);
    const auto t_end=chrono::high_resolution_clock::now();
    const auto ms=chrono::duration_cast<chrono::milliseconds>(t_end-t_begin).count();
    cout<<"Parsing/encoding thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<endl;
    return result;
}
|
|
||||||
int main(int argc,char **argv) {
|
|
||||||
for (int i=0;i<c_keywords.size();i++) {
|
|
||||||
c_keyword_lookup[c_keywords[i]]=i;
|
|
||||||
}
|
|
||||||
for (int i=0;i<miscellaneous.size();i++) {
|
|
||||||
miscelaneous_lookup[miscellaneous[i]]=i;
|
|
||||||
}
|
|
||||||
for (int i=0;i<delimiter0.size();i++) {
|
|
||||||
delimiter0_lookup[delimiter0[i]]=i;
|
|
||||||
}
|
|
||||||
for (int i=0;i<delimiter1.size();i++) {
|
|
||||||
delimiter1_lookup[delimiter1[i]]=i;
|
|
||||||
}
|
|
||||||
if (argc<2) {
|
|
||||||
cout<<"Usage: ccc [FILES]"<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
size_t compression_ratio=6;
|
|
||||||
vector<string> files;
|
|
||||||
for (int i=1;i<argc;i++) {
|
|
||||||
string file=string(argv[i]);
|
|
||||||
if (file=="-W") {
|
|
||||||
fail_on_warning=true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (file=="-w") {
|
|
||||||
show_warning=true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (file.substr(0,2)=="-c" && file.size()==3) {
|
|
||||||
try {
|
|
||||||
compression_ratio=stoi(file.substr(2,1));
|
|
||||||
continue;
|
|
||||||
} catch (const exception& e) {
|
|
||||||
cout<<"Error: invalid argument: "<<file<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (file=="-f") {
|
|
||||||
enable_malloc_trim=false;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (file=="-h" || file=="--help") {
|
|
||||||
cout<<"C Code Compressor v0.1"<<endl;
|
|
||||||
cout<<"Usage: ccc [-hfwW] [FILES]"<<endl;
|
|
||||||
cout<<"Options:"<<endl;
|
|
||||||
cout<<" -h : show this help message"<<endl;
|
|
||||||
cout<<" -f : enable fast mode, reduce the total compression time but does not release unused"<<endl;
|
|
||||||
cout<<" unused heap memory back to the OS. Usage of this option can raise memory usage."<<endl;
|
|
||||||
cout<<" -w : show warning messages. For example, when a unknown or empty node is detected."<<endl;
|
|
||||||
cout<<" -W : crash on warning"<<endl;
|
|
||||||
cout<<" -c0..9: set the compression ratio for LZMA multithreaded compression phase. Default is 6."<<endl;
|
|
||||||
cout<<" Below level 6, CCC may not compress better than tar.xz."<<endl;
|
|
||||||
cout<<" Warning: setting this higher than -c6 will seriously raise memory usage."<<endl;
|
|
||||||
cout<<" For exemple, using -c9 more than double memory usage in comparison with"<<endl;
|
|
||||||
cout<<" -c6 (which is the default). "<<endl;
|
|
||||||
cout<<" Warning: usage of higher options than -c6 combined with -f is heavily not"<<endl;
|
|
||||||
cout<<" recommended."<<endl;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (!fs::exists(file)) {
|
|
||||||
cout<<"Error: file doesn't exist: "<<file<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
files.push_back(file);
|
|
||||||
}
|
|
||||||
for (int i=0;i<files.size();i++) {
|
|
||||||
ifstream file(files[i],ios::binary);
|
|
||||||
if (!file) {
|
|
||||||
cout<<"Error: couldn't open provided file."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
|
|
||||||
file_entry f{files[i],std::move(code),code.size()};
|
|
||||||
f.index=i;
|
|
||||||
rec_map_files_queue.push(std::move(f));
|
|
||||||
}
|
|
||||||
size_t nb_threads=thread::hardware_concurrency()/2;
|
|
||||||
size_t total_files=files.size();
|
|
||||||
size_t files_per_thread=(total_files+nb_threads-1)/nb_threads;
|
|
||||||
vector<future<thread_rec_map_result>> rec_map_futures;
|
|
||||||
for (size_t i=0;i<nb_threads;++i) {
|
|
||||||
rec_map_futures.push_back(async(launch::async,run_thread_rec_map,i+1));
|
|
||||||
}
|
|
||||||
vector<thread_rec_map_result> all_rec_map_results;
|
|
||||||
map<string,int> global_rec_map;
|
|
||||||
for (auto& fut:rec_map_futures) {
|
|
||||||
all_rec_map_results.push_back(fut.get());
|
|
||||||
for (auto const& [str,count]:all_rec_map_results.back().thread_local_rec_map) {
|
|
||||||
global_rec_map[str]+=count;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (auto const& [str,count]:global_rec_map) {
|
|
||||||
if (count>=2 && str.size()>=3) {
|
|
||||||
rec_list.push_back(str);
|
|
||||||
rec_lookup[str]=rec_list.size()-1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
global_rec_map.clear();
|
|
||||||
vector<file_entry> encoding_files_vec;
|
|
||||||
while (!encoding_files_queue.empty()) {
|
|
||||||
encoding_files_vec.push_back(std::move(encoding_files_queue.front()));
|
|
||||||
encoding_files_queue.pop();
|
|
||||||
}
|
|
||||||
sort(encoding_files_vec.begin(),encoding_files_vec.end(),[](const file_entry& a,const file_entry& b) {
|
|
||||||
return a.size>b.size;
|
|
||||||
});
|
|
||||||
for (auto& f:encoding_files_vec) {
|
|
||||||
encoding_files_queue.push(std::move(f));
|
|
||||||
}
|
|
||||||
vector<future<thread_encoding_result>> encoding_futures;
|
|
||||||
for (size_t i=0;i<all_rec_map_results.size();++i) {
|
|
||||||
encoding_futures.push_back(async(launch::async,run_thread_encoding,i+1));
|
|
||||||
}
|
|
||||||
all_rec_map_results.clear();
|
|
||||||
vector<thread_encoding_result> all_encoding_results;
|
|
||||||
for (auto& fut:encoding_futures) {
|
|
||||||
all_encoding_results.push_back(fut.get());
|
|
||||||
}
|
|
||||||
vector<bit_streamer> globals_bit_stream;
|
|
||||||
for (auto& res:all_encoding_results) {
|
|
||||||
globals_bit_stream.insert(globals_bit_stream.end(),res.encoded_files.begin(),res.encoded_files.end());
|
|
||||||
}
|
|
||||||
sort(globals_bit_stream.begin(),globals_bit_stream.end(),[](const bit_streamer& a,const bit_streamer& b) {
|
|
||||||
return a.index<b.index;
|
|
||||||
});
|
|
||||||
vector<unsigned char> final_payloads;
|
|
||||||
vector<size_t> global_payloads_start;
|
|
||||||
size_t total_size2=0;
|
|
||||||
for(auto& bstr:globals_bit_stream) total_size2+=bstr.get_size();
|
|
||||||
final_payloads.reserve(total_size2);
|
|
||||||
size_t current_offset=0;
|
|
||||||
for (auto& bstr:globals_bit_stream) {
|
|
||||||
global_payloads_start.push_back(current_offset);
|
|
||||||
auto encoded_file=std::move(bstr.extract_buffer());
|
|
||||||
final_payloads.insert(final_payloads.end(),encoded_file.begin(),encoded_file.end());
|
|
||||||
current_offset+=encoded_file.size();
|
|
||||||
}
|
|
||||||
vector<unsigned char> payload_compressed;
|
|
||||||
payload_compressed.resize(final_payloads.size()+final_payloads.size()/3+128);
|
|
||||||
lzma_mt mt_options={};
|
|
||||||
mt_options.flags=0;
|
|
||||||
mt_options.threads=thread::hardware_concurrency()/4;
|
|
||||||
mt_options.block_size=max((size_t)8*1024*1024,final_payloads.size()/mt_options.threads);
|
|
||||||
mt_options.timeout=0;
|
|
||||||
mt_options.filters=nullptr;
|
|
||||||
mt_options.check=LZMA_CHECK_CRC64;
|
|
||||||
lzma_options_lzma opt_lzma;
|
|
||||||
if (lzma_lzma_preset(&opt_lzma,compression_ratio)) {
|
|
||||||
cout<<"Error: couldn't initialize LZMA compressor for files archive."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
lzma_filter filters[2];
|
|
||||||
filters[0].id=LZMA_FILTER_LZMA2;
|
|
||||||
filters[0].options=&opt_lzma;
|
|
||||||
filters[1].id=LZMA_VLI_UNKNOWN;
|
|
||||||
mt_options.filters=filters;
|
|
||||||
lzma_stream strm=LZMA_STREAM_INIT;
|
|
||||||
auto ret=lzma_stream_encoder_mt(&strm,&mt_options);
|
|
||||||
if (ret!=LZMA_OK) {
|
|
||||||
cout<<"Error: couldn't initialize MT compressor for files archives."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
strm.next_in=final_payloads.data();
|
|
||||||
strm.avail_in=final_payloads.size();
|
|
||||||
strm.next_out=payload_compressed.data();
|
|
||||||
strm.avail_out=payload_compressed.size();
|
|
||||||
auto start=chrono::high_resolution_clock::now();
|
|
||||||
ret=lzma_code(&strm,LZMA_FINISH);
|
|
||||||
auto end=chrono::high_resolution_clock::now();
|
|
||||||
auto ns=chrono::duration_cast<chrono::nanoseconds>(end-start).count();
|
|
||||||
if (ret!=LZMA_STREAM_END) {
|
|
||||||
cout<<"Error: couldn't compress files archive."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
cout<<"Compressed payloads."<<endl;
|
|
||||||
size_t payload_total_size;
|
|
||||||
size_t compressed_size=payload_compressed.size()-strm.avail_out;
|
|
||||||
payload_compressed.resize(compressed_size);
|
|
||||||
size_t original_size=final_payloads.size();
|
|
||||||
uint8_t flags=0;
|
|
||||||
if (compressed_size>=original_size) {
|
|
||||||
flags&= ~(0b00000001);
|
|
||||||
payload_total_size=original_size;
|
|
||||||
} else {
|
|
||||||
flags|=0b00000001;
|
|
||||||
payload_total_size=compressed_size;
|
|
||||||
}
|
|
||||||
vector<unsigned char> rec_table;
|
|
||||||
for (int i=0;i<rec_list.size();i++) {
|
|
||||||
for (auto c:rec_list[i]) {
|
|
||||||
rec_table.push_back(c);
|
|
||||||
}
|
|
||||||
rec_table.push_back('\0');
|
|
||||||
}
|
|
||||||
vector<unsigned char> rec_table_compressed;
|
|
||||||
rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128);
|
|
||||||
strm=LZMA_STREAM_INIT;
|
|
||||||
if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
|
|
||||||
cout<<"Error: couldn't initialize LZMA compressor for reccurences table."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
strm.next_in=rec_table.data();
|
|
||||||
strm.avail_in=rec_table.size();
|
|
||||||
strm.next_out=rec_table_compressed.data();
|
|
||||||
strm.avail_out=rec_table_compressed.size();
|
|
||||||
ret=lzma_code(&strm,LZMA_FINISH);
|
|
||||||
if (ret!=LZMA_STREAM_END) {
|
|
||||||
cout<<"Error: couldn't compress reccurences table."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
cout<<"Compressed reccurences table."<<endl;
|
|
||||||
size_t rec_table_total_size;
|
|
||||||
compressed_size=rec_table_compressed.size()-strm.avail_out;
|
|
||||||
rec_table_compressed.resize(compressed_size);
|
|
||||||
original_size=rec_table.size();
|
|
||||||
lzma_end(&strm);
|
|
||||||
if (compressed_size>=original_size) {
|
|
||||||
flags&= ~(0b00000010);
|
|
||||||
rec_table_total_size=original_size;
|
|
||||||
} else {
|
|
||||||
flags|=0b00000010;
|
|
||||||
rec_table_total_size=compressed_size;
|
|
||||||
}
|
|
||||||
vector<unsigned char> files_table;
|
|
||||||
for (int i=0;i<files.size();i++) {
|
|
||||||
for (auto c:files[i]) {
|
|
||||||
files_table.push_back(c);
|
|
||||||
}
|
|
||||||
files_table.push_back('\0');
|
|
||||||
auto file_start=global_payloads_start[i];
|
|
||||||
for (int i=0;i<sizeof(size_t);++i) {
|
|
||||||
files_table.push_back(((uint8_t*)&file_start)[i]);
|
|
||||||
}
|
|
||||||
size_t file_size;
|
|
||||||
if (i==files.size()-1) {
|
|
||||||
file_size=final_payloads.size()-global_payloads_start[i];
|
|
||||||
} else {
|
|
||||||
file_size=global_payloads_start[i+1]-global_payloads_start[i];
|
|
||||||
}
|
|
||||||
for (int i=0;i<sizeof(size_t);++i) {
|
|
||||||
files_table.push_back(((uint8_t*)&file_size)[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
vector<unsigned char> files_table_compressed;
|
|
||||||
files_table_compressed.resize(files_table.size()+files_table.size()/3+128);
|
|
||||||
strm=LZMA_STREAM_INIT;
|
|
||||||
if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
|
|
||||||
cout<<"Error: couldn't initialize LZMA compressor for files table."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
strm.next_in=files_table.data();
|
|
||||||
strm.avail_in=files_table.size();
|
|
||||||
strm.next_out=files_table_compressed.data();
|
|
||||||
strm.avail_out=files_table_compressed.size();
|
|
||||||
ret=lzma_code(&strm,LZMA_FINISH);
|
|
||||||
if (ret!=LZMA_STREAM_END) {
|
|
||||||
cout<<"Error: couldn't compress files table."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
cout<<"Compressed files table."<<endl;
|
|
||||||
size_t files_table_total_size;
|
|
||||||
compressed_size=files_table_compressed.size()-strm.avail_out;
|
|
||||||
files_table_compressed.resize(compressed_size);
|
|
||||||
original_size=files_table.size();
|
|
||||||
lzma_end(&strm);
|
|
||||||
if (compressed_size>=original_size) {
|
|
||||||
flags&= ~(0b00000100);
|
|
||||||
files_table_total_size=original_size;
|
|
||||||
} else {
|
|
||||||
flags|=0b00000100;
|
|
||||||
files_table_total_size=compressed_size;
|
|
||||||
}
|
|
||||||
header head;
|
|
||||||
head.sig[0]='C';
|
|
||||||
head.sig[1]='C';
|
|
||||||
head.sig[2]='C';
|
|
||||||
head.flags=flags;
|
|
||||||
head.size_payload=payload_total_size;
|
|
||||||
head.size_rec_table=rec_table_total_size;
|
|
||||||
head.entry_count=files.size();
|
|
||||||
vector<unsigned char> out;
|
|
||||||
for (int i=0;i<sizeof(header);i++) {
|
|
||||||
out.push_back(((uint8_t*)&head)[i]);
|
|
||||||
}
|
|
||||||
if (flags & 0b00000010) {
|
|
||||||
CCC_ADD_COMPONENT(out,rec_table_compressed);
|
|
||||||
} else {
|
|
||||||
CCC_ADD_COMPONENT(out,rec_table);
|
|
||||||
}
|
|
||||||
if (flags & 0b00000100) {
|
|
||||||
CCC_ADD_COMPONENT(out,files_table_compressed);
|
|
||||||
} else {
|
|
||||||
CCC_ADD_COMPONENT(out,files_table);
|
|
||||||
}
|
|
||||||
if (flags & 0b00000001) {
|
|
||||||
CCC_ADD_COMPONENT(out,payload_compressed);
|
|
||||||
} else {
|
|
||||||
CCC_ADD_COMPONENT(out,final_payloads);
|
|
||||||
}
|
|
||||||
ofstream fileout("test.ccc",ios::binary);
|
|
||||||
if (!fileout) {
|
|
||||||
cout<<"Error: couldn't open output file."<<endl;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
fileout.write(reinterpret_cast<const char*>(out.data()),out.size());
|
|
||||||
fileout.close();
|
|
||||||
cout<<"Finished !"<<endl;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
Binary file not shown.
5
test.py
5
test.py
@@ -8,7 +8,7 @@ def get_source_files(root_dir):
|
|||||||
total_size = 0
|
total_size = 0
|
||||||
for root, _, files in os.walk(root_dir):
|
for root, _, files in os.walk(root_dir):
|
||||||
for file in files:
|
for file in files:
|
||||||
if file.endswith(('.c', '.h')) and len(source_files)<20000:
|
if file.endswith(('.c', '.h')) and len(source_files)<30000:
|
||||||
path = os.path.join(root, file)
|
path = os.path.join(root, file)
|
||||||
source_files.append(path)
|
source_files.append(path)
|
||||||
total_size += os.path.getsize(path)
|
total_size += os.path.getsize(path)
|
||||||
@@ -34,6 +34,7 @@ def main():
|
|||||||
process_tar.communicate(input="\n".join(files).encode())
|
process_tar.communicate(input="\n".join(files).encode())
|
||||||
end_tar = time.time()
|
end_tar = time.time()
|
||||||
|
|
||||||
|
|
||||||
# 2. Compression avec CCC
|
# 2. Compression avec CCC
|
||||||
print("\n--- Lancement de CCC (Output temps réel) ---")
|
print("\n--- Lancement de CCC (Output temps réel) ---")
|
||||||
print("-" * 40)
|
print("-" * 40)
|
||||||
@@ -54,7 +55,7 @@ def main():
|
|||||||
print(f" RÉSULTATS (Source: {raw_mo:.2f} Mo)")
|
print(f" RÉSULTATS (Source: {raw_mo:.2f} Mo)")
|
||||||
print("="*40)
|
print("="*40)
|
||||||
|
|
||||||
for name, filename in [("TAR.XZ", "linux_sources.tar.xz"), ("CCC", "test.ccc")]:
|
for name, filename in [("TAR.XZ", "linux_sources.tar.xz"), ("CCC", "test.ccc"), ("TAR.ZXC", "linux_sources.tar.zxc")]:
|
||||||
if os.path.exists(filename):
|
if os.path.exists(filename):
|
||||||
size_mo = os.path.getsize(filename) / (1024 * 1024)
|
size_mo = os.path.getsize(filename) / (1024 * 1024)
|
||||||
ratio = (size_mo / raw_mo) * 100
|
ratio = (size_mo / raw_mo) * 100
|
||||||
|
|||||||
Reference in New Issue
Block a user