ccc f
This commit is contained in:
106
ccc.cpp
106
ccc.cpp
@@ -20,11 +20,14 @@
|
||||
#include <future>
|
||||
#include <queue>
|
||||
#include <chrono>
|
||||
#include <atomic>
|
||||
#include <tree_sitter/api.h>
|
||||
#include <tree_sitter/tree-sitter-c.h>
|
||||
#include <lzma.h>
|
||||
#include <xxh3.h>
|
||||
#include <malloc.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
using namespace std;
|
||||
namespace fs=filesystem;
|
||||
const uint64_t CCC_DELIMITER_0_HEAD=0b0;
|
||||
@@ -44,6 +47,11 @@ struct XXH3HasherString {
|
||||
return static_cast<size_t>(XXH3_64bits(s.data(),s.size()));
|
||||
}
|
||||
};
|
||||
/// Returns the width, in columns, of the terminal attached to stdout.
///
/// Falls back to 80 columns when the TIOCGWINSZ query fails (for example when
/// stdout is redirected to a file or a pipe) or when it reports zero columns.
size_t get_terminal_width() {
    constexpr size_t fallback_columns=80;
    // Zero-initialise so no indeterminate value is ever read from the struct.
    struct winsize w{};
    if (ioctl(STDOUT_FILENO,TIOCGWINSZ,&w)!=0) {
        return fallback_columns;
    }
    return w.ws_col?w.ws_col:fallback_columns;
}
|
||||
class bit_streamer {
|
||||
private:
|
||||
vector<uint8_t> out;
|
||||
@@ -226,11 +234,13 @@ struct header {
|
||||
size_t size_payload;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
#pragma pack(push,1)
/// Compact record describing one syntax-tree node.
///
/// Declared under 1-byte packing so the three 32-bit fields are laid out
/// back to back with no padding.
struct node {
    uint32_t type;   // numeric node-type identifier
    uint32_t start;  // start position of the node (presumably a byte offset into the source — confirm)
    uint32_t end;    // end position of the node (presumably a byte offset into the source — confirm)
};
static_assert(sizeof(node)==3*sizeof(uint32_t),"node must stay tightly packed");
#pragma pack(pop)
|
||||
struct file_entry {
|
||||
string name;
|
||||
string content;
|
||||
@@ -240,10 +250,10 @@ struct file_entry {
|
||||
struct thread_iterate_input_loop_call {
|
||||
string &source_code;
|
||||
vector<node> &thread_local_node_list;
|
||||
map<string,int>& thread_local_rec_map;
|
||||
unordered_map<string,int,XXH3HasherString>& thread_local_rec_map;
|
||||
};
|
||||
struct thread_rec_map_result {
|
||||
map<string,int> thread_local_rec_map;
|
||||
unordered_map<string,int,XXH3HasherString> thread_local_rec_map;
|
||||
};
|
||||
struct thread_encoding_input_loop_call {
|
||||
string &source_code;
|
||||
@@ -253,6 +263,13 @@ struct thread_encoding_input_loop_call {
|
||||
struct thread_encoding_result {
|
||||
vector<bit_streamer> encoded_files;
|
||||
};
|
||||
// Progress state shared between the worker threads, main() and ui_thread().
atomic<size_t> parsed_files{0};       // incremented by workers after each parsed file
atomic<size_t> encoded_files{0};      // incremented by workers after each encoded file
size_t total_files=0;                 // not atomic: main() sets it before starting the UI thread
atomic<bool> parsing_done{false};     // set by main() to end the "Parsing" progress phase
atomic<bool> encoding_started{false}; // set by ui_thread() when it enters the "Encoding" phase
atomic<bool> encoding_done{false};    // set by main() to end the "Encoding" progress phase
mutex ui_mutex;                       // held by ui_thread() for as long as it is drawing
|
||||
queue<file_entry> rec_map_files_queue;
|
||||
mutex rec_map_queue_mutex;
|
||||
queue<file_entry> encoding_files_queue;
|
||||
@@ -271,6 +288,35 @@ bool show_warning=false;
|
||||
bool fail_on_warning=false;
|
||||
bool enable_malloc_trim=true;
|
||||
mutex type_alloc;
|
||||
void print_progress_line(const string& label,size_t done,size_t total) {
|
||||
size_t width=get_terminal_width();
|
||||
size_t percent=(total==0)?0:(done*100/total);
|
||||
string prefix=label+" "+to_string(done)+"/"+to_string(total)+" files";
|
||||
size_t bar_width=width-prefix.size()-10;
|
||||
if (bar_width>width) bar_width=10;
|
||||
size_t filled=(total==0)?0:(done*bar_width/total);
|
||||
string bar="[";
|
||||
for (size_t i=0;i<bar_width;i++) {
|
||||
bar+=(i<filled?'=':' ');
|
||||
}
|
||||
bar+="]";
|
||||
cout<<"\r"<<prefix<<" "<<bar<<flush;
|
||||
}
|
||||
void ui_thread() {
|
||||
lock_guard<mutex> lock(ui_mutex);
|
||||
while (!parsing_done) {
|
||||
print_progress_line("Parsing",parsed_files.load(),total_files);
|
||||
this_thread::sleep_for(chrono::milliseconds(50));
|
||||
}
|
||||
cout<<"\n";
|
||||
encoding_started=true;
|
||||
while (!encoding_done) {
|
||||
print_progress_line("Encoding",encoded_files.load(),total_files);
|
||||
this_thread::sleep_for(chrono::milliseconds(50));
|
||||
}
|
||||
cout<<"\n";
|
||||
return;
|
||||
}
|
||||
uint32_t get_id(const string& type) {
|
||||
{
|
||||
auto it=type_to_id.find(type);
|
||||
@@ -311,7 +357,6 @@ void iterate_all_nodes_loop_call(thread_iterate_input_loop_call &settings,TSNode
|
||||
}
|
||||
}
|
||||
thread_rec_map_result run_thread_rec_map(size_t thread_num) {
|
||||
auto start=chrono::high_resolution_clock::now();
|
||||
thread_rec_map_result res;
|
||||
TSParser *parser=ts_parser_new();
|
||||
ts_parser_set_language(parser,tree_sitter_c());
|
||||
@@ -344,11 +389,9 @@ thread_rec_map_result run_thread_rec_map(size_t thread_num) {
|
||||
encoding_files_queue.push(std::move(f));
|
||||
}
|
||||
if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
|
||||
parsed_files++;
|
||||
}
|
||||
ts_parser_delete(parser);
|
||||
auto end=chrono::high_resolution_clock::now();
|
||||
auto ms=chrono::duration_cast<chrono::milliseconds>(end-start).count();
|
||||
cout<<"Recccurences map thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<endl;
|
||||
return res;
|
||||
}
|
||||
void generate_c_keyword(bit_streamer& bitstream,size_t index) {
|
||||
@@ -358,17 +401,31 @@ void generate_c_keyword(bit_streamer& bitstream,size_t index) {
|
||||
bitstream.align();
|
||||
return;
|
||||
}
|
||||
/// Writes a reference to entry `index` of the recurrence table.
///
/// Emitted layout, between two align() calls:
///   - 4 bits: CCC_REC_TABLE_REF_HEAD
///   - k bits set to 1, followed by a single 0 bit (length prefix)
///   - 3*(k+1) bits: `index`
/// where k is the number of 3-bit right shifts needed to bring `index+1`
/// below 8. For index 0 this gives k==0, i.e. a 0 bit followed by a 3-bit
/// zero payload, so no separate special case is needed.
///
/// @param bitstream  Destination bit stream.
/// @param index      Position of the entry in the recurrence table.
void generate_rec(bit_streamer& bitstream,size_t index) {
    const size_t group_limit=size_t{1}<<3;

    // Count how many extra 3-bit groups the payload needs.
    size_t extra_groups=0;
    for (size_t rest=index+1;rest>=group_limit;rest>>=3) {
        ++extra_groups;
    }
    const size_t payload_bits=3*(extra_groups+1);

    bitstream.align();
    bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
    for (size_t i=0;i<extra_groups;++i) {
        bitstream.write_bits(1,1);
    }
    bitstream.write_bits(0,1);  // terminates the length prefix
    bitstream.write_bits(index,payload_bits);
    bitstream.align();
}
|
||||
void generate_delimiter0(bit_streamer& bitstream,size_t index) {
|
||||
bitstream.align();
|
||||
@@ -422,7 +479,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
||||
generate_string_content(out,text.data(),text.size());
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
generate_rec(out,index,rec_list.size());
|
||||
generate_rec(out,index);
|
||||
}
|
||||
} else if (type==ID_PRIMITIVE_TYPE || type==ID_TYPE_IDENTIFIER) {
|
||||
auto it=c_keyword_lookup.find(string(text));
|
||||
@@ -440,7 +497,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
||||
}
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
generate_rec(out,index,rec_list.size());
|
||||
generate_rec(out,index);
|
||||
}
|
||||
}
|
||||
} else if (delimiter0_lookup.find(id_to_type[type])!=delimiter0_lookup.end() || delimiter1_lookup.find(id_to_type[type])!=delimiter1_lookup.end() || type==ID_QUOTE) {
|
||||
@@ -542,7 +599,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
||||
}
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
generate_rec(out,index,rec_list.size());
|
||||
generate_rec(out,index);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -566,7 +623,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
||||
}
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
generate_rec(out,index,rec_list.size());
|
||||
generate_rec(out,index);
|
||||
}
|
||||
} else {
|
||||
auto it=rec_lookup.find(id_to_type[type]);
|
||||
@@ -579,7 +636,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
||||
}
|
||||
} else {
|
||||
size_t index=it->second;
|
||||
generate_rec(out,index,rec_list.size());
|
||||
generate_rec(out,index);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -587,11 +644,9 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
|
||||
return;
|
||||
}
|
||||
thread_encoding_result run_thread_encoding(size_t thread_num) {
|
||||
auto start=chrono::high_resolution_clock::now();
|
||||
thread_encoding_result res;
|
||||
vector<bit_streamer> thread_local_encoded_files;
|
||||
int counter=0;
|
||||
int max=0;
|
||||
while (true) {
|
||||
file_entry f;
|
||||
{
|
||||
@@ -611,11 +666,9 @@ thread_encoding_result run_thread_encoding(size_t thread_num) {
|
||||
vector<node>().swap(encoding_loop_settings.node_list);
|
||||
string().swap(f.content);
|
||||
if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
|
||||
encoded_files++;
|
||||
}
|
||||
res.encoded_files=std::move(thread_local_encoded_files);
|
||||
auto end=chrono::high_resolution_clock::now();
|
||||
auto ms=chrono::duration_cast<chrono::milliseconds>(end-start).count();
|
||||
cout<<"Parsing/encoding thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<endl;
|
||||
return res;
|
||||
}
|
||||
int main(int argc,char **argv) {
|
||||
@@ -716,7 +769,8 @@ int main(int argc,char **argv) {
|
||||
rec_map_files_queue.push(std::move(f));
|
||||
}
|
||||
size_t nb_threads=thread::hardware_concurrency();
|
||||
size_t total_files=files.size();
|
||||
total_files=files.size();
|
||||
thread ui(ui_thread);
|
||||
vector<future<thread_rec_map_result>> rec_map_futures;
|
||||
for (size_t i=0;i<nb_threads;++i) {
|
||||
rec_map_futures.push_back(async(launch::async,run_thread_rec_map,i+1));
|
||||
@@ -729,6 +783,7 @@ int main(int argc,char **argv) {
|
||||
global_rec_map[str]+=count;
|
||||
}
|
||||
}
|
||||
parsing_done=true;
|
||||
for (auto const& [str,count]:global_rec_map) {
|
||||
if (count>=2 && str.size()>=3) {
|
||||
rec_list.push_back(str);
|
||||
@@ -756,6 +811,7 @@ int main(int argc,char **argv) {
|
||||
for (auto& fut:encoding_futures) {
|
||||
all_encoding_results.push_back(fut.get());
|
||||
}
|
||||
encoding_done=true;
|
||||
vector<bit_streamer> globals_bit_stream;
|
||||
for (auto& res:all_encoding_results) {
|
||||
globals_bit_stream.insert(globals_bit_stream.end(),res.encoded_files.begin(),res.encoded_files.end());
|
||||
@@ -775,6 +831,8 @@ int main(int argc,char **argv) {
|
||||
final_payloads.insert(final_payloads.end(),encoded_file.begin(),encoded_file.end());
|
||||
current_offset+=encoded_file.size();
|
||||
}
|
||||
ui.join();
|
||||
lock_guard<mutex> lock(ui_mutex);
|
||||
|
||||
//
|
||||
// Payload compression
|
||||
|
||||
Reference in New Issue
Block a user