This commit is contained in:
2026-05-17 22:40:37 +02:00
parent 3dc7375f89
commit a7ec741963
5 changed files with 87 additions and 976 deletions

106
ccc.cpp
View File

@@ -20,11 +20,14 @@
#include <future>
#include <queue>
#include <chrono>
#include <atomic>
#include <tree_sitter/api.h>
#include <tree_sitter/tree-sitter-c.h>
#include <lzma.h>
#include <xxh3.h>
#include <malloc.h>
#include <sys/ioctl.h>
#include <unistd.h>
using namespace std;
namespace fs=filesystem;
const uint64_t CCC_DELIMITER_0_HEAD=0b0;
@@ -44,6 +47,11 @@ struct XXH3HasherString {
return static_cast<size_t>(XXH3_64bits(s.data(),s.size()));
}
};
// Returns the number of columns of the terminal attached to stdout.
// Falls back to 80 when the width cannot be determined (stdout is not a
// terminal, or the terminal reports a width of 0).
size_t get_terminal_width() {
    // Zero-initialised so ws_col is well defined even if ioctl() leaves it untouched.
    struct winsize w{};
    // TIOCGWINSZ fails with -1 when stdout is piped or redirected; the struct
    // must not be trusted in that case.
    if (ioctl(STDOUT_FILENO,TIOCGWINSZ,&w)==-1 || w.ws_col==0) return 80;
    return w.ws_col;
}
class bit_streamer {
private:
vector<uint8_t> out;
@@ -226,11 +234,13 @@ struct header {
size_t size_payload;
};
#pragma pack(pop)
// Fixed-size record stored in the per-file vector<node> lists.
// Packed to 1-byte alignment; with three uint32_t members the record is 12 bytes.
#pragma pack(push,1)
struct node {
// Numeric node-type id — presumably a value handed out by get_id(); TODO confirm.
uint32_t type;
// Start and end of the node's span — presumably byte offsets into the file's
// source text; TODO confirm against the code that fills the list.
uint32_t start;
uint32_t end;
};
#pragma pack(pop)
struct file_entry {
string name;
string content;
@@ -240,10 +250,10 @@ struct file_entry {
// Bundle of references passed as `settings` to iterate_all_nodes_loop_call().
// Every member is a reference, so the referenced objects must outlive the call.
struct thread_iterate_input_loop_call {
// Text of the file being processed.
string &source_code;
// Node list owned by the calling worker thread.
vector<node> &thread_local_node_list;
// String -> count map owned by the calling worker thread.
// NOTE(review): two declarations of thread_local_rec_map are shown here (ordered
// map and XXH3-hashed unordered_map); only one can exist in a compilable struct —
// presumably the std::map line is the pre-change version. Confirm.
map<string,int>& thread_local_rec_map;
unordered_map<string,int,XXH3HasherString>& thread_local_rec_map;
};
// Value returned by run_thread_rec_map(): the string -> count map built by one worker.
struct thread_rec_map_result {
// NOTE(review): two declarations of thread_local_rec_map are shown here (ordered
// map and XXH3-hashed unordered_map); only one can exist in a compilable struct —
// presumably the std::map line is the pre-change version. Confirm.
map<string,int> thread_local_rec_map;
unordered_map<string,int,XXH3HasherString> thread_local_rec_map;
};
struct thread_encoding_input_loop_call {
string &source_code;
@@ -253,6 +263,13 @@ struct thread_encoding_input_loop_call {
// Value returned by run_thread_encoding(): the bit streams produced by one worker.
struct thread_encoding_result {
// One bit_streamer per file encoded by the worker; main() appends these to a
// single combined vector after all workers have finished.
vector<bit_streamer> encoded_files;
};
// --- Shared progress state, read by ui_thread() while the workers run ---
// Incremented once per file by run_thread_rec_map().
atomic<size_t> parsed_files=0;
// Incremented once per file by run_thread_encoding().
atomic<size_t> encoded_files=0;
// Set by main() from files.size() before the UI thread is started; not atomic.
size_t total_files=0;
// Set by main() once the per-thread maps have been merged; ends the "Parsing" bar.
atomic<bool> parsing_done=false;
// Set by ui_thread() when it switches to the "Encoding" bar.
atomic<bool> encoding_started=false;
// Set by main() once every encoding future has been collected; ends the "Encoding" bar.
atomic<bool> encoding_done=false;
// Held by ui_thread() for its whole run; main() locks it after joining the UI thread.
mutex ui_mutex;
// Files waiting for the first pass; filled by main().
queue<file_entry> rec_map_files_queue;
// NOTE(review): presumably guards rec_map_files_queue — the locking site is not visible here.
mutex rec_map_queue_mutex;
// Files waiting for the encoding pass; filled by run_thread_rec_map().
queue<file_entry> encoding_files_queue;
@@ -271,6 +288,35 @@ bool show_warning=false;
// NOTE(review): presumably makes warnings fatal; the code that reads it is outside this view.
bool fail_on_warning=false;
// When true, the worker loops call malloc_trim(0) after every 20th file.
bool enable_malloc_trim=true;
// NOTE(review): presumably serialises type-id allocation in get_id(); usage not visible here.
mutex type_alloc;
// Redraws the one-line progress display on stdout:
//   "<label> <done>/<total> files [=====     ]"
// label - phase name shown at the start of the line.
// done  - number of files finished so far.
// total - number of files in the phase; 0 produces an empty bar.
// The line begins with '\r' and has no trailing newline, so successive calls
// overwrite each other; the stream is flushed on every call.
void print_progress_line(const string& label,size_t done,size_t total) {
    const size_t width=get_terminal_width();
    const string prefix=label+" "+to_string(done)+"/"+to_string(total)+" files";
    // The bar takes what is left after the prefix plus a 10-column reserve.
    // If the terminal is too narrow for that, use a fixed 10-column bar
    // instead of letting the unsigned subtraction wrap around.
    const size_t reserved=prefix.size()+10;
    const size_t bar_width=(width>=reserved)?width-reserved:10;
    size_t filled=(total==0)?0:(done*bar_width/total);
    // done may exceed total; never draw more than a full bar.
    if (filled>bar_width) filled=bar_width;
    string bar="[";
    bar.append(filled,'=');
    bar.append(bar_width-filled,' ');
    bar+="]";
    cout<<"\r"<<prefix<<" "<<bar<<flush;
}
// Body of the progress-display thread.
// Draws the "Parsing" bar until parsing_done is set, then the "Encoding" bar
// until encoding_done is set, polling the shared counters every 50 ms.
// ui_mutex is held for the whole run.
void ui_thread() {
    lock_guard<mutex> lock(ui_mutex);
    while (!parsing_done) {
        print_progress_line("Parsing",parsed_files.load(),total_files);
        this_thread::sleep_for(chrono::milliseconds(50));
    }
    // The flag can flip up to one poll interval after the last redraw (or before
    // the first one); draw once more so the line left on screen shows the final count.
    print_progress_line("Parsing",parsed_files.load(),total_files);
    cout<<"\n";
    encoding_started=true;
    while (!encoding_done) {
        print_progress_line("Encoding",encoded_files.load(),total_files);
        this_thread::sleep_for(chrono::milliseconds(50));
    }
    // Same final redraw for the encoding phase.
    print_progress_line("Encoding",encoded_files.load(),total_files);
    cout<<"\n";
}
uint32_t get_id(const string& type) {
{
auto it=type_to_id.find(type);
@@ -311,7 +357,6 @@ void iterate_all_nodes_loop_call(thread_iterate_input_loop_call &settings,TSNode
}
}
thread_rec_map_result run_thread_rec_map(size_t thread_num) {
auto start=chrono::high_resolution_clock::now();
thread_rec_map_result res;
TSParser *parser=ts_parser_new();
ts_parser_set_language(parser,tree_sitter_c());
@@ -344,11 +389,9 @@ thread_rec_map_result run_thread_rec_map(size_t thread_num) {
encoding_files_queue.push(std::move(f));
}
if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
parsed_files++;
}
ts_parser_delete(parser);
auto end=chrono::high_resolution_clock::now();
auto ms=chrono::duration_cast<chrono::milliseconds>(end-start).count();
cout<<"Recccurences map thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<endl;
return res;
}
void generate_c_keyword(bit_streamer& bitstream,size_t index) {
@@ -358,17 +401,31 @@ void generate_c_keyword(bit_streamer& bitstream,size_t index) {
bitstream.align();
return;
}
void generate_rec(bit_streamer& bitstream,size_t index,size_t total_recs) {
size_t bits=0;
while (total_recs) {
total_recs>>=1;
++bits;
// Writes a reference to entry `index` of the recurrence table into `bitstream`.
// Layout, framed by align() calls on both sides:
//   4 bits  CCC_REC_TABLE_REF_HEAD
//   k bits  of 1, followed by a single 0 bit (length prefix)
//   3*(k+1) bits holding `index`
// where k is the number of 3-bit right shifts needed to bring index+1 below 8.
void generate_rec(bit_streamer& bitstream,size_t index) {
// index 0 is handled up front; the bits written are the same as the general
// path would produce for k==0 (no 1-bits, one 0-bit, 3 payload bits).
if (index==0) {
bitstream.align();
bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
bitstream.write_bits(0,1);
bitstream.write_bits(index,3);
bitstream.align();
return;
}
// The width is derived from index+1, not index, so e.g. index 7 already gets
// k==1 (6 payload bits) although 7 itself fits in 3 bits.
size_t tmp=index+1;
size_t k=0;
size_t threshold=1ULL<<3;
while (tmp>=threshold) {
tmp>>=3;
++k;
}
size_t payload_bits=3*(k+1);
bitstream.align();
bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
// NOTE(review): `bits` is not declared in this two-argument version (it belongs to
// the three-argument variant shown just above); this line looks like a leftover of
// the pre-change body — confirm it is absent from the compiled file.
bitstream.write_bits(index,bits);
// Length prefix: k one-bits terminated by a zero bit.
for (size_t i=0;i<k;++i) {
bitstream.write_bits(1,1);
}
bitstream.write_bits(0,1);
bitstream.write_bits(index,payload_bits);
bitstream.align();
return;
}
void generate_delimiter0(bit_streamer& bitstream,size_t index) {
bitstream.align();
@@ -422,7 +479,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
generate_string_content(out,text.data(),text.size());
} else {
size_t index=it->second;
generate_rec(out,index,rec_list.size());
generate_rec(out,index);
}
} else if (type==ID_PRIMITIVE_TYPE || type==ID_TYPE_IDENTIFIER) {
auto it=c_keyword_lookup.find(string(text));
@@ -440,7 +497,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
}
} else {
size_t index=it->second;
generate_rec(out,index,rec_list.size());
generate_rec(out,index);
}
}
} else if (delimiter0_lookup.find(id_to_type[type])!=delimiter0_lookup.end() || delimiter1_lookup.find(id_to_type[type])!=delimiter1_lookup.end() || type==ID_QUOTE) {
@@ -542,7 +599,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
}
} else {
size_t index=it->second;
generate_rec(out,index,rec_list.size());
generate_rec(out,index);
}
}
}
@@ -566,7 +623,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
}
} else {
size_t index=it->second;
generate_rec(out,index,rec_list.size());
generate_rec(out,index);
}
} else {
auto it=rec_lookup.find(id_to_type[type]);
@@ -579,7 +636,7 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
}
} else {
size_t index=it->second;
generate_rec(out,index,rec_list.size());
generate_rec(out,index);
}
}
}
@@ -587,11 +644,9 @@ void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
return;
}
thread_encoding_result run_thread_encoding(size_t thread_num) {
auto start=chrono::high_resolution_clock::now();
thread_encoding_result res;
vector<bit_streamer> thread_local_encoded_files;
int counter=0;
int max=0;
while (true) {
file_entry f;
{
@@ -611,11 +666,9 @@ thread_encoding_result run_thread_encoding(size_t thread_num) {
vector<node>().swap(encoding_loop_settings.node_list);
string().swap(f.content);
if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
encoded_files++;
}
res.encoded_files=std::move(thread_local_encoded_files);
auto end=chrono::high_resolution_clock::now();
auto ms=chrono::duration_cast<chrono::milliseconds>(end-start).count();
cout<<"Parsing/encoding thread number "<<thread_num<<" finished succesfully on "<<ms<<" milliseconds."<<endl;
return res;
}
int main(int argc,char **argv) {
@@ -716,7 +769,8 @@ int main(int argc,char **argv) {
rec_map_files_queue.push(std::move(f));
}
size_t nb_threads=thread::hardware_concurrency();
size_t total_files=files.size();
total_files=files.size();
thread ui(ui_thread);
vector<future<thread_rec_map_result>> rec_map_futures;
for (size_t i=0;i<nb_threads;++i) {
rec_map_futures.push_back(async(launch::async,run_thread_rec_map,i+1));
@@ -729,6 +783,7 @@ int main(int argc,char **argv) {
global_rec_map[str]+=count;
}
}
parsing_done=true;
for (auto const& [str,count]:global_rec_map) {
if (count>=2 && str.size()>=3) {
rec_list.push_back(str);
@@ -756,6 +811,7 @@ int main(int argc,char **argv) {
for (auto& fut:encoding_futures) {
all_encoding_results.push_back(fut.get());
}
encoding_done=true;
vector<bit_streamer> globals_bit_stream;
for (auto& res:all_encoding_results) {
globals_bit_stream.insert(globals_bit_stream.end(),res.encoded_files.begin(),res.encoded_files.end());
@@ -775,6 +831,8 @@ int main(int argc,char **argv) {
final_payloads.insert(final_payloads.end(),encoded_file.begin(),encoded_file.end());
current_offset+=encoded_file.size();
}
ui.join();
lock_guard<mutex> lock(ui_mutex);
//
// Payload compression