Files
ccc/ccc.cpp
2026-05-17 22:40:37 +02:00

1051 lines
32 KiB
C++

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <filesystem>
#include <fstream>
#include <mutex>
#include <stdint.h>
#include <string>
#include <string_view>
#include <threads.h>
#include <vector>
#include <map>
#include <unordered_map>
#include <iterator>
#include <algorithm>
#include <thread>
#include <future>
#include <queue>
#include <chrono>
#include <atomic>
#include <tree_sitter/api.h>
#include <tree_sitter/tree-sitter-c.h>
#include <lzma.h>
#include <xxh3.h>
#include <malloc.h>
#include <sys/ioctl.h>
#include <unistd.h>
using namespace std;
namespace fs=filesystem;
// Bit-prefix tags that open every encoded token. Each generate_* function
// writes one of these heads and then its payload; the trailing comment gives
// the number of bits the head is written with in this file.
const uint64_t CCC_DELIMITER_0_HEAD=0b0; // 1 bit, then a 3-bit index into delimiter0
const uint64_t CCC_DELIMITER_1_HEAD=0b10; // 2 bits, then a 2-bit index into delimiter1
const uint64_t CCC_C_KEYWORD_HEAD=0b1100; // 4 bits, then a 6-bit index into c_keywords
const uint64_t CCC_MISCELANEOUS_HEAD=0b1101; // 4 bits, then a 6-bit index into miscellaneous
const uint64_t CCC_STRING_INLINE_HEAD=0b1110; // 4 bits, then the raw bytes of the text
const uint64_t CCC_REC_TABLE_REF_HEAD=0b1111; // 4 bits, then a variable-length table index
// Byte written (as 8 bits) after the last character of an inline string.
const uint64_t CCC_STRING_INLINE_END=0b00000000;
// Appends every element of `tail` to the end of `vec`.
// `tail` is first copied into a local (`auto tmp=tail;`), so the argument is
// evaluated once and left untouched, at the price of one extra copy.
#define CCC_ADD_COMPONENT(vec,tail) \
do { \
auto tmp=tail; \
vec.insert(vec.end(),tmp.begin(),tmp.end()); \
} while (0)
// Hash functor for unordered containers keyed by std::string; the hash value
// is the 64-bit XXH3 digest of the string's bytes, narrowed to size_t.
struct XXH3HasherString {
    size_t operator()(const std::string& s) const {
        const auto digest=XXH3_64bits(s.data(),s.size());
        return static_cast<size_t>(digest);
    }
};
// Returns the width, in columns, of the terminal attached to stdout.
// Falls back to 80 when the TIOCGWINSZ query fails (for instance when stdout
// is redirected to a file or a pipe) or when the terminal reports 0 columns.
size_t get_terminal_width() {
    // Zero-initialized so no indeterminate value is ever read.
    struct winsize w={};
    if (ioctl(STDOUT_FILENO,TIOCGWINSZ,&w)!=0) {
        return 80;
    }
    return w.ws_col?w.ws_col:80;
}
// Append-only bit writer backed by a byte vector.
// Bits are packed most-significant-bit first: the first bit written lands in
// bit 7 of the first byte. A partially filled byte is only pushed to the
// buffer by align() (or by extract_buffer(), which aligns first).
class bit_streamer {
private:
    vector<uint8_t> out;    // completed bytes
    uint8_t current_byte=0; // byte currently being assembled
    uint8_t bit_pos=0;      // number of bits already used in current_byte (0..7)
public:
    // Caller-assigned tag stored with the stream; not used by the class itself.
    size_t index;
    bit_streamer(size_t index):index(index) {
        out.reserve(1024*1024);
    }
    // Number of completed bytes; a pending partial byte is not counted.
    size_t get_size() const {
        return out.size();
    }
    // Appends the `count` low-order bits of `value`, most significant first.
    // Bit positions above 63 do not exist in `value` and are written as 0
    // instead of shifting by 64 or more, which would be undefined behavior.
    void write_bits(uint64_t value,uint8_t count) {
        for (int i=count-1;i>=0;--i) {
            const bool bit=(i<64) && (((value>>i) & 1)!=0);
            if (bit) {
                current_byte|=static_cast<uint8_t>(1u<<(7-bit_pos));
            }
            if (++bit_pos==8) {
                out.push_back(current_byte);
                current_byte=0;
                bit_pos=0;
            }
        }
    }
    // Pads the pending byte with zero bits and pushes it; no-op when the
    // stream is already on a byte boundary.
    void align() {
        if (bit_pos>0) {
            out.push_back(current_byte);
            current_byte=0;
            bit_pos=0;
        }
    }
    // Read-only view of the completed bytes.
    const vector<uint8_t>& get_out() const {
        return out;
    }
    // Aligns, then moves the whole buffer out; the stream's buffer is left
    // in a moved-from state.
    vector<uint8_t> extract_buffer() {
        align();
        return std::move(out);
    }
};
// Tokens emitted by generate_delimiter0(): a 1-bit head followed by a 3-bit
// index, so this list can hold at most 8 entries. The position of a string in
// the list is its code (main() builds delimiter0_lookup from the positions).
const vector<string> delimiter0={
"{",
"}",
"(",
")",
"[",
"]",
",",
"."
};
// Tokens emitted by generate_delimiter1(): a 2-bit head followed by a 2-bit
// index, so this list can hold at most 4 entries. The position of a string in
// the list is its code. The encoder writes the "{}" code with one extra flag
// bit, and reuses that code (flag set to 1) for the double-quote token.
const vector<string> delimiter1={
"{}",
"()",
"[]",
";"
};
// Operators, punctuation and a few common type names emitted by
// generate_miscellaneous() with a 6-bit index, so this list can hold at most
// 64 entries. The position of a string in the list is its code; entries are
// matched against the node type name during encoding.
const vector<string> miscellaneous={
"!",
"%",
"'",
"*",
"+",
"-",
"/",
":",
"<",
">",
"=",
"?",
"^",
"|",
"&",
"~",
"+=",
"-=",
"*=",
"/=",
"%=",
"&=",
"|=",
"^=",
"<<=",
">>=",
"++",
"--",
"<<",
">>",
"==",
"!=",
"<=",
">=",
"->",
"...",
"||",
"&&",
"NULL",
"size_t",
"uint8_t",
"uint16_t",
"uint32_t",
"uint64_t",
"int8_t",
"int16_t",
"int32_t",
"int64_t"
};
// Preprocessor directives and C keywords emitted by generate_c_keyword() with
// a 6-bit index, so this list can hold at most 64 entries (63 are used). The
// position of a string in the list is its code. The encoder matches entries
// against the node type name, and against the node text for type and
// preprocessor-directive nodes.
// NOTE(review): "typeof_unequal" is probably meant to be the C23 keyword
// "typeof_unqual" - confirm, and change it together with any decoder table.
const vector<string> c_keywords={
"#if",
"#ifdef",
"#ifndef",
"#else",
"#elif",
"#elifdef",
"#elifndef",
"#endif",
"#define",
"#undef",
"#include",
"#error",
"#warning",
"#pragma",
"#line",
"alignas",
"alignof",
"auto",
"bool",
"break",
"case",
"char",
"const",
"constexpr",
"continue",
"default",
"do",
"double",
"else",
"enum",
"extern",
"false",
"float",
"for",
"goto",
"if",
"inline",
"int",
"long",
"nullptr",
"register",
"restrict",
"return",
"short",
"signed",
"sizeof",
"static",
"static_assert",
"struct",
"switch",
"thread_local",
"true",
"typedef",
"typeof",
"typeof_unequal",
"union",
"unsigned",
"void",
"volatile",
"while",
"__asm__",
"__attribute__",
"defined",
};
// Archive header, written byte-for-byte at the start of the output file by
// main(). Packed to 1-byte alignment so there is no padding between fields.
// NOTE(review): the size_t fields make the on-disk width and byte order
// depend on the build platform - confirm this is acceptable for the format.
#pragma pack(push,1)
struct header {
uint8_t sig[3]; // set to 'C','C','C' by main()
uint8_t flags; // bit 0: payload is LZMA-compressed, bit 1: recurrence table is, bit 2: files table is
size_t size_rec_table; // stored size in bytes of the recurrence table section
size_t entry_count; // number of input files
size_t size_payload; // stored size in bytes of the payload section
};
#pragma pack(pop)
// One leaf of the parsed syntax tree, as recorded by the parsing pass.
// Packed to 1-byte alignment.
#pragma pack(push,1)
struct node {
uint32_t type; // id returned by get_id() for the tree-sitter node type name
uint32_t start; // start byte offset of the leaf in the source text
uint32_t end; // end byte offset of the leaf in the source text
};
#pragma pack(pop)
// One input file travelling through the work queues.
// main() fills it by aggregate initialization, so member order matters.
struct file_entry {
string name; // path as given on the command line; also the key of filename_to_node_list
string content; // full file contents
size_t size; // used by main() to order the encoding queue (largest first)
size_t index; // position of the file on the command line; used to restore output order
};
// Arguments shared by every recursive call of iterate_all_nodes_loop_call().
// All members are references: the struct owns nothing.
struct thread_iterate_input_loop_call {
string &source_code; // text of the file being walked
vector<node> &thread_local_node_list; // receives one entry per leaf node
unordered_map<string,int,XXH3HasherString>& thread_local_rec_map; // per-thread occurrence counts
};
// Value returned by run_thread_rec_map(): the occurrence counts gathered by
// one parsing worker, merged into a global map by main().
struct thread_rec_map_result {
unordered_map<string,int,XXH3HasherString> thread_local_rec_map;
};
// Arguments of process_file_nodes_loop_call() for one file.
struct thread_encoding_input_loop_call {
string &source_code; // text of the file being encoded (not owned)
vector<node> node_list; // leaf nodes of that file, owned by this struct
bit_streamer& thread_local_bit_stream; // destination stream (not owned)
};
// Value returned by run_thread_encoding(): one bit stream per file encoded by
// that worker, each tagged with the file's index.
struct thread_encoding_result {
vector<bit_streamer> encoded_files;
};
// ---- Progress state shared between the workers, main() and ui_thread() ----
atomic<size_t> parsed_files=0; // incremented by parsing workers, one per file
atomic<size_t> encoded_files=0; // incremented by encoding workers, one per file
size_t total_files=0; // set by main() before the UI thread starts
atomic<bool> parsing_done=false; // set by main() once every parsing worker has returned
atomic<bool> encoding_started=false; // set by ui_thread(); not read anywhere in this file
atomic<bool> encoding_done=false; // set by main() once every encoding worker has returned
mutex ui_mutex; // held by ui_thread() for its whole run
// ---- Work queues ----
queue<file_entry> rec_map_files_queue; // files waiting to be parsed
mutex rec_map_queue_mutex; // guards rec_map_files_queue
queue<file_entry> encoding_files_queue; // files waiting to be encoded
mutex encoding_queue_mutex; // guards encoding_files_queue
mutex filename_nodes_mutex; // guards writes to filename_to_node_list
// ---- Recurrence table: strings referenced by index instead of inlined ----
vector<string> rec_list; // index -> string
unordered_map<string,size_t> rec_lookup; // string -> index in rec_list
// ---- Fixed-table lookups (string -> position in the table), filled by main() ----
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> c_keyword_lookup;
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> miscelaneous_lookup;
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> delimiter0_lookup;
unordered_map<string,size_t,XXH3HasherString,std::equal_to<>> delimiter1_lookup;
// ---- Interning of tree-sitter node type names, see get_id() ----
unordered_map<string,uint32_t,XXH3HasherString> type_to_id; // type name -> id
vector<string> id_to_type; // id -> type name
// Leaf nodes of every parsed file, keyed by file name.
unordered_map<string,vector<node>,XXH3HasherString> filename_to_node_list;
// ---- Command-line switches ----
bool show_warning=false; // -w
bool fail_on_warning=false; // -W
bool enable_malloc_trim=true; // cleared by -f
mutex type_alloc; // guards allocation of new ids in get_id()
// Redraws a one-line progress bar on stdout, starting with a carriage return
// so that successive calls overwrite each other.
//   label - text shown before the counters
//   done  - number of files processed so far
//   total - total number of files; 0 draws an empty bar
// The bar fills the terminal width minus the prefix and 10 spare columns;
// when the terminal is too narrow for that, a 10-column bar is used.
void print_progress_line(const string& label,size_t done,size_t total) {
    const size_t width=get_terminal_width();
    const string prefix=label+" "+to_string(done)+"/"+to_string(total)+" files";
    const size_t reserved=prefix.size()+10;
    // Explicit form of the former unsigned-wraparound check.
    const size_t bar_width=(reserved>width)?10:width-reserved;
    const size_t filled=(total==0)?0:min(done*bar_width/total,bar_width);
    string bar;
    bar.reserve(bar_width+2);
    bar+='[';
    bar.append(filled,'=');
    bar.append(bar_width-filled,' ');
    bar+=']';
    cout<<"\r"<<prefix<<" "<<bar<<flush;
}
// Body of the progress-display thread. Holds ui_mutex for its whole run,
// redraws the "Parsing" bar every 50 ms until parsing_done is set, then the
// "Encoding" bar every 50 ms until encoding_done is set. Each phase ends with
// a newline so the finished bar stays on screen.
void ui_thread() {
    lock_guard<mutex> guard(ui_mutex);
    const auto refresh_period=chrono::milliseconds(50);
    const auto show_phase=[&](const char *label,const atomic<bool>& finished,const atomic<size_t>& progress) {
        while (!finished) {
            print_progress_line(label,progress.load(),total_files);
            this_thread::sleep_for(refresh_period);
        }
        cout<<"\n";
    };
    show_phase("Parsing",parsing_done,parsed_files);
    encoding_started=true;
    show_phase("Encoding",encoding_done,encoded_files);
}
// Returns the numeric id interned for a node type name, allocating the next
// free id (the current size of id_to_type) the first time a name is seen.
// Safe to call from several threads at once.
//
// The shared type_to_id map is only ever touched while holding type_alloc:
// an unlocked find() racing with an insertion from another thread (which may
// rehash the table) is a data race. To keep the common case cheap, every
// thread keeps a private cache of the ids it has already resolved; ids never
// change once assigned, so a cached entry can never go stale.
uint32_t get_id(const string& type) {
    static thread_local unordered_map<string,uint32_t> resolved;
    const auto cached=resolved.find(type);
    if (cached!=resolved.end()) return cached->second;
    uint32_t id;
    {
        lock_guard<mutex> lock(type_alloc);
        const auto known=type_to_id.find(type);
        if (known!=type_to_id.end()) {
            id=known->second;
        } else {
            id=static_cast<uint32_t>(id_to_type.size());
            type_to_id[type]=id;
            id_to_type.push_back(type);
        }
    }
    resolved.emplace(type,id);
    return id;
}
// Interned ids (see get_id) of the node types the encoder compares against.
// They are assigned at the top of main(), before any worker thread starts.
uint32_t ID_STRING_CONTENT,ID_SYSTEM_LIB_STRING,ID_IDENTIFIER,ID_NUMBER_LITERAL,ID_TYPE_IDENTIFIER,ID_FIELD_IDENTIFIER,ID_ESCAPE_SEQUENCE,ID_STATEMENT_IDENTIFIER,ID_PRIMITIVE_TYPE,ID_COMMENT,ID_PREPROC_ARG,ID_CHARACTER,ID_PREPROC_DIRECTIVE;
// Ids of the bracket tokens "(" ")" "[" "]" "{" "}" and of the double quote.
uint32_t ID_LEFT_PAR,ID_RIGHT_PAR,ID_LEFT_CROCHET,ID_RIGHT_CROCHET,ID_LEFT_ACC,ID_RIGHT_ACC,ID_QUOTE;
// Depth-first walk of the syntax tree rooted at `current_node`.
// Every leaf (node without children) is appended, in source order, to
// settings.thread_local_node_list, and its text is counted in
// settings.thread_local_rec_map when its type is one of the counted kinds:
//   - string/identifier/number-like leaves: count incremented;
//   - primitive_type leaves whose text is not in c_keywords: count incremented;
//   - comment leaves: count forced to 2.
void iterate_all_nodes_loop_call(thread_iterate_input_loop_call &settings,TSNode current_node) {
    const uint32_t child_count=ts_node_child_count(current_node);
    if (child_count!=0) {
        // Inner node: nothing to record, just descend.
        for (uint32_t idx=0;idx<child_count;++idx) {
            iterate_all_nodes_loop_call(settings,ts_node_child(current_node,idx));
        }
        return;
    }
    const uint32_t first=ts_node_start_byte(current_node);
    const uint32_t last=ts_node_end_byte(current_node);
    const string_view lexeme{settings.source_code.data()+first,last-first};
    const string kind(ts_node_type(current_node));
    static const char *const counted_kinds[]={
        "string_content","system_lib_string","identifier","number_literal",
        "type_identifier","field_identifier","escape_sequence","statement_identifier"
    };
    bool is_counted=false;
    for (const char *candidate:counted_kinds) {
        if (kind==candidate) {
            is_counted=true;
            break;
        }
    }
    if (is_counted) {
        ++settings.thread_local_rec_map[string(lexeme)];
    }
    if (kind=="primitive_type") {
        const bool is_keyword=find(c_keywords.begin(),c_keywords.end(),lexeme)!=c_keywords.end();
        if (!is_keyword) {
            ++settings.thread_local_rec_map[string(lexeme)];
        }
    }
    if (kind=="comment") {
        settings.thread_local_rec_map[string(lexeme)]=2;
    }
    node leaf{};
    leaf.type=get_id(kind);
    leaf.start=first;
    leaf.end=last;
    settings.thread_local_node_list.push_back(leaf);
}
// Parsing worker. Pops files from rec_map_files_queue until it is empty; for
// each file it parses the content as C, records the leaf nodes in
// filename_to_node_list, accumulates occurrence counts, and forwards the file
// to encoding_files_queue.
//   thread_num - worker number, currently unused
// Returns the occurrence counts gathered by this worker.
// A parser that cannot be created or configured, a file too large for the
// 32-bit offsets used by the parser and by `node`, or a parse that returns no
// tree is reported on stderr and terminates the process: continuing would
// either dereference a null tree or silently drop a file from the archive.
thread_rec_map_result run_thread_rec_map(size_t thread_num) {
    (void)thread_num;
    thread_rec_map_result res;
    TSParser *parser=ts_parser_new();
    if (parser==nullptr || !ts_parser_set_language(parser,tree_sitter_c())) {
        cerr<<"Error: couldn't initialize the C parser."<<endl;
        exit(-1);
    }
    int counter=0;
    while (true) {
        file_entry f;
        {
            lock_guard<mutex> lock(rec_map_queue_mutex);
            if (rec_map_files_queue.empty()) break;
            f=std::move(rec_map_files_queue.front());
            rec_map_files_queue.pop();
        }
        if (f.content.size()>UINT32_MAX) {
            cerr<<"Error: file is too large to be parsed: "<<f.name<<endl;
            exit(-1);
        }
        vector<node> node_vector;
        thread_iterate_input_loop_call loop_settings {
            .source_code=f.content,
            .thread_local_node_list=node_vector,
            .thread_local_rec_map=res.thread_local_rec_map
        };
        TSTree *tree=ts_parser_parse_string(parser,nullptr,f.content.c_str(),static_cast<uint32_t>(f.content.size()));
        if (tree==nullptr) {
            cerr<<"Error: couldn't parse file: "<<f.name<<endl;
            exit(-1);
        }
        iterate_all_nodes_loop_call(loop_settings,ts_tree_root_node(tree));
        ts_tree_delete(tree);
        {
            lock_guard<mutex> lock(filename_nodes_mutex);
            filename_to_node_list[f.name]=std::move(node_vector);
        }
        {
            lock_guard<mutex> lock(encoding_queue_mutex);
            encoding_files_queue.push(std::move(f));
        }
        // Periodically hand freed heap pages back to the OS (disabled by -f).
        if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
        parsed_files++;
    }
    ts_parser_delete(parser);
    return res;
}
// Writes a C-keyword token: the stream is byte-aligned, then the 4-bit
// keyword head and the 6 low bits of `index` are written, then the stream is
// byte-aligned again.
void generate_c_keyword(bit_streamer& bitstream,size_t index) {
    constexpr uint8_t head_width=4;
    constexpr uint8_t index_width=6;
    bitstream.align();
    bitstream.write_bits(CCC_C_KEYWORD_HEAD,head_width);
    bitstream.write_bits(index,index_width);
    bitstream.align();
}
// Writes a reference to entry `index` of the recurrence table.
// Layout, between two byte alignments: the 4-bit table-reference head, then
// k one-bits and a single zero-bit, then `index` on 3*(k+1) bits.
// k is the number of times index+1 can be shifted right by 3 bits while it is
// still 8 or more. Index 0 needs no special case: it yields k=0 and a 3-bit
// zero payload.
void generate_rec(bit_streamer& bitstream,size_t index) {
    const size_t group_limit=size_t{1}<<3;
    size_t extra_groups=0;
    for (size_t rest=index+1;rest>=group_limit;rest>>=3) {
        ++extra_groups;
    }
    const size_t payload_width=3*(extra_groups+1);
    bitstream.align();
    bitstream.write_bits(CCC_REC_TABLE_REF_HEAD,4);
    for (size_t marker=0;marker<extra_groups;++marker) {
        bitstream.write_bits(1,1);
    }
    bitstream.write_bits(0,1);
    bitstream.write_bits(index,payload_width);
    bitstream.align();
}
// Writes a delimiter-0 token: byte-align, the 1-bit head, the 3 low bits of
// `index`, byte-align.
void generate_delimiter0(bit_streamer& bitstream,size_t index) {
    constexpr uint8_t head_width=1;
    constexpr uint8_t index_width=3;
    bitstream.align();
    bitstream.write_bits(CCC_DELIMITER_0_HEAD,head_width);
    bitstream.write_bits(index,index_width);
    bitstream.align();
}
// Writes a delimiter-1 token: byte-align, the 2-bit head, the 2 low bits of
// `index`, byte-align.
void generate_delimiter1(bit_streamer& bitstream,size_t index) {
    constexpr uint8_t head_width=2;
    constexpr uint8_t index_width=2;
    bitstream.align();
    bitstream.write_bits(CCC_DELIMITER_1_HEAD,head_width);
    bitstream.write_bits(index,index_width);
    bitstream.align();
}
// Writes a miscellaneous token: byte-align, the 4-bit head, the 6 low bits of
// `index`, byte-align.
void generate_miscellaneous(bit_streamer& bitstream,size_t index) {
    constexpr uint8_t head_width=4;
    constexpr uint8_t index_width=6;
    bitstream.align();
    bitstream.write_bits(CCC_MISCELANEOUS_HEAD,head_width);
    bitstream.write_bits(index,index_width);
    bitstream.align();
}
// Writes `text_len` bytes of `text` as an inline string token: byte-align,
// the 4-bit inline-string head, each byte on 8 bits, the 8-bit terminator
// CCC_STRING_INLINE_END, byte-align.
// The terminator is a zero byte, so a text that itself contains a zero byte
// cannot be told apart from the end of the string.
void generate_string_content(bit_streamer& bitstream,const char *text,size_t text_len) {
    bitstream.align();
    bitstream.write_bits(CCC_STRING_INLINE_HEAD,4);
    // size_t index: an int counter is compared as unsigned against text_len
    // and would overflow on texts longer than INT_MAX bytes.
    for (size_t i=0;i<text_len;++i) {
        bitstream.write_bits(static_cast<unsigned char>(text[i]),8);
    }
    bitstream.write_bits(CCC_STRING_INLINE_END,8);
    bitstream.align();
}
// Prints `text` followed by a newline on stdout, but only when warnings were
// enabled on the command line (show_warning).
void print_warning(string text) {
    if (!show_warning) {
        return;
    }
    cout<<text<<endl;
}
// Terminates the process with exit status -1 when fail_on_warning is set;
// does nothing otherwise.
void fail_if_warning() {
    if (!fail_on_warning) {
        return;
    }
    exit(-1);
}
void process_file_nodes_loop_call(thread_encoding_input_loop_call& settings) {
bit_streamer& out=settings.thread_local_bit_stream;
for (int i=0;i<settings.node_list.size();i++) {
node n=settings.node_list.at(i);
uint32_t &type=n.type;
string_view text{settings.source_code.data()+n.start,n.end-n.start};
if (type==ID_STRING_CONTENT || type==ID_SYSTEM_LIB_STRING || type==ID_IDENTIFIER || type==ID_NUMBER_LITERAL || type==ID_FIELD_IDENTIFIER || type==ID_PREPROC_ARG || type==ID_ESCAPE_SEQUENCE || type==ID_CHARACTER || type==ID_STATEMENT_IDENTIFIER) {
auto it=rec_lookup.find(string(text));
if (it==rec_lookup.end()) {
generate_string_content(out,text.data(),text.size());
} else {
size_t index=it->second;
generate_rec(out,index);
}
} else if (type==ID_PRIMITIVE_TYPE || type==ID_TYPE_IDENTIFIER) {
auto it=c_keyword_lookup.find(string(text));
if (it!=c_keyword_lookup.end()) {
size_t index=it->second;
generate_c_keyword(out,index);
} else {
auto it=rec_lookup.find(string(text));
if (it==rec_lookup.end()) {
if (!text.empty()) {
generate_string_content(out,text.data(),text.size());
} else {
print_warning("Warning: type node is empty: "+string(text));
fail_if_warning();
}
} else {
size_t index=it->second;
generate_rec(out,index);
}
}
} else if (delimiter0_lookup.find(id_to_type[type])!=delimiter0_lookup.end() || delimiter1_lookup.find(id_to_type[type])!=delimiter1_lookup.end() || type==ID_QUOTE) {
string insert;
if (type==ID_LEFT_PAR && i+1<settings.node_list.size()) {
if (settings.node_list[i+1].type==ID_RIGHT_PAR) {
insert="()";
i++;
} else {
insert="(";
}
} else if (type==ID_LEFT_CROCHET && i+1<settings.node_list.size()) {
if (settings.node_list[i+1].type==ID_RIGHT_CROCHET) {
insert="[]";
i++;
} else {
insert="[";
}
} else if (type==ID_LEFT_ACC && i+1<settings.node_list.size()) {
if (settings.node_list[i+1].type==ID_RIGHT_ACC) {
insert="{}";
i++;
} else {
insert="{";
}
} else {
insert=id_to_type[type];
}
auto it=delimiter0_lookup.find(insert);
if (it!=delimiter0_lookup.end()) {
size_t index=it->second;
generate_delimiter0(out,index);
} else {
if (insert!="{}" && insert!="\"") {
auto it=delimiter1_lookup.find(insert);
if (it!=delimiter1_lookup.end()) {
size_t index=it->second;
generate_delimiter1(out,index);
} else {
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
fail_if_warning();
}
} else {
if (insert=="{}") {
auto it=delimiter1_lookup.find("{}");
if (it!=delimiter1_lookup.end()) {
size_t index=it->second;
out.align();
out.write_bits(CCC_DELIMITER_1_HEAD,2);
out.write_bits(index,2);
out.write_bits(0b0,1);
out.align();
} else {
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
fail_if_warning();
}
} else if (insert=="\"") {
auto it=delimiter1_lookup.find("{}");
if (it!=delimiter1_lookup.end()) {
size_t index=it->second;
out.align();
out.write_bits(CCC_DELIMITER_1_HEAD,2);
out.write_bits(index,2);
out.write_bits(0b1,1);
out.align();
} else {
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
fail_if_warning();
}
} else {
print_warning("Warning: unknow delimiter, that shouldn't happen: "+insert);
fail_if_warning();
}
}
}
} else if (c_keyword_lookup.find(id_to_type[type])!=c_keyword_lookup.end() || type==ID_PREPROC_DIRECTIVE) {
if (type!=ID_PREPROC_DIRECTIVE) {
auto it=c_keyword_lookup.find(id_to_type[type]);
if (it!=c_keyword_lookup.end()) {
size_t index=it->second;
generate_c_keyword(out,index);
} else {
print_warning("Warning: unknow C keyword, that shouldn't happen: "+id_to_type[type]+" "+string(text));
fail_if_warning();
}
} else {
auto it=c_keyword_lookup.find(string(text));
if (it!=c_keyword_lookup.end()) {
size_t index=it->second;
generate_c_keyword(out,index);
} else {
auto it=rec_lookup.find(string(text));
if (it==rec_lookup.end()) {
if (!text.empty()) {
generate_string_content(out,text.data(),text.size());
} else {
print_warning("Warning: C keyword is empty: "+string(text));
fail_if_warning();
}
} else {
size_t index=it->second;
generate_rec(out,index);
}
}
}
} else if (miscelaneous_lookup.find(id_to_type[type])!=miscelaneous_lookup.end()) {
auto it=miscelaneous_lookup.find(id_to_type[type]);
if (it!=miscelaneous_lookup.end()) {
size_t index=it->second;
generate_miscellaneous(out,index);
} else {
print_warning("Warning: unknow miscellaneous, that shouldn't happen: "+id_to_type[type]);
fail_if_warning();
}
} else if (type==ID_COMMENT) {
auto it=rec_lookup.find(string(text));
if (it==rec_lookup.end()) {
if (!text.empty()) {
generate_string_content(out,text.data(),text.size());
} else {
print_warning("Warning: comment is empty: "+string(text));
fail_if_warning();
}
} else {
size_t index=it->second;
generate_rec(out,index);
}
} else {
auto it=rec_lookup.find(id_to_type[type]);
if (it==rec_lookup.end()) {
if (!text.empty()) {
generate_string_content(out,text.data(),text.size());
} else {
print_warning("Warning: unknow node is empty: "+string(text));
fail_if_warning();
}
} else {
size_t index=it->second;
generate_rec(out,index);
}
}
}
out.align();
return;
}
// Encoding worker. Pops files from encoding_files_queue until it is empty and
// encodes each one into its own bit stream, tagged with the file's index.
//   thread_num - worker number, currently unused
// Returns the streams produced by this worker, in processing order.
// NOTE(review): node lists are keyed by file name, so a file named twice on
// the command line has a single list; its second occurrence is encoded from
// an empty list.
thread_encoding_result run_thread_encoding(size_t thread_num) {
    (void)thread_num;
    thread_encoding_result res;
    int counter=0;
    while (true) {
        file_entry f;
        {
            lock_guard<mutex> lock(encoding_queue_mutex);
            if (encoding_files_queue.empty()) break;
            f=std::move(encoding_files_queue.front());
            encoding_files_queue.pop();
        }
        // The map is shared by all workers: take the node list with find()
        // under the lock. operator[] may insert (and rehash) when the key is
        // missing, which is not safe to run concurrently.
        vector<node> nodes;
        {
            lock_guard<mutex> lock(filename_nodes_mutex);
            const auto entry=filename_to_node_list.find(f.name);
            if (entry!=filename_to_node_list.end()) {
                nodes=std::move(entry->second);
            }
        }
        res.encoded_files.emplace_back(f.index);
        thread_encoding_input_loop_call encoding_loop_settings {
            .source_code=f.content,
            .node_list=std::move(nodes),
            // Valid until the next emplace_back, i.e. for this iteration.
            .thread_local_bit_stream=res.encoded_files.back()
        };
        process_file_nodes_loop_call(encoding_loop_settings);
        // Release the per-file memory right away.
        vector<node>().swap(encoding_loop_settings.node_list);
        string().swap(f.content);
        // Periodically hand freed heap pages back to the OS (disabled by -f).
        if (++counter%20==0 && enable_malloc_trim) malloc_trim(0);
        encoded_files++;
    }
    return res;
}
// Entry point: compresses the C source files named on the command line into
// "test.ccc".
// Steps:
//   1. intern node type ids and build the fixed-table lookups;
//   2. parse the options and load every input file;
//   3. parse all files in parallel and build the recurrence table from the
//      strings seen at least twice and at least 3 bytes long;
//   4. encode all files in parallel and concatenate the streams in
//      command-line order;
//   5. LZMA-compress the payload, the recurrence table and the files table,
//      keeping the raw form of a section whenever compression does not shrink
//      it (recorded in the header flags);
//   6. write header, recurrence table, files table and payload.
// Returns 0 on success and -1 on any error.
int main(int argc,char **argv) {
    ID_STRING_CONTENT=get_id("string_content");
    ID_SYSTEM_LIB_STRING=get_id("system_lib_string");
    ID_IDENTIFIER=get_id("identifier");
    ID_NUMBER_LITERAL=get_id("number_literal");
    ID_TYPE_IDENTIFIER=get_id("type_identifier");
    ID_FIELD_IDENTIFIER=get_id("field_identifier");
    ID_ESCAPE_SEQUENCE=get_id("escape_sequence");
    ID_STATEMENT_IDENTIFIER=get_id("statement_identifier");
    ID_PRIMITIVE_TYPE=get_id("primitive_type");
    ID_COMMENT=get_id("comment");
    ID_PREPROC_ARG=get_id("preproc_arg");
    ID_CHARACTER=get_id("character");
    ID_LEFT_PAR=get_id("(");
    ID_RIGHT_PAR=get_id(")");
    ID_LEFT_CROCHET=get_id("[");
    ID_RIGHT_CROCHET=get_id("]");
    ID_LEFT_ACC=get_id("{");
    ID_RIGHT_ACC=get_id("}");
    ID_PREPROC_DIRECTIVE=get_id("preproc_directive");
    ID_QUOTE=get_id("\"");
    // The position of a string in its table is its wire code.
    for (size_t i=0;i<c_keywords.size();i++) {
        c_keyword_lookup[c_keywords[i]]=i;
    }
    for (size_t i=0;i<miscellaneous.size();i++) {
        miscelaneous_lookup[miscellaneous[i]]=i;
    }
    for (size_t i=0;i<delimiter0.size();i++) {
        delimiter0_lookup[delimiter0[i]]=i;
    }
    for (size_t i=0;i<delimiter1.size();i++) {
        delimiter1_lookup[delimiter1[i]]=i;
    }
    if (argc<2) {
        cout<<"Usage: ccc [FILES]"<<endl;
        return -1;
    }
    //
    // Command line
    //
    size_t compression_ratio=6;
    vector<string> files;
    for (int i=1;i<argc;i++) {
        string file=string(argv[i]);
        if (file=="-W") {
            fail_on_warning=true;
            continue;
        }
        if (file=="-w") {
            show_warning=true;
            continue;
        }
        if (file.substr(0,2)=="-c" && file.size()==3) {
            try {
                compression_ratio=stoi(file.substr(2,1));
                continue;
            } catch (const exception& e) {
                cout<<"Error: invalid argument: "<<file<<endl;
                return -1;
            }
        }
        if (file=="-f") {
            enable_malloc_trim=false;
            continue;
        }
        if (file=="-h" || file=="--help") {
            cout<<"C Code Compressor v0.1"<<endl;
            cout<<"Usage: ccc [-hfwW] [FILES]"<<endl;
            cout<<"Options:"<<endl;
            cout<<" -h : show this help message"<<endl;
            cout<<" -f : enable fast mode, reduce the total compression time but does not release unused"<<endl;
            cout<<" unused heap memory back to the OS. Usage of this option can raise memory usage."<<endl;
            cout<<" -w : show warning messages. For example, when a unknown or empty node is detected."<<endl;
            cout<<" -W : crash on warning"<<endl;
            cout<<" -c0..9: set the compression ratio for LZMA multithreaded compression phase. Default is 6."<<endl;
            cout<<" Below level 6, CCC may not compress better than tar.xz."<<endl;
            cout<<" Warning: setting this higher than -c6 will seriously raise memory usage."<<endl;
            cout<<" For exemple, using -c9 more than double memory usage in comparison with"<<endl;
            cout<<" -c6 (which is the default). "<<endl;
            cout<<" Warning: usage of higher options than -c6 combined with -f is heavily not"<<endl;
            cout<<" recommended."<<endl;
            return 0;
        }
        if (!fs::exists(file)) {
            cout<<"Error: file doesn't exist: "<<file<<endl;
            return -1;
        }
        files.push_back(file);
    }
    //
    // Load every input file into the parsing queue
    //
    for (size_t i=0;i<files.size();i++) {
        ifstream file(files[i],ios::binary);
        if (!file) {
            cout<<"Error: couldn't open provided file."<<endl;
            return -1;
        }
        string code((istreambuf_iterator<char>(file)),istreambuf_iterator<char>());
        // Read the size BEFORE moving the string: initializers are evaluated
        // in order, so code.size() after std::move(code) would see the
        // moved-from (empty) string and every entry would get size 0.
        const size_t code_size=code.size();
        file_entry f{files[i],std::move(code),code_size,i};
        rec_map_files_queue.push(std::move(f));
    }
    // hardware_concurrency() may return 0 when the value is not computable;
    // at least one worker is needed, and liblzma rejects a thread count of 0.
    const unsigned int hw_threads=max(1u,thread::hardware_concurrency());
    const size_t nb_threads=hw_threads;
    total_files=files.size();
    thread ui(ui_thread);
    //
    // Parsing pass and recurrence table
    //
    vector<future<thread_rec_map_result>> rec_map_futures;
    for (size_t i=0;i<nb_threads;++i) {
        rec_map_futures.push_back(async(launch::async,run_thread_rec_map,i+1));
    }
    // std::map keeps the strings sorted, which makes table order deterministic.
    map<string,int> global_rec_map;
    for (auto& fut:rec_map_futures) {
        const thread_rec_map_result partial=fut.get();
        for (auto const& [str,count]:partial.thread_local_rec_map) {
            global_rec_map[str]+=count;
        }
    }
    parsing_done=true;
    for (auto const& [str,count]:global_rec_map) {
        if (count>=2 && str.size()>=3) {
            rec_list.push_back(str);
            rec_lookup[str]=rec_list.size()-1;
        }
    }
    global_rec_map.clear();
    //
    // Encoding pass, largest files first so the workers finish closer together
    //
    vector<file_entry> encoding_files_vec;
    while (!encoding_files_queue.empty()) {
        encoding_files_vec.push_back(std::move(encoding_files_queue.front()));
        encoding_files_queue.pop();
    }
    sort(encoding_files_vec.begin(),encoding_files_vec.end(),[](const file_entry& a,const file_entry& b) {
        return a.size>b.size;
    });
    for (auto& f:encoding_files_vec) {
        encoding_files_queue.push(std::move(f));
    }
    vector<future<thread_encoding_result>> encoding_futures;
    for (size_t i=0;i<nb_threads;++i) {
        encoding_futures.push_back(async(launch::async,run_thread_encoding,i+1));
    }
    vector<thread_encoding_result> all_encoding_results;
    for (auto& fut:encoding_futures) {
        all_encoding_results.push_back(fut.get());
    }
    encoding_done=true;
    // Gather the streams (moved, not copied) and restore command-line order.
    vector<bit_streamer> globals_bit_stream;
    for (auto& res:all_encoding_results) {
        globals_bit_stream.insert(globals_bit_stream.end(),make_move_iterator(res.encoded_files.begin()),make_move_iterator(res.encoded_files.end()));
    }
    sort(globals_bit_stream.begin(),globals_bit_stream.end(),[](const bit_streamer& a,const bit_streamer& b) {
        return a.index<b.index;
    });
    vector<unsigned char> final_payloads;
    vector<size_t> global_payloads_start;
    // Per-file sizes are recorded here because final_payloads may be released
    // before the files table is built.
    vector<size_t> global_payloads_size;
    size_t total_size2=0;
    for (auto& bstr:globals_bit_stream) total_size2+=bstr.get_size();
    final_payloads.reserve(total_size2);
    size_t current_offset=0;
    for (auto& bstr:globals_bit_stream) {
        global_payloads_start.push_back(current_offset);
        auto encoded_file=bstr.extract_buffer();
        global_payloads_size.push_back(encoded_file.size());
        final_payloads.insert(final_payloads.end(),encoded_file.begin(),encoded_file.end());
        current_offset+=encoded_file.size();
    }
    ui.join();
    if (global_payloads_start.size()!=files.size()) {
        cout<<"Error: some files couldn't be encoded."<<endl;
        return -1;
    }
    //
    // Payload compression
    //
    vector<unsigned char> payload_compressed;
    cout<<"Files Payloads (in bytes): "<<final_payloads.size()<<endl;
    payload_compressed.resize(final_payloads.size()+final_payloads.size()/3+128);
    lzma_mt mt_options={};
    mt_options.flags=0;
    mt_options.threads=hw_threads;
    mt_options.block_size=max((size_t)8*1024*1024,final_payloads.size()/mt_options.threads);
    mt_options.timeout=0;
    mt_options.filters=nullptr;
    mt_options.check=LZMA_CHECK_CRC64;
    lzma_options_lzma opt_lzma;
    if (lzma_lzma_preset(&opt_lzma,compression_ratio)) {
        cout<<"Error: couldn't initialize LZMA compressor for files archive."<<endl;
        return -1;
    }
    lzma_filter filters[2];
    filters[0].id=LZMA_FILTER_LZMA2;
    filters[0].options=&opt_lzma;
    filters[1].id=LZMA_VLI_UNKNOWN;
    mt_options.filters=filters;
    lzma_stream strm=LZMA_STREAM_INIT;
    auto ret=lzma_stream_encoder_mt(&strm,&mt_options);
    if (ret!=LZMA_OK) {
        cout<<"Error: couldn't initialize MT compressor for files archives."<<endl;
        return -1;
    }
    strm.next_in=final_payloads.data();
    strm.avail_in=final_payloads.size();
    strm.next_out=payload_compressed.data();
    strm.avail_out=payload_compressed.size();
    ret=lzma_code(&strm,LZMA_FINISH);
    if (ret!=LZMA_STREAM_END) {
        lzma_end(&strm);
        cout<<"Error: couldn't compress files archive."<<endl;
        return -1;
    }
    size_t payload_total_size;
    size_t compressed_size=payload_compressed.size()-strm.avail_out;
    payload_compressed.resize(compressed_size);
    cout<<"Files Payload compressed (in bytes): "<<compressed_size<<endl;
    size_t original_size=final_payloads.size();
    uint8_t flags=0;
    if (compressed_size>=original_size) {
        // Compression did not help: keep the raw payload.
        flags&= ~(0b00000001);
        payload_total_size=original_size;
        vector<unsigned char>().swap(payload_compressed);
    } else {
        flags|=0b00000001;
        payload_total_size=compressed_size;
        vector<unsigned char>().swap(final_payloads);
    }
    lzma_end(&strm);
    //
    // Rec table compression
    //
    vector<unsigned char> rec_table;
    for (const string& entry:rec_list) {
        rec_table.insert(rec_table.end(),entry.begin(),entry.end());
        rec_table.push_back('\0');
    }
    vector<unsigned char> rec_table_compressed;
    cout<<"Reccurences table (in bytes): "<<rec_table.size()<<endl;
    rec_table_compressed.resize(rec_table.size()+rec_table.size()/3+128);
    mt_options={};
    mt_options.flags=0;
    mt_options.threads=hw_threads;
    mt_options.block_size=max((size_t)8*1024*1024,rec_table.size()/mt_options.threads);
    mt_options.timeout=0;
    mt_options.filters=nullptr;
    mt_options.check=LZMA_CHECK_CRC64;
    lzma_options_lzma opt_lzma2;
    if (lzma_lzma_preset(&opt_lzma2,compression_ratio)) {
        cout<<"Error: couldn't initialize LZMA compressor for reccurences table."<<endl;
        return -1;
    }
    lzma_filter filters2[2];
    filters2[0].id=LZMA_FILTER_LZMA2;
    filters2[0].options=&opt_lzma2;
    filters2[1].id=LZMA_VLI_UNKNOWN;
    mt_options.filters=filters2;
    lzma_stream strm2=LZMA_STREAM_INIT;
    ret=lzma_stream_encoder_mt(&strm2,&mt_options);
    if (ret!=LZMA_OK) {
        cout<<"Error: couldn't initialize MT compressor for reccurences table."<<endl;
        return -1;
    }
    strm2.next_in=rec_table.data();
    strm2.avail_in=rec_table.size();
    strm2.next_out=rec_table_compressed.data();
    strm2.avail_out=rec_table_compressed.size();
    ret=lzma_code(&strm2,LZMA_FINISH);
    if (ret!=LZMA_STREAM_END) {
        lzma_end(&strm2);
        cout<<"Error: couldn't compress reccurences table."<<endl;
        return -1;
    }
    size_t rec_table_total_size;
    compressed_size=rec_table_compressed.size()-strm2.avail_out;
    rec_table_compressed.resize(compressed_size);
    cout<<"Reccurences table compressed (in bytes): "<<compressed_size<<endl;
    original_size=rec_table.size();
    if (compressed_size>=original_size) {
        flags&= ~(0b00000010);
        rec_table_total_size=original_size;
        vector<unsigned char>().swap(rec_table_compressed);
    } else {
        flags|=0b00000010;
        rec_table_total_size=compressed_size;
        vector<unsigned char>().swap(rec_table);
    }
    lzma_end(&strm2);
    //
    // Files table: for each file, its NUL-terminated name, then the offset
    // and the size of its encoded stream inside the uncompressed payload,
    // both as raw size_t bytes.
    //
    vector<unsigned char> files_table;
    for (size_t i=0;i<files.size();i++) {
        files_table.insert(files_table.end(),files[i].begin(),files[i].end());
        files_table.push_back('\0');
        const size_t file_start=global_payloads_start[i];
        // Taken from the recorded sizes: final_payloads is already empty at
        // this point when the compressed payload was kept.
        const size_t file_size=global_payloads_size[i];
        const auto *start_bytes=reinterpret_cast<const uint8_t*>(&file_start);
        files_table.insert(files_table.end(),start_bytes,start_bytes+sizeof(size_t));
        const auto *size_bytes=reinterpret_cast<const uint8_t*>(&file_size);
        files_table.insert(files_table.end(),size_bytes,size_bytes+sizeof(size_t));
    }
    cout<<"Files table (in bytes): "<<files_table.size()<<endl;
    vector<unsigned char> files_table_compressed;
    files_table_compressed.resize(files_table.size()+files_table.size()/3+128);
    strm=LZMA_STREAM_INIT;
    if (lzma_easy_encoder(&strm,9,LZMA_CHECK_CRC64)!=LZMA_OK) {
        cout<<"Error: couldn't initialize LZMA compressor for files table."<<endl;
        return -1;
    }
    strm.next_in=files_table.data();
    strm.avail_in=files_table.size();
    strm.next_out=files_table_compressed.data();
    strm.avail_out=files_table_compressed.size();
    ret=lzma_code(&strm,LZMA_FINISH);
    if (ret!=LZMA_STREAM_END) {
        lzma_end(&strm);
        cout<<"Error: couldn't compress files table."<<endl;
        return -1;
    }
    compressed_size=files_table_compressed.size()-strm.avail_out;
    files_table_compressed.resize(compressed_size);
    cout<<"Files table compressed (in bytes): "<<compressed_size<<endl;
    original_size=files_table.size();
    lzma_end(&strm);
    // Release the form that will NOT be written. Releasing the other one
    // would leave the files table section of the archive empty.
    // NOTE(review): the header has no field for the stored size of this
    // section - confirm how a reader is expected to find its end.
    if (compressed_size>=original_size) {
        flags&= ~(0b00000100);
        vector<unsigned char>().swap(files_table_compressed);
    } else {
        flags|=0b00000100;
        vector<unsigned char>().swap(files_table);
    }
    //
    // Output: header, recurrence table, files table, payload
    //
    header head{};
    head.sig[0]='C';
    head.sig[1]='C';
    head.sig[2]='C';
    head.flags=flags;
    head.size_payload=payload_total_size;
    head.size_rec_table=rec_table_total_size;
    head.entry_count=files.size();
    vector<unsigned char> out;
    const auto *head_bytes=reinterpret_cast<const uint8_t*>(&head);
    out.insert(out.end(),head_bytes,head_bytes+sizeof(header));
    if (flags & 0b00000010) {
        CCC_ADD_COMPONENT(out,rec_table_compressed);
    } else {
        CCC_ADD_COMPONENT(out,rec_table);
    }
    if (flags & 0b00000100) {
        CCC_ADD_COMPONENT(out,files_table_compressed);
    } else {
        CCC_ADD_COMPONENT(out,files_table);
    }
    if (flags & 0b00000001) {
        CCC_ADD_COMPONENT(out,payload_compressed);
    } else {
        CCC_ADD_COMPONENT(out,final_payloads);
    }
    ofstream fileout("test.ccc",ios::binary);
    if (!fileout) {
        cout<<"Error: couldn't open output file."<<endl;
        return -1;
    }
    fileout.write(reinterpret_cast<const char*>(out.data()),out.size());
    fileout.close();
    if (!fileout) {
        cout<<"Error: couldn't write output file."<<endl;
        return -1;
    }
    cout<<"Finished !"<<endl;
    return 0;
}