原理就是统计待压缩文件的字符频率,构建哈夫曼树,然后求出哈夫曼编码,将字符频率(解压时通过字符频率重新建树)和哈夫曼编码写入文件,完成压缩。
压缩代码:
// Count how often each byte value occurs in a file.
//
// filename  - path of the file to scan.
// frequency - output table of 256 counters, indexed by byte value.
//
// The table is always reset to zero first, so it holds well-defined values
// (all zeros) even when the file cannot be opened.
void get_frequency(string filename, int frequency[256])
{
    // Clear before the open check: callers use the table unconditionally.
    memset(frequency, 0, sizeof(int) * 256);

    // Binary mode so the stream never translates or drops bytes.
    ifstream fin(filename, ios::binary);
    if (!fin.is_open())
    {
        return;
    }

    char byte;
    while (fin.get(byte))
    {
        // Index through unsigned char so bytes >= 0x80 do not go negative.
        frequency[static_cast<unsigned char>(byte)]++;
    }
    // fin is closed by its destructor.
}
// A node of the Huffman tree.
struct node
{
    unsigned char ch; // byte value carried by the node
    int w;            // weight of the node
    node *rch;        // right child, NULL when absent
    node *lch;        // left child, NULL when absent
};
// Allocate a node with malloc and fill in every field from the arguments.
//
// ch  - byte value to store.
// w   - weight to store.
// lch - left child (defaults to NULL).
// rch - right child (defaults to NULL).
//
// Returns the freshly allocated node.
node* new_node(unsigned char ch, int w, node* lch = NULL, node* rch = NULL)
{
    node* created = (node*)malloc(sizeof(node));
    created->lch = lch;
    created->rch = rch;
    created->w = w;
    created->ch = ch;
    return created;
}
//优先级队列比较大小的方法
struct cmp
{
bool operator () (node* x, node* y)
{
return x->w > y->w;
}
};
// Build a Huffman tree from a byte-frequency table and return its root.
//
// frequency - 256 counters indexed by byte value; entries equal to zero are
//             left out of the tree.
//
// Returns NULL when every counter is zero (nothing to encode). When exactly
// one byte value occurs, the lone leaf is hung under a parent as its left
// child so that it receives the one-bit code "0" instead of an empty code.
// The caller owns the returned tree.
node* build_haffman(int frequency[256])
{
    // cmp puts the lightest node on top of the queue.
    priority_queue<node*, vector<node*>, cmp> q;
    for (int i = 0; i < 256; i++)
    {
        if (frequency[i] != 0)
        {
            q.push(new_node((unsigned char)i, frequency[i]));
        }
    }

    if (q.empty())
    {
        // Calling top() on an empty queue is undefined behaviour.
        return NULL;
    }

    if (q.size() == 1)
    {
        // A root that is itself a leaf would get a zero-length code, so the
        // encoded stream would carry no bits for it.
        node* only = q.top();
        return new_node(0, only->w, only);
    }

    // Repeatedly merge the two lightest subtrees until one tree remains.
    while (q.size() > 1)
    {
        node* x = q.top();
        q.pop();
        node* y = q.top();
        q.pop();
        q.push(new_node(0, x->w + y->w, x, y));
    }
    return q.top();
}
// Free a whole tree in post-order and reset the caller's pointer.
//
// root - address of the pointer to the tree root. After the call *root is
//        NULL, so it cannot be freed or dereferenced a second time by
//        mistake. A NULL root or a NULL *root is a no-op.
void destory_haffman(node **root)
{
    if (root == NULL || *root == NULL)
    {
        return;
    }
    // Children first: the node must stay alive while its links are read.
    destory_haffman(&(*root)->lch);
    destory_haffman(&(*root)->rch);
    free(*root);
    *root = NULL;
}
//获取字符的哈夫曼编码
void get_haffman_code(node* root, vector& v, string code[256])
{
if (root)
{
if (root->lch == NULL && root->rch == NULL)
{
string temp = "";
for (int i = 0; i < v.size(); i++)
{
temp += v[i];
}
code[root->ch] = temp;
}
v.push_back('0');
get_haffman_code(root->lch, v, code);
v.pop_back();
v.push_back('1');
get_haffman_code(root->rch, v, code);
v.pop_back();
}
}
// Pack eight '0'/'1' characters into one byte, most significant bit first.
//
// haff_code - bit string made of '0' and '1' characters. Taken by const
//             reference so that packing a long string byte by byte does not
//             copy the whole string on every call.
// index     - position of the first of the eight characters.
//
// Positions outside the string count as '0', so a short tail is zero-padded
// instead of being read out of bounds.
unsigned char create_uchar(const string& haff_code, int index)
{
    unsigned char packed = 0;
    for (int bit = 0; bit < 8; ++bit)
    {
        const long long at = static_cast<long long>(index) + bit;
        packed = static_cast<unsigned char>(packed << 1);
        if (at >= 0 && static_cast<size_t>(at) < haff_code.length()
            && haff_code[static_cast<size_t>(at)] == '1')
        {
            packed |= 1;
        }
    }
    return packed;
}
//压缩文件的流程
void compress_to_file(string src_file, string dst_file)
{
ifstream fin(src_file);
ofstream fout(dst_file, ios::binary);
if (!fin.is_open() || !fout.is_open())
{
return;
}
int frequency[256];
string code[256];
vector v;
get_frequency("/Users/Rubik/Desktop/123.txt", frequency);
node* root = build_haffman(frequency);
get_haffman_code(root, v, code);
string haff_code = "";
unsigned char ch;
while (!fin.eof())
{
ch = fin.get();
if (fin.eof()) break;
haff_code += code[ch];
}
int len = (int)haff_code.length();
cout << len << endl;
fout.write((const char*)frequency, sizeof(int) * 256);
fout.write((const char*)&len, sizeof(int));
while (haff_code.length() % 8 != 0)
{
haff_code += '0';
}
for (int i = 0; i < haff_code.length(); i += 8)
{
unsigned char temp = create_uchar(haff_code, i);
fout.write((const char*)&temp, sizeof(char));
}
fout.close();
fin.close();
destory_haffman(&root);
}
解压部分比较简单,获取字符频率,建树,获取unsigned char,遍历树,遇到叶子节点就输出到解压文件
// Feed the bits of one byte through the Huffman tree, most significant bit
// first.
//
// root - tree root; the walk restarts here after every decoded leaf.
// pos  - node where the walk currently stands.
// temp - the byte whose bits are consumed.
// s    - receives the decoded bytes; at most 8 entries are written.
// slen - set to the number of entries written to s by this call.
// cnt  - running count of bits consumed so far; advanced by this call.
// len  - total number of meaningful bits; the walk stops once cnt reaches it.
//
// Returns the node where the walk stands after this byte.
node* get_res(node* root, node* pos, unsigned char temp, char* s, int &slen, int &cnt, int len)
{
    slen = 0;
    for (int bit = 7; bit >= 0; --bit)
    {
        if (cnt >= len)
        {
            break;
        }
        // A set bit goes right, a clear bit goes left.
        pos = ((temp >> bit) & 1) ? pos->rch : pos->lch;
        ++cnt;
        const bool at_leaf = (pos->lch == NULL) && (pos->rch == NULL);
        if (at_leaf)
        {
            s[slen++] = pos->ch;
            pos = root;
        }
    }
    return pos;
}
void decompress_to_file(string src_file, string dst_file)
{
ifstream fin(src_file);
ofstream fout(dst_file, ios::binary);
int frequency[256];
fin.read((char*)frequency, sizeof(int) * 256);
node* root = build_haffman(frequency);
vector v;
string code[256];
get_haffman_code(root, v, code);
for (int i = 0; i < 256; i++)
{
if (code[i].length() > 0)
{
cout << code[i] << endl;
}
}
int len;
fin.read((char*)&len, sizeof(int));
unsigned char temp;
node *pos = root;
char s[8];
int slen, cnt = 0;
while (!fin.eof())
{
fin.read((char*)&temp, sizeof(char));
pos = get_res(root, pos, temp, s, slen, cnt, len);
for (int i = 0; i < slen; i++)
{
fout << s[i];
}
}
destory_haffman(&root);
fin.close();
fout.close();
}
// Demo driver: compress one fixed file, then decompress the result into a
// third file.
int main()
{
    const string original_path = "/Users/Rubik/Desktop/123.txt";
    const string packed_path = "/Users/Rubik/Desktop/out.txt";
    const string restored_path = "/Users/Rubik/Desktop/456.txt";

    compress_to_file(original_path, packed_path);
    decompress_to_file(packed_path, restored_path);
    return 0;
}
效果如下