给定n个权值作为n个叶子结点,构造一棵二叉树,若该树的带权路径长度达到最小,称这样的二叉树为最优二叉树,也称为哈夫曼树(Huffman Tree)。哈夫曼树是带权路径长度最短的树,权值较大的结点离根较近。
本项目使用的Huffman Tree的K-V里面存放的是字符以及字符出现的次数,构建成Huffman Tree 大体如下:
根据这棵树,我们可以得到每个字符的编码,例如a的编码就是1,d的编码就是000,c的编码就是001。那么问题来了,我们要如何得到这棵树呢?得到的编码又该如何处理呢?
构建Huffman Tree我们采用贪心算法。首先要生成一张关于字符的K-V模型的哈希表(采用直接定址法)。
// Direct-address table: one CharInfo slot for each of the 256 possible byte values.
CharInfo _hashInfos[256];
// Compact (character, count) record. FileCompress::Compress writes one of these
// per occurring character into the compressed file so the table can be rebuilt.
struct configInfo{
char _ch; // the character
LongType _count; // number of times the character occurs
};
注意:此处哈希表的下标范围是[0, 255],而有符号 char 的取值范围是[-128, 127],所以统计字符个数时要先把字符强转成无符号类型,例如:_hashInfos[(unsigned char)ch]._count++
// Builds the Huffman tree from the n weights in w.
//
// w       - array of n candidate weights
// n       - number of entries in w
// invalid - placeholder weight; entries for which `w[i] != invalid` is false are skipped
//
// After construction _root is the root of the tree, or null when no entry of
// w was usable (the original called top() on an empty queue in that case).
HuffmanTree(W* w, size_t n, const W& invalid)
{
    // NodeCompare orders by `>`, so top() is always the node with the smallest weight.
    std::priority_queue<Node*, std::vector<Node*>, NodeCompare> minHeap;
    for (size_t i = 0; i < n; ++i){
        if (w[i] != invalid)
            minHeap.push(new Node(w[i]));
    }

    // Greedy step: repeatedly merge the two lightest subtrees under a new parent.
    while (minHeap.size() > 1){
        Node* left = minHeap.top();
        minHeap.pop();
        Node* right = minHeap.top();
        minHeap.pop();

        Node* parent = new Node(left->_w + right->_w);
        parent->_left = left;
        parent->_right = right;
        left->_parent = parent;
        right->_parent = parent;
        minHeap.push(parent);
    }

    // top() on an empty priority_queue is undefined behaviour, so guard it.
    _root = minHeap.empty() ? nullptr : minHeap.top();
}
// Recursively assigns a code string to every node below `root`: a left child
// gets its parent's code followed by '0', a right child the parent's code
// followed by '1'. When a leaf is reached, its code is copied into the
// _hashInfos slot indexed by the leaf's character.
void generateHuffmanCode(Node* root)
{
    if (!root)
        return;

    Node* const zeroChild = root->_left;
    Node* const oneChild = root->_right;

    if (zeroChild == nullptr && oneChild == nullptr){
        // Leaf: publish the accumulated code. The cast keeps the index in
        // [0, 255] even when char is signed.
        _hashInfos[(unsigned char)root->_w._ch]._code = root->_w._code;
        return;
    }

    if (zeroChild != nullptr){
        // Going left appends '0'.
        zeroChild->_w._code = root->_w._code;
        zeroChild->_w._code += '0';
        generateHuffmanCode(zeroChild);
    }

    if (oneChild != nullptr){
        // Going right appends '1'.
        oneChild->_w._code = root->_w._code;
        oneChild->_w._code += '1';
        generateHuffmanCode(oneChild);
    }
}
// HuffmanTree.h
#ifndef __HUFFMANTREE_H__
#define __HUFFMANTREE_H__
#include <cstddef>
#include <queue>
#include <vector>
// A single node of the Huffman tree: three links plus the weight it carries.
template<class W>
class HuffmanTreeNode{
public:
    HuffmanTreeNode* _left = nullptr;    // left child, null until linked
    HuffmanTreeNode* _right = nullptr;   // right child, null until linked
    HuffmanTreeNode* _parent = nullptr;  // parent link, null until linked
    W _w;                                // weight stored in this node

    // Creates an unlinked node holding a copy of w.
    HuffmanTreeNode(const W& w)
        : _w(w)
    {}
};
// 哈弗曼树
template<class W>
class HuffmanTree{
typedef HuffmanTreeNode Node;
public:
HuffmanTree()
:_root(NULL)
{}
// 仿函数(根据字符出现的次数进行比较)
struct NodeCompare{
bool operator()(const Node* l, const Node* r){
return l->_w > r->_w;
}
};
HuffmanTree(W* w, size_t n, W& invalid)
{
// 构建哈弗曼树
priority_queuevector, NodeCompare> minHead;
size_t i = 0;
for (i = 0; i < n; ++i){
if (w[i] != invalid)
minHead.push(new Node(w[i]));
}
while (minHead.size() > 1){
Node* left = minHead.top();
minHead.pop();
Node* right = minHead.top();
minHead.pop();
Node* parent = new Node(left->_w + right->_w);
parent->_left = left;
parent->_right = right;
minHead.push(parent);
}
_root = minHead.top();
}
~HuffmanTree()
{
Destory(_root);
_root = NULL;
}
void Destory(Node* root)
{
if (root == NULL){
return;
}
Destory(root->_left);
Destory(root->_right);
delete root;
}
Node* GetRoot()
{
return _root;
}
private:
HuffmanTree(const HuffmanTree& t);
HuffmanTree& operator=(const HuffmanTree& t);
protected:
Node* _root;
};
#endif //__HUFFMANTREE_H__
// FileCompress.h
#ifndef __FILECOMPRESS_H__
#define __FILECOMPRESS_H__
#include "HuffmanTree.h"
#include
#include
#include
#include
#include
using namespace std;
typedef long long LongType;

// Per-character record used as the weight type of the Huffman tree.
struct CharInfo{
    char _ch = 0;         // the character (byte value)
    std::string _code;    // its Huffman code, one '0'/'1' character per bit
    LongType _count = 0;  // number of occurrences

    // Weight of a merged parent node: only the counts are added. The result's
    // _ch is 0 and its _code is empty (previously _ch was left uninitialised).
    CharInfo operator+(const CharInfo& info) const
    {
        CharInfo ret;
        ret._count = _count + info._count;
        return ret;
    }

    // Orders records by occurrence count only.
    bool operator>(const CharInfo& info) const
    {
        return _count > info._count;
    }

    // Two records differ when their counts differ; _ch and _code are ignored.
    // Used to test a record against the "invalid" placeholder.
    bool operator!=(const CharInfo& invalid) const
    {
        return _count != invalid._count;
    }
};
class FileCompress{
typedef HuffmanTreeNode Node;
public:
struct configInfo
{
char _ch;
LongType _count;
};
FileCompress()
{
size_t i = 0;
for (i = 0; i < 256; ++i){
_hashInfos[i]._ch = i;
_hashInfos[i]._count = 0;
}
}
void Compress(const char* file)
{
// 1.统计文件中字符出现的次数
ifstream ifs(file, ios_base::in | ios_base::binary);
char ch;
while (ifs.get(ch)){
// 因为 char的范围是-127-128,而使用负数做下标对哈希表进行访问会出错,所以强转成无符号的
_hashInfos[(unsigned char)ch]._count++;
}
// 2.生成哈弗曼树
CharInfo invalid;
invalid._count = 0;
HuffmanTree tree(_hashInfos, 256, invalid);
// 3.生成 Huffman code
generateHuffmanCode(tree.GetRoot());
// 4.压缩
string compressFile = file;
// 给压缩文件添加后缀
compressFile += ".huffman";
ofstream ofs(compressFile.c_str(), ios_base::out | ios_base::binary);
// 把字符和次数也写到压缩文件中,方便解压时生成 _hashInfos
for (size_t i = 0; i < 256; ++i){
if (_hashInfos[i]._count > 0){
configInfo info;
info._ch = _hashInfos[i]._ch;
info._count = _hashInfos[i]._count;
ofs.write((char*)&info, sizeof(configInfo));
}
}
configInfo end;
end._count = 0;
ofs.write((char*)&end, sizeof(configInfo));
ifs.clear(); //清理一下,下面seekg才起作用
ifs.seekg(0);
char value = 0;
int pos = 0;
//把编码写入到压缩文件中
while (ifs.get(ch)){
string& code = _hashInfos[(unsigned char)ch]._code;
for (size_t i = 0; i < code.size(); ++i){
if (code[i] == '0') value &= (~(1 << pos));
else if (code[i] == '1') value |= (1 << pos);
else assert(false);
++pos;
if (pos == 8){
ofs.put(value);
pos = 0;
value = 0;
}
}
}
// 如果最后一个字节没填满,直接放进去
if (pos > 0) ofs.put(value);
}
void generateHuffmanCode(Node* root)
{
if(root == nullptr)
return;
if(!root->_left && !root->_right){
_hashInfos[(unsigned char)root->_w._ch]._code = root->_w._code;
return;
}
// 往左边走,把左孩子的 code 加上'0'
if(root->_left){
root->_left->_w._code = root->_w._code + '0';
generateHuffmanCode(root->_left);
}
// 往右边走,把右孩子的 code 加上'1'
if(root->_right){
root->_right->_w._code = root->_w._code + '1';
generateHuffmanCode(root->_right);
}
}
void Uncompress(const char* file)
{
// 1. 打开压缩文件,进行解压
ifstream ifs(file, ios_base::in | ios_base::binary);
string uncompressfile = file;
size_t pos = uncompressfile.rfind('.');
assert(pos != string::npos);
uncompressfile.erase(pos);
uncompressfile += ".unhuffman";
ofstream ofs(uncompressfile.c_str(), ios_base::out | ios_base::binary);
// 重构 _hashInfos
while (1){
configInfo info;
ifs.read((char*)&info, sizeof(configInfo));
if (info._count > 0)
{
_hashInfos[(unsigned char)info._ch]._count = info._count;
}
else
{
break;
}
}
// 2. 重建Huffman Tree
CharInfo invalid;
invalid._count = 0;
HuffmanTree tree(_hashInfos, 256, invalid);
// 3. 根据 Huffman Code 解压缩
char ch;
Node* root = tree.GetRoot();
LongType fileCount = root->_w._count; //记录了文件总字符的个数,为解压做准备
Node* cur = root;
while (ifs.get(ch)){
for (size_t pos = 0; pos < 8; ++pos){
if (ch & (1 << pos)) cur = cur->_right; // 1
else cur = cur->_left; // 0
if (!cur->_left && !cur->_right) { //
ofs.put(cur->_w._ch);
cur = root;
if (--fileCount == 0) break; // 解压完成
}
}
}
}
private:
CharInfo _hashInfos[256];
};
// Manual smoke test: compresses "Input.txt" from the working directory
// (Compress names its output by appending ".huffman").
// Declared inline because it is defined in a header; without it, including the
// header from two translation units gives a duplicate-definition link error.
inline void TestFileCompress()
{
    FileCompress fc;
    fc.Compress("Input.txt");
}
// Manual smoke test: decompresses "Input.txt.huffman" from the working directory.
// Declared inline because it is defined in a header; without it, including the
// header from two translation units gives a duplicate-definition link error.
inline void TestFileUncompress()
{
    FileCompress fc;
    fc.Uncompress("Input.txt.huffman");
}
#endif //__FILECOMPRESS_H__
| 源文件类型 | 源文件大小 | 压缩文件大小 | 压缩率 |
|---|---|---|---|
| txt文本文档 | 3502KB | 2546KB | 72.70% |
| 视频文件 | 56.5MB | 56.24MB | 99.53% |