halfman! halfman! 半人万岁!(来自权力的游戏 Tyrion Lannister)
huffman coding哈夫曼编码的核心是构造哈夫曼树─即最优二叉树,带权路径长度最小的二叉树。它会使出现概率高的字符使用较短的编码,反之出现概率低的则使用较长的编码,这便使编码之后的字符串的平均期望长度降低,可用于数据的无损耗压缩。
如果对多个字符进行哈夫曼编码,如 a(7),b(5),c(2),d(4)四个字符进行编码,其中括号中为该字符平均出现次数,称为权重,就要将四个字符结点作为叶子结点构造一棵二叉树,当这棵二叉树的带权路径长度最小时,最优二叉树就形成了。
结点的路径长度:该结点到根结点之间的分支数目
结点的带权路径长度:结点的路径长度与权重的乘积
树的带权路径长度:所有结点的带权路径长度之和
上图为一棵最优二叉树,结点的权值越大,越靠近根结点。该哈夫曼树的带权路径长度WPL为:7+5x2+2x3+4x3=35
它的哈夫曼编码为:
a: 0 b: 10 c: 110 d: 111
哈夫曼树结点中的parent指针指向父节点的下标编号。
typedef struct htNode
{
char data;//结点中的信息
int weight;//权重
int parent, lc, rc;//分别指向父结点,左右孩子结点
}htNode,*HuffmanTree;
哈夫曼编码保存在二维字符数组中,二维数组的每一行对应一个字符的编码,如下:
typedef char** huffmanCode;
实现哈夫曼编码的函数如下:其中存储哈夫曼结点的ht和存储哈夫曼编码的hc均为引用,调用的select()函数找出一些结点中权重最小的两个结点来创建合适的新节点,并且有目的使最终算法实现的哈夫曼树左边结点权重总是小于右边结点的。
void huffmanCoding(HuffmanTree& ht, huffmanCode& hc,
int* weigh, char* character, int n)
//weigh数组保存字符的权值,character数组保存字符,n表示字符个数
{
if (n < 2)
return;
int m = 2 * n - 1;//8个字符一共需要建立15个结点
ht = new htNode[m];
int i, s1, s2;
for (i = 0; i < n; i++)//对叶子结点进行初始化
{
ht[i].weight = weigh[i];
ht[i].data = character[i];
ht[i].lc = ht[i].rc = ht[i].parent = 0;
}
while (i < m)//进行非叶子结点构造,建成哈夫曼树
{
select(ht, i - 1, s1, s2);//每次挑选当前结点i之前最小的parent为0的两个结点,
//且s1表示的权值总是小于s2的,所以哈夫曼树是权值小的在左边,权值大的在右边。
ht[s1].parent = i;
ht[s2].parent = i;
ht[i].lc = s1;
ht[i].rc = s2;
ht[i].parent = 0;//i在此作为父结点,将其parent总是要置为0的
ht[i].weight = ht[s1].weight + ht[s2].weight;
i++;
}
//哈夫曼树构造完毕
hc = new char*[n];
char* cd = new char[n];//备用字符数组,从后往前输入数据
cd[n - 1] = '\0';
for (i = 0; i <n; i++)
{
int start = n - 1;
int c, f;
for (c = i, f = ht[i].parent;
f != 0; c = f, f = ht[f].parent)//从叶子结点到根结点计算哈夫曼编码
{
if (ht[f].lc == c)//左0右1
cd[--start] = '0';
else
cd[--start] = '1';
}
hc[i] = new char[n - start];//为第i个字符创建存储空间
c = 0;
while (start < n)
hc[i][c++] = cd[start++];//将cd中的字符复制到该字符的哈夫曼编码存储结构中
}
delete[] cd;
}
void deCoding(HuffmanTree ht, int m, char* buff)//buff中为0,1组成的字符串
{
int p = m-1;
while (*buff != '\0'&&p != -1)//直到buff中字符被读取完毕,程序结束。
{
if (*buff == '0')
p = ht[p].lc;
else
p = ht[p].rc;
++buff;
if (!ht[p].lc&&!ht[p].rc)//叶子结点输出
{
std::cout << ht[p].data << ' ';
p = m-1;//又从根结点开始查找
}
}
}
这边列出了一个完整的范例,是对给定的8个字符进行编码,输入字符存储在data数组中,相应的权值存储在weigh数组中,编码之后进行译码。代码如下:其中包含编码输出结果和译码结果
#include
#define N 8
typedef struct htNode
{
char data;//结点中的信息
int weight;//权重
int parent, lc, rc;//分别指向父结点,左右孩子结点
}htNode,*HuffmanTree;
typedef char** huffmanCode;
void huffmanCoding(HuffmanTree& ht, huffmanCode& hc,
int* weigh, char* character, int n);//构建哈夫曼树及哈夫曼编码
void select(HuffmanTree& a, int b, int& s1, int& s2);//查找下标b之前的两个最小值
void deCoding(HuffmanTree ht, int m, char* buff);//哈夫曼编码的译码过程
int main()
{
using namespace std;
char data[N] = { 'a', 'b', 'c', 'd', 'e', 'f','g','h' };
int weigh[N] = { 10, 5, 2, 15, 32, 26, 19, 9 };
HuffmanTree ht;
huffmanCode hc;
huffmanCoding(ht, hc, weigh, data, N);
cout << "哈夫曼编码:" << endl;
int count;
for (int i = 0; i < N; i++)
{
count = 0;
cout << data[i] <<": ";
while (hc[i][count] != '\0')
cout << hc[i][count++] << ' ';
cout << endl;
}
cout << "哈夫曼译码: " << endl;
//char* code = "0110001";//选用f,e,d的哈夫曼编码
const char* c = "0110001";//适用于vs2017版本
int len = strlen(c);
char* code = new char[len+1];
code[len] = '\0';
for (int i = 0; i < len; ++i)
code[i] = c[i];
deCoding(ht, 2 * N - 1, code);
cout << endl;
return 0;
}
void huffmanCoding(HuffmanTree& ht, huffmanCode& hc,
int* weigh, char* character, int n)
//weigh数组保存字符的权值,character数组保存字符,n表示字符个数
{
if (n < 2)
return;
int m = 2 * n - 1;//8个字符一共需要建立15个结点
ht = new htNode[m];
int i, s1, s2;
for (i = 0; i < n; i++)//对叶子结点进行初始化
{
ht[i].weight = weigh[i];
ht[i].data = character[i];
ht[i].lc = ht[i].rc = ht[i].parent = 0;
}
while (i < m)//进行非叶子结点构造,建成哈夫曼树
{
select(ht, i - 1, s1, s2);//每次挑选当前结点i之前最小的parent为0的两个结点,
//且s1表示的权值总是小于s2的,所以哈夫曼树是权值小的在左边,权值大的在右边。
ht[s1].parent = i;
ht[s2].parent = i;
ht[i].lc = s1;
ht[i].rc = s2;
ht[i].parent = 0;//i在此作为父结点,将其parent总是要置为0的
ht[i].weight = ht[s1].weight + ht[s2].weight;
i++;
}
//哈夫曼树构造完毕
hc = new char*[n];
char* cd = new char[n];//备用字符数组,从后往前输入数据
cd[n - 1] = '\0';
for (i = 0; i <n; i++)
{
int start = n - 1;
int c, f;
for (c = i, f = ht[i].parent;
f != 0; c = f, f = ht[f].parent)//从叶子结点到根结点计算哈夫曼编码
{
if (ht[f].lc == c)//左0右1
cd[--start] = '0';
else
cd[--start] = '1';
}
hc[i] = new char[n - start];//为第i个字符创建存储空间
c = 0;
while (start < n)
hc[i][c++] = cd[start++];//将cd中的字符复制到该字符的哈夫曼编码存储结构中
}
delete[] cd;
}
void select(HuffmanTree& a, int b, int& s1, int& s2)
{
int i, temp1=INT_MAX, temp2=INT_MAX;
for (i = 0; i <= b; i++)
{
if (a[i].parent == 0)
{
if (temp1 > a[i].weight)
{
temp1 = a[i].weight;
s1 = i;
}
}
}
for (i = 0; i <= b; i++)
{
if (a[i].parent == 0 && i != s1)
{
if (temp2 > a[i].weight)
{
temp2 = a[i].weight;
s2 = i;
}
}
}
}
void deCoding(HuffmanTree ht, int m, char* buff)//buff中为0,1组成的字符串
{
int p = m-1;
while (*buff != '\0'&&p != -1)//直到buff中字符被读取完毕,程序结束。
{
if (*buff == '0')
p = ht[p].lc;
else
p = ht[p].rc;
++buff;
if (!ht[p].lc&&!ht[p].rc)//叶子结点输出
{
std::cout << ht[p].data << ' ';
p = m-1;//又从根结点开始查找
}
}
}