Huffman codes

参考维基百科与《算法导论》

霍夫曼编码(Huffman Coding)是一种编码方式,是一种用于无损数据压缩的熵编码(权编码)算法,也称"哈夫曼编码"、"赫夫曼编码"。它由 David A. Huffman 于 1952 年在麻省理工攻读博士时发明,并发表于《一种构建极小多余编码的方法》(A Method for the Construction of Minimum-Redundancy Codes)一文。

在计算机数据处理中,霍夫曼编码使用变长编码表对源符号(如文件中的一个字母)进行编码,其中变长编码表是通过一种评估来源符号出现机率的方法得到的,出现机率高的字母使用较短的编码,反之出现机率低的则使用较长的编码,这便使编码之后的字符串的平均长度、期望值降低,从而达到无损压缩数据的目的。

例如,在英文中,e的出现机率最高,而z的出现概率则最低。当利用霍夫曼编码对一篇英文进行压缩时,e极有可能用一个比特来表示,而z则可能花去25比特(不是26)。用普通的表示方法时,每个英文字母均占用一个字节(byte),即8比特。二者相比,e使用了一般编码的1/8的长度,z则使用了3倍多。倘若我们能实现对于英文中各个字母出现概率的较准确的估算,就可以大幅度提高无损压缩的比例。

霍夫曼树又称最优二叉树,是一种带权路径长度最短的二叉树。所谓树的带权路径长度,就是树中所有的叶结点的权值乘上其到根结点的路径长度(若根结点为0层,叶结点到根结点的路径长度为叶结点的层数)之和。树的路径长度是从树根到每一结点的路径长度之和;带权路径长度记为 $WPL = W_1 L_1 + W_2 L_2 + W_3 L_3 + \cdots + W_n L_n$,其中 $n$ 个权值 $W_i$($i = 1, 2, \ldots, n$)构成一棵有 $n$ 个叶结点的二叉树,相应的叶结点的路径长度为 $L_i$($i = 1, 2, \ldots, n$)。可以证明霍夫曼树的 WPL 是最小的。

Huffman codes_第1张图片

Huffman codes_第2张图片

霍夫曼编码可以有效地压缩数据:通常可以节省20%~90%的空间,具体压缩率依赖于数据的特征。变长编码(variable-length code)赋予高频字符短码字,赋予低频字符长码字,这样可以达到比定长编码好得多的压缩率。前缀码(prefix code),即没有任何码字是其他码字的前缀。

霍夫曼设计了一个贪心算法来构造最优前缀码,被称为霍夫曼编码(Huffman code)。

赫夫曼编码的实现 

在实现中,我们假设C是一个含n个字符的集合,而其中每个字符c∈C都是一个对象,其属性c.freq给出了字符的出现频率。算法自底向上地构建出对应最优编码的二叉树T。它从|C|个叶子结点开始,执行|C|-1次合并操作创建出最终的二叉树。算法使用一个以属性freq为关键字的最小优先队列Q,以识别两个最低频率的对象并将其合并。当合并两个对象时,得到的新对象的频率设置为原来两个对象的频率之和。

Huffman codes_第3张图片


代码实现1:

//=============================================================
// Huffman编码实现(2014/8/18)
// 使用二叉树构建最小优先队列
//=============================================================
#include<stdio.h>
#include<stdlib.h>

// One input symbol together with its frequency.
struct hmap {
	char c;    // the symbol
	int freq;  // how often the symbol occurs
};
typedef struct hmap HMap;

// Huffman tree node; HTree is a pointer to such a node.
struct hnode {
	char c;               // symbol carried by the node
	int freq;             // frequency (weight) of the node
	struct hnode *left;   // left child, NULL when absent
	struct hnode *right;  // right child, NULL when absent
};
typedef struct hnode HNode;
typedef struct hnode *HTree;

// Index arithmetic for a binary heap stored in a 0-based array.
#define LEFT(i)   ((i) * 2 + 1)
#define RIGHT(i)  ((i) * 2 + 2)
#define PARENT(i) (((i) - 1) / 2)

//=============================================================
// Sift the element at index i down until the subtree rooted at i
// is a min-heap ordered by freq.
//   arr: heap storage holding node pointers
//   n:   number of live elements in arr
//   i:   index to start from; both subtrees below i are expected
//        to satisfy the heap property already
//=============================================================
void Heapify(HNode **arr, int n, int i)
{
	int j = i;

	// Only indices 0 .. n/2-1 have at least a left child.
	while (j <= n / 2 - 1) {
		int child = 2 * j + 1; // left child
		if (child + 1 < n && arr[child + 1]->freq < arr[child]->freq)
			child += 1; // right child is the smaller one

		// The parent is already no larger than its smallest child, so
		// nothing further down can be out of order: stop here instead
		// of walking all the way to a leaf.
		if (arr[j]->freq <= arr[child]->freq)
			break;

		HNode *tmp = arr[j];
		arr[j] = arr[child];
		arr[child] = tmp;
		j = child;
	}
}

//=============================================================
// Arrange the n node pointers in arr into a min-heap keyed on freq.
//=============================================================
void BuildHeap(HNode **arr, int n)
{
	// Visit the nodes that have children, from the last one back to
	// the root, sifting each one down.
	int i = n / 2;
	while (i-- > 0)
		Heapify(arr, n, i);
}

//=============================================================
// Remove and return the node with the smallest freq from the heap.
//   arr: heap storage
//   n:   number of live elements before the call
// Returns NULL when the heap is empty (n <= 0). After the call the
// heap holds n - 1 elements and slot n - 1 is set to NULL.
//=============================================================
HNode *ExtractMin(HNode **arr, int n)
{
	if (n <= 0) // empty heap: avoid reading arr[0] / writing arr[-1]
		return NULL;

	HNode *min = arr[0];
	arr[0] = arr[n - 1]; // move the last element to the root ...
	arr[n - 1] = NULL;   // ... and clear the slot it came from
	Heapify(arr, n - 1, 0);

	return min;
}

//=============================================================
// Insert node x into the min-heap.
//   arr: heap storage; slot n must be available for writing
//   n:   number of live elements before the call
//   x:   node to insert
//=============================================================
void MinHeapInsert(HNode **arr, int n, HNode *x)
{
	int slot = n;

	// Percolate up: every parent that outweighs x moves one level
	// down, leaving a hole that climbs towards the root.
	for (; slot > 0; slot = PARENT(slot)) {
		HNode *parent = arr[PARENT(slot)];
		if (parent->freq <= x->freq)
			break;
		arr[slot] = parent;
	}
	arr[slot] = x;
}

//=========================================================
// Huffman编码实现
//=========================================================
HTree Huffman(HMap *C, int n)
{
	HNode **Q = NULL;
	Q = (HNode **)malloc(sizeof(HNode *) * n);
	if (!Q) {
		printf("Q malloc error\n");
		return NULL;
	}
	for (int i = 0; i < n; i++)
		Q[i] = NULL;

	// 初始化Q
	for (int i = 0; i < n; i++) {
		HNode *p = (HNode *)malloc(sizeof(HNode));
		if (!p) {
			printf("p malloc error\n");
			return NULL;
		}
		p->c = C[i].c;
		p->freq = C[i].freq;
		p->left = p->right = NULL;
		Q[i] = p;
	}
	// 建堆,形成最小优先队列
	BuildHeap(Q, n);

	// 建立huffman树
	for (int i = 0; i < n - 1; i++) {
		HNode *z = (HNode *)malloc(sizeof(HNode));
		if (!z) {
			printf("z malloc error\n");
			return NULL;
		}
		HNode *x = ExtractMin(Q, n - i);
		HNode *y = ExtractMin(Q, n - i - 1);
		z->left = x;
		z->right = y;
		z->freq = x->freq + y->freq;
		MinHeapInsert(Q, n - i - 2, z);
	}
	return ExtractMin(Q, 1);
}

// Sample alphabet: each entry is {symbol, frequency}.
HMap C[] = {
	{'a', 35},
	{'b', 13},
	{'c', 12},
	{'d', 16},
	{'e', 9},
	{'f', 5},
	{'g', 10}
};

#include <iostream>
#include <vector>
#include <map>
using namespace std;

typedef vector<int> Huff_code; // variable-length code of one char, stored as one int (0 or 1) per bit
map<char, Huff_code> Huff_Dic;	// huffman coding dictionary: maps each char to its code

//=============================================================
// Walk the Huffman tree and record the code of every leaf in
// Huff_Dic.
//   root:    subtree to process; NULL is accepted and ignored
//   curcode: bits on the path from the tree root to `root`
//            (0 = left edge, 1 = right edge). It is extended while
//            recursing and restored before returning.
// A root that is itself a leaf is stored with an empty code.
//=============================================================
void Huffman_Coding(HTree root, Huff_code& curcode)
{
	if (root == NULL) // e.g. tree construction failed
		return;

	if (root->left == NULL && root->right == NULL) {
		Huff_Dic[root->c] = curcode;
		return;
	}

	// Reuse the caller's buffer instead of copying the prefix twice
	// per internal node: append a bit, recurse, flip it, recurse,
	// then drop it again.
	curcode.push_back(0);
	Huffman_Coding(root->left, curcode);
	curcode.back() = 1;
	Huffman_Coding(root->right, curcode);
	curcode.pop_back();
}

int main()
{
	int n = sizeof(C) / sizeof(C[0]);
	
	HTree root;
	root = Huffman(C, n);

	Huff_code nullcode;
	nullcode.clear();
	Huffman_Coding(root, nullcode);
	// 打印Huffman编码
	for (map<char,Huff_code>::iterator it = Huff_Dic.begin(); 
		it != Huff_Dic.end(); ++it) {
		cout << (*it).first << '\t';
		for (vector<int>::iterator vit = (*it).second.begin(); 
			vit != (*it).second.end(); ++vit) {
			cout << *vit;
		}
		cout << endl;
	}

	system("pause");
	return 0;
}

代码实现2:

//=============================================================
// Huffman编码实现(2014/8/18)
// 使用数组构建最小优先队列
//=============================================================
#include<stdio.h>
#include<stdlib.h>

// One input symbol together with its frequency.
struct hmap {
	char c;    // the symbol
	int freq;  // how often the symbol occurs
};
typedef struct hmap HMap;

// Huffman tree node; HTree is a pointer to such a node.
struct hnode {
	char c;               // symbol carried by the node
	int freq;             // frequency (weight) of the node
	struct hnode *left;   // left child, NULL when absent
	struct hnode *right;  // right child, NULL when absent
};
typedef struct hnode HNode;
typedef struct hnode *HTree;

// Array-backed queue: a table of node pointers plus a parallel
// table of flags marking which entries were already extracted.
struct qnode {
	HNode **node;
	bool *visited;
};
typedef struct qnode QNode;
typedef struct qnode *QUEUE;

//=========================================================
// Allocate an empty min-priority queue for a Huffman tree with n
// leaves.
//   n: number of leaf symbols; must be positive
// The queue has 2 * n - 1 slots (every leaf plus every internal
// node), all node pointers NULL and all visited flags false.
// Returns NULL when n <= 0 or an allocation fails.
//=========================================================
QUEUE CreateQueue(int n)
{
	if (n <= 0) // 2 * n - 1 would not be a usable slot count
		return NULL;

	const int slots = 2 * n - 1;

	QUEUE Q = (QNode *)malloc(sizeof(QNode));
	if (!Q) {
		printf("Q malloc error\n");
		return NULL;
	}

	// Pointer table for all nodes of the tree.
	Q->node = (HNode **)malloc(sizeof(HNode *) * slots);
	if (!Q->node) {
		free(Q);
		printf("Q->node malloc error\n");
		return NULL;
	}
	for (int i = 0; i < slots; i++) {
		Q->node[i] = NULL;
	}

	// One "already extracted" flag per slot.
	Q->visited = (bool *)malloc(sizeof(bool) * slots);
	if (!Q->visited) {
		// Release the member first: reading Q->node after free(Q)
		// would be a use-after-free.
		free(Q->node);
		free(Q);
		printf("Q->visited malloc error\n");
		return NULL;
	}
	for (int i = 0; i < slots; i++) {
		Q->visited[i] = false;
	}

	return Q;
}

//=========================================================
// Return the not-yet-extracted node with the smallest freq and
// mark it as extracted.
//   Q: queue to search; NULL is treated as empty
//   n: leaf count the queue was sized for (2 * n - 1 slots)
// Among equal frequencies the lowest slot index wins.
// Returns NULL when no unvisited node is left.
//=========================================================
HNode *ExtractMin(QUEUE Q, int n)
{
	if (!Q)
		return NULL;

	int index = -1; // slot of the best candidate so far, -1 = none yet
	for (int i = 0; i < 2 * n - 1; i++) {
		if (!Q->node[i] || Q->visited[i])
			continue;
		// Compare against the current best instead of a fixed
		// sentinel so that arbitrarily large frequencies still work.
		if (index == -1 || Q->node[i]->freq < Q->node[index]->freq)
			index = i;
	}

	if (index == -1)
		return NULL;

	Q->visited[index] = true;
	return Q->node[index];
}

//=========================================================
// Store node in the first empty (NULL) slot of the queue.
//   Q:    queue to insert into
//   n:    leaf count the queue was sized for (2 * n - 1 slots)
//   node: node to store
// If every slot is occupied the node is not stored.
//=========================================================
void QueueInsert(QUEUE Q, int n, HNode *node)
{
	for (int slot = 0; slot < 2 * n - 1; ++slot) {
		if (Q->node[slot] != NULL)
			continue; // occupied, keep looking
		Q->node[slot] = node;
		return;
	}
}

//=========================================================
// Huffman编码实现
//=========================================================
HTree Huffman(HMap *C, int n)
{
	QUEUE Q = CreateQueue(n);
	// 初始化Q
	for (int i = 0; i < n; i++) {
		HNode *p = (HNode *)malloc(sizeof(HNode));
		if (!p) {
			printf("p malloc error\n");
			return NULL;
		}
		p->c = C[i].c;
		p->freq = C[i].freq;
		p->left = p->right = NULL;
		Q->node[i] = p;
	}

	// 建立huffman树
	for (int i = 0; i < n - 1; i++) {
		HNode *z = (HNode *)malloc(sizeof(HNode));
		if (!z) {
			printf("z malloc error\n");
			return NULL;
		}
		HNode *x = ExtractMin(Q, n);
		HNode *y = ExtractMin(Q, n);
		z->left = x;
		z->right = y;
		z->freq = x->freq + y->freq;
		QueueInsert(Q, n, z);
	}
	return ExtractMin(Q, n);
}

// Sample alphabet: each entry is {symbol, frequency}.
HMap C[] = {
	{'a', 35},
	{'b', 13},
	{'c', 12},
	{'d', 16},
	{'e', 9},
	{'f', 5},
	{'g', 10}
};

#include <iostream>
#include <vector>
#include <map>
using namespace std;

typedef vector<int> Huff_code; // variable-length code of one char, stored as one int (0 or 1) per bit
map<char, Huff_code> Huff_Dic;	// huffman coding dictionary: maps each char to its code

//=============================================================
// Walk the Huffman tree and record the code of every leaf in
// Huff_Dic.
//   root:    subtree to process; NULL is accepted and ignored
//   curcode: bits on the path from the tree root to `root`
//            (0 = left edge, 1 = right edge). It is extended while
//            recursing and restored before returning.
// A root that is itself a leaf is stored with an empty code.
//=============================================================
void Huffman_Coding(HTree root, Huff_code& curcode)
{
	if (root == NULL) // e.g. tree construction failed
		return;

	if (root->left == NULL && root->right == NULL) {
		Huff_Dic[root->c] = curcode;
		return;
	}

	// Reuse the caller's buffer instead of copying the prefix twice
	// per internal node: append a bit, recurse, flip it, recurse,
	// then drop it again.
	curcode.push_back(0);
	Huffman_Coding(root->left, curcode);
	curcode.back() = 1;
	Huffman_Coding(root->right, curcode);
	curcode.pop_back();
}


int main()
{
	int n = sizeof(C) / sizeof(C[0]);

	HTree root;
	root = Huffman(C, n);

	Huff_code nullcode;
	nullcode.clear();
	Huffman_Coding(root, nullcode);
	for (map<char,Huff_code>::iterator it = Huff_Dic.begin(); 
		it != Huff_Dic.end(); ++it) {
			cout << (*it).first << '\t';
			for (vector<int>::iterator vit = (*it).second.begin(); 
				vit != (*it).second.end(); ++vit) {
					cout << *vit;
			}
			cout << endl;
	}

	system("pause");
	return 0;
}

代码实现3(C++)

引用以妹子的,链接:http://blog.csdn.net/abcjennifer/article/details/8020695

/************************************************************************/
/*	File Name: Huffman.cpp
*		@Function: Lossless Compression
		@Author: Sophia Zhang
		@Create Time: 2012-9-26 10:40
		@Last Modify: 2012-9-26 12:10
*/
/************************************************************************/

#include"iostream"
#include "queue"
#include "map"
#include "string"
#include "iterator"
#include "vector"
#include "algorithm"
using namespace std;

#define NChar 8	//suppose use 8 bits to describe all symbols
#define Nsymbols (1 << NChar)	//2^NChar = 256 symbols (include a-z, A-Z); parenthesized so the macro stays correct inside larger expressions
typedef vector<bool> Huff_code;//variable-length code of one char, one bool per bit
map<char,Huff_code> Huff_Dic;	//huffman coding dictionary: maps each char to its code

/************************************************************************/
/* Huffman tree node.
*  left / right : child subtrees (both NULL for a leaf); owned by this
*                 node and deleted together with it
*  ch           : character stored in the node
*  weight       : frequency of the node
*/
/************************************************************************/
class HTree
{
public :
	HTree* left;
	HTree* right;
	char ch;
	int weight;

	HTree() : left(NULL), right(NULL), ch('\0'), weight(0) {}
	HTree(HTree* l, HTree* r, int w, char c) : left(l), right(r), ch(c), weight(w) {}
	~HTree() { delete left; delete right; }
	bool Isleaf() const { return !left && !right; }

private:
	// The destructor deletes both children, so a member-wise copy would
	// make two nodes delete the same subtrees. Copying is therefore
	// disabled (declared private and never defined).
	HTree(const HTree&);
	HTree& operator=(const HTree&);
};

/************************************************************************/
/* prepare for pointer sorting*/
/*because we cannot use overloading in class HTree directly*/
/************************************************************************/
class Compare_tree
{
public:
	bool operator () (HTree* t1, HTree* t2)
	{
		return t1->weight> t2->weight;
	}
};

/************************************************************************/
/* Build a Huffman tree with a priority queue.
*  frequency : array of Nsymbols counts, indexed by character code
*  Returns the root of the tree, or NULL when frequency is NULL or every
*  count is zero. The caller owns the returned tree.
*/
/************************************************************************/
HTree* BuildTree(int *frequency)
{
	if (frequency == NULL)
		return NULL;

	priority_queue<HTree*,vector<HTree*>,Compare_tree> QTree;

	//1st level: one leaf per character that actually occurs
	for (int i = 0; i < (Nsymbols); i++)
	{
		if (frequency[i])
			QTree.push(new HTree(NULL, NULL, frequency[i], static_cast<char>(i)));
	}

	//no symbol at all: top() on an empty queue would be undefined
	if (QTree.empty())
		return NULL;

	//build: repeatedly merge the two lightest trees
	while (QTree.size() > 1)
	{
		HTree* lc = QTree.top();
		QTree.pop();
		HTree* rc = QTree.top();
		QTree.pop();

		//internal nodes carry no character; '\0' is used as placeholder
		HTree* parent = new HTree(lc, rc, lc->weight + rc->weight, '\0');
		QTree.push(parent);
	}
	//return tree root
	return QTree.top();
}

/************************************************************************/
/* Walk the Huffman tree and record the code of every leaf in Huff_Dic.
*  root    : subtree to process; NULL is accepted and ignored
*  curcode : bits on the path from the tree root to `root`
*            (false = left edge, true = right edge). It is extended while
*            recursing and restored before returning.
*  A root that is itself a leaf is stored with an empty code.
*/
/************************************************************************/
void Huffman_Coding(HTree* root, Huff_code& curcode)
{
	if (root == NULL)
		return;

	if (root->Isleaf())
	{
		Huff_Dic[root->ch] = curcode;
		return;
	}

	//reuse the caller's buffer instead of copying the prefix twice per
	//internal node: append a bit, recurse, flip it, recurse, drop it
	curcode.push_back(false);
	Huffman_Coding(root->left, curcode);
	curcode.back() = true;
	Huffman_Coding(root->right, curcode);
	curcode.pop_back();
}

int main()
{
	int freq[Nsymbols] = {0};
	char *str = "this is the string need to be compressed";

	//statistic character frequency
	while (*str!='\0')
		freq[*str++]++;

	//build tree
	HTree* r = BuildTree(freq);
	Huff_code nullcode;
	nullcode.clear();
	Huffman_Coding(r,nullcode);

	for(map<char,Huff_code>::iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); it++)
	{
		cout<<(*it).first<<'\t';
		std::copy(it->second.begin(),it->second.end(),std::ostream_iterator<bool>(cout));
		cout<<endl;
	}
}




你可能感兴趣的:(Huffman codes)