参考维基百科与《算法导论》
霍夫曼编码(Huffman Coding)是一种编码方式,是一种用于无损数据压缩的熵编码(权编码)算法,也称“哈夫曼编码”、“赫夫曼编码”。它由David A. Huffman于1952年在麻省理工学院攻读博士期间发明,并发表于《一种构建极小多余编码的方法》(A Method for the Construction of Minimum-Redundancy Codes)一文。
在计算机数据处理中,霍夫曼编码使用变长编码表对源符号(如文件中的一个字母)进行编码,其中变长编码表是通过一种评估来源符号出现机率的方法得到的,出现机率高的字母使用较短的编码,反之出现机率低的则使用较长的编码,这便使编码之后的字符串的平均长度、期望值降低,从而达到无损压缩数据的目的。
例如,在英文中,e的出现机率最高,而z的出现概率则最低。当利用霍夫曼编码对一篇英文进行压缩时,e极有可能用一个比特来表示,而z则可能花去25个比特(不是26)。用普通的表示方法时,每个英文字母均占用一个字节(byte),即8个比特。二者相比,e使用了一般编码的1/8的长度,z则使用了3倍多。倘若我们能实现对于英文中各个字母出现概率的较准确的估算,就可以大幅度提高无损压缩的比例。
霍夫曼树又称最优二叉树,是一种带权路径长度最短的二叉树。所谓树的带权路径长度,就是树中所有叶结点的权值乘上其到根结点的路径长度之和(若根结点为0层,叶结点到根结点的路径长度为叶结点的层数)。树的路径长度是从树根到每一结点的路径长度之和;而树的带权路径长度记为WPL=(W1*L1+W2*L2+W3*L3+...+Wn*Ln),其中n个权值Wi(i=1,2,...,n)构成一棵有n个叶结点的二叉树,相应的叶结点的路径长度为Li(i=1,2,...,n)。可以证明霍夫曼树的WPL是最小的。
霍夫曼编码可以有效地压缩数据:通常可以节省20%~90%的空间,具体压缩率依赖于数据的特征。变长编码(variable-length code)赋予高频字符短码字,赋予低频字符长码字,这样可以达到比定长编码好得多的压缩率。前缀码(prefix code)即没有任何码字是其他码字的前缀。
霍夫曼设计了一个贪心算法来构造最优前缀码,被称为霍夫曼编码(Huffman code)。
霍夫曼编码的实现
在实现中,我们假设C是一个含n个字符的集合,而其中每个字符c∈C都是一个对象,其属性c.freq给出了字符的出现频率。算法自底向上地构建出对应最优编码的二叉树T。它从|C|个叶子结点开始,执行|C|-1个“合并”操作创建出最终的二叉树。算法使用一个以属性freq为关键字的最小优先队列Q,以识别两个频率最低的对象并将其合并。当合并两个对象时,得到的新对象的频率设置为原来两个对象的频率之和。
代码实现1:
//=============================================================
// Huffman coding (2014/8/18)
// The min-priority queue is an array-backed binary min-heap of
// HNode pointers keyed on freq.
//=============================================================
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <map>
#include <vector>

using namespace std;

// One input symbol together with its occurrence count.
typedef struct hmap
{
    char c;
    int freq;
} HMap;

// Huffman tree node. Leaves carry a symbol in `c`; internal nodes
// carry '\0' in `c` and the sum of their children's frequencies.
typedef struct hnode
{
    char c;
    int freq;
    struct hnode *left;
    struct hnode *right;
} HNode, *HTree;

// Index arithmetic for a 0-based array-backed binary heap.
#define LEFT(i) (2 * (i) + 1)
#define RIGHT(i) (2 * (i) + 2)
#define PARENT(i) (((i) - 1) / 2)

//=============================================================
// Sift the element at index i down until the min-heap property
// holds for the subtree rooted at i.
//   arr - heap storage (array of node pointers)
//   n   - number of valid elements in arr
//   i   - index of the element to sift down
// Both subtrees of i are expected to already be valid heaps.
//=============================================================
void Heapify(HNode **arr, int n, int i)
{
    if (arr == NULL || i < 0)
        return;

    int j = i;
    while (LEFT(j) < n)
    {
        // Pick the child with the smaller frequency.
        int min = LEFT(j);
        if (RIGHT(j) < n && arr[RIGHT(j)]->freq < arr[min]->freq)
            min = RIGHT(j);

        // Heap property already holds here, so nothing below can change.
        if (arr[j]->freq <= arr[min]->freq)
            break;

        HNode *tmp = arr[j];
        arr[j] = arr[min];
        arr[min] = tmp;
        j = min;
    }
}

//=============================================================
// Rearrange the first n elements of arr into a min-heap.
//=============================================================
void BuildHeap(HNode **arr, int n)
{
    // Leaves are trivially heaps; fix internal nodes bottom-up.
    for (int i = n / 2 - 1; i >= 0; i--)
    {
        Heapify(arr, n, i);
    }
}

//=============================================================
// Remove and return the node with the smallest freq.
//   arr - heap storage
//   n   - number of elements currently in the heap
// Returns NULL when the heap is empty. After the call the heap
// holds n - 1 elements and arr[n - 1] is NULL.
//=============================================================
HNode *ExtractMin(HNode **arr, int n)
{
    if (arr == NULL || n <= 0)
        return NULL;

    HNode *min = arr[0];
    // Move the last element to the root and restore the heap.
    arr[0] = arr[n - 1];
    arr[n - 1] = NULL;
    Heapify(arr, n - 1, 0);
    return min;
}

//=============================================================
// Insert x into a heap that currently holds n elements.
// arr must have room for at least n + 1 pointers.
//=============================================================
void MinHeapInsert(HNode **arr, int n, HNode *x)
{
    // Sift up: shift larger ancestors down until x's slot is found.
    // For n == 0 the loop does not run and x becomes the root.
    while (n > 0 && arr[PARENT(n)]->freq > x->freq)
    {
        arr[n] = arr[PARENT(n)];
        n = PARENT(n);
    }
    arr[n] = x;
}

//=============================================================
// Release every node of the tree rooted at root (NULL is allowed).
//=============================================================
static void FreeTree(HTree root)
{
    if (root == NULL)
        return;
    FreeTree(root->left);
    FreeTree(root->right);
    free(root);
}

//=============================================================
// Build a Huffman tree for the n symbols in C.
//   C - array of symbol/frequency pairs
//   n - number of entries in C
// Returns the root of the tree, or NULL when C is NULL, n <= 0,
// or an allocation fails. The caller owns the returned tree; every
// node was allocated with malloc.
//=============================================================
HTree Huffman(HMap *C, int n)
{
    if (C == NULL || n <= 0)
        return NULL;

    HNode **Q = (HNode **)malloc(sizeof(HNode *) * n);
    if (!Q)
    {
        printf("Q malloc error\n");
        return NULL;
    }
    for (int i = 0; i < n; i++)
        Q[i] = NULL;

    // One leaf per input symbol.
    for (int i = 0; i < n; i++)
    {
        HNode *p = (HNode *)malloc(sizeof(HNode));
        if (!p)
        {
            printf("p malloc error\n");
            for (int k = 0; k < i; k++)
                free(Q[k]);
            free(Q);
            return NULL;
        }
        p->c = C[i].c;
        p->freq = C[i].freq;
        p->left = p->right = NULL;
        Q[i] = p;
    }

    // Turn the leaves into a min-priority queue.
    BuildHeap(Q, n);

    // Repeatedly merge the two least frequent trees; n - 1 merges
    // leave exactly one tree in the heap.
    int size = n;
    while (size > 1)
    {
        HNode *z = (HNode *)malloc(sizeof(HNode));
        if (!z)
        {
            printf("z malloc error\n");
            // Every slot below `size` is the root of a partial tree.
            for (int k = 0; k < size; k++)
                FreeTree(Q[k]);
            free(Q);
            return NULL;
        }
        HNode *x = ExtractMin(Q, size);
        HNode *y = ExtractMin(Q, size - 1);
        size -= 2;

        z->c = '\0'; // internal nodes carry no symbol
        z->left = x;
        z->right = y;
        z->freq = x->freq + y->freq;

        MinHeapInsert(Q, size, z);
        size += 1;
    }

    HTree root = Q[0];
    free(Q); // only the pointer array; the nodes now form the tree
    return root;
}

// Sample alphabet with symbol frequencies.
HMap C[] = {{'a', 35}, {'b', 13}, {'c', 12}, {'d', 16}, {'e', 9}, {'f', 5}, {'g', 10}};

typedef vector<int> Huff_code;  // variable-length code of one symbol, one 0/1 per element
map<char, Huff_code> Huff_Dic;  // symbol -> code dictionary

//=============================================================
// Walk the tree and record the code of every leaf in Huff_Dic.
//   root    - (sub)tree to encode; NULL is ignored
//   curcode - code prefix of the path from the tree root to `root`;
//             left unchanged when the function returns
// A left edge appends 0, a right edge appends 1. When the whole
// tree is a single leaf it receives the one-bit code "0", since an
// empty code cannot represent a symbol.
//=============================================================
void Huffman_Coding(HTree root, Huff_code& curcode)
{
    if (root == NULL)
        return;

    if (root->left == NULL && root->right == NULL)
    {
        if (curcode.empty())
            Huff_Dic[root->c] = Huff_code(1, 0);
        else
            Huff_Dic[root->c] = curcode;
        return;
    }

    // Extend the shared prefix in place instead of copying it per child.
    curcode.push_back(0);
    Huffman_Coding(root->left, curcode);
    curcode.back() = 1;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}

int main()
{
    int n = sizeof(C) / sizeof(C[0]);

    HTree root = Huffman(C, n);
    if (root == NULL)
    {
        cerr << "failed to build the Huffman tree" << '\n';
        return 1;
    }

    Huff_code prefix;
    Huffman_Coding(root, prefix);

    // Print one "symbol<TAB>code" line per symbol.
    for (map<char, Huff_code>::const_iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); ++it)
    {
        cout << it->first << '\t';
        for (Huff_code::const_iterator bit = it->second.begin(); bit != it->second.end(); ++bit)
        {
            cout << *bit;
        }
        cout << '\n';
    }

    FreeTree(root);
    return 0;
}
代码实现2:
//=============================================================
// Huffman coding (2014/8/18)
// The min-priority queue is a flat array that is scanned linearly.
//=============================================================
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <map>
#include <vector>

using namespace std;

// One input symbol together with its occurrence count.
typedef struct hmap
{
    char c;
    int freq;
} HMap;

// Huffman tree node. Leaves carry a symbol in `c`; internal nodes
// carry '\0' in `c` and the sum of their children's frequencies.
typedef struct hnode
{
    char c;
    int freq;
    struct hnode *left;
    struct hnode *right;
} HNode, *HTree;

// Queue storage: 2n-1 node slots plus a parallel "already extracted" flag.
typedef struct qnode
{
    HNode **node;
    bool *visited;
} QNode, *QUEUE;

//=========================================================
// Allocate an empty queue sized for an alphabet of n symbols.
// A Huffman tree over n leaves has 2n-1 nodes, so that many
// slots are reserved. Returns NULL when n <= 0 or an allocation
// fails.
//=========================================================
QUEUE CreateQueue(int n)
{
    if (n <= 0)
        return NULL;

    const int slots = 2 * n - 1;

    QUEUE Q = (QNode *)malloc(sizeof(QNode));
    if (!Q)
    {
        printf("Q malloc error\n");
        return NULL;
    }

    Q->node = (HNode **)malloc(sizeof(HNode *) * slots);
    if (!Q->node)
    {
        free(Q);
        printf("Q->node malloc error\n");
        return NULL;
    }
    for (int i = 0; i < slots; i++)
    {
        Q->node[i] = NULL;
    }

    Q->visited = (bool *)malloc(sizeof(bool) * slots);
    if (!Q->visited)
    {
        // Release the member before the struct that holds it.
        free(Q->node);
        free(Q);
        printf("Q->visited malloc error\n");
        return NULL;
    }
    for (int i = 0; i < slots; i++)
    {
        Q->visited[i] = false;
    }

    return Q;
}

//=========================================================
// Release the queue's own storage. When freeNodes is true the
// nodes stored in the slots are freed as well (each node occupies
// exactly one slot, so no node is freed twice).
//=========================================================
static void DestroyQueue(QUEUE Q, int n, bool freeNodes)
{
    if (Q == NULL)
        return;
    if (freeNodes && Q->node != NULL)
    {
        for (int i = 0; i < 2 * n - 1; i++)
            free(Q->node[i]);
    }
    free(Q->visited);
    free(Q->node);
    free(Q);
}

//=========================================================
// Return the not-yet-extracted node with the smallest freq and
// mark it as extracted. Ties go to the lowest slot index.
// Returns NULL when no such node exists.
//   Q - queue created by CreateQueue(n)
//   n - alphabet size the queue was created with
//=========================================================
HNode *ExtractMin(QUEUE Q, int n)
{
    if (Q == NULL || n <= 0)
        return NULL;

    int index = -1; // slot of the best candidate so far
    for (int i = 0; i < 2 * n - 1; i++)
    {
        if (Q->node[i] == NULL || Q->visited[i])
            continue;
        // Compare against the current best rather than a fixed bound so
        // that arbitrarily large frequencies are still found.
        if (index == -1 || Q->node[i]->freq < Q->node[index]->freq)
            index = i;
    }

    if (index == -1)
        return NULL;

    Q->visited[index] = true;
    return Q->node[index];
}

//=========================================================
// Store node in the first empty slot of the queue. Nothing is
// stored if every one of the 2n-1 slots is already occupied.
//=========================================================
void QueueInsert(QUEUE Q, int n, HNode *node)
{
    for (int index = 0; index < 2 * n - 1; index++)
    {
        if (Q->node[index] == NULL)
        {
            Q->node[index] = node;
            break;
        }
    }
}

//=========================================================
// Release every node of the tree rooted at root (NULL is allowed).
//=========================================================
static void FreeTree(HTree root)
{
    if (root == NULL)
        return;
    FreeTree(root->left);
    FreeTree(root->right);
    free(root);
}

//=========================================================
// Build a Huffman tree for the n symbols in C.
//   C - array of symbol/frequency pairs
//   n - number of entries in C
// Returns the root of the tree, or NULL when C is NULL, n <= 0,
// or an allocation fails. The caller owns the returned tree; every
// node was allocated with malloc.
//=========================================================
HTree Huffman(HMap *C, int n)
{
    if (C == NULL || n <= 0)
        return NULL;

    QUEUE Q = CreateQueue(n);
    if (Q == NULL)
        return NULL;

    // One leaf per input symbol, stored in the first n slots.
    for (int i = 0; i < n; i++)
    {
        HNode *p = (HNode *)malloc(sizeof(HNode));
        if (!p)
        {
            printf("p malloc error\n");
            DestroyQueue(Q, n, true);
            return NULL;
        }
        p->c = C[i].c;
        p->freq = C[i].freq;
        p->left = p->right = NULL;
        Q->node[i] = p;
    }

    // Repeatedly merge the two least frequent trees; n - 1 merges
    // leave exactly one unextracted node, the root.
    for (int i = 0; i < n - 1; i++)
    {
        HNode *z = (HNode *)malloc(sizeof(HNode));
        if (!z)
        {
            printf("z malloc error\n");
            DestroyQueue(Q, n, true);
            return NULL;
        }
        HNode *x = ExtractMin(Q, n);
        HNode *y = ExtractMin(Q, n);

        z->c = '\0'; // internal nodes carry no symbol
        z->left = x;
        z->right = y;
        z->freq = x->freq + y->freq;

        QueueInsert(Q, n, z);
    }

    HTree root = ExtractMin(Q, n);
    DestroyQueue(Q, n, false); // keep the nodes, they now form the tree
    return root;
}

// Sample alphabet with symbol frequencies.
HMap C[] = {{'a', 35}, {'b', 13}, {'c', 12}, {'d', 16}, {'e', 9}, {'f', 5}, {'g', 10}};

typedef vector<int> Huff_code;  // variable-length code of one symbol, one 0/1 per element
map<char, Huff_code> Huff_Dic;  // symbol -> code dictionary

//=============================================================
// Walk the tree and record the code of every leaf in Huff_Dic.
//   root    - (sub)tree to encode; NULL is ignored
//   curcode - code prefix of the path from the tree root to `root`;
//             left unchanged when the function returns
// A left edge appends 0, a right edge appends 1. When the whole
// tree is a single leaf it receives the one-bit code "0", since an
// empty code cannot represent a symbol.
//=============================================================
void Huffman_Coding(HTree root, Huff_code& curcode)
{
    if (root == NULL)
        return;

    if (root->left == NULL && root->right == NULL)
    {
        if (curcode.empty())
            Huff_Dic[root->c] = Huff_code(1, 0);
        else
            Huff_Dic[root->c] = curcode;
        return;
    }

    // Extend the shared prefix in place instead of copying it per child.
    curcode.push_back(0);
    Huffman_Coding(root->left, curcode);
    curcode.back() = 1;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}

int main()
{
    int n = sizeof(C) / sizeof(C[0]);

    HTree root = Huffman(C, n);
    if (root == NULL)
    {
        cerr << "failed to build the Huffman tree" << '\n';
        return 1;
    }

    Huff_code prefix;
    Huffman_Coding(root, prefix);

    // Print one "symbol<TAB>code" line per symbol.
    for (map<char, Huff_code>::const_iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); ++it)
    {
        cout << it->first << '\t';
        for (Huff_code::const_iterator bit = it->second.begin(); bit != it->second.end(); ++bit)
        {
            cout << *bit;
        }
        cout << '\n';
    }

    FreeTree(root);
    return 0;
}
代码实现3(C++)
引用一位妹子的实现,链接:http://blog.csdn.net/abcjennifer/article/details/8020695
/************************************************************************/
/*  File Name: Huffman.cpp
 *  @Function: Lossless Compression
 *  @Author: Sophia Zhang
 *  @Create Time: 2012-9-26 10:40
 *  @Last Modify: 2012-9-26 12:10
 */
/************************************************************************/
#include <algorithm>
#include <iostream>
#include <iterator>
#include <map>
#include <queue>
#include <string>
#include <vector>

using namespace std;

#define NChar 8                  // number of bits used to describe one symbol
#define Nsymbols (1 << NChar)    // 256 distinct symbols; parenthesized so it is safe in any expression

typedef vector<bool> Huff_code;  // variable-length code of one symbol, one bit per element
map<char, Huff_code> Huff_Dic;   // symbol -> code dictionary

/************************************************************************/
/* Huffman tree node.
 *  - left/right: child subtrees, owned by this node (deleted in the
 *    destructor)
 *  - ch: symbol of a leaf ('\0' for internal nodes)
 *  - weight: frequency of the symbol, or the sum of both children
 * Copying is disabled because two copies would delete the same children.
 */
/************************************************************************/
class HTree
{
public:
    HTree* left;
    HTree* right;
    char ch;
    int weight;

    HTree() : left(NULL), right(NULL), ch('\0'), weight(0) {}
    HTree(HTree* l, HTree* r, int w, char c) : left(l), right(r), ch(c), weight(w) {}
    ~HTree()
    {
        delete left;
        delete right;
    }

    // True when the node has no children.
    bool Isleaf() const { return !left && !right; }

private:
    // Declared but not defined: the node owns its subtrees exclusively.
    HTree(const HTree&);
    HTree& operator=(const HTree&);
};

/************************************************************************/
/* Ordering for the priority queue of node pointers: the node with the
 * smallest weight ends up on top. A functor is needed because the queue
 * stores pointers, so an operator< on HTree would not be used.
 */
/************************************************************************/
class Compare_tree
{
public:
    bool operator()(HTree* t1, HTree* t2) const
    {
        return t1->weight > t2->weight;
    }
};

/************************************************************************/
/* Build a Huffman tree with a priority queue.
 *  frequency - array of Nsymbols counters indexed by the symbol's byte
 *              value; entries equal to 0 are skipped
 * Returns the root of the tree, or NULL when frequency is NULL or no
 * symbol has a non-zero count. The caller owns the result and releases
 * the whole tree with `delete`.
 */
/************************************************************************/
HTree* BuildTree(int *frequency)
{
    if (frequency == NULL)
        return NULL;

    priority_queue<HTree*, vector<HTree*>, Compare_tree> QTree;

    // One leaf per symbol that actually occurs.
    for (int i = 0; i < Nsymbols; i++)
    {
        if (frequency[i])
            QTree.push(new HTree(NULL, NULL, frequency[i], static_cast<char>(i)));
    }

    // top() on an empty queue is undefined, so bail out first.
    if (QTree.empty())
        return NULL;

    // Merge the two lightest trees until a single tree remains.
    while (QTree.size() > 1)
    {
        HTree* lc = QTree.top();
        QTree.pop();
        HTree* rc = QTree.top();
        QTree.pop();

        // Internal nodes carry no symbol.
        HTree* parent = new HTree(lc, rc, lc->weight + rc->weight, '\0');
        QTree.push(parent);
    }

    return QTree.top();
}

/************************************************************************/
/* Walk the tree and record the code of every leaf in Huff_Dic.
 *  root    - (sub)tree to encode; NULL is ignored
 *  curcode - code prefix of the path from the tree root to `root`;
 *            left unchanged when the function returns
 * A left edge appends 0, a right edge appends 1. When the whole tree is
 * a single leaf it receives the one-bit code "0", since an empty code
 * cannot represent a symbol.
 */
/************************************************************************/
void Huffman_Coding(HTree* root, Huff_code& curcode)
{
    if (root == NULL)
        return;

    if (root->Isleaf())
    {
        if (curcode.empty())
            Huff_Dic[root->ch] = Huff_code(1, false);
        else
            Huff_Dic[root->ch] = curcode;
        return;
    }

    // Extend the shared prefix in place instead of copying it per child.
    curcode.push_back(false);
    Huffman_Coding(root->left, curcode);
    curcode.back() = true;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}

int main()
{
    int freq[Nsymbols] = {0};
    const char *str = "this is the string need to be compressed";

    // Count symbol frequencies. Index through unsigned char so that bytes
    // above 127 cannot become negative indices where char is signed.
    for (const char *p = str; *p != '\0'; ++p)
        freq[static_cast<unsigned char>(*p)]++;

    // Build the tree and derive the code table.
    HTree* r = BuildTree(freq);
    Huff_code prefix;
    Huffman_Coding(r, prefix);

    // Print one "symbol<TAB>code" line per symbol.
    for (map<char, Huff_code>::const_iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); ++it)
    {
        cout << it->first << '\t';
        std::copy(it->second.begin(), it->second.end(), std::ostream_iterator<bool>(cout));
        cout << '\n';
    }

    delete r; // releases the whole tree; deleting NULL is a no-op
    return 0;
}