Huffman编码是最流行的基于统计学的数据压缩方法,下面我们简单介绍它的实现步骤:
1. 将信源符号按照概率递减顺序排列;
2. 取两个概率最小的符号分别分配以“0”和“1”,然后把它们的概率相加,并作为一个新的符号的概率,与其它未分配符号按照步骤1重新排列;
3. 重复步骤1、2,直至所有概率相加得1;
4. 寻找从每一个信源符号到概率为1处的路径,记录下路径上的“0”和“1”;
5. 写出每一个符号的“0”和“1”序列(从树根到信源符号节点)。
缺点:
从理论上讲,采用Huffman编码可以获得最佳编码效果,但是在实际中,由于计算机中存储和处理的最小数据单位是比特(bit),因此在某些情况下,实际的压缩编码效果往往达不到理论的压缩比。例如:信源符号{X, Y},其对应的概率为{2/3, 1/3},则根据理论计算,符号X, Y的最佳码长分别是:X: $-\log_2(2/3) \approx 0.585$(bit),Y: $-\log_2(1/3) \approx 1.585$(bit)
这表明,要获得最佳效果,符号{X, Y}的码字长度应分别为0.585bit和1.585bit,而计算机不可能有非整数位出现,只能按整数位进行,即采用哈夫曼编码对{X,Y}进行编码,得{X,Y}的码字分别为0和1,也就是两符号的信息编码长度都为1。可见,对于大概率符号X并未赋予较短的码字,实际编码效果没有达到理论编码效果。由上述分析可见,Huffman编码的主要缺点在于其编码方法是对每个符号进行编码,每个符号的码长只能是整数。为此提出算术编码,以解决计算机中必须以整数位进行编码的问题。
编码实现:
头文件
// Huffman.h: interface for the Huffman class.
//
//////////////////////////////////////////////////////////////////////

#if !defined(AFX_HUFFMAN_H__08A1863A_6641_4FE9_9596_5EEBE76B53F7__INCLUDED_)
#define AFX_HUFFMAN_H__08A1863A_6641_4FE9_9596_5EEBE76B53F7__INCLUDED_

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

#include <string>

/*********************** Data structures ***********************/

// One node of the Huffman tree. Nodes live in an array and refer to each
// other by index; index 0 is never used as a node, so a value of 0 in
// parent / lchild / rchild means "no such node".
typedef struct
{
    unsigned int weight;   // weight of the symbol (leaf) or sum of both children (inner node)
    unsigned int parent;   // index of the parent node, 0 while the node is still a root
    unsigned int lchild;   // index of the left child (bit '0'), 0 for a leaf
    unsigned int rchild;   // index of the right child (bit '1'), 0 for a leaf
} HuffTreeNode, *HuffTree;

// One row of the character table: symbol, its weight and its code.
typedef struct
{
    char c;                // the symbol
    unsigned int weight;   // weight (occurrence count) of the symbol
    char *code;            // NUL-terminated string of '0'/'1' characters
} CharMapNode, *CharMap;

/************************* Class definition ****************************/

// Huffman encoder/decoder working on a text string and a string of '0'/'1'
// characters. The character table is 1-based: entries 1..n are valid and
// entry 0 is unused.
class Huffman
{
private:
    // Picks the two parentless nodes with the smallest weights among
    // huffTree[1..n]; s1 receives the smaller one, s2 the second smallest.
    void select(int n, int &s1, int &s2);

    HuffTree huffTree;   // Huffman tree (array of nodes, index 0 unused)
    CharMap chars;       // character table (index 0 unused)
    int n;               // number of symbols in the character table
    std::string text;    // plain text
    std::string code;    // encoded text as a string of '0'/'1' characters

public:
    void InputCharsWeight();                  // read symbols and weights from standard input
    void CountCharsWeight();                  // derive symbols and weights from the plain text
    void Decode();                            // code -> text, using the character table
    void ReadTextFromFile(char *filename);    // load the plain text from a file
    void ReadCodeFromFile(char *filename);    // load the encoded text from a file
    void SaveTextToFile(char *filename);      // write the plain text to a file
    void SaveCodeToFile(char *filename);      // write the encoded text to a file
    void PrintCode();                         // print the encoded text
    void MakeCharMap();                       // build the tree and fill in every symbol's code
    void PrintText();                         // print the plain text
    void PrintCharCode();                     // print every symbol with its code
    void PrintCharWeight();                   // print every symbol with its weight
    void SetCharMap(CharMap m, int number);   // install an externally built character table
    void Encode();                            // text -> code, using the character table

    Huffman();
    virtual ~Huffman();
};

#endif // !defined(AFX_HUFFMAN_H__08A1863A_6641_4FE9_9596_5EEBE76B53F7__INCLUDED_)
Huffman类
// Huffman.cpp: implementation of the Huffman class. // ////////////////////////////////////////////////////////////////////// #include "Huffman.h" #include <iostream> #include <fstream> using namespace std; ////////////////////////////////////////////////////////////////////// // Construction/Destruction ////////////////////////////////////////////////////////////////////// Huffman::Huffman() { huffTree = NULL; chars = NULL; n = 0; } Huffman::~Huffman() { } //对Text串进行哈弗曼编码 void Huffman::Encode() { code = ""; for (string::size_type i = 0; i != text.size(); ++i) { for (int j = 1; j <= n; ++j) if (chars[j].c == text[i]) code += chars[j].code; //code为数组名,可表示数组存放的地址 } } //设置字符表 void Huffman::SetCharMap(CharMap m, int number) { chars = m; n = number; } //在huffTree[1..n]中选择parent为0且weight最小的两个节点,其序号为s1,s2 void Huffman::select(int n, int &s1, int &s2) { s1 = s2 = 0; for (int i = 1; i <= n; ++i) { if (huffTree[i].parent != 0) continue; if (s1 == 0) s1 = i; else if (s2 == 0) { //此处采用的策略,使得整个过程中s1的权值小于s2的权值 if (huffTree[i].weight < huffTree[s1].weight) { s2 = s1; s1 = i; } else s2 = i; } else { if (huffTree[i].weight < huffTree[s1].weight) { s2 = s1; s1 = i; } else if (huffTree[i].weight < huffTree[s2].weight) s2 = i; } } } void Huffman::PrintCharWeight() { for (int i = 1; i <= n; ++i) { /* switch (chars[i].c) { case '\t': cout << "\\t"; break; case '\n': cout << "\\n"; break; default:*/ cout << chars[i].c; // break; //} cout << "——" << chars[i].weight << endl; } } void Huffman::PrintCharCode() { for (int i = 1; i <= n; ++i) { /*switch (chars[i].c) { case '\t': cout << "\\t"; break; case '\n': cout << "\\n"; break; default:*/ cout << chars[i].c; // break; //} cout << "——" << chars[i].code << endl; } } //输出文本串 void Huffman::PrintText() { cout << text << endl; } //输出0-1编码 void Huffman::PrintCode() { cout << code << endl; } //根据各字符的权值建立字符-编码表 void Huffman::MakeCharMap() { if (n <= 1) return; int m = 2 * n - 1; //哈弗曼树所需节点数 huffTree = new HuffTreeNode[m+1]; //0号单元未使用 //初始化 int i; 
for (i = 1; i <= n; ++i) //从1开始 { huffTree[i].weight = chars[i].weight; huffTree[i].parent = 0; huffTree[i].lchild = 0; huffTree[i].rchild = 0; } for (i = n + 1; i <= m; ++i) { huffTree[i].weight = 0; huffTree[i].parent = 0; huffTree[i].lchild = 0; huffTree[i].rchild = 0; } //建哈弗曼树 for (i = n + 1; i <= m; ++i) { int s1,s2; select(i - 1, s1, s2); huffTree[s1].parent = huffTree[s2].parent = i; huffTree[i].lchild = s1; huffTree[i].rchild = s2; huffTree[i].weight = huffTree[s1].weight + huffTree[s2].weight; } //从叶子到根节点逆向求每个字符的哈弗曼编码 char *cd = new char[n]; //分配求编码的工作空间(每个字符编码结果最长n-1再加上'\0') cd[n-1] = '\0'; //编码结束符 for(i = 1; i <= n; ++i) //逐个字符求哈弗曼编码 { int start = n - 1; int c,f; //从叶子到根逆向求编码 for (c = i, f = huffTree[i].parent; f != 0; c = f, f = huffTree[f].parent) { if (huffTree[f].lchild == c) //左孩子编码为0 cd[--start] = '0'; else //右孩子编码为1 cd[--start] = '1'; } chars[i].code = new char[n - start]; //为第i个字符编码分配空间 strcpy(chars[i].code,&cd[start]); } delete cd; } //从文件读入原文 void Huffman::ReadTextFromFile(char *filename) { ifstream infile(filename); if(!infile) { cerr << "无法打开文件!" <<endl; return; } char c; while(infile.get(c)) { text += c; } } //将编码存入文件 void Huffman::SaveCodeToFile(char *filename) { ofstream outfile(filename); if (!outfile) { cerr << "保存文件出错!" << endl; return; } outfile << code; } //从文件读入编码 void Huffman::ReadCodeFromFile(char *filename) { ifstream infile(filename); if (!infile) { cerr << "无法打开文件!" 
<<endl; return; } infile >> code; } //将0-1编码串解码 void Huffman::Decode() { text = ""; string::size_type i,count; for (i = 0; i < code.size(); i += count) { //每个字符的编码结果最长n-1,从1至n-1依次尝试 for (count = 1; count < n; ++count) { for (int j = 1; j <= n; ++j) if (code.substr(i, count) == chars[j].code)//code.substr(a,b)表示对字符串code截取从第a个到第b个,依次与chars[].code比较 { text += chars[j].c; //text表示原文存放的数组名,作指针使用 goto next; } } next: ; } } //统计原文中各字符的权值 void Huffman::CountCharsWeight() { if (text.empty()) return; if (chars != NULL) delete chars; int i = 0; n = 0; chars = new CharMapNode[2]; chars[1].c = text[i]; chars[1].weight = 1; ++n; for (i = 1; i != text.size(); ++i) { int j; for (j = 1; j <= n; ++j) //遍历当前字符表,如果已存在该字符,权值+1 { if (text[i] == chars[j].c) { ++chars[j].weight; break; } } if (j > n) //该字符不存在,添加该字符 { ++n; CharMap newchars = new CharMapNode[n + 1]; memcpy(newchars, chars, n * sizeof(CharMapNode)); delete chars; chars = newchars; chars[n].c = text[i]; chars[n].weight = 1; } } } //输入字符和对应权值 void Huffman::InputCharsWeight() { cout << "请输入字符集大小n(n>1):" << endl; cin >> n; if (chars != NULL) delete chars; chars = new CharMapNode[n+1]; //0号单元未使用 cout << "请输入字符和权值:" << endl; for (int i = 1; i <= n; ++i) { cin.ignore(); //清除输入缓冲区 cin.get(chars[i].c); //输入单个字符,可以是空白符 cin >> chars[i].weight; } } void Huffman::SaveTextToFile(char *filename) { ofstream outfile(filename); if (!outfile) { cerr << "保存文件出错!" << endl; return; } outfile << text; }
主函数
#include <iostream> #include "Huffman.h" using namespace std; int main() { Huffman huffman; huffman.ReadTextFromFile("text.txt"); /****************第一步输入字符和对应权值******************/ cout << "请选择: 1.程序自动统计字符和权值(推荐) 2.用户输入" << endl; int r; do { cin >> r; } while((r != 1) && (r != 2)); if (r == 1) huffman.CountCharsWeight(); else huffman.InputCharsWeight(); cout << "字符及对应权值:" << endl; huffman.PrintCharWeight(); //计算每个字符对应的权值 system("pause"); cout << endl; /****************第二步建哈弗曼树,输出字符与编码的对应关系******************/ huffman.MakeCharMap(); //实现哈弗曼编码,对应存入chars[i].c-chars[i].code cout << "字符及对应的编码:" << endl; huffman.PrintCharCode(); //打印每个字符及其对应的编码,即chars[i].c-chars[i].code system("pause"); cout << endl; /****************第三步对字符进行编码,将结果输出并存入文件******************/ cout << "对原文进行编码:" << endl; cout << "原文:" << endl; huffman.PrintText(); //输出文本串 huffman.Encode(); //对文本串进行编码 cout << "编码:" << endl; huffman.PrintCode(); huffman.SaveCodeToFile("code.txt"); system("pause"); cout << endl; /****************第四步从文件读入0、1代码串解码后输出并存入文件******************/ cout << "对编码进行解码:" << endl; huffman.ReadCodeFromFile("code.txt"); cout << "编码:" << endl; huffman.PrintCode(); huffman.Decode(); cout << "原文:" << endl; huffman.PrintText(); huffman.SaveTextToFile("resulttext.txt"); cout << "\n Over ^_^" << endl; system("pause"); return 0; }
输出界面