hello everybody!你们机智大气的阿俊又回来了,最近事比较多,闲话少说,直接切入正题,聊聊如何给一篇全为英文字符的文章利用哈夫曼编码得到每个字符的最优编码,并完成解码功能,注意,这次也是用文件操作哟,今天可被二进制文件折磨惨了,不过搞懂后真好用,呜呜呜,我该不会是个受虐狂叭。。。
哈夫曼编码思想很简单,每次从已有序列中挑出两个权值最小的节点,这两个节点作为一个新根节点的左右子树,同时从原有序列删除这两个节点,重复过程,直到仅剩一棵树为止
听起来容易,还是先上一个实例帮助大家理解算法执行过程叭
以下例子的执行过程是我在考前做得整理,时间仓促,有些粗糙,大家凑合着看
至此已构造好哈夫曼树,编码只需从叶子节点找到根,逆向输出即可;解码则从根开始寻找,0左1右,碰到叶子停止
掌握思想后,那就进入实战演练叭~
从前的我是读入一个字符,查找一下数组里有没有,没有的话填入,次数加1,有的话直接次数加1,由于每次都得从头找到尾,效率不高;现在的我,拥有武林秘籍–数据结构,自然采取更简便的方法,想知道吗?那请向下看
由于字符个数有限,所有英文字符不超过255,所以将字符的ASCII码作为下标,数组值作为次数,每读一个值只要将其转化为ASCII码即可(类似希尔映射)
所以关键在于如何将字符转化为ASCII码,很简单,只要将字符赋给一个整型变量,编译器自动帮你转化为ASCII码。爱思考的小伙伴就会问“那ASCII码转化为字符咋办?”,嘿嘿嘿,和之前步骤相似,将整型赋给一个字符变量,转化完成,耶~~
瞧瞧可爱的代码
//统计每个字符出现次数:只有英文符号,否则报错
void Count_Character_Occur_Frequency()
{
int cof[256];//存储相应字符出现的次数,字符ASCII为下标。charater_occur_frequency
for(int i = 0; i < 256; i++)//初始化字符出现次数统计表
{
cof[i] = 0;
}
//从源文件按行读取,并统计字符个数,由于字符个数有限,所以用字符的ASCII码作为数组下标,数组值作为次数,类似哈希映射
fstream inFile("source.txt",ios::in);
if(!inFile)cout<<"source.txt 打开失败!"<<endl;
int sum = 0;//总行数,记录换行个数
string s;//存放一行
while(true)
{
getline(inFile,s);
if(!inFile)break;//避免重复读取最后一个字符
sum++;
for(int i = 0; i < s.size(); i++)
{
int a = s[i];//cout<<"a:"<
cof[a]++; //计数
}
}
inFile.close();//好习惯
int a = '\n';//换行符
cof[a] = sum; //换行符个数
//=======将所有出现的字符及其次数写入文件(类似全局数组)=========
int n = 0;//计算出现字符总个数
for(int i = 0; i < 256; i++)
{
if(cof[i] != 0)n++;
}
fstream outFile("字频表.txt",ios::out);//写入若是无此文件,系统自动创建
if(!outFile)cout<<"字频表.txt 打开失败!"<<endl;
outFile<<n<<endl;//写入字符总个数
//打印调试&&写入文件
for(int i = 0; i < 256; i++)
{
if(cof[i] != 0)
{
char ch = i - '\0';
// cout<<"i: "<
outFile<<i<<" "<<cof[i]<<endl;//写入文件
}
}
outFile.close();
}
算法思想很简单,把n个带权节点看成n棵树,每次选取根节点权值最小的两棵树构建出一颗新二叉树,加入树的集合中,新二叉树的权值为左右子树根节点权值之和,同时删除被选中的两棵二叉树,此时变成n-1棵树。重复以上过程知道仅有一颗树存在,那棵树就是哈夫曼树
具体算法步骤我就不写了,最好是能跟着算法走一遍,体会一下
这步依赖于字频表的建立
//创建哈夫曼树
void CreateHT()
{
HuffmanTree HTree;
fstream inFile("字频表.txt",ios::in);
if(!inFile)cout<<"字频表.txt 打开失败!"<<endl;
int n;//节点个数
inFile>>n;
HTree = (HTNode*)malloc(2*n*sizeof(HTNode));//哈夫曼构造,共需2n-1个,0号单元不用
for(int i = 1; i < 2*n; i++)//初始化 1
{
HTree[i].ascii = HTree[i].lchild = HTree[i].parent = HTree[i].rchild = HTree[i].weight = 0;//0号单元无用
}
for(int i = 1; i <= n; i++)//初始化 2,从文件读取ASCII码及相应权值
{
inFile>>HTree[i].ascii>>HTree[i].weight;
}
inFile.close();
for(int i = n+1; i < 2*n; i++)//从n+1开始,进行n-1次计算
{
//==============寻找最小,次小值,记录其下标 =========
int min1 = MIN1,min2 = MIN2;
int index1 = 0,index2 = 0;
for(int j = 1; j < i; j++)//i是即将要被填入的根节点
{
if(HTree[j].parent == 0)//双亲为0表示尚待操作
{
if(min1 > HTree[j].weight)
{
min2 = min1;//先赋给次小值
index2 = index1;
min1 = HTree[j].weight;
index1 = j;
}
else if(min2 > HTree[j].weight)
{
min2 = HTree[j].weight;
index2 = j;
}
}
}
//==============五处状态更新==================================
HTree[i].weight = HTree[index1].weight + HTree[index2].weight;//双亲权值更新
HTree[index1].parent = HTree[index2].parent = i;//孩子的双亲节点更新
if(HTree[index1].weight < HTree[index2].weight)//1,两个节点权值不同,左小右大 ;相同,下标小者在左
{
HTree[i].lchild = index1;//下标赋值
HTree[i].rchild = index2;
}
else if(HTree[index1].weight > HTree[index2].weight)
{
HTree[i].lchild = index2;
HTree[i].rchild = index1;
}
else
{
if(index1 < index2)
{
HTree[i].lchild = index1;
HTree[i].rchild = index2;
}
else
{
HTree[i].lchild = index2;
HTree[i].rchild = index1;
}
}
}
//====================写入文件=====================
fstream outFile("哈夫曼树.txt",ios::out);
if(!outFile)cout<<"哈夫曼树.txt 无法打开!"<<endl;
outFile<<n<<endl;//节点个数
for(int i = 1; i < 2*n; i++)//打印输出调试
{
outFile<<" "<<HTree[i].ascii <<" "<<HTree[i].weight<<" "<<HTree[i].parent<<" "<<HTree[i].lchild<<" "<<HTree[i].rchild<<endl;
}
outFile.close();
//==========建立编码表,写入字符,权值,编码==================
outFile.open("哈夫曼编码表.txt",ios::out);
if(!outFile)cout<<"哈夫曼编码表.txt 打开失败!"<<endl;
//利用栈从叶子出发读取每个字符的编码,在写入文件
stack<char> code;//存储编码
for(int i = 1; i <= n; i++)//对n个字符分别求编码
{
int j = i;
do{
int p = HTree[j].parent;
if(p != 0)
{
int l,r;
l = HTree[p].lchild;
r = HTree[p].rchild;
if(j == l)code.push('0');
if(j == r)code.push('1');
j = p;
}
}while(HTree[j].parent != 0);
outFile<<HTree[i].ascii<<" "<<HTree[i].weight<<" ";//写入字符,权值
while(!code.empty())
{
outFile<<code.top();//写入编码
code.pop();
}outFile<<endl;
}
outFile.close();
}
万事具备,只欠东风咯,前期工具已就位,现在拿来实战,看看好不好用。
根据咱们针对原文本source.txt建立的字频表,在此基础上建立哈夫曼树并求出每个字符的编码,现在求出整个文本的编码
其实思想和之前的提到的哈希映射一致,字符数有限,所以把每个字符的ASCII码当字符串数组的下标,字符串存相应编码,对文本编码时,每读入一个字符,映射取值就OK啦
话不多说,直接上菜,注意二进制文件写入
void Encode()
{
fstream inFile("哈夫曼编码表.txt",ios::in);
if(!inFile)cout<< "哈夫曼编码表.txt"<<endl;
string s,codeList[256];//将编码表从文件读入该数组中,ASCII码为下标,类似哈希映射
int ch,w;
while(true)
{
inFile>>ch>>w>>s;
if(!inFile)break;
// cout<<" ch:"<
codeList[ch] = s;
}
inFile.close();
inFile.open("source.txt",ios::in);
if(!inFile)cout<<"source.txt 打开失败!"<<endl;
fstream outFile("code.dat",ios::out|ios::binary);
if(!outFile)cout<<"code.dat打开失败!"<<endl;
string s2;
while(true)
{
getline(inFile,s);
if(!inFile)break;
int a;
for(int i = 0; i < s.size(); i++)
{
a = s[i];//转化为ASCII码
int j;
for(j = 0; j < codeList[a].size();j++)
{
s2 = codeList[a];
code[j] = s2[j];
}code[j]='\0';//!!!关键的一句
outFile.write((char*)code,20*sizeof(char));
}
a = '\n';//换行符手动加入,因为getline读不出来
for(int j = 0; j < codeList[a].size();j++)
{
code[j] = (codeList[a])[j];
}
outFile.write((char*)code,20*sizeof(char));
}
inFile.close();
outFile.close();
}
至此咱们已经可以对一篇文章编码啦,编码也是一种加密方式哈,没人能看得懂,嘿嘿嘿,偷偷传递一些消息就不会被识破啦(不过仅限不太重要的事,重要的得要高级加密算法啦)。有加密,就会有人解密,这是最吸引人的地方,看着一堆杂乱无章的二进制串在你的神奇程序下还原出一片文辞优美的文章,想想都令人兴奋呢,那还等什么,快来叭~
解码原理相对容易,碰到0向左走,遇到1向右走,如果该点是叶子就找到了字符,继续从头开始处理0,1串
粗暴一点,没法做个安静的美男子咯
//解码
void Decode()
{
//==============读入哈夫曼树====================
fstream inFile("哈夫曼树.txt",ios::in);
if(!inFile)cout<<"哈夫曼树.txt 打开失败!"<<endl;
int n;
inFile>>n;//n个节点
HuffmanTree HTree;
HTree = (HTNode*)malloc(2*n*sizeof(HTNode));
for(int i = 1; i < 2*n; i++)
{
inFile>>HTree[i].ascii >>HTree[i].weight>>HTree[i].parent>>HTree[i].lchild>>HTree[i].rchild;
}
inFile.close();
//========处理编码信息=====================
inFile.open("code.dat",ios::in|ios::binary);
if(!inFile)cout<<"code.dat 打开失败!"<<endl;
fstream outFile("recode.txt",ios::out);//存储解码结果
if(!outFile)cout<<"recode.txt 打开失败!"<<endl;
//=================解码开始===============================
char ch;
int root = 2*n - 1;
while(true)
{
inFile.read((char*)code,20*sizeof(char));
if(!inFile)break;
// cout<<"ch: "<
for(int i = 0; code[i] != '\0'; i++)
{//cout<
ch = code[i];
if(ch == '0')root = HTree[root].lchild;
else if(ch == '1')root = HTree[root].rchild;
if(HTree[root].lchild == 0)
{//cout<
char cht = HTree[root].ascii;
outFile<<cht;
root = 2*n - 1;
}
}
}
outFile.close();
}
tips:
元旦长假第一天,祝大家玩得开心,耍的愉快,我呢,继续滚去学习吧,再见各位亲~
哦哦哦,差点完了,源码耶
为保持原汁原味及错误示范,我不删除多余代码,各位看前文足够啦
/*时间:2018.12.30
*作者:吴扬俊
*内容:给出任意一篇全英文字符的文本,求出其最优编码,并能解码
思路:
1,建立字频表(统计文本中每个字符出现的次数)
2,建立哈夫曼树,并求每个字符对应的编码
3,给文本编码,以二进制保存于文件code.dat
4,根据哈夫曼树,将code.dat解码,结果保存于recode.txt
tips:
1,二进制文件读写(无法使用string)
2,ASCII码和字符转换
3,哈夫曼算法
4,哈希思想的妙用(计算字频;编码使用)
数据结构
typedef struct
{
int ascii,weight,parent,lchild,rchild;//哈夫曼树,它们依次表示:字符的ASCII码,双亲,左孩子,右孩子
}HTNode,*HuffmanTree;
算法
假设有n个字符,申请2n个空间,0号不用 ,HTree数组首地址
1,初始化1,所有成员赋0;初始化2,读入字符及相应的权值
2,令下个根节点j = n+1,在parent=0的点中挑选出最小值,次小值分别记录其下标index1,index2;
3,最小值和次小值的parent=j;
*/
#include
using namespace std;
#include
#include
#include
#include
#define MIN1 0x1fffffff
#define MIN2 0x2fffffff
//Attention:只可以识别英文输入法下的所有字符,中文打出来的‘?’都不行
char code[20];//二进制读写准备
typedef struct
{
int ascii,weight,parent,lchild,rchild;//哈夫曼树,它们一次表示:字符的ASCII码,双亲,左孩子,右孩子
}HTNode,*HuffmanTree;
//统计每个字符出现次数:只有英文符号,否则报错
void Count_Character_Occur_Frequency()
{
int cof[256];//存储相应字符出现的次数,字符ASCII为下标。charater_occur_frequency
for(int i = 0; i < 256; i++)//初始化字符出现次数统计表
{
cof[i] = 0;
}
//从源文件按行读取,并统计字符个数,由于字符个数有限,所以用字符的ASCII码作为数组下标,数组值作为次数,类似哈希映射
fstream inFile("source.txt",ios::in);
if(!inFile)cout<<"source.txt 打开失败!"<<endl;
int sum = 0;//总行数,记录换行个数
string s;//存放一行
while(true)
{
getline(inFile,s);
if(!inFile)break;//避免重复读取最后一个字符
sum++;
for(int i = 0; i < s.size(); i++)
{
int a = s[i];//cout<<"a:"<
cof[a]++; //计数
}
}
inFile.close();//好习惯
int a = '\n';//换行符
cof[a] = sum; //换行符个数
//=======将所有出现的字符及其次数写入文件(类似全局数组)=========
int n = 0;//计算出现字符总个数
for(int i = 0; i < 256; i++)
{
if(cof[i] != 0)n++;
}
fstream outFile("字频表.txt",ios::out);
if(!outFile)cout<<"字频表.txt 打开失败!"<<endl;
outFile<<n<<endl;//写入字符总个数
//打印调试
for(int i = 0; i < 256; i++)
{
if(cof[i] != 0)
{
char ch = i - '\0';
// cout<<"i: "<
outFile<<i<<" "<<cof[i]<<endl;//写入文件
}
}
outFile.close();
}
//创建哈夫曼树
void CreateHT()
{
HuffmanTree HTree;
fstream inFile("字频表.txt",ios::in);
if(!inFile)cout<<"字频表.txt 打开失败!"<<endl;
int n;//节点个数
inFile>>n;
HTree = (HTNode*)malloc(2*n*sizeof(HTNode));//哈夫曼构造,共需2n-1个,0号单元不用
for(int i = 1; i < 2*n; i++)//初始化 1
{
HTree[i].ascii = HTree[i].lchild = HTree[i].parent = HTree[i].rchild = HTree[i].weight = 0;//0号单元无用
}
for(int i = 1; i <= n; i++)//初始化 2,从文件读取ASCII码及相应权值
{
inFile>>HTree[i].ascii>>HTree[i].weight;
}
inFile.close();
/* for(int i = 1; i < 2*n; i++)//打印输出调试
{
cout<
for(int i = n+1; i < 2*n; i++)//从n+1开始,进行n-1次计算
{
//寻找最小,次小值,记录其下标
int min1 = MIN1,min2 = MIN2;
int index1 = 0,index2 = 0;
for(int j = 1; j < i; j++)//i是即将要被填入的根节点
{
if(HTree[j].parent == 0)//双亲为0表示尚待操作
{
if(min1 > HTree[j].weight)
{
min2 = min1;//先赋给次小值
index2 = index1;
min1 = HTree[j].weight;
index1 = j;
}
else if(min2 > HTree[j].weight)
{
min2 = HTree[j].weight;
index2 = j;
}
}
}//cout<
HTree[i].weight = HTree[index1].weight + HTree[index2].weight;//双亲权值更新
HTree[index1].parent = HTree[index2].parent = i;//孩子的双亲节点更新
if(HTree[index1].weight < HTree[index2].weight)//1,两个节点权值不同,左小右大 ;相同,下标小者在左
{
HTree[i].lchild = index1;//下标赋值
HTree[i].rchild = index2;
}
else if(HTree[index1].weight > HTree[index2].weight)
{
HTree[i].lchild = index2;
HTree[i].rchild = index1;
}
else
{
if(index1 < index2)
{
HTree[i].lchild = index1;
HTree[i].rchild = index2;
}
else
{
HTree[i].lchild = index2;
HTree[i].rchild = index1;
}
}
}
fstream outFile("哈夫曼树.txt",ios::out);
if(!outFile)cout<<"哈夫曼树.txt 无法打开!"<<endl;
outFile<<n<<endl;//节点个数
for(int i = 1; i < 2*n; i++)//打印输出调试
{
// cout<<"i:"<
outFile<<" "<<HTree[i].ascii <<" "<<HTree[i].weight<<" "<<HTree[i].parent<<" "<<HTree[i].lchild<<" "<<HTree[i].rchild<<endl;
}
outFile.close();
//==========建立编码表,写入字符,权值,编码==================
outFile.open("哈夫曼编码表.txt",ios::out);
if(!outFile)cout<<"哈夫曼编码表.txt 打开失败!"<<endl;
//利用栈从叶子出发读取每个字符的编码,在写入文件
stack<char> code;//存储编码
for(int i = 1; i <= n; i++)//对n个字符分别求编码
{
int j = i;
do{
int p = HTree[j].parent;
if(p != 0)
{
int l,r;
l = HTree[p].lchild;
r = HTree[p].rchild;
if(j == l)code.push('0');
if(j == r)code.push('1');
j = p;
}
}while(HTree[j].parent != 0);
outFile<<HTree[i].ascii<<" "<<HTree[i].weight<<" ";//写入字符,权值
while(!code.empty())
{
outFile<<code.top();//写入编码
code.pop();
}outFile<<endl;
}
outFile.close();
}
void Encode()
{
fstream inFile("哈夫曼编码表.txt",ios::in);
if(!inFile)cout<< "哈夫曼编码表.txt"<<endl;
string s,codeList[256];//将编码表从文件读入该数组中,ASCII码为下标,类似哈希映射
int ch,w;
while(true)
{
inFile>>ch>>w>>s;
if(!inFile)break;
// cout<<" ch:"<
codeList[ch] = s;
}
inFile.close();
// cout<
/* for(int i = 0; i < 256; i++)//测试
{
if(!codeList[i].empty())
{//cout<<"!"<
inFile.open("source.txt",ios::in);
if(!inFile)cout<<"source.txt 打开失败!"<<endl;
/* fstream outFile("code.txt",ios::out);
if(!outFile)cout<<"code.txt打开失败!"<
fstream outFile("code.dat",ios::out|ios::binary);
if(!outFile)cout<<"code.dat打开失败!"<<endl;
string s2;
while(true)
{
getline(inFile,s);
if(!inFile)break;
int a;
for(int i = 0; i < s.size(); i++)
{
a = s[i];//转化为ASCII码
int j;
for(j = 0; j < codeList[a].size();j++)
{
s2 = codeList[a];
code[j] = s2[j];
}code[j]='\0';//!!!关键的一句
outFile.write((char*)code,20*sizeof(char));
}
a = '\n';
for(int j = 0; j < codeList[a].size();j++)
{
code[j] = (codeList[a])[j];
}
outFile.write((char*)code,20*sizeof(char));
}
inFile.close();
outFile.close();
}
//解码
void Decode()
{
fstream inFile("哈夫曼树.txt",ios::in);
if(!inFile)cout<<"哈夫曼树.txt 打开失败!"<<endl;
int n;
inFile>>n;
HuffmanTree HTree;
HTree = (HTNode*)malloc(2*n*sizeof(HTNode));
for(int i = 1; i < 2*n; i++)
{
inFile>>HTree[i].ascii >>HTree[i].weight>>HTree[i].parent>>HTree[i].lchild>>HTree[i].rchild;
}
inFile.close();
for(int i = 1; i < 2*n; i++)//打印输出调试
{
// cout<<"i:"<
}
/* inFile.open("code.txt",ios::in);
if(!inFile)cout<<"code.txt 打开失败!"<
inFile.open("code.dat",ios::in|ios::binary);
if(!inFile)cout<<"code.dat 打开失败!"<<endl;
fstream outFile("recode.txt",ios::out);
if(!outFile)cout<<"recode.txt 打开失败!"<<endl;
char ch;
int root = 2*n - 1;//char code[100];
// string s;
while(true)
{
inFile.read((char*)code,20*sizeof(char));
if(!inFile)break;
// cout<<"ch: "<
for(int i = 0; code[i] != '\0'; i++)
{//cout<
ch = code[i];
if(ch == '0')root = HTree[root].lchild;
else if(ch == '1')root = HTree[root].rchild;
if(HTree[root].lchild == 0)
{//cout<
char cht = HTree[root].ascii;
outFile<<cht;
root = 2*n - 1;
}
}//cout<
}
outFile.close();
}
int main()
{
Count_Character_Occur_Frequency();
CreateHT();
Encode();
Decode();
return 0;
}