哈夫曼编码(Huffman Coding),又称霍夫曼编码,是可变字长编码(VLC)的一种。Huffman于1952年提出了这种编码方法:它完全依据字符出现的概率来构造异字头(任一码字都不是其他码字的前缀)且平均长度最短的码字,因此有时被称为最佳编码,一般就叫做Huffman编码。——源自百度百科
以字符串“this is a test”为例,生成霍夫曼编码的步骤如下(下面依次列出:各字符的出现次数、由此构造出的哈夫曼树、各字符最终得到的编码):
t 3
h 1
i 2
s 3
\space 3
a 1
e 1
14
|---8
|   |---t(3)
|   |---5
|       |---i(2)
|       |---3
|           |---h(1)
|           |---2
|               |---e(1)
|               |---a(1)
|---6
    |---\space(3)
    |---s(3)
t 10
h 1110
i 110
s 01
\space 00
a 11111
e 11110
得到霍夫曼编码后即可对文件进行压缩。继续以字符串“this is a test”为例,步骤如下(下面依次列出:各字符编码拼接成的比特串、末尾补0凑满整字节后的比特串、按8位分组的结果、每组对应的字节值):
10111011001001100100111110010111100110
1011101100100110010011111001011110011000
10111011 00100110 01001111 10010111 10011000
———————— ———————— ———————— ———————— ————————
0xbb 0x26 0x4f 0x97 0x98
经过以上压缩,文件由14字节转为了5字节,解压即为逆过程。
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One entry of the frequency table / one node of the Huffman tree.
 * Tree links are array indices; -1 is used for "no such node".
 */
typedef struct {
    unsigned char ch;   /* byte value this node stands for (leaf nodes) */
    long int weight;    /* number of occurrences, or sum of both children */
    int left;           /* index of the left child, -1 if none */
    int right;          /* index of the right child, -1 if none */
    int parent;         /* index of the parent node, -1 if none */
} node;
/* Maps one byte value to its Huffman code. */
typedef struct {
    unsigned char ch;   /* the byte value */
    char *cd;           /* its code as a NUL-terminated string of '0'/'1' */
} code;
/* Number of distinct byte values found in the input (set by getWeight). */
int count;

/* Total number of code bits produced by compress and read back by unCompress. */
long int len;

/* Size of the input file in bytes, as determined by getWeight. */
long int sumBytes;

/* Frequency table: the first `count` entries are in use. */
node huffmanNode[256];

/* Huffman tree stored as an array of 2*count-1 nodes (built by createHuffmanTree). */
node *huffmanTree;

/* Code table with `count` entries (built by genHuffmanCode). */
code *huffmanCode;
/*
 * Count how often each byte value occurs in the file at filePath.
 *
 * Results are left in the file-level globals:
 *   huffmanNode[0..count-1]  one entry per distinct byte, in order of first
 *                            appearance, with .ch and .weight filled in
 *   count                    number of distinct byte values seen
 *   sumBytes                 total number of bytes read
 *
 * Returns 0 on success, -1 if the file cannot be opened or a read error occurs.
 */
int getWeight(char*filePath){
    FILE *fp=fopen(filePath,"rb");
    if(fp==NULL){
        printf("can not open file %s\n",filePath);
        return -1;
    }

    /* slot[b] is the index of byte b inside huffmanNode, or -1 if unseen. */
    int slot[256];
    for(int b=0;b<256;b++)
        slot[b]=-1;

    count=0;
    sumBytes=0;
    memset(huffmanNode,0,sizeof(node)*256);

    /* Keep the fgetc result in an int so EOF is not mistaken for byte 0xFF. */
    int c;
    while((c=fgetc(fp))!=EOF){
        sumBytes++;
        if(slot[c]<0){
            slot[c]=count;
            huffmanNode[count].ch=(unsigned char)c;
            huffmanNode[count].weight=1;
            count++;
        }
        else{
            huffmanNode[slot[c]].weight++;
        }
    }

    int failed=ferror(fp);
    fclose(fp);
    if(failed){
        printf("can not read file %s\n",filePath);
        return -1;
    }
    return 0;
}
void createHuffmanTree(){
int min1,min2;
int x1,x2,i,j;
huffmanTree=(node*)realloc(huffmanTree,(2*count-1)*sizeof(node));
for(i=0;i<count;i++){
huffmanTree[i].ch=huffmanNode[i].ch;
huffmanTree[i].weight=huffmanNode[i].weight;
huffmanTree[i].left=huffmanTree[i].right=huffmanTree[i].parent=-1;
}
for(;i<2*count-1;i++){
huffmanTree[i].ch=0;
huffmanTree[i].weight=0;
huffmanTree[i].left=huffmanTree[i].right=huffmanTree[i].parent=-1;
}
for(i=count;i<2*count-1;i++){
min1=min2=999999;
x1=x2=0;
for(j=0;j<i;j++){
if(huffmanTree[j].parent==-1&&huffmanTree[j].weight<=min1){
min2=min1;x2=x1;
min1=huffmanTree[j].weight;x1=j;
}
else if(huffmanTree[j].parent==-1&&huffmanTree[j].weight<=min2){
min2=huffmanTree[j].weight;x2=j;
}
}
huffmanTree[x1].parent=huffmanTree[x2].parent=i;
huffmanTree[i].left=x1;huffmanTree[i].right=x2;
huffmanTree[i].weight=min1+min2;
}
}
void genHuffmanCode(){
if(huffmanTree==NULL){
printf("huffman tree is null! can not generate huffman code\n");
return;
}
int cur,p,start;
char*temp=(char*)malloc(sizeof(char)*(count+1));
temp[count]='\0';
huffmanCode=(code*)malloc(sizeof(code)*count);
for(int i=0;i<count;i++){
cur=i;
p=huffmanTree[i].parent;
start=count;
while(p!=-1){
if(huffmanTree[p].left==cur)temp[--start]='0';
else temp[--start]='1';
cur=p;p=huffmanTree[cur].parent;
}
huffmanCode[i].ch=huffmanTree[i].ch;
huffmanCode[i].cd=(char*)malloc(sizeof(char)*(count-start+1));
strcpy(huffmanCode[i].cd,&temp[start]);
}
}
/*
 * Huffman-compress file_in into file_out.
 *
 * The frequency table, tree and code table are rebuilt from file_in, then the
 * file is read a second time and the code bits of every byte are packed
 * most-significant-bit first into output bytes; the last byte is padded with
 * zero bits. The global `len` receives the number of code bits written.
 *
 * Only the packed bits are written: the output has no header, so decoding
 * depends on the globals (len, count, huffmanCode) left behind by this call.
 *
 * Returns 0 on success, -1 on any failure.
 */
int compress(char*file_in,char*file_out){
    len=0;
    if(getWeight(file_in)<0)return -1;
    createHuffmanTree();
    genHuffmanCode();
    if(sumBytes>0&&huffmanCode==NULL){
        printf("can not build huffman code for %s!\n",file_in);
        return -1;
    }

    /* Direct byte -> code lookup instead of scanning the table per byte. */
    const char *codeOf[256]={0};
    for(int i=0;i<count;i++)
        codeOf[huffmanCode[i].ch]=huffmanCode[i].cd;

    FILE*fp_in=fopen(file_in,"rb");
    if(fp_in==NULL){
        printf("can not open file %s\n",file_in);
        return -1;
    }
    FILE*fp_out=fopen(file_out,"wb");
    if(fp_out==NULL){
        printf("can not create %s!\n",file_out);
        fclose(fp_in);
        return -1;
    }

    int rc=0;
    unsigned int acc=0;   /* bits collected for the next output byte */
    int nbits=0;          /* how many bits acc currently holds (0..7) */
    for(long int done=0;done<sumBytes;done++){
        int c=fgetc(fp_in);
        const char *cd=(c==EOF)?NULL:codeOf[c];
        if(cd==NULL){
            /* Shorter than in the first pass, or a byte that pass never saw. */
            printf("%s changed while it was being compressed!\n",file_in);
            rc=-1;
            break;
        }
        for(;*cd!='\0';cd++){
            acc=(acc<<1)|((unsigned int)(*cd-'0')&0x1u);
            len++;
            if(++nbits==8){
                fputc((int)acc,fp_out);
                acc=0;
                nbits=0;
            }
        }
    }
    /* Left-align the remaining bits; the low bits are zero padding. */
    if(rc==0&&nbits>0)
        fputc((int)(acc<<(8-nbits)),fp_out);

    if(ferror(fp_out))
        rc=-1;
    fclose(fp_in);
    if(fclose(fp_out)!=0)
        rc=-1;
    if(rc!=0){
        printf("can not write %s!\n",file_out);
        return -1;
    }
    printf("压缩完毕!\n");
    return 0;
}
/*
 * Decode file_in (written by compress) into file_out.
 *
 * Decoding uses the globals left by the preceding compress call in the same
 * process: `len` (number of valid code bits), `count` and `huffmanCode`.
 * `len` is only read, so the function can be called more than once.
 *
 * Bits are unpacked most-significant first into a small buffer of '0'/'1'
 * characters; whenever the front of the buffer equals one of the codes, the
 * corresponding byte is written and the code is removed from the buffer.
 *
 * Returns 0 on success, -1 if a file cannot be opened, the code table is
 * missing, the input ends early or does not match the code table, or writing
 * fails.
 */
int unCompress(char*file_in,char*file_out){
    enum{BIT_BUF_SIZE=512};

    FILE*fp_in=fopen(file_in,"rb");
    if(fp_in==NULL){
        perror("file");
        return -1;
    }
    FILE*fp_out=fopen(file_out,"wb");
    if(fp_out==NULL){
        perror("file");
        fclose(fp_in);
        return -1;
    }

    int rc=0;
    long int remaining=len;      /* valid bits still to be read */
    size_t codeLen[256]={0};     /* cached strlen of every code */
    if(remaining>0){
        if(huffmanCode==NULL||count<=0||count>256){
            printf("huffman code is null! can not decode %s\n",file_in);
            rc=-1;
        }
        else{
            for(int i=0;i<count;i++)
                codeLen[i]=strlen(huffmanCode[i].cd);
        }
    }

    char bits[BIT_BUF_SIZE];
    int held=0;                  /* number of undecoded bits in `bits` */
    while(rc==0&&remaining>0){
        int c=fgetc(fp_in);
        if(c==EOF){
            printf("unexpected end of file %s\n",file_in);
            rc=-1;
            break;
        }
        /* The final byte may carry fewer than 8 valid bits (zero padding). */
        int valid=remaining>=8?8:(int)remaining;
        remaining-=valid;
        if(held+valid>BIT_BUF_SIZE){
            /* Nothing matched for a long time: data and code table disagree. */
            printf("%s does not match the huffman code\n",file_in);
            rc=-1;
            break;
        }
        for(int b=0;b<valid;b++)
            bits[held++]=(char)(((c>>(7-b))&0x01)+'0');

        /* Emit symbols for as long as some code is a prefix of the buffer. */
        for(;;){
            int hit=-1;
            for(int i=0;i<count;i++){
                size_t n=codeLen[i];
                /* n>0: an empty code would match forever. */
                if(n>0&&n<=(size_t)held&&memcmp(huffmanCode[i].cd,bits,n)==0){
                    hit=i;
                    break;
                }
            }
            if(hit<0)
                break;
            fputc(huffmanCode[hit].ch,fp_out);
            held-=(int)codeLen[hit];
            memmove(bits,bits+codeLen[hit],(size_t)held);
        }
    }

    if(ferror(fp_out))
        rc=-1;
    fclose(fp_in);
    if(fclose(fp_out)!=0)
        rc=-1;
    if(rc!=0)
        return -1;
    printf("解压完毕!\n");
    return 0;
}
/*
 * Demo driver: compress test.txt, then decompress the result again.
 * Decompression is skipped when compression failed; the exit status is
 * non-zero if either step reported an error.
 */
int main(void){
    if(compress("test.txt","test.txt.myZip")!=0)
        return 1;
    if(unCompress("test.txt.myZip","test_uzip.txt")!=0)
        return 1;
    return 0;
}
该压缩程序目前可用于压缩txt、jpg、pdf等各类文件,但对txt以外的文件压缩效果不好;被压缩的文件还存在大小限制,估计不能超过2M。因此它仅比较适用于txt文档的压缩,仍有很大的优化空间。