霍夫曼编码及文件压缩

霍夫曼编码

哈夫曼编码(Huffman Coding),又称霍夫曼编码,是一种编码方式,哈夫曼编码是可变字长编码(VLC)的一种。Huffman于1952年提出一种编码方法,该方法完全依据字符出现概率来构造异字头的平均长度最短的码字,有时称之为最佳编码,一般就叫做Huffman编码(有时也称为霍夫曼编码)----源自百度百科

以字符串“this is a test”为例,生成霍夫曼编码的步骤如下:

  1. 计算各字符的权重,这里直接用字符出现的次数表示。
t 3
h 1
i 2
s 3
\space  3
a 1
e 1
  2. 根据各字符的权重生成霍夫曼树
14
|---8
|   |---t(3)
|   |---5
|       |---i(2)
|       |---3
|           |---h(1)
|           |---2
|               |---e(1)
|               |---a(1)
|---6
|   |---\space(3)
|   |---s(3)
  3. 根据霍夫曼树生成霍夫曼编码
t 10
h 1110
i 110
s 01
\space  00
a 11111
e 11110

霍夫曼编码文件压缩

以上得到霍夫曼编码后可以对文件进行压缩,继续以字符串“this is a test”为例,步骤如下:

  1. 使用霍夫曼编码代替字符串中的字符:
10111011001001100100111110010111100110
  2. 对末尾不满8位的部分补0填充
1011101100100110010011111001011110011000
  3. 将替换填充好的代码放入压缩文件中即实现了文件压缩
10111011 00100110 01001111 10010111 10011000
———————— ———————— ———————— ———————— ————————
  0xbb      0x26   0x4f      0x97     0x98

经过以上压缩,文件由14字节转为了5字节(不含保存编码表所需的空间),解压即为逆过程。

文件压缩和解压具体实现

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One Huffman tree node; nodes live in a flat array and refer to each other by index. */
typedef struct {
	unsigned char ch;   /* byte value carried by the node */
	long int weight;    /* occurrence count (sum of the children for merged nodes) */
	int left;           /* array index of the left child, -1 when there is none */
	int right;          /* array index of the right child, -1 when there is none */
	int parent;         /* array index of the parent, -1 while the node is unattached */
} node;

/* Code-table entry: a byte value together with its Huffman code. */
typedef struct {
	unsigned char ch;   /* byte value being encoded */
	char *cd;           /* NUL-terminated string made of the characters '0' and '1' */
} code;

/* Number of distinct byte values found by getWeight(). */
int count;
/* Total number of code bits produced by compress(); unCompress() reads it. */
long int len;
/* Number of bytes in the file scanned by getWeight(). */
long int sumBytes;
/* Per-symbol frequency table filled by getWeight(); only the first `count` entries are used. */
node huffmanNode[256];
/* Huffman tree built by createHuffmanTree() (heap allocated). */
node *huffmanTree;
/* Code table built by genHuffmanCode() (heap allocated). */
code *huffmanCode;

/*
 * Count how often every byte value occurs in a file.
 *
 * Results are stored in the globals:
 *   huffmanNode[0..count-1]  one entry per distinct byte, in order of first
 *                            appearance, with its occurrence count in .weight
 *   count                    number of distinct byte values
 *   sumBytes                 number of bytes read from the file
 *
 * filePath: path of the file to scan (opened in binary mode).
 * Returns 0 on success, -1 when the file cannot be opened or read.
 */
int getWeight(const char*filePath){
	FILE *fp=fopen(filePath,"rb");
	if(fp==NULL){
		printf("can not open file %s\n",filePath);
		return -1;
	}

	int slot[256];	/* byte value -> index in huffmanNode, -1 when not seen yet */
	int c;		/* int, not unsigned char, so EOF stays distinguishable from 0xFF */

	for(int i=0;i<256;i++)
		slot[i]=-1;
	count=0;
	sumBytes=0;
	memset(huffmanNode,0,sizeof(node)*256);

	/* Read until EOF instead of trusting a size obtained via fseek/ftell. */
	while((c=fgetc(fp))!=EOF){
		if(slot[c]<0){
			slot[c]=count;
			huffmanNode[count].ch=(unsigned char)c;
			count++;
		}
		huffmanNode[slot[c]].weight++;
		sumBytes++;
	}

	if(ferror(fp)){
		printf("can not read file %s\n",filePath);
		fclose(fp);
		return -1;
	}
	fclose(fp);
	return 0;
}

void createHuffmanTree(){
     
	int min1,min2;
	int x1,x2,i,j;
	
	huffmanTree=(node*)realloc(huffmanTree,(2*count-1)*sizeof(node));
	for(i=0;i<count;i++){
     
		huffmanTree[i].ch=huffmanNode[i].ch;
		huffmanTree[i].weight=huffmanNode[i].weight;
		huffmanTree[i].left=huffmanTree[i].right=huffmanTree[i].parent=-1;
	}
	for(;i<2*count-1;i++){
     
		huffmanTree[i].ch=0;
		huffmanTree[i].weight=0;
		huffmanTree[i].left=huffmanTree[i].right=huffmanTree[i].parent=-1;
	}
	
	for(i=count;i<2*count-1;i++){
     
		min1=min2=999999;
		x1=x2=0;
		for(j=0;j<i;j++){
     
			if(huffmanTree[j].parent==-1&&huffmanTree[j].weight<=min1){
     
				min2=min1;x2=x1;
				min1=huffmanTree[j].weight;x1=j;
			}
			else if(huffmanTree[j].parent==-1&&huffmanTree[j].weight<=min2){
     
				min2=huffmanTree[j].weight;x2=j;
			}
		}
		huffmanTree[x1].parent=huffmanTree[x2].parent=i;
		huffmanTree[i].left=x1;huffmanTree[i].right=x2;
		huffmanTree[i].weight=min1+min2;
	}
}

void genHuffmanCode(){
     
	if(huffmanTree==NULL){
     
		printf("huffman tree is null! can not generate huffman code\n");
		return;
	}
	
	int cur,p,start;
	char*temp=(char*)malloc(sizeof(char)*(count+1));
	temp[count]='\0';	
	huffmanCode=(code*)malloc(sizeof(code)*count);
	for(int i=0;i<count;i++){
     
		cur=i;
		p=huffmanTree[i].parent;
		start=count;
		while(p!=-1){
     
			if(huffmanTree[p].left==cur)temp[--start]='0';
			else temp[--start]='1';
			cur=p;p=huffmanTree[cur].parent;
		}
		huffmanCode[i].ch=huffmanTree[i].ch;
		huffmanCode[i].cd=(char*)malloc(sizeof(char)*(count-start+1));
		strcpy(huffmanCode[i].cd,&temp[start]);
	}
}

int compress(char*file_in,char*file_out){
     
	len=0;
	int i,j,k=0,sum;
	unsigned char ch,temp[264];
	
	if(getWeight(file_in)<0)return -1;
	createHuffmanTree();
	genHuffmanCode();

	FILE*fp_in=fopen(file_in,"rb");
	FILE*fp_out=fopen(file_out,"wb");
	if(fp_out==NULL){
     
		printf("can not create %s!\n",file_out);
		return -1;
	}
	
	long int flag=0;
	while(flag<sumBytes){
     
		flag++;
		ch=fgetc(fp_in);
		for(i=0;i<count;i++){
     
			if(huffmanCode[i].ch==ch){
     
				len+=strlen(huffmanCode[i].cd);
				for(j=0;j<strlen(huffmanCode[i].cd);j++)
					temp[k++]=huffmanCode[i].cd[j]-'0';
					
				while(k>=8){
     
					sum=0;
					for(j=0;j<8;j++)sum=sum*2+(temp[j]&0x1);
					for(j=8;j<k;j++)temp[j-8]=temp[j];
					k=j-8;
					fputc(sum,fp_out);
					fflush(fp_out);
				}
				break;
			}
		}
	}
	if(k){
     
		sum=0;
		for(j=0;j<k;j++)sum=sum*2+(temp[j]&0x1);
		sum=sum<<(8-k);
		fputc(sum,fp_out);
		fflush(fp_out);
	}
	fclose(fp_in);fclose(fp_out);
	printf("压缩完毕!\n");
	return 0;
}

/*
 * Decode file_in (produced by compress()) into file_out.
 *
 * Decoding relies on the globals left behind by compress(): huffmanCode and
 * count describe the code table, len is the number of meaningful bits in
 * file_in (the padding bits of the last byte are ignored). len itself is not
 * modified.
 *
 * Returns 0 on success, -1 when a file cannot be opened, the code table is
 * missing or unusable, the input is truncated/corrupt, or writing fails.
 */
int unCompress(const char*file_in,const char*file_out){
	size_t codeLen[256];	/* length of every code, computed once */
	size_t maxLen=0;
	size_t k=0;		/* number of undecoded bits in pending */
	char pending[256];	/* bits of the code currently being read, as '0'/'1' */
	long int remaining=len;
	int rc=-1;
	int i;
	FILE*fp_in;
	FILE*fp_out;

	fp_in=fopen(file_in,"rb");
	if(fp_in==NULL){
		perror("file");
		return -1;
	}
	fp_out=fopen(file_out,"wb");
	if(fp_out==NULL){
		perror("file");
		fclose(fp_in);
		return -1;
	}

	if(remaining>0){
		if(huffmanCode==NULL||count<=0||count>256){
			printf("no huffman code available, can not decompress %s\n",file_in);
			goto done;
		}
		for(i=0;i<count;i++){
			codeLen[i]=strlen(huffmanCode[i].cd);
			if(codeLen[i]>maxLen)
				maxLen=codeLen[i];
		}
		if(maxLen==0||maxLen>=sizeof(pending)){
			printf("invalid huffman code table\n");
			goto done;
		}
	}

	while(remaining>0){
		int valid,b;
		int c=fgetc(fp_in);

		if(c==EOF){
			printf("unexpected end of file %s\n",file_in);
			goto done;
		}
		/* Only the first `remaining` bits of the final byte carry data. */
		valid=remaining<8?(int)remaining:8;
		remaining-=valid;

		for(b=0;b<valid;b++){
			pending[k++]=(char)(((c>>(7-b))&0x01)+'0');

			/* The codes are prefix-free, so an exact match is the symbol. */
			for(i=0;i<count;i++)
				if(codeLen[i]==k&&memcmp(huffmanCode[i].cd,pending,k)==0)
					break;

			if(i<count){
				if(fputc(huffmanCode[i].ch,fp_out)==EOF){
					printf("can not write %s!\n",file_out);
					goto done;
				}
				k=0;
			}
			else if(k>=maxLen){
				/* Longer than any code and still no match. */
				printf("corrupt data in %s\n",file_in);
				goto done;
			}
		}
	}
	if(k!=0){
		/* The bit stream ended in the middle of a code. */
		printf("corrupt data in %s\n",file_in);
		goto done;
	}
	rc=0;

done:
	fclose(fp_in);
	/* fclose flushes buffered data, so its result matters for the output. */
	if(fclose(fp_out)!=0&&rc==0){
		printf("can not write %s!\n",file_out);
		rc=-1;
	}
	if(rc==0)
		printf("解压完毕!\n");
	return rc;
}

/*
 * Demo driver: compress test.txt, then decompress the result again.
 * Exits with status 1 as soon as one of the two steps fails, so that
 * decompression is never attempted after a failed compression.
 */
int main(void){
	if(compress("test.txt","test.txt.myZip")!=0)
		return 1;
	if(unCompress("test.txt.myZip","test_uzip.txt")!=0)
		return 1;
	return 0;
}

代码效果:
霍夫曼编码及文件压缩_第1张图片

压缩算法目前可用于压缩txt,jpg,pdf等各类文件,但是对除txt外的文件的压缩效果不好,且压缩文件存在大小限制,估计不超过2M,仅比较适用于txt文档的压缩,有很大优化空间。

你可能感兴趣的:(杂记,算法)