Huffman编码实现压缩、解压文件

Huffman编码:根据词频构建Huffman树,实现对文本的前缀编码。

1、统计文本中每个字符出现的次数,放入优先队列中,构建一棵空的二叉树

2、取出频率最小的两个字符a、b,将它们的频率 fa、fb 分别作为此二叉树的左右结点(左结点的编号为1,右结点的编号为0),以其频率之和(fa + fb)作为该二叉树的父亲节点放入优先队列,并将 fa、fb 从优先队列中除去;

3、重复第二步操作,直至优先队列中只剩下一个数,即为此Huffman树的根节点。

4、从根节点到每个叶节点(文本中出现的字符)的“路径”,即0、1序列串就是该字符的前缀编码。

注:这种编码方式保证了,任意一个字符的编码都不会是其他字符编码的前缀,这样在解码过程中就不会混淆。


数据结构:

为方便记录每个字符的前缀编码,在构建Huffman树过程中,需要保存每一个结点的父亲节点、左右儿子结点、叶节点对应字符、当前结点频率。


压缩过程:

1、首先构建Huffman树,获得每个字符对应的前缀编码;

2、将字符及其对应的前缀编码等压缩信息写入压缩文档中,便于解码;

3、扫描文本,将文本中的字符转换成0、1串,每八位,即一个字节对应的字符存储到压缩文件中。

注:如果最后存储的0、1串不足八位,则在末尾补0,然后将补的位数信息写入压缩文件中。


解压过程:

1、读取压缩信息;

2、扫描压缩文本,将每个字符转化成0、1串,匹配字符的前缀编码,转化成原始文件。

注:解码时需删除之前补充的位数


一点体会:

1、总在循环内,动态申请数组,会导致程序崩溃;

2、千万不要在循环内每次都调用strlen函数:strlen每次调用都要从头遍历整个字符串。我当初没能深入了解此函数的内涵,导致程序慢得要死;

3、原文本越大,压缩率越高,对于一个2M的文件,压缩率大约在45%左右;

4、感谢领导倾情指点,比赛加油!


压缩过程程序源码:

#include 
#include 
#include 
#include 
#include 
#include 
using namespace std;

// 64-bit integer alias; used below for the file size and per-character counts.
typedef long long LL;
// Size of the char buffers that hold the input and output file paths in main().
const int FILE_LENGTH = 1000;
// Maximum number of bytes read from the input file in one batch (3 MiB).
const long long MAX_MEMORY = 3 * 1024 * 1024;
// Number of slots in the per-character tables (node[] and HuffmanCode[]).
// NOTE(review): the decompressor in this post uses 256 for the same constant;
// 260 looks like 256 byte values plus slack -- confirm.
const int KIND_OF_CHARACTER = 260;
// Maximum length, in characters, of one Huffman code string (second dimension
// of HuffmanCode[][]).
const int HUFFMAN_CODE_LENGTH = 1000;
// Byte position in the compressed file at which the original file size is stored.
const int OFFSET = 20;
// Number of code bits packed into each byte written to the compressed file.
const int nBits = 8;

struct Node {
    char c; // character
    int parent, lChild, rChild;//children node
    int iNode; //the serial number of node
    LL number; //number of corresponding character
    friend bool operator < (Node a, Node b) {
        return a.number > b.number;
    }
}node[KIND_OF_CHARACTER];

// Table of Huffman code strings: KIND_OF_CHARACTER rows of up to
// HUFFMAN_CODE_LENGTH chars each.
// NOTE(review): presumably row i holds the code of character value i -- confirm
// against BuildHuffmanTree().
char HuffmanCode[KIND_OF_CHARACTER][HUFFMAN_CODE_LENGTH];
//LL characters[KIND_OF_CHARACTER];

// Test-only helper (per the author's note); its definition is not visible here.
void CountKinds(); //for test
// Builds the Huffman tree from the counts in node[].
// NOTE(review): the int result is presumably the node count later passed to
// CompressFile() -- confirm.
int BuildHuffmanTree();
// Compresses the file at filePath into outPutFilePath; numberOfNode is supplied
// by the caller (presumably the value returned by BuildHuffmanTree() -- confirm).
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode);
// Writes the bit string HTstr (len characters) to outPut.
// NOTE(review): presumably packs nBits '0'/'1' characters into each output
// byte -- confirm against the definition.
void BitToInt(ofstream &outPut, char *HTstr, LL len);


int main() {

    //scan the file to count frequency of each character.
    char filePath[FILE_LENGTH] = "graph.txt";   //"Aesop_Fables.txt"; "graph.txt";  "1.txt";
    char compressFilePath[FILE_LENGTH] = "result.txt";

    ifstream readIn;
    readIn.open(filePath, ios::binary);
    if (readIn.is_open() == 0) {
        cout << "OPEN FAILED!" << endl;
        exit(0);
    }
     //get size of file
    readIn.seekg(0, ios::end);
    LL fileSize = (LL)readIn.tellg();
    readIn.seekg(0, ios::beg);
    cout<<"fileSize" < q;
    int  numberOfNode = 0;
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {
            node[i].iNode = numberOfNode;
            node[i].c = i;
            q.push(node[i]);
            HT[numberOfNode] = node[i];
            numberOfNode++;
        }
    }
    cout << numberOfNode << endl;
    int jNode = numberOfNode;
    while (q.size() > 1){
        //get two minimal weight nodes and set their parent
        Node leftNode = q.top();
        q.pop();
        Node rightNode = q.top();
        q.pop();
        //cout <<" ##"<< leftNode.number < MAX_MEMORY) {
               // cout<<"****"<> j);
        j++;
    }

   // outPut.write(buf, strlen(buf) * sizeof(char));
    outPut.write((char *)&sum, sizeof(char));
   // free(buf);
   // cout <


解压过程程序源码:

#include 
#include 
#include 
#include 
using namespace std;

// 64-bit integer alias; used below for file sizes.
typedef long long  LL;
// Size of the char buffers that hold the input and output file paths in main().
const int FILE_LENGTH = 1000;
// Maximum length, in characters, of one Huffman code string, including room
// for the terminating '\0' (size of Node::Huffmancode).
const int HUFFMAN_CODE_LENGTH = 1000;
// Number of slots in the decoding table node[].
const int KIND_OF_CHARACTER = 256;
// Maximum number of bytes read from the compressed file in one batch (1 MiB).
const long long MAX_MEMORY = 1 * 1024 * 1024;


// One row of the decoding table: a character together with its Huffman code.
struct Node {
	// The decoded character.
	char c;
	// The character's code as a '\0'-terminated string of bit characters.
	char Huffmancode[HUFFMAN_CODE_LENGTH];
};

// Encoding information loaded from the compressed file, one entry per
// distinct character.
Node node[KIND_OF_CHARACTER];

// Number of bits taken from each compressed byte. Starts at 8 and is mutable:
// DecompressFile() subtracts bitsAdded from it when it reaches the last byte.
int  nBits = 8;
LL originalFileSize; // size of the original file, read from the header
int numberOfNode;   // number of distinct characters (entries used in node[])
// Number of padding bits appended to the last byte; first int of the file.
int bitsAdded;
// Position at which the rest of the header starts; second int of the file.
// GetCompressInformation() seeks here before reading originalFileSize.
int OFFSET;

// Reads the header and code table from readIn into the globals above and
// returns the length of the longest Huffman code.
int GetCompressInformation(ifstream &readIn);
// Decodes the remainder of readIn (everything after the header) into writeOut.
void DecompressFile(ifstream &readIn, ofstream &writeOut, int maxEncodingLength);

// Entry point of the decompressor.
//
// Opens the compressed input "result.txt" and the output
// "decompressResult.txt" in binary mode, loads the header and code table with
// GetCompressInformation(), then decodes the payload with DecompressFile().
//
// Returns 0 on success and 1 when either file cannot be opened. (The previous
// version called exit(0) on failure, which reported success to the caller.)
int main() {
	char compressFilePath[FILE_LENGTH] = "result.txt"; //graph.txt  "1.txt";
	char decompressFilePath[FILE_LENGTH] = "decompressResult.txt";

	std::ifstream readIn(compressFilePath, std::ios::binary);
	if (!readIn.is_open()) {
		std::cout << "OPEN FAILED!" << std::endl;
		return 1;  // non-zero status so scripts can detect the failure
	}

	std::ofstream writeOut(decompressFilePath, std::ios::binary);
	if (!writeOut.is_open()) {
		std::cout << "OPEN FAILED!" << std::endl;
		return 1;  // readIn is closed by its destructor
	}

	//get information of compressed file
	const int maxEncodingLength = GetCompressInformation(readIn);
	//decompress File
	DecompressFile(readIn, writeOut, maxEncodingLength);

	readIn.close();
	writeOut.close();
	return 0;
}

int GetCompressInformation(ifstream &readIn){
	readIn.read((char *)&bitsAdded, sizeof(int));
	readIn.read((char *)&OFFSET, sizeof(int));
	readIn.seekg(OFFSET, ios::beg);
	readIn.read((char *)&originalFileSize, sizeof(LL));
	readIn.read((char *)&numberOfNode, sizeof(int));
	cout << originalFileSize << " " << numberOfNode << endl;
	//record the character and its Huffman code
	int maxEncodingLength = 0;
	for (int i = 0; i < numberOfNode; i++) {
		readIn.read((char *)&node[i].c, sizeof(char));
		int bits;
		readIn.read((char *)&bits, sizeof(int));
		readIn.read((char *)&node[i].Huffmancode, bits*sizeof(char));
		node[i].Huffmancode[bits] = '\0';
		cout << node[i].c << " " << node[i].Huffmancode << endl;
		if (maxEncodingLength < strlen(node[i].Huffmancode)) {
			maxEncodingLength = strlen(node[i].Huffmancode);
		}
	}
	cout << " maxEncodingLength :" << maxEncodingLength << endl;
	return maxEncodingLength;
}

void DecompressFile(ifstream &readIn, ofstream &writeOut, int maxEncodingLength){
	//get size of compressed file
	streampos curPos = readIn.tellg();
	readIn.seekg(0, ios::end);
	LL compressedFileSize = (LL)(readIn.tellg() - curPos);
	readIn.seekg(curPos, ios::beg);
	cout << "size of compressed file : " << compressedFileSize << endl;
	//read data in batches, each time read MAX_MEMORY characters
	int nTimes = (int)(compressedFileSize / MAX_MEMORY);
	if (compressedFileSize % MAX_MEMORY != 0) nTimes++;
	char *str = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH)* sizeof(char));
	int lenOfChar = 0;
	for (int j = 1; j <= nTimes; j++) {
		LL numberOfCharacter = MAX_MEMORY;
		if (j == nTimes) {
			numberOfCharacter = compressedFileSize % MAX_MEMORY;
		}
		char *strTemp = (char *)calloc(1, (2*HUFFMAN_CODE_LENGTH) * sizeof(char));
		char *buf = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH)* sizeof(char));

		readIn.read(buf, numberOfCharacter * sizeof(char));
		//cout<= 0; i--) {
				huffmanString[i] = ascII % 2 + '0';
				ascII = ascII / 2;
			}
			//if read last character, then minus bits which is added
			if ((j == nTimes) && (k == numberOfCharacter - 1)) {
               // printf("ascII:%d\n", ascII);
				nBits = nBits - bitsAdded;
			}
			huffmanString[nBits] = '\0';

			// cout< strlen(strcmp)) continue;
					int lenHuffmanCode = strlen(node[z].Huffmancode);
					if (!memcmp(node[z].Huffmancode, strTemp, lenHuffmanCode)) {
						str[lenOfChar] = node[z].c;
						str[lenOfChar+1] = '\0';
						lenOfChar ++;
						//cout<<"strTempF:"< MAX_MEMORY) {
				writeOut.write(str, lenOfChar * sizeof(char));
				//apply a new memory will result in crash
				//free(str);
				//char *str = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH)* sizeof(char));
				strcpy(str, "");
				lenOfChar = 0;
			}
		}
		free(buf);
		free(strTemp);
	}
	//cout<






你可能感兴趣的:(算法总结,&,实现)