这篇笔记主要为了解决文件包含不同频率的字符时构造其赫夫曼编码的问题。

具体可参考附录练习10.3

基本介绍:

贪心算法的一个应用,称为文件压缩file compression.

具体就不说了,主要思想是根据字符出现的频率采用不同长度的编码(出现频率高的字符使用较短的编码),同时保证解码结果唯一:只要没有任何字符的编码是另一个字符编码的前缀即可。这样的编码称为前缀码(prefix code),相当于把所有字符都放在编码树的叶子结点上。赫夫曼给出了一个构造这种编码的算法,因此这种编码系统通常称为赫夫曼编码(Huffman code)。

Huffman algorithm可以描述如下:假设字符的个数为C,维护一个由树组成的森林。一棵树的权等于它所有叶子的频率之和。每次选取权最小的两棵树T1和T2(权相同时可任意选取),并任意(左右次序不限)形成以T1和T2为子树的新树,将这样的过程进行C-1次。在算法的开始,存在C棵单结点树——每个字符一棵。在算法结束时得到一棵树,这棵树就是最优赫夫曼编码树。每棵新树的权正是构成它的两棵老树的权之和。

具体代码如下

/********************************************************************************

**

** Filename: HuffmanCode.cpp

**

** Description: Huffman algorithm,generating prefix code.Note of learning greedy algorithm

**

** Version: 1.0

** Created: 2011年12月16日 14时35分24秒

** Revision: none

** Compiler: gcc

**

** Author: zhy (),[email protected]

*********************************************************************************/

#include <iostream>

#include <stdexcept>

#include <string>

using namespace std;

/**
 * One node of a Huffman coding tree.
 *
 * Nodes are ordered by their weight (`counter`) only, so they can be stored
 * directly in a min-heap keyed on frequency.
 */
struct HuffmanNode {
    char nodeChar;            // symbol held by the node; defaults to the '?' placeholder
    int counter;              // weight of the node (the key compared by operator<)
    HuffmanNode *leftChild;   // left subtree, or NULL
    HuffmanNode *rightChild;  // right subtree, or NULL
    std::string code;         // code string assigned to this node during tree traversal

    /**
     * @param c   symbol stored in the node
     * @param t   weight (frequency) of the node
     * @param l   left child, may be NULL
     * @param r   right child, may be NULL
     * @param ce  initial code string
     */
    HuffmanNode(char c = '?', int t = 0, HuffmanNode *l = NULL, HuffmanNode *r = NULL,
                const std::string &ce = std::string())
        : nodeChar(c),
          counter(t),
          leftChild(l),
          rightChild(r),
          code(ce)
    {}

    /**
     * Strict ordering by weight. Declared const so that nodes can be compared
     * through const references (std::less, standard algorithms and containers).
     */
    bool operator<(const HuffmanNode &node) const { return counter < node.counter; }
};

/**
 * Binary min-heap laid over an array supplied by the caller.
 *
 * The queue neither allocates nor frees storage: it works in place on the
 * pointer passed to the constructor and has no knowledge of the buffer's
 * capacity. The caller must make sure there is a free slot at index
 * `currentSize` before every insert().
 *
 * T must be copy-assignable and provide operator<.
 *
 * `currentSize` and `array` are public data members; callers read
 * `currentSize` to find out how many elements remain.
 */
template<typename T>
class PriorityQueue {
public:
    /**
     * Turns the first `size` elements of `a` into a min-heap, in place.
     * A non-positive `size` yields an empty queue and leaves `a` untouched.
     */
    PriorityQueue(T a[], int size)
        : currentSize(size > 0 ? size : 0), array(a)
    {
        // Leaves are already valid heaps, so start at the last node that has
        // a child. For an empty or single-element heap the loop does not run.
        for (int i = currentSize / 2 - 1; i >= 0; --i)
            percolateDown(i);
    }

    /**
     * Removes the smallest element and copies it into `minItem`.
     *
     * @throws std::underflow_error if the queue is empty; `minItem` and the
     *         queue are left unchanged in that case.
     */
    void deleteMin(T &minItem)
    {
        if (currentSize <= 0)
            throw std::underflow_error("PriorityQueue::deleteMin: queue is empty");

        minItem = array[0];
        --currentSize;
        if (currentSize > 0) {
            // Move the last element to the root and restore heap order.
            array[0] = array[currentSize];
            percolateDown(0);
        }
    }

    /**
     * Adds a copy of `item` to the queue.
     * The caller must guarantee that array[currentSize] is a valid slot.
     */
    void insert(const T &item)
    {
        array[currentSize] = item;
        percolateUp(currentSize);
        ++currentSize;
    }

private:
    /** Index of the left child of node `i` in the 0-based heap layout. */
    int leftChild(int i) { return 2 * i + 1; }

    /** Sinks the element at `hole` until neither child is smaller than it. */
    void percolateDown(int hole)
    {
        T temp = array[hole];
        while (leftChild(hole) < currentSize) {
            int child = leftChild(hole);
            // Pick the smaller of the two children when a right child exists.
            if (child != currentSize - 1 && array[child + 1] < array[child])
                ++child;
            if (!(array[child] < temp))
                break;
            array[hole] = array[child];
            hole = child;
        }
        array[hole] = temp;
    }

    /** Lifts the element at `hole` while it is smaller than its parent. */
    void percolateUp(int hole)
    {
        T temp = array[hole];
        while (hole > 0 && temp < array[(hole - 1) / 2]) {
            array[hole] = array[(hole - 1) / 2];
            hole = (hole - 1) / 2;
        }
        array[hole] = temp;
    }

public:
    int currentSize;  // number of elements currently stored in the heap
    T *array;         // caller-owned storage holding the heap
};

int totalCodeNum;

void postorderTraverse(HuffmanNode *node,char left,string base)

{

if (!node) return;

base.append(1,left);

node->code = base;

if (node->leftChild) postorderTraverse(node->leftChild,'0',base);

if (node->rightChild) postorderTraverse(node->rightChild,'1',base);

if (node->nodeChar!='?') cout << node->nodeChar << "\t" << node->code << "\t" << (base.size()-1)*node->counter << endl;

totalCodeNum+=(base.size()-1)*node->counter;

}

int main()

{

HuffmanNode node[14];

node[0].nodeChar = ':';node[0].counter = 100;

node[1].nodeChar = ' ';node[1].counter = 605;

node[2].nodeChar = '/';node[2].counter = 100;

node[3].nodeChar = ',';node[3].counter = 705;

node[4].nodeChar = '0';node[4].counter = 431;

node[5].nodeChar = '1';node[5].counter = 242;

node[6].nodeChar = '2';node[6].counter = 176;

node[7].nodeChar = '3';node[7].counter = 59;

node[8].nodeChar = '4';node[8].counter = 185;

node[9].nodeChar = '5';node[9].counter = 250;

node[10].nodeChar = '6';node[10].counter = 174;

node[11].nodeChar = '7';node[11].counter = 199;

node[12].nodeChar = '8';node[12].counter = 205;

node[13].nodeChar = '9';node[13].counter = 217;

PriorityQueue<HuffmanNode> queue(node,14);

HuffmanNode *temp1,*temp2,*t;

while (queue.currentSize>1) {

temp1 = new HuffmanNode;

temp2 = new HuffmanNode;

queue.deleteMin(*temp1);

queue.deleteMin(*temp2);

t = new HuffmanNode('?',temp1->counter+temp2->counter,temp1,temp2);

cout << t->counter << endl;

queue.insert(*t);

}

postorderTraverse(t,0,"");

cout << "total: " << totalCodeNum << endl;

return 0;

}


问题:

还有一个空间开销的问题:优先队列里存储的是结点对象本身,如果改为存储指针,应该能节省不少空间。最初的基本思路是构造树时每个结点都重新new出来;如果树和优先队列使用同一批结点,应该会好很多。

附录:

数据结构与算法分析—C++描述