哈夫曼编码与解码

这是我的第一篇博客,希望大神们批评指正。

首先介绍以下什么是哈夫曼树(来自百度百科)

哈夫曼树─即最优二叉树,带权路径长度最小的二叉树,经常应用于数据压缩。 在计算机信息处理中,“哈夫曼编码”是一种一致性编码法(又称“熵编码法”),用于数据的无损耗压缩。这一术语是指使用一张特殊的编码表将源字符(例如某文件中的一个符号)进行编码。这张编码表的特殊之处在于,它是根据每一个源字符出现的估算概率而建立起来的(出现概率高的字符使用较短的编码,反之出现概率低的则使用较长的编码,这便使编码之后的字符串的平均期望长度降低,从而达到无损压缩数据的目的)。

构造哈夫曼树的主要思想:

构造哈夫曼树非常简单,将所有的节点放到一个队列中,用一个节点替换两个频率最低的节点,新节点的频率就是这两个节点的频率之和。这样,新节点就是两个被替换节点的父节点了。如此循环,直到队列中只剩一个节点(树根)。


这里用到了最小优先队列。

我这里用STL来实现。(这里有优先队列的介绍)

template<typename T>

struct cmp

{

    bool operator()(TreeNode<T>* t1, TreeNode<T>*  t2)

    {

        return !(*t1 < *t2);

    }

};

优先队列的定义:

priority_queue<TreeNode*,vector<TreeNode* >,cmp > pri_que;

哈夫曼树节点结构

template<typename T>

class TreeNode

{

public:

    TreeNode():pfather(NULL),plchild(NULL),prchild(NULL)

    {



    }

    T data;

    TreeNode *pfather;

    TreeNode *plchild;

    TreeNode *prchild;



    bool operator < (const TreeNode& rhs)

    {

        return data < rhs.data;

    }



};

构造哈夫曼树

每次从最小优先队列取头两个节点,合并后放回最小优先队列,如此重复。

template<typename T>

TreeNode<T>* MakeHuffmanTree(T* begin, T* end) //构造哈夫曼树

{

    priority_queue<TreeNode<T>*,vector<TreeNode<T>* >,cmp<T> > pri_que;

    T *iter = begin;

    TreeNode<T>* pNode;

    TreeNode<T>* pf = NULL;

    while(iter != end)

    {

        pNode = new TreeNode<T>;

        pNode->data = *iter++;

        pNode->pfather = pf;

        pri_que.push(pNode);

    }

    TreeNode<T>* plchild;

    TreeNode<T>* prchild;

    while(pri_que.size() > 1)//取两个小的合并

    {

        plchild = pri_que.top();

        pri_que.pop();

        prchild = pri_que.top();

        pri_que.pop();



        pNode = new TreeNode<T>;

        pNode->plchild = plchild;

        pNode->prchild = prchild;

        pNode->data = plchild->data + prchild->data;



        pri_que.push(pNode);

    }



    pNode = pri_que.top();

    pri_que.pop();

    return pNode;

}

构造哈夫曼树这个函数的参数是一个结构体,保存着对应字符,其频率,编码值。

重载它的+运算符,为了合并时的+运算(只是频率相加)。

 

 
  

到此为止,已经可以把哈夫曼树生成出来了。

template<typename T>

struct mydata

{

    mydata(){}

    mydata(int i):freq(i)

    {



    }

    string coded;

    int freq;

    T data;



    bool operator<(const mydata& rhs)

    {

        return freq < rhs.freq;

    }



    mydata operator+(mydata& rhs)

    {

        return mydata(freq + rhs.freq);

    }

};

我们可以通过DFS将每个叶子节点的路径记录下来(用一个全局变量数组path),然后得到它的编码。

当发现当前节点是叶子节点,就把当前的路径赋值至该叶子节点的编码属性(coded)。

const int MAXLEN = 20;

char path[MAXLEN] = {0};

template<typename T>

void DFS(T* root,int deep = -1, char a = '-')  //DFS 得到叶子节点的编码

{

    if(root == NULL)

        return;



    if(a != '-')

        path[deep] = a;



    if(root->plchild == NULL || root->prchild == NULL)//leaf

        (root->data).coded = string(path,path + deep + 1);



    if(root->plchild != NULL)

        DFS(root->plchild, deep + 1, '0');

    if(root->prchild != NULL)

        DFS(root->prchild, deep + 1, '1');

}

这样整个哈夫曼编码工作已经完成,为了查看我们的编码结果,我们可以用BFS跟DFS来看到我们的结果。在这里我选择了BFS。

当遍历到叶子节点,就将其编码属性(coded)和其对应字符输出。

template<typename T,typename U>

void BFS(T* root, mydata<U>* data) //BFS 将叶子节点的编码,提到data指向的数据

{

    queue<T*> que;

    que.push(root);



    T* pT = NULL;

    while(!que.empty())

    {

        pT = que.front();

        //cout<<pT->data.freq<<endl;

        que.pop();



        if(pT->plchild != NULL)

            que.push(pT->plchild);

        if(pT->prchild != NULL)

            que.push(pT->prchild);

        if(pT->plchild == NULL || pT->prchild == NULL)// leaf 提取叶子节点的编码

        {

            //cout<<(pT->data).data<<":"<<(pT->data).coded<<endl;

            mydata<U>* pd = data;

            while((pT->data).data != pd->data)

                pd++;

            assert(pd->data == (pT->data).data);

            pd->coded = (pT->data).coded;

        }

    }

}

测试驱动代码

    mydata<char> *pdata = new mydata<char>[4];

    pdata[0].data = 'a';

    pdata[0].freq = 7;

    pdata[1].data = 'b';

    pdata[1].freq = 5;

    pdata[2].data = 'c';

    pdata[2].freq = 2;

    pdata[3].data = 'd';

    pdata[3].freq = 4;

    TreeNode<mydata<char> >* pihuffmanTree = MakeHuffmanTree(pdata, pdata + 4);



    DFS(pihuffmanTree);

    BFS(pihuffmanTree);


为了更方便的使用我将这些封装到一个类里面。

template<typename T>

class Huffman

{

public:

    void Coded(string& coded);//传入待输出的编码

    void DeCode(const string& codedstr,string& decodestr);//输入待解码字符串,输出解码字符串

    void InputData(T* begin,T* end);//传入数据



private:

    string FindVal(char c);

    void m_CalcFreq(T* begin, T* end);//计算输入数据的频率

    TreeNode<mydata<T> > *root;//huffman根节点

    mydata<T>* data;

    int data_size;

    

    T* m_begin;//保存原始数据的开始与结束的位置

    T* m_end;

    //string codedstr;



};

输入数据并计算其频率。

用map容器来统计输入字符每个出现的个数。

template<typename T>

void Huffman<T>::InputData(T* begin, T* end)

{

    this->m_begin = begin;

    this->m_end = end;

        m_CalcFreq(begin, end);



}



template<typename T>

void Huffman<T>::m_CalcFreq(T* begin, T* end)

{



    int len = end - begin;

    //data_size = len;

    if(len == 0)

        return;



    map<T,int> countMap;

    map<T,int>::iterator mapIter = countMap.begin();

    T *pT = begin;

    while(pT != end)

    {

        mapIter = countMap.find(*pT);

        if(mapIter != countMap.end())//在map里有没有字符*iter

            ++mapIter->second;

        else

        {

            countMap.insert(make_pair(*pT,1));

        }

        pT++;

    }



    data_size = countMap.size();

    data = new mydata<T>[data_size];

    int i = 0;

    for (mapIter = countMap.begin(); mapIter != countMap.end(); ++mapIter)

    {

        data[i].data = mapIter->first;        

        data[i].freq = mapIter->second;

        i++;

    }



}

编码

template<typename T>

void Huffman<T>::Coded(string& coded)

{

    root = MakeHuffmanTree(data,data + data_size);

    DFS(root);



    BFS(root,data);



    cout<<"code:"<<endl;

    for (int i = 0; i < data_size; ++i)

    {

        cout<<data[i].data<<":"<<data[i].coded<<endl;

    }



    

    T *begin = m_begin;

    while (begin != m_end)

    {

        coded += FindVal(*begin);

        begin++;

    }

    //string subcode = 

}

 

解码

template<typename T>

void Huffman<T>::DeCode(const string& codedstr,string& decodestr)

{

    string::const_iterator iter = codedstr.begin();

    TreeNode<mydata<T> >* curNode = root;

    while (iter != codedstr.end())

    {

        if (curNode->plchild == NULL || curNode->prchild == NULL)

        {

            decodestr += (curNode->data).data;

            curNode = root;

            continue;

        }

        if (*iter == '0')

            curNode = curNode->plchild;

        

        if(*iter == '1')

            curNode = curNode->prchild;



        iter++;

    }

}

测试驱动程序

        char *pmystr = "cbcddddbbbbaaaaaaa";



        Huffman<char> h;

        h.InputData(pmystr, pmystr + 18);



    cout<<"originstr: "<<pmystr<<endl;



    string coded;

    h.Coded(coded);



    cout<<"coded: "<<coded<<endl;



    string decode;

    h.DeCode(coded,decode);

    cout<<"decode: "<<decode<<endl;

完整程序(环境:VS2012)

#include <iostream>

//#include <algorithm>

#include <queue>

#include <string>

#include <vector>

#include <cassert>

#include <map>

using namespace std;





template<typename T>

class TreeNode

{

public:

    TreeNode():pfather(NULL),plchild(NULL),prchild(NULL)

    {





    }

    T data;

    TreeNode *pfather;

    TreeNode *plchild;

    TreeNode *prchild;





    bool operator < (const TreeNode& rhs)

    {

        return data < rhs.data;

    }





};





template<typename T>

struct cmp

{

    bool operator()(TreeNode<T>* t1, TreeNode<T>*  t2)

    {

        return !(*t1 < *t2);

    }

};









template<typename T>

TreeNode<T>* MakeHuffmanTree(T* begin, T* end) //构造哈夫曼树

{

    priority_queue<TreeNode<T>*,vector<TreeNode<T>* >,cmp<T> > pri_que;

    T *iter = begin;

    TreeNode<T>* pNode;

    TreeNode<T>* pf = NULL;

    while(iter != end)

    {

        pNode = new TreeNode<T>;

        pNode->data = *iter++;

        pNode->pfather = pf;

        pri_que.push(pNode);

    }

    TreeNode<T>* plchild;

    TreeNode<T>* prchild;

    while(pri_que.size() > 1)//取两个小的合并

    {

        //cout<<static_cast<TreeNode<T>* >(pri_que.top())->data<<endl;

        //pri_que.pop();

        plchild = pri_que.top();

        pri_que.pop();

        prchild = pri_que.top();

        pri_que.pop();





        pNode = new TreeNode<T>;

        pNode->plchild = plchild;

        pNode->prchild = prchild;

        pNode->data = plchild->data + prchild->data;





        pri_que.push(pNode);

    }





    pNode = pri_que.top();

    pri_que.pop();

    return pNode;

}





template<typename T>

struct mydata

{

    mydata(){}

    mydata(int i):freq(i)

    {





    }

    string coded;

    int freq;

    T data;





    bool operator<(const mydata& rhs)

    {

        return freq < rhs.freq;

    }





    mydata operator+(mydata& rhs)

    {

        return mydata(freq + rhs.freq);

    }

};





template<typename T,typename U>

void BFS(T* root, mydata<U>* data) //BFS 将叶子节点的编码,提到data指向的数据

{

    queue<T*> que;

    que.push(root);





    T* pT = NULL;

    while(!que.empty())

    {

        pT = que.front();

        //cout<<pT->data.freq<<endl;

        que.pop();





        if(pT->plchild != NULL)

            que.push(pT->plchild);

        if(pT->prchild != NULL)

            que.push(pT->prchild);

        if(pT->plchild == NULL || pT->prchild == NULL)// leaf 提取叶子节点的编码

        {

            //cout<<(pT->data).data<<":"<<(pT->data).coded<<endl;

            mydata<U>* pd = data;

            while((pT->data).data != pd->data)

                pd++;

            assert(pd->data == (pT->data).data);

            pd->coded = (pT->data).coded;

        }

    }

}





const int MAXLEN = 20;

char path[MAXLEN] = {0};

template<typename T>

void DFS(T* root,int deep = -1, char a = '-')  //DFS 得到叶子节点的编码

{

    if(root == NULL)

        return;





    if(a != '-')

        path[deep] = a;





    if(root->plchild == NULL || root->prchild == NULL)//leaf

        (root->data).coded = string(path,path + deep + 1);





    if(root->plchild != NULL)

        DFS(root->plchild, deep + 1, '0');

    if(root->prchild != NULL)

        DFS(root->prchild, deep + 1, '1');

}





template<typename T>

class Huffman

{

public:

    void Coded(string& coded);

    void DeCode(const string& codedstr,string& decodestr);

    void InputData(T* begin,T* end);





private:

    string FindVal(char c);

    void m_CalcFreq(T* begin, T* end);

    TreeNode<mydata<T> > *root;

    mydata<T>* data;

    int data_size;

    

    T* m_begin;

    T* m_end;

    //string codedstr;





};





template<typename T>

void Huffman<T>::InputData(T* begin, T* end)

{

    this->m_begin = begin;

    this->m_end = end;

    m_CalcFreq(begin, end);





}





template<typename T>

void Huffman<T>::m_CalcFreq(T* begin, T* end)

{





    int len = end - begin;

    //data_size = len;

    if(len == 0)

        return;





    map<T,int> countMap;

    map<T,int>::iterator mapIter = countMap.begin();

    T *pT = begin;

    while(pT != end)

    {

        mapIter = countMap.find(*pT);

        if(mapIter != countMap.end())

            ++mapIter->second;

        else

        {

            countMap.insert(make_pair(*pT,1));

        }

        pT++;

    }





    data_size = countMap.size();

    data = new mydata<T>[data_size];

    int i = 0;

    for (mapIter = countMap.begin(); mapIter != countMap.end(); ++mapIter)

    {

        data[i].data = mapIter->first;        

        data[i].freq = mapIter->second;

        i++;

    }





}





template<typename T>

void Huffman<T>::Coded(string& coded)

{

    root = MakeHuffmanTree(data,data + data_size);

    DFS(root);





    BFS(root,data);





    cout<<"code:"<<endl;

    for (int i = 0; i < data_size; ++i)

    {

        cout<<data[i].data<<":"<<data[i].coded<<endl;

    }





    

    T *begin = m_begin;

    while (begin != m_end)

    {

        coded += FindVal(*begin);

        begin++;

    }

    //string subcode = 









}





template<typename T>

void Huffman<T>::DeCode(const string& codedstr,string& decodestr)

{

    string::const_iterator iter = codedstr.begin();

    TreeNode<mydata<T> >* curNode = root;

    while (iter != codedstr.end())

    {

        if (curNode->plchild == NULL || curNode->prchild == NULL)

        {

            decodestr += (curNode->data).data;

            curNode = root;

            continue;

        }

        if (*iter == '0')

            curNode = curNode->plchild;

        

        if(*iter == '1')

            curNode = curNode->prchild;





        iter++;

    }

}





template<typename T>

string Huffman<T>::FindVal(char c)

{

    for (int i = 0; i < data_size ; ++i)

    {

        if (c != data[i].data)

            continue;

        return data[i].coded;

    }

    return string();

}





int main()

{

    //mydata<char> *pdata = new mydata<char>[4];

    //pdata[0].data = 'a';

    //pdata[0].freq = 7;

    //pdata[1].data = 'b';

    //pdata[1].freq = 5;

    //pdata[2].data = 'c';

    //pdata[2].freq = 2;

    //pdata[3].data = 'd';

    //pdata[3].freq = 4;

    ////int a[12]={14,10,56,7,83,22,36,91,3,47,72,0};

    //TreeNode<mydata<char> >* pihuffmanTree = MakeHuffmanTree(pdata, pdata + 4);





    //DFS(pihuffmanTree);





    //BFS(pihuffmanTree);





    //string str = "cbcddddbbbbaaaaaaa";

    char *pmystr = "cbcddddbbbbaaaaaaa";





    Huffman<char> h;

    h.InputData(pmystr, pmystr + 18);





    cout<<"originstr: "<<pmystr<<endl;





    string coded;

    h.Coded(coded);





    cout<<"coded: "<<coded<<endl;





    string decode;

    h.DeCode(coded,decode);

    cout<<"decode: "<<decode<<endl;





    return 0;

}

 




你可能感兴趣的:(编码)