哈夫曼树实现文件压缩

最近在学了哈夫曼树之后,作为练习,写了一个文件压缩的小项目;
在这里和大家分享一下;

主要实现思路: 利用哈夫曼树的特性对字符进行哈夫曼编码,其中运用到了最小堆;利用最小堆的特性,找出构造哈夫曼树的结点;

统计文件字符出现的次数,本质是利用字符出现的次数进行构造哈夫曼树; 然后通过遍历哈夫曼树获取哈夫曼编码;

配置文件的主要内容:字符,字符出现次数,哈夫曼编码
当解压缩时,可以利用配置文件中的内容构造出一棵新的哈夫曼树,然后利用其中的哈夫曼编码还原文件信息;

//构造哈夫曼树的头文件

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;

//哈夫曼编码的过程
//给出数据的权值,根据权值构造哈夫曼树
//权值即统计文件中字符出现的次数

//构造哈夫曼树的过程
//利用堆的特性;

//写一个堆类
// Strict "greater than" ordering functor: yields true when l > r.
// Passed to Heap as the comparator it keeps the largest element on top.
template<class T>
struct Max
{
    // const-qualified because comparing never mutates the functor; this
    // lets it be invoked through const objects and const references.
    bool operator()(const T& l, const T& r) const
    {
        return l > r;
    }
};

// Strict "less than" ordering functor: yields true when l < r.
// Passed to Heap as the comparator it keeps the smallest element on top.
template<class T>
struct Less
{
    // const-qualified because comparing never mutates the functor.
    bool operator()(const T& l, const T& r) const
    {
        return l < r;
    }
};

// Partial specialization for pointers: orders the pointees by their
// `_weight` member instead of comparing the addresses themselves.
// It must be a specialization (Less<T*>); a second primary template named
// Less would be a redefinition error.
template<class T>
struct Less<T*>
{
    // The left operand is deliberately a pointer to non-const so that weight
    // types whose operator< is not const-qualified can still be compared.
    bool operator()(T* node1, const T* node2) const
    {
        return (node1->_weight) < (node2->_weight);
    }
};

// Forward declaration of the default comparator so this class template can
// be parsed on its own; redeclaring an already-defined template is harmless.
template<class T> struct Less;

// Binary heap stored in a vector.
//
// Campare()(a, b) == true means "a must sit above b". With a less-than
// comparator the smallest element is on top (min-heap); with a greater-than
// comparator the largest one is.
template<class T, class Campare = Less<T> >
class Heap
{
public:
    // Creates an empty heap.
    Heap(){}

    // Builds a heap from the first `size` elements of `arr`.
    // `arr` must not be null; a non-positive `size` yields an empty heap.
    Heap(T* arr, const int size)
    {
        assert(arr);
        // Copy the elements first: indexing into an empty vector would be
        // an out-of-bounds write.
        if(size > 0)
            _arr.assign(arr, arr + size);

        // Heapify bottom-up, starting at the last node that has a child.
        for(int i = (size - 2) / 2; i >= 0; --i)
        {
            _AdJustDown(i, size);
        }
    }

    // Inserts `data` and restores the heap order.
    void Push(const T& data)
    {
        _arr.push_back(data);
        _AdJustUp(static_cast<int>(_arr.size()));
    }

    // Removes the top element; does nothing when the heap is empty.
    void Pop()
    {
        if(_arr.empty())
            return;

        // Move the top to the back, drop it, then sift the new root down.
        std::swap(_arr[0], _arr[_arr.size() - 1]);
        _arr.pop_back();
        _AdJustDown(0, static_cast<int>(_arr.size()));
    }

    // Number of stored elements.
    int Size() const
    {
        return static_cast<int>(_arr.size());
    }

    // Returns the top element. The heap must not be empty: there is no
    // element to refer to otherwise, so this is treated as a caller bug.
    T& Top()
    {
        assert(!_arr.empty());
        return _arr[0];
    }

protected :
    // Sifts the element at index `parent` down within the first `size`
    // elements until neither child has to sit above it.
    void _AdJustDown(int parent, int size)
    {
        Campare above;
        int child = parent * 2 + 1;

        while(child < size)
        {
            // Pick whichever child has to sit higher.
            if(child + 1 < size && above(_arr[child + 1], _arr[child]))
                ++child;

            if(!above(_arr[child], _arr[parent]))
                break;

            std::swap(_arr[child], _arr[parent]);
            parent = child;
            child = parent * 2 + 1;
        }
    }

    // Sifts the last of the first `size` elements up towards the root.
    void _AdJustUp(int size)
    {
        Campare above;
        int child = size - 1;

        while(child > 0)
        {
            const int parent = (child - 1) / 2;
            if(!above(_arr[child], _arr[parent]))
                break;

            std::swap(_arr[child], _arr[parent]);
            child = parent;
        }
    }

private:
    std::vector<T> _arr;  // heap storage; index 0 is the top
};


//先构造出哈夫曼树的结点结构体
//哈夫曼树的根节点代表所有字符出现的次数;

// One node of a Huffman tree: a weight plus links to the two children.
// A freshly constructed node has no children.
template<class T>
struct HuffmanTreeNode
{
    T                   _weight;  // value the tree is ordered by
    HuffmanTreeNode<T>* _left;    // left child, NULL when absent
    HuffmanTreeNode<T>* _right;   // right child, NULL when absent

    // Creates a childless node carrying a copy of `data` as its weight.
    HuffmanTreeNode(const T& data)
        : _weight(data), _left(NULL), _right(NULL)
    {
    }
};


template<class T>
class HuffmanTree
{
    typedef HuffmanTreeNode Node;
public:
    HuffmanTree():_root(0){}
    HuffmanTree(T* arr, int size, const T& Invalid)
    {
        _root = GreatHuffman(arr, size,Invalid);
    }

    Node* GetRoot()
    {
        return _root;
    }

protected:
    Node* GreatHuffman(T* arr,int size, const T& invalid)
    {
        assert(arr);
        Heap> minheap;
        //建小堆
        for(int i = 0; i < size; i++)
        {
            if(arr[i] != invalid)
            {
                Node* tmp = new Node(arr[i]);
                minheap.Push (tmp);
            }
        }

        //构造哈夫曼树
        if(minheap.Size())
        {
            while(minheap.Size() > 1)
            {
                Node* left = minheap.Top();
                minheap.Pop ();
                Node* right = minheap .Top ();
                minheap .Pop ();

                Node *newNode = new Node(left->_weight + right->_weight );

                newNode ->_left = left;
                newNode ->_right = right;

                minheap .Push (newNode);
            }
            return minheap.Top ();
        }
        else
            return NULL;
    }

private:
    Node* _root;
};

//实现文件压缩和解压缩的代码

#pragma once
#define  _CRT_SECURE_NO_WARNINGS 1

#include <iostream>
#include <string>
#include <cassert>
#include <windows.h>
#include "Huffman.h"
// Integer type used for occurrence counts.
typedef long long TypeData;

using namespace std;

// Per-byte bookkeeping: the byte value, how often it occurs, and the
// Huffman code assigned to it. Ordering, addition and inequality look only
// at the count, which is what the Huffman tree is built from.
struct CharInfo
{
    // Not every one of the 256 byte values necessarily occurs in a file.
    unsigned char _ch; // byte value
    TypeData _count;   // number of occurrences
    string _code;      // Huffman code as a string of '0'/'1'

    // Both constructors initialize every member so that no field is ever
    // read while indeterminate.
    CharInfo(const TypeData& count = 0):_ch(0),_count(count){}
    CharInfo(const char ch):_ch(ch),_count(0){}

    // Combined weight of two entries; the result carries only a count.
    CharInfo operator+(const CharInfo& x) const
    {
        return CharInfo(_count + x._count);
    }

    // Orders entries by occurrence count.
    bool operator<(const CharInfo& x) const
    {
        return _count < x._count ;
    }

    // Entries differ when their occurrence counts differ.
    bool operator!=(const CharInfo& x) const
    {
        return _count != x._count ;
    }

};

class FileCompress
{
private:
    CharInfo _Info[256]; //ASCII表中的字符表示从 0~255;

public:
    typedef HuffmanTreeNode Node;


    //构建哈夫曼树
    //HuffmanTree huffmantree(char* _Info,int ,CharInfo CharInfo());
    //文件压缩模块
    void Compress(const char* filename)
    {
        FILE* fread = fopen(filename,"rb");
        if(fread == NULL)
        {
            cout<<"the file open fail"<exit(0);
        }

        //采用哈希表的方式统计字符出现次数
        for(int i = 0; i < 256; i++)
            _Info[i]._ch = i;

        //统计文件中字符出现的次数
        int ch = fgetc(fread);
        while(ch != EOF)
        {
            _Info[ch]._count ++;
            ch = fgetc(fread);
        }

        //构建哈夫曼树
        HuffmanTree huffmantree(_Info,256,CharInfo());

        Node* root = huffmantree.GetRoot ();

        //获取哈夫曼编码
        string code;
        _GetHuffmanCode(root,code);

        fseek(fread,0,SEEK_SET);
        string write(filename);
        write = write + ".Compress";

        //利用string的成员函数将write转为char*类型的字符串
        FILE* fwrite = fopen( write.c_str() , "wb");

        ch = fgetc(fread);
        unsigned char data = 0;//压缩的数据以二进制的形式存储在文件中;
        int pos = 7; //控制bit位的移动次数

        while(ch != EOF)
        {
            const char* ptr = _Info[ch]._code.c_str();
            //遍历保存的编码结点;
            while(*ptr)
            {
                if(pos >= 0)
                {
                    data = data | ((*ptr - '0')<if(pos < 0)
                {
                    fputc(data,fwrite);
                    pos = 7;
                    data = 0;
                }
                ptr++;
            }
            ch = fgetc(fread);
        }
        //最后一个字节的数据不管写没写满都放进去
        fputc(data,fwrite);

        //写配置文件
        WriteConfig(filename);

        fclose(fread);
        fclose(fwrite);
        cout<<"压缩成功"<//解压缩模块
    void UnCompress(const char* filename)
    {
        assert(filename);

        //解压缩需要遍历哈夫曼树,重建哈夫曼树
        //怎样重建哈夫曼树?
        //第一种,在已有原文件的情况下进行解压缩


        //开始解压缩
        FILE* fread = fopen(filename,"rb");
        unsigned char ch = fgetc(fread);

        string write(filename);
        unsigned int index = write.rfind ('.',write.size ());
        write = write.substr (0,index);

        //根据压缩文件对哈夫曼树进行遍历,写入新的解压缩文件;

        //读配置文件
        CharInfo RInfo[256];
        ReadConfig((write + ".config").c_str(), RInfo);

        write = write + ".jpg"; 

        FILE* fwrite = fopen(write.c_str(),"wb");

        //需要获取哈夫曼树的根节点,因为根节点的权值就是总共字数的个数
        //HuffmanTree h(_Info,256,CharInfo());
        HuffmanTree h(RInfo, 256, CharInfo());
        HuffmanTreeNode* root = h.GetRoot ();
        if(root == NULL)
            return;
        HuffmanTreeNode* cur = root;
        TypeData charcount = root->_weight ._count ;

        //用字符的总数来控制循环条件

        int pos = 8;
        while(charcount)
        {   
            --pos;
            unsigned char value = 1;
            //需要对压缩文件一个字节一个字节的访问;
            if(ch & (value << pos))
            {
                cur = cur->_right ;
            }
            else
            {
                cur = cur->_left ;
            }
            if(cur->_left == NULL && cur->_right == NULL )
            {
                //读到叶子结点则把对应的字符写入解压缩文件
                fputc(cur->_weight._ch, fwrite);
                //每次将cur置为根节点,因为每次是从根节点开始遍历;
                cur = root;
                if(--charcount == 0)
                    break;
            }
            if(pos == 0)
            {
                pos = 8;
                ch = fgetc(fread);
            }

        }
        fclose(fread);
        fclose(fwrite);
    }
protected:
    //后序遍历哈夫曼树
    //因为我们只需要访问叶子结点;
    void _GetHuffmanCode(Node* root,string code)
    {
        if(root == NULL)
            return;
        _GetHuffmanCode(root->_left ,code + '0');
        _GetHuffmanCode(root->_right,code + '1');

        if(root->_left == NULL && root->_right == NULL)
        {
            _Info[root->_weight ._ch ]._code = code;
        }
    }

    //写配置文件
    void WriteConfig(const char* filename)
    {
        //压缩的文件信息保存在.config后缀的文件中
        string write(filename);
        write = write + ".config";
        FILE* fwrite = fopen(write.c_str (), "wb");


        //需要保存的是字符的出现次数,哈夫曼编码
        //string line;
        //char buff[128];

        for(int i = 0; i < 256; i++)
        {
            if(_Info[i]._count)
            {
            fputc(_Info[i]._ch , fwrite);
            fputc(',',fwrite);
            //将_count以十进制字符的形式存入字符数组arr;
            char arr[126];
            _itoa(_Info[i]._count, arr, 10);
            fputs(arr, fwrite);
            fputc(',', fwrite);
            fputs(_Info[i]._code.c_str (), fwrite);
            fputc('\n',fwrite);

                /*line += _Info[i]._ch;
                line += ',';
                line += itoa(_Info[i]._count, buff, 10);
                line += ',';
                line += _Info[i]._code;
                line += '\n';
                fputs(line.c_str(), fwrite);
                line.clear();*/
            }
        }
        //关闭文件其实就是保存!
        fclose(fwrite);
    }

    //读配置文件
    void ReadConfig(const char* configfilename, CharInfo* RInfo)
    {
        FILE* fread = fopen(configfilename, "rb");
        if(fread == NULL)
        {
            cout<<"read file fault"<exit(0);
        }

        int ch = fgetc(fread);
        while(ch != EOF)
        {
            //字符,数量,编码
            RInfo[ch]._ch = ch;
            unsigned char tmp = ch;

            //','的ASCII值为44
            ch = fgetc(fread);
            ch = fgetc(fread);

            string arr;
            while(ch != ',')
            {   
                arr.push_back(ch);
                ch = fgetc(fread);
            }

            RInfo[tmp]._count = atoi(arr.c_str());


            ch = fgetc(fread);
            while(ch != '\n')
            {
                RInfo[tmp]._code.push_back (ch);

                ch = fgetc(fread);
            }

            ch = fgetc(fread);
        }
    }

};


//测试图片,mp3,和大文件的部分
int main()
{
    FileCompress f;

    int start =  GetTickCount();
    //f.Compress("test.exe");
    int end = GetTickCount();
    //cout<<"压缩文件耗时: "<

    //start = end;
    //f.UnCompress ("test.exe.Compress");
    //end = GetTickCount();
    //cout<<"解压缩文件耗时: "<

    /*f.Compress("music.mp3");
    cout<<"压缩文件耗时: "<

   // f.Compress("picture.jpg");
    //cout<<"压缩文件耗时: "<
    f.UnCompress ("picture.jpg.Compress");
    end = GetTickCount();
    cout<<"解压缩文件耗时: "<"毫秒"<"pause");
    return 0;
}

你可能感兴趣的:(随笔存档,数据结构)