huffman文件压缩及其解压(附源码,针对各种文件)

  • huffman压缩简介
  • 构建压缩信息
  • 开始压缩
    • 统计字符
    • 建立huffman树
    • 得到huffman编码
    • 将huffman编码压缩
    • 书写配置信息
  • 解压缩
    • 读取配置信息
    • 重新建树
    • 还原文件
  • 整体源码
    • test.cpp
    • compress.h
    • HuffmanTree.h
    • Heap.h

huffman压缩简介

哈夫曼编码(Huffman Coding),又称霍夫曼编码,是一种编码方式,该方法完全依据字符出现概率来构造异字头的平均长度最短的码字,有时称之为最佳编码,一般就叫做Huffman编码(有时也称为霍夫曼编码);
因为这个特性,所以我们可以利用其来进行文件压缩,例如:有一个原始数据序列,ABACCDAA则编码为A(0),B(10),C(110),(D111),压缩后为010011011011100,再将其转为一个整形写入,实现了文件压缩;

构建压缩信息

那么这个过程中,我们在类中也就是整个压缩过程中要对其字符对应的信息例如:其对应的字符出现的次数,其字符对应的huffman编码,好方便我们的压缩和解压的过程;

在类中定义一个CharInfo _infos[256]来表示其所有信息;

typedef long long LongType;

struct CharInfo
{  
    LongType _count;//字符出现的次数
    string _code;//huffman编码
    unsigned char _ch;//字符
    CharInfo(LongType count = 0)
        :_count(count)
        , _code("")
    {}
};

开始压缩

压缩又可以分为以下几步:

统计字符


统计字符出现的次数

FILE* fout = fopen(filename, "rb");
        assert(fout);
        //char ch = fgetc(fout);
        int ch = fgetc(fout);//因为压缩汉字或者图片有些字符可能超过一个char表示的范围,具体与编码方式有关,在此不做过多描述
        while (ch != EOF)
        {
            _infos[(unsigned char)ch]._count++;
            ch = fgetc(fout);
        }

建立huffman树

CharInfo invalid;
invalid._count = 0;
HuffmanTree tree(_infos, 256,invalid);
//另外创建一个HuffmanTree.h文件
#pragma once
template <class T>
struct HuffmanTreeNode//huffman节点
{
    T _w;
    HuffmanTreeNode* _left;
    HuffmanTreeNode* _right;
    HuffmanTreeNode* _parent;

    HuffmanTreeNode(const T& x)
        :_w(x)
        , _left(NULL)
        , _right(NULL)
        , _parent(NULL)
    {}

};

template <class T>
class HuffmanTree
{
    typedef HuffmanTreeNode<T> Node;

public:
    HuffmanTree()
        :_root(NULL)
    {
    }
    HuffmanTree(T* a, size_t n,const T& invalid)
    {
        Heap<Node*> minHeap;//利用堆来进行排序的比较
        for (size_t i = 0; i < n; i++)
        {
            if (a[i] != invalid)
                minHeap.Push(new Node(a[i]));
        }
        while (minHeap.Size() > 1)
        {
            Node* left = minHeap.Top();
            minHeap.Pop();
            Node* right = minHeap.Top();
            minHeap.Pop();

            Node* parent = new Node(left->_w + right->_w);
            parent->_left = left;
            left->_parent = parent;
            parent->_right = right;
            right->_parent = parent;

            minHeap.Push(parent);
        }
        _root = minHeap.Top();
    }
    ~HuffmanTree()
    {
        _Destory(_root);
    }

    Node* Root()
    {
        return _root;
    }
protected:
    void _Destory(Node* root)
    {
        if (root == NULL)
            return;

        _Destory(root->_left);
        _Destory(root->_right);

        delete root;
    }
    HuffmanTree(const HuffmanTree<T>&);//防拷贝
    HuffmanTree& operator=(const HuffmanTree<T>&);
protected:
    Node* _root;
};

其中利用的堆排序的代码:

//利用仿函数进行大堆小堆
template <class T>
struct Less
{
    bool operator()(const T& l, const T& r) const
    {
        return l->_w._count <= r->_w._count;
    }
};
template <class T>
struct Greater
{
    bool operator()(const T& l, const T& r) const
    {
        return l->_w._count >= r->_w._count;
    }
};
template <class T, class Compare = Less>
//template 
class Heap
{
public:
    Heap()
    {}
    Heap( T* a, size_t size)
    {
        _a.reserve(size);
        for (size_t i = 0; i < size; ++i)
        {
            _a.push_back(a[i]);
        }
        for (int i = (_a.size() - 2) / 2; i >= 0;--i)
        _AdjustDown(i);
    }
    size_t Size()
    {
        return _a.size();
    }
    void Push(const T& x)
    {
        _a.push_back(x);
        _AdjustUp(_a.size() - 1);
    }
    void Pop()
    {
        assert(!_a.empty());
        swap(_a[0], _a[_a.size() - 1]);
        _a.pop_back();
        _AdjustDown(0);
    }
    const T& Top() const
    {
        return _a[0];
    }
protected:
    void _AdjustDown(int root)
    {
        Compare comFunc;
        int parent = root;
        size_t child = root * 2 + 1;
        while (child < _a.size())
        {
            if (child + 1 < _a.size() && comFunc(_a[child + 1],_a[child]))
            {
                ++child;
            }
            if (comFunc(_a[child] , _a[parent]))
            {
                swap(_a[child], _a[parent]);
                parent = child;
                child = parent * 2 + 1;
            }
            else
            {
                break;
            }
        }
    }
    void _AdjustUp(int child)
    {
        Compare comFunc;
        int parent = (child - 1) >> 1;
        while (child > 0)
        {
            if (comFunc(_a[child], _a[parent]))
            //if (_a[child] >= _a[parent])
            {
                swap(_a[child], _a[parent]);
                child = parent;
                parent = (child - 1) >> 1;
            }
            else
            {
                break;
            }
        }
    }
protected:
    vector _a;
};

得到huffman编码

void GetHuffmanCode( HuffmanTreeNode* root)
    {
        if (root == NULL)
            return ;

        if (root->_left == NULL && root->_right == NULL)
        {
            HuffmanTreeNode *parent, *cur;
            cur = root;
            parent = cur->_parent;
            string& code = _infos[(unsigned)root->_w._ch]._code;
            while (parent)
            {
                if (cur == parent->_left)
                    code += '0';
                else
                    code += '1';

                cur = parent;
                parent = cur->_parent;
            }
            reverse(code.begin(), code.end());
            return;
            //_infos[root->_w._ch]._code = code;
        }

        GetHuffmanCode(root->_left);
        GetHuffmanCode(root->_right); 
    }

将huffman编码压缩

压缩的时候就是根据字符的顺序,将其转化为对应的huffman编码,当满8位的时候就将其写入到文件中;

fseek(fout,0, SEEK_SET);
        ch = fgetc(fout);
        unsigned char value = 0;
        int pos = 0;
        while (!feof(fout))
        {
            string& code = _infos[ch]._code;
            for (size_t i = 0; i < (code.size()); ++i)
            {
                value <<= 1;
                ++pos;
                if (code[i] == '1')//将其相应位置置为1
                    value |= 1;

                if (pos == 8)//满8写入
                {
                    printf("%d ", value);
                    fputc(value, fin);
                    value = 0;
                    pos = 0;
                }
            }
            //cout << ch << " ";
            ch = fgetc(fout);
        }

        if (pos)//如果最后的字符huffman编码不够8位将其补够8位写入
        {
            value <<= (8 - pos);
            fputc(value, fin);
        }

书写配置信息

这样的话,压缩就算是完成了,但是在压缩的时候我们还需要写配置信息,也就是需要将我们的CharInfo _infos[256]的信息写入,不然的话,当单独解压缩的时候就没有办法了;

我采取的方法是将其信息写入到文件开头处,一开始解压缩就先读配置信息,这样子就需要一个断点来标识配置信息和文件内容了;
因为其本身的结构体中有string类型,如果直接写入会影响压缩效率,而我们只需要只要字符出现的次数就够了;

struct CountInfo
{
    int _ch;
    LongType _count;
};
CountInfo info;
        for (size_t i = 0; i < 256; ++i)
        {
            if (_infos[i]._count)
            {
                info._ch = _infos[i]._ch;
                info._count = _infos[i]._count;

                fwrite(&info,sizeof(info),1,fin);
            }
        }
        //CharInfo info1;
        info._count = -1;//断点
        fwrite(&info, sizeof(info), 1, fin);

这样子压缩就算是完成了;

解压缩

解压缩的过程就跟压缩类似了,所以简单了一些;就是先根据配置信息读出文件字符信息,然后重新建树,根据文件中现在的数字的位来找字符,再依次写入就好;

读取配置信息

根据写入的配置信息读取到断点处;

FILE* fout = fopen(filename, "rb");
        assert(fout);

        FILE* fin = fopen(uncompressFile.c_str(), "wb");
        assert(fin);

        CountInfo info;

        while (1)
        {
            fread(&info, sizeof(CountInfo), 1, fout);
            if (info._count == -1) 
                break;
            _infos[info._ch]._ch = info._ch;
            _infos[info._ch]._count = info._count;

        } 

重新建树

根据配置信息重新构建huffman树

HuffmanTree<CharInfo> tree(_infos, 256, invalid);

还原文件

根据压缩文件读取出来的数字按位,依次去huffman树中找到字符写入文件,需要注意一种就是当文件只有一种字符的时候,特别考虑

if (root->_left == NULL && root->_right == NULL)
        {
            LongType count1 = count;
            while (count1)
            {
                cout << cur->_w._ch<<" ";
                fputc(cur->_w._ch, fin);
                count1--;
            }
        }
        while (!feof(fout))
        {
            int pos = 7;
            char test = 1;
            while (pos >= 0 )
            {
                if (value &(test << pos))
                    cur = cur->_right;
                else
                    cur = cur->_left;

                --pos;
                if (cur->_left == NULL && cur->_right == NULL)
                {
                    //fflush(stdin);

                    //cout << cur->_w._ch << " ";
                    fputc(cur->_w._ch, fin);
                    cur = root;
                    count--;
                }
            }
            if (count == 0)
                break;
            value = fgetc(fout);

        }

这个样子整个压缩解压就完成了;

整体源码

下面是整体源码:

test.cpp

#include 
#include 
#include 
#include 
#include 
using namespace std;

#include "HuffmanTree.h"
#include "compress.h"
#include "Heap.h"


int main()
{
    Test1();
    return 0;
}

compress.h

#pragma once 



typedef long long LongType;

struct CharInfo
{  
    LongType _count;//字符出现的次数
    string _code;//huffman编码
    unsigned char _ch;//字符
    CharInfo(LongType count = 0)
        :_count(count)
        , _code("")
    {}
    bool operator!=(const CharInfo& info) const
    {
        return _count != info._count;
    }
    CharInfo operator+(const CharInfo& info)
    {
        return CharInfo(_count + info._count);
    }

};

struct CountInfo
{
    int _ch;
    LongType _count;
};
class CompressFile
{
public:
    CompressFile()//对其字符进行初始化
    {
        for (size_t i = 0; i <= 256; i++)
        {
            _infos[i]._ch = i;
        }
    }

    void Compress(const char* filename)
    {
        assert(filename);
        FILE* fout = fopen(filename, "rb");
        assert(fout);
        //char ch = fgetc(fout);
        int ch = fgetc(fout);
        while (ch != EOF)
        {
            _infos[(unsigned char)ch]._count++;
            ch = fgetc(fout);
        }
        //fclose(fout);
        CharInfo invalid;
        invalid._count = 0;
        HuffmanTree tree(_infos, 256,invalid);

        GetHuffmanCode(tree.Root());



        string compressFile = filename;
        compressFile += ".huffman";
        FILE* fin = fopen(compressFile.c_str(), "wb");
        assert(fin);

        CountInfo info;
        for (size_t i = 0; i < 256; ++i)
        {
            if (_infos[i]._count)
            {
                info._ch = _infos[i]._ch;
                info._count = _infos[i]._count;

                fwrite(&info,sizeof(info),1,fin);
            }
        }
        //CharInfo info1;
        info._count = -1;
        fwrite(&info, sizeof(info), 1, fin);



        fseek(fout,0, SEEK_SET);
        ch = fgetc(fout);
        unsigned char value = 0;
        int pos = 0;
        while (!feof(fout))
        {
            string& code = _infos[ch]._code;
            for (size_t i = 0; i < (code.size()); ++i)
            {
                value <<= 1;
                ++pos;
                if (code[i] == '1')
                    value |= 1;

                if (pos == 8)
                {
                    printf("%d ", value);
                    fputc(value, fin);
                    value = 0;
                    pos = 0;
                }
            }
            //cout << ch << " ";
            ch = fgetc(fout);
        }

        if (pos)
        {
            value <<= (8 - pos);
            fputc(value, fin);
        }
        fclose(fin);
        fclose(fout);

    }
    void UnCompress(const char* filename)//当这个文件中只有一种字符时,只有一个根节点
    {
        assert(filename);


        string uncompressFile = filename;
        size_t index = uncompressFile.rfind('.');
        assert(index != string::npos);
        uncompressFile = uncompressFile.substr(0, index);
        index = uncompressFile.rfind('.');
        assert(index != string::npos);
        uncompressFile = uncompressFile.substr(0, index);
        uncompressFile += "1.jpg";




        FILE* fout = fopen(filename, "rb");
        assert(fout);

        FILE* fin = fopen(uncompressFile.c_str(), "wb");
        assert(fin);

        CountInfo info;

        while (1)
        {
            fread(&info, sizeof(CountInfo), 1, fout);
            if (info._count == -1) 
                break;
            _infos[info._ch]._ch = info._ch;
            _infos[info._ch]._count = info._count;

        } 



        CharInfo invalid;
        invalid._count = 0;

        HuffmanTree tree(_infos, 256, invalid);
        HuffmanTreeNode* root = tree.Root();
        HuffmanTreeNode* cur = root;
        LongType count = root->_w._count;

        unsigned char value = fgetc(fout);

        if (root->_left == NULL && root->_right == NULL)
        {
            LongType count1 = count;
            while (count1)
            {
                cout << cur->_w._ch<<" ";
                fputc(cur->_w._ch, fin);
                count1--;
            }
        }
        while (!feof(fout))
        {
            int pos = 7;
            char test = 1;
            while (pos >= 0 )
            {
                if (value &(test << pos))
                    cur = cur->_right;
                else
                    cur = cur->_left;

                --pos;
                if (cur->_left == NULL && cur->_right == NULL)
                {
                    //fflush(stdin);

                    //cout << cur->_w._ch << " ";
                    fputc(cur->_w._ch, fin);
                    cur = root;
                    count--;
                }
            }
            if (count == 0)
                break;
            value = fgetc(fout);

        }
        fclose(fout);
        fclose(fin);

    }
protected:
    void GetHuffmanCode( HuffmanTreeNode* root)
    {
        if (root == NULL)
            return ;

        if (root->_left == NULL && root->_right == NULL)
        {
            HuffmanTreeNode *parent, *cur;
            cur = root;
            parent = cur->_parent;
            string& code = _infos[(unsigned)root->_w._ch]._code;
            while (parent)
            {
                if (cur == parent->_left)
                    code += '0';
                else
                    code += '1';

                cur = parent;
                parent = cur->_parent;
            }
            reverse(code.begin(), code.end());
            return;
            //_infos[root->_w._ch]._code = code;
        }

        GetHuffmanCode(root->_left);
        GetHuffmanCode(root->_right); 
    }


protected:
    CharInfo _infos[256];
};


void Test1()
{
    CompressFile c1;
    cout << "压缩中:" << endl;
    c1.Compress("psb.jpg");

    cout << "----------------------------------------------------------------" << endl;
    cout << "解压中" << endl;
    //c1.UnCompress("psb.jpg.huffman");

    CompressFile c2;
    c1.UnCompress("psb.jpg.huffman");

}

HuffmanTree.h

#pragma once


template <class T>
struct HuffmanTreeNode
{
    T _w;
    HuffmanTreeNode* _left;
    HuffmanTreeNode* _right;
    HuffmanTreeNode* _parent;

    HuffmanTreeNode(const T& x)
        :_w(x)
        , _left(NULL)
        , _right(NULL)
        , _parent(NULL)
    {}

};

template <class T>
class HuffmanTree
{
    typedef HuffmanTreeNode<T> Node;

public:
    HuffmanTree()
        :_root(NULL)
    {
    }
    HuffmanTree(T* a, size_t n,const T& invalid)
    {
        Heap<Node*> minHeap;
        for (size_t i = 0; i < n; i++)
        {
            if (a[i] != invalid)
                minHeap.Push(new Node(a[i]));
        }
        while (minHeap.Size() > 1)
        {
            Node* left = minHeap.Top();
            minHeap.Pop();
            Node* right = minHeap.Top();
            minHeap.Pop();

            Node* parent = new Node(left->_w + right->_w);
            parent->_left = left;
            left->_parent = parent;
            parent->_right = right;
            right->_parent = parent;

            minHeap.Push(parent);
        }
        _root = minHeap.Top();
    }
    ~HuffmanTree()
    {
        _Destory(_root);
    }

    Node* Root()
    {
        return _root;
    }
protected:
    void _Destory(Node* root)
    {
        if (root == NULL)
            return;

        _Destory(root->_left);
        _Destory(root->_right);

        delete root;
    }
    HuffmanTree(const HuffmanTree<T>&);//防拷贝
    HuffmanTree& operator=(const HuffmanTree<T>&);
protected:
    Node* _root;
};

Heap.h

#pragma once


template <class T>
struct Less
{
    bool operator()(const T& l, const T& r) const
    {
        return l->_w._count <= r->_w._count;
    }
};
template <class T>
struct Greater
{
    bool operator()(const T& l, const T& r) const
    {
        return l->_w._count >= r->_w._count;
    }
};
template <class T, class Compare = Less>
//template 
class Heap
{
public:
    Heap()
    {}
    Heap( T* a, size_t size)
    {
        _a.reserve(size);
        for (size_t i = 0; i < size; ++i)
        {
            _a.push_back(a[i]);
        }
        for (int i = (_a.size() - 2) / 2; i >= 0;--i)
        _AdjustDown(i);
    }
    size_t Size()
    {
        return _a.size();
    }
    void Push(const T& x)
    {
        _a.push_back(x);
        _AdjustUp(_a.size() - 1);
    }
    void Pop()
    {
        assert(!_a.empty());
        swap(_a[0], _a[_a.size() - 1]);
        _a.pop_back();
        _AdjustDown(0);
    }
    const T& Top() const
    {
        return _a[0];
    }
protected:
    void _AdjustDown(int root)
    {
        Compare comFunc;
        int parent = root;
        size_t child = root * 2 + 1;
        while (child < _a.size())
        {
            if (child + 1 < _a.size() && comFunc(_a[child + 1],_a[child]))
            {
                ++child;
            }
            if (comFunc(_a[child] , _a[parent]))
            {
                swap(_a[child], _a[parent]);
                parent = child;
                child = parent * 2 + 1;
            }
            else
            {
                break;
            }
        }
    }
    void _AdjustUp(int child)
    {
        Compare comFunc;
        int parent = (child - 1) >> 1;
        while (child > 0)
        {
            if (comFunc(_a[child], _a[parent]))
            //if (_a[child] >= _a[parent])
            {
                swap(_a[child], _a[parent]);
                child = parent;
                parent = (child - 1) >> 1;
            }
            else
            {
                break;
            }
        }
    }
protected:
    vector _a;
};

你可能感兴趣的:(小项目)