实验三 Huffman编解码算法实现与压缩效率分析

一、Huffman编解码原理

1. Huffman编码

对原始文件进行Huffman编码,首先需要解决以下几点问题:

  1. 文件符号的概率分布情况是怎样的?
  2. Huffman树是如何建立的?
  3. 建立起Huffman树后,又是怎样读出符号对应码字的?

这三个问题在程序中的实现思路如下图:

[图1:Huffman编码的程序实现思路(统计符号概率分布 → 建立Huffman树 → 由树读出各符号的码字)]

将待编码文件里的数据参照已形成的Huffman码表一一进行转换,就可以得到编码后的文件了。

2. Huffman解码

Huffman解码是查表+翻译的过程。读取随接收文件传来的码表后,再逐位读取文件实际数据,对照码表进行翻译即可。

二、程序实现

流程中最关键的对Huffman树的操作,在程序中主要通过两个结构体实现:huffman_node和huffman_code。
建立的二叉树上每个节点都以huffman_node类型存在。节点之间的主要关系有父子、兄弟,huffman_node中定义了指向父节点的指针parent,以及指向两个孩子的指针zero和one,用来表述节点与节点之间的关系。除此之外,还有节点本身的属性:isLeaf(是否为叶节点)、count(出现次数)、symbol(信源符号,仅叶节点使用)。
而编码码字定义为huffman_code,其属性包括码字占用的比特数numbits和码字本身bits。
具体程序如下,部分理解在注释中给出。

Huffcode.c

/*
 *  huffcode - Encode/Decode files using Huffman encoding.
 *  http://huffman.sourceforge.net
 *  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "huffman.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <assert.h>

#ifdef WIN32
#include <malloc.h>
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include <unistd.h>
#endif

/* Forward declarations of the in-memory (-m option) code paths, which are
 * defined after main() in this file. Both return 0 on success and 1 on
 * failure. */
static int memory_encode_file(FILE *in, FILE *out);
static int memory_decode_file(FILE *in, FILE *out);

/* Write the program name, version and copyright banner to `out`. */
static void
version(FILE *out)
{
    static const char banner[] =
        "huffcode 0.3\n"
        "Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc\n";

    fputs(banner, out);
}

/*
 * Write the command-line help text to `out`.
 *
 * The help text is a single string literal built from adjacent pieces; there
 * must be no comma between the pieces, otherwise fputs() receives extra
 * arguments and the file does not compile.
 */
static void
usage(FILE* out)
{
    fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c] [-m] [-t<table file>]\n"
          "-i - input file (default is standard input)\n"
          "-o - output file (default is standard output)\n"
          "-d - decompress\n"
          "-c - compress (default)\n"
          "-m - read file into memory, compress, then write to file (not default)\n"
          /* -t was added for the Huffman statistics table. */
          "-t - output huffman statistics\n",
          out);
}

/*
 * Program entry point.
 *
 * Parses the command line (-i, -o, -c, -d, -h, -v, -m, -t), opens the
 * requested streams, and runs the selected operation:
 *   - with -m: memory_encode_file() / memory_decode_file();
 *   - otherwise: huffman_encode_file() / huffman_decode_file().
 * The statistics table stream (-t) is only passed to huffman_encode_file();
 * it is not used when decoding or in -m mode.
 *
 * Returns 0 on success and non-zero on failure. The status of the selected
 * operation is propagated to the caller (the functions visible in this file
 * use 0 for success), and every stream that was opened is closed on all
 * paths, including the error paths.
 */
int
main(int argc, char** argv)
{
    char memory = 0;
    char compress = 1;
    int opt;
    int rc = 1;
    const char *file_in = NULL, *file_out = NULL;
    /* Path of the optional Huffman statistics table (-t). */
    const char *file_out_table = NULL;
    FILE *in = stdin;
    FILE *out = stdout;
    /* Stream for the statistics table; stays NULL when -t is not given. */
    FILE *outTable = NULL;

    /* Get the command line arguments. No stream has been opened yet, so the
     * early returns inside this loop leak nothing. */
    while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1)
    {
        switch(opt)
        {
        case 'i':
            file_in = optarg;
            break;
        case 'o':
            file_out = optarg;
            break;
        case 'c':
            compress = 1; /* compress */
            break;
        case 'd':
            compress = 0; /* decompress */
            break;
        case 'h':
            usage(stdout);
            return 0;
        case 'v':
            version(stdout);
            return 0;
        case 'm':
            memory = 1;
            break;
        case 't':
            file_out_table = optarg;
            break;
        default:
            usage(stderr);
            return 1;
        }
    }

    /* If an input file is given then open it. */
    if(file_in)
    {
        in = fopen(file_in, "rb");
        if(!in)
        {
            fprintf(stderr,
                    "Can't open input file '%s': %s\n",
                    file_in, strerror(errno));
            goto cleanup;
        }
    }

    /* If an output file is given then create it. */
    if(file_out)
    {
        out = fopen(file_out, "wb");
        if(!out)
        {
            fprintf(stderr,
                    "Can't open output file '%s': %s\n",
                    file_out, strerror(errno));
            goto cleanup;
        }
    }

    /* If a statistics table file is given then create it (text mode). */
    if(file_out_table)
    {
        outTable = fopen(file_out_table, "w");
        if(!outTable)
        {
            fprintf(stderr,
                "Can't open output file '%s': %s\n",
                file_out_table, strerror(errno));
            goto cleanup;
        }
    }

    if(memory)
        rc = compress ?
            memory_encode_file(in, out) : memory_decode_file(in, out);
    else if(compress)
        rc = huffman_encode_file(in, out, outTable);
    else
        rc = huffman_decode_file(in, out);

cleanup:
    /* A pointer is NULL here only when the corresponding fopen() failed. */
    if(in)
        fclose(in);
    /* fclose() flushes buffered output, so a failure means lost data. */
    if(out && fclose(out) != 0 && rc == 0)
        rc = 1;
    if(outTable && fclose(outTable) != 0 && rc == 0)
        rc = 1;
    return rc;
}

/*
 * Read all of `in` into memory, Huffman-encode it with
 * huffman_encode_memory(), and write the encoded bytes to `out`.
 *
 * Returns 0 on success and 1 on any failure (allocation, read error,
 * encoding failure, short write).
 */
static int
memory_encode_file(FILE *in, FILE *out)
{
    unsigned char *buf = NULL, *bufout = NULL;
    unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;
    int rc = 1;

    assert(in && out);

    /* Read the file into memory, growing the buffer by `inc` bytes per pass. */
    while(!feof(in))
    {
        unsigned char *tmp;

        /* Refuse to let the unsigned size wrap around. */
        if(len > (unsigned int)-1 - inc)
            goto done;

        len += inc;
        tmp = (unsigned char*)realloc(buf, len);
        if(!tmp)
            goto done;

        buf = tmp;
        cur += fread(buf + cur, 1, inc, in);

        /* A read error never sets the EOF flag; without this check the loop
         * would keep growing the buffer until realloc() fails. */
        if(ferror(in))
            goto done;
    }

    if(!buf)
        goto done;

    /* Encode the memory. On failure bufout is deliberately not freed here:
     * what huffman_encode_memory() leaves in it on error is not visible
     * from this file. */
    if(huffman_encode_memory(buf, cur, &bufout, &bufoutlen))
        goto done;

    /* Write the memory to the file. */
    if(fwrite(bufout, 1, bufoutlen, out) == bufoutlen)
        rc = 0;

    free(bufout);

done:
    free(buf);
    return rc;
}

/*
 * Read all of `in` into memory, Huffman-decode it with
 * huffman_decode_memory(), and write the decoded bytes to `out`.
 *
 * Returns 0 on success and 1 on any failure (allocation, read error,
 * decoding failure, short write).
 */
static int
memory_decode_file(FILE *in, FILE *out)
{
    unsigned char *buf = NULL, *bufout = NULL;
    unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;
    int rc = 1;

    assert(in && out);

    /* Read the file into memory, growing the buffer by `inc` bytes per pass. */
    while(!feof(in))
    {
        unsigned char *tmp;

        /* Refuse to let the unsigned size wrap around. */
        if(len > (unsigned int)-1 - inc)
            goto done;

        len += inc;
        tmp = (unsigned char*)realloc(buf, len);
        if(!tmp)
            goto done;

        buf = tmp;
        cur += fread(buf + cur, 1, inc, in);

        /* A read error never sets the EOF flag; without this check the loop
         * would keep growing the buffer until realloc() fails. */
        if(ferror(in))
            goto done;
    }

    if(!buf)
        goto done;

    /* Decode the memory. On failure bufout is deliberately not freed here:
     * what huffman_decode_memory() leaves in it on error is not visible
     * from this file. */
    if(huffman_decode_memory(buf, cur, &bufout, &bufoutlen))
        goto done;

    /* Write the memory to the file. */
    if(fwrite(bufout, 1, bufoutlen, out) == bufoutlen)
        rc = 0;

    free(bufout);

done:
    free(buf);
    return rc;
}

Huffman.c

/*
 *  huffman - Encode/Decode files using Huffman encoding.
 *  http://huffman.sourceforge.net
 *  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "huffman.h"

#ifdef WIN32
#include <winsock2.h>
#include <malloc.h>
#define alloca _alloca
#else
#include <netinet/in.h>
#endif

/*
 * One node of the Huffman tree.
 *
 * Leaves carry a symbol; interior nodes carry two children. The two
 * payloads share storage through the anonymous union, so `isLeaf` must be
 * checked before reading either of them.
 */
typedef struct huffman_node_tag
{
    /* 1 for a leaf (set by new_leaf_node), 0 for an interior node. */
    unsigned char isLeaf;
    /* Occurrence count: per symbol for a leaf, sum of both children for an
     * interior node built by calculate_huffman_codes. */
    unsigned long count;
    /* Parent node, or 0/NULL for a node that has not been linked (the root). */
    struct huffman_node_tag *parent;

    union
    {
        /* Interior node only: children reached by a 0 bit and by a 1 bit. */
        struct
        {
            struct huffman_node_tag *zero, *one;
        };
        /* Leaf only: the byte value this leaf represents. */
        unsigned char symbol;
    };
} huffman_node;

/*
 * The code word assigned to one symbol: a bit count plus a packed bit array.
 * Instances are created by new_code() and released by free_code().
 */
typedef struct huffman_code_tag
{
    /* The length of this code in bits. */
    unsigned long numbits;

    /* The bits that make up this code. The first
       bit is at position 0 in bits[0]. The second
       bit is at position 1 in bits[0]. The eighth
       bit is at position 7 in bits[0]. The ninth
       bit is at position 0 in bits[1]. */
    unsigned char *bits;
} huffman_code;

//step2: added by yzhang for huffman statistics
/*
 * Per-symbol statistics, indexed by symbol value (0..255):
 * relative frequency, code length in bits, and the packed code bits.
 */
typedef struct huffman_statistics_result
{
    /* Relative frequency of each symbol (count / total count). */
    float freq[256];
    /* Code length of each symbol in bits; 0 when the symbol has no code. */
    unsigned long numbits[256];
    /* Packed code bits of each symbol, same bit layout as huffman_code.bits;
     * each row holds at most 100 bytes. */
    unsigned char bits[256][100];
}huffman_stat;

/*huffman_stat *init_huffstatistics()
{   huffman_stat *p;
    int i;
    p = (huffman_stat*)malloc(sizeof(huffman_stat));
    p->freq = (float *)malloc(sizeof(float)*256 );
    p->numbits = (unsigned long *) malloc(sizeof(unsigned long)*256);
    for (i=0 ; i<256;i++)
        p->bits[i] = (unsigned char *)malloc(sizeof(unsigned char)*100); 
    return p;
}*/
//end by yzhang


/*
 * Number of whole bytes needed to hold `numbits` bits: a partial trailing
 * byte still costs one full byte.
 */
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
    unsigned long whole_bytes = numbits / 8;

    if(numbits % 8 != 0)
        ++whole_bytes;

    return whole_bytes;
}

/*
 * get_bit extracts bit number i from the packed array `bits` and returns it
 * as 0 or 1. Bit 0 is the least significant bit of bits[0]; bit 8 is the
 * least significant bit of bits[1], and so on.
 */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
    const unsigned long byte_index = i / 8;
    const unsigned int shift = (unsigned int)(i % 8);

    return (unsigned char)((bits[byte_index] >> shift) & 1);
}

/*
 * Reverse, in place, the order of the first `numbits` bits of the packed
 * array `bits`.
 *
 * new_code() collects code bits while walking from a leaf up to the root,
 * i.e. in the opposite order to the one needed for encoding; this function
 * turns them around. The result is packed from bit 0 of bits[0] upwards, and
 * any unused bits of the last byte are left as zero.
 */
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
    const unsigned long nbytes = numbytes_from_numbits(numbits);
    /* Scratch buffer obtained with alloca; it needs no explicit free. */
    unsigned char *scratch = (unsigned char*)alloca(nbytes);
    unsigned long dst;
    unsigned long src = numbits;

    memset(scratch, 0, nbytes);

    /* Destination bit dst receives source bit numbits - dst - 1. Bits can
     * only be OR-ed in one at a time because the buffer is byte-addressed. */
    for(dst = 0; dst < numbits; ++dst)
    {
        --src;
        scratch[dst / 8] |= (get_bit(bits, src) << (dst % 8));
    }

    /* Copy the reversed bits back over the input. */
    memcpy(bits, scratch, nbytes);
}

/*
 * new_code builds a huffman_code from a leaf in a Huffman tree.
 *
 * The code is collected by walking from the leaf up to the root, adding one
 * bit per edge (1 when the node is its parent's `one` child, 0 otherwise),
 * and is then reversed so that it reads from the root down.
 *
 * Returns a newly allocated code (release it with free_code), or NULL when
 * memory cannot be allocated. A node without a parent yields a code with
 * numbits == 0 and bits == NULL.
 */
static huffman_code*
new_code(const huffman_node* leaf)
{
    unsigned long numbits = 0;
    unsigned char* bits = NULL;
    huffman_code *p;

    /* Walk from the leaf towards the root, recording one bit per step. */
    while(leaf && leaf->parent)
    {
        huffman_node *parent = leaf->parent;
        unsigned char cur_bit = (unsigned char)(numbits % 8);
        unsigned long cur_byte = numbits / 8;

        /* If we need another byte to hold the code,
           then allocate it. */
        if(cur_bit == 0)
        {
            size_t newSize = cur_byte + 1;
            /* Keep the old pointer until realloc is known to have worked,
             * so it can still be freed on failure. */
            unsigned char *grown = (unsigned char*)realloc(bits, newSize);
            if(!grown)
            {
                free(bits);
                return NULL;
            }
            bits = grown;
            bits[newSize - 1] = 0; /* Initialize the new byte. */
        }

        /* If a one must be added then or it in. If a zero
         * must be added then do nothing, since the byte
         * was initialized to zero. */
        if(leaf == parent->one)
            bits[cur_byte] |= 1 << cur_bit;

        ++numbits;
        leaf = parent;
    }

    if(bits)
        reverse_bits(bits, numbits);

    p = (huffman_code*)malloc(sizeof(huffman_code));
    if(!p)
    {
        free(bits);
        return NULL;
    }
    p->numbits = numbits;
    p->bits = bits;
    return p;
}

/* Number of distinct symbols: one per possible byte value. */
#define MAX_SYMBOLS 256
/* Table of tree nodes, one slot per symbol; unused slots are NULL. */
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
/* Table of code words indexed by symbol value; unused slots are NULL. */
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];

/*
 * Allocate a leaf node for `symbol` with a count of 0 and no parent.
 * Returns NULL when memory cannot be allocated.
 */
static huffman_node*
new_leaf_node(unsigned char symbol)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));
    if(!p)
        return NULL;

    p->isLeaf = 1;
    p->symbol = symbol;
    p->count = 0;
    p->parent = 0;
    return p;
}

/*
 * Allocate an interior node with the given count and the given `zero` and
 * `one` children, and no parent. The children's parent pointers are not
 * touched; the caller links them.
 * Returns NULL when memory cannot be allocated.
 */
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));
    if(!p)
        return NULL;

    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = 0;

    return p;
}

/*
 * Recursively release `subtree` and every node below it.
 * A NULL subtree is accepted and ignored.
 */
static void
free_huffman_tree(huffman_node *subtree)
{
    if(subtree != NULL)
    {
        /* Only interior nodes have children; for a leaf the same storage
         * holds the symbol. */
        if(subtree->isLeaf == 0)
        {
            free_huffman_tree(subtree->zero);
            free_huffman_tree(subtree->one);
        }

        free(subtree);
    }
}

/*
 * Release a code word and its bit array.
 * Like free(), a NULL argument is accepted and ignored.
 */
static void
free_code(huffman_code* p)
{
    if(!p)
        return;

    free(p->bits);
    free(p);
}

/*
 * Release every code word stored in the encoder table and then the table
 * itself.
 */
static void
free_encoder(SymbolEncoder *pSE)
{
    huffman_code **slot = *pSE;
    huffman_code **const end = slot + MAX_SYMBOLS;

    while(slot != end)
    {
        if(*slot != NULL)
            free_code(*slot);
        ++slot;
    }

    free(pSE);
}

/*
 * Clear the whole frequency table to zero bytes, so that no symbol has a
 * node yet. Leaf nodes are created later, on first use of a symbol.
 */
static void
init_frequencies(SymbolFrequencies *pSF)
{
    memset(pSF, 0, sizeof *pSF);
}

/*
 * Small write-behind cache in front of a growable output buffer.
 * Bytes are staged in `cache` and appended to *pbufout when flushed.
 */
typedef struct buf_cache_tag
{
    /* Staging area, allocated by init_cache and released by free_cache. */
    unsigned char *cache;
    /* Capacity of `cache` in bytes. */
    unsigned int cache_len;
    /* Number of bytes currently staged in `cache`. */
    unsigned int cache_cur;
    /* Caller-owned output buffer pointer; grown with realloc on flush. */
    unsigned char **pbufout;
    /* Caller-owned length of *pbufout in bytes. */
    unsigned int *pbufoutlen;
} buf_cache;

/*
 * Prepare `pc` for use: allocate a staging area of `cache_size` bytes and
 * bind the cache to the caller's output buffer, which is reset to
 * NULL / length 0.
 *
 * Returns 0 on success and 1 when an output argument is NULL or the staging
 * area cannot be allocated.
 */
static int init_cache(buf_cache* pc,
                      unsigned int cache_size,
                      unsigned char **pbufout,
                      unsigned int *pbufoutlen)
{
    assert(pc && pbufout && pbufoutlen);
    if(pbufout == NULL || pbufoutlen == NULL)
        return 1;

    /* Start with an empty output buffer. */
    *pbufout = NULL;
    *pbufoutlen = 0;

    pc->pbufout = pbufout;
    pc->pbufoutlen = pbufoutlen;
    pc->cache_cur = 0;
    pc->cache_len = cache_size;
    pc->cache = (unsigned char*)malloc(cache_size);

    if(pc->cache == NULL)
        return 1;

    return 0;
}

/*
 * Release the staging area of `pc` and clear the pointer.
 * The caller's output buffer is not touched.
 */
static void free_cache(buf_cache* pc)
{
    assert(pc);

    if(pc->cache == NULL)
        return;

    free(pc->cache);
    pc->cache = NULL;
}

/*
 * Append the staged bytes to the caller's output buffer and empty the
 * staging area. Does nothing when no bytes are staged.
 *
 * Returns 0 on success and 1 when the output buffer cannot be grown; in
 * that case the staged bytes and the output buffer are left unchanged.
 */
static int flush_cache(buf_cache* pc)
{
    unsigned int old_len;
    unsigned int grown_len;
    unsigned char *grown;

    assert(pc);

    if(pc->cache_cur == 0)
        return 0;

    old_len = *pc->pbufoutlen;
    grown_len = pc->cache_cur + old_len;

    grown = realloc(*pc->pbufout, grown_len);
    if(grown == NULL)
        return 1;

    memcpy(grown + old_len, pc->cache, pc->cache_cur);

    *pc->pbufout = grown;
    *pc->pbufoutlen = grown_len;
    pc->cache_cur = 0;

    return 0;
}

/*
 * Append `to_write_len` bytes from `to_write` to the output, going through
 * the staging area when they fit and writing straight to the output buffer
 * when they do not.
 *
 * Returns 0 on success and 1 when memory cannot be allocated.
 */
static int write_cache(buf_cache* pc,
                       const void *to_write,
                       unsigned int to_write_len)
{
    unsigned char* tmp;

    assert(pc && to_write);
    assert(pc->cache_len >= pc->cache_cur);

    /* If trying to write more than the cache will hold
     * flush the cache and allocate enough space immediately,
     * that is, don't use the cache. */
    if(to_write_len > pc->cache_len - pc->cache_cur)
    {
        unsigned int newlen;

        /* The staged bytes must reach the output first. If the flush fails
         * they are still staged, and appending the new data now would put
         * it ahead of them, so stop here. */
        if(flush_cache(pc))
            return 1;

        newlen = *pc->pbufoutlen + to_write_len;
        tmp = realloc(*pc->pbufout, newlen);
        if(!tmp)
            return 1;
        memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);
        *pc->pbufout = tmp;
        *pc->pbufoutlen = newlen;
    }
    else
    {
        /* Write the data to the cache. */
        memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);
        pc->cache_cur += to_write_len;
    }

    return 0;
}

/*
 * Read `in` to the end, creating a leaf node for every byte value that
 * occurs and counting how often each one occurs.
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int seen = 0;
    int ch;

    /* Start from an empty table. */
    init_frequencies(pSF);

    for(ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        unsigned char sym = ch;
        huffman_node **slot = &(*pSF)[sym];

        /* First occurrence of this symbol: create its leaf. */
        if(*slot == NULL)
            *slot = new_leaf_node(sym);

        (*slot)->count += 1;
        seen += 1;
    }

    return seen;
}

/*
 * Memory-buffer counterpart of get_symbol_frequencies: count how often
 * each byte value occurs in bufin[0 .. bufinlen-1], creating leaf nodes on
 * first use. Returns the number of bytes counted.
 */
static unsigned int
get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,
                                   const unsigned char *bufin,
                                   unsigned int bufinlen)
{
    unsigned int seen = 0;
    unsigned int remaining;
    const unsigned char *cursor = bufin;

    /* Start from an empty table. */
    init_frequencies(pSF);

    for(remaining = bufinlen; remaining > 0; --remaining)
    {
        unsigned char sym = *cursor++;
        huffman_node **slot = &(*pSF)[sym];

        /* First occurrence of this symbol: create its leaf. */
        if(*slot == NULL)
            *slot = new_leaf_node(sym);

        (*slot)->count += 1;
        seen += 1;
    }

    return seen;
}

/*
 * qsort comparison function for arrays of huffman_node pointers.
 * Orders nodes by ascending count; NULL entries compare greater than any
 * node, so they end up at the end of the array.
 */
static int
SFComp(const void *p1, const void *p2)
{
    const huffman_node *lhs = *(const huffman_node**)p1;
    const huffman_node *rhs = *(const huffman_node**)p2;

    /* NULL sorts after everything else; two NULLs are equal. */
    if(lhs == NULL)
        return rhs == NULL ? 0 : 1;
    if(rhs == NULL)
        return -1;

    /* Yields -1, 0 or 1 without any risk of overflow. */
    return (lhs->count > rhs->count) - (lhs->count < rhs->count);
}

/*
 * Debug helper: print every slot of the frequency table to standard output
 * as "symbol, count", or "NULL" for an empty slot.
 */
#if 1
static void
print_freqs(SymbolFrequencies * pSF)
{
    size_t i;
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*pSF)[i])
            /* count is an unsigned long, so it needs %lu. */
            printf("%d, %lu\n", (*pSF)[i]->symbol, (*pSF)[i]->count);
        else
            printf("NULL\n");
    }
}
#endif

/*
 * build_symbol_encoder visits every leaf below `subtree` (zero branch
 * first, then one branch) and stores the code of each leaf in the encoder
 * table at the index of the leaf's symbol. A NULL subtree is ignored.
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(subtree == NULL)
        return;

    if(!subtree->isLeaf)
    {
        /* Interior node: recurse into both children. */
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    /* Leaf: derive its code from its path to the root. */
    (*pSF)[subtree->symbol] = new_code(subtree);
}

/*
 * calculate_huffman_codes turns pSF into an array
 * with a single entry that is the root of the
 * huffman tree. The return value is a SymbolEncoder,
 * which is an array of huffman codes index by symbol value.
 *
 * Returns NULL when memory cannot be allocated. An empty table (no symbol
 * occurred) yields an encoder in which every slot is NULL.
 *
 * NOTE(review): the debug dumps below go to standard output, which is also
 * the default destination of the compressed data in huffcode; confirm this
 * is intended before compressing to stdout.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

#if 1
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort the symbol frequency array by ascending frequency.
     * SFComp moves the NULL (unused) slots to the end. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 1
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Get the number of symbols. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * The condition is written as i + 1 < n because n is unsigned:
     * with n == 0 (empty input) "n - 1" would wrap around and the loop
     * would dereference NULL entries.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        huffman_node *merged;

        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        merged = new_nonleaf_node(m1->count + m2->count, m1, m2);
        if(!merged)
            return NULL;
        (*pSF)[0] = m1->parent = m2->parent = merged;
        (*pSF)[1] = NULL;

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(!pSE)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}

/*
 * Write the huffman code table. The format is:
 * 4 byte code count in network byte order.
 * 4 byte number of bytes encoded
 *   (if you decode the data, you should get this number of bytes)
 * code1
 * ...
 * codeN, where N is the count read at the beginning of the file.
 * Each codeI has the following format:
 * 1 byte symbol, 1 byte code bit length, code bytes.
 * Each entry has numbytes_from_numbits code bytes.
 * The last byte of each code may have extra bits, if the number of
 * bits in the code is not a multiple of 8.
 *
 * Returns 0 on success and 1 on a write error.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i, count = 0;
    /* The count field is written from an unsigned int, the same type that
     * read_code_table reads it into. Writing it from an unsigned long
     * emits 8 bytes where that type is 64 bits wide, which breaks the
     * 4-byte field the reader expects. */
    unsigned int count_be;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network (big-endian) byte order. */
    count_be = htonl((unsigned int)count);
    if(fwrite(&count_be, sizeof(count_be), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;
            /* Write the 1 byte code bit length. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;
            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }

    return 0;
}

/*
 * Write the code table through the cache `pc`, in the same layout that
 * write_code_table writes to a file: 4 byte entry count and 4 byte data
 * length (both in network byte order), then for each entry 1 byte symbol,
 * 1 byte code bit length and the code bytes.
 *
 * Returns 0 on success and 1 when write_cache fails.
 */
static int
write_code_table_to_memory(buf_cache *pc,
                           SymbolEncoder *se,
                           unsigned int symbol_count)
{
    unsigned long i, count = 0;
    /* The count field is written from an unsigned int, the same type that
     * read_code_table_from_memory reads it into. Writing it from an
     * unsigned long emits 8 bytes where that type is 64 bits wide, which
     * breaks the 4-byte field the reader expects. */
    unsigned int count_be;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. */
    count_be = htonl((unsigned int)count);

    if(write_cache(pc, &count_be, sizeof(count_be)))
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(write_cache(pc, &symbol_count, sizeof(symbol_count)))
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            /* The value of i is < MAX_SYMBOLS (256), so it can
            be stored in an unsigned char. */
            unsigned char uc = (unsigned char)i;
            /* Write the 1 byte symbol. */
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;
            /* Write the 1 byte code bit length. */
            uc = (unsigned char)p->numbits;
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;
            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(write_cache(pc, p->bits, numbytes))
                return 1;
        }
    }

    return 0;
}

/*
 * read_code_table builds a Huffman tree from the code
 * in the in file. This function returns NULL on error.
 * The returned value should be freed with free_huffman_tree.
 */
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{
    //在解码端重建huffman树
    huffman_node *root = new_nonleaf_node(0, NULL, NULL);
    unsigned int count;

    /* Read the number of entries.
       (it is stored in network byte order). */
    if(fread(&count, sizeof(count), 1, in) != 1)
    {
        free_huffman_tree(root);
        return NULL;
    }

    count = ntohl(count);//将一个无符号长整形数从网络字节顺序转换为主机字节顺序

    /* Read the number of data bytes this encoding represents. */
    if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)
    {
        free_huffman_tree(root);
        return NULL;
    }

    *pDataBytes = ntohl(*pDataBytes);


    /* Read the entries. */
    while(count-- > 0)
    {
        int c;
        unsigned int curbit;
        unsigned char symbol;
        unsigned char numbits;
        unsigned char numbytes;
        unsigned char *bytes;
        huffman_node *p = root;

        if((c = fgetc(in)) == EOF)//读取符号并判断
        {
            free_huffman_tree(root);
            return NULL;
        }
        symbol = (unsigned char)c;

        if((c = fgetc(in)) == EOF)//读取字符长度并判断
        {
            free_huffman_tree(root);
            return NULL;
        }

        numbits = (unsigned char)c;
        numbytes = (unsigned char)numbytes_from_numbits(numbits);
        bytes = (unsigned char*)malloc(numbytes);
        if(fread(bytes, 1, numbytes, in) != numbytes)
        {
            free(bytes);
            free_huffman_tree(root);
            return NULL;
        }

        /*
         * Add the entry to the Huffman tree. The value
         * of the current bit is used switch between
         * zero and one child nodes in the tree. New nodes
         * are added as needed in the tree.
         */
        for(curbit = 0; curbit < numbits; ++curbit)
        {
            if(get_bit(bytes, curbit))
            {
                if(p->one == NULL)
                {
                    p->one = curbit == (unsigned char)(numbits - 1)
                        ? new_leaf_node(symbol)
                        : new_nonleaf_node(0, NULL, NULL);
                    p->one->parent = p;
                }
                p = p->one;
            }
            else
            {
                if(p->zero == NULL)
                {
                    p->zero = curbit == (unsigned char)(numbits - 1)
                        ? new_leaf_node(symbol)
                        : new_nonleaf_node(0, NULL, NULL);
                    p->zero->parent = p;
                }
                p = p->zero;
            }
        }

        free(bytes);
    }

    return root;
}

/*
 * Copy `readlen` bytes from buf[*pindex ...] into `bufout` and advance
 * *pindex by `readlen`.
 *
 * buf     - source buffer of `buflen` bytes
 * pindex  - in/out read position within buf
 * bufout  - destination, at least `readlen` bytes
 *
 * Returns 0 on success and 1 when fewer than `readlen` bytes remain; in
 * that case nothing is copied and *pindex is unchanged. A read that ends
 * exactly at the end of the buffer is valid.
 */
static int
memread(const unsigned char* buf,
        unsigned int buflen,
        unsigned int *pindex,
        void* bufout,
        unsigned int readlen)
{
    assert(buf && pindex && bufout);
    assert(buflen >= *pindex);
    if(buflen < *pindex)
        return 1;
    /* Compare against the bytes remaining instead of adding readlen to the
     * index: the subtraction cannot wrap, and ">" (not ">=") allows the
     * last bytes of the buffer to be read. */
    if(readlen > buflen - *pindex)
        return 1;
    memcpy(bufout, buf + *pindex, readlen);
    *pindex += readlen;
    return 0;
}

static huffman_node*
read_code_table_from_memory(const unsigned char* bufin,
                            unsigned int bufinlen,
                            unsigned int *pindex,
                            unsigned int *pDataBytes)
{
    huffman_node *root = new_nonleaf_node(0, NULL, NULL);
    unsigned int count;

    /* Read the number of entries.
       (it is stored in network byte order). */
    if(memread(bufin, bufinlen, pindex, &count, sizeof(count)))
    {
        free_huffman_tree(root);
        return NULL;
    }

    count = ntohl(count);

    /* Read the number of data bytes this encoding represents. */
    if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))
    {
        free_huffman_tree(root);
        return NULL;
    }

    *pDataBytes = ntohl(*pDataBytes);

    /* Read the entries. */
    while(count-- > 0)
    {
        unsigned int curbit;
        unsigned char symbol;
        unsigned char numbits;
        unsigned char numbytes;
        unsigned char *bytes;
        huffman_node *p = root;

        if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))
        {
            free_huffman_tree(root);
            return NULL;
        }

        if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))
        {
            free_huffman_tree(root);
            return NULL;
        }

        numbytes = (unsigned char)numbytes_from_numbits(numbits);
        bytes = (unsigned char*)malloc(numbytes);
        if(memread(bufin, bufinlen, pindex, bytes, numbytes))
        {
            free(bytes);
            free_huffman_tree(root);
            return NULL;
        }

        /*
         * Add the entry to the Huffman tree. The value
         * of the current bit is used switch between
         * zero and one child nodes in the tree. New nodes
         * are added as needed in the tree.
         */
        for(curbit = 0; curbit < numbits; ++curbit)
        {
            if(get_bit(bytes, curbit))
            {
                if(p->one == NULL)
                {
                    p->one = curbit == (unsigned char)(numbits - 1)
                        ? new_leaf_node(symbol)
                        : new_nonleaf_node(0, NULL, NULL);
                    p->one->parent = p;
                }
                p = p->one;
            }
            else
            {
                if(p->zero == NULL)
                {
                    p->zero = curbit == (unsigned char)(numbits - 1)
                        ? new_leaf_node(symbol)
                        : new_nonleaf_node(0, NULL, NULL);
                    p->zero->parent = p;
                }
                p = p->zero;
            }
        }

        free(bytes);
    }

    return root;
}

/*
 * Encode the rest of `in` with the codes in `se` and write the packed bits
 * to `out`. Bits are packed starting at bit 0 of each output byte; a
 * partially filled last byte is written with its unused bits set to zero.
 *
 * Returns 0 on success and 1 when a symbol has no code in `se` or when
 * writing to `out` fails.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;
    unsigned char curbit = 0;
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* No code for this symbol: the encoder does not match the input. */
        if(!code)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= get_bit(code->bits, i) << curbit;

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}

/*
 * Encode bufin[0 .. bufinlen-1] with the codes in `se` and write the packed
 * bits through the cache `pc`. Bits are packed starting at bit 0 of each
 * output byte; a partially filled last byte is written as it is.
 *
 * Returns 0 on success and 1 when write_cache fails.
 */
static int
do_memory_encode(buf_cache *pc,
                 const unsigned char* bufin,
                 unsigned int bufinlen,
                 SymbolEncoder *se)
{
    unsigned char pending = 0;   /* output byte being assembled */
    unsigned char used_bits = 0; /* number of bits already in `pending` */
    unsigned int pos;

    for(pos = 0; pos < bufinlen; ++pos)
    {
        const huffman_code *code = (*se)[bufin[pos]];
        unsigned long bit;

        for(bit = 0; bit < code->numbits; ++bit)
        {
            /* Add the current code bit to the pending byte. */
            pending |= get_bit(code->bits, bit) << used_bits;
            ++used_bits;

            /* A full byte is written out and the accumulator restarts. */
            if(used_bits == 8)
            {
                if(write_cache(pc, &pending, sizeof(pending)))
                    return 1;
                pending = 0;
                used_bits = 0;
            }
        }
    }

    /* Nothing left over: the last code ended on a byte boundary. */
    if(used_bits == 0)
        return 0;

    return write_cache(pc, &pending, sizeof(pending));
}

//step3:add by yzhang for huffman statistics
/*
 * huffST_getSymFrequencies stores the relative frequency of every
 * symbol in st->freq: count / total_count for symbols that have an
 * entry in *SF, and 0 for symbols that have none.
 *
 * Returns 1 when the counts of all present symbols add up to
 * total_count, otherwise 0.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{
    int sym;
    int seen = 0;   /* running sum of the per-symbol counts */

    for(sym = 0; sym < MAX_SYMBOLS; ++sym)
    {
        /* Symbols without an entry have frequency 0. */
        if(!(*SF)[sym])
        {
            st->freq[sym] = 0;
            continue;
        }

        st->freq[sym] = (float)(*SF)[sym]->count / total_count;
        seen += (*SF)[sym]->count;
    }

    return seen == total_count ? 1 : 0;
}

/*
 * huffST_getcodeword copies the code word of every symbol from the
 * encoder table *se into the statistics record *st.
 *
 * For each symbol that has a code, st->numbits receives the code
 * length in bits and st->bits receives the bytes that hold the code
 * bits. Symbols without a code get a length of 0 and their st->bits
 * entry is left untouched.
 *
 * Always returns 0.
 */
int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{
    unsigned long i, j;

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            st->numbits[i] = p->numbits;

            /* The code is stored in whole bytes, so copy byte by byte.
             * NOTE(review): assumes st->bits[i] can hold numbytes
             * bytes - confirm against the huffman_stat declaration. */
            numbytes = numbytes_from_numbits(p->numbits);
            for(j = 0; j < numbytes; j++)
                st->bits[i][j] = p->bits[j];
        }
        else
            st->numbits[i] = 0;
    }

    return 0;
}

/*
 * output_huffman_statistics writes the statistics in *st to out_Table
 * as text: one header line, then one line per symbol value from 0 to
 * MAX_SYMBOLS - 1 holding the symbol, its frequency, its code length
 * and the individual code bits.
 */
void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{
    int sym;

    fprintf(out_Table,"symbol\t   freq\t   codelength\t   code\n");

    for(sym = 0; sym < MAX_SYMBOLS; ++sym)
    {
        int pos = 0;

        fprintf(out_Table,"%d\t   ",sym);
        fprintf(out_Table,"%f\t   ",st->freq[sym]);
        fprintf(out_Table,"%d\t    ",st->numbits[sym]);

        /* Print the code one bit at a time; a symbol with a code
         * length of 0 prints nothing here. */
        while(pos < st->numbits[sym])
        {
            unsigned char bit = get_bit(st->bits[sym], pos);
            fprintf(out_Table,"%d",bit);
            ++pos;
        }

        fprintf(out_Table,"\n");
    }
}
//end by yzhang
/*
 * huffman_encode_file huffman encodes in to out.
 *
 * in        - input stream; it is read once to count the symbols,
 *             rewound, and read a second time to encode it
 * out       - output stream; receives the code table followed by the
 *             encoded data
 * out_Table - stream that receives the per-symbol statistics table
 *             (frequency, code length, code). May be NULL, in which
 *             case no statistics are collected or written.
 *
 * Returns 0 on success, otherwise the non-zero result of
 * write_code_table or do_file_encode.
 *
 * Changed by yzhang for huffman statistics: the signature went from
 * (FILE *in, FILE *out) to (FILE *in, FILE *out, FILE *out_Table).
 */
int
huffman_encode_file(FILE *in, FILE *out, FILE *out_Table)
{
    SymbolFrequencies sf;
    SymbolEncoder *se;
    huffman_node *root = NULL;
    int rc;
    unsigned int symbol_count;
    /* Statistics record; only filled in when out_Table is given. */
    huffman_stat hs;

    /* Get the frequency of each symbol in the input file. */
    symbol_count = get_symbol_frequencies(&sf, in);

    /* Record the symbol frequencies before the code calculation
     * below works on sf. */
    if(out_Table)
        huffST_getSymFrequencies(&sf, &hs, symbol_count);

    /* Build an optimal table from the symbolCount. Afterwards sf[0]
     * is taken as the root of the tree that is freed at the end. */
    se = calculate_huffman_codes(&sf);
    root = sf[0];

    /* Record the code words and write the statistics table. Passing a
     * NULL stream to fprintf is undefined, so skip this step when the
     * caller did not supply a table stream. */
    if(out_Table)
    {
        huffST_getcodeword(se, &hs);
        output_huffman_statistics(&hs, out_Table);
    }

    /* Scan the file again and, using the table
       previously built, encode it into the output file. */
    rewind(in);
    rc = write_code_table(out, se, symbol_count);
    if(rc == 0)
        rc = do_file_encode(in, out, se);

    /* Free the Huffman tree. */
    free_huffman_tree(root);
    free_encoder(se);
    return rc;
}

/*
 * huffman_decode_file huffman decodes in to out.
 *
 * in  - input stream holding the code table followed by the encoded
 *       data
 * out - output stream that receives the decoded symbols
 *
 * Returns 0 when decoding finishes, or 1 when the code table cannot be
 * read or when the encoded bits lead to a missing child node in the
 * decoding tree.
 */
int
huffman_decode_file(FILE *in, FILE *out)
{
    huffman_node *root, *p;
    int c;
    unsigned int data_count;

    /* Read the Huffman code table. */
    root = read_code_table(in, &data_count);
    if(!root)
        return 1;

    /* Decode the file: walk the tree one bit at a time, starting with
     * the least significant bit of each byte, until data_count symbols
     * have been written or the input ends. */
    p = root;
    while(data_count > 0 && (c = fgetc(in)) != EOF)
    {
        unsigned char byte = (unsigned char)c;
        unsigned char mask = 1;

        /* mask becomes 0 after the eighth shift, which ends the byte. */
        while(data_count > 0 && mask)
        {
            p = byte & mask ? p->one : p->zero;
            mask <<= 1;

            /* A missing child means the bits do not spell a code that
             * exists in the tree; stop instead of dereferencing NULL. */
            if(!p)
            {
                free_huffman_tree(root);
                return 1;
            }

            if(p->isLeaf)
            {
                fputc(p->symbol, out);
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);
    return 0;
}

#define CACHE_SIZE 1024

/*
 * huffman_encode_memory huffman encodes the bufinlen bytes at bufin.
 * The output (code table followed by the encoded data) is written
 * through a cache that is set up with pbufout and pbufoutlen.
 *
 * Returns 1 when pbufout or pbufoutlen is NULL or the cache cannot be
 * initialised; otherwise the result of writing the code table or, when
 * that succeeded, of encoding the data (0 on success).
 */
int huffman_encode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **pbufout,
                          unsigned int *pbufoutlen)
{
    SymbolFrequencies freqs;
    SymbolEncoder *encoder;
    huffman_node *tree = NULL;
    buf_cache out_cache;
    unsigned int total;
    int status;

    /* Both output arguments are required. */
    if(pbufout == NULL || pbufoutlen == NULL)
        return 1;

    if(init_cache(&out_cache, CACHE_SIZE, pbufout, pbufoutlen) != 0)
        return 1;

    /* First pass over the input: count how often each symbol occurs. */
    total = get_symbol_frequencies_from_memory(&freqs, bufin, bufinlen);

    /* Derive the code table; freqs[0] is then kept as the tree root. */
    encoder = calculate_huffman_codes(&freqs);
    tree = freqs[0];

    /* Second pass: emit the code table, then the encoded data. */
    status = write_code_table_to_memory(&out_cache, encoder, total);
    if(!status)
        status = do_memory_encode(&out_cache, bufin, bufinlen, encoder);

    /* Push out whatever is still held in the cache. */
    flush_cache(&out_cache);

    /* Release the tree, the encoder table and the cache. */
    free_huffman_tree(tree);
    free_encoder(encoder);
    free_cache(&out_cache);
    return status;
}

/*
 * huffman_decode_memory huffman decodes the bufinlen bytes at bufin
 * (code table followed by encoded data) into a newly allocated buffer.
 *
 * On success *pbufout points to the malloc'ed output buffer, which the
 * caller must release with free(), and *pbufoutlen holds the number of
 * decoded bytes.
 *
 * Returns 0 on success. Returns 1, leaving *pbufout and *pbufoutlen
 * untouched, when an output argument is NULL, the code table cannot be
 * read, the output buffer cannot be allocated, or the encoded bits
 * lead to a missing child node in the decoding tree.
 */
int huffman_decode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **pbufout,
                          unsigned int *pbufoutlen)
{
    huffman_node *root, *p;
    unsigned int data_count;
    unsigned int i = 0;
    unsigned char *buf;
    unsigned int bufcur = 0;

    /* Ensure the arguments are valid. */
    if(!pbufout || !pbufoutlen)
        return 1;

    /* Read the Huffman code table; i is passed by address so the
     * decode loop below continues from where the reader left it. */
    root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);
    if(!root)
        return 1;

    /* One output byte per decoded symbol. malloc(0) may legitimately
     * return NULL, so only a NULL result for a non-zero size is an
     * allocation failure. */
    buf = (unsigned char*)malloc(data_count);
    if(!buf && data_count > 0)
    {
        free_huffman_tree(root);
        return 1;
    }

    /* Decode the memory: walk the tree one bit at a time, starting
     * with the least significant bit of each byte. */
    p = root;
    for(; i < bufinlen && data_count > 0; ++i)
    {
        unsigned char byte = bufin[i];
        unsigned char mask = 1;

        /* mask becomes 0 after the eighth shift, which ends the byte. */
        while(data_count > 0 && mask)
        {
            p = byte & mask ? p->one : p->zero;
            mask <<= 1;

            /* A missing child means the bits do not spell a code that
             * exists in the tree; stop instead of dereferencing NULL. */
            if(!p)
            {
                free(buf);
                free_huffman_tree(root);
                return 1;
            }

            if(p->isLeaf)
            {
                /* bufcur stays below the allocated size because every
                 * stored byte also decrements data_count. */
                buf[bufcur++] = p->symbol;
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);
    *pbufout = buf;
    *pbufoutlen = bufcur;
    return 0;
}

三、结果分析

实验选取了10种类型的文件进行Huffman编码,分别为bmp、doc、exe、pdf、png、ppt、rar、wav、xls、yuv。对编码后的文件进行分析,得到以下结果图表:
这里写图片描述
可以看到,进行Huffman编码后,大多数文件都变小了,压缩比在1到4之间。但也有rar这样经过编码后不小反大的文件。

再观察每个文件的字符概率分布情况:
实验三 Huffman编解码算法实现与压缩效率分析_第2张图片
实验三 Huffman编解码算法实现与压缩效率分析_第3张图片
实验三 Huffman编解码算法实现与压缩效率分析_第4张图片
实验三 Huffman编解码算法实现与压缩效率分析_第5张图片
实验三 Huffman编解码算法实现与压缩效率分析_第6张图片
实验三 Huffman编解码算法实现与压缩效率分析_第7张图片
实验三 Huffman编解码算法实现与压缩效率分析_第8张图片
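对比以上图表可以发现,压缩比是由符号的概率分布决定的。与实验选用的bmp、doc等字符概率分布比较集中的文件相比,字符概率分布较为均匀的文件(如rar、png、pdf)信源熵更大,压缩比更小。这是因为Huffman编码的平均码长以信源熵为下界:当256个符号接近等概率时,熵接近8 bit/符号,几乎没有可压缩的冗余,再加上随文件一起存储的码表开销,编码后的文件甚至可能比原文件更大(如rar)。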

你可能感兴趣的:(实验三 Huffman编解码算法实现与压缩效率分析)