对原始文件进行Huffman编码,首先需要解决以下三个问题:统计各信源符号出现的频率、根据频率建立Huffman树、由Huffman树生成各符号的码字(码表)。
这三个问题在程序中的实现思路如下图:
将待编码文件里的数据参照已形成的Huffman码表一一进行转换,就可以得到编码后的文件了。
Huffman解码是查表+翻译的过程。读取随接收文件传来的码表后,再逐位读取文件实际数据,对照码表进行翻译即可。
流程中最关键的对Huffman树的操作在程序中主要通过两个结构体实现:Huffman_node和Huffman_code。
建立的二叉树上每个节点都以Huffman_node类型存在。节点之间的主要关系有父子、兄弟,Huffman_node中定义了指向父节点的指针*parent和指向孩子的指针*zero, *one来表述节点与节点之间的关系。除此之外,还有节点本身的属性:isLeaf、count、symbol。
而编码码字定义为了Huffman_code,本身属性包括码字占用的比特数和码字本身。
具体程序如下,部分理解在注释中给出。
/*
* huffcode - Encode/Decode files using Huffman encoding.
* http://huffman.sourceforge.net
* Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "huffman.h"
#include
#include
#include
#include
#include
#ifdef WIN32
#include
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include
#endif
static int memory_encode_file(FILE *in, FILE *out);
static int memory_decode_file(FILE *in, FILE *out);
/* Print the program name, version and copyright banner to `out`. */
static void
version(FILE *out)
{
    static const char banner[] =
        "huffcode 0.3\n"
        "Copyright (C) 2003 Douglas Ryan Richardson"
        "; Gauss Interprise, Inc\n";

    fputs(banner, out);
}
/*
 * Print the command line help text to `out`.
 *
 * All help lines are adjacent string literals so that they are
 * concatenated into the single string that fputs() expects; the
 * stream is the only other argument.
 */
static void
usage(FILE* out)
{
    fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c] [-t<statistics file>]\n"
          "-i - input file (default is standard input)\n"
          "-o - output file (default is standard output)\n"
          "-d - decompress\n"
          "-c - compress (default)\n"
          "-m - read file into memory, compress, then write to file (not default)\n"
          /* step1: by yzhang, for huffman statistics */
          "-t - output huffman statistics\n",
          out);
}
/*
 * Program entry point.
 *
 * Parses the command line, opens the requested streams, runs the
 * selected operation (file or in-memory, compress or decompress) and
 * returns its result as the exit status (0 on success, non-zero on
 * failure). Every opened stream is closed on every path that reaches
 * the cleanup label.
 */
int
main(int argc, char** argv)
{
    char memory = 0;
    char compress = 1;
    int opt;
    int rc = 0;
    const char *file_in = NULL, *file_out = NULL;
    /* step1: added by yzhang for huffman statistics */
    const char *file_out_table = NULL;
    FILE *in = stdin;
    FILE *out = stdout;
    /* step1: added by yzhang for huffman statistics */
    FILE *outTable = NULL;

    /* Get the command line arguments. */
    while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1)
    {
        switch(opt)
        {
        case 'i':
            file_in = optarg;
            break;
        case 'o':
            file_out = optarg;
            break;
        case 'c':
            compress = 1; /* compress */
            break;
        case 'd':
            compress = 0; /* decompress */
            break;
        case 'h':
            usage(stdout);
            return 0;
        case 'v':
            version(stdout);
            return 0;
        case 'm':
            memory = 1;
            break;
        /* by yzhang for huffman statistics */
        case 't':
            file_out_table = optarg;
            break;
        default:
            usage(stderr);
            return 1;
        }
    }

    /* If an input file is given then open it. */
    if(file_in)
    {
        in = fopen(file_in, "rb");
        if(!in)
        {
            fprintf(stderr,
                    "Can't open input file '%s': %s\n",
                    file_in, strerror(errno));
            rc = 1;
            goto cleanup;
        }
    }

    /* If an output file is given then create it. */
    if(file_out)
    {
        out = fopen(file_out, "wb");
        if(!out)
        {
            fprintf(stderr,
                    "Can't open output file '%s': %s\n",
                    file_out, strerror(errno));
            rc = 1;
            goto cleanup;
        }
    }

    /* by yzhang for huffman statistics: optional statistics table file */
    if(file_out_table)
    {
        outTable = fopen(file_out_table, "w");
        if(!outTable)
        {
            fprintf(stderr,
                    "Can't open output file '%s': %s\n",
                    file_out_table, strerror(errno));
            rc = 1;
            goto cleanup;
        }
    }

    if(memory)
    {
        rc = compress ?
            memory_encode_file(in, out) : memory_decode_file(in, out);
    }
    else if(compress)
    {
        /* step1: changed by yzhang to also pass the statistics stream */
        rc = huffman_encode_file(in, out, outTable);
    }
    else
    {
        rc = huffman_decode_file(in, out);
    }

cleanup:
    if(in)
        fclose(in);
    /* A failed close of a written stream means buffered data was lost. */
    if(out && fclose(out) != 0 && rc == 0)
        rc = 1;
    if(outTable && fclose(outTable) != 0 && rc == 0)
        rc = 1;
    return rc;
}
/*
 * Read all of `in` into memory, Huffman-encode it with
 * huffman_encode_memory and write the result to `out`.
 *
 * Returns 0 on success and 1 on allocation, read, encode or write
 * failure. The read loop stops on a short read and checks ferror(),
 * so a read error ends the loop instead of spinning on !feof().
 */
static int
memory_encode_file(FILE *in, FILE *out)
{
    unsigned char *buf = NULL, *bufout = NULL;
    unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;
    int rc;

    assert(in && out);

    /* Read the file into memory, growing the buffer one chunk at a time. */
    for(;;)
    {
        unsigned char *tmp;
        size_t got;

        /* Refuse to wrap the unsigned length. */
        if(len > (unsigned int)-1 - inc)
        {
            free(buf);
            return 1;
        }

        tmp = (unsigned char*)realloc(buf, len + inc);
        if(!tmp)
        {
            free(buf);
            return 1;
        }
        buf = tmp;
        len += inc;

        got = fread(buf + cur, 1, inc, in);
        cur += (unsigned int)got;
        if(got < inc)
        {
            if(ferror(in))
            {
                free(buf);
                return 1;
            }
            break; /* end of file */
        }
    }

    /* Encode the memory. */
    rc = huffman_encode_memory(buf, cur, &bufout, &bufoutlen);
    free(buf);
    if(rc)
    {
        free(bufout); /* still NULL unless the encoder stored a buffer */
        return 1;
    }

    /* Write the memory to the file. */
    rc = fwrite(bufout, 1, bufoutlen, out) != bufoutlen ? 1 : 0;
    free(bufout);
    return rc;
}
/*
 * Read all of `in` into memory, Huffman-decode it with
 * huffman_decode_memory and write the result to `out`.
 *
 * Returns 0 on success and 1 on allocation, read, decode or write
 * failure. The read loop stops on a short read and checks ferror(),
 * so a read error ends the loop instead of spinning on !feof().
 */
static int
memory_decode_file(FILE *in, FILE *out)
{
    unsigned char *buf = NULL, *bufout = NULL;
    unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;
    int rc;

    assert(in && out);

    /* Read the file into memory, growing the buffer one chunk at a time. */
    for(;;)
    {
        unsigned char *tmp;
        size_t got;

        /* Refuse to wrap the unsigned length. */
        if(len > (unsigned int)-1 - inc)
        {
            free(buf);
            return 1;
        }

        tmp = (unsigned char*)realloc(buf, len + inc);
        if(!tmp)
        {
            free(buf);
            return 1;
        }
        buf = tmp;
        len += inc;

        got = fread(buf + cur, 1, inc, in);
        cur += (unsigned int)got;
        if(got < inc)
        {
            if(ferror(in))
            {
                free(buf);
                return 1;
            }
            break; /* end of file */
        }
    }

    /* Decode the memory. */
    rc = huffman_decode_memory(buf, cur, &bufout, &bufoutlen);
    free(buf);
    if(rc)
    {
        free(bufout); /* still NULL unless the decoder stored a buffer */
        return 1;
    }

    /* Write the memory to the file. */
    rc = fwrite(bufout, 1, bufoutlen, out) != bufoutlen ? 1 : 0;
    free(bufout);
    return rc;
}
/*
* huffman - Encode/Decode files using Huffman encoding.
* http://huffman.sourceforge.net
* Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include
#include
#include
#include
#include "huffman.h"
#ifdef WIN32
#include
#include
#define alloca _alloca
#else
#include
#endif
/*
 * One node of the Huffman binary tree.
 *
 * A leaf carries a symbol; an internal node carries links to its two
 * children. The child links and the symbol share storage through the
 * anonymous union, so isLeaf decides which member may be read.
 */
typedef struct huffman_node_tag
{
/* Set to 1 by new_leaf_node and to 0 by new_nonleaf_node. */
unsigned char isLeaf;
/* Occurrence count: incremented per symbol for leaves; for merged
   nodes it is the sum of the two children's counts. */
unsigned long count;
/* Parent node; 0 for a node that has not been attached yet. */
struct huffman_node_tag *parent;
union
{
/* Valid only when isLeaf == 0. */
struct
{
/* Children reached by a 0 bit and by a 1 bit respectively. */
struct huffman_node_tag *zero, *one;
};
/* Valid only when isLeaf != 0: the byte value this leaf stands for. */
unsigned char symbol;
};
} huffman_node;
/*
 * The code word assigned to one symbol: its length in bits plus the
 * packed bit string (built by new_code, released by free_code).
 */
typedef struct huffman_code_tag
{
/* The length of this code in bits. */
unsigned long numbits;
/* The bits that make up this code, packed least significant bit
first: the first bit is at position 0 in bits[0], the second
bit is at position 1 in bits[0], the eighth
bit is at position 7 in bits[0], and the ninth
bit is at position 0 in bits[1]. */
unsigned char *bits;
} huffman_code;
//step2:add by yzhang for huffman statistics
/* Per-symbol statistics for the source alphabet, indexed by symbol
   value: relative frequency, code length in bits, and the code word. */
typedef struct huffman_statistics_result
{
/* Symbol count divided by the total count; 0 for absent symbols. */
float freq[256];
/* Code length in bits; 0 for symbols that have no code. */
unsigned long numbits[256];
/* Packed code bits, copied byte by byte from huffman_code.bits. */
unsigned char bits[256][100];
}huffman_stat;
/*huffman_stat *init_huffstatistics()
{ huffman_stat *p;
int i;
p = (huffman_stat*)malloc(sizeof(huffman_stat));
p->freq = (float *)malloc(sizeof(float)*256 );
p->numbits = (unsigned long *) malloc(sizeof(unsigned long)*256);
for (i=0 ; i<256;i++)
p->bits[i] = (unsigned char *)malloc(sizeof(unsigned char)*100);
return p;
}*/
//end by yzhang
/* Convert a bit count into the number of bytes needed to hold it:
   a remainder that does not fill a byte still takes one whole byte. */
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
    unsigned long whole = numbits >> 3;

    if((numbits & 7u) != 0)
        whole += 1;
    return whole;
}
/*
 * get_bit extracts bit number i from the packed array `bits`
 * (bit 0 is the least significant bit of bits[0]) and returns it
 * as 0 or 1.
 */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
    const unsigned long byte_index = i >> 3;
    const unsigned int shift = (unsigned int)(i & 7u);

    return (unsigned char)((bits[byte_index] >> shift) & 1u);
}
/*
 * Reverse the order of the first `numbits` bits stored in `bits`, in place.
 *
 * new_code collects code bits while walking from a leaf up to the root,
 * so the bits arrive in the opposite order and must be reversed. The
 * result is re-packed from bit 0 of byte 0; unused bits of the last
 * byte are zero.
 */
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
    const unsigned long nbytes = numbytes_from_numbits(numbits);
    /* Stack scratch buffer; released automatically on return. */
    unsigned char *scratch = (unsigned char*)alloca(nbytes);
    unsigned long dst;

    memset(scratch, 0, nbytes);

    for(dst = 0; dst < numbits; ++dst)
    {
        /* Destination bit dst takes its value from the mirrored position. */
        const unsigned long src = numbits - dst - 1;

        if(get_bit(bits, src))
            scratch[dst / 8] |= (unsigned char)(1u << (dst % 8));
    }

    /* Copy the reversed bits back over the caller's buffer. */
    memcpy(bits, scratch, nbytes);
}
/*
 * new_code builds a huffman_code from a leaf in a Huffman tree.
 *
 * The code is collected by walking from the leaf up to the root,
 * appending one bit per edge (1 when the node is its parent's `one`
 * child, 0 otherwise) and then reversing the bits, because a code is
 * read from the root downwards.
 *
 * A node without a parent yields a code of 0 bits with bits == NULL.
 * Returns a heap-allocated code to be released with free_code, or
 * NULL if memory allocation fails.
 */
static huffman_code*
new_code(const huffman_node* leaf)
{
    unsigned long numbits = 0;
    unsigned char* bits = NULL;
    huffman_code *p;

    /* Climb from the leaf towards the root, one bit per edge. */
    while(leaf && leaf->parent)
    {
        const huffman_node *parent = leaf->parent;
        unsigned char cur_bit = (unsigned char)(numbits % 8);
        unsigned long cur_byte = numbits / 8;

        /* If we need another byte to hold the code, then allocate it. */
        if(cur_bit == 0)
        {
            unsigned char *grown =
                (unsigned char*)realloc(bits, cur_byte + 1);
            if(!grown)
            {
                /* realloc leaves the old block valid on failure. */
                free(bits);
                return NULL;
            }
            bits = grown;
            bits[cur_byte] = 0; /* Initialize the new byte. */
        }

        /* If a one must be added then or it in. If a zero
         * must be added then do nothing, since the byte
         * was initialized to zero. */
        if(leaf == parent->one)
            bits[cur_byte] |= (unsigned char)(1u << cur_bit);

        ++numbits;
        leaf = parent;
    }

    if(bits)
        reverse_bits(bits, numbits);

    p = (huffman_code*)malloc(sizeof(huffman_code));
    if(!p)
    {
        free(bits);
        return NULL;
    }
    p->numbits = numbits;
    p->bits = bits;
    return p; /* holds the finished code word and its length */
}
#define MAX_SYMBOLS 256
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];
/*
 * Allocate a leaf node for `symbol` with a count of 0 and no parent.
 * Returns NULL if memory allocation fails.
 */
static huffman_node*
new_leaf_node(unsigned char symbol)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));

    if(!p)
        return NULL;
    p->isLeaf = 1;
    p->symbol = symbol;
    p->count = 0;
    p->parent = 0;
    return p;
}
/*
 * Allocate an internal (non-leaf) node with the given count whose
 * 0-child and 1-child are `zero` and `one`; the node has no parent.
 * The children's parent pointers are not touched here.
 * Returns NULL if memory allocation fails.
 */
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));

    if(!p)
        return NULL;
    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = 0;
    return p;
}
/* Recursively release `subtree` and every node below it.
   A NULL subtree is accepted and ignored. */
static void
free_huffman_tree(huffman_node *subtree)
{
    if(subtree != NULL)
    {
        /* Only internal nodes own child links. */
        if(subtree->isLeaf == 0)
        {
            free_huffman_tree(subtree->zero);
            free_huffman_tree(subtree->one);
        }
        free(subtree);
    }
}
/* Release a code word: first its packed bit buffer, then the
   descriptor itself. `p` must not be NULL. */
static void
free_code(huffman_code* p)
{
    unsigned char *payload = p->bits;

    free(payload);
    free(p);
}
/* Release every code stored in the encoder table and then the
   table itself. Empty (NULL) slots are skipped. */
static void
free_encoder(SymbolEncoder *pSE)
{
    huffman_code **slot = *pSE;
    huffman_code **const end = slot + MAX_SYMBOLS;

    while(slot != end)
    {
        if(*slot != NULL)
            free_code(*slot);
        ++slot;
    }
    free(pSE);
}
/* Clear the whole frequency table to zero bytes, so that no symbol
   has a node yet; nodes are created by the counting functions. */
static void
init_frequencies(SymbolFrequencies *pSF)
{
    memset(pSF, 0, sizeof *pSF);
}
/*
 * Small write cache placed in front of a growable output buffer.
 * write_cache collects bytes in `cache`; flush_cache appends them to
 * the buffer that *pbufout points to and updates *pbufoutlen.
 */
typedef struct buf_cache_tag
{
/* Staging area, allocated by init_cache and released by free_cache. */
unsigned char *cache;
/* Capacity of `cache` in bytes. */
unsigned int cache_len;
/* Number of bytes currently held in `cache`. */
unsigned int cache_cur;
/* Caller-owned pointer to the output buffer; reallocated as it grows. */
unsigned char **pbufout;
/* Caller-owned length of the output buffer in bytes. */
unsigned int *pbufoutlen;
} buf_cache;
/*
 * Prepare `pc` as an empty cache of `cache_size` bytes in front of the
 * caller's output buffer. *pbufout is reset to NULL and *pbufoutlen to 0.
 * Returns 0 on success and 1 on a NULL output argument or when the
 * cache cannot be allocated.
 */
static int init_cache(buf_cache* pc,
                      unsigned int cache_size,
                      unsigned char **pbufout,
                      unsigned int *pbufoutlen)
{
    assert(pc && pbufout && pbufoutlen);

    if(pbufout == NULL || pbufoutlen == NULL)
        return 1;

    /* Start with an empty output buffer. */
    *pbufout = NULL;
    *pbufoutlen = 0;
    pc->pbufout = pbufout;
    pc->pbufoutlen = pbufoutlen;

    /* Start with an empty cache. */
    pc->cache_cur = 0;
    pc->cache_len = cache_size;
    pc->cache = (unsigned char*)malloc(cache_size);

    if(pc->cache == NULL)
        return 1;
    return 0;
}
/* Release the staging buffer of `pc` and mark it as gone.
   The output buffer itself is not touched. */
static void free_cache(buf_cache* pc)
{
    assert(pc);

    /* free(NULL) is a no-op, so no separate NULL test is needed. */
    free(pc->cache);
    pc->cache = NULL;
}
/*
 * Append the cached bytes to the output buffer and empty the cache.
 * Returns 0 on success (including when there is nothing to flush) and
 * 1 if the output buffer cannot be grown; in that case the cache and
 * the output buffer are left unchanged.
 */
static int flush_cache(buf_cache* pc)
{
    unsigned int grown_len;
    unsigned char *grown;

    assert(pc);

    if(pc->cache_cur == 0)
        return 0;

    grown_len = pc->cache_cur + *pc->pbufoutlen;
    grown = realloc(*pc->pbufout, grown_len);
    if(grown == NULL)
        return 1;

    /* Copy the pending bytes behind the data already in the buffer. */
    memcpy(grown + *pc->pbufoutlen, pc->cache, pc->cache_cur);
    *pc->pbufout = grown;
    *pc->pbufoutlen = grown_len;
    pc->cache_cur = 0;
    return 0;
}
/*
 * Append `to_write_len` bytes from `to_write` to the cached output.
 *
 * Data that fits in the free part of the cache is stored there.
 * Larger data bypasses the cache: pending bytes are flushed first so
 * the output order is preserved, then the data is appended directly
 * to the output buffer.
 *
 * Returns 0 on success and 1 if the flush or the reallocation fails.
 */
static int write_cache(buf_cache* pc,
                       const void *to_write,
                       unsigned int to_write_len)
{
    unsigned char* tmp;

    assert(pc && to_write);
    assert(pc->cache_len >= pc->cache_cur);

    /* If trying to write more than the cache will hold
     * flush the cache and allocate enough space immediately,
     * that is, don't use the cache. */
    if(to_write_len > pc->cache_len - pc->cache_cur)
    {
        unsigned int newlen;

        /* A failed flush must stop the write: continuing would put
         * the new data in front of the still-cached bytes. */
        if(flush_cache(pc))
            return 1;

        newlen = *pc->pbufoutlen + to_write_len;
        tmp = realloc(*pc->pbufout, newlen);
        if(!tmp)
            return 1;
        memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);
        *pc->pbufout = tmp;
        *pc->pbufoutlen = newlen;
    }
    else
    {
        /* Write the data to the cache. */
        memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);
        pc->cache_cur += to_write_len;
    }
    return 0;
}
/*
 * Read `in` to end of file, create a leaf node for every byte value
 * that occurs and count its occurrences.
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int seen = 0;
    int ch;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input file. */
    for(ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        const unsigned char sym = ch;
        huffman_node **slot = &(*pSF)[sym];

        /* First occurrence of this symbol: create its leaf. */
        if(*slot == NULL)
            *slot = new_leaf_node(sym);
        (*slot)->count += 1;
        seen += 1;
    }
    return seen;
}
/*
 * Memory counterpart of get_symbol_frequencies: create a leaf node for
 * every byte value occurring in bufin[0..bufinlen) and count its
 * occurrences. Returns the number of bytes examined.
 */
static unsigned int
get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,
                                   const unsigned char *bufin,
                                   unsigned int bufinlen)
{
    unsigned int seen = 0;
    unsigned int pos = 0;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input buffer. */
    while(pos < bufinlen)
    {
        const unsigned char sym = bufin[pos];
        huffman_node **slot = &(*pSF)[sym];

        if(*slot == NULL)
            *slot = new_leaf_node(sym);
        (*slot)->count += 1;
        seen += 1;
        pos += 1;
    }
    return seen;
}
/*
 * qsort comparison callback for arrays of huffman_node pointers.
 * Orders nodes by ascending count and places every NULL entry
 * after all non-NULL entries.
 */
static int
SFComp(const void *p1, const void *p2)
{
    const huffman_node *lhs = *(const huffman_node *const *)p1;
    const huffman_node *rhs = *(const huffman_node *const *)p2;

    /* NULL sorts last: yields 0 for two NULLs, 1 / -1 otherwise. */
    if(lhs == NULL || rhs == NULL)
        return (lhs == NULL) - (rhs == NULL);

    return (lhs->count > rhs->count) - (lhs->count < rhs->count);
}
#if 1
/*
 * Debug helper: print every slot of the frequency table to standard
 * output, one per line, as "symbol, count", or "NULL" for an empty slot.
 */
static void
print_freqs(SymbolFrequencies * pSF)
{
    size_t i;

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*pSF)[i])
            /* count is unsigned long, so it needs %lu rather than %ld. */
            printf("%d, %lu\n", (*pSF)[i]->symbol, (*pSF)[i]->count);
        else
            printf("NULL\n");
    }
}
#endif
/*
 * build_symbol_encoder visits every leaf below `subtree` and stores
 * the code computed for it by new_code in the table slot of the
 * leaf's symbol. Internal nodes are traversed recursively, 0-child
 * first. A NULL subtree is ignored.
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(!subtree)
        return;

    if(!subtree->isLeaf)
    {
        /* Internal node: descend into both children. */
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    /* Leaf: derive its code word and store it under its symbol. */
    (*pSF)[subtree->symbol] = new_code(subtree);
}
/*
 * calculate_huffman_codes turns pSF into an array
 * with a single entry that is the root of the
 * huffman tree. The return value is a SymbolEncoder,
 * which is an array of huffman codes indexed by symbol value.
 *
 * An empty table (no symbols at all) yields an encoder with no codes
 * and leaves (*pSF)[0] NULL. Returns NULL if the encoder table cannot
 * be allocated. The result must be released with free_encoder.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

#if 1
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort the symbol frequency array by ascending frequency.
     * qsort takes the array start, the element count, the element
     * size and the comparison callback (SFComp also moves NULL
     * entries to the end). */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 1
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Get the number of symbols. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * n - 1 merges are needed; the bound is written as i + 1 < n so
     * that n == 0 does not wrap around in unsigned arithmetic.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL;

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)calloc(1, sizeof(SymbolEncoder));
    if(!pSE)
        return NULL;
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}
/*
 * Write the huffman code table. The format is:
 * 4 byte code count in network byte order.
 * 4 byte number of bytes encoded
 *   (if you decode the data, you should get this number of bytes)
 * code1
 * ...
 * codeN, where N is the count read at the beginning of the file.
 * Each codeI has the following format:
 * 1 byte symbol, 1 byte code bit length, code bytes.
 * Each entry has numbytes_from_numbits code bytes.
 * The last byte of each code may have extra bits, if the number of
 * bits in the code is not a multiple of 8.
 *
 * Both header fields are written from unsigned int variables, the same
 * type read_code_table reads them into, so writer and reader always
 * agree on the field width.
 *
 * Returns 0 on success and 1 on a write error.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int net_count;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network (big-endian) byte order,
     * so the file does not depend on the host's byte order. */
    net_count = htonl(count);
    if(fwrite(&net_count, sizeof(net_count), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;
            /* Write the 1 byte code bit length. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;
            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }
    return 0;
}
/*
 * Write the code table through the cache `pc`, using the same layout
 * as write_code_table: entry count and encoded byte count in network
 * byte order, followed by one
 * (symbol, bit length, code bytes) record per symbol that has a code.
 *
 * Both header fields are written from unsigned int variables, the same
 * type read_code_table_from_memory reads them into.
 *
 * Returns 0 on success and 1 if a cache write fails.
 */
static int
write_code_table_to_memory(buf_cache *pc,
                           SymbolEncoder *se,
                           unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int net_count;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. */
    net_count = htonl(count);
    if(write_cache(pc, &net_count, sizeof(net_count)))
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(write_cache(pc, &symbol_count, sizeof(symbol_count)))
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            /* The value of i is < MAX_SYMBOLS (256), so it can
               be stored in an unsigned char. */
            unsigned char uc = (unsigned char)i;

            /* Write the 1 byte symbol. */
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;
            /* Write the 1 byte code bit length. */
            uc = (unsigned char)p->numbits;
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;
            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(write_cache(pc, p->bits, numbytes))
                return 1;
        }
    }
    return 0;
}
/*
* read_code_table builds a Huffman tree from the code
* in the in file. This function returns NULL on error.
* The returned value should be freed with free_huffman_tree.
*/
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{
//在解码端重建huffman树
huffman_node *root = new_nonleaf_node(0, NULL, NULL);
unsigned int count;
/* Read the number of entries.
(it is stored in network byte order). */
if(fread(&count, sizeof(count), 1, in) != 1)
{
free_huffman_tree(root);
return NULL;
}
count = ntohl(count);//将一个无符号长整形数从网络字节顺序转换为主机字节顺序
/* Read the number of data bytes this encoding represents. */
if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)
{
free_huffman_tree(root);
return NULL;
}
*pDataBytes = ntohl(*pDataBytes);
/* Read the entries. */
while(count-- > 0)
{
int c;
unsigned int curbit;
unsigned char symbol;
unsigned char numbits;
unsigned char numbytes;
unsigned char *bytes;
huffman_node *p = root;
if((c = fgetc(in)) == EOF)//读取符号并判断
{
free_huffman_tree(root);
return NULL;
}
symbol = (unsigned char)c;
if((c = fgetc(in)) == EOF)//读取字符长度并判断
{
free_huffman_tree(root);
return NULL;
}
numbits = (unsigned char)c;
numbytes = (unsigned char)numbytes_from_numbits(numbits);
bytes = (unsigned char*)malloc(numbytes);
if(fread(bytes, 1, numbytes, in) != numbytes)
{
free(bytes);
free_huffman_tree(root);
return NULL;
}
/*
* Add the entry to the Huffman tree. The value
* of the current bit is used switch between
* zero and one child nodes in the tree. New nodes
* are added as needed in the tree.
*/
for(curbit = 0; curbit < numbits; ++curbit)
{
if(get_bit(bytes, curbit))
{
if(p->one == NULL)
{
p->one = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->one->parent = p;
}
p = p->one;
}
else
{
if(p->zero == NULL)
{
p->zero = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->zero->parent = p;
}
p = p->zero;
}
}
free(bytes);
}
return root;
}
/*
 * Copy `readlen` bytes from buf[*pindex] into `bufout` and advance
 * *pindex by `readlen`.
 *
 * Returns 0 on success and 1 when fewer than `readlen` bytes remain in
 * the buffer; in that case nothing is copied and *pindex is unchanged.
 * A read that ends exactly at the end of the buffer succeeds. The
 * bound is computed by subtraction so it cannot wrap around.
 */
static int
memread(const unsigned char* buf,
        unsigned int buflen,
        unsigned int *pindex,
        void* bufout,
        unsigned int readlen)
{
    assert(buf && pindex && bufout);
    assert(buflen >= *pindex);

    if(buflen < *pindex)
        return 1;
    /* buflen - *pindex is the number of bytes still unread. */
    if(readlen > buflen - *pindex)
        return 1;

    memcpy(bufout, buf + *pindex, readlen);
    *pindex += readlen;
    return 0;
}
static huffman_node*
read_code_table_from_memory(const unsigned char* bufin,
unsigned int bufinlen,
unsigned int *pindex,
unsigned int *pDataBytes)
{
huffman_node *root = new_nonleaf_node(0, NULL, NULL);
unsigned int count;
/* Read the number of entries.
(it is stored in network byte order). */
if(memread(bufin, bufinlen, pindex, &count, sizeof(count)))
{
free_huffman_tree(root);
return NULL;
}
count = ntohl(count);
/* Read the number of data bytes this encoding represents. */
if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))
{
free_huffman_tree(root);
return NULL;
}
*pDataBytes = ntohl(*pDataBytes);
/* Read the entries. */
while(count-- > 0)
{
unsigned int curbit;
unsigned char symbol;
unsigned char numbits;
unsigned char numbytes;
unsigned char *bytes;
huffman_node *p = root;
if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))
{
free_huffman_tree(root);
return NULL;
}
if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))
{
free_huffman_tree(root);
return NULL;
}
numbytes = (unsigned char)numbytes_from_numbits(numbits);
bytes = (unsigned char*)malloc(numbytes);
if(memread(bufin, bufinlen, pindex, bytes, numbytes))
{
free(bytes);
free_huffman_tree(root);
return NULL;
}
/*
* Add the entry to the Huffman tree. The value
* of the current bit is used switch between
* zero and one child nodes in the tree. New nodes
* are added as needed in the tree.
*/
for(curbit = 0; curbit < numbits; ++curbit)
{
if(get_bit(bytes, curbit))
{
if(p->one == NULL)
{
p->one = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->one->parent = p;
}
p = p->one;
}
else
{
if(p->zero == NULL)
{
p->zero = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->zero->parent = p;
}
p = p->zero;
}
}
free(bytes);
}
return root;
}
/*
 * Encode every byte read from `in` with its code from `se` and write
 * the resulting bit stream to `out`, packed least significant bit
 * first. A final, partially filled byte is written as is.
 *
 * Returns 0 on success and 1 if a byte without a code is met or a
 * write fails.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;
    unsigned char curbit = 0;
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* No code means this byte was not present when the table
         * was built; the data cannot be represented. */
        if(!code)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= (unsigned char)(get_bit(code->bits, i) << curbit);

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;
    return 0;
}
/*
 * Encode bufin[0..bufinlen) with the codes from `se` and write the
 * resulting bit stream through the cache `pc`, packed least
 * significant bit first. A final, partially filled byte is written
 * as is. Returns 0 on success and 1 if a cache write fails.
 */
static int
do_memory_encode(buf_cache *pc,
                 const unsigned char* bufin,
                 unsigned int bufinlen,
                 SymbolEncoder *se)
{
    unsigned char pending = 0; /* byte currently being assembled */
    unsigned char used = 0;    /* number of bits already in `pending` */
    unsigned int pos;

    for(pos = 0; pos < bufinlen; ++pos)
    {
        huffman_code *code = (*se)[bufin[pos]];
        unsigned long bit;

        for(bit = 0; bit < code->numbits; ++bit)
        {
            /* Place the next code bit at the next free position. */
            if(get_bit(code->bits, bit))
                pending |= (unsigned char)(1u << used);
            ++used;

            /* A full byte is written out and assembly starts over. */
            if(used == 8)
            {
                if(write_cache(pc, &pending, sizeof(pending)))
                    return 1;
                pending = 0;
                used = 0;
            }
        }
    }

    /* Bits left over mean the last code did not end on a byte
     * boundary; write the partial byte. */
    if(used == 0)
        return 0;
    return write_cache(pc, &pending, sizeof(pending));
}
//step3:add by yzhang for huffman statistics
/*
 * Fill st->freq with the relative frequency (count / total_count) of
 * every symbol in SF; symbols without a node get 0.
 * Returns 1 when the counts add up to total_count and 0 otherwise.
 * Must run before the table is sorted, while it is still indexed by
 * symbol value.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{
    int sym;
    int tally = 0;

    for(sym = 0; sym < MAX_SYMBOLS; ++sym)
    {
        const huffman_node *node = (*SF)[sym];

        if(!node)
        {
            st->freq[sym] = 0;
            continue;
        }
        st->freq[sym] = (float)node->count/total_count;
        tally += node->count;
    }

    return tally == total_count ? 1 : 0;
}
/*
 * Copy the length and the packed bits of every code in `se` into `st`.
 * Symbols without a code get a length of 0. The copy never writes
 * past the per-symbol bits array of huffman_stat. Always returns 0.
 */
int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{
    unsigned long i,j;

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            st->numbits[i] = p->numbits;
            numbytes = numbytes_from_numbits(p->numbits);
            for(j = 0; j < numbytes && j < sizeof(st->bits[i]); ++j)
                st->bits[i][j] = p->bits[j];
        }
        else
            st->numbits[i] = 0;
    }
    return 0;
}
/*
 * Write the statistics table to out_Table: a header line followed by
 * one line per symbol value with its frequency, code length and the
 * code bits in transmission order. Does nothing when out_Table is NULL.
 */
void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{
    int i;
    unsigned long j;
    unsigned char c;

    if(!out_Table)
        return;

    fprintf(out_Table,"symbol\t freq\t codelength\t code\n");
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        fprintf(out_Table,"%d\t ",i);
        fprintf(out_Table,"%f\t ",st->freq[i]);
        /* numbits is unsigned long, so it needs %lu rather than %d. */
        fprintf(out_Table,"%lu\t ",st->numbits[i]);
        for(j = 0; j < st->numbits[i]; ++j)
        {
            c = get_bit(st->bits[i], j);
            fprintf(out_Table,"%d",c);
        }
        fprintf(out_Table,"\n");
    }
}
//end by yzhang
/*
 * huffman_encode_file huffman encodes in to out.
 *
 * The input is read twice: once to count symbol frequencies and once
 * to encode, so `in` must be seekable. When out_Table is not NULL the
 * per-symbol statistics (frequency, code length, code) are written to
 * it; when it is NULL the statistics step is skipped.
 *
 * Returns 0 on success and non-zero on failure.
 */
int
huffman_encode_file(FILE *in, FILE *out, FILE *out_Table)
{
    SymbolFrequencies sf;
    SymbolEncoder *se;
    huffman_node *root = NULL;
    int rc;
    unsigned int symbol_count;
    /* step2: added by yzhang for huffman statistics */
    huffman_stat hs;

    /* Get the frequency of each symbol in the input file. */
    symbol_count = get_symbol_frequencies(&sf, in);

    /* step3 (yzhang): record the frequencies now, while sf is still
     * indexed by symbol value; building the codes re-sorts it. */
    if(out_Table)
        huffST_getSymFrequencies(&sf, &hs, symbol_count);

    /* Build an optimal table from the symbolCount. */
    se = calculate_huffman_codes(&sf);
    root = sf[0];
    if(!se)
    {
        free_huffman_tree(root);
        return 1;
    }

    /* step3 (yzhang): output the statistics to the table file. */
    if(out_Table)
    {
        huffST_getcodeword(se, &hs);
        output_huffman_statistics(&hs, out_Table);
    }

    /* Scan the file again and, using the table
       previously built, encode it into the output file.
       fseek reports failure (e.g. on a pipe), unlike rewind. */
    if(fseek(in, 0L, SEEK_SET) != 0)
    {
        free_huffman_tree(root);
        free_encoder(se);
        return 1;
    }
    rc = write_code_table(out, se, symbol_count);
    if(rc == 0)
        rc = do_file_encode(in, out, se);

    /* Free the Huffman tree. */
    free_huffman_tree(root);
    free_encoder(se);
    return rc;
}
/*
 * huffman_decode_file decodes the Huffman stream in `in` to `out`.
 *
 * The code table at the start of the input is turned into a tree; the
 * data bits are then read least significant bit first, following the
 * tree from the root and emitting a symbol at every leaf, until the
 * number of bytes announced in the header has been produced.
 *
 * Returns 0 on success and 1 if the table cannot be read, the bit
 * stream leads to a missing tree branch, a write fails, or the input
 * ends before all announced bytes were decoded.
 */
int
huffman_decode_file(FILE *in, FILE *out)
{
    huffman_node *root, *p;
    int c;
    unsigned int data_count;

    /* Read the Huffman code table. */
    root = read_code_table(in, &data_count);
    if(!root)
        return 1;

    /* Decode the file. */
    p = root;
    while(data_count > 0 && (c = fgetc(in)) != EOF)
    {
        unsigned char byte = (unsigned char)c;
        unsigned char mask = 1;

        /* mask becomes 0 after the eighth shift, ending the byte. */
        while(data_count > 0 && mask)
        {
            p = byte & mask ? p->one : p->zero;
            mask <<= 1;

            /* The bits describe a path the table does not contain. */
            if(!p)
            {
                free_huffman_tree(root);
                return 1;
            }

            if(p->isLeaf)
            {
                if(fputc(p->symbol, out) == EOF)
                {
                    free_huffman_tree(root);
                    return 1;
                }
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);
    /* Bytes still owed at end of input mean the data was truncated. */
    return data_count > 0 ? 1 : 0;
}
#define CACHE_SIZE 1024
/*
 * Huffman-encode bufin[0..bufinlen) into a newly allocated buffer.
 *
 * On success returns 0, stores the buffer in *pbufout and its length
 * in *pbufoutlen; the caller releases it with free(). On failure
 * returns non-zero, releases any partial output and leaves *pbufout
 * NULL and *pbufoutlen 0.
 */
int huffman_encode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **pbufout,
                          unsigned int *pbufoutlen)
{
    SymbolFrequencies sf;
    SymbolEncoder *se;
    huffman_node *root = NULL;
    int rc;
    unsigned int symbol_count;
    buf_cache cache;

    /* Ensure the arguments are valid. */
    if(!pbufout || !pbufoutlen)
        return 1;

    if(init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen))
        return 1;

    /* Get the frequency of each symbol in the input memory. */
    symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);

    /* Build an optimal table from the symbolCount. */
    se = calculate_huffman_codes(&sf);
    root = sf[0];
    if(!se)
    {
        free_huffman_tree(root);
        free_cache(&cache);
        return 1;
    }

    /* Scan the memory again and, using the table
       previously built, encode it into the output memory. */
    rc = write_code_table_to_memory(&cache, se, symbol_count);
    if(rc == 0)
        rc = do_memory_encode(&cache, bufin, bufinlen, se);

    /* Flush the cache; a failed flush means output bytes are missing. */
    if(flush_cache(&cache) && rc == 0)
        rc = 1;

    /* Free the Huffman tree. */
    free_huffman_tree(root);
    free_encoder(se);
    free_cache(&cache);

    /* Do not hand an incomplete buffer back to the caller. */
    if(rc != 0)
    {
        free(*pbufout);
        *pbufout = NULL;
        *pbufoutlen = 0;
    }
    return rc;
}
/*
 * Decode the Huffman stream in bufin[0..bufinlen) into a newly
 * allocated buffer.
 *
 * On success returns 0, stores the buffer in *pbufout and the number
 * of decoded bytes in *pbufoutlen; the caller releases the buffer with
 * free(). Returns 1, leaving the outputs untouched, if the arguments
 * are invalid, the table cannot be read, allocation fails, the bit
 * stream leads to a missing tree branch, or the input ends before all
 * announced bytes were decoded.
 */
int huffman_decode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **pbufout,
                          unsigned int *pbufoutlen)
{
    huffman_node *root, *p;
    unsigned int data_count;
    unsigned int i = 0;
    unsigned char *buf;
    unsigned int bufcur = 0;

    /* Ensure the arguments are valid. */
    if(!pbufout || !pbufoutlen)
        return 1;

    /* Read the Huffman code table. */
    root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);
    if(!root)
        return 1;

    /* malloc(0) may legitimately return NULL, so only a failed
     * non-empty request is an error. */
    buf = (unsigned char*)malloc(data_count);
    if(!buf && data_count > 0)
    {
        free_huffman_tree(root);
        return 1;
    }

    /* Decode the memory. */
    p = root;
    for(; i < bufinlen && data_count > 0; ++i)
    {
        unsigned char byte = bufin[i];
        unsigned char mask = 1;

        /* mask becomes 0 after the eighth shift, ending the byte. */
        while(data_count > 0 && mask)
        {
            p = byte & mask ? p->one : p->zero;
            mask <<= 1;

            /* The bits describe a path the table does not contain. */
            if(!p)
            {
                free(buf);
                free_huffman_tree(root);
                return 1;
            }

            if(p->isLeaf)
            {
                buf[bufcur++] = p->symbol;
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);

    /* Bytes still owed at end of input mean the data was truncated. */
    if(data_count > 0)
    {
        free(buf);
        return 1;
    }

    *pbufout = buf;
    *pbufoutlen = bufcur;
    return 0;
}
实验选取了10种文件类型进行Huffman编码,分别为bmp、doc、exe、pdf、png、ppt、rar、wav、xls、yuv。对编码后的文件进行分析,得到以下结果图表:
可以看到,进行Huffman编码后,大多数文件都变小了,压缩比在1到4之间。但也有rar这样经过编码后不小反大的文件。
再观察每个文件的字符概率分布情况:
对比联合图表可以发现,压缩比是由概率分布决定的。相比于实验选用的bmp、doc等字符概率比较集中的文件,字符概率分布平均分散的文件(如rar、png、pdf),压缩比更小,信源熵更大。