最近研究了一下 LZW 算法,也看了很多这方面的资料。LZW 适合压缩文本文件;对于稍大一些的流文件,则可能出现压缩后的文件反而大于源文件的情况。LZW 有很多著名的实现程序,下面的程序以动态增加编码位数为出发点,利用哈希表来实现 LZW 压缩。哈希算法有两个,其中一个被我注释掉了,两个都可以用;具体哪个更好,我自己也没有测试过。
/**********************************************************************
***********************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* Number of slots in the dictionary hash table (Hash_table). */
#define hashsize 4096
#define clear 256 /* code value that signals a dictionary reset */
/* Code value that marks the end of the compressed stream. */
#define teminate 257
/* Value stored in Hash_table[i].value while slot i is empty. */
#define not_used -1
/* Largest value representable in n bits, i.e. 2^n - 1. */
#define MAXVAL(n) (( 1 <<( n )) -1)
/* Maximum code width in bits. */
#define max_bits 12
/* Input stream (file to compress); opened in main. */
FILE *in;
/* Output stream (compressed file); opened in main. */
FILE *out;
/* Current code width in bits; output() writes this many bits per code. */
int bitsize = 9;
/* Largest code value for the current bitsize; set via MAXVAL(bitsize). */
int maxcode;
/* Dictionary entry: one slot of the hash table. */
typedef struct prex_cha{
int value; /* code assigned to this string, or not_used if the slot is empty */
unsigned int prefix; /* code of the string's prefix */
unsigned int character; /* byte appended to the prefix */
}Hash_Table;
/* The dictionary itself: hashsize slots, indexed by find_match(). */
Hash_Table Hash_table[hashsize];
void initial_hashtable ()/*字典初始化*/
{
int i;
for (i=0; i<hashsize; i++)
{
Hash_table[i].value = not_used;
}
}
/*
 * Append `code` to the output bit stream, most significant bit first,
 * using the current code width `bitsize`.
 *
 * Bits are collected in a 32-bit window held in `buffer`; every time at
 * least 8 bits are pending, the top byte of the window is written to `out`.
 * Up to 7 bits can remain pending after a call; they are only written once
 * later calls push them out.
 *
 * The window is explicitly masked to 32 bits so the function behaves the
 * same whether unsigned long is 32 or 64 bits wide, and the value handed to
 * fputc() is always in the range 0..255.
 */
void output (unsigned int code)
{
    static int count = 0;               /* number of pending bits in buffer */
    static unsigned long buffer = 0UL;  /* pending bits, left-aligned in a 32-bit window */

    /* Place the new code directly below the bits that are already pending. */
    buffer |= ((unsigned long) code << (32 - bitsize - count)) & 0xFFFFFFFFUL;
    count += bitsize;

    /* Flush whole bytes from the top of the window. */
    while (count >= 8)
    {
        fputc((int) ((buffer >> 24) & 0xFFUL), out);
        buffer = (buffer << 8) & 0xFFFFFFFFUL;
        count -= 8;
    }
}
/*
int find_match(int prefix,unsigned int character)
{
int index;
int offset; /*offset为偏移位*/
/* index = (character << 4) ^ prefix; /*用异或来决定index*/
/* if (index == 0)
{
offset = 1;
}
else
{
offset = 5021 - index;
}
while (1)
{
if (Hash_table[index].value == not_used)
{
return (index);
}
if (Hash_table[index].prefix == prefix && Hash_table[index].character == character)
{
return (index);
}
index -= offset;
if (index < 0)
{
index += 5021;
}
}
}
*/
int find_match(int prefix,unsigned int character)
{
int index;
index = prefix % 4096;
while (1)
{
if(Hash_table[index].value == not_used)
{
return (index);
}
if(Hash_table[index].prefix == prefix && Hash_table[index].character == character)
{
return (index);
}
index = index + 1;
if(index >= 4096)
{
index = index - 4096;
}
}
}
void lzwcompression ()
{
unsigned int prefix;
unsigned int character;
unsigned int index;
unsigned int next_code = 258; /* 当前字典的标号*/
initial_hashtable ();
prefix = fgetc (in);
while ((character = fgetc(in)) !=(unsigned) EOF)
{
index = find_match (prefix,character);
if (Hash_table[index].value != not_used)/*能够找到*/
{
prefix = Hash_table[index].value;
}
else
{
if(next_code <= maxcode)/*不能找到,是新的字符串,则添加到表中*/
{
Hash_table[index].value = next_code++;
Hash_table[index].character = character;
Hash_table[index].prefix = prefix;
}
output(prefix);
prefix = character; /*把后缀给前缀,准备下次输入*/
/*特殊标志,当位数必须增加时*/
if(next_code > maxcode)
{
if(bitsize < 12)
{
maxcode = MAXVAL(++bitsize);
}
else /*达到4096时候,必须清除哈希表,重新开始*/
{
output (256); /*输出清除标志到文件中*/
initial_hashtable();
next_code = 258;
bitsize = 9;
maxcode = MAXVAL(bitsize);/*maxcode 变为511*/
}
}
}/*if-else结束*/
}/*while结束*/
output(prefix); /*输出最后一个*/
if (next_code == maxcode)
{ /* 如果在最后的哈息表刚好刚好是maxcode,则也必须把位数增加一位 */
++bitsize;
}
output(257); /* 输出结束标志*/
output(0); /* 解码时要用到 */
output(0);
output(0);
}
/*
 * Entry point: compress the file named by argv[1] into "<name>.zzz".
 *
 * Returns 0 on success. Exits with status 1 when no file name is given,
 * when the input or output file cannot be opened, when the output file name
 * would not fit in the local buffer, or when closing the output file fails.
 */
int main(int argc,char* argv[])
{
    char filename[255];
    int len;

    if (argc < 2)
    {
        printf("usage:the command format is:lzw_compression <filename>!");
        return(1);
    }
    if ((in = fopen(argv[1], "rb")) == NULL)
    {
        printf("Cannot open input file - %s\n", argv[1]);
        exit(1);
    }

    /* Build the output name by appending ".zzz"; refuse names that would be truncated. */
    len = snprintf(filename, sizeof filename, "%s.zzz", argv[1]);
    if (len < 0 || (size_t) len >= sizeof filename)
    {
        printf("Input file name too long - %s\n", argv[1]);
        fclose(in);
        exit(1);
    }
    if ((out = fopen(filename, "wb")) == NULL)
    {
        printf("Cannot open output file - %s\n", filename);
        fclose(in);
        exit(1);
    }

    maxcode = MAXVAL(bitsize);
    lzwcompression();

    fclose(in);
    /* fclose flushes buffered data, so a failure here means the output is incomplete. */
    if (fclose(out) != 0)
    {
        printf("Error writing output file - %s\n", filename);
        return(1);
    }
    return 0;
}