压缩算法-LZW

压缩算法-LZW

基本原理:提取原始文本文件数据中的不同字符,基于这些字符创建一个编译表,然后用编译表中的字符的索引来替代原始文本文件数据中的相应字符,减少原始数据大小。

1) 从源文件中读取不同字节;

2) 根据字节创建字符串编译表;

3) 使用字符串编译表中的字符索引来替换源文件中相应的字符串。


LZW使用扩充的ASCII表,用12位表示:0~255表示一个数据字节,256和257保留,258~4094表示字符串编译表中的编码,4095(12位所能表示的最大值)在本程序中用作结束标记。

算法描述:

1) 读入一个字节作为前缀

2) 读入第二个字节,作为后缀;

3) 把前缀和后缀结合作为hashkey,在字符串表中查找是否已经存在相同的字符串。

4) If 不存在:输出前缀,把“前缀+后缀”加入字符串表并分配新的编码,然后把后缀作为新的前缀,回到 2)。

5) If 字符串存在:把该字符串的编码作为新的前缀,继续读入下一字符作为后缀,重复 3)。


输入例子 :ababcbababaaaaaaa
LZW编码:          a,   b,  ab,  c,   ba, bab,a,   aa, aaa,a

程序输出编码:97,98,258,99,259,262,97,264,265,97,10,4095,0,

注意:为便于理解LZW算法,这里没有按12bit输出数据而是直接输出了译码表中的值。


#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/*
 * Every macro that expands to an expression is fully parenthesized so that
 * it can be used inside a larger expression (e.g. MAX_VALUE * 2) without
 * operator precedence changing its value.
 */

/* Width of one output code in bits. */
#define BITS 12
/* Left shift applied to the suffix byte when findhash() builds the hash. */
#define HASHSHIFT  (BITS - 8)
/* Largest value representable in BITS bits; compress() writes it as the end marker. */
#define MAX_VALUE ((1 << BITS) - 1)
/* Largest code compress() assigns to a string-table entry. */
#define MAX_CODE (MAX_VALUE - 1)
/* Number of slots in the string table; a prime larger than 1 << BITS. */
#define HASH_TABLE_LENGTH 4099
/* Not referenced by any code visible in this file. */
#define process 1000

/*
 * String table used by both the encoder and the decoder, stored as three
 * parallel arrays that the user allocates with HASH_TABLE_LENGTH elements.
 *
 *   _code   - code assigned to the entry; compress() stores -1 in unused slots
 *   _prenum - prefix code of the entry
 *   _baknum - suffix byte of the entry
 */
typedef struct _LZW_tag
{
    int           *_code;
    unsigned int  *_prenum;
    unsigned char *_baknum;
} LZW_t;

/* The single table instance used by the program. */
LZW_t lzwt;

/* Points at the active table; compress() and decompress() set it to &lzwt. */
LZW_t *lzw;

/*
 * Find the string-table slot for the pair (prenum, baknum).
 *
 * prenum - code of the prefix string
 * baknum - suffix byte
 *
 * Returns the index of the first slot on the probe sequence that is either
 * unused (_code == -1) or already holds exactly this pair.  The caller
 * inspects lzw->_code[index] to tell the two cases apart.
 *
 * Reads the global table through `lzw`.  The loop only ends on one of the
 * two conditions above, so the table must always contain a free slot.
 */
int findhash(unsigned int prenum, unsigned int baknum)
{
    /* Initial slot: the suffix byte shifted left, XORed with the prefix code. */
    int slot = (baknum<<HASHSHIFT)^prenum;
    /* Probe step, fixed per key; slot 0 uses a step of 1 instead of the full table length. */
    int step = (slot == 0) ? 1 : (HASH_TABLE_LENGTH - slot);

    for (;;)
    {
        if (lzw->_code[slot] == -1
            || (lzw->_prenum[slot] == prenum && lzw->_baknum[slot] == baknum))
        {
            return slot;
        }

        /* Step backwards and wrap around at the start of the table. */
        slot -= step;
        if (slot < 0)
        {
            slot += HASH_TABLE_LENGTH;
        }
    }
}

/*
 * Write one code to the output stream as decimal text followed by a comma,
 * e.g. 258 becomes "258,".
 *
 * pOut - stream opened for writing
 * code - value to write
 *
 * The codes are deliberately written as readable text rather than packed
 * into BITS-bit fields, so the output can be inspected by eye.  Write errors
 * are not reported here; the caller can query them with ferror(pOut).
 */
void dataout(FILE* pOut, unsigned int code)
{
    /* %u matches the unsigned argument; no intermediate buffer is needed. */
    fprintf(pOut, "%u,", code);
}


/*
 * LZW-compress everything readable from pIn and write the resulting codes
 * to pOut through dataout().
 *
 * pIn  - input stream, read byte by byte until EOF
 * pOut - output stream receiving the code sequence
 *
 * The stream ends with the end marker MAX_VALUE followed by a 0.  For an
 * empty input there is no first byte, so the end marker is written in the
 * first-code position as well (instead of the EOF sentinel -1, which is not
 * a valid code); the regular end marker and the 0 still follow.
 *
 * The string table is allocated here, published through the globals
 * `lzwt`/`lzw`, and released before returning.  If an allocation fails a
 * message is printed to stderr and nothing is written to pOut.
 * Progress is traced on stdout.
 */
void compress(FILE* pIn, FILE* pOut)
{
    int i;
    int nIndex;
    int nPreNum;
    int nBakNum;
    int curr_code = 258;    /* first code handed out to a table entry */

    lzw = &lzwt;
    lzw->_code = malloc(HASH_TABLE_LENGTH * sizeof *lzw->_code);
    lzw->_prenum = malloc(HASH_TABLE_LENGTH * sizeof *lzw->_prenum);
    lzw->_baknum = malloc(HASH_TABLE_LENGTH * sizeof *lzw->_baknum);
    if (lzw->_code == NULL || lzw->_prenum == NULL || lzw->_baknum == NULL)
    {
        fprintf(stderr, "compress: out of memory\n");
        goto cleanup;
    }

    /* -1 marks a slot as unused; findhash() relies on this. */
    for (i = 0; i < HASH_TABLE_LENGTH; i++)
    {
        lzw->_code[i] = -1;
    }

    nPreNum = getc(pIn);
    if (nPreNum == EOF)
    {
        /* Empty input: fill the first-code position with the end marker. */
        dataout(pOut, (MAX_VALUE));
    }
    else
    {
        printf("read=%c\n",nPreNum);
        while ((nBakNum = getc(pIn)) != EOF)
        {
            printf("read=%c\n",nBakNum);
            nIndex = findhash(nPreNum, nBakNum);
            printf("index=%d\n", nIndex);
            if (lzw->_code[nIndex] != -1)
            {
                /* prefix+suffix is already in the table: its code becomes the new prefix. */
                nPreNum = lzw->_code[nIndex];
            }
            else
            {
                /* New string: remember it while codes are still available ... */
                if (curr_code <= (MAX_CODE))
                {
                    lzw->_code[nIndex] = curr_code++;
                    lzw->_prenum[nIndex] = nPreNum;
                    lzw->_baknum[nIndex] = (unsigned char)nBakNum;
                }
                /* ... emit the prefix and restart matching from the suffix byte. */
                dataout(pOut, nPreNum);
                nPreNum = nBakNum;
            }
        }

        /* Flush the prefix that was still pending when the input ended. */
        dataout(pOut, nPreNum);
    }

    dataout(pOut, (MAX_VALUE));
    /* Kept from the original stream layout; presumably padding for a bit-packed variant. */
    dataout(pOut, 0);

cleanup:
    free(lzw->_code);
    free(lzw->_prenum);
    free(lzw->_baknum);
    lzw->_code = NULL;
    lzw->_prenum = NULL;
    lzw->_baknum = NULL;
}

/* Scratch buffer into which decompress() has decode() expand one code at a time. */
unsigned char decode_stack[HASH_TABLE_LENGTH];


/*
 * Expand `code` into the bytes it stands for, writing them to `buffer` in
 * reverse order (last byte of the string first).
 *
 * buffer - destination; must have room for at least HASH_TABLE_LENGTH - 1 bytes
 * code   - code to expand; values up to 257 are stored directly as one byte
 *
 * Returns a pointer to the last byte written, which is the first byte of
 * the decoded string, or NULL if the code does not fit the table or the
 * prefix chain is longer than the buffer allows (both indicate corrupt
 * input).
 *
 * Reads the global table through `lzw` and traces each byte on stdout.
 */
unsigned char* decode(unsigned char* buffer, unsigned int code)
{
    int len = 0;

    while (code > 257)
    {
        /*
         * Check BEFORE writing: the code must index the table, and one byte
         * must stay free for the final literal stored after the loop.
         */
        if (code >= HASH_TABLE_LENGTH || len >= HASH_TABLE_LENGTH - 2)
        {
            return NULL;
        }

        *buffer = lzw->_baknum[code];
        /* Trace the byte that was just stored, then advance. */
        printf("decode=%c\n", *buffer);
        buffer++;

        code = lzw->_prenum[code];
        len++;
    }

    /* The chain ends in a value that is stored as a single byte. */
    *buffer = code;
    return buffer;
}


/*
 * Read the next comma-terminated decimal token from pIn and return its value.
 *
 * pIn - stream containing codes written as "<number>," text
 *
 * Characters are consumed up to and including the next ',' (or up to EOF).
 * A token with no digits, including the case where the stream is already
 * at EOF, yields 0; callers that need to tell this apart from a real 0 can
 * check feof(pIn) afterwards.  Tokens longer than the internal buffer are
 * consumed completely but only their first 63 characters are converted.
 */
unsigned int incode(FILE* pIn)
{
    char buff[64] = {0};
    size_t used = 0;
    int ch;     /* int, not char: EOF must stay distinguishable from every byte value */

    while ((ch = getc(pIn)) != EOF && ch != ',')
    {
        /* Keep one byte for the terminating NUL; drop the excess of overlong tokens. */
        if (used < sizeof buff - 1)
        {
            buff[used++] = (char)ch;
        }
    }

    /* strtol has defined behaviour on overflow, unlike atol. */
    return (unsigned int)strtol(buff, NULL, 10);
}


void decompress(FILE* pIn, FILE* pOut)
{
    unsigned int curr_code;
    unsigned int baknum;
    unsigned int prenum;

    int i = 0;
    int nLen1, nLen2;
    int ch;
    unsigned char *ps;

    nLen1 = HASH_TABLE_LENGTH*sizeof(unsigned int);
    nLen2 = HASH_TABLE_LENGTH*sizeof(char);

    lzwt._code = (int*)malloc(nLen1);
    lzwt._prenum = (unsigned int*)malloc(nLen1);
    lzwt._baknum = (unsigned char*)malloc(nLen2);

    lzw = &lzwt;
    curr_code = 258;

    prenum = incode(pIn);
    printf("incode=%d\n", prenum);
    ch = prenum;

    putc(prenum, pOut);

    while((baknum = incode(pIn)) != MAX_VALUE)
    {
        printf("incode=%d\n",baknum);
        if (baknum >= curr_code)
        {
            *decode_stack = ch;
            ps = decode(decode_stack+1, prenum);
        }
        else
        {
            ps = decode(decode_stack, baknum);
        }

        ch = *ps;
        while(ps >= decode_stack)
        {
            printf("write to=%c\n", *ps);
            putc(*ps--, pOut);
        }
        if (curr_code <= MAX_VALUE)
        {
            lzw->_prenum[curr_code] = prenum;
            lzw->_baknum[curr_code] = ch;
            curr_code++; 
        }

        prenum = baknum;
    }

    free(lzw->_code);
    free(lzw->_prenum);
    free(lzw->_baknum);
}

/*
 * Demo driver: compress "org.dat" into "t1.lzw", then decompress "t1.lzw"
 * into "t1o.dat".
 *
 * The compressed file is opened for reading only after it has been written
 * and closed.  Returns EXIT_SUCCESS when every file could be opened and the
 * compressed file was closed without error, EXIT_FAILURE otherwise.
 */
int main(void)
{
    FILE *pIn  = NULL;
    FILE *pOut = NULL;
    FILE *pDIn = NULL;
    FILE *pDOut = NULL;
    int rc = EXIT_FAILURE;

    pIn = fopen("org.dat", "rb");
    if (pIn == NULL)
    {
        perror("org.dat");
        goto done;
    }
    pOut = fopen("t1.lzw", "wb");
    if (pOut == NULL)
    {
        perror("t1.lzw");
        goto done;
    }

    compress(pIn, pOut);

    fclose(pIn);
    pIn = NULL;
    /* fclose flushes buffered output, so a failure here means t1.lzw is incomplete. */
    if (fclose(pOut) != 0)
    {
        pOut = NULL;
        perror("t1.lzw");
        goto done;
    }
    pOut = NULL;

    printf("\n\ndecompress file====================================================\n\n");

    pDIn = fopen("t1.lzw", "rb");
    if (pDIn == NULL)
    {
        perror("t1.lzw");
        goto done;
    }
    pDOut = fopen ("t1o.dat", "wb");
    if (pDOut == NULL)
    {
        perror("t1o.dat");
        goto done;
    }

    decompress(pDIn, pDOut);
    rc = EXIT_SUCCESS;

done:
    /* fclose(NULL) is undefined, so only close what was actually opened. */
    if (pIn != NULL)
    {
        fclose(pIn);
    }
    if (pOut != NULL)
    {
        fclose(pOut);
    }
    if (pDIn != NULL)
    {
        fclose(pDIn);
    }
    if (pDOut != NULL)
    {
        fclose(pDOut);
    }
    return rc;
}





你可能感兴趣的:(压缩算法-LZW)