参考网上例子和资料,自己使用inflateInit2解压gzip文件。
为了以后查阅方便,首先把网上的两篇文章转载下,最后是自己的例子。
HTTP报文中gzip解码方法与遇到的问题
前几天写了个脚本去开心网偷菜,由于数据是gzip压缩的,所以试着解压。
(或者在模拟发包HTTP头去掉"Accept-Encoding:gzip, deflate\r\n",直接可以获得未压缩的原始数据)
最开始用的方法很土:把数据保存为以.gz结尾的文件,然后再调用命令行gzip -d -f xxx.gz,最后再读取解压后的文件。
忒麻烦了。开始用zlib来在内存中解压,头文件为zlib.h(#include <zlib.h>)。
最开始使用uncompress来解压,总是失败,试着用compress把原始数据压缩后比较一下,发现完全不一致:gzip压缩的格式有一个头,以1F8B08开始,其中1F8B代表gzip格式,08代表deflate算法。
查阅资料使用inflate来解压,但是依然解压失败。发现初始化需要使用inflateInit2(&d_stream,47)才可以,其他的解压和example.c的例子都一样。
但是问题来了,我想在内存里构造一个gzip的包,网上说使用deflateInit2(&strm,DEFAULT_COMPRESSION,Z_DEFLATED, DEFAULT_WINDOWSIZE,DEFAULT_MEMLEVEL,Z_DEFAULT_STRATEGY);就可以了。
首先这几个宏(DEFAULT_COMPRESSION等)并没有定义,查头文件后,换成deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,Z_DEFLATED, -15,8,Z_DEFAULT_STRATEGY);
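但是,压缩后的内容没有gzip的压缩头!如果我自己在前面加一个gzip的头,那么解压的时候校验码错误。(注:windowBits取负值(如-15)时,deflate输出的是不带任何头部和校验尾的raw deflate数据;要让zlib自动生成gzip头和CRC32校验尾,应把windowBits设为15+16=31。)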
疯呀,谁能帮帮我。
如果使用zlib提供的gz开头的函数比如gzopen,gzputs,那么生成的压缩文件则一切正常。
------------------------------------------------------------
解压gzip的函数:
char* ungzip(char*source,int len)
{
int err;
z_stream d_stream;
Byte compr[2000]={0}, uncompr[2000]={0};
memcpy(compr,(Byte*)source,len);
uLong comprLen, uncomprLen;
comprLen = sizeof(compr) / sizeof(compr[0]);
uncomprLen = comprLen;
strcpy((char*)uncompr, "garbage");
d_stream.zalloc =(alloc_func)0;
d_stream.zfree = (free_func)0;
d_stream.opaque = (voidpf)0;
d_stream.next_in =compr;
d_stream.avail_in = 0;
d_stream.next_out = uncompr;
err = inflateInit2(&d_stream,47);
if(err!=Z_OK)
{
printf("inflateInit2 error:%d",err);
return NULL;
}
while (d_stream.total_out < uncomprLen && d_stream.total_in
err = inflate(&d_stream,Z_NO_FLUSH);
if(err == Z_STREAM_END) break;
if(err!=Z_OK)
{
printf("inflate error:%d",err);
return NULL;
}
}
err = inflateEnd(&d_stream);
if(err!=Z_OK)
{
printf("inflateEnd error:%d",err);
return NULL;
}
char* b = new char[d_stream.total_out+1];
bzero(b,d_stream.total_out+1);
memcpy(b,(char*)uncompr,d_stream.total_out);
return b;
}
抓HTTP包的时候发现很多网站的响应报文是gzip压缩后的数据,存放在一个或多个chunk里面(参见HTTP响应报文中的chunked)。这些gzip数据是不可阅读的,需要进行解压。一开始在网上找到了一份可以正常运行的代码,贴出来:
http://hi.baidu.com/xzq2000/blog/item/c5429f2fd6a646301f308991.html/cmtid/332e72f08f0b53a2a40f5237
char*ungzip(char* source,int len)
{
int err;
z_stream d_stream;
Byte compr[segment_size]={0}, uncompr[segment_size*4]={0};
memcpy(compr,(Byte*)source,len);
uLong comprLen, uncomprLen;
comprLen = sizeof(compr) / sizeof(compr[0]);
uncomprLen = 4*comprLen;
strcpy((char*)uncompr, "garbage");
d_stream.zalloc= (alloc_func)0;
d_stream.zfree = (free_func)0;
d_stream.opaque = (voidpf)0;
d_stream.next_in= compr;
d_stream.avail_in = 0;
d_stream.next_out = uncompr;
err =inflateInit2(&d_stream,47);
if(err!=Z_OK)
{
printf("inflateInit2error:%d",err);
returnNULL;
}
while (d_stream.total_out < uncomprLen && d_stream.total_in
err = inflate(&d_stream,Z_NO_FLUSH);
if(err== Z_STREAM_END) break;
if(err!=Z_OK)
{
printf("inflateerror:%d",err);
returnNULL;
}
}
err = inflateEnd(&d_stream);
if(err!=Z_OK)
{
printf("inflateEnderror:%d",err);
returnNULL;
}
char* b = new char[d_stream.total_out+1];
memset(b,0,d_stream.total_out+1);
memcpy(b,(char*)uncompr,d_stream.total_out);
return b;
}
后来看了zlib usage example(参见zlib使用范例),模仿写了一段代码,可以正常运行,而且感觉比上面的代码要快,因为上面的代码把z_stream的avail_in和avail_out都设为1了,只能一字节一字节地解压,非常慢。
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>
/* Staging-buffer size in bytes; the decompression output buffers in this file are sized as 4x this value. */
#define segment_size 1460//largest tcp data segment
int ungzip(char* source,int len,char*des)
{
int ret,have;
int offset=0;
z_stream d_stream;
Byte compr[segment_size]={0}, uncompr[segment_size*4]={0};
memcpy(compr,(Byte*)source,len);
uLong comprLen, uncomprLen;
comprLen =len;//
//sizeof(compr)永远都是segment_size,显然不对,strlen(compr)也是不对的,因为strlen只算到\0之前,
//但是gzip或者zlib数据里\0很多。
uncomprLen = segment_size*4;
strcpy((char*)uncompr, "garbage");
d_stream.zalloc= Z_NULL;
d_stream.zfree = Z_NULL;
d_stream.opaque = Z_NULL;
d_stream.next_in= Z_NULL;//inflateInit和inflateInit2都必须初始化next_in和avail_in
d_stream.avail_in = 0;//deflateInit和deflateInit2则不用
ret =inflateInit2(&d_stream,47);
if(ret!=Z_OK)
{
printf("inflateInit2error:%d",ret);
returnret;
}
d_stream.next_in=compr;
d_stream.avail_in=comprLen;
do
{
d_stream.next_out=uncompr;
d_stream.avail_out=uncomprLen;
ret = inflate(&d_stream,Z_NO_FLUSH);
assert(ret != Z_STREAM_ERROR);
switch (ret)
{
case Z_NEED_DICT:
ret = Z_DATA_ERROR;
case Z_DATA_ERROR:
case Z_MEM_ERROR:
(void)inflateEnd(&d_stream);
return ret;
}
have=uncomprLen-d_stream.avail_out;
memcpy(des+offset,uncompr,have);//这里一开始我写成了memcpy(des+offset,d_stream.next_out,have);
//后来发现这是不对的,因为next_out指向的下次的输出,现在指向的是无有意义数据的内存。见下图
offset+=have;
}while(d_stream.avail_out==0);
inflateEnd(&d_stream);
memcpy(des+offset,"\0",1);
return ret;
}
/*
 * Decompress an in-memory gzip image in one shot (the author's own routine).
 *
 * pSrc        : gzip data; must start with the magic bytes 0x1f 0x8b.
 * srcSize     : number of bytes at pSrc.
 * pOutDest    : on success receives a buffer obtained from memAlloc() that
 *               holds the decompressed bytes.
 * pOutBufSize : on success receives the ISIZE trailer value minus 2 (see
 *               the NOTE(review) at the assignment).
 * return      : OK on success. ERR for invalid arguments, a non-gzip
 *               header, an ISIZE of 0 or above 1000000, an allocation
 *               failure, or a stream that does not end cleanly. The zlib
 *               error code when inflateInit2 or inflate fails.
 *
 * The output size is taken from the 4-byte little-endian ISIZE field at the
 * end of the data (RFC 1952).
 */
Int32 uncompressGzip(UInt8* pSrc, UInt32 srcSize,char**pOutDest, UInt32* pOutBufSize)
{
    Int32 uncompressResult;
    z_stream d_stream;
    UInt8* pBuf;
    uLongf len;
    char* pDesBuf;

    /* The smallest gzip member is a 10-byte header plus an 8-byte trailer;
       anything shorter cannot carry the ISIZE field read below. */
    if ((pSrc == NULL) || (pOutDest == NULL) || (pOutBufSize == NULL) || (srcSize < 18))
    {
        DBG_ERR(("uncompressGzip,error gzip!"));
        return ERR;
    }

    //check gz file,rfc1952 P6
    if ((*pSrc != 0x1f) || (*(pSrc + 1) != 0x8b))
    {
        DBG_ERR(("uncompressGzip non Gzip"));
        return ERR;
    }

    /* Assemble ISIZE from the last four bytes, most significant byte last. */
    pBuf = pSrc + (srcSize - 1);
    len = *pBuf;
    for (int i = 0; i < 3; i++)
    {
        pBuf--;
        len <<= 8;
        len += *pBuf;
    }

    /* Sanity limit kept from the original ("for test"). */
    if ((len == 0) || (len > 1000000))
    {
        DBG_ERR(("uncompressGzip,error gzip!"));
        return ERR;
    }

    //gzip decompression start!!!
    d_stream.zalloc = Z_NULL;
    d_stream.zfree = Z_NULL;
    d_stream.opaque = Z_NULL;
    d_stream.next_in = Z_NULL;
    d_stream.avail_in = 0;

    /* 47 = 15 + 32: 32 KiB window, automatic zlib/gzip header detection.
       The stream is initialised before allocating so that an init failure
       cannot leak the output buffer. */
    uncompressResult = inflateInit2(&d_stream, 47);
    if (uncompressResult != Z_OK)
    {
        printf("inflateInit2 error:%d", uncompressResult);
        return uncompressResult;
    }

    pDesBuf = (char*)memAlloc(len);
    if (pDesBuf == NULL)
    {
        (void)inflateEnd(&d_stream);
        DBG_ERR(("uncompressGzip,error gzip!"));
        return ERR;
    }

    d_stream.next_in = pSrc;
    d_stream.avail_in = srcSize;
    d_stream.next_out = (Bytef *)pDesBuf;
    d_stream.avail_out = (uInt)len;
    uncompressResult = inflate(&d_stream, Z_NO_FLUSH);

    /* NOTE(review): on the failure paths below pDesBuf is not released,
       because the deallocator matching memAlloc() is not visible here. */
    switch (uncompressResult)
    {
    case Z_NEED_DICT:
        uncompressResult = Z_DATA_ERROR;
        /* fall through */
    case Z_DATA_ERROR:
    case Z_MEM_ERROR:
        (void)inflateEnd(&d_stream);
        return uncompressResult;
    default:
        break;
    }

    printf("outlen= %lu, total_in= %lu, total_out= %lu, avail_out= %u@@@@@@@@@@@\n",
           (unsigned long)len, (unsigned long)d_stream.total_in,
           (unsigned long)d_stream.total_out, (unsigned)d_stream.avail_out);
    (void)inflateEnd(&d_stream);

    /* All input was supplied and the buffer holds ISIZE bytes, so a valid
       stream must have reached its end; anything else is truncated or
       inconsistent data. */
    if (uncompressResult != Z_STREAM_END)
    {
        DBG_ERR(("uncompressGzip,error gzip!"));
        return ERR;
    }

    /* NOTE(review): the original reports len-2 rather than len, presumably
       to drop two trailing bytes of the payload; confirm against callers.
       The value wraps around when len is 1. */
    *pOutBufSize = len-2;
    *pOutDest = pDesBuf;
    return OK;
}