HttpClient GZip压缩问题

最近公司人手不够,临时做一些网络爬虫方面的工作,在爬取一些网站的时候遇到访问页面gzip压缩的问题,花时间研究了一下,终于给解决了。在这里记录一下,方便以后回溯。


示例代码

package com.yulore.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;

/**
 * Demo that fetches a page through an HTTP proxy with Apache HttpClient 4.x
 * and decompresses the response body when the server sends it gzip-encoded.
 */
public class HttpClientTest02 {

	/** Proxy host; the value looks like a redacted placeholder - replace it before running. */
	private static final String PROXY_HOST = "xxxxx";

	/** Proxy port. */
	private static final int PROXY_PORT = 8080;

	/** Socket read timeout (SO_TIMEOUT) in milliseconds. */
	private static final int SOCKET_TIMEOUT_MS = 1000 * 30;

	/** Captures the href of the "local merchants" navigation link; compiled once and reused. */
	private static final Pattern MERCHANT_LINK = Pattern.compile(
			"href=\"([\\S]*?)\"\\s*?class=\"nav_a\\s*?shanghu\"\\s*?target=\"_blank\">本地商户",
			Pattern.CASE_INSENSITIVE);

	/** Captures the value of the charset parameter of a Content-Type header. */
	private static final Pattern CHARSET_PARAM = Pattern.compile(
			"charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

	public static void main(String[] args) {
		test();
	}

	/**
	 * Downloads the sample page and prints the target of the "local merchants"
	 * navigation link to stderr, if the link is found.
	 */
	public static void test() {
		String url = "http://www.koubei.com/?spm=0.0.0.117.pR54PP&city=110100[0,1]";
		String content = httpGet(url);
		if (content == null) {
			// httpGet returns null on redirects, non-200 statuses and failures;
			// Pattern.matcher(null) would throw a NullPointerException.
			System.err.println("no content fetched from " + url);
			return;
		}
		Matcher matcher = MERCHANT_LINK.matcher(content);
		if (matcher.find()) {
			String target = matcher.group(1);
			System.err.println("target=" + target);
		}
	}

	/**
	 * Sends an HTTP GET request through the configured proxy.
	 *
	 * @param url the URL to fetch
	 * @return the response body with line terminators removed, or {@code null}
	 *         when the status is not 200, the response has no body, or the
	 *         request fails (the exception is printed, not rethrown)
	 */
	public static String httpGet(String url) {
		String content = null;
		DefaultHttpClient httpclient = null;
		try {
			httpclient = new DefaultHttpClient();
			// Route every request through the proxy.
			HttpHost proxy = new HttpHost(PROXY_HOST, PROXY_PORT);
			httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

			HttpGet httpget = new HttpGet(url);
			httpget.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT_MS);
			httpget.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79 Safari/537.1");
			httpget.setHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
			// Advertise only the encoding this method can actually decode. Offering
			// deflate/sdch as well would let the server answer with a body that is
			// then read as raw compressed bytes.
			httpget.setHeader("Accept-Encoding", "gzip");

			HttpResponse resp = httpclient.execute(httpget);
			int statusCode = resp.getStatusLine().getStatusCode();
			if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
				// The redirect is only reported, not followed here.
				System.out.println("当前访问页面重定向了,,,");
				Header[] locationHeader = resp.getHeaders("Location");
				if (locationHeader != null && locationHeader.length > 0) {
					String redirectUrl = locationHeader[0].getValue();
					System.out.println("redirectUrl:" + redirectUrl);
				}
			} else if (statusCode == HttpStatus.SC_OK) {
				HttpEntity entity = resp.getEntity();
				if (entity != null) {
					// Read the declared charset before wrapping the entity.
					Charset charset = resolveCharset(entity.getContentType());
					Header encoding = entity.getContentEncoding();
					InputStream in;
					if (encoding != null && "gzip".equalsIgnoreCase(encoding.getValue())) {
						// The body is gzip-compressed: read it through the decompressing wrapper.
						System.err.println("gzip");
						in = new GzipDecompressingEntity(entity).getContent();
					} else {
						in = entity.getContent();
					}
					content = getHTMLContent(in, charset);
					System.out.println("content:" + content);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// The client is still null if its construction failed; guard so that
			// the original error is not masked by a NullPointerException.
			if (httpclient != null) {
				httpclient.getConnectionManager().shutdown();
			}
		}
		return content;
	}

	/**
	 * Picks the charset used to decode the response body.
	 *
	 * @param contentType the Content-Type header of the response, may be {@code null}
	 * @return the charset named by the header's charset parameter, or the platform
	 *         default when the header is missing, names no charset, or names one
	 *         this JVM does not support
	 */
	private static Charset resolveCharset(Header contentType) {
		String value = contentType == null ? null : contentType.getValue();
		if (value != null) {
			Matcher matcher = CHARSET_PARAM.matcher(value);
			if (matcher.find()) {
				try {
					return Charset.forName(matcher.group(1));
				} catch (IllegalArgumentException ignored) {
					// Illegal or unsupported charset name: fall back to the default below.
				}
			}
		}
		return Charset.defaultCharset();
	}

	/**
	 * Reads the whole stream as text and closes it.
	 *
	 * <p>Lines are appended without their terminators, so the result is the
	 * page content joined into a single line. If reading fails part-way, the
	 * exception is printed and whatever was read so far is returned.
	 *
	 * @param in      the stream to read; closed before this method returns
	 * @param charset the charset used to decode the bytes
	 * @return the decoded text with line terminators removed
	 */
	private static String getHTMLContent(InputStream in, Charset charset) {
		StringBuilder sb = new StringBuilder();
		BufferedReader br = new BufferedReader(new InputStreamReader(in, charset));
		try {
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			try {
				br.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return sb.toString();
	}
}




说明:

其实就是使用 HttpClient 提供的 GzipDecompressingEntity 类来解压 GZip 压缩的响应内容。






你可能感兴趣的:(Java)