httpclient使用详解(爬虫)

一、简介

HttpClient是Apache Jakarta Commons下的子项目,用来提供高效的、最新的、功能丰富的支持HTTP协议的客户端编程工具包,并且它支持HTTP协议最新的版本和建议。HttpClient已经应用在很多的项目中,比如Apache Jakarta上很著名的另外两个开源项目Cactus和HTMLUnit都使用了HttpClient。

下载地址: http://hc.apache.org/downloads.cgi

二、特性

1. 基于标准、纯净的java语言。实现了Http1.0和Http1.1

2. 以可扩展的面向对象的结构实现了Http全部的方法(GET, POST, PUT, DELETE, HEAD, OPTIONS, and TRACE)。

3. 支持HTTPS协议。

4. 通过Http代理建立透明的连接。

5. 利用CONNECT方法通过Http代理建立隧道的https连接。

6. Basic, Digest, NTLMv1, NTLMv2, NTLM2 Session, SPNEGO/Kerberos认证方案。

7. 插件式的自定义认证方案。

8. 可插拔的安全套接字工厂,使其更容易使用第三方解决方案。

9. 连接管理器支持多线程应用。支持设置最大连接数,同时支持设置每个主机的最大连接数,发现并关闭过期的连接。

10. 自动处理Set-Cookie中的Cookie。

11. 插件式的自定义Cookie策略。

12. Request的输出流可以将内容以流的方式直接写入连接服务器的socket,从而避免对请求体进行缓冲。

13. Response的输入流可以以流的方式直接从连接服务器的socket高效地读取响应内容。

14. 在http1.0和http1.1中利用KeepAlive保持持久连接。

15. 直接获取服务器发送的response code和 headers。

16. 设置连接超时的能力。

17. 实验性的支持http1.1 response caching。

18. 源代码基于Apache License 可免费获取。

三、使用方法

使用HttpClient发送请求、接收响应很简单,一般需要如下几步即可。

1. 创建HttpClient对象。

2. 创建请求方法的实例,并指定请求URL。如果需要发送GET请求,创建HttpGet对象;如果需要发送POST请求,创建HttpPost对象。

3. 如果需要发送请求参数,可调用HttpGet、HttpPost共同的setParams(HttpParams params)方法来添加请求参数;对于HttpPost对象而言,也可调用setEntity(HttpEntity entity)方法来设置请求参数。

4. 调用HttpClient对象的execute(HttpUriRequest request)发送请求,该方法返回一个HttpResponse。

5. 调用HttpResponse的getAllHeaders()、getHeaders(String name)等方法可获取服务器的响应头;调用HttpResponse的getEntity()方法可获取HttpEntity对象,该对象包装了服务器的响应内容。程序可通过该对象获取服务器的响应内容。

6. 释放连接。无论执行方法是否成功,都必须释放连接

四、样例

HttpClientRequestHandler类

package com.xiaojiang.httpclient;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.xiaojiang.exception.DataTaskException;


/**
 * Static helpers that issue GET/POST requests through a client obtained from
 * {@link HttpConnectionManager} and parse a successful response body with Jsoup.
 *
 * <p>Every failure is reported as a {@link DataTaskException}: non-200 responses carry
 * the HTTP status as the error code, all other failures carry {@link #ERROR_CODE}.
 */
public class HttpClientRequestHandler {

	/** Error code used for failures that are not tied to a specific HTTP status. */
	private static final int ERROR_CODE = 1;

	/** Charset assumed for compressed bodies when Content-Type does not name one. */
	private static final String DEFAULT_CHARSET = "utf-8";


	/**
	 * Fetches {@code url} with a GET request and parses the body as HTML.
	 *
	 * @param url       address to request
	 * @param proxyIp   proxy host, passed through to {@link HttpConnectionManager#getHttpClient}
	 * @param proxyPort proxy port, passed through to {@link HttpConnectionManager#getHttpClient}
	 * @return the parsed document, or {@code null} if no body text could be produced
	 * @throws DataTaskException with the HTTP status as code when the status is not 200,
	 *                           or with {@link #ERROR_CODE} for any other failure
	 */
	public static Document doGet(String url, String proxyIp, Integer proxyPort) throws DataTaskException{

		HttpClient client = HttpConnectionManager.getHttpClient(proxyIp, proxyPort);

		HttpGet httpGet = new HttpGet(url);
		httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
		httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
		httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
		httpGet.setHeader("Accept-Encoding", "gzip, deflate");
		httpGet.setHeader("User-Agent", HttpUserAgent.get());

		try {
			HttpResponse response = client.execute(httpGet);
			int statusCode = response.getStatusLine().getStatusCode();
			if (statusCode != 200) {
				throw new DataTaskException(statusCode, "请求URL【"+url+"】,"+statusCode+"错误", null);
			}

			String html = formatResponse(response);
			return html != null ? Jsoup.parse(html) : null;

		} catch (DataTaskException e) {
			// Already carries the right code/message; re-wrapping would replace the
			// HTTP status with ERROR_CODE.
			throw e;
		} catch (Exception e) {
			throw new DataTaskException(ERROR_CODE, e.getMessage(), e);
		} finally {
			// Give the connection up whether or not the request succeeded.
			httpGet.abort();
		}
	}


	/**
	 * Submits {@code paramaters} as a UTF-8 URL-encoded form via POST and parses the
	 * body as HTML.
	 *
	 * @param url        address to request
	 * @param paramaters form fields to send; {@code null} is treated as an empty form
	 * @param proxyIp    proxy host, passed through to {@link HttpConnectionManager#getHttpClient}
	 * @param proxyPort  proxy port, passed through to {@link HttpConnectionManager#getHttpClient}
	 * @return the parsed document, or {@code null} if no body text could be produced
	 * @throws DataTaskException with the HTTP status as code when the status is not 200,
	 *                           or with {@link #ERROR_CODE} for any other failure
	 */
	public static Document doPost(String url, Map<String, String> paramaters, String proxyIp, Integer proxyPort) throws DataTaskException{

		HttpClient client = HttpConnectionManager.getHttpClient(proxyIp, proxyPort);

		HttpPost request = new HttpPost(url);

		request.setHeader("Accept", "application/json, text/javascript, */*; q=0.01");
		request.setHeader("Accept-Encoding", "gzip, deflate");
		request.setHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
		request.setHeader("Cache-Control", "no-cache");
		request.setHeader("Connection", "keep-alive");
		request.setHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");

		// Build the name/value list for the form body.
		List<NameValuePair> formFields = new ArrayList<NameValuePair>();
		if (paramaters != null) {
			for (Map.Entry<String, String> field : paramaters.entrySet()) {
				formFields.add(new BasicNameValuePair(field.getKey(), field.getValue()));
			}
		}

		try {
			// Encode with the same charset the Content-Type header above announces.
			request.setEntity(new UrlEncodedFormEntity(formFields, "UTF-8"));

			HttpResponse response = client.execute(request);
			int statusCode = response.getStatusLine().getStatusCode();
			if (statusCode != 200) {
				// Same contract as doGet: every non-200 status is an error, not a silent null.
				throw new DataTaskException(statusCode, "请求URL【"+url+"】,"+statusCode+"错误", null);
			}

			String html = formatResponse(response);
			return html != null ? Jsoup.parse(html) : null;

		} catch (DataTaskException e) {
			throw e;
		} catch (Exception e) {
			throw new DataTaskException(ERROR_CODE, e.getMessage(), e);
		} finally {
			request.abort();
		}
	}

	/**
	 * Reads the response body as text, decoding gzip/deflate content encodings.
	 *
	 * @param response response whose entity should be read
	 * @return the body text
	 * @throws DataTaskException if the body cannot be read or decoded, or if the
	 *                           Content-Encoding is one this class cannot decode
	 */
	private static String formatResponse(HttpResponse response) throws DataTaskException {
		try {
			Header encodingHeader = response.getFirstHeader("Content-Encoding");
			String encoding = "";
			if (encodingHeader != null && encodingHeader.getValue() != null) {
				encoding = encodingHeader.getValue().trim();
			}

			// Uncompressed body: let EntityUtils decode it, defaulting to UTF-8.
			if (encoding.length() == 0 || "identity".equalsIgnoreCase(encoding)) {
				return EntityUtils.toString(response.getEntity(), "UTF-8");
			}

			String charset = DEFAULT_CHARSET;
			Header contentType = response.getFirstHeader("Content-Type");
			if (contentType != null) {
				charset = parseCharset(contentType.getValue(), DEFAULT_CHARSET);
			}
			// Site-specific override kept from the original code: any response whose
			// string form mentions "soufun" is decoded as gb2312.
			// NOTE(review): what response.toString() includes depends on the HttpResponse
			// implementation — confirm this still matches the intended site.
			if (response.toString().contains("soufun")) {
				charset = "gb2312";
			}

			InputStream body = response.getEntity().getContent();
			if ("gzip".equalsIgnoreCase(encoding) || "x-gzip".equalsIgnoreCase(encoding)) {
				return readDecoded(new GZIPInputStream(body), charset);
			}
			if ("deflate".equalsIgnoreCase(encoding)) {
				// InflaterInputStream expects the zlib-wrapped form; a raw deflate stream
				// will surface as a DataTaskException below.
				return readDecoded(new InflaterInputStream(body), charset);
			}
			throw new DataTaskException(ERROR_CODE, "不支持的Content-Encoding【" + encoding + "】", null);

		} catch (DataTaskException e) {
			throw e;
		} catch (Exception e) {
			throw new DataTaskException(ERROR_CODE, "格式化HttpResponse出错", e);
		}
	}

	/**
	 * Extracts the {@code charset} parameter from a Content-Type header value.
	 *
	 * @param contentType    header value such as {@code text/html; charset=gb2312}; may be null
	 * @param defaultCharset value returned when no non-empty charset parameter is present
	 * @return the declared charset name (surrounding double quotes removed) or the default
	 */
	private static String parseCharset(String contentType, String defaultCharset) {
		if (contentType == null) {
			return defaultCharset;
		}
		String[] parts = contentType.split(";");
		// parts[0] is the media type itself; parameters start at index 1.
		for (int i = 1; i < parts.length; i++) {
			String part = parts[i].trim();
			if (part.regionMatches(true, 0, "charset=", 0, 8)) {
				String value = part.substring(8).trim();
				if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
					value = value.substring(1, value.length() - 1).trim();
				}
				if (value.length() > 0) {
					return value;
				}
			}
		}
		return defaultCharset;
	}

	/**
	 * Drains an already-decompressing stream and decodes the bytes as text.
	 * The stream is always closed, which also frees the inflater behind it.
	 *
	 * @param in      decompressing stream to read to the end
	 * @param charset name of the charset used to decode the bytes
	 * @return the decoded text
	 * @throws IOException if reading fails or the charset name is not supported
	 */
	private static String readDecoded(InputStream in, String charset) throws IOException {
		try {
			return new String(IOUtils.toByteArray(in), charset);
		} finally {
			in.close();
		}
	}


}

HttpConnectionManager类

package com.xiaojiang.httpclient;

import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnPerRouteBean;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;

/**
 * Holds one shared, pooled connection manager plus default connection parameters,
 * and creates {@link HttpClient} instances on top of them.
 */
public class HttpConnectionManager {

	/** Default parameters; used as a template and never handed to a client directly. */
	private static final HttpParams httpParams;
	/** Connection manager shared by every client this class creates. */
	private static final ClientConnectionManager connectionManager;

	/** Maximum number of connections in the pool. */
	public static final int MAX_TOTAL_CONNECTIONS = 800;

	/** Timeout value passed to ConnManagerParams.setTimeout (wait for a pooled connection). */
	public static final int WAIT_TIMEOUT = 60000;

	/** Maximum number of connections per route. */
	public static final int MAX_ROUTE_CONNECTIONS = 400;

	/** Connect timeout passed to HttpConnectionParams.setConnectionTimeout. */
	public static final int CONNECT_TIMEOUT = 60000;

	/** Read timeout passed to HttpConnectionParams.setSoTimeout. */
	public static final int READ_TIMEOUT = 60000;

	static {
		httpParams = new BasicHttpParams();
		// Pool limits and the wait time for obtaining a pooled connection.
		ConnManagerParams.setMaxTotalConnections(httpParams, MAX_TOTAL_CONNECTIONS);
		ConnManagerParams.setTimeout(httpParams, WAIT_TIMEOUT);
		ConnPerRouteBean connPerRoute = new ConnPerRouteBean(MAX_ROUTE_CONNECTIONS);
		ConnManagerParams.setMaxConnectionsPerRoute(httpParams, connPerRoute);

		// Connect and read timeouts.
		HttpConnectionParams.setConnectionTimeout(httpParams, CONNECT_TIMEOUT);
		HttpConnectionParams.setSoTimeout(httpParams, READ_TIMEOUT);

		SchemeRegistry registry = new SchemeRegistry();
		registry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
		registry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));

		connectionManager = new ThreadSafeClientConnManager(httpParams, registry);
	}


	/**
	 * Creates a client backed by the shared connection manager.
	 *
	 * @param proxyIp   proxy host; no proxy is configured when this or {@code proxyPort} is null
	 * @param proxyPort proxy port; no proxy is configured when this or {@code proxyIp} is null
	 * @return a new client with its own parameter set
	 */
	public static HttpClient getHttpClient(String proxyIp, Integer proxyPort){
		// Each client works on its own copy of the defaults. The client's getParams()
		// is expected to return the instance given to the constructor (HttpClient 4.x
		// AbstractHttpClient — confirm for the version in use), so configuring the proxy
		// on the shared static instance would leak one caller's proxy into every other
		// client, including those requested without a proxy.
		DefaultHttpClient client = new DefaultHttpClient(connectionManager, httpParams.copy());

		if (proxyIp != null && proxyPort != null) {
			HttpHost proxy = new HttpHost(proxyIp, proxyPort.intValue());
			client.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);
		}
		return client;
	}

}

HttpUserAgent类

package com.xiaojiang.httpclient;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Fixed pool of browser User-Agent strings from which {@link #get()} picks one at random.
 */
public class HttpUserAgent {

	/** Candidate User-Agent values; filled once in the static initializer. */
	private static final List<String> agents = new ArrayList<String>();

	/** Shared generator so a new Random is not allocated on every call. */
	private static final Random RANDOM = new Random();

	static {
		// IE
		agents.add("Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)");
		agents.add("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)");
		agents.add("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)");
		agents.add("Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)");

		// Firefox
		agents.add("Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1 ");
		agents.add("Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3 ");
		agents.add("Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12");
		// The original literal ended with an unbalanced ")" after the version number.
		agents.add("Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0");
		agents.add("Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0");

		// Opera
		agents.add("Opera/9.27 (Windows NT 5.2; U; zh-cn)");
		agents.add("Opera/8.0 (Macintosh; PPC Mac OS X; U; en)");
		agents.add("Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0");

		// Safari
		agents.add("Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13");
		agents.add("Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3");

		// Chrome
		agents.add("Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13");
		agents.add("Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30");

		// Navigator
		agents.add("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6");

		// 360 Extreme Explorer
		agents.add("Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ;  QIHU 360EE)");
	}


	/**
	 * Picks one of the configured User-Agent strings uniformly at random.
	 *
	 * @return a User-Agent header value; never null
	 */
	public static String get(){
		// nextInt's bound is exclusive, so size() (not size()-1) is needed for the
		// last entry to be selectable.
		return agents.get(RANDOM.nextInt(agents.size()));
	}
}














你可能感兴趣的:(网络爬虫)