package spider;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import javax.net.ssl.SSLHandshakeException;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NoHttpResponseException;
import org.apache.http.ParseException;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
/**
* http连接、抓取管理类
*
@author
lidongyang
* @createtime Oct 18, 2012 1:55:18 PM
*
* @note 基本测试版
*/
public
class HttpConnectionManager {
/**
* 连接池里的最大连接数
*/
public
static
final
int MAX_TOTAL_CONNECTIONS = 100;
/**
* 每个路由的默认最大连接数
*/
public
static
final
int MAX_ROUTE_CONNECTIONS = 50;
/**
* 连接超时时间
*/
public
static
final
int CONNECT_TIMEOUT = 50000;
/**
* 套接字超时时间
*/
public
static
final
int SOCKET_TIMEOUT = 50000;
/**
* 连接池中 连接请求执行被阻塞的超时时间
*/
public
static
final
long CONN_MANAGER_TIMEOUT = 60000;
/**
* http连接相关参数
*/
private
static HttpParams parentParams;
/**
* http线程池管理器
*/
private
static PoolingClientConnectionManager cm;
/**
* http客户端
*/
private
static DefaultHttpClient httpClient;
/**
* 默认目标主机
*/
private
static
final HttpHost DEFAULT_TARGETHOST =
new HttpHost("http://www.qq.com", 80);
/**
* 初始化http连接池,设置参数、http头等等信息
*/
static {
SchemeRegistry schemeRegistry =
new SchemeRegistry();
schemeRegistry.register(
new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
schemeRegistry.register(
new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
cm =
new PoolingClientConnectionManager(schemeRegistry);
cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);
cm.setMaxPerRoute(
new HttpRoute(DEFAULT_TARGETHOST), 20);
//
设置对目标主机的最大连接数
parentParams =
new BasicHttpParams();
parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
parentParams.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);
//
设置默认targetHost
parentParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT);
parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT);
parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT);
parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS,
true);
parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS,
true);
//
设置头信息,模拟浏览器
Collection
collection =
new ArrayList
();
collection.add(
new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"));
collection.add(
new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
collection.add(
new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5"));
collection.add(
new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7"));
collection.add(
new BasicHeader("Accept-Encoding", "gzip, deflate"));
parentParams.setParameter(ClientPNames.DEFAULT_HEADERS, collection);
//
请求重试处理
HttpRequestRetryHandler httpRequestRetryHandler =
new HttpRequestRetryHandler() {
public
boolean retryRequest(IOException exception,
int executionCount, HttpContext context) {
if (executionCount >= 5) {
//
如果超过最大重试次数,那么就不要继续了
return
false;
}
if (exception
instanceof NoHttpResponseException) {
//
如果服务器丢掉了连接,那么就重试
return
true;
}
if (exception
instanceof SSLHandshakeException) {
//
不要重试SSL握手异常
return
false;
}
HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
boolean idempotent = !(request
instanceof HttpEntityEnclosingRequest);
if (idempotent) {
//
如果请求被认为是幂等的,那么就重试
return
true;
}
return
false;
}
};
httpClient =
new DefaultHttpClient(cm, parentParams);
httpClient.setHttpRequestRetryHandler(httpRequestRetryHandler);
}
/**
* 抓取页面代码
*
@param
url 目标页面的url
*
@return
页面代码
*/
public String getHtml(String url) {
HttpHost proxyHost =
new HttpHost("211.142.236.137", 8080);
//
代理
String html = getHtml(url, proxyHost);
int count = 0;
while(StringUtils.isEmpty(html)){
proxyHost =
new HttpHost("211.142.236.137", 80);
//
更换代理
html = getHtml(url, proxyHost);
count++;
if(count > 3){
System.out.println("抓取失败");
break;
}
}
System.out.println(html.length());
return html;
}
/**
* 抓取url所指的页面代码
*
@param
url 目标页面的url
*
@return
页面代码
*/
public String getHtml(String url, HttpHost proxyHost) {
String html = "";
HttpGet httpGet =
new HttpGet(url);
httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
//
设置代理
HttpResponse httpResponse;
HttpEntity httpEntity;
try {
httpResponse = httpClient.execute(httpGet);
StatusLine statusLine = httpResponse.getStatusLine();
int statusCode = statusLine.getStatusCode();
System.out.println(statusCode);
if(200 != statusCode) {
return html;
}
httpEntity = httpResponse.getEntity();
if(httpEntity !=
null){
html = readHtmlContentFromEntity(httpEntity);
}
}
catch (ClientProtocolException e) {
//
TODO Auto-generated catch block
e.printStackTrace();
}
catch (IOException e) {
//
TODO Auto-generated catch block
e.printStackTrace();
}
finally {
if(httpGet !=
null){
httpGet.releaseConnection();
}
}
return html;
}
/**
* 从response返回的实体中读取页面代码
*
@param
httpEntity Http实体
*
@return
页面代码
*
@throws
ParseException
*
@throws
IOException
*/
private String readHtmlContentFromEntity(HttpEntity httpEntity)
throws ParseException, IOException {
String html = "";
Header header = httpEntity.getContentEncoding();
if(httpEntity.getContentLength() < 2147483647L){
//
EntityUtils无法处理ContentLength超过2147483647L的Entity
if(header !=
null && "gzip".equals(header.getValue())){
html = EntityUtils.toString(
new GzipDecompressingEntity(httpEntity));
}
else {
html = EntityUtils.toString(httpEntity);
}
}
else {
InputStream in = httpEntity.getContent();
if(header !=
null && "gzip".equals(header.getValue())){
html = unZip(in, ContentType.getOrDefault(httpEntity).getCharset().toString());
}
else {
html = readInStreamToString(in, ContentType.getOrDefault(httpEntity).getCharset().toString());
}
if(in !=
null){
in.close();
}
}
return html;
}
/**
* 测试代理是否可用(其实和getHtml(String url, HttpHost proxyHost)的代码差不多,为了从功能上区别,暂时这样)
*
@param
httpHost 封装了代理的ip地址和端口
*
@param
url 用来测试的页面
*
@return
true 可用 false 不可用
*/
public
boolean isProxyUsable(HttpHost proxyHost, String url) {
HttpGet httpGet =
new HttpGet(url);
httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
try {
HttpResponse httpResponse = httpClient.execute(httpGet);
StatusLine statusLine = httpResponse.getStatusLine();
int statusCode = statusLine.getStatusCode();
System.out.println(statusCode);
if(200 != statusCode) {
return
false;
}
HttpEntity httpEntity = httpResponse.getEntity();
if(httpEntity !=
null) {
String html = readHtmlContentFromEntity(httpEntity);
System.out.println(html.length());
if(StringUtils.isEmpty(html)){
return
false;
}
}
else {
return
false;
}
}
catch (ClientProtocolException e) {
//
TODO Auto-generated catch block
e.printStackTrace();
return
false;
}
catch (IOException e) {
//
TODO Auto-generated catch block
e.printStackTrace();
return
false;
}
return
true;
}
/**
* 解压服务器返回的gzip流
*
@param
in 抓取返回的InputStream流
*
@param
charSet 页面内容编码
*
@return
页面内容的String格式
*
@throws
IOException
*/
private String unZip(InputStream in, String charSet)
throws IOException {
ByteArrayOutputStream baos =
new ByteArrayOutputStream();
GZIPInputStream gis =
null;
try {
gis =
new GZIPInputStream(in);
byte[] _byte =
new
byte[1024];
int len = 0;
while ((len = gis.read(_byte)) != -1) {
baos.write(_byte, 0, len);
}
String unzipString =
new String(baos.toByteArray(), charSet);
return unzipString;
}
finally {
if (gis !=
null) {
gis.close();
}
if(baos !=
null){
baos.close();
}
}
}
/**
* 读取InputStream流
*
@param
in InputStream流
*
@return
从流中读取的String
*
@throws
IOException
*/
private String readInStreamToString(InputStream in, String charSet)
throws IOException {
StringBuilder str =
new StringBuilder();
String line;
BufferedReader bufferedReader =
new BufferedReader(
new InputStreamReader(in, charSet));
while((line = bufferedReader.readLine()) !=
null){
str.append(line);
str.append("\n");
}
if(bufferedReader !=
null) {
bufferedReader.close();
}
return str.toString();
}
/**
* for test
*
@author
lidongyang
* @createtime Oct 18, 2012 2:35:09 PM
*/
public
class Test
implements Runnable {
String url;
int threadNum;
public Test() {
}
public Test(String url,
int threadNum) {
this.url = url;
this.threadNum = threadNum;
}
@Override
public
void run() {
getHtml(url);
}
}
/**
* for test
*
@param
args
*
@throws
InterruptedException
*/
public
static
void main(String[] args)
throws InterruptedException{
HttpConnectionManager httpConnectionManager =
new HttpConnectionManager();
Date start =
new Date();
httpConnectionManager.getHtml("http://www.qq.com");
Date end =
new Date();
System.out.println((end.getTime() - start.getTime())/1000.0 + " 秒");
}
}
package parser;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import spider.HttpConnectionManager;
/**
 * Test program: downloads the qq.com front page through {@link HttpConnectionManager} and
 * prints the {@code href} and text of every anchor matched by the selector chain below.
 *
 * @author lidongyang
 * @createtime Oct 23, 2012 11:05:33 AM
 */
public class GetQqNews {

    /**
     * Fetches the page, parses it with Jsoup and prints one line per matched anchor.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpConnectionManager fetcher = new HttpConnectionManager();
        String pageSource = fetcher.getHtml("http://www.qq.com");

        // Narrow the document step by step: container -> lists -> items -> anchors.
        Document page = Jsoup.parse(pageSource);
        Elements containers = page.select("[class=ft fl]");
        Elements lists = containers.select("ul");
        Elements items = lists.select("li");
        Elements anchors = items.select("a");

        for (Element anchor : anchors) {
            String href = anchor.attr("href");
            String title = anchor.text();
            System.out.println(href + "----" + title);
        }
    }
}