爬虫简单示例,用httpClient4.2.1实现(转载)

HttpConnectionManager.java

复制代码
package spider;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import javax.net.ssl.SSLHandshakeException;

import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NoHttpResponseException;
import org.apache.http.ParseException;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;


/**
 * HTTP connection and page-fetching manager.
 *
 * @author  lidongyang
 * @createtime Oct 18, 2012 1:55:18 PM
 *
 * @note basic test version
 */
public  class HttpConnectionManager {
    
     /** Maximum total number of connections in the pool (passed to {@code cm.setMaxTotal}). */
     public  static  final  int MAX_TOTAL_CONNECTIONS = 100;
    
     /** Default maximum number of connections per route (passed to {@code cm.setDefaultMaxPerRoute}). */
     public  static  final  int MAX_ROUTE_CONNECTIONS = 50;
    
     /**
      * Connection timeout, set as {@code CoreConnectionPNames.CONNECTION_TIMEOUT}.
      * NOTE(review): HttpClient documents this parameter in milliseconds, i.e. 50 s here.
      */
     public  static  final  int CONNECT_TIMEOUT = 50000;
    
     /**
      * Socket timeout, set as {@code CoreConnectionPNames.SO_TIMEOUT}.
      * NOTE(review): HttpClient documents this parameter in milliseconds, i.e. 50 s here.
      */
     public  static  final  int SOCKET_TIMEOUT = 50000;
    
     /**
      * Timeout for a request blocked waiting for a pooled connection, set as
      * {@code ClientPNames.CONN_MANAGER_TIMEOUT}. Declared {@code long} so it
      * is boxed as a {@code Long} when stored in the parameter set.
      */
     public  static  final  long CONN_MANAGER_TIMEOUT = 60000;
    
     /** HTTP parameters shared by every request issued through {@link #httpClient}. */
     private  static HttpParams parentParams;
    
     /** Pooling connection manager backing {@link #httpClient}. */
     private  static PoolingClientConnectionManager cm;
    
     /** The single shared HTTP client, created once in the static initializer. */
     private  static DefaultHttpClient httpClient;
    
     /**
     * 默认目标主机
     
*/
     private  static  final HttpHost DEFAULT_TARGETHOST =  new HttpHost("http://www.qq.com", 80);
    
     /**
     * 初始化http连接池,设置参数、http头等等信息
     
*/
     static {
        SchemeRegistry schemeRegistry =  new SchemeRegistry();
        schemeRegistry.register(
                  new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        schemeRegistry.register(
                  new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

        cm =  new PoolingClientConnectionManager(schemeRegistry);

        cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        
        cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);

        cm.setMaxPerRoute( new HttpRoute(DEFAULT_TARGETHOST), 20);         // 设置对目标主机的最大连接数
        
        parentParams =  new BasicHttpParams(); 
        parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);

        parentParams.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);     // 设置默认targetHost
        
        parentParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
        
        parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT);
        parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT);
        parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT);
        
        parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS,  true);
        parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS,  true);
        
         // 设置头信息,模拟浏览器
        Collection
 collection =  new ArrayList
();
        collection.add( new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"));
        collection.add( new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
        collection.add( new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5"));
        collection.add( new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7"));
        collection.add( new BasicHeader("Accept-Encoding", "gzip, deflate"));
        
        parentParams.setParameter(ClientPNames.DEFAULT_HEADERS, collection);
         // 请求重试处理
        HttpRequestRetryHandler httpRequestRetryHandler =  new HttpRequestRetryHandler() {
             public  boolean retryRequest(IOException exception,  int executionCount, HttpContext context) {
                 if (executionCount >= 5) {
                     //  如果超过最大重试次数,那么就不要继续了
                     return  false;
                }
                 if (exception  instanceof NoHttpResponseException) {
                     //  如果服务器丢掉了连接,那么就重试
                     return  true;
                }
                 if (exception  instanceof SSLHandshakeException) {
                     //  不要重试SSL握手异常
                     return  false;
                }
                HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
                 boolean idempotent = !(request  instanceof HttpEntityEnclosingRequest);
                 if (idempotent) {
                     //  如果请求被认为是幂等的,那么就重试
                     return  true;
                }
                 return  false;
            }
        };
        
        httpClient =  new DefaultHttpClient(cm, parentParams);
        
        httpClient.setHttpRequestRetryHandler(httpRequestRetryHandler);
    }
    
     /**
     * 抓取页面代码
     * 
@param  url 目标页面的url
     * 
@return  页面代码
     
*/
     public String getHtml(String url) {
        HttpHost proxyHost =  new HttpHost("211.142.236.137", 8080); // 代理
        
        String html = getHtml(url, proxyHost);
        
         int count = 0;
         while(StringUtils.isEmpty(html)){
            proxyHost =  new HttpHost("211.142.236.137", 80); // 更换代理
            html = getHtml(url, proxyHost);
            count++;
             if(count > 3){
                System.out.println("抓取失败");
                 break;
            }
        }
        
System.out.println(html.length());
         return html;
    }
    
     /**
     * 抓取url所指的页面代码
     * 
@param  url 目标页面的url
     * 
@return  页面代码
     
*/
     public String getHtml(String url, HttpHost proxyHost) {
        String html = "";
        HttpGet httpGet =  new HttpGet(url);
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost); // 设置代理
        
        HttpResponse httpResponse;
        HttpEntity httpEntity;
         try {
            httpResponse = httpClient.execute(httpGet);
            
            StatusLine statusLine = httpResponse.getStatusLine();
             int statusCode = statusLine.getStatusCode();
System.out.println(statusCode);
             if(200 != statusCode) {
                 return html;
            }
            
            httpEntity = httpResponse.getEntity();
             if(httpEntity !=  null){
                html = readHtmlContentFromEntity(httpEntity);
            }
        }  catch (ClientProtocolException e) {
             //  TODO Auto-generated catch block
            e.printStackTrace();
        }  catch (IOException e) {
             //  TODO Auto-generated catch block
            e.printStackTrace();
        }  finally {
             if(httpGet !=  null){
                httpGet.releaseConnection();
            }
        }
        
         return html;
    }
    
     /**
     * 从response返回的实体中读取页面代码
     * 
@param  httpEntity Http实体
     * 
@return  页面代码
     * 
@throws  ParseException
     * 
@throws  IOException
     
*/
     private String readHtmlContentFromEntity(HttpEntity httpEntity)  throws ParseException, IOException {
        String html = "";
        Header header = httpEntity.getContentEncoding();
         if(httpEntity.getContentLength() < 2147483647L){             // EntityUtils无法处理ContentLength超过2147483647L的Entity
             if(header !=  null && "gzip".equals(header.getValue())){
                html = EntityUtils.toString( new GzipDecompressingEntity(httpEntity));
            }  else {
                html = EntityUtils.toString(httpEntity);
            }
        }  else {
            InputStream in = httpEntity.getContent();
             if(header !=  null && "gzip".equals(header.getValue())){
                html = unZip(in, ContentType.getOrDefault(httpEntity).getCharset().toString());
            }  else {
                html = readInStreamToString(in, ContentType.getOrDefault(httpEntity).getCharset().toString());
            }
             if(in !=  null){
                in.close();
            }
        }
         return html;
    }
    
     /**
     * 测试代理是否可用(其实和getHtml(String url, HttpHost proxyHost)的代码差不多,为了从功能上区别,暂时这样)
     * 
@param  httpHost 封装了代理的ip地址和端口
     * 
@param  url 用来测试的页面
     * 
@return  true 可用 false 不可用
     
*/
     public  boolean isProxyUsable(HttpHost proxyHost, String url) {
        HttpGet httpGet =  new HttpGet(url);
        httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
         try {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            
            StatusLine statusLine = httpResponse.getStatusLine();
             int statusCode = statusLine.getStatusCode();
System.out.println(statusCode);
             if(200 != statusCode) {
                 return  false;
            }
            HttpEntity httpEntity = httpResponse.getEntity();
             if(httpEntity !=  null) {
                String html = readHtmlContentFromEntity(httpEntity);
System.out.println(html.length());
                 if(StringUtils.isEmpty(html)){
                     return  false;
                }
            }  else {
                 return  false;
            }
            
        }  catch (ClientProtocolException e) {
             //  TODO Auto-generated catch block
            e.printStackTrace();
             return  false;
        }  catch (IOException e) {
             //  TODO Auto-generated catch block
            e.printStackTrace();
             return  false;
        }
        
         return  true;
    }
    
     /**
     * 解压服务器返回的gzip流
     * 
@param  in 抓取返回的InputStream流
     * 
@param  charSet 页面内容编码
     * 
@return  页面内容的String格式
     * 
@throws  IOException
     
*/
     private String unZip(InputStream in, String charSet)  throws IOException {
        ByteArrayOutputStream baos =  new ByteArrayOutputStream();
        GZIPInputStream gis =  null;
         try {
            gis =  new GZIPInputStream(in);
             byte[] _byte =  new  byte[1024];
             int len = 0;
             while ((len = gis.read(_byte)) != -1) {
                baos.write(_byte, 0, len);
            }
            String unzipString =  new String(baos.toByteArray(), charSet);
             return unzipString;
        }  finally {
             if (gis !=  null) {
                gis.close();
            }
             if(baos !=  null){
                baos.close();
            }
        }
    }
    
     /**
     * 读取InputStream流
     * 
@param  in InputStream流
     * 
@return  从流中读取的String
     * 
@throws  IOException
     
*/
     private String readInStreamToString(InputStream in, String charSet)  throws IOException {
        StringBuilder str =  new StringBuilder();
        String line;
        BufferedReader bufferedReader =  new BufferedReader( new InputStreamReader(in, charSet));
         while((line = bufferedReader.readLine()) !=  null){
            str.append(line);
            str.append("\n");
        }
         if(bufferedReader !=  null) {
            bufferedReader.close();
        }
         return str.toString();
    }
    
     /**
     * for test
     * 
@author  lidongyang
     * @createtime Oct 18, 2012 2:35:09 PM
     
*/
     public  class Test  implements Runnable {
        String url;
         int threadNum;
        
         public Test() {
            
        }
        
         public Test(String url,  int threadNum) {
             this.url = url;
             this.threadNum = threadNum;
        }
        
        @Override
         public  void run() {
            getHtml(url);
        }
    }
    
    
     /**
     * for test
     * 
@param  args
     * 
@throws  InterruptedException 
     
*/
     public  static  void main(String[] args)  throws InterruptedException{
        HttpConnectionManager httpConnectionManager =  new HttpConnectionManager();
        Date start =  new Date();
        httpConnectionManager.getHtml("http://www.qq.com");
        Date end =  new Date();
        System.out.println((end.getTime() - start.getTime())/1000.0 + " 秒");
    }
}
复制代码

GetQqNews.java

复制代码

 package parser;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import spider.HttpConnectionManager;

/**  test
 * 
@author  lidongyang
 * @createtime Oct 23, 2012 11:05:33 AM
 
*/
public  class GetQqNews {
    
    
     public  static  void main(String[] args){
        HttpConnectionManager httpConnectionManager =  new HttpConnectionManager();
        String html = httpConnectionManager.getHtml("http://www.qq.com");
        Document doc = Jsoup.parse(html);
        Elements newsList = doc.select("[class=ft fl]").select("ul").select("li").select("a");
         for (Element element : newsList) {
            System.out.println(element.attr("href") + "----" + element.text());
        }
    }
}
复制代码


你可能感兴趣的:(java,httpclient,spider)