网络爬虫速成指南(一)网页下载

 
   
注解:此处仅介绍一些类库及其常规用法,如果想详细了解 HTTP 协议,推荐阅读《HTTP权威指南》。
 
   

.NET 方向:主要是用到HttpWebRequest下载内容:

JAVA方向:
主要是用到HttpClient下载内容
示例代码:
相关类库(httpclient-4.1.2 httpcore-4.1.4)
示例代码(Java):
package com.data.crawl.qa.baiduzhidao;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;

import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.impl.cookie.BestMatchSpecFactory;
import org.apache.http.impl.cookie.BrowserCompatSpecFactory;

/**
 * HttpClient连接池
 *
 * @author wqj
 *
 */
public class HttpClientPool {
    
    private static Log log = LogFactory.getLog(HttpClientPool.class);
    
    /**
     * 最大HttpClient连接数
     */
    private final int MAX_TOTAL_CONNECTIONS = 10;

    /**
     * HttpClient连接池
     */
    private PoolingHttpClientConnectionManager connectionManager;


    /**
     * cookie 上下文
     */
    protected HttpClientContext context = null;


    /**
     * default constructor
     */
    public HttpClientPool(){
        connectionManager = new PoolingHttpClientConnectionManager();
        /* 连接池最大生成连接数200 */
        connectionManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
        /* 默认设置route最大连接数为20 */
        connectionManager.setDefaultMaxPerRoute(10);
     // 实例化cookie
        context = HttpClientContext.create();
        Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider> create()
                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory()).build();
        context.setCookieSpecRegistry(registry);
    }

    /**
     * 从线程池实例化HttpClient
     *
     * @return
     */
    private CloseableHttpClient getHttpClient() {
        int socketTimeOut = 120000;
        int connectionTimeOut = 60000;

        RequestConfig config = RequestConfig.custom().setSocketTimeout(socketTimeOut)
                .setConnectTimeout(connectionTimeOut).setCookieSpec(CookieSpecs.BEST_MATCH).build();
        return HttpClients.custom().setDefaultRequestConfig(config).setConnectionManager(connectionManager).build();
    }

    /**
     * Post方式
     */
    public String Post(String uri, Map<String, String> params) {
        CloseableHttpClient httpclient = getHttpClient();
        HttpPost httpost = new HttpPost(uri);
        List<NameValuePair> post_data = new ArrayList<NameValuePair>();

        Set<String> keySet = params.keySet();
        for (String key : keySet) {
            post_data.add(new BasicNameValuePair(key, params.get(key)));
        }

        CloseableHttpResponse response = null;

        try {
            httpost.setEntity(new UrlEncodedFormEntity(post_data, "UTF-8"));
            response = httpclient.execute(httpost, context);
            
            //默认编码
            String charset = "utf-8";            
            HttpEntity entity = response.getEntity();

            String html = null;
            if (entity != null) {
                InputStream in = entity.getContent();

                /* 侦测编码 */
                ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
                byte[] buff = new byte[1024];
                int rc = 0;
                while ((rc = in.read(buff, 0, 1024)) > 0) {
                    swapStream.write(buff, 0, rc);
                }
                byte[] data = swapStream.toByteArray();

                String charset_1 = Icu4jDetector.getEncode(data);
                charset = charset_1 == null ? charset : charset_1;

                html = new String(data, charset);
                System.out.println(html);
                in.close();
            }
            return html;
        } catch (UnsupportedEncodingException e) {
            log.error(e.getMessage());
        } catch (ClientProtocolException e) {
            log.error(e.getMessage());
        } catch (IOException e) {
            log.error(e.getMessage());
        }
        return null;
    }

    /**
     * 模拟登陆时,访问首页时使用此方法,此方法不带cookie
     *
     * @param uri 统一资源定位符
     * @return html文档
     */
    public String downHtml(String uri) {
        CloseableHttpClient httpclient = getHttpClient();
        HttpGet httpget = new HttpGet(uri);
        CloseableHttpResponse response = null;

        try {
            response = httpclient.execute(httpget);

            /* 判断访问的状态码 */
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                log.info("request failed: " + response.getStatusLine());
                return null;
            }

            /* 侦测编码 */
            Pattern pattern = Pattern.compile("text/html;[\\s]*charset=(.*)");
            Header[] arr = response.getHeaders("Content-Type");
            String charset = "utf-8";
            if (arr != null) {
                String content = arr[0].getValue().toLowerCase();
                Matcher m = pattern.matcher(content);
                if (m.find()) {
                    charset = m.group(1);
                }
            }

            HttpEntity entity = response.getEntity();
            String html = null;
            if (entity != null) {
                InputStream in = entity.getContent();

                /* 侦测编码 */
                ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
                byte[] buff = new byte[1024];
                int rc = 0;
                while ((rc = in.read(buff, 0, 1024)) > 0) {
                    swapStream.write(buff, 0, rc);
                }
                byte[] data = swapStream.toByteArray();

                String charset_1 = Icu4jDetector.getEncode(data);
                charset = charset_1 == null ? charset : charset_1;

                html = new String(data, charset);
                in.close();
            }
            return html;

        } catch (ClientProtocolException e) {
           log.info(e.getMessage());
        } catch (IOException e) {
            log.info(e.getMessage());
        }
        return null;
    }

}

 

你可能感兴趣的:(网络爬虫)