JAVA使用HttpClient实现爬虫技术

1. pom文件中加入httpClient依赖包


   <dependency>
       <groupId>org.apache.httpcomponents</groupId>
       <artifactId>httpclient</artifactId>
       <version>4.3.1</version>
   </dependency>

2. 创建一个调用httpClient的工具类

public class HttpClientUtil {


    private CloseableHttpClient closeableHttpClient;

    private RequestConfig requestConfig;

    // 最大的连接数
    private int maxTotal = 10;

    // 最大的并发数
    private int defaultMaxPerRoute = 5;

    // 连接超时数
    private int connectTimeOut = 2000;

    // 数据传输的最长时间
    private int socketTimeout = 10000;

    // 在连接之前测试连接可不可用
    private boolean staleConnectionCheckEnabled = true;

    // 从数据池中获取连接的最长时间
    private int connectionRequestTimeOut = 500;



    public HttpClientUtil() {

        createCloseableHttpClient();
        createRequestConfig();
    }

    /**
     *  创建CloseableHttpClient
     */
    private void createCloseableHttpClient() {

        PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
        connectionManager.setMaxTotal(maxTotal);
        connectionManager.setDefaultMaxPerRoute(defaultMaxPerRoute);

        HttpClientBuilder httpClientBuilder = HttpClientBuilder.create();
        httpClientBuilder.setConnectionManager(connectionManager);
        this.closeableHttpClient = httpClientBuilder.build();
    }

    /**
     *  创建requestConfig
     */
    private void createRequestConfig() {

        RequestConfig.Builder custom = RequestConfig.custom();
        custom.setConnectTimeout(connectTimeOut)
                .setSocketTimeout(socketTimeout)
                .setStaleConnectionCheckEnabled(staleConnectionCheckEnabled)
                .setConnectionRequestTimeout(connectionRequestTimeOut);

        this.requestConfig = custom.build();

    }

    /**
     *  get请求不带参数
     * @param url
     * @return
     * @throws Exception
     */
    public String doGet(String url) throws Exception {

        // 先获取地址的请求对象
        HttpGet httpGet = new HttpGet(url);
        // 配置参数
        httpGet.setConfig(requestConfig);
        // 执行请求
        CloseableHttpResponse response = closeableHttpClient.execute(httpGet);

        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(),"UTF-8");
        }
        return null;
    }

    /**
     *  get请求带参数
     * @param url
     * @param map
     * @return
     * @throws Exception
     */
    public String doGet(String url, Map, Object> map) throws Exception {

        URIBuilder uriBuilder = new URIBuilder(url);
        if (map != null) {
            Set, Object>> entrySet = map.entrySet();
            for (Map.Entry, Object> entry : entrySet) {
                uriBuilder.addParameter(entry.getKey(),entry.getValue().toString());
            }
        }
        return this.doGet(uriBuilder.build().toString());
    }


    /**
     *  带参数的post请求
     * @param url
     * @param map
     * @return
     * @throws Exception
     */
    public String doPost(String url, Map, Object> map) throws Exception {

        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(requestConfig);
        if (map != null) {
            List pairList = new ArrayList<>();
            Set, Object>> entrySet = map.entrySet();
            for (Map.Entry,Object> entry : entrySet) {
                pairList.add(new BasicNameValuePair(entry.getKey(),entry.getValue().toString()));
            }
            UrlEncodedFormEntity entity = new UrlEncodedFormEntity(pairList);
            httpPost.setEntity(entity);
        }

        CloseableHttpResponse response = closeableHttpClient.execute(httpPost);
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(),"UTF-8");
        }

        return null;
    }

    /**
     *  post请求不带参数
     * @param url
     * @return
     * @throws Exception
     */
    public String doPost(String url) throws Exception {

        return this.doPost(url,null);
    }

3. 利用JUnit进行单元测试

@Test
public void test1() {

    String url = "https://blog.csdn.net/javalixy/article/details/76284524";
    HttpClientUtil clientUtil = new HttpClientUtil();
    try {
        String result = clientUtil.doGet(url);
        parseHtml(result);
    } catch (Exception e) {
        e.printStackTrace();
    }

可以得到返回的Html页面

4.使用开源框架Jsoup进行html页面的解析

4.1  加入jsoup依赖


   <dependency>
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
       <version>1.7.3</version>
   </dependency>

4.2 jsoup解析页面

/**
 * Parses an HTML page and prints a summary of its {@code <link href>},
 * {@code <span>} and {@code <img>} elements to standard output.
 *
 * @param result the HTML source to parse
 */
private void parseHtml(String result) {

    Document document = Jsoup.parse(result);
    Elements linkElements = document.select("link[href]");
    Elements textElements = document.select("span");
    Elements imgElements = document.select("img");

    System.out.println(String.format("LinkElements: (%d)", linkElements.size()));
    System.out.println(String.format("TextElements: (%d)", textElements.size()));
    System.out.println(String.format("ImgElements: (%d)", imgElements.size()));

    // NOTE: the document is parsed without a base URI, so "abs:" lookups
    // yield an empty string for relative URLs.
    for (Element link : linkElements) {
        // <link> elements carry no text; their "rel" attribute is the useful description
        print(" * link: <%s>  (%s)", link.attr("abs:href"), trim(link.attr("rel"), 35));
    }
    for (Element text : textElements) {
        // "class" is not a URL, so it must be read without the "abs:" prefix
        print("* text: <%s> (%s)", text.attr("class"), trim(text.text(), 35));
    }
    for (Element img : imgElements) {
        // <img> elements carry no text; show the "alt" description instead
        print("* img: <%s> (%s)", img.attr("abs:src"), trim(img.attr("alt"), 35));
    }
}

/**
 * Formats the arguments with the given format string and writes the result
 * as one line to standard output.
 *
 * @param str the {@code String.format} pattern
 * @param msg the values substituted into the pattern
 */
private void print(String str, Object... msg) {
    String line = String.format(str, msg);
    System.out.println(line);
}

/**
 * Shortens a string so that it is at most {@code width} characters long.
 *
 * <p>A string that already fits is returned unchanged. A longer string is
 * cut to {@code width - 1} characters and terminated with a single
 * {@code '.'}, so the result is exactly {@code width} characters.
 *
 * @param str   the text to shorten; must not be {@code null}
 * @param width the maximum length of the result; values below 1 yield an
 *              empty string for any non-fitting input
 * @return the original string if it fits, otherwise the shortened form
 */
private static String trim(String str, int width) {
    if (str.length() <= width) {
        return str;
    }
    if (width < 1) {
        // No room even for the marker; also avoids a negative substring index
        return "";
    }
    return str.substring(0, width - 1) + ".";
}

你可能感兴趣的:(HttpClient)