Java爬虫①HttpClient

HttpClient是Apache中一个开源的项目,用来提供高效的,最新的,功能丰富的支持HTTP协议的客户端编程工具包,并且它支持HTTP协议最新版本和建议。

引入依赖


        
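    <dependencies>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>
    </dependencies>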

配置文件

log4j.rootLogger=DEBUG, m
log4j.appender.m=org.apache.log4j.ConsoleAppender
log4j.appender.m.layout=org.apache.log4j.PatternLayout
log4j.appender.m.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n

例子:

package com.sihi.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;


/**
 * Minimal HttpClient example: issues a single GET request and prints the response body.
 */
public class TestHttpClient {
    /**
     * Fetches the course search page and prints its body decoded as UTF-8.
     *
     * @param args unused
     * @throws IOException if the request fails or the response body cannot be read
     */
    public static void main(String[] args) throws IOException {
        // 2. "Type in the address": build the request.
        HttpGet httpGet = new HttpGet("http://www.sikiedu.com/course/search?categoryId=0&orderBy=recommendedSeq");
        // Present a browser User-Agent header with the request.
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75");

        // 1. "Open the browser": create the client. 3. Send the request.
        // try-with-resources closes the response first and the client second,
        // even when reading the body throws.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            // 4. Parse the server response.
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                System.out.println(EntityUtils.toString(entity, "UTF-8"));
            }
        }
    }
}

get请求

package com.sihi.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * GET request examples: without parameters, with parameters embedded in the URL,
 * and with parameters assembled through {@link URIBuilder}.
 */
public class TestHttpGet {
    /**
     * Runs one of the GET examples; swap the active call to try another variant.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // GET without parameters
        //httpGet();
        // GET with parameters written directly in the URL
        //httpgetByParam1();
        // GET with parameters assembled by URIBuilder
        httpgetByParam2();
    }

    /** Sends a GET request without query parameters and prints the body. */
    private static void httpGet(){
        executeAndPrint(new HttpGet("http://www.sikiedu.com"));
    }

    /** Sends a GET request whose query parameters are part of the URL string and prints the body. */
    private static void httpgetByParam1(){
        executeAndPrint(new HttpGet("http://www.sikiedu.com/course/search?categoryId=0&orderBy=recommendedSeq"));
    }

    /** Sends a GET request whose URI is assembled piece by piece with URIBuilder and prints the body. */
    private static void httpgetByParam2(){
        URI uri;
        try {
            // Alternative: pass both pairs at once with
            // setParameters(new BasicNameValuePair("categoryId", "0"), new BasicNameValuePair("orderBy", "recommendedSeq")).
            uri = new URIBuilder()
                    .setScheme("http")
                    .setHost("www.sikiedu.com")
                    .setPath("/course/search")
                    .setParameter("categoryId", "0")
                    .setParameter("orderBy", "recommendedSeq")
                    .build();
        } catch (URISyntaxException e) {
            e.printStackTrace();
            // Without a valid URI there is nothing to request.
            return;
        }
        executeAndPrint(new HttpGet(uri));
    }

    /**
     * Executes the request with a fresh default client and prints the body when the
     * status code is 200. The response is closed before the client.
     *
     * @param request the GET request to execute
     */
    private static void executeAndPrint(HttpGet request){
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(request)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (200 == statusCode) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    System.out.println(EntityUtils.toString(entity,"UTF-8"));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

post请求

导入alibaba的一个json的解析依赖


        
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.68</version>
        </dependency>

package com.sihi.crawler.test;

import com.alibaba.fastjson.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * POST request example: calls an IP lookup API and prints the "ip" field of the JSON reply.
 */
public class TestHttpPost {
    /**
     * Posts to http://api.k780.com/?app=ip.local&amp;format=json and prints {@code result.ip}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<NameValuePair> nameValuePairs = new ArrayList<>();
        nameValuePairs.add(new BasicNameValuePair("app","ip.local"));
        nameValuePairs.add(new BasicNameValuePair("format","json"));

        URI uri;
        try {
            uri = new URIBuilder().setScheme("http").setHost("api.k780.com").setParameters(nameValuePairs).build();
        } catch (URISyntaxException e) {
            e.printStackTrace();
            // Without a valid URI there is nothing to request.
            return;
        }
        HttpPost httpPost = new HttpPost(uri);

        // try-with-resources closes the response first, then the client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (200 != statusCode) {
                return;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return;
            }
            String json = EntityUtils.toString(entity,"UTF-8");
            // Expected shape: {"success":"1","result":{"ip":"113.101.45.58", ...}}
            // Guard every level so an error payload does not end in a
            // ClassCastException or NullPointerException.
            Object parsed = JSONObject.parse(json);
            Object result = parsed instanceof Map ? ((Map<?, ?>) parsed).get("result") : null;
            Object ip = result instanceof Map ? ((Map<?, ?>) result).get("ip") : null;
            if (ip != null) {
                System.out.println(ip.toString());
            } else {
                System.err.println("Unexpected response, no result.ip field: " + json);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

HttpClient连接池

HttpClient代理

爬虫如果使用同一个IP地址频繁访问同一台服务器,很容易被对方封禁。因此可以设置一个互联网上的代理IP,由代理去访问目标服务器并返回响应。
https://www.xicidaili.com/nn 西刺代理

package com.sihi.crawler.test;


import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Connection pool and proxy example: builds a client backed by a pooling connection
 * manager, routes the request through a proxy, and prints the response body.
 */
public class TestHttpPool {
    /**
     * Sends one GET request through the pooled, proxied client.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Pool-level settings.
        PoolingHttpClientConnectionManager poolManger = new PoolingHttpClientConnectionManager();
        poolManger.setMaxTotal(100);            // maximum total connections
        poolManger.setDefaultMaxPerRoute(20);   // maximum connections per route

        // Per-request settings applied to every request made by the client.
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(5000)            // timeout for establishing the connection (ms)
                .setSocketTimeout(2000)             // timeout waiting for response data (ms)
                .setConnectionRequestTimeout(500)   // timeout for leasing a connection from the pool (ms)
                .setProxy(new HttpHost("114.105.103.142",4216))  // proxy IP and port
                .build();

        // 2. Create the GET request.
        HttpGet httpGet = new HttpGet("http://www.sikiedu.com");

        // 3. Build the client and execute the request.
        // try-with-resources closes the response first, then the client.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                     .setConnectionManager(poolManger)
                     .setDefaultRequestConfig(config)
                     .build();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (200 == statusCode) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    System.out.println(EntityUtils.toString(entity,"UTF-8"));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

将上面的操作封装成一个工具类
HttpClientUtil

package com.sihi.crawler.test;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Utility wrapping a single pooled {@link CloseableHttpClient} for simple GET and POST calls.
 *
 * <p>The pool, the request configuration and the client are created once when the class
 * is initialised, so every call shares the same client without any lazy-initialisation check.
 */
public class HttpClientUtil {

    /** Shared connection pool: at most 100 connections in total, 20 per route. */
    private static final PoolingHttpClientConnectionManager POOL_MANAGER = createPoolManager();

    /** Default timeouts applied to every request issued through the shared client. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setConnectTimeout(5000)            // timeout for establishing the connection (ms)
            .setSocketTimeout(2000)             // timeout waiting for response data (ms)
            .setConnectionRequestTimeout(500)   // timeout for leasing a connection from the pool (ms)
            // To route through a proxy, add .setProxy(new HttpHost(host, port)) here.
            .build();

    /** The one client instance reused by all requests. */
    private static final CloseableHttpClient HTTP_CLIENT = HttpClients.custom()
            .setConnectionManager(POOL_MANAGER)
            .setDefaultRequestConfig(REQUEST_CONFIG)
            .build();

    /** Builds and sizes the shared connection pool. */
    private static PoolingHttpClientConnectionManager createPoolManager(){
        PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
        manager.setMaxTotal(100);
        manager.setDefaultMaxPerRoute(20);
        return manager;
    }

    /**
     * Returns the shared pooled client.
     *
     * <p>Callers should close the responses they obtain, not the client itself,
     * because the same instance serves every request made through this class.
     *
     * @return the shared client backed by the connection pool
     */
    public static CloseableHttpClient getHttpClient(){
        return HTTP_CLIENT;
    }

    /**
     * Executes a GET request and returns the response body.
     *
     * @param url the URL to request
     * @return the body decoded as UTF-8, or an empty string when the status code is not 200,
     *         the response has no entity, or an I/O error occurs
     */
    public static String doGet(String url){
        try (CloseableHttpResponse response = getHttpClient().execute(new HttpGet(url))) {
            return readBodyIfOk(response);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Executes a POST request (without a body) and returns the response body.
     *
     * @param url the URL to request
     * @return the body decoded as UTF-8, or an empty string when the status code is not 200,
     *         the response has no entity, or an I/O error occurs
     */
    public static String doPost(String url){
        try (CloseableHttpResponse response = getHttpClient().execute(new HttpPost(url))) {
            return readBodyIfOk(response);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Reads the response body as UTF-8 when the status code is 200.
     *
     * @param response the response to read; the caller remains responsible for closing it
     * @return the decoded body, or an empty string for a non-200 status or a missing entity
     * @throws IOException if reading the body fails
     */
    private static String readBodyIfOk(CloseableHttpResponse response) throws IOException {
        if (200 != response.getStatusLine().getStatusCode()) {
            return "";
        }
        HttpEntity entity = response.getEntity();
        return entity == null ? "" : EntityUtils.toString(entity,"UTF-8");
    }
}

Java爬虫②Jsoup

你可能感兴趣的:(Java爬虫①HttpClient)