Java Crawler: Using HttpClient

Get

  • GET request without parameters (a try-with-resources variant is sketched after the snippet)
    // Create the HttpClient instance that acts as the client
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpGet request with the target URL
    HttpGet httpGet = new HttpGet("http://www.baidu.com");
    CloseableHttpResponse response = null;
    try {
        // Execute the request with httpClient and obtain the response
        response = httpClient.execute(httpGet);

        // Parse the response body if the request succeeded
        if (response.getStatusLine().getStatusCode() == 200) {
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(content.length());
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Guard against a failed execute(), which would leave response null
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        try {
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
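
    Since HttpClient 4.3, CloseableHttpClient and CloseableHttpResponse both implement Closeable, so the same request can also be written with try-with-resources. A minimal, self-contained sketch of that variant (the class name is only for illustration; all snippets in this article rely on the org.apache.httpcomponents httpclient dependency):

    import java.io.IOException;

    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;

    public class SimpleGetDemo {
        public static void main(String[] args) {
            // Both resources are closed automatically, in reverse order of declaration
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpClient.execute(new HttpGet("http://www.baidu.com"))) {
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }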
    
  • GET request with parameters (executed and parsed as before; see the sketch after this snippet)
    // Create the HttpClient instance that acts as the client
    CloseableHttpClient httpClient = HttpClients.createDefault();

    // Create a URIBuilder for the base URL
    URIBuilder uriBuilder = new URIBuilder("http://www.baidu.com");
    // Add a query parameter, e.g. keys=Java
    uriBuilder.setParameter("keys", "Java");

    // Create the HttpGet request from the built URI
    HttpGet httpGet = new HttpGet(uriBuilder.build());
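
    From here the flow is the same as in the parameter-less example. A minimal continuation sketch, assuming the httpClient and httpGet built above and a surrounding method that declares the checked exceptions (the URIBuilder constructor and build() can throw URISyntaxException):

    // Execute the GET (the URL now carries the query string keys=Java) and parse it as before
    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(content.length());
        }
    }
    // Close the client once all requests are done
    httpClient.close();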
    

Post

  • POST request without parameters
    Simply replace HttpGet with HttpPost; everything else stays the same.
    // Create the HttpClient instance that acts as the client
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpPost request with the target URL
    HttpPost httpPost = new HttpPost("http://www.baidu.com");
    
  • POST request with form parameters (executed and parsed as before; see the sketch after this snippet)
    // Create the HttpClient instance that acts as the client
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpPost request with the target URL
    HttpPost httpPost = new HttpPost("http://www.baidu.com");
    // Declare a List to hold the form parameters
    List<NameValuePair> params = new ArrayList<NameValuePair>();
    // Add a form field: keys=Java
    params.add(new BasicNameValuePair("keys", "Java"));
    // Wrap the parameters in a URL-encoded form entity
    UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "UTF-8");
    // Attach the form entity to the POST request
    httpPost.setEntity(formEntity);
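
    The POST is then executed and parsed the same way as a GET. A minimal continuation sketch, assuming the httpClient and httpPost built above and a surrounding method that declares the checked exceptions:

    // Execute the POST; the form entity is sent as the request body (keys=Java)
    try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(content.length());
        }
    }
    httpClient.close();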
    

Connection Pool

  • Avoid creating a new HttpClient for every request by drawing from a connection pool (a reuse sketch follows the snippet)
    // Create the pooling connection manager that backs the pool
    PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    // Maximum number of connections in the pool (illustrative value)
    cm.setMaxTotal(100);
    // Maximum number of connections per route/host (illustrative value)
    cm.setDefaultMaxPerRoute(10);

    // Obtain an HttpClient backed by the connection pool
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    HttpGet httpGet = new HttpGet("http://www.baidu.com");
    CloseableHttpResponse response = null;
    try {
        response = httpClient.execute(httpGet);
        if (response.getStatusLine().getStatusCode() == 200) {
            String content = EntityUtils.toString(response.getEntity(), "utf8");
            System.out.println(content.length());
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // Do not close httpClient here; its connections are managed by the pool
    }
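
    A short sketch of how the pooled client might then be reused across several fetches, assuming the httpClient built from cm above (the URL list is illustrative): the response is closed after each request, returning its connection to the pool, while the client itself stays open.

    // Reuse the same pooled client for every URL in the crawl
    String[] urls = {"http://www.baidu.com", "http://www.baidu.com/s"};
    for (String url : urls) {
        try (CloseableHttpResponse resp = httpClient.execute(new HttpGet(url))) {
            if (resp.getStatusLine().getStatusCode() == 200) {
                System.out.println(EntityUtils.toString(resp.getEntity(), "UTF-8").length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Closing resp returns its connection to the pool; httpClient stays open
    }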
    

Request Configuration

  • Configure connection and socket timeouts for the crawl (the request is then executed as shown in the sketch below)
    // Create the HttpClient instance that acts as the client
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpGet request with the target URL
    HttpGet httpGet = new HttpGet("http://www.baidu.com");

    // Build the per-request configuration
    RequestConfig config = RequestConfig.custom()
            // Maximum time to establish the connection, in milliseconds
            .setConnectTimeout(1000)
            // Maximum time to obtain a connection from the connection manager
            .setConnectionRequestTimeout(500)
            // Maximum time to wait for data on the socket
            .setSocketTimeout(10 * 1000).build();

    // Apply the configuration to this request
    httpGet.setConfig(config);
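
    The configured request is then executed exactly as in the first GET example; a minimal continuation sketch, assuming the httpClient, httpGet, and config above:

    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(content.length());
        }
    } catch (IOException e) {
        // A timeout surfaces here, e.g. SocketTimeoutException or ConnectTimeoutException
        e.printStackTrace();
    }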
    
