Getting Started with Java Crawlers [Crawling with two request styles, setting configuration parameters, and wrapping an HttpClient utility]

Getting Started with Java Crawlers

Java web crawlers are highly extensible and scalable, and they are an important building block of modern search engines. For example, the well-known crawler Nutch is written in Java.

(1) Environment setup

1. Create a Maven project named itcast-crawler-first and add the following dependencies to its pom.xml:
<dependencies>
    <!-- HttpClient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <!-- jsoup HTML parser -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- JUnit -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <!-- Apache utility libraries -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.7</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
    <!-- Logging -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
</dependencies>
2. Create a file named log4j.properties under crawler\src\main\resources and add the following:
log4j.rootLogger=DEBUG,A1
log4j.logger.cn.itcast = DEBUG
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
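
As a quick sanity check that logging is wired up, here is a minimal sketch; the class name LogSmokeTest is hypothetical, but the cn.itcast package matches the logger configured above:

package cn.itcast;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LogSmokeTest {
    // slf4j-log4j12 routes this logger through the log4j.properties above
    private static final Logger logger = LoggerFactory.getLogger(LogSmokeTest.class);

    public static void main(String[] args) {
        // Printed on the console because cn.itcast is configured at DEBUG level
        logger.debug("logging is wired up");
    }
}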

(2) Getting started: crawling page data in Java

1. Fetching a page with the JDK's built-in URLConnection

URLConnection is an abstract class shipped with the JDK that represents a communication link between an application and a URL. In a crawler, we can use URLConnection to request a URL, obtain the response stream, and read the requested entity content from that stream.

package cn.itcast;

import org.junit.Test;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Demonstrates sending GET and POST requests with the plain JDK URLConnection API.
 */
public class JDKAPITest {

    @Test
    public void testGet() throws Exception {
        // 1. Determine the URL of the page
        URL url = new URL("http://www.itcast.cn/?username=zhangsan");
        // 2. Open a connection to the remote host via the URL object
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        // 3. Set the request method, parameters, and headers
        urlConnection.setRequestMethod("GET"); // must be upper case; GET is the default anyway
        urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36");
        urlConnection.setConnectTimeout(30000); // connect timeout in milliseconds
        urlConnection.setReadTimeout(30000);    // read timeout in milliseconds
        // 4. Read the response body
        InputStream in = urlConnection.getInputStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        String line;
        StringBuilder html = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            html.append(line).append("\n");
        }
        System.out.println(html);
        reader.close();
        in.close();
    }

    @Test
    public void testPost() throws Exception {
        // 1. Determine the URL of the page
        URL url = new URL("http://www.itcast.cn");
        // 2. Open a connection to the remote host
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        // 3. Set the request method, parameters, and headers
        urlConnection.setRequestMethod("POST");
        urlConnection.setDoOutput(true); // the output stream is disabled by default in the plain JDK API
        urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36");
        urlConnection.setConnectTimeout(30000); // connect timeout in milliseconds
        urlConnection.setReadTimeout(30000);    // read timeout in milliseconds
        OutputStream out = urlConnection.getOutputStream();
        out.write("username=zhangsan&password=123".getBytes());
        out.close(); // closing the output stream finishes sending the request body
        // 4. Read the response body
        InputStream in = urlConnection.getInputStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        String line;
        StringBuilder html = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            html.append(line).append("\n");
        }
        System.out.println(html);
        reader.close();
        in.close();
    }
}
2. HttpClient

HttpClient began as a subproject of Apache Jakarta Commons (it now lives under Apache HttpComponents) and provides an efficient, up-to-date, feature-rich client-side HTTP toolkit that tracks the latest versions and recommendations of the HTTP protocol. Compared with the JDK's URLConnection, it is easier to use and more flexible. Its main job is to send requests to a server and return the corresponding resources. In practice, crawlers typically use HttpClient to fetch page content and jsoup to parse it.
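
Since jsoup 1.10.3 is already in the pom above, here is a minimal parsing sketch showing how the two fit together; the HTML snippet and the a[href] selector are purely illustrative:

package cn.itcast;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupQuickStart {

    public static void main(String[] args) {
        // In a real crawler this string would come from HttpClient
        String html = "<html><body><a href='http://www.itcast.cn'>itcast</a></body></html>";
        // Parse the HTML into a DOM document
        Document doc = Jsoup.parse(html);
        // CSS-style selectors pull out the elements the crawler cares about
        for (Element link : doc.select("a[href]")) {
            System.out.println(link.attr("href") + " -> " + link.text());
        }
    }
}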

3. GET requests
package cn.itcast;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

/**
 * Demonstrates a simple crawler built on HttpClient.
 */
public class HttpClientTest {

    @Test
    public void testGet() throws Exception {
        // 1. Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. Create the GET request and configure it
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36 SLBrowser/6.0.1.6181");
        // 3. Execute the request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 4. Read the response body
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(html);
        }
        // 5. Release resources: close the response before the client
        response.close();
        httpClient.close();
    }
}
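
Both CloseableHttpClient and CloseableHttpResponse implement Closeable, so the two close() calls above can be handed to try-with-resources. A minimal sketch of the same GET as a drop-in method for the class above (reusing its imports); resources are closed automatically in reverse order, response first:

    @Test
    public void testGetTryWithResources() throws Exception {
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // Both resources are closed even if an exception is thrown mid-request
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        }
    }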
4. POST requests
package cn.itcast;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;

/**
 * Demonstrates a simple crawler built on HttpClient.
 */
public class HttpClientTest {

    @Test
    public void testPost() throws Exception {
        // 1. Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. Create the HttpPost object and configure it
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");
        // Collect the form parameters for the request
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("username", "java"));
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(params, "UTF-8");
        httpPost.setEntity(entity); // for POST, parameters travel in the request body
        // Set the request headers
        httpPost.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36 SLBrowser/6.0.1.6181");

        // 3. Execute the request
        CloseableHttpResponse response = httpClient.execute(httpPost);
        // 4. Check the status code and read the body
        if (response.getStatusLine().getStatusCode() == 200) {
            // Read the response entity with an explicit charset
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
        // 5. Release resources: close the response before the client
        response.close();
        httpClient.close();
    }
}
5. Connection pooling
package cn.itcast;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

/**
 * Demonstrates the HttpClient connection pool.
 */
public class HttpClientTest {

    @Test
    public void testPool() throws Exception {
        // 1. Create the pooling connection manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // 2. Configure the pool
        cm.setMaxTotal(200);          // maximum total connections
        cm.setDefaultMaxPerRoute(20); // maximum concurrent connections per host
        doGet(cm);
        doGet(cm);
    }

    private void doGet(PoolingHttpClientConnectionManager cm) throws Exception {
        // 3. Build an HttpClient backed by the pool (set a breakpoint here to
        //    observe that each call obtains a client drawing on the same pool)
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // 4. Create the HttpGet object
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // 5. Execute the request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 6. Read the response body
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
        // 7. Release resources
        response.close();
        // Do NOT close the HttpClient here: with a pool, used connections should
        // be returned to the pool rather than the client being shut down.
    }
}
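
If you do want each caller to close its client without tearing down the pool, HttpClientBuilder offers setConnectionManagerShared (added in HttpClient 4.4, so it applies to the 4.5.3 dependency above; worth verifying against your version). A sketch of the changed builder call in doGet:

        // Marking the manager as shared tells the client that it does not own
        // the pool, so httpClient.close() will no longer shut the pool down.
        CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setConnectionManagerShared(true)
                .build();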
6. Setting other configuration parameters
package cn.itcast;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

/**
 * Demonstrates request-level configuration for an HttpClient crawler.
 */
public class HttpClientTest {

    @Test
    public void testConfig() throws Exception {
        // 0. Create the request configuration object
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(10000)            // socket (read) timeout, ms
                .setConnectTimeout(10000)           // connection-establishment timeout, ms
                .setConnectionRequestTimeout(10000) // timeout for leasing a connection from the pool, ms
                .setProxy(new HttpHost("58.220.95.78", 9401)) // route requests through a proxy
                .build();

        // 1. Create the HttpClient object with that default configuration
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
        // 2. Create the HttpGet object
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // 3. Execute the request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 4. Read the response body
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(html);
        }
        // 5. Release resources: close the response before the client
        response.close();
        httpClient.close();
    }
}
Wrapping HttpClient in a utility class to make page fetching convenient
package cn.itcast.utils;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * HttpClient utility class for fetching page content conveniently.
 */
public abstract class HttpUtils {

    private static PoolingHttpClientConnectionManager cm = null; // the shared connection pool
    private static List<String> userAgentList = null;
    private static RequestConfig config = null;

    // The static block runs once, when the class is loaded
    static {
        cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(200);          // maximum total connections
        cm.setDefaultMaxPerRoute(20); // maximum concurrent connections per host
        config = RequestConfig.custom()
                .setSocketTimeout(10000)
                .setConnectTimeout(10000)
                .setConnectionRequestTimeout(10000)
                .build();

        // A pool of User-Agent strings; one is picked at random per request
        userAgentList = new ArrayList<String>();
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:73.0) Gecko/20100101 Firefox/73.0");
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0");
    }

    public static String getHtml(String url) {
        // 1. Build an HttpClient backed by the connection pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // 2. Create the HttpGet object
        HttpGet httpGet = new HttpGet(url);
        // 3. Attach the request config and a random User-Agent header
        httpGet.setConfig(config);
        httpGet.setHeader("User-Agent", userAgentList.get(new Random().nextInt(userAgentList.size())));

        CloseableHttpResponse response = null;
        // 4. Execute the request
        try {
            response = httpClient.execute(httpGet);
            // 5. Read the response body
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = "";
                if (response.getEntity() != null) {
                    html = EntityUtils.toString(response.getEntity(), "utf-8");
                }
                return html;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) { // guard against an execute() that threw before assigning
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Do NOT close httpClient here: with a pool, connections are
            // returned to the pool rather than the client being shut down.
        }
        return null;
    }

    public static void main(String[] args) {
        String html = HttpUtils.getHtml("http://www.itcast.cn");
        System.out.println(html);
    }
}
