Java爬虫-快速入门 HttpClient+JSoup详解

1. HttpClient与Jsoup简介

1.1 HttpClient

HttpClient 是 Apache 提供的一个高效、功能丰富的 HTTP 协议客户端编程工具包,并且它支持 HTTP 协议最新的版本和建议。

HttpClient的作用

  • 实现了所有 HTTP 的方法(GET,POST,PUT,HEAD 等)
  • 支持自动转向
  • 支持 HTTPS 协议
  • 支持代理服务器等
1.2 JSoup

jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

JSoup的作用

  • 从一个URL,文件或字符串中解析HTML;
  • 使用DOM或CSS选择器来查找、取出数据;
  • 可操作HTML元素、属性、文本;
1.3 为什么要一起使用

httpClient 属于专业的抓取网页的库,可以设置代理,抓取失败可以重试抓取

在我的实际使用中,单独用jsoup也可以直接抓取网页,但jsoup在抓取方面比较弱:API简单,功能也简单,它的主要用途是解析html(可以看作是对htmlparser功能的扩展)。测试过程中用jsoup抓取页面经常报错(例如time out等)。

因此,我们可以用httpclient抓取网页,再用Jsoup.parse解析页面。

2.项目maven依赖


<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.10</version>
</dependency>

3.HttpClientUtils工具类

@SuppressWarnings("unused")
public class HttpClientUtils {

    /** Default charset name used when decoding response bodies. */
    private static String CHARSET = "utf-8";

    /** HTTP status code that the GET helpers treat as success. */
    private static Integer STATUS_CODE = 200;

    /** Shared client, built once by the static initializer below and reused for every request. */
    private static CloseableHttpClient httpClient;

    /**
     * Locates the charset declared in a page's {@code <meta>} tag; group 4 captures the charset
     * name.
     *
     * NOTE(review): this line was damaged by HTML tag stripping in the text it was recovered from
     * (everything between "&lt;head&gt;" and the lambda arrow below was lost). The expression was
     * reconstructed so that it has the four groups the rest of the class relies on; confirm it
     * against the original project before depending on it.
     */
    private static Pattern pattern =
            Pattern.compile("<head>([\\s\\S]*?)<meta([\\s\\S]*?)charset\\s*=(\")?(.*?)\"");

    static {
        try {
            // Host name verification is switched off: every host is accepted.
            // NOTE(review): the declaration of hv was also lost in the damaged line and is
            // reconstructed from its use in the builder call below.
            javax.net.ssl.HostnameVerifier hv = (urlHostName, session) -> true;

            // WARNING: the trust manager installed here accepts any certificate, so HTTPS
            // connections made through this class are open to man-in-the-middle attacks.
            javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
            javax.net.ssl.TrustManager tm = new miTM();
            trustAllCerts[0] = tm;
            javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext
                    .getInstance("SSL");
            sc.init(null, trustAllCerts, null);
            // JVM-wide side effect: HttpsURLConnection also uses the trust-all socket factory
            // from here on, not only the HttpClient built below.
            javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
            // 5 second connect timeout and 5 second socket (read) timeout.
            RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(5000).build();
            httpClient = HttpClientBuilder.create().setDefaultRequestConfig(config).setSSLContext(sc).setSSLHostnameVerifier(hv).build();
        } catch (Exception e) {
            // If setup fails, httpClient stays null and every later request will fail.
            e.printStackTrace();
        }
    }

    /**
     * Trust manager whose checks are all no-ops: neither check method ever throws, so every
     * client and server certificate chain is accepted.
     *
     * WARNING: using this disables certificate validation entirely.
     */
    static class miTM implements javax.net.ssl.X509TrustManager {

        /** Returns an empty array: no issuer is advertised as trusted. */
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[] {};
        }

        /** Accepts any client certificate chain without inspecting it. */
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) {
            // intentionally empty
        }

        /** Accepts any server certificate chain without inspecting it. */
        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) {
            // intentionally empty
        }
    }


    /**
     * Sends a GET request with a fixed set of browser-like headers and returns the body as text.
     *
     * <p>The body is decoded with the charset declared in the page's {@code <meta>} tag when one
     * is found and this JVM supports it; otherwise the class default {@code CHARSET} is used.
     * The detected charset is kept local to this call and never written back to the shared
     * default, so it cannot affect later requests.
     *
     * @param url absolute URL to fetch (must include the scheme)
     * @return the decoded page content, or an empty string when the status is not
     *         {@code STATUS_CODE}, the response has no body, or the request fails
     */
    public static String doGetWithHeaders(String url) {
        String responseTex = "";
        CloseableHttpResponse response = null;
        try {
            // Build the URI and the GET request.
            URI uri = new URIBuilder(url).build();
            HttpGet httpGet = new HttpGet(uri);

            // Default request headers.
            httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
            // "br" is deliberately not advertised: HttpClient 4.5.x only ships gzip/deflate
            // decoders, so a Brotli-compressed reply would come back as unreadable bytes.
            httpGet.setHeader("Accept-Encoding", "gzip, deflate");
            httpGet.setHeader("Accept-Language", "en-US,en;q=0.9,zh;q=0.8,zh-CN;q=0.7");
            httpGet.setHeader("Connection", "keep-alive");
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36");

            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == STATUS_CODE) {
                HttpEntity entity = response.getEntity();
                // Some successful responses carry no entity at all.
                if (entity != null) {
                    byte[] bytes = EntityUtils.toByteArray(entity);
                    responseTex = new String(bytes, detectCharset(bytes));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            close(httpClient, response);
        }
        return responseTex;
    }

    /**
     * Picks the charset name to decode a page with.
     *
     * @param bytes raw response body
     * @return the charset named in the page's meta tag when it is non-empty and supported by
     *         this JVM, otherwise the class default {@code CHARSET}
     */
    private static String detectCharset(byte[] bytes) {
        // ISO-8859-1 maps each byte to exactly one char, so the ASCII meta tag can be found
        // regardless of the page's real encoding and of the platform default charset.
        String probe = new String(bytes, java.nio.charset.StandardCharsets.ISO_8859_1)
                .toLowerCase(java.util.Locale.ROOT);
        Matcher matcher = pattern.matcher(probe);
        if (matcher.find()) {
            String declared = matcher.group(4);
            if (declared != null) {
                declared = declared.trim();
                try {
                    if (!declared.isEmpty() && java.nio.charset.Charset.isSupported(declared)) {
                        return declared;
                    }
                } catch (IllegalArgumentException ignored) {
                    // The page declared a syntactically illegal charset name; use the default.
                }
            }
        }
        return CHARSET;
    }

    /**
     * Sends a GET request with the given query parameters appended to the URL.
     *
     * @param url    absolute URL to fetch (must include the scheme)
     * @param params query parameters to append; may be {@code null} or empty
     * @return the response body decoded with {@code CHARSET}, or an empty string when the status
     *         is not {@code STATUS_CODE}, the response has no body, or the request fails
     */
    public static String doGet(String url, Map<String, String> params) {
        String responseTex = "";
        CloseableHttpResponse response = null;
        try {
            // Build the URI, adding each parameter to the query string.
            URIBuilder builder = new URIBuilder(url);
            if (params != null) {
                for (Map.Entry<String, String> entry : params.entrySet()) {
                    builder.addParameter(entry.getKey(), entry.getValue());
                }
            }
            URI uri = builder.build();

            // Execute the GET request.
            HttpGet httpGet = new HttpGet(uri);
            response = httpClient.execute(httpGet);

            // Only a success status yields content.
            if (response.getStatusLine().getStatusCode() == STATUS_CODE) {
                HttpEntity entity = response.getEntity();
                // Some successful responses carry no entity at all.
                if (entity != null) {
                    responseTex = EntityUtils.toString(entity, CHARSET);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            close(httpClient, response);
        }
        return responseTex;
    }

    /**
     * Sends a plain GET request without any query parameters.
     *
     * <p>Convenience overload that forwards to the two-argument {@code doGet} with a
     * {@code null} parameter map.
     *
     * @param url absolute URL to fetch (must include the scheme)
     * @return whatever the two-argument {@code doGet} returns for this URL
     */
    public static String doGet(String url) {
        // null means "no query parameters" for the two-argument overload.
        return doGet(url, null);
    }

    /**
     * Sends a GET request carrying both query parameters and custom request headers.
     *
     * @param url    absolute URL to fetch (must include the scheme)
     * @param params query parameters to append; may be {@code null} or empty
     * @param header request headers to set, name to value; may be {@code null} or empty
     * @return the response body decoded with {@code CHARSET}, or an empty string when the status
     *         is not {@code STATUS_CODE}, the response has no body, or the request fails
     */
    public static String doGet(String url, Map<String, String> params, Map<String, String> header) {
        String responseTex = "";
        CloseableHttpResponse response = null;
        try {
            // Build the URI, adding each parameter to the query string.
            URIBuilder builder = new URIBuilder(url);
            if (params != null) {
                for (Map.Entry<String, String> entry : params.entrySet()) {
                    builder.addParameter(entry.getKey(), entry.getValue());
                }
            }
            HttpGet httpGet = new HttpGet(builder.build());

            // Apply the caller's headers; setHeader replaces any header of the same name.
            if (header != null) {
                for (Map.Entry<String, String> entry : header.entrySet()) {
                    httpGet.setHeader(entry.getKey(), entry.getValue());
                }
            }

            response = httpClient.execute(httpGet);
            // Only a success status yields content, matching the other GET helpers.
            if (response.getStatusLine().getStatusCode() == STATUS_CODE) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    responseTex = EntityUtils.toString(entity, CHARSET);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            close(httpClient, response);
        }
        return responseTex;
    }


    /**
     * Sends a POST request whose body is the given parameters encoded as an HTML form.
     *
     * <p>The form body is encoded with {@code CHARSET}, the same charset used to decode the
     * response. Without an explicit charset {@code UrlEncodedFormEntity} falls back to
     * ISO-8859-1, which corrupts non-Latin values. Unlike the GET helpers, the body is returned
     * whatever the status code is.
     *
     * @param url   absolute URL to post to (must include the scheme)
     * @param param form fields, name to value; {@code null} sends a POST without a body
     * @return the response body decoded with {@code CHARSET}, or an empty string when the
     *         response has no body or the request fails
     */
    public static String doPost(String url, Map<String, String> param) {
        String result = "";
        CloseableHttpResponse response = null;
        try {
            HttpPost httpPost = new HttpPost(url);

            // Turn the map into a form entity.
            if (param != null) {
                List<BasicNameValuePair> paramList = new ArrayList<>(param.size());
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    paramList.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
                httpPost.setEntity(new UrlEncodedFormEntity(paramList, CHARSET));
            }

            response = httpClient.execute(httpPost);

            HttpEntity entity = response.getEntity();
            // Some responses carry no entity at all.
            if (entity != null) {
                result = EntityUtils.toString(entity, CHARSET);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            close(httpClient, response);
        }
        return result;
    }

    /**
     * Sends a POST request without any form parameters.
     *
     * <p>Convenience overload that forwards to the two-argument {@code doPost} with a
     * {@code null} parameter map.
     *
     * @param url absolute URL to post to (must include the scheme)
     * @return whatever the two-argument {@code doPost} returns for this URL
     */
    public static String doPost(String url) {
        // null means "no form body" for the two-argument overload.
        return doPost(url, null);
    }

    /**
     * Sends a POST request whose body is the given JSON text, with content type
     * {@code application/json}.
     *
     * @param url  absolute URL to post to (must include the scheme)
     * @param json request body, already serialized as JSON
     * @return the response body decoded as UTF-8, or an empty string if anything fails
     */
    public static String doPostJson(String url, String json) {
        String body = "";
        CloseableHttpResponse httpResponse = null;
        try {
            // Build the request and attach the JSON payload.
            HttpPost request = new HttpPost(url);
            request.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));

            // Execute and read the whole response as text.
            httpResponse = httpClient.execute(request);
            body = EntityUtils.toString(httpResponse.getEntity(), "utf-8");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            close(httpClient, httpResponse);
        }
        return body;
    }

    /**
     * Releases the resources held by a finished request.
     *
     * <p>Only the response is closed. The client is deliberately left open for now, so the
     * {@code httpClient} argument is currently unused.
     *
     * @param httpClient   client that issued the request; not closed here
     * @param httpResponse response to close; may be {@code null} when the request never completed
     */
    private static void close(CloseableHttpClient httpClient, CloseableHttpResponse httpResponse) {
        if (httpResponse == null) {
            return;
        }
        try {
            httpResponse.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


}

注意:该工具类的url地址要带http协议或https协议,否则会报错

4.JSoup的使用

4.1 使用dom方法来查找元素
/**
 * Demonstrates jsoup's DOM-style lookup methods (by tag, by id, by class) on a page fetched
 * with {@code HttpClientUtils}.
 */
public class Test4 {
    public static void main(String[] args) {
        String uri = "https://www.yiibai.com/jsoup/jsoup-quick-start.html";
        String html = HttpClientUtils.doGet(uri);
        // Parse the downloaded HTML with jsoup.
        Document document = Jsoup.parse(html);

        // getElementsByTag("a") returns every <a> element.
        Elements aTag = document.getElementsByTag("a");
        for (Element element : aTag) {
            // text() gives the element's text content.
            String text = element.text();
            // html() gives the element's inner HTML.
            String html1 = element.html();
            // attr(key) gives the value of the attribute named key.
            String href = element.attr("href");
        }

        // getElementById("xx") returns the element whose id is xx, or null when the page has
        // no such element (for example when the download failed and html is empty).
        Element id = document.getElementById("qq-group");
        if (id != null) {
            String text = id.text();
            // attributes() exposes all attributes of the element.
            Attributes attributes = id.attributes();
            // Print every attribute name and value.
            for (Attribute attribute : attributes) {
                String key = attribute.getKey();
                String value = attribute.getValue();
                System.out.println("key="+key+"--->value="+value);
            }
        }

        // getElementsByClass("yy") returns every element whose class is yy.
        Elements aClass = document.getElementsByClass("article-content");

    }
}
4.2 使用选择器语法来查找元素
/**
 * Demonstrates jsoup's CSS-selector lookups ({@code select}) on a page fetched with
 * {@code HttpClientUtils}.
 */
public class Test5 {
    public static void main(String[] args) {
        String uri = "https://www.yiibai.com/jsoup/jsoup-quick-start.html";
        String html = HttpClientUtils.doGet(uri);
        // Parse the downloaded HTML with jsoup.
        Document document = Jsoup.parse(html);
        // select("tagname"): find elements by tag name.
        Elements aTag = document.select("a");
        // select("#id"): find elements by id.
        Elements id = document.select("#qq-group");
        // select(".class"): find elements by class name.
        Elements class1 = document.select(".article-content");
        // select("[attribute]"): find elements that have the attribute.
        Elements href = document.select("[href]");

        // select(":contains(text)"): find elements containing the text (case-insensitive).
        Elements contains = document.select(":contains(JSoup安装)");
        for (Element element : contains) {
            // Print the link of every matching element that has one.
            String href1 = element.attr("href");
            // Compare content, not references: != on strings only tests object identity.
            if (!href1.isEmpty()) {
                System.out.println(href1);
            }
        }

        // select(":matches(regex)"): find elements whose text matches the regular expression
        // ("regex" here is a placeholder pattern).
        Elements select = document.select(":matches(regex)");
    }
}

你可能感兴趣的:(Java高级)