Java爬取行政区域信息到数据库

maven依赖:


            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.10.2</version>
            </dependency>

需要的工具类:

为什么使用工具类: 如果不调用工具类中的这个方法, 请求链接时会报如下错误:

javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: PKIX path building failed: sun.security.provider.certpath.SunCertPathBuilderException: unable to find valid certification path to requested target

如图:
在这里插入图片描述

package com.ghx.demo.util;

import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
 
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
 
/**
 * Helper that switches off HTTPS certificate and host-name verification for
 * every {@link HttpsURLConnection} opened afterwards in this JVM.
 *
 * <p><b>Security warning:</b> {@link #ignoreSsl()} replaces the JVM-wide default
 * socket factory and host-name verifier with ones that accept anything, so all
 * later HTTPS connections made through {@code HttpsURLConnection} are open to
 * man-in-the-middle attacks. Use it only for throw-away tooling, never in
 * production code.
 */
public class SslUtils {

    /**
     * Installs a default SSL socket factory whose only trust manager is
     * {@link miTM}, i.e. one that accepts every certificate chain.
     *
     * @throws Exception if the SSL context cannot be created or initialised
     */
    private static void trustAllHttpsCertificates() throws Exception {
        TrustManager[] trustAllCerts = {new miTM()};
        // "TLS" is the current standard context name; the original asked for the legacy "SSL" name.
        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /**
     * Trust manager that performs no validation at all: both check methods
     * return normally for any input.
     */
    static class miTM implements TrustManager, X509TrustManager {

        /**
         * @return an empty array; the {@code X509TrustManager} contract requires
         *         a non-null result, so {@code null} must not be returned here
         */
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }

        /** Not part of any interface; kept for callers that may still use it. Always {@code true}. */
        public boolean isServerTrusted(X509Certificate[] certs) {
            return true;
        }

        /** Not part of any interface; kept for callers that may still use it. Always {@code true}. */
        public boolean isClientTrusted(X509Certificate[] certs) {
            return true;
        }

        /** Accepts every server certificate chain without inspecting it. */
        @Override
        public void checkServerTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: returning normally means "trusted".
        }

        /** Accepts every client certificate chain without inspecting it. */
        @Override
        public void checkClientTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: returning normally means "trusted".
        }
    }

    /**
     * Makes subsequent HTTPS requests ignore certificate and host-name problems.
     * Must be called before {@code openConnection}.
     *
     * @throws Exception if the trust-all SSL context cannot be installed
     */
    public static void ignoreSsl() throws Exception {
        HostnameVerifier hv = new HostnameVerifier() {
            @Override
            public boolean verify(String urlHostName, SSLSession session) {
                System.out.println("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost());
                return true;
            }
        };
        trustAllHttpsCertificates();
        HttpsURLConnection.setDefaultHostnameVerifier(hv);
    }

}


Demo:

package com.ghx.demo.test;

import com.ghx.demo.util.SslUtils;
import com.google.common.collect.Lists;
import org.apache.commons.collections4.CollectionUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Scrapes the nationwide administrative-division table from a web page and
 * writes one {@code insert into region_code ...} statement per valid row to a
 * SQL file.
 *
 * @author GeHengXin
 */
public class DemoTest2 {

    /**
     * Page to scrape. Administrative division codes are published by the
     * Ministry of Civil Affairs under https://www.mca.gov.cn/article/sj/xzqh/1980/ ;
     * look up the newest list there and replace this link.
     */
    private static final String SOURCE_URL = "https://www.mca.gov.cn/article/sj/xzqh/2022/202201xzqh.html";

    /** Path of the generated SQL file. */
    private static final String OUTPUT_PATH = "C:\\Users\\admin\\Desktop\\hhh.sql";

    /** A region code is exactly six digits and does not start with 0. Compiled once, not per row. */
    private static final Pattern REGION_CODE_PATTERN = Pattern.compile("^[1-9]\\d{5}$");

    /**
     * Downloads the page, converts every table row that carries a valid region
     * code into an insert statement, prints how many rows passed / failed the
     * code check, and writes the statements to {@link #OUTPUT_PATH}.
     *
     * @throws Exception if the page cannot be fetched or the file cannot be written
     */
    @Test
    public void generateInsertSql() throws Exception {
        try {
            // Avoids javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException
            SslUtils.ignoreSsl();
        } catch (Exception e1) {
            // Best effort: continue anyway, but report the cause instead of hiding it.
            System.out.println("utils: " + e1);
        }

        Document doc = Jsoup.connect(SOURCE_URL)
                // Use your own User-Agent: open any page in a browser, press F12 and copy the value here.
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57")
                .header("Accept", "*/*")
                // NOTE(review): per the jsoup docs 0 means "no body size limit" and the timeout is in milliseconds.
                .maxBodySize(0)
                .timeout(100000)
                .get();

        Elements trs = doc.select("tr");
        List<String> list = Lists.newArrayList();
        List<String> ignoreList = Lists.newArrayList();

        for (Element tr : trs) {
            Elements tds = tr.select("td");
            if (tds.size() <= 3) {
                // Rows with three cells or fewer are skipped, as before.
                continue;
            }
            String regionCode = tds.get(1).text();
            String regionArea = tds.get(2).text();
            if (REGION_CODE_PATTERN.matcher(regionCode).matches()) {
                list.add(buildInsertSql(regionCode, regionArea));
            } else {
                ignoreList.add(regionCode);
            }
        }

        System.out.println("正则通过的总数量为:" + list.size());
        System.out.println("正则未通过的总数量为:" + ignoreList.size());
        System.out.println(ignoreList);
        // Write the statements to the file one after another.
        this.writeFileByLine(OUTPUT_PATH, list);
    }

    /**
     * Builds the insert statement (terminated by the platform line separator)
     * for one region.
     *
     * <p>Level and parent are derived purely from the code's trailing zeros:
     * {@code xx0000} is level 1 with parent {@code 000000}; {@code xxxx00} is
     * level 2 with parent {@code xx0000}; anything else is level 3 with parent
     * {@code xxxx00}.
     *
     * @param regionCode six-digit region code, already validated by the caller
     * @param regionArea region name as scraped from the page
     * @return the complete insert statement including the trailing line separator
     */
    private static String buildInsertSql(String regionCode, String regionArea) {
        int level;
        String parentCode;
        if (regionCode.endsWith("0000")) {
            level = 1;
            parentCode = "000000";
        } else if (regionCode.endsWith("00")) {
            level = 2;
            parentCode = regionCode.substring(0, 2) + "0000";
        } else {
            level = 3;
            // NOTE(review): the derived parent code is not checked against the codes
            // actually present on the page - verify it always exists.
            parentCode = regionCode.substring(0, 4) + "00";
        }

        // The name comes from a web page and lands inside a quoted SQL literal:
        // double any single quote so it cannot terminate the literal.
        String escapedName = regionArea.replace("'", "''");

        // NOTE(review): dtime is fixed to '201903' although the source URL is a 2022 list - confirm.
        return String.format("insert into region_code (code, name, level, parent_code, dtime, note, ctime)"
                + " values (%s, '%s', %s, %s, '201903', '系统生成', NOW());" + System.lineSeparator(),
                regionCode, escapedName, level, parentCode);
    }

    /**
     * Writes the given rows to a file one after another, encoded as UTF-8 so
     * that Chinese characters are not garbled. Nothing is written (and no file
     * is created) when {@code rowList} is null or empty.
     *
     * <p>The streams are managed by try-with-resources, so they are closed in
     * reverse order of opening even when a write fails.
     *
     * @param filePath path of the file to create or overwrite
     * @param rowList  rows to write; each row is written as-is, without an added separator
     * @throws IOException if the file cannot be opened or written
     */
    public void writeFileByLine(String filePath, List<String> rowList) throws IOException {
        if (CollectionUtils.isEmpty(rowList)) {
            return;
        }
        try (FileOutputStream fos = new FileOutputStream(new File(filePath));
             OutputStreamWriter osw = new OutputStreamWriter(fos, StandardCharsets.UTF_8);
             BufferedWriter bw = new BufferedWriter(osw)) {
            for (String row : rowList) {
                bw.write(row);
            }
        }
    }

}

运行控制台打印:

Java爬取行政区域信息到数据库_第1张图片

国家民政部公布的数据(部分截图):

Java爬取行政区域信息到数据库_第2张图片
一共是:3213 条(我是用笨方法,从网页复制到Excel中的,没有设置表头,从第一行就是数据,其实如果单纯的只是想获取sql,也可以从Excel转化为sql, 不过咱们主题是讨论如何用Java爬虫工具,爬取数据)
Java爬取行政区域信息到数据库_第3张图片

导出的文件展示(只截取了文件的末尾):

Java爬取行政区域信息到数据库_第4张图片

对比发现,导出的数据比官网的数据少了几条,不过还能接受,至少满足了我们项目的需要。至于少的这几条具体是什么原因,我没有去分析。开始以为可能是被正则校验过滤掉的那部分数据,不过打印出来看了看,正则没通过的是 11 条,其中 10 条都是空字符串,好像也对不上。有兴趣的可以自己比对分析数据,这里就不展开了。如图:
Java爬取行政区域信息到数据库_第5张图片
参考: https://www.cnblogs.com/fengpingfan/p/10875230.html

你可能感兴趣的:(java,数据库,ssl,爬虫)