使用jsoup写java爬虫,爬取全国地区及编码数据

我也是参考了这篇博客http://www.cnblogs.com/sanmubird/p/7857474.html写的程序,是可以实现的。只需要修改一下MyCrawler代码即可。

package com.lenovo.crawl.main;

import com.lenovo.crawl.entity.Region;
import com.lenovo.crawl.link.LinkFilter;
import com.lenovo.crawl.link.Links;
import com.lenovo.crawl.page.Page;
import com.lenovo.crawl.page.PageParserTool;
import com.lenovo.crawl.page.RequestAndResponseTool;
import com.lenovo.crawl.util.DBCPUtils;
import com.lenovo.crawl.util.FileTool;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanHandler;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler for the 2018 administrative-division code pages on stats.gov.cn.
 *
 * <p>{@link #crawling(String[])} walks the pages and appends code/name lines to a text
 * file; {@code readFileByLines} (further down) reads such a file back and issues SQL
 * against a {@code region} table.
 *
 * <p>NOTE(review): this listing appears to have been damaged when it was extracted from a
 * web page: generic type arguments and some string-literal contents are missing, and the
 * tail of the class is collapsed onto a few very long lines with parts of
 * {@code readFileByLines} absent. It will not compile as shown; recover the original
 * source before relying on it.
 */
public class Crawler {
    // Matches href values that start with at least one digit.
    static Pattern pattern = Pattern.compile("^[0-9]+.*");
    // Matches text made up solely of digits (the empty string also matches).
    static Pattern pattern_ = Pattern.compile("^[0-9]*");
    /**
     * Seeds the unvisited-URL queue with the given start URLs.
     *
     * @param seeds seed URLs; each one is passed to {@code Links.addUnvisitedUrlQueue}
     */
    private void initCrawlerWithSeeds(String[] seeds) {
        for (int i = 0; i < seeds.length; i++){
            Links.addUnvisitedUrlQueue(seeds[i]);
        }
    }
    /**
     * Crawl loop: enqueues the seeds, then repeatedly takes a URL from the unvisited queue,
     * fetches the page, and appends the code/name pairs found in its table rows to
     * {@code D:\ja\tempIn\result.txt}. Rows are distinguished by their {@code class}
     * attribute (provincetr, citytr, countytr, towntr, villagetr) and written with
     * 0, 4, 8, 12 and 16 leading spaces respectively.
     *
     * <p>The collapsed tail of this method appears to call {@code crawling} recursively for
     * links whose text is not purely numeric, and to catch every exception and report it
     * only as a fixed message on standard output.
     *
     * @param seeds seed URLs to start from
     */
    public void crawling(String[] seeds){
        // NOTE(review): this local instance is never used in the visible code.
        Crawler crawler=new Crawler();
        // Output directory.
        File file=new File("D:\\ja\\tempIn");
        // Create the directory (and any missing parents) if it does not exist yet.
        if(!file.exists()){
            file.mkdirs();
        }
        File file1=new File(file,"result.txt");
        // NOTE(review): when result.txt is missing this calls mkdirs() on the directory
        // again rather than creating the file; the append-mode FileOutputStream below
        // creates the file when it is opened.
        if(!file1.exists()){
            file.mkdirs();
        }
        // Output stream for result.txt, opened in append mode inside the try block.
        FileOutputStream fos=null;
        try{
            fos=new FileOutputStream(file1,true);
        // Initialise the URL queue with the seeds.
        initCrawlerWithSeeds(seeds);

        // Link filter: accepts only URLs under the 2018 division-code directory on
        // stats.gov.cn.
        // NOTE(review): the filter is not referenced anywhere in the visible code.
        LinkFilter filter = new LinkFilter() {
            public boolean accept(String url) {
                if (url.startsWith("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018"))
                    return true;
                else
                    return false;
            }
        };

        // Loop while the unvisited queue is non-empty (no page-count limit is enforced here).
        while (!Links.unVisitedUrlQueueIsEmpty()) {
            // Take the head of the unvisited queue.
            String visitUrl = (String) Links.removeHeadOfUnVisitedUrlQueue();
            if (visitUrl == null){
                continue;
            }
            // Fetch the page for this URL.
            Page page = RequestAndResponseTool.sendRequstAndGetResponse(visitUrl);
            // Select the anchors and the table rows from the page's DOM.
            Elements es = PageParserTool.select(page,"a");
            Elements esTr = PageParserTool.select(page,"tr");
            if(!esTr.isEmpty()){
                for (Element e:esTr)
                {
                   // Village rows: only handled when the row has exactly three child
                   // nodes; child 0 is written as the code and child 2 as the name.
                   // NOTE(review): replaceAll("","") is a no-op as written; the patterns
                   // were presumably lost in extraction. The raw List declarations in
                   // this method likewise look like they lost a type argument.
                   if(e.attributes().get("class").equals("villagetr")){
                       List nodes=e.childNodes();
                       if(nodes!=null&&nodes.size()==3){
                           System.out.println(nodes.get(0).outerHtml().replaceAll("","").replaceAll("",""));
                           System.out.println(nodes.get(2).outerHtml().replaceAll("","").replaceAll("",""));
                           String code="                "+(nodes.get(0).outerHtml().replaceAll("","").replaceAll("","")+" ");
                           String name="                "+(nodes.get(2).outerHtml().replaceAll("","").replaceAll("","")+" ");
                           fos.write(code.getBytes());
                           fos.write("\r\n".getBytes());
                           fos.write(name.getBytes());
                           fos.write("\r\n".getBytes());
                       }
                   }
                   // Province rows: every child node contributes one province. The code
                   // is built from the first two characters of the link's href followed
                   // by ten zeros; the name is the link's first child node.
                   if(e.attributes().get("class").equals("provincetr")){
                       List nodes=e.childNodes();
                       if(nodes!=null&&nodes.size()>0){
                           for (Node node:nodes) {
                              Node value=node.childNodes().get(0);
                              String name=value.childNodes().get(0).outerHtml();
                              Attributes attributes=value.attributes();
                              String a=attributes.get("href");
                              a=a.substring(0,2);
                              a=a+"0000000000";
                               System.out.println(name);
                               System.out.println(a);
                               fos.write(a.getBytes());
                               fos.write("\r\n".getBytes());
                               fos.write(name.getBytes());
                               fos.write("\r\n".getBytes());
                           }
                       }
                   }
                   // City rows: code from child 0, name from child 1, each taken from the
                   // first grandchild's first child; written with 4 leading spaces.
                   if(e.attributes().get("class").equals("citytr")){
                       List nodes=e.childNodes();
                       if(nodes!=null&&nodes.size()>0){
                              Node codeNode=nodes.get(0).childNodes().get(0);
                              String code=codeNode.childNodes().get(0).outerHtml();
                              Node nameCode=nodes.get(1).childNodes().get(0);
                              String name=nameCode.childNodes().get(0).outerHtml();
                              code="    "+code;
                              name="    "+name;
                           System.out.println(name);
                           System.out.println(code);
                           fos.write(code.getBytes());
                           fos.write("\r\n".getBytes());
                           fos.write(name.getBytes());
                           fos.write("\r\n".getBytes());
                       }
                   }
                   // County rows: same extraction as city rows, 8 leading spaces.
                   if(e.attributes().get("class").equals("countytr")){
                       List nodes=e.childNodes();
                       if(nodes!=null&&nodes.size()>0){
                              Node codeNode=nodes.get(0).childNodes().get(0);
                              String code=codeNode.childNodes().get(0).outerHtml();
                              Node nameCode=nodes.get(1).childNodes().get(0);
                              String name=nameCode.childNodes().get(0).outerHtml();
                           code="        "+code;
                           name="        "+name;
                           System.out.println(name);
                           System.out.println(code);
                           fos.write(code.getBytes());
                           fos.write("\r\n".getBytes());
                           fos.write(name.getBytes());
                           fos.write("\r\n".getBytes());
                       }
                   }
                   // Town rows: same extraction as city rows, 12 leading spaces.
                   if(e.attributes().get("class").equals("towntr")){
//                       Region region=new Region();
                       List nodes=e.childNodes();
                       if(nodes!=null&&nodes.size()>0){
                              Node codeNode=nodes.get(0).childNodes().get(0);
                              String code=codeNode.childNodes().get(0).outerHtml();
                              Node nameCode=nodes.get(1).childNodes().get(0);
                              String name=nameCode.childNodes().get(0).outerHtml();
                           code="            "+code;
                           name="            "+name;
                           System.out.println(name);
                           System.out.println(code);
                           fos.write(code.getBytes());
                           fos.write("\r\n".getBytes());
                           fos.write(name.getBytes());
                           fos.write("\r\n".getBytes());
                       }
                   }
                }
            }
            // Anchors whose href starts with a digit are treated as links to follow.
            // NOTE(review): everything from the println below to the end of the class is
            // reproduced verbatim from the damaged listing (string literals broken across
            // lines, statements collapsed onto single lines behind "//" markers, and part
            // of readFileByLines missing). It is left untouched on purpose.
            if(!es.isEmpty()){
                for (Element e:es
                     ) {
                    String key=e.attributes().get("href");
                    Matcher m = pattern.matcher(key);
                    if(m.matches()){
                        System.out.println(e.html().replaceAll("
", "")); fos.write((e.html().replaceAll("
", "")+" ").getBytes()); fos.write("\r\n".getBytes()); if(!pattern_.matcher(e.html()).matches()) { crawling(new String[]{visitUrl.substring(0, visitUrl.lastIndexOf("/") + 1) + key}); } } } } //将保存文件 FileTool.saveToLocal(page); //将已经访问过的链接放入已访问的链接中; Links.addVisitedUrlSet(visitUrl); //得到超链接 Set links = PageParserTool.getLinks(page,"img"); for (String link : links) { Links.addUnvisitedUrlQueue(link); } } }catch(Exception e){ System.out.println("--------IO异常----------"); }finally { try{ fos.close(); }catch(Exception e){ System.out.println("------被要关闭的文件不存在-------"); } } } public static void readFileByLines(String fileName) { File file = new File(fileName); BufferedReader reader = null; try { QueryRunner qr = new QueryRunner(DBCPUtils.getDataSource()); Connection connection=DBCPUtils.getDataSource().getConnection(); String sqlInsert = "INSERT INTO region VALUES (?,?,?,?,?,?)"; reader = new BufferedReader(new FileReader(file)); List codeList=new ArrayList(); List nameList=new ArrayList(); String tempString = null; int line = 1; // 一次读入一行,直到读入null为文件结束 while ((tempString = reader.readLine()) != null) { if(line%2==0){ nameList.add(tempString); }else{ codeList.add(tempString); } line++; } for(int i=0;i list=qr.query(sql,new BeanListHandler(Region.class)); // Object [][] params=new Object[344][]; // int k=0; for (int i=0;i(Region.class),param); String province=region1.getProvince(); region.setProvince(province); Object [] paramUpdate={region.getProvince(),region1.getCode()," ",region.getCode()}; String updateSql="update region set province=?,provinceCode=?,cityCode=? 
where code=?"; qr.update(updateSql,paramUpdate); } if(level.equals("3")){ String parentCode=region.getParentCode(); String sql2="select * from region where code=?"; Object [] param={parentCode}; Region region1=qr.query(sql2,new BeanHandler(Region.class),param); String province=region1.getProvince(); String city=region1.getCity(); region.setProvince(province); region.setCity(city); Object [] paramUpdate={region.getProvince(),region.getCity(),region1.getProvinceCode(),region1.getCode(),region.getCode()}; String updateSql="update region set province=?,city=?,provinceCode=?,cityCode=? where code=?"; qr.update(updateSql,paramUpdate); } } connection.close(); } //main 方法入口 public static void main(String[] args) throws Exception{ Crawler crawler=new Crawler(); crawler.crawling(new String[]{"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"}); } }

Region 实体类如下:

package com.lenovo.crawl.entity;

public class Region {
    private String code;
    private String provinceCode;
    private String province;
    private String cityCode;
    private String city;
    private String countyCode;
    private String county;
    private String parentCode;
    private String level;

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getProvinceCode() {
        return provinceCode;
    }

    public void setProvinceCode(String provinceCode) {
        this.provinceCode = provinceCode;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public String getCityCode() {
        return cityCode;
    }

    public void setCityCode(String cityCode) {
        this.cityCode = cityCode;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getCountyCode() {
        return countyCode;
    }

    public void setCountyCode(String countyCode) {
        this.countyCode = countyCode;
    }

    public String getCounty() {
        return county;
    }

    public void setCounty(String county) {
        this.county = county;
    }

    public String getParentCode() {
        return parentCode;
    }

    public void setParentCode(String parentCode) {
        this.parentCode = parentCode;
    }

    public String getLevel() {
        return level;
    }

    public void setLevel(String level) {
        this.level = level;
    }

    public Region() {
    }

    public Region(String code, String provinceCode, String province, String cityCode, String city, String countyCode, String county, String parentCode, String level) {
        this.code = code;
        this.provinceCode = provinceCode;
        this.province = province;
        this.cityCode = cityCode;
        this.city = city;
        this.countyCode = countyCode;
        this.county = county;
        this.parentCode = parentCode;
        this.level = level;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        Region region = (Region) o;

        if (code != null ? !code.equals(region.code) : region.code != null) return false;
        if (provinceCode != null ? !provinceCode.equals(region.provinceCode) : region.provinceCode != null)
            return false;
        if (province != null ? !province.equals(region.province) : region.province != null) return false;
        if (cityCode != null ? !cityCode.equals(region.cityCode) : region.cityCode != null) return false;
        if (city != null ? !city.equals(region.city) : region.city != null) return false;
        if (countyCode != null ? !countyCode.equals(region.countyCode) : region.countyCode != null) return false;
        if (county != null ? !county.equals(region.county) : region.county != null) return false;
        if (parentCode != null ? !parentCode.equals(region.parentCode) : region.parentCode != null) return false;
        return level != null ? level.equals(region.level) : region.level == null;
    }

    @Override
    public int hashCode() {
        int result = code != null ? code.hashCode() : 0;
        result = 31 * result + (provinceCode != null ? provinceCode.hashCode() : 0);
        result = 31 * result + (province != null ? province.hashCode() : 0);
        result = 31 * result + (cityCode != null ? cityCode.hashCode() : 0);
        result = 31 * result + (city != null ? city.hashCode() : 0);
        result = 31 * result + (countyCode != null ? countyCode.hashCode() : 0);
        result = 31 * result + (county != null ? county.hashCode() : 0);
        result = 31 * result + (parentCode != null ? parentCode.hashCode() : 0);
        result = 31 * result + (level != null ? level.hashCode() : 0);
        return result;
    }
}

我是先把数据爬取完,写到了txt文件里,然后去解析txt文件,把数据整理成了结构化的(为了实现三级联动),各自加了一个level的属性,然后写到了数据库里。(上面的代码,如果不想这样输出,可以写个递归,按层级输出。)

我的txt文件下载链接在这:https://download.csdn.net/download/qq_29281307/11191484

你可能感兴趣的:(使用jsoup写java爬虫,爬取全国地区及编码数据)