爬虫系列-1-爬取第五级地址

哈喽,大家好!好久不见。

目录

一、背景介绍
二、拼凑URL
三、解析页面并存入数据库



/**
     * 测试编程式事务
     */
    @Test
    @Rollback(false)
    public void testMannulCommitTransaction() {
        for(int i=1;i<=100000;i++) {
            RegionLevelFiveException rfe = new RegionLevelFiveException();
            rfe.setMark("11");
            rfe.setContent("内容");
            DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
            TransactionStatus status = null;
            try{
                status = transactionManager.getTransaction(definition);
                jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(rfe);
                transactionManager.commit(status);
            }catch(Exception ex) {

            }
        }
    }**加粗样式**
 

一、背景介绍:

1、数据库里已有省、市、区(县)、街道(镇)四级地址,还缺最后一级:居委会(村)(region_level_5)
2、用Jsoup工具解析页面,根据库里已有的四级地址,拿到第五级,持久化数据。
3、为让程序通俗易懂,我把冗余的信息也都放出来:注释和打印的内容
4、底层是公司封装的一套JPA框架

二、拼凑URL
获取区域码网站:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

 /**
     * Builds the stats.gov.cn page URL of every level-4 region stored in the database
     * and passes each one to {@code pullRegions} for parsing and persistence.
     *
     * <p>Regions are walked province → city → district → level-4 region. The URL
     * segments are cut out of the numeric codes: characters 0-2 of the province code,
     * 2-4 of the city code, 4-6 of the district code and everything after position 6
     * of the level-4 code. This assumes the codes are long enough for those
     * substrings; a shorter code makes {@code substring} throw.
     *
     * <p>When at least one province exists, the total running time is printed in
     * whole hours at the end.
     */
    @Test
    public void testRegionPuller() {
        long start = System.currentTimeMillis();
        List<Region> regionsOfProvince = regionRepository.regionsOfProvince();
        // Plain concatenation tolerates null, so the emptiness guard below is always reached.
        System.out.println("得到的省ID集合是:"+regionsOfProvince);
        if(!CollectionsUtils.isNotEmpty(regionsOfProvince)) {
            return;
        }
        for(Region region : regionsOfProvince) {
            int provinceCode = region.getCode();
            System.out.println("省ID:"+provinceCode);
            String urlOfProvince = Integer.toString(provinceCode).substring(0,2);
            System.out.println("省的URL:"+urlOfProvince);
            List<Region> cities = regionRepository.regionsOfCity(provinceCode);
            if(!CollectionsUtils.isNotEmpty(cities)) {
                continue;
            }
            for(Region regionCity : cities) {
                int cityCode = regionCity.getCode();
                String urlOfCity = Integer.toString(cityCode).substring(2,4);
                List<Region> districts = regionRepository.regionsOfDistrict(cityCode);
                if(!CollectionsUtils.isNotEmpty(districts)) {
                    continue;
                }
                for(Region regionDistrict : districts) {
                    int districtCode = regionDistrict.getCode();
                    String urlOfDistrict = Integer.toString(districtCode).substring(4,6);
                    List<RegionLevelFour> villages = regionLevelFiveRepository.regionsOfVillage(districtCode);
                    if(!CollectionsUtils.isNotEmpty(villages)) {
                        continue;
                    }
                    for(RegionLevelFour village : villages) {
                        int villageCode = village.getCode();
                        String urlOfVillage = Integer.toString(villageCode);
                        System.out.println("镇的编码:"+urlOfVillage);
                        String villageSuffix = urlOfVillage.substring(6);
                        System.out.println("镇截取后的编码:"+villageSuffix);
                        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"+urlOfProvince+"/"+urlOfCity+"/"+urlOfDistrict+"/"+(urlOfProvince+urlOfCity+urlOfDistrict+villageSuffix)+".html";
                        System.out.println("拼凑出的url是....."+url);
                        // Parse the page and persist its rows
                        pullRegions(url,villageCode);
                    }
                    System.out.println("页面抓取出来的level5对象总量是:"+regions.size());
                }
            }
        }

        long end = System.currentTimeMillis();
        // end - start: the original subtracted the other way round and always reported a negative duration.
        long customerTime = (end-start)/(1000*60*60);
        System.out.println("Binggo! 耗时:"+customerTime+"h");
    }

三、解析页面并存入数据库

Gradle方式导入包:
compile group: 'org.jsoup', name: 'jsoup', version: '1.11.3'

 List<RegionLevelFive> regions = new ArrayList<>();// Per the author, the site itself guarantees there are no duplicates, so a Set is unnecessary and would lose ordering


 /**
  * Downloads one page, extracts every {@code tr.villagetr} row of its
  * {@code table.villagetable} and persists each row as a level-5 region.
  *
  * <p>The method is deliberately not {@code @Transactional}: every row is saved in
  * its own programmatic transaction, so one failing row cannot undo or stop the
  * rest of the crawl. A page that cannot be fetched is recorded in the exception
  * log with the mark "页面拿取不到".
  *
  * @param url      address of the page to parse
  * @param parentId code stored as the parent code of every row found on the page
  */
 public void pullRegions(String url,int parentId) {
     int count = 0;
     System.out.println("开始获取.....");
     //1. Fetch the page
     Document html = null;
     try {
         // One request only: the original also issued Jsoup.connect(url).get() here and
         // immediately overwrote its result, downloading every page twice.
         html = getHemlFromUrl(url,false);
         System.out.println("================================打印网页=======================================");
         System.out.println("获取到的网页是:"+html);
     } catch (IOException e) {
         System.out.println("获取页面失败");
         e.printStackTrace();
     }
     //2. Select the table rows with jsoup
     Elements regionTags = null;
     if(html != null) {
         regionTags = html.select("table.villagetable")
                 .select("tbody")
                 .select("tr.villagetr");
     } else {
         // Explicit null check instead of catching NullPointerException as control flow.
         RegionLevelFiveException regionLF = new RegionLevelFiveException();
         regionLF.setContent(url);
         regionLF.setMark("页面拿取不到");
         System.out.println("该页面拿取不到:"+url);
         TransactionStatus status = null;
         try{
             status = transactionManager.getTransaction(new DefaultTransactionDefinition());
             jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(regionLF);
             transactionManager.commit(status);
         }catch(Exception e) {
             rollbackAfterFailure(status, e);
             System.out.println("异常日志写入失败:"+url);
             e.printStackTrace();
         }
     }
     //3. Turn every row into a RegionLevelFive and persist it
     System.out.println("================================打印regionTypes=======================================");
     System.out.println(regionTags);
     if(regionTags == null) {
         return;
     }
     for(Element tr : regionTags) {
         Elements regionsOfHtml =  tr.select("td");
         if(regionsOfHtml.isEmpty()) {
             // first()/last() return null for a row without cells; skip it rather than abort the crawl.
             System.out.println("跳过没有单元格的行:"+url);
             continue;
         }
         String code = regionsOfHtml.first().text();
         String name = regionsOfHtml.last().text();

         RegionLevelFive  region = new RegionLevelFive();
         System.out.println("code  "+code);// e.g. 110101001001: far beyond the int range, so the code is kept as a String
         region.setCode(code);
         region.setName(name);
         region.setLevel(5);
         region.setParentCode(Integer.toString(parentId));
         System.out.println("第"+count+"条记录"+region.toString());
         regions.add(region);
         count++;
         TransactionStatus status = null;
         try{
             status = transactionManager.getTransaction(new DefaultTransactionDefinition());
             regionLevelFiveRepository.addRegionOfVillage(region);
             transactionManager.commit(status);
         }catch(Exception e) {
             rollbackAfterFailure(status, e);
             System.out.println("保存失败:"+region);
             e.printStackTrace();
         }
     }
     System.out.println("页面抓取出来的level5对象总量是:"+regions.size());
 }

 /**
  * Rolls back a transaction that a failed save left open. Does nothing when no
  * transaction was started or it is already completed; a failure of the rollback
  * itself is attached to {@code cause} as a suppressed exception.
  */
 private void rollbackAfterFailure(TransactionStatus status, Exception cause) {
     if(status == null || status.isCompleted()) {
         return;
     }
     try{
         transactionManager.rollback(status);
     }catch(Exception rollbackFailure) {
         cause.addSuppressed(rollbackFailure);
     }
 }

getHemlFromUrl(url,false)代码如下:

/**
 * Fetches a page with jsoup, sending a browser User-Agent header and using an
 * explicit 5 second timeout.
 *
 * <p>A timeout is recorded in the exception log with the mark "获取页面超时" and
 * reported to the caller by returning {@code null}.
 *
 * @param url         address of the page to fetch
 * @param useHtmlUnit when {@code true} nothing is fetched and {@code null} is returned
 * @return the parsed document, or {@code null} on timeout or when {@code useHtmlUnit} is set
 * @throws IOException for any jsoup I/O failure other than a socket timeout
 */
public Document getHemlFromUrl(String url, boolean useHtmlUnit) throws IOException {
    if(useHtmlUnit) {
        return null;
    }
    try{
        return Jsoup.connect(url)
                .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                .timeout(5000)
                .get();
    }catch(SocketTimeoutException e) {
        e.printStackTrace();
        System.out.println("该页面获取超时:"+url);

        RegionLevelFiveException regionLF = new RegionLevelFiveException();
        regionLF.setContent(url);
        regionLF.setMark("获取页面超时");
        TransactionStatus status = null;
        try{
            // The original never called getTransaction and committed a null status,
            // which always failed inside the empty catch.
            status = transactionManager.getTransaction(new DefaultTransactionDefinition());
            jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(regionLF);
            transactionManager.commit(status);
        }catch(Exception e1) {
            if(status != null && !status.isCompleted()) {
                try{
                    transactionManager.rollback(status);
                }catch(Exception rollbackFailure) {
                    e1.addSuppressed(rollbackFailure);
                }
            }
            System.out.println("超时日志写入失败:"+url);
            e1.printStackTrace();
        }
        return null;
    }
}

上面代码注意以下几点:
1、存储数据的时候,没有在方法上加事务,因为for循环中要在每次循环中持久化数据,所以用到了编程式事务;
2、对于异常全部捕获(而不是向上抛出),避免中断。必定有许多页面有问题,这时候不能中断,否则50多万条数据什么时候能跑完?
同时把异常页面持久化到数据库中;
3、实践表明,异常网页7K多条呢,后面统一处理。

整体代码如下:

package com.dzj.regionLevel5;

import com.dzj.bdc.biz.domain.Region;
import com.dzj.bdc.biz.domain.RegionLevelFive;
import com.dzj.bdc.biz.domain.RegionLevelFiveException;
import com.dzj.bdc.biz.domain.RegionLevelFour;
import com.dzj.bdc.biz.domain.repository.RegionLevelFiveRepository;
import com.dzj.bdc.biz.domain.repository.RegionRepository;
import com.dzj.bdc.biz.domain.repository.jpa1.JpaRegionLevelFiveExceptionLogRepository;
import com.dzj.ddc.biz.test.base.BaseRepositoryTest;
import com.dzj.ddc.biz.test.base.BaseTest;
import com.dzj.frw.common.utils.CollectionsUtils;
import com.dzj.frw.common.utils.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.ImportResource;
import org.springframework.test.annotation.Rollback;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.TransactionStatus;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.transaction.support.DefaultTransactionDefinition;

import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawler, run as a Spring JUnit test, that derives the stats.gov.cn page URL of
 * every level-4 region stored in the database, parses the level-5 rows on each
 * page with jsoup and persists them.
 *
 * <p>Persistence uses programmatic transactions (one per saved row) rather than
 * {@code @Transactional} methods, so a failure on one row or page does not stop
 * or undo the rest of the run. Pages that cannot be fetched are written to the
 * exception log for later handling.
 *
 * Author: meice Huang
 * Time: 2019/2/28 9:10 PM
 */
@RunWith(SpringJUnit4ClassRunner.class)
@Configuration
@ContextConfiguration(classes = RegionLevelFivePuller.class)
@ImportResource({ "classpath:basis-data-repositories-beans.xml",
        "classpath:META-INF/bo-frw-fixture.xml" })
public class RegionLevelFivePuller extends BaseTest {


    @Autowired
    private RegionRepository regionRepository;

    @Autowired
    private RegionLevelFiveRepository regionLevelFiveRepository;

    @Autowired
    private JpaRegionLevelFiveExceptionLogRepository jpaRegionLevelFiveExceptionLogRepository;

    @Autowired
    private PlatformTransactionManager transactionManager;

    // Per the author, the site itself guarantees there are no duplicates, so a Set is unnecessary and would lose ordering
    List<RegionLevelFive> regions = new ArrayList<>();


    /**
     * Downloads one page, extracts every {@code tr.villagetr} row of its
     * {@code table.villagetable} and persists each row as a level-5 region.
     *
     * <p>The method is deliberately not {@code @Transactional}: every row is saved
     * in its own programmatic transaction, so one failing row cannot undo or stop
     * the rest of the crawl. A page that cannot be fetched is recorded in the
     * exception log with the mark "页面拿取不到".
     *
     * @param url      address of the page to parse
     * @param parentId code stored as the parent code of every row found on the page
     */
    public void pullRegions(String url,int parentId) {
        int count = 0;
        System.out.println("开始获取.....");
        //1. Fetch the page
        Document html = null;
        try {
            // One request only: the original also issued Jsoup.connect(url).timeout(5000).get()
            // here and immediately overwrote its result, downloading every page twice.
            html = getHemlFromUrl(url,false);
            System.out.println("================================打印网页=======================================");
            System.out.println("获取到的网页是:"+html);
        } catch (IOException e) {
            System.out.println("获取页面失败");
            e.printStackTrace();
        }
        //2. Select the table rows with jsoup
        Elements regionTags = null;
        if(html != null) {
            regionTags = html.select("table.villagetable")
                    .select("tbody")
                    .select("tr.villagetr");
        } else {
            // Explicit null check instead of catching NullPointerException as control flow.
            System.out.println("该页面拿取不到:"+url);
            saveExceptionLog(url, "页面拿取不到");
        }
        //3. Turn every row into a RegionLevelFive and persist it
        System.out.println("================================打印regionTypes=======================================");
        System.out.println(regionTags);
        if(regionTags == null) {
            return;
        }
        for(Element tr : regionTags) {
            Elements regionsOfHtml =  tr.select("td");
            if(regionsOfHtml.isEmpty()) {
                // first()/last() return null for a row without cells; skip it rather than abort the crawl.
                System.out.println("跳过没有单元格的行:"+url);
                continue;
            }
            String code = regionsOfHtml.first().text();
            String name = regionsOfHtml.last().text();

            RegionLevelFive  region = new RegionLevelFive();
            System.out.println("code  "+code);// e.g. 110101001001: far beyond the int range, so the code is kept as a String
            region.setCode(code);
            region.setName(name);
            region.setLevel(5);
            region.setParentCode(Integer.toString(parentId));
            System.out.println("第"+count+"条记录"+region.toString());
            regions.add(region);
            count++;
            TransactionStatus status = null;
            try{
                status = transactionManager.getTransaction(new DefaultTransactionDefinition());
                regionLevelFiveRepository.addRegionOfVillage(region);
                transactionManager.commit(status);
            }catch(Exception e) {
                rollbackAfterFailure(status, e);
                System.out.println("保存失败:"+region);
                e.printStackTrace();
            }
        }
        System.out.println("页面抓取出来的level5对象总量是:"+regions.size());
    }


    /**
     * Fetches a page with jsoup, sending a browser User-Agent header and using the
     * explicit 5 second timeout that {@code pullRegions} previously applied to its
     * own (now removed) duplicate request.
     *
     * <p>A timeout is recorded in the exception log with the mark "获取页面超时"
     * and reported to the caller by returning {@code null}.
     *
     * @param url         address of the page to fetch
     * @param useHtmlUnit when {@code true} nothing is fetched and {@code null} is returned
     * @return the parsed document, or {@code null} on timeout or when {@code useHtmlUnit} is set
     * @throws IOException for any jsoup I/O failure other than a socket timeout
     */
    public Document getHemlFromUrl(String url, boolean useHtmlUnit) throws IOException {
        if(useHtmlUnit) {
            return null;
        }
        try{
            return Jsoup.connect(url)
                    .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                    .timeout(5000)
                    .get();
        }catch(SocketTimeoutException e) {
            e.printStackTrace();
            System.out.println("该页面获取超时:"+url);
            // The original committed a null TransactionStatus here (getTransaction was never
            // called), which always failed inside an empty catch.
            saveExceptionLog(url, "获取页面超时");
            return null;
        }
    }

    /**
     * Writes one row to the exception log in its own transaction. A failure is
     * rolled back and printed; it is never propagated, so the crawl keeps going.
     *
     * @param url  page address stored as the log content
     * @param mark short description of what went wrong
     */
    private void saveExceptionLog(String url, String mark) {
        RegionLevelFiveException regionLF = new RegionLevelFiveException();
        regionLF.setContent(url);
        regionLF.setMark(mark);
        TransactionStatus status = null;
        try{
            status = transactionManager.getTransaction(new DefaultTransactionDefinition());
            jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(regionLF);
            transactionManager.commit(status);
        }catch(Exception e) {
            rollbackAfterFailure(status, e);
            System.out.println("异常日志写入失败:"+url);
            e.printStackTrace();
        }
    }

    /**
     * Rolls back a transaction that a failed save left open. Does nothing when no
     * transaction was started or it is already completed; a failure of the rollback
     * itself is attached to {@code cause} as a suppressed exception.
     */
    private void rollbackAfterFailure(TransactionStatus status, Exception cause) {
        if(status == null || status.isCompleted()) {
            return;
        }
        try{
            transactionManager.rollback(status);
        }catch(Exception rollbackFailure) {
            cause.addSuppressed(rollbackFailure);
        }
    }

    /**
     * Builds the stats.gov.cn page URL of every level-4 region stored in the
     * database and passes each one to {@link #pullRegions(String, int)}.
     *
     * <p>Regions are walked province → city → district → level-4 region. The URL
     * segments are cut out of the numeric codes: characters 0-2 of the province
     * code, 2-4 of the city code, 4-6 of the district code and everything after
     * position 6 of the level-4 code. This assumes the codes are long enough for
     * those substrings; a shorter code makes {@code substring} throw.
     *
     * <p>When at least one province exists, the total running time is printed in
     * whole hours at the end.
     */
    @Test
    public void testRegionPuller() {
        long start = System.currentTimeMillis();
        List<Region> regionsOfProvince = regionRepository.regionsOfProvince();
        // Plain concatenation tolerates null, so the emptiness guard below is always reached.
        System.out.println("得到的省ID集合是:"+regionsOfProvince);
        if(!CollectionsUtils.isNotEmpty(regionsOfProvince)) {
            return;
        }
        for(Region region : regionsOfProvince) {
            int provinceCode = region.getCode();
            System.out.println("省ID:"+provinceCode);
            String urlOfProvince = Integer.toString(provinceCode).substring(0,2);
            System.out.println("省的URL:"+urlOfProvince);
            List<Region> cities = regionRepository.regionsOfCity(provinceCode);
            if(!CollectionsUtils.isNotEmpty(cities)) {
                continue;
            }
            for(Region regionCity : cities) {
                int cityCode = regionCity.getCode();
                String urlOfCity = Integer.toString(cityCode).substring(2,4);
                List<Region> districts = regionRepository.regionsOfDistrict(cityCode);
                if(!CollectionsUtils.isNotEmpty(districts)) {
                    continue;
                }
                for(Region regionDistrict : districts) {
                    int districtCode = regionDistrict.getCode();
                    String urlOfDistrict = Integer.toString(districtCode).substring(4,6);
                    List<RegionLevelFour> villages = regionLevelFiveRepository.regionsOfVillage(districtCode);
                    if(!CollectionsUtils.isNotEmpty(villages)) {
                        continue;
                    }
                    for(RegionLevelFour village : villages) {
                        int villageCode = village.getCode();
                        String urlOfVillage = Integer.toString(villageCode);
                        System.out.println("镇的编码:"+urlOfVillage);
                        String villageSuffix = urlOfVillage.substring(6);
                        System.out.println("镇截取后的编码:"+villageSuffix);
                        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"+urlOfProvince+"/"+urlOfCity+"/"+urlOfDistrict+"/"+(urlOfProvince+urlOfCity+urlOfDistrict+villageSuffix)+".html";
                        System.out.println("拼凑出的url是....."+url);
                        // Parse the page and persist its rows
                        pullRegions(url,villageCode);
                    }
                    System.out.println("页面抓取出来的level5对象总量是:"+regions.size());
                }
            }
        }

        long end = System.currentTimeMillis();
        // end - start: the original subtracted the other way round and always reported a negative duration.
        long customerTime = (end-start)/(1000*60*60);
        System.out.println("Binggo! 耗时:"+customerTime+"h");
    }


    /**
     * Database connectivity check: saves a single hard-coded level-5 region inside
     * a declarative transaction that is committed rather than rolled back.
     */
    @Test
    @Transactional
    @Rollback(false)
    public void TestSQl() {
        RegionLevelFive regionLevelFive = new RegionLevelFive();
        regionLevelFive.setCode("8942849");
        regionLevelFive.setName("测试数据");
        regionLevelFive.setLevel(5);
        regionLevelFive.setParentCode("2322323");
        regionLevelFiveRepository.addRegionOfVillage(regionLevelFive);
    }


    /**
     * Ad-hoc parsing check against one fixed page.
     *
     * <p>The instance is created with {@code new}, outside the Spring context, so
     * the {@code @Autowired} fields are null: the page is fetched and printed, but
     * every save attempt fails and is reported on the console.
     */
    public static void main(String[] args) {
        RegionLevelFivePuller rlfp  = new RegionLevelFivePuller();
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/61/10/02/611002001.html";
        rlfp.pullRegions(url,0);
    }


    /**
     * Exercises programmatic transaction management: inserts 100000 exception-log
     * rows, each inside its own explicitly started and committed transaction.
     * A failed insert is rolled back and reported, and the loop continues.
     */
    @Test
    @Rollback(false)
    public void testMannulCommitTransaction() {
        for(int i=1;i<=100000;i++) {
            RegionLevelFiveException rfe = new RegionLevelFiveException();
            rfe.setMark("11");
            rfe.setContent("内容");
            DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
            TransactionStatus status = null;
            try{
                status = transactionManager.getTransaction(definition);
                jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(rfe);
                transactionManager.commit(status);
            }catch(Exception ex) {
                rollbackAfterFailure(status, ex);
                System.out.println("第"+i+"条异常日志写入失败");
                ex.printStackTrace();
            }
        }
    }
}

好了,下期再会!

你可能感兴趣的:(Java)