哈喽,大家好!好久不见。
一、背景介绍
二、拼凑URL
三、解析页面并存入数据库
/**
* 测试编程式事务
*/
@Test
@Rollback(false)
public void testMannulCommitTransaction() {
for(int i=1;i<=100000;i++) {
RegionLevelFiveException rfe = new RegionLevelFiveException();
rfe.setMark("11");
rfe.setContent("内容");
DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
TransactionStatus status = null;
try{
status = transactionManager.getTransaction(definition);
jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(rfe);
transactionManager.commit(status);
}catch(Exception ex) {
}
}
}**加粗样式**
一、背景介绍:
1、数据库里已有省、市、区(县)、街道(镇)四级地址,还缺最后一级:居委会(村)(region_level_5)
2、用Jsoup工具解析页面,根据库里已有的四级地址,拿到第五级,持久化数据。
3、为让程序通俗易懂,我把冗余的信息也都放出来:注释和打印的内容
4、底层是公司封装的一套JPA框架
二、拼凑URL
获取区域码网站:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
/**
 * Builds the page URL for every fourth-level region and scrapes it.
 *
 * <p>Walks the province, city, district and fourth-level records returned by the
 * repositories, takes two digits of each code (plus the tail of the fourth-level
 * code) to assemble the stats.gov.cn URL, and passes it to {@code pullRegions}.
 * When at least one province exists, the elapsed time in whole hours is printed
 * at the end.
 */
@Test
public void testRegionPuller() {
    long start = System.currentTimeMillis();
    List<Region> regionsOfProvince = regionRepository.regionsOfProvince();
    // Plain concatenation prints "null" instead of throwing when the repository returns null.
    System.out.println("得到的省ID集合是:"+regionsOfProvince);
    if(CollectionsUtils.isNotEmpty(regionsOfProvince)) {
        for(Region region : regionsOfProvince) {
            int provinceCode = region.getCode();
            System.out.println("省ID:"+provinceCode);
            String urlOfProvince = Integer.toString(provinceCode).substring(0,2);
            System.out.println("省的URL:"+urlOfProvince);
            List<Region> cities = regionRepository.regionsOfCity(provinceCode);
            if(!CollectionsUtils.isNotEmpty(cities)) {
                continue;
            }
            for(Region regionCity : cities) {
                int cityCode = regionCity.getCode();
                String urlOfCity = Integer.toString(cityCode).substring(2,4);
                List<Region> districts = regionRepository.regionsOfDistrict(cityCode);
                if(!CollectionsUtils.isNotEmpty(districts)) {
                    continue;
                }
                for(Region regionDistrict : districts) {
                    int districtCode = regionDistrict.getCode();
                    String urlOfDistrict = Integer.toString(districtCode).substring(4,6);
                    List<RegionLevelFour> villages = regionLevelFiveRepository.regionsOfVillage(districtCode);
                    if(!CollectionsUtils.isNotEmpty(villages)) {
                        continue;
                    }
                    for(RegionLevelFour village : villages) {
                        int villageCode = village.getCode();
                        String urlOfVillage = Integer.toString(villageCode);
                        System.out.println("镇的编码:"+urlOfVillage);
                        // Everything after the first six digits identifies the fourth-level page.
                        String villageSuffix = urlOfVillage.substring(6);
                        System.out.println("镇截取后的编码:"+villageSuffix);
                        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"+urlOfProvince+"/"+urlOfCity+"/"+urlOfDistrict+"/"+(urlOfProvince+urlOfCity+urlOfDistrict+villageSuffix)+".html";
                        System.out.println("拼凑出的url是....."+url);
                        // Parse the page and persist its rows.
                        pullRegions(url,villageCode);
                    }
                    System.out.println("页面抓取出来的level5对象总量是:"+regions.size());
                }
            }
        }
        long end = System.currentTimeMillis();
        // end - start: the original subtracted the other way round and reported a negative duration.
        long customerTime = (end-start)/(1000*60*60);
        System.out.println("Binggo! 耗时:"+customerTime+"h");
    }
}
三、解析页面并存入数据库
Gradle方式导入包:
compile group: 'org.jsoup', name: 'jsoup', version: '1.11.3'
// Every level-5 record scraped so far, in page order. The site itself guarantees
// uniqueness, so a Set is unnecessary and would lose the ordering.
List<RegionLevelFive> regions = new ArrayList<>();

/**
 * Fetches one page, parses its {@code tr.villagetr} rows and persists each row
 * as a {@code RegionLevelFive}.
 *
 * <p>The method is deliberately not {@code @Transactional}: every insert runs in
 * its own programmatic transaction, so a failing row or page neither stops the
 * crawl nor undoes rows that were already committed. When the page cannot be
 * obtained, its URL is written to the exception log with the mark
 * {@code 页面拿取不到}.
 *
 * @param url      address of the page to scrape
 * @param parentId code of the parent region; stored as the parent code of every row
 */
public void pullRegions(String url,int parentId) {
    int count = 0;
    System.out.println("开始获取.....");
    // 1. Fetch the page once. The original issued a second, identical request and
    //    threw the first response away.
    Document html = null;
    try {
        html = getHemlFromUrl(url,false);
        System.out.println("================================打印网页=======================================");
        System.out.println("获取到的网页是:"+html);
    } catch (IOException e) {
        System.out.println("获取页面失败");
        e.printStackTrace();
    }
    // 2. Select the data rows, or record the page as unavailable.
    Elements regionTags = null;
    if(html != null) {
        regionTags = html.select("table.villagetable")
                .select("tbody")
                .select("tr.villagetr");
    } else {
        RegionLevelFiveException regionLF = new RegionLevelFiveException();
        regionLF.setContent(url);
        regionLF.setMark("页面拿取不到");
        System.out.println("该页面拿取不到:"+url);
        DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
        TransactionStatus status = null;
        try{
            status = transactionManager.getTransaction(definition);
            jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(regionLF);
            transactionManager.commit(status);
        }catch(Exception e) {
            System.out.println("异常日志写入失败:"+url+" "+e);
            rollbackIfActive(status);
        }
    }
    // 3. Turn every row into a RegionLevelFive and persist it.
    System.out.println("================================打印regionTypes=======================================");
    System.out.println(regionTags);
    if(regionTags != null) {
        for(Element tr : regionTags) {
            Elements regionsOfHtml = tr.select("td");
            if(regionsOfHtml.isEmpty()) {
                // first()/last() return null on an empty selection; skip the row instead of aborting the crawl.
                continue;
            }
            String code = regionsOfHtml.first().text();
            String name = regionsOfHtml.last().text();
            RegionLevelFive region = new RegionLevelFive();
            // A 12-digit code such as 110101001001 exceeds the int range, so it stays a String.
            System.out.println("code "+code);
            region.setCode(code);
            region.setName(name);
            region.setLevel(5);
            region.setParentCode(Integer.toString(parentId));
            System.out.println("第"+count+"条记录"+region.toString());
            regions.add(region);
            count++;
            DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
            TransactionStatus status = null;
            try{
                status = transactionManager.getTransaction(definition);
                regionLevelFiveRepository.addRegionOfVillage(region);
                transactionManager.commit(status);
            }catch(Exception e) {
                System.out.println("区域写入失败:"+code+" "+e);
                rollbackIfActive(status);
            }
        }
        System.out.println("页面抓取出来的level5对象总量是:"+regions.size());
    }
}

/**
 * Rolls back {@code status} when a transaction was started and is still open.
 * Never throws, so callers can keep processing after a failed insert.
 */
private void rollbackIfActive(TransactionStatus status) {
    if(status == null || status.isCompleted()) {
        return;
    }
    try{
        transactionManager.rollback(status);
    }catch(Exception rollbackFailure) {
        System.out.println("事务回滚失败:"+rollbackFailure);
    }
}
getHemlFromUrl(url,false)代码如下:
/**
 * Downloads {@code url} with jsoup, sending an Internet Explorer 9 User-Agent header.
 *
 * <p>A socket timeout is not propagated: the URL is written to the exception log
 * with the mark {@code 获取页面超时} and {@code null} is returned.
 *
 * @param url         address of the page to download
 * @param useHtmlUnit when {@code true} nothing is fetched and {@code null} is returned
 * @return the parsed document, or {@code null} on timeout or when {@code useHtmlUnit} is {@code true}
 * @throws IOException for any fetch failure other than a socket timeout
 */
public Document getHemlFromUrl(String url, boolean useHtmlUnit) throws IOException {
    if(useHtmlUnit) {
        return null;
    }
    try{
        return Jsoup.connect(url)
                .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                .get();
    }catch(SocketTimeoutException e) {
        e.printStackTrace();
        System.out.println("该页面获取超时:"+url);
        RegionLevelFiveException regionLF = new RegionLevelFiveException();
        regionLF.setContent(url);
        regionLF.setMark("获取页面超时");
        TransactionStatus status = null;
        try{
            // The transaction has to be started before it can be committed; the original
            // passed a null status to commit(), which always failed silently.
            status = transactionManager.getTransaction(new DefaultTransactionDefinition());
            jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(regionLF);
            transactionManager.commit(status);
        }catch(Exception e1) {
            System.out.println("超时日志写入失败:"+url+" "+e1);
            // A status that commit() already completed must not be rolled back again.
            if(status != null && !status.isCompleted()) {
                try{
                    transactionManager.rollback(status);
                }catch(Exception rollbackFailure) {
                    System.out.println("事务回滚失败:"+rollbackFailure);
                }
            }
        }
        return null;
    }
}
上面代码注意以下几点:
1、存储数据的时候,没有在方法上加事务,因为for循环中要在每次循环中持久化数据,所以用到了编程式事务;
2、对于异常全部捕获而不向外抛出,避免中断。必定有许多页面有问题,这时候不能中断,否则50多万条数据什么时候能跑完?
同时把异常页面持久化到数据库中;
3、实践表明,异常网页7K多条呢,后面统一处理。
整体代码如下:
package com.dzj.regionLevel5;
import com.dzj.bdc.biz.domain.Region;
import com.dzj.bdc.biz.domain.RegionLevelFive;
import com.dzj.bdc.biz.domain.RegionLevelFiveException;
import com.dzj.bdc.biz.domain.RegionLevelFour;
import com.dzj.bdc.biz.domain.repository.RegionLevelFiveRepository;
import com.dzj.bdc.biz.domain.repository.RegionRepository;
import com.dzj.bdc.biz.domain.repository.jpa1.JpaRegionLevelFiveExceptionLogRepository;
import com.dzj.ddc.biz.test.base.BaseRepositoryTest;
import com.dzj.ddc.biz.test.base.BaseTest;
import com.dzj.frw.common.utils.CollectionsUtils;
import com.dzj.frw.common.utils.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.ImportResource;
import org.springframework.test.annotation.Rollback;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.TransactionStatus;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.transaction.support.DefaultTransactionDefinition;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.List;
/**
* Author:meice Huang
* Time: 2019/2/28 下午9:10
*/
@RunWith(SpringJUnit4ClassRunner.class)
@Configuration
@ContextConfiguration(classes = RegionLevelFivePuller.class)
@ImportResource({ "classpath:basis-data-repositories-beans.xml",
"classpath:META-INF/bo-frw-fixture.xml" })
public class RegionLevelFivePuller extends BaseTest {
@Autowired
private RegionRepository regionRepository;
@Autowired
private RegionLevelFiveRepository regionLevelFiveRepository;
@Autowired
private JpaRegionLevelFiveExceptionLogRepository jpaRegionLevelFiveExceptionLogRepository;
@Autowired
private PlatformTransactionManager transactionManager;
List<RegionLevelFive> regions = new ArrayList<>();//去重是网站自己会保证的,用Set无必要且会无序
/**
* 解析页面
* 持久化数据
*/
// @Transactional
//@Rollback(false)
public void pullRegions(String url,int parentId) {
long start = System.currentTimeMillis();
int count = 0;
System.out.println("开始获取.....");
//1.获取首页
Document html = null;
try {
html = Jsoup.connect(url).timeout(5000).get();
html = getHemlFromUrl(url,false);
System.out.println("================================打印网页=======================================");
System.out.println("获取到的网页是:"+html);
} catch (IOException e) {
System.out.println("获取页面失败");
e.printStackTrace();
}
//2.jsoup获取标签
Elements regionTags = null;
try{
regionTags = html.select("table.villagetable")
.select("tbody")
.select("tr.villagetr");
}catch(NullPointerException ne) {
RegionLevelFiveException regionLF = new RegionLevelFiveException();
regionLF.setContent(url);
regionLF.setMark("页面拿取不到");
System.out.println("该页面拿取不到:"+url);
DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
TransactionStatus status = null;
try{
status = transactionManager.getTransaction(definition);
jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(regionLF);
transactionManager.commit(status);
}catch(Exception e) {
}
}
//3.从中抽取基本信息,封装为RegionLevelFive
System.out.println("================================打印regionTypes=======================================");
System.out.println(regionTags);
if(regionTags != null) {
for(Element tr : regionTags) {
Elements regionsOfHtml = tr.select("td");
String code = regionsOfHtml.first().text();
String name = regionsOfHtml.last().text();
RegionLevelFive region = new RegionLevelFive();
System.out.println("code "+code);//110101001001 值太大,百亿级别无法直接转String
region.setCode(code);
region.setName(name);
region.setLevel(5);
region.setParentCode(Integer.toString(parentId));
System.out.println("第"+count+"条记录"+region.toString());
regions.add(region);
count++;
DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
TransactionStatus status = null;
try{
status = transactionManager.getTransaction(definition);
regionLevelFiveRepository.addRegionOfVillage(region);
transactionManager.commit(status);
}catch(Exception e) {
}
}
System.out.println("页面抓取出来的level5对象总量是:"+regions.size());
}
}
/**
* 设置代理浏览器
*/
public Document getHemlFromUrl(String url, boolean useHtmlUnit) throws IOException {
if(!useHtmlUnit) {
try{
return Jsoup.connect(url)
.userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
.get();
}catch(SocketTimeoutException e) {
e.printStackTrace();
System.out.println("该页面获取超时:"+url);
RegionLevelFiveException regionLF = new RegionLevelFiveException();
regionLF.setContent(url);
regionLF.setMark("获取页面超时");
try{
DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
TransactionStatus status = null;
jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(regionLF);
transactionManager.commit(status);
}catch(Exception e1) {
}
return null;
}
}else{
return null;
}
}
/**
* 拼凑出url
*/
@Test
public void testRegionPuller() {
int count = 1;
long start = System.currentTimeMillis();
List<Region> regionsOfProvince = regionRepository.regionsOfProvince();
System.out.println("得到的省ID集合是:"+regionsOfProvince.toString());
if(CollectionsUtils.isNotEmpty(regionsOfProvince)) {
for(Region region :regionsOfProvince) {
int provinceCode = region.getCode();
System.out.println("省ID:"+provinceCode);
String urlOfProvince = Integer.toString(provinceCode).substring(0,2);
System.out.println("省的URL:"+urlOfProvince);
List<Region> cities = regionRepository.regionsOfCity(provinceCode);
//System.out.println("市ID是:"+cities.toString());
if(CollectionsUtils.isNotEmpty(cities)) {
for(Region regionCity : cities) {
int cityCode = regionCity.getCode();
String urlOfCity = Integer.toString(cityCode).substring(2,4);
List<Region> districts = regionRepository.regionsOfDistrict(cityCode);
if(CollectionsUtils.isNotEmpty(districts)) {
for(Region regionDistrict :districts) {
int districtCode = regionDistrict.getCode();
String urlOfDistrict = Integer.toString(districtCode).substring(4,6);
List<RegionLevelFour> villages = regionLevelFiveRepository.regionsOfVillage(districtCode);
if(CollectionsUtils.isNotEmpty(villages)){
for(RegionLevelFour village :villages){
int villageCode = village.getCode();
String urlOfVillage = Integer.toString(villageCode);
System.out.println("镇的编码:"+urlOfVillage);
System.out.println("镇截取后的编码:"+urlOfVillage.substring(6));
String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"+urlOfProvince+"/"+urlOfCity+"/"+urlOfDistrict+"/"+(urlOfProvince+urlOfCity+urlOfDistrict+(urlOfVillage.substring(6)))+".html";
System.out.println("拼凑出的url是....."+url);
//解析页面
pullRegions(url,villageCode);
}
System.out.println("页面抓取出来的level5对象总量是:"+regions.size());
}
}
}
}
}
}
long end = System.currentTimeMillis();
long customerTime = (start-end)/(1000*60*60);
System.out.println("Binggo! 耗时:"+customerTime+"h");
}
}
/**
* 测试连接数据库
*/
@Test
@Transactional
@Rollback(false)
public void TestSQl() {
RegionLevelFive regionLevelFive = new RegionLevelFive();
regionLevelFive.setCode("8942849");
regionLevelFive.setName("测试数据");
regionLevelFive.setLevel(5);
regionLevelFive.setParentCode("2322323");
regionLevelFiveRepository.addRegionOfVillage(regionLevelFive);
}
public static void main(String[] args) {
RegionLevelFivePuller rlfp = new RegionLevelFivePuller();
String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/61/10/02/611002001.html";
rlfp.pullRegions(url,0);
}
/**
* 测试编程式事务
*/
@Test
@Rollback(false)
public void testMannulCommitTransaction() {
for(int i=1;i<=100000;i++) {
RegionLevelFiveException rfe = new RegionLevelFiveException();
rfe.setMark("11");
rfe.setContent("内容");
DefaultTransactionDefinition definition = new DefaultTransactionDefinition();
TransactionStatus status = null;
try{
status = transactionManager.getTransaction(definition);
jpaRegionLevelFiveExceptionLogRepository.addRegionLevelFiveExceptionLog(rfe);
transactionManager.commit(status);
}catch(Exception ex) {
}
}
}
}
好了,下期再会!
你可能感兴趣的:(Java)