很多地方需要用到 统计用区划和城乡划分代码
这块以国家统计局的权威数据为准,但是人家是一个网页。
虽然用 Python 解析起来很快,但我还是想用 Java 写一套,打发时间也好,无聊也罢,权当学习。
首先要做的就是分析网页的内容特点,进行数据建模和构建框架。
数据库我用的是本机运行的 MySQL,图个方便,没有用 Oracle,也没有单独部署数据库服务器,一切从简。
-- One row per administrative division parsed from the statistics-bureau pages.
-- Column notes below describe how the accompanying Java crawler fills each field.
CREATE TABLE tb_locations (
-- Primary key; the crawler assigns it via IdClazzUtils.getId(Location.class).
id bigint(20) NOT NULL,
-- Processing marker; the crawler writes "Y" or "D" once a row's children have been handled.
flag varchar(6) DEFAULT NULL,
-- Division code as shown in the first column of the source table.
local_code varchar(30) DEFAULT NULL,
-- Division name.
local_name varchar(100) DEFAULT NULL,
-- Depth in the tree: 0 province, 1 city, 2 county, 3 town/street, 4 village/community.
lv int(11) DEFAULT NULL,
-- local_code of the parent division.
sup_local_code varchar(30) DEFAULT NULL,
-- Relative link to the page listing this division's children; NULL/empty when the row has no link.
url varchar(60) DEFAULT NULL,
PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
先说下我的实现思路:自上而下,逐级递归。
统计用区划和城乡划分可以想象成一个树形结构,主干就是省、直辖市、自治区。逐级解析 HTML 文本内容,再把解析出的相对链接拼装成完整的 URI 路径,作为下一级页面的解析依据。
这里用到两个技术点:
- Mybatis实现的批量提交
- Jsoup 解析 HTML 元素(下面的核心代码使用的是 Jsoup,而不是 dom4j)
2.1.1. 依赖包
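```xml
<parent>
    <artifactId>akkad-war3</artifactId>
    <groupId>xyz.wongs.drunkard</groupId>
    <version>1.0.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>war3-area</artifactId>
<packaging>jar</packaging>

<dependencies>
    <dependency>
        <groupId>xyz.wongs.drunkard</groupId>
        <artifactId>mybatis-pk-redis</artifactId>
    </dependency>
    <dependency>
        <groupId>com.oracle</groupId>
        <artifactId>ojdbc6</artifactId>
    </dependency>
    <dependency>
        <artifactId>HikariCP</artifactId>
        <groupId>com.zaxxer</groupId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.9.2</version>
    </dependency>
    <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>neko-htmlunit</artifactId>
        <version>2.30</version>
    </dependency>
    <dependency>
        <groupId>io.springfox</groupId>
        <artifactId>springfox-boot-starter</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpmime</artifactId>
        <version>4.5.5</version>
    </dependency>
    <dependency>
        <groupId>com.gargoylesoftware</groupId>
        <artifactId>htmlunit</artifactId>
        <version>2.3</version>
    </dependency>
    <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>htmlunit-core-js</artifactId>
        <version>2.31</version>
    </dependency>
    <dependency>
        <groupId>com.gargoylesoftware</groupId>
        <artifactId>htmlunit-cssparser</artifactId>
        <version>1.0.0</version>
    </dependency>
    <dependency>
        <groupId>xalan</groupId>
        <artifactId>xalan</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>xerces</groupId>
        <artifactId>xercesImpl</artifactId>
        <version>2.11.0</version>
    </dependency>
    <dependency>
        <groupId>javax.persistence</groupId>
        <artifactId>javax.persistence-api</artifactId>
        <version>2.2</version>
    </dependency>
    <dependency>
        <groupId>com.jayway.jsonpath</groupId>
        <artifactId>json-path</artifactId>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-maven-plugin</artifactId>
            <executions>
                <execution>
                    <goals>
                        <goal>repackage</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.mybatis.generator</groupId>
            <artifactId>mybatis-generator-maven-plugin</artifactId>
            <version>1.3.2</version>
            <configuration>
                <configurationFile>${basedir}/src/main/resources/generator/generatorConfig.xml</configurationFile>
                <overwrite>true</overwrite>
                <verbose>true</verbose>
            </configuration>
        </plugin>
    </plugins>
</build>
```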
2.1.2. 核心实现代码
package xyz.wongs.drunkard.war3.web.area.task.impl;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
import xyz.wongs.drunkard.base.constant.Constant;
import xyz.wongs.drunkard.war3.domain.entity.Location;
import xyz.wongs.drunkard.war3.domain.service.LocationService;
import xyz.wongs.drunkard.war3.web.util.IdClazzUtils;
import xyz.wongs.drunkard.war3.web.util.AreaCodeStringUtils;
import xyz.wongs.drunkard.war3.web.area.task.ProcessService;
import java.io.IOException;
import java.sql.Time;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
/**
* @ClassName: JsoupProcessServiceImpl
* @Description:TODO(这里用一句话描述这个类的作用)
* @author: WCNGS
* @date: 2017年7月28日 上午11:31:30 *
* @Copyright: 2017 WCNGS Inc. All rights reserved.
*/
@Slf4j
@Service("processService")
public class ProcessServiceImpl implements ProcessService {
@Autowired
@Qualifier("locationService")
LocationService locationService;
@Override
public void initLevelOne(String url, Location parentLocation) {
List levelOne = null;
try {
levelOne = getLevelOneByRoot(url, parentLocation.getLocalCode());
} catch (IOException e) {
log.error(" IOException pCode={}", parentLocation.getLocalCode(), e.getMessage(), url);
}
save(levelOne);
}
@Override
public boolean initLevelTwo(String url, Location location) {
try {
List secondLevelLocas = getLocationSecondLevel(url, location);
save(secondLevelLocas);
return true;
} catch (Exception e) {
return false;
}
}
/**
* 初始化省、直辖区、自治区
*
* @param url
* @return void
* @throws
* @method intiRootUrl
* @author [email protected]
* @version
* @date 2018/6/30 23:29
* @see
*/
@Override
public boolean intiRootUrl(String url) {
try {
List rootLocations = getLocationRoot(url, "0");
save(rootLocations);
return true;
} catch (Exception e) {
e.printStackTrace();
return false;
}
}
public List getLocationRoot(String url, String pCode) {
List locas = new ArrayList(35);
try {
Elements eleProv = getElementsByConnection(url, "provincetr");
for (Element e : eleProv) {
Elements eleHerf = e.getElementsByTag("td").select("a[href]");
if (null == eleHerf || eleHerf.size() == 0) {
continue;
}
for (Element target : eleHerf) {
String urls = target.attributes().asList().get(0).getValue();
Location location = Location.builder().id(IdClazzUtils.getId(Location.class))
.localCode("0").url(urls).lv(0).localName(target.text())
.localCode(urls.substring(0, 2)).build();
locas.add(location);
}
}
} catch (IOException e) {
log.error(" IOException pCode={}", pCode, e.getMessage(), url);
}
return locas;
}
/**
* 方法实现说明
*
* @param url
* @param location
* @return void
* @throws
* @method thridLevelResolve
* @author [email protected]
* @version
* @date 2018/7/1 9:50
* @see
*/
@Override
public void initLevelThrid(String url, Location location) {
this.initLevelThrid(url, location, "Y");
}
/**
* 方法实现说明
*
* @param url
* @param location
* @param flag
* @return void
* @throws
* @method thridLevelResolve
* @author [email protected]
* @version
* @date 2018/7/1 16:24
* @see
*/
@Override
public void initLevelThrid(String url, Location location, String flag) {
try {
if (StringUtils.isEmpty(location.getUrl())) {
return;
}
List thridLevelLocas = getLocation(url, new String[]{"towntr", "href"}, location.getLocalCode(), 3, flag);
location.setFlag(flag);
locationService.updateByPrimaryKey(location);
save(thridLevelLocas);
} catch (Exception e) {
e.printStackTrace();
}
}
public void save(List locations) {
//结果为空,抛出异常
if (null == locations || locations.isEmpty()) {
log.error(" target saved is null!");
return;
}
locationService.insertBatchByOn(locations);
}
@Override
public void initLevelFour(String url, List thridLevelLocas) {
for (Location le : thridLevelLocas) {
List locations = new ArrayList(12);
String suffix = new StringBuilder().append(url).append(AreaCodeStringUtils.getUrlStrByLocationCode(le.getLocalCode(), 3)).append(le.getUrl()).toString();
Elements es = null;
try {
es = getElementsByConnection(suffix, "villagetr");
Location tempLocation = null;
for (Element e : es) {
tempLocation = new Location(e.child(0).text(), e.child(2).text(), le.getLocalCode(), null, 4);
tempLocation.setId(IdClazzUtils.getId(Location.class));
locations.add(tempLocation);
}
le.setFlag("Y");
locationService.updateByPrimaryKey(le);
save(locations);
} catch (IOException e) {
log.error(" IOException code={},msg={},url={}", le.getLocalCode(), e.getMessage(), suffix);
int times = AreaCodeStringUtils.getSecond(3);
try {
TimeUnit.SECONDS.sleep(times);
} catch (InterruptedException interruptedException) {
log.error("msg={} ", interruptedException.getMessage());
}
continue;
} catch (Exception e) {
log.error("Exception code={},msg={}", le.getLocalCode(), e.getMessage());
continue;
}
}
}
/**
* @param url
* @param location
* @return
* @Title: getLocationSecondLevel
* @Description: TODO(这里用一句话描述这个方法的作用)
* @return: List
*/
public List getLocationSecondLevel(String url, Location location) {
List locas = null;
try {
locas = new ArrayList(90);
//URL地址截取
//标识位
boolean flag = false;
Elements es = getElementsByConnection(url, "countytr");
if (null == es) {
log.error(url + " 不能解析!");
return null;
}
Location tempLocation = null;
for (Element e : es) {
//针对市辖区 这种无URL的做特殊处理
if (!flag) {
tempLocation = new Location(e.child(0).text(), e.child(1).text(), location.getLocalCode(), null, 2);
tempLocation.setId(IdClazzUtils.getId(Location.class));
locas.add(tempLocation);
//标识位置为TURE
flag = true;
continue;
}
es = e.getElementsByAttribute("href");
if (es.size() == 0) {
tempLocation = new Location(e.child(0).text(), e.child(1).text(), location.getLocalCode(), "", 2);
tempLocation.setId(IdClazzUtils.getId(Location.class));
locas.add(tempLocation);
continue;
}
List attrs = es.get(0).attributes().asList();
tempLocation = new Location(es.get(0).text(), es.get(1).text(), location.getLocalCode(), attrs.get(0).getValue(), 2);
tempLocation.setId(IdClazzUtils.getId(Location.class));
locas.add(tempLocation);
}
} catch (Exception e) {
e.printStackTrace();
}
return locas;
}
/**
* @param url
* @param pCode
* @return
* @Title: getLocationOneLevel
* @Description: 1、获取第一级地市信息
* 2、第二级区县信息
* @return: List
*/
public List getLevelOneByRoot(String url, String pCode) throws IOException {
List locas = new ArrayList(20);
Elements eles = getElementsByConnection(url, "citytr");
if (null == eles) {
log.error(url + " 不能解析!");
return null;
}
Location location = null;
for (Element e : eles) {
eles = e.getElementsByAttribute("href");
List attrs = eles.get(0).attributes().asList();
location = new Location(eles.get(0).text(), eles.get(1).text(), pCode, attrs.get(0).getValue(), 1);
location.setId(IdClazzUtils.getId(Location.class));
locas.add(location);
}
return locas;
}
public List getLocation(String url, String[] cssClazz, String parentCode, Integer lv, String flag) throws IOException {
List locas = new ArrayList(20);
Elements eles = getElementsByConnection(url, cssClazz[0]);
if (null == eles) {
log.error(url + " 不能解析!");
return null;
}
Location location = null;
for (Element e : eles) {
eles = e.getElementsByAttribute(cssClazz[1]);
List attrs = eles.get(0).attributes().asList();
location = new Location(eles.get(0).text(), eles.get(1).text(), parentCode, attrs.get(0).getValue(), lv, flag);
location.setId(IdClazzUtils.getId(Location.class));
locas.add(location);
}
return locas;
}
/**
* 案例
*
* 340102001000
* 明光路街道
*
*
* @param url
* @param cssClazz
* @param parentURLCode
* @return List
* @Title: getLocation
* @Description: TODO(这里用一句话描述这个方法的作用)
*/
public List getLocation(String url, String[] cssClazz, String parentCode, Integer lv) {
return getLocation(url, cssClazz, parentCode, lv);
}
/**
* 方法实现说明
*
* @param url
* @param clazzName
* @return org.jsoup.select.Elements
* @throws
* @method getElementss
* @author [email protected]
* @version
* @date 2018/7/2 11:28
* @see
*/
public Elements getElementsByConnection(String url, String clazzName) throws IOException {
try {
/** CloseableHttpClient httpclient = HttpClients.createDefault(); **/
//设置CookieSpecs.STANDARD的cookie解析模式,下面为源码,对应解析格式我给出了备注
CloseableHttpClient httpclient = HttpClients.custom()
.setDefaultRequestConfig(RequestConfig.custom()
.setCookieSpec(CookieSpecs.STANDARD).build())
.build();
HttpGet httpget = new HttpGet(url);
httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
RequestConfig config = RequestConfig.custom()
//.setProxy(proxy)
//设置连接超时 ✔
// 设置连接超时时间 10秒钟
.setConnectTimeout(10000)
// 设置读取超时时间10秒钟
.setSocketTimeout(10000)
.build();
httpget.setConfig(config);
// 执行get请求
CloseableHttpResponse response = httpclient.execute(httpget);
HttpEntity entity = response.getEntity();
// 获取返回实体
String content = EntityUtils.toString(entity, "GBK");
// ============================= 【Jsoup】 ====================================
Document doc = Jsoup.parse(content);
return doc.getElementsByClass(clazzName);
} catch (ConnectTimeoutException e) {
log.error(" ConnectTimeoutException URL={},clazzName={},errMsg={}", url, clazzName, e.getMessage());
}
return null;
}
/**
* @param locations
* @return java.lang.String
* @throws
* @Description
* @date 2020/9/9 14:52
*/
public String appengUrl(List locations) {
Iterator it = locations.iterator();
String url = "";
StringBuilder sb = new StringBuilder();
while (it.hasNext()) {
Location cation = it.next();
String str = cation.getUrl();
if (cation.getLv() == 3) {
sb.append(str);
} else {
int i = cation.getUrl().indexOf(Constant.SLASH);
sb.append(str.substring(0, i)).append(Constant.SLASH).append(sb);
}
}
return url;
}
}
2.1.3. 单元测试
在单元测试中,自上而下,挨个运行测试方法即可。
- initRoot:初始化省、直辖市、自治区的。数量31,速度非常快
- intLevelOne:初始化城市,数量三百多,速度快
- intLevelTwo:初始化县区,数量四千多,速度一般
- intLevelThree:初始化乡镇 街道,数量四万,速度慢
- intLevelFour: 初始化社区村,速度非常慢,需要按照批次执行
同时在运行中,可能会由于服务器拒绝连接,造成无法解析出来地址,这没关系,代码中已经容错这些,继续执行即可!
package xyz.wongs.drunkard.task;
import com.github.pagehelper.PageInfo;
import lombok.extern.slf4j.Slf4j;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import xyz.wongs.drunkard.base.BaseTest;
import xyz.wongs.drunkard.war3.domain.entity.Location;
import xyz.wongs.drunkard.war3.domain.service.LocationService;
import xyz.wongs.drunkard.war3.web.util.AreaCodeStringUtils;
import xyz.wongs.drunkard.war3.web.area.task.ProcessService;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
/**
 * Drives the crawler level by level. Run the test methods top-down, one at a time:
 * {@code initRoot}, {@code intLevelOne}, {@code intLevelTwo}, {@code intLevelThree},
 * {@code intLevelFour}. Each level reads the rows stored by the previous one.
 *
 * @author [email protected]
 * @Github https://github.com/rothschil
 * @date 2020/9/9 15:26
 * @Version 1.0.0
 */
@Slf4j
public class ProcessServiceTest extends BaseTest {

    /** Base URL of the 2020 listing; relative links from the pages are appended to it. */
    private static final String URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";

    /** Upper bound on the number of pages handled by one run of the two slow levels. */
    private static final int MAX_PAGES_PER_RUN = 100;

    @Autowired
    @Qualifier("processService")
    private ProcessService processService;

    @Autowired
    private LocationService locationService;

    /**
     * Loads all provinces as root nodes.
     */
    @Test
    public void initRoot() {
        processService.intiRootUrl(URL);
    }

    /**
     * Parses the cities of every stored province.
     */
    @Test
    public void intLevelOne() throws Exception {
        city(1);
    }

    /**
     * Walks the stored level-0 rows page by page (30 per page), starting at {@code pageNum},
     * parses the cities of each and marks the province with flag "Y".
     *
     * @param pageNum first page to process
     */
    public void city(int pageNum) {
        for (int page = pageNum; ; page++) {
            PageInfo<Location> pageInfo = locationService.getLocationsByLv(0, page, 30);
            if (isPastLastPage(pageInfo)) {
                return;
            }
            for (Location location : pageInfo.getList()) {
                String uls = URL + location.getUrl();
                processService.initLevelOne(uls, location);
                location.setFlag("Y");
                locationService.updateByPrimaryKey(location);
            }
        }
    }

    /**
     * Parses the counties of every stored city.
     */
    @Test
    public void intLevelTwo() throws Exception {
        exet(1);
    }

    /**
     * Walks the stored level-1 rows page by page (30 per page), starting at {@code pageNum},
     * parses the counties of each and marks the city with flag "Y" when that succeeded.
     *
     * @param pageNum first page to process
     */
    public void exet(int pageNum) {
        for (int page = pageNum; ; page++) {
            PageInfo<Location> pageInfo = locationService.getLocationsByLv(1, page, 30);
            if (isPastLastPage(pageInfo)) {
                return;
            }
            for (Location location : pageInfo.getList()) {
                String url2 = new StringBuilder().append(URL).append(location.getUrl()).toString();
                // Only mark the city as done when its counties were actually stored.
                if (processService.initLevelTwo(url2, location)) {
                    location.setFlag("Y");
                    locationService.updateByPrimaryKey(location);
                } else {
                    log.warn("Level two failed, code={}", location.getLocalCode());
                }
            }
        }
    }

    /**
     * Parses the towns and streets of every stored county.
     */
    @Test
    public void intLevelThree() {
        three(1);
    }

    /**
     * Walks the stored level-2 rows page by page (100 per page), starting at {@code pageNum},
     * and parses the towns of each with flag "D", pausing between requests. Stops after
     * {@link #MAX_PAGES_PER_RUN} pages or when the thread is interrupted.
     *
     * @param pageNum first page to process
     */
    public void three(int pageNum) {
        // Local counter: the former static counter leaked between tests run in the same JVM.
        int pagesDone = 0;
        for (int page = pageNum; pagesDone < MAX_PAGES_PER_RUN; page++) {
            PageInfo<Location> pageInfo = locationService.getLocationsByLv(2, page, 100);
            if (isPastLastPage(pageInfo)) {
                return;
            }
            pagesDone++;
            for (Location location : pageInfo.getList()) {
                String url2 = new StringBuilder().append(URL)
                        .append(AreaCodeStringUtils.getUrlStrByLocationCode(location.getLocalCode(), 2))
                        .append(location.getUrl()).toString();
                processService.initLevelThrid(url2, location, "D");
                try {
                    int times = AreaCodeStringUtils.getSecond(3);
                    TimeUnit.SECONDS.sleep(times);
                } catch (InterruptedException e) {
                    log.error("msg={} ", e.getMessage());
                    // Restore the flag and stop instead of hammering on after an interrupt.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }

    /**
     * Parses the villages and communities of every stored town carrying flag "D".
     */
    @Test
    public void intLevelFour() {
        Location location = new Location();
        location.setLv(3);
        location.setFlag("D");
        village(0, location);
    }

    /**
     * Walks the rows matching {@code location} page by page, starting at {@code pageNum}, and
     * parses the villages of each page. Stops after {@link #MAX_PAGES_PER_RUN} pages.
     *
     * <p>NOTE(review): paging starts at 0 here but at 1 elsewhere, and processed rows have
     * their flag changed, which presumably shrinks the filtered result while the page number
     * advances -- confirm that no rows are skipped.
     *
     * @param pageNum  first page to process
     * @param location filter carrying the level and flag to select
     */
    public void village(int pageNum, Location location) {
        int pagesDone = 0;
        for (int page = pageNum; pagesDone < MAX_PAGES_PER_RUN; page++) {
            PageInfo<Location> pageInfo = locationService.getLocationsByLvAndFlag(page, 2, location);
            log.info("{}", pageInfo);
            if (isPastLastPage(pageInfo)) {
                return;
            }
            pagesDone++;
            List<Location> locations = pageInfo.getList();
            if (!locations.isEmpty()) {
                processService.initLevelFour(URL, locations);
            }
        }
    }

    /**
     * Tells whether the query returned no pages at all or a page beyond the last one.
     *
     * @param pageInfo result of a paged query
     * @return {@code true} when there is nothing left to process
     */
    private static boolean isPastLastPage(PageInfo<?> pageInfo) {
        return pageInfo.getPages() == 0 || pageInfo.getPageNum() > pageInfo.getPages();
    }
}
2.1.4. 打开浏览器
访问 http://localhost:9090/region/ip=109.27.45.12
这是我之前一个例子,用来解析IP地址,获取地域信息的。
2.1.5. 源码地址,如果觉得对你有帮助,请Star
觉得对你有帮助,请Star
Github源码地址
Gitee源码地址