国家统计局的统计用区划和城乡划分代码的网址为 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/ ，通过爬取该网址可以获取最新的全国省、市、区、乡镇、村的数据。
本文通过springboot结合jsoup技术实现模拟爬虫爬取全国的省市数据,其他层级的数据如县乡镇村的数据可以由同学们自己练习。
一、数据库表
-- Province table: one row per province, holding its display name and code.
CREATE TABLE province (
    id   INT NOT NULL AUTO_INCREMENT,  -- surrogate key generated by MySQL
    name VARCHAR(64),
    code VARCHAR(32),
    PRIMARY KEY (id)
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8;
-- City table: one row per city, holding its display name and code.
CREATE TABLE city (
    id          INT NOT NULL AUTO_INCREMENT,  -- surrogate key generated by MySQL
    province_id INT,                          -- id of the owning province row (no FK constraint declared)
    name        VARCHAR(64),
    code        VARCHAR(32),
    PRIMARY KEY (id)
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8;
二、创建工程demo012
pom.xml的内容为
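<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.yinww</groupId>
        <artifactId>demo-springboot2</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>
    <artifactId>demo012</artifactId>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.2</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
    </dependencies>
</project>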
三、Java类
主类
package com.yinww.demo.springboot2.demo012;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
/**
 * Spring Boot entry point of the demo012 application.
 */
@SpringBootApplication
public class Demo012Application {

    /**
     * Starts the Spring application with this class as the primary source.
     *
     * @param args command-line arguments, passed through to Spring Boot unchanged
     */
    public static void main(String[] args) {
        // Instance form of the static SpringApplication.run(Class, String...) convenience call.
        SpringApplication application = new SpringApplication(Demo012Application.class);
        application.run(args);
    }
}
工具类:
/**
 * Small HTTP helper that downloads a page with {@link HttpURLConnection} and parses it with jsoup.
 */
public class HttpUtil {

    /**
     * Issues an HTTP GET for {@code url} and parses the response body into a jsoup document.
     *
     * <p>The response stream is always closed and the connection is always disconnected,
     * whether or not parsing succeeds.
     *
     * @param url     absolute URL to request; also handed to jsoup as the document's base URI
     * @param charset name of the character set used to decode the response body
     * @return the parsed document, or {@code null} if reading or parsing the response failed
     *         (the failure is printed to the console)
     * @throws IOException if {@code url} is malformed or the connection cannot be opened or configured
     */
    public static Document get(String url, String charset) throws IOException {
        String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";
        URL url2 = new URL(url);
        HttpURLConnection connection = (HttpURLConnection) url2.openConnection();
        try {
            connection.setRequestMethod("GET");
            // Disable caching (the default is true).
            connection.setUseCaches(Boolean.FALSE);
            // Request headers.
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", userAgent);
            // Connect timeout, in milliseconds.
            connection.setConnectTimeout(80000);
            // Read timeout, in milliseconds.
            connection.setReadTimeout(80000);

            // Fully-qualified type name so this block needs no additional import.
            // try-with-resources closes the body stream even when parsing throws.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                // Best effort: callers treat a null result as "page unavailable".
                System.out.println("parse error: " + url);
                e.printStackTrace();
                return null;
            }
        } finally {
            // Release the underlying socket; the request header already asks for "Connection: close".
            connection.disconnect();
        }
    }
}
控制器
/**
 * REST controller that crawls the statistics-bureau division-code pages and stores the
 * provinces and cities it finds through {@link SpiderService}.
 */
@RestController
public class SpiderController {

    @Autowired
    private SpiderService spiderService;

    /**
     * Crawls the most recent year's province list and, for every province, its city list.
     *
     * <p>Each province is saved before its cities are fetched, because the cities are
     * stored with the id read back from the saved province.
     *
     * @return {@code 0} if the index page or the latest-year page cannot be loaded or does not
     *         contain the expected year link; otherwise the string {@code "spider crawl end."}
     * @throws Exception if a request URL is malformed or saving fails
     */
    @GetMapping({"/", ""})
    public Object spider() throws Exception {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
        String charset = "gb2312";

        Document rootDoc = HttpUtil.get(url, charset);
        if (rootDoc == null) {
            return 0;
        }

        // The first link of the first list container points at the most recent year.
        Elements yearLists = rootDoc.getElementsByClass("center_list_contlist");
        if (yearLists.isEmpty()) {
            return 0;
        }
        Elements yearLinks = yearLists.get(0).select("a");
        if (yearLinks.isEmpty()) {
            return 0;
        }
        // absUrl resolves a relative href against the page's base URI; an absolute href is returned as-is.
        String yearHref = yearLinks.get(0).absUrl("href");
        if (yearHref.isEmpty()) {
            return 0;
        }

        Document doc = HttpUtil.get(yearHref, charset);
        if (doc == null) {
            // Same failure signal as for the index page.
            return 0;
        }

        // Province links are relative to the directory of the year page.
        String baseUrl = yearHref.substring(0, yearHref.lastIndexOf("/") + 1);

        // Walk every province row; each row holds several province links.
        Elements provinceElements = doc.getElementsByClass("provincetr");
        for (Element element : provinceElements) {
            for (Element aEle : element.select("a")) {
                String name = aEle.text();
                String provincesHref = aEle.attr("href");
                String code = extractCode(provincesHref);

                Province province = new Province(name, code);
                spiderService.saveProvince(province);
                getCities(baseUrl + provincesHref, charset, province.getId());
            }
        }
        return "spider crawl end.";
    }

    /**
     * Loads one province page and saves every city listed on it.
     *
     * @param url        absolute URL of the province page
     * @param charset    character set name used to decode the page
     * @param provinceId id of the province the cities belong to
     * @throws Exception if the URL is malformed or saving fails
     */
    private void getCities(String url, String charset, int provinceId) throws Exception {
        Document rootDoc = HttpUtil.get(url, charset);
        if (rootDoc == null) {
            return;
        }
        for (Element cityElement : rootDoc.getElementsByClass("citytr")) {
            Elements links = cityElement.select("a");
            if (links.size() < 2) {
                // No second link means there is no name/href to read for this row; skip it.
                continue;
            }
            Element aEle = links.get(1); // the second link carries the city name
            City city = new City();
            city.setName(aEle.text());
            city.setProvinceId(provinceId);
            city.setCode(extractCode(aEle.attr("href")));
            spiderService.saveCity(city);
        }
    }

    /**
     * Extracts the code from a page link: the file name without its extension,
     * e.g. {@code "11.html"} gives {@code "11"} and {@code "11/1101.html"} gives {@code "1101"}.
     *
     * @param href link target taken from the page
     * @return the text between the last {@code '/'} and the following {@code '.'}, or everything
     *         after the last {@code '/'} when there is no dot
     */
    private static String extractCode(String href) {
        int start = href.lastIndexOf('/') + 1;
        int end = href.indexOf('.', start);
        return end < 0 ? href.substring(start) : href.substring(start, end);
    }
}
四、运行程序
启动程序后,访问 http://localhost:8080/ 等待程序运行结束,再查看数据库可获取到31条省的数据,343条市的数据。
整体思路还是比较清晰,重点在于掌握jsoup的几个api的用法,其他的都是常规的springboot代码。
本文内容到此结束,更多内容可关注公众号和个人微信号: