1:POM文件依赖
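<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.36</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.7</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>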
2:Main方法
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.math.NumberUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 *****************************
 * Crawls the region codes (province, city and county level) published on
 * stats.gov.cn and stores them in the {@code bus_region} table.
 *
 * <p>The crawl starts at the 2017 index page. Province and city pages enqueue
 * the links they contain; county pages are parsed but their links are not
 * followed. The numeric id of every region is taken from the trailing
 * {@code <digits>.html} part of its link.
 *
 * @author LinYingQiang
 * @date 2018-07-18 11:51:08
 * @mobile
 *****************************
 */
public class StatisticsPageProcessor implements PageProcessor {

    static String driver = "com.mysql.jdbc.Driver";
    static String url = "jdbc:mysql://***.***.***:3306/wool_trade?characterEncoding=utf8&useSSL=false";
    static String username = "***";
    static String password = "***";
    static Connection conn = null;

    /** Matches the trailing {@code <digits>.html} of a link; group 1 is the numeric id. */
    private static final Pattern TRAILING_ID = Pattern.compile("(\\d+)\\.html$");

    /** URL shape of a city-level page (one numeric path segment). */
    private static final String CITY_PAGE_REGEX =
            "http://www\\.stats\\.gov\\.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/[0-9]*\\.html";

    /** URL shape of a county-level page (two numeric path segments). */
    private static final String AREA_PAGE_REGEX =
            "http://www\\.stats\\.gov\\.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/[0-9]*/[0-9]*\\.html";

    // Sample row: INSERT INTO `wool_trade`.`bus_region` (`id`, `name`, `p_id`) VALUES ('1', '北京市', '0');
    private static final String INSERT_SQL =
            "insert into `bus_region`(`id`,`name`,`p_id`) VALUES (?, ?, ?)";

    static {
        try {
            Class.forName(driver); // load the JDBC driver through the class loader
            conn = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    @Override
    public Site getSite() {
        return Site.me().setRetryTimes(3).setSleepTime(1500);
    }

    /**
     * Parses one crawled page and publishes the regions found on it under the
     * result key {@code "provinces"}, {@code "cities"} or {@code "areas"},
     * depending on which level the page URL belongs to.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().get();
        if (pageUrl.contains("index.html")) { // province level
            page.putField("provinces",
                    extractRegions(page, "tr[@class='provincetr']/td/a", 0, true));
        } else if (page.getUrl().regex(CITY_PAGE_REGEX).match()) { // city level
            // The id in the page's own URL is the parent (province) id of every row.
            page.putField("cities",
                    extractRegions(page, "tr[@class='citytr']/td[2]/a", parseTrailingId(pageUrl), true));
        } else if (page.getUrl().regex(AREA_PAGE_REGEX).match()) { // county level
            // The id in the page's own URL is the parent (city) id of every row.
            page.putField("areas",
                    extractRegions(page, "tr[@class='countytr']/td[2]/a", parseTrailingId(pageUrl), false));
        }
    }

    /**
     * Builds one {@code TargetModel} per link selected by {@code linkXpath}.
     *
     * @param page      the page to read from
     * @param linkXpath XPath selecting the anchor elements of the regions
     * @param parentId  parent id assigned to every region, may be {@code null}
     * @param follow    whether each region link is queued for crawling
     * @return the regions found; empty when names and links do not line up
     */
    private static List<TargetModel> extractRegions(Page page, String linkXpath,
                                                    Integer parentId, boolean follow) {
        List<String> names = page.getHtml().xpath(linkXpath + "/text()").all();
        List<String> links = page.getHtml().xpath(linkXpath).links().all();
        List<TargetModel> regions = new ArrayList<>();
        if (names.size() != links.size()) {
            // Names and links can only be paired by index when both lists match.
            return regions;
        }
        for (int i = 0; i < names.size(); i++) {
            String link = links.get(i);
            TargetModel region = new TargetModel();
            Integer id = parseTrailingId(link);
            if (id != null) {
                region.setId(id);
            }
            region.setpId(parentId);
            region.setName(names.get(i));
            region.setUrls(link);
            regions.add(region);
            if (follow) {
                page.addTargetRequest(link);
            }
        }
        return regions;
    }

    /**
     * Extracts the number in front of a trailing {@code .html}.
     *
     * @param link the URL to inspect
     * @return the parsed id, or {@code null} when the URL has no such suffix
     *         or the digits do not fit into an {@code int}
     */
    private static Integer parseTrailingId(String link) {
        Matcher matcher = TRAILING_ID.matcher(link);
        if (!matcher.find()) {
            return null;
        }
        try {
            return Integer.valueOf(matcher.group(1));
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Runs the crawl to completion and then inserts every collected province,
     * city and county into {@code bus_region}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        if (conn == null) {
            // The static initialiser already printed the cause; crawling would be wasted work.
            System.err.println("No database connection available, aborting.");
            return;
        }
        CustomPipeline customPipeline = new CustomPipeline();
        Spider spider = Spider.create(new StatisticsPageProcessor());
        spider.addUrl("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html");
        spider.addPipeline(customPipeline);
        // run() blocks until the crawl has finished, so no polling of the status is needed.
        spider.run();

        List<TargetModel> provinces = customPipeline.provinces;
        // One statement is reused for all rows; it and the connection are closed on exit.
        try (Connection connection = conn;
             PreparedStatement statement = connection.prepareStatement(INSERT_SQL)) {
            if (provinces == null) {
                System.err.println("The crawl produced no provinces, nothing to insert.");
                return;
            }
            for (TargetModel province : provinces) {
                insertRegion(statement, province);
                for (TargetModel city : province.getChilds()) {
                    insertRegion(statement, city);
                    for (TargetModel area : city.getChilds()) {
                        insertRegion(statement, area);
                    }
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts a single region row, skipping regions whose id or parent id
     * could not be determined.
     *
     * @param statement prepared {@code INSERT_SQL} statement
     * @param region    the region to store
     * @throws SQLException if the insert fails
     */
    private static void insertRegion(PreparedStatement statement, TargetModel region)
            throws SQLException {
        Integer id = region.getId();
        Integer parentId = region.getpId();
        if (id == null || parentId == null) {
            System.err.println("Skipping region without id or parent id: " + region.getName());
            return;
        }
        statement.setInt(1, id);
        statement.setString(2, region.getName());
        statement.setInt(3, parentId);
        statement.executeUpdate();
    }
}
3:Pipeline类
import java.util.List;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 *****************************
 * Pipeline that assembles the crawled regions into a tree: cities are
 * attached to the province whose id equals their parent id, and areas are
 * attached to the matching city.
 *
 * @author LinYingQiang
 * @date 2018-07-18 11:52:12
 * @mobile
 *****************************
 */
public class CustomPipeline implements Pipeline {

    /** Provinces from the most recent "provinces" result; {@code null} until one arrives. */
    List<TargetModel> provinces;

    /**
     * Stores a "provinces" result, or links a "cities" / "areas" result into
     * the provinces collected so far.
     *
     * @param resultItems the fields published for one page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<TargetModel> crawledProvinces = resultItems.get("provinces");
        if (crawledProvinces != null) {
            provinces = crawledProvinces;
        }
        if (provinces == null || provinces.isEmpty()) {
            // Without provinces there is nothing to attach cities or areas to.
            return;
        }

        List<TargetModel> cities = resultItems.get("cities");
        if (cities != null) {
            for (TargetModel province : provinces) {
                attachChildren(province, cities);
            }
        }

        List<TargetModel> areas = resultItems.get("areas");
        if (areas != null) {
            for (TargetModel province : provinces) {
                for (TargetModel city : province.getChilds()) {
                    attachChildren(city, areas);
                }
            }
        }
    }

    /**
     * Adds every candidate whose parent id equals the parent's id to the
     * parent's child list.
     *
     * @param parent     the region receiving children
     * @param candidates the regions to test against the parent
     */
    private static void attachChildren(TargetModel parent, List<TargetModel> candidates) {
        if (parent.getId() == null) {
            // An id that could not be parsed can never match a child.
            return;
        }
        for (TargetModel candidate : candidates) {
            if (parent.getId().equals(candidate.getpId())) {
                parent.getChilds().add(candidate);
            }
        }
    }
}