我也是参考了这篇博客 http://www.cnblogs.com/sanmubird/p/7857474.html 写的程序,是可以实现的。只需要修改一下 MyCrawler 代码即可。
package com.lenovo.crawl.main;
import com.lenovo.crawl.entity.Region;
import com.lenovo.crawl.link.LinkFilter;
import com.lenovo.crawl.link.Links;
import com.lenovo.crawl.page.Page;
import com.lenovo.crawl.page.PageParserTool;
import com.lenovo.crawl.page.RequestAndResponseTool;
import com.lenovo.crawl.util.DBCPUtils;
import com.lenovo.crawl.util.FileTool;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanHandler;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Crawler {
// Used with matches() in crawling() on anchor href values: accepts a string that starts with
// one or more ASCII digits, followed by any characters other than line terminators.
static Pattern pattern = Pattern.compile("^[0-9]+.*");
// Used with matches() in crawling() on anchor inner HTML: accepts a string made up solely of
// ASCII digits (the empty string also matches).
static Pattern pattern_ = Pattern.compile("^[0-9]*");
/**
 * Seeds the URL queue: each entry of {@code seeds} is handed to
 * {@code Links.addUnvisitedUrlQueue} in array order.
 *
 * @param seeds seed URLs
 */
private void initCrawlerWithSeeds(String[] seeds) {
    for (String seed : seeds) {
        Links.addUnvisitedUrlQueue(seed);
    }
}
/**
 * Crawl loop.
 *
 * Ensures the directory D:\ja\tempIn exists, opens result.txt inside it for appending,
 * pushes the seeds onto the Links unvisited queue, and then processes URLs until that
 * queue is empty. For every fetched page it:
 *   - inspects each "tr" element and, for the row classes villagetr / provincetr /
 *     citytr / countytr / towntr, prints a code and a name and writes them to the file
 *     as separate lines terminated by "\r\n";
 *   - inspects each "a" element whose href matches {@code pattern}, writes its inner
 *     HTML to the file, and calls this method recursively for the linked page when the
 *     inner HTML is not purely digits;
 *   - saves the page via FileTool, marks the URL as visited, and queues the links
 *     returned by PageParserTool.getLinks(page,"img").
 *
 * Any exception raised inside the try block is caught and reported only as a fixed
 * console message.
 *
 * NOTE(review): this listing looks like it lost text during extraction (generic type
 * arguments, parts of some string literals); see the inline notes. Confirm against the
 * original source before relying on it.
 *
 * @param seeds seed URLs
 */
public void crawling(String[] seeds){
// NOTE(review): this local is never used in the method.
Crawler crawler=new Crawler();
// Output directory.
File file=new File("D:\\ja\\tempIn");
// Create the directory (and any missing parents) when it does not exist yet.
if(!file.exists()){
file.mkdirs();
}
File file1=new File(file,"result.txt");
// NOTE(review): this checks file1 but calls mkdirs() on the directory `file` again, so
// no file is created here; presumably the FileOutputStream below creates result.txt.
if(!file1.exists()){
file.mkdirs();
}
// Output stream for result.txt; stays null if the constructor below throws.
FileOutputStream fos=null;
try{
// Second argument true: bytes are appended to the file.
fos=new FileOutputStream(file1,true);
// Initialise the URL queue.
initCrawlerWithSeeds(seeds);
// Filter accepting URLs that start with the 2018 statistics-code path below.
// NOTE(review): `filter` is not referenced again in this method.
LinkFilter filter = new LinkFilter() {
public boolean accept(String url) {
if (url.startsWith("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018"))
return true;
else
return false;
}
};
// Loop condition: the unvisited queue is not empty (there is no page-count limit here).
while (!Links.unVisitedUrlQueueIsEmpty()) {
// Take the head of the unvisited queue.
String visitUrl = (String) Links.removeHeadOfUnVisitedUrlQueue();
if (visitUrl == null){
continue;
}
// Fetch the page for this URL.
Page page = RequestAndResponseTool.sendRequstAndGetResponse(visitUrl);
// Select the anchor elements and the table rows of the page.
Elements es = PageParserTool.select(page,"a");
Elements esTr = PageParserTool.select(page,"tr");
if(!esTr.isEmpty()){
for (Element e:esTr)
{
// Row class "villagetr": handled only when the row has exactly three child nodes;
// child 0 is written as the code and child 2 as the name.
if(e.attributes().get("class").equals("villagetr")){
// NOTE(review): raw List, yet elements are used as jsoup nodes below; the element
// type (presumably Node) seems to have been lost from this listing.
List nodes=e.childNodes();
if(nodes!=null&&nodes.size()==3){
// NOTE(review): replaceAll("","") has an empty pattern; the original pattern text
// (possibly markup) appears to be missing here. TODO confirm.
System.out.println(nodes.get(0).outerHtml().replaceAll("","").replaceAll(" ",""));
System.out.println(nodes.get(2).outerHtml().replaceAll("","").replaceAll(" ",""));
String code=" "+(nodes.get(0).outerHtml().replaceAll("","").replaceAll(" ","")+" ");
String name=" "+(nodes.get(2).outerHtml().replaceAll("","").replaceAll(" ","")+" ");
fos.write(code.getBytes());
fos.write("\r\n".getBytes());
fos.write(name.getBytes());
fos.write("\r\n".getBytes());
}
}
// Row class "provincetr": every child node of the row is treated as one entry.
if(e.attributes().get("class").equals("provincetr")){
List nodes=e.childNodes();
if(nodes!=null&&nodes.size()>0){
for (Node node:nodes) {
// First child of the cell; its href attribute is read below.
Node value=node.childNodes().get(0);
String name=value.childNodes().get(0).outerHtml();
Attributes attributes=value.attributes();
String a=attributes.get("href");
// Code = first two characters of the href followed by ten zeros.
a=a.substring(0,2);
a=a+"0000000000";
System.out.println(name);
System.out.println(a);
fos.write(a.getBytes());
fos.write("\r\n".getBytes());
fos.write(name.getBytes());
fos.write("\r\n".getBytes());
}
}
}
// Row class "citytr": code is read from under child 0, name from under child 1;
// each is prefixed with a space and written code first, then name.
if(e.attributes().get("class").equals("citytr")){
List nodes=e.childNodes();
if(nodes!=null&&nodes.size()>0){
Node codeNode=nodes.get(0).childNodes().get(0);
String code=codeNode.childNodes().get(0).outerHtml();
Node nameCode=nodes.get(1).childNodes().get(0);
String name=nameCode.childNodes().get(0).outerHtml();
code=" "+code;
name=" "+name;
System.out.println(name);
System.out.println(code);
fos.write(code.getBytes());
fos.write("\r\n".getBytes());
fos.write(name.getBytes());
fos.write("\r\n".getBytes());
}
}
// Row class "countytr": same extraction and output as the "citytr" branch.
if(e.attributes().get("class").equals("countytr")){
List nodes=e.childNodes();
if(nodes!=null&&nodes.size()>0){
Node codeNode=nodes.get(0).childNodes().get(0);
String code=codeNode.childNodes().get(0).outerHtml();
Node nameCode=nodes.get(1).childNodes().get(0);
String name=nameCode.childNodes().get(0).outerHtml();
code=" "+code;
name=" "+name;
System.out.println(name);
System.out.println(code);
fos.write(code.getBytes());
fos.write("\r\n".getBytes());
fos.write(name.getBytes());
fos.write("\r\n".getBytes());
}
}
// Row class "towntr": same extraction and output as the "citytr" branch.
if(e.attributes().get("class").equals("towntr")){
// Region region=new Region();
List nodes=e.childNodes();
if(nodes!=null&&nodes.size()>0){
Node codeNode=nodes.get(0).childNodes().get(0);
String code=codeNode.childNodes().get(0).outerHtml();
Node nameCode=nodes.get(1).childNodes().get(0);
String name=nameCode.childNodes().get(0).outerHtml();
code=" "+code;
name=" "+name;
System.out.println(name);
System.out.println(code);
fos.write(code.getBytes());
fos.write("\r\n".getBytes());
fos.write(name.getBytes());
fos.write("\r\n".getBytes());
}
}
}
}
if(!es.isEmpty()){
for (Element e:es
) {
// Only anchors whose href starts with a digit are processed.
String key=e.attributes().get("href");
Matcher m = pattern.matcher(key);
if(m.matches()){
// NOTE(review): the string literals below are split across two lines in this
// listing; the original pattern text is presumably lost. TODO confirm.
System.out.println(e.html().replaceAll("
", ""));
fos.write((e.html().replaceAll("
", "")+" ").getBytes());
fos.write("\r\n".getBytes());
// Recurse only when the anchor's inner HTML is not purely digits. The target URL is
// visitUrl up to and including its last "/" followed by the href.
// NOTE(review): each recursive call opens its own FileOutputStream on result.txt
// and runs its own loop over the Links queue.
if(!pattern_.matcher(e.html()).matches()) {
crawling(new String[]{visitUrl.substring(0, visitUrl.lastIndexOf("/") + 1) + key});
}
}
}
}
// Save the page locally.
FileTool.saveToLocal(page);
// Record this URL as visited.
Links.addVisitedUrlSet(visitUrl);
// Queue the links obtained with the "img" selector.
// NOTE(review): raw Set iterated as String; the type argument seems to be missing.
Set links = PageParserTool.getLinks(page,"img");
for (String link : links) {
Links.addUnvisitedUrlQueue(link);
}
}
}catch(Exception e){
// NOTE(review): every exception type lands here and only this fixed message is
// printed; the exception itself is not reported.
System.out.println("--------IO异常----------");
}finally {
try{
// If fos is still null this throws, and the catch below reports it.
fos.close();
}catch(Exception e){
System.out.println("------被要关闭的文件不存在-------");
}
}
}
/**
 * Reads the given text file line by line and updates rows of the "region" table.
 *
 * Lines are split by position: odd-numbered lines (counting from 1) go into codeList
 * and even-numbered lines into nameList. The visible remainder looks up a parent row
 * by code and writes province / city information back with UPDATE statements.
 *
 * NOTE(review): parts of this method are missing from the listing. Two loop headers
 * are truncated and fused with later statements, and the try block has no visible
 * catch or finally. Treat the comments below as describing only what survives.
 *
 * @param fileName path of the text file to read
 */
public static void readFileByLines(String fileName) {
File file = new File(fileName);
BufferedReader reader = null;
try {
QueryRunner qr = new QueryRunner(DBCPUtils.getDataSource());
// NOTE(review): in the visible lines this connection is only closed at the end; the
// queries go through `qr`.
Connection connection=DBCPUtils.getDataSource().getConnection();
// NOTE(review): not referenced in the lines that survive in this listing.
String sqlInsert = "INSERT INTO region VALUES (?,?,?,?,?,?)";
reader = new BufferedReader(new FileReader(file));
List codeList=new ArrayList();
List nameList=new ArrayList();
String tempString = null;
int line = 1;
// Read one line at a time; a null return marks the end of the file.
while ((tempString = reader.readLine()) != null) {
if(line%2==0){
nameList.add(tempString);
}else{
codeList.add(tempString);
}
line++;
}
// NOTE(review): the next statement is truncated: a loop header runs straight into a
// query assignment. The loop body and the declaration of `sql` are not visible.
for(int i=0;i list=qr.query(sql,new BeanListHandler(Region.class));
// Object [][] params=new Object[344][];
// int k=0;
// NOTE(review): truncated again: a loop header runs into the tail of a query call. The
// declarations of `region`, `region1`, `level` and `param` are not visible.
for (int i=0;i(Region.class),param);
// Copy the province from the looked-up row and store it, together with that row's code
// as provinceCode and a single space as cityCode.
String province=region1.getProvince();
region.setProvince(province);
Object [] paramUpdate={region.getProvince(),region1.getCode()," ",region.getCode()};
String updateSql="update region set province=?,provinceCode=?,cityCode=? where code=?";
qr.update(updateSql,paramUpdate);
}
// Level "3": load the parent row by parentCode, copy its province and city, and store
// them with the parent's provinceCode and the parent's code as cityCode.
if(level.equals("3")){
String parentCode=region.getParentCode();
String sql2="select * from region where code=?";
Object [] param={parentCode};
Region region1=qr.query(sql2,new BeanHandler(Region.class),param);
String province=region1.getProvince();
String city=region1.getCity();
region.setProvince(province);
region.setCity(city);
Object [] paramUpdate={region.getProvince(),region.getCity(),region1.getProvinceCode(),region1.getCode(),region.getCode()};
String updateSql="update region set province=?,city=?,provinceCode=?,cityCode=? where code=?";
qr.update(updateSql,paramUpdate);
}
}
// NOTE(review): `reader` is not closed anywhere in the visible lines.
connection.close();
}
// Program entry point: starts a crawl from the single seed URL below.
public static void main(String[] args) throws Exception{
    String[] seeds = {"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"};
    new Crawler().crawling(seeds);
}
}
Region 对象如下:
package com.lenovo.crawl.entity;
public class Region {
private String code;
private String provinceCode;
private String province;
private String cityCode;
private String city;
private String countyCode;
private String county;
private String parentCode;
private String level;
public String getCode() {
return code;
}
public void setCode(String code) {
this.code = code;
}
public String getProvinceCode() {
return provinceCode;
}
public void setProvinceCode(String provinceCode) {
this.provinceCode = provinceCode;
}
public String getProvince() {
return province;
}
public void setProvince(String province) {
this.province = province;
}
public String getCityCode() {
return cityCode;
}
public void setCityCode(String cityCode) {
this.cityCode = cityCode;
}
public String getCity() {
return city;
}
public void setCity(String city) {
this.city = city;
}
public String getCountyCode() {
return countyCode;
}
public void setCountyCode(String countyCode) {
this.countyCode = countyCode;
}
public String getCounty() {
return county;
}
public void setCounty(String county) {
this.county = county;
}
public String getParentCode() {
return parentCode;
}
public void setParentCode(String parentCode) {
this.parentCode = parentCode;
}
public String getLevel() {
return level;
}
public void setLevel(String level) {
this.level = level;
}
public Region() {
}
public Region(String code, String provinceCode, String province, String cityCode, String city, String countyCode, String county, String parentCode, String level) {
this.code = code;
this.provinceCode = provinceCode;
this.province = province;
this.cityCode = cityCode;
this.city = city;
this.countyCode = countyCode;
this.county = county;
this.parentCode = parentCode;
this.level = level;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Region region = (Region) o;
if (code != null ? !code.equals(region.code) : region.code != null) return false;
if (provinceCode != null ? !provinceCode.equals(region.provinceCode) : region.provinceCode != null)
return false;
if (province != null ? !province.equals(region.province) : region.province != null) return false;
if (cityCode != null ? !cityCode.equals(region.cityCode) : region.cityCode != null) return false;
if (city != null ? !city.equals(region.city) : region.city != null) return false;
if (countyCode != null ? !countyCode.equals(region.countyCode) : region.countyCode != null) return false;
if (county != null ? !county.equals(region.county) : region.county != null) return false;
if (parentCode != null ? !parentCode.equals(region.parentCode) : region.parentCode != null) return false;
return level != null ? level.equals(region.level) : region.level == null;
}
@Override
public int hashCode() {
int result = code != null ? code.hashCode() : 0;
result = 31 * result + (provinceCode != null ? provinceCode.hashCode() : 0);
result = 31 * result + (province != null ? province.hashCode() : 0);
result = 31 * result + (cityCode != null ? cityCode.hashCode() : 0);
result = 31 * result + (city != null ? city.hashCode() : 0);
result = 31 * result + (countyCode != null ? countyCode.hashCode() : 0);
result = 31 * result + (county != null ? county.hashCode() : 0);
result = 31 * result + (parentCode != null ? parentCode.hashCode() : 0);
result = 31 * result + (level != null ? level.hashCode() : 0);
return result;
}
}
我是先把数据爬取完,写到了txt文件里,然后去解析txt文件,把数据整理成了结构化的(为了实现三级联动),各自加了一个level的属性,然后写到了数据库里。(上面的代码,如果不想这样输出,可以写个递归,按层级输出。)
我的txt链接在这:https://download.csdn.net/download/qq_29281307/11191484