// 获取这个页面的信息:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html
// 第一次写的版本按省、市、县、镇、村嵌套了好几层循环,运行时总是出现 read timed out 错误。
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawler for the administrative-division listing that starts at {@link #START_URL}.
 *
 * <p>Starting from the index page, the crawler follows the links found in the
 * {@code .provincetr}, {@code .citytr}, {@code .countytr} and {@code .towntr} rows and
 * prints the name found in every {@code .villagetr} row to standard output.
 *
 * <p>Earlier revisions aborted on the first page that timed out more often than the
 * hand-written retries allowed, so a crawl rarely finished in one run. Every page is now
 * fetched through {@link #fetch(String)}, which retries with a growing delay, and a page
 * below the index that still cannot be loaded is reported and skipped instead of ending
 * the whole crawl.
 */
public class Util_Spider_Gov_01 {

    /** Edition (year) of the listing to crawl. */
    private static final int YEAR = 2016;

    /** Index page that lists the provinces. */
    private static final String START_URL =
            "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + YEAR + "/index.html";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    /** Connect/read timeout per request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Maximum number of times a single page is requested before giving up on it. */
    private static final int MAX_ATTEMPTS = 5;

    /** Base pause between attempts; multiplied by the attempt number to back off. */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    /**
     * Row selector used on the page reached at each depth below the index page:
     * a province link leads to city rows, a city link to county rows, and so on.
     * The last entry marks the leaf level whose rows are printed rather than followed.
     */
    private static final String[] ROW_SELECTORS = {".citytr", ".countytr", ".towntr", ".villagetr"};

    /** Running 1-based number of the next village to print. */
    private static int total = 1;

    /** Number of pages that were skipped because every fetch attempt failed. */
    private static int failedPages = 0;

    /**
     * Runs the crawl and prints every village name found.
     *
     * @param args ignored
     * @throws Exception if the index page cannot be loaded, or if the thread is interrupted
     *         while waiting between retries
     */
    public static void main(String[] args) throws Exception {
        // Without the index page there is nothing to crawl, so a failure here propagates.
        Document provincePage = fetch(START_URL);

        // TODO: persisting results to the database is not implemented; getConnection() is unused here.
        for (Element provinceLink : provincePage.body().select(".provincetr > td > a")) {
            String provinceUrl = provinceLink.absUrl("href");
            if (!provinceUrl.isEmpty()) {
                crawl(provinceUrl, 0);
            }
        }

        if (failedPages > 0) {
            System.err.println(failedPages + " page(s) could not be loaded and were skipped");
        }
        System.out.println("抓完了");
    }

    /**
     * Opens a JDBC connection to the local {@code villagecount} MySQL database.
     *
     * <p>NOTE(review): host, user and password are hard-coded; consider moving them to
     * configuration.
     *
     * @return a new connection; the caller is responsible for closing it
     * @throws Exception if the driver class cannot be loaded or the connection fails
     */
    public static Connection getConnection() throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        return DriverManager.getConnection("jdbc:mysql://localhost:3306/villagecount", "root", "root");
    }

    /**
     * Loads the page at {@code url} and processes the rows expected at {@code level}:
     * leaf rows are printed, all other rows are followed one level deeper.
     */
    private static void crawl(String url, int level) throws InterruptedException {
        Document page = fetchOrSkip(url);
        if (page == null) {
            return;
        }

        Elements rows = page.body().select(ROW_SELECTORS[level]);
        if (level == ROW_SELECTORS.length - 1) {
            printVillages(rows);
            return;
        }
        for (Element row : rows) {
            String childUrl = childUrl(row);
            if (childUrl != null) {
                crawl(childUrl, level + 1);
            }
        }
    }

    /** Prints the name cell (third {@code td}) of every village row and advances the counter. */
    private static void printVillages(Elements villageRows) {
        for (Element villageRow : villageRows) {
            Elements cells = villageRow.select("td");
            if (cells.size() < 3) {
                // Malformed row: skip it rather than fail on a missing cell.
                continue;
            }
            System.out.println("第" + total + "个村庄名字:" + cells.get(2).text());
            System.out.println("=============");
            total++;
        }
    }

    /**
     * Returns the absolute URL of the second link in {@code row}, or {@code null} when the
     * row has fewer than two links or the link cannot be resolved to an absolute URL.
     */
    private static String childUrl(Element row) {
        Elements links = row.select("a[href]");
        if (links.size() < 2) {
            return null;
        }
        String url = links.get(1).absUrl("href");
        return url.isEmpty() ? null : url;
    }

    /** Fetches {@code url}; on final failure reports it, counts it, and returns {@code null}. */
    private static Document fetchOrSkip(String url) throws InterruptedException {
        try {
            return fetch(url);
        } catch (IOException e) {
            failedPages++;
            System.err.println("Skipping page: " + e.getMessage() + " (" + e.getCause() + ")");
            return null;
        }
    }

    /**
     * Downloads and parses {@code url}, retrying up to {@link #MAX_ATTEMPTS} times with a
     * pause that grows after each failed attempt.
     *
     * @throws IOException if every attempt failed; the last failure is attached as the cause
     * @throws InterruptedException if the thread is interrupted while pausing between attempts
     */
    private static Document fetch(String url) throws IOException, InterruptedException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return Jsoup.connect(url)
                        .userAgent(USER_AGENT)
                        .timeout(TIMEOUT_MILLIS)
                        .get();
            } catch (IOException e) {
                lastFailure = e;
                System.err.println("Attempt " + attempt + "/" + MAX_ATTEMPTS + " failed for " + url + ": " + e);
                if (attempt < MAX_ATTEMPTS) {
                    // Back off before retrying so a struggling server is not hit again immediately.
                    Thread.sleep(RETRY_DELAY_MILLIS * attempt);
                }
            }
        }
        throw new IOException("Giving up on " + url + " after " + MAX_ATTEMPTS + " attempts", lastFailure);
    }
}