使用jsoup来扒取地域数据
数据源:http://www.xzqh.org/html/list/10034.html (能力范围内找的比较全的台湾地区数据)
运行前需要先导入 jsoup 所需的 jar 包(代码中还用到了 Gson,同样需要导入对应的 jar 包)
package test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
public class Test {
// NOTE(review): the declaration on the next line is truncated (type arguments, field name
// and initializer are missing). The CSSMap.put(int, String) calls further down suggest it
// was presumably the CSSMap field mapping Integer to String - TODO confirm against the original.
private static Map
// Map
// Writer for the data file; initFile() opens it on D:\\CityInfo.csv and closeStream() closes it.
private static BufferedWriter bufferedWriter = null;
// Writer for the relation file; initFile() opens it on D:\\Relation.csv and closeStream() closes it.
private static BufferedWriter bufferedWriter2 = null;
// Shared Gson instance, built with HTML escaping disabled; used by toJson(Object).
private static Gson gson = new GsonBuilder().disableHtmlEscaping().create();
/**
 * Serializes an object to its JSON text using the class-wide Gson instance.
 *
 * @param src the object to serialize
 * @return the JSON produced by Gson for {@code src}
 */
public static String toJson(Object src) {
    final String json = gson.toJson(src);
    return json;
}
// Registers CSS class names in CSSMap, keyed by a level number.
// The initializer block is restored here: with "static {" and "}" commented out, the two
// put(...) calls were bare statements in the class body, which Java does not allow.
static {
    // Earlier alternatives, kept for reference:
    // CSSMap.put(1, "list_nav");// province
    // CSSMap.put(2, "list_nav");// province
    CSSMap.put(1, "provincetr");// province
    CSSMap.put(2, "citytr");// city
    // Deeper levels are left disabled:
    // CSSMap.put(3, "countytr");// county
    // CSSMap.put(4, "towntr");// town
    // CSSMap.put(5, "villagetr");// village
}
/**
 * Program entry point: opens the output files and fetches the region index page.
 *
 * NOTE(review): the method body is cut off after the bare "List" token at the end of this
 * block (the rest, including the closing brace, is missing from the source), so what happens
 * with the selected elements cannot be determined from here.
 *
 * @param args command-line arguments; not read in the visible part of the body
 * @throws IOException declared by the signature
 */
@SuppressWarnings("unchecked")
public static void main(String[] args) throws IOException {
System.out.println("asdfasdfa");
int level = 1;
// Open the two output writers before any crawling starts.
initFile();
int count = 1;
// Fetch the page data
// NOTE(review): connect(...) returns null when the request fails with an IOException,
// in which case the select(...) call below would throw a NullPointerException.
Document connect = connect("http://www.xzqh.org/html/list/10034.html");
// Pick the div elements carrying the CSS class "list_nav".
Elements rowProvince = connect.select("div.list_nav");
List
/**
 * Opens the two output files in append mode and stores the writers in
 * {@code bufferedWriter} (data file, CityInfo.csv) and {@code bufferedWriter2}
 * (relation file, Relation.csv). Adapt the paths to your own needs.
 *
 * The author stores the result in a graph database rather than MySQL, which is why two
 * files are produced: one with the region records and one with the parent/child relations.
 *
 * The files are written as UTF-8 explicitly. {@code FileWriter} would use the platform
 * default charset, so the Chinese region names could end up in a machine-dependent encoding.
 *
 * If opening a file fails, the stack trace is printed and the affected writer (and any
 * writer not yet opened) keeps its previous value.
 */
private static void initFile() {
    try {
        bufferedWriter = openAppendingUtf8Writer("D:\\CityInfo.csv");
        bufferedWriter2 = openAppendingUtf8Writer("D:\\Relation.csv");
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/** Opens {@code path} for appending and wraps it in a buffered UTF-8 writer. */
private static BufferedWriter openAppendingUtf8Writer(String path) throws IOException {
    // The charset is given by name so the code also compiles on old Java versions;
    // every Java platform is required to support "UTF-8".
    return new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(new File(path), true), "UTF-8"));
}
/**
 * Closes both output writers and resets their fields to {@code null}.
 * A writer that is already {@code null} is skipped. A failure while closing one writer
 * is printed as a stack trace and does not prevent the other writer from being closed.
 */
private static void closeStream() {
    closeAndReport(bufferedWriter);
    bufferedWriter = null;

    closeAndReport(bufferedWriter2);
    bufferedWriter2 = null;
}

/** Closes {@code writer} if it is non-null, printing the stack trace of any IOException. */
private static void closeAndReport(BufferedWriter writer) {
    if (writer == null) {
        return;
    }
    try {
        writer.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Fetches the page linked from {@code parentElement} and collects two levels of region records.
 *
 * For every link inside a "div.list_layout" element whose text ends with "行政区划", a record
 * with level "3", a random UUID as regionId and the fixed parentId "710000000000" is added to
 * resList. The linked page is then fetched and each table row (except the first row and rows
 * whose first token is "合计") becomes a level "4" record whose parentId is that UUID. The
 * level "4" records are attached to the level "3" record under the key "childList".
 *
 * NOTE(review): the parameter list is truncated after "List", and the declarations of
 * keyList, tmp1 and tmp inside the body are truncated as well (bare "List" / "Map" tokens).
 * The missing text cannot be recovered from this source - TODO restore from the original.
 *
 * @param parentElement element whose "abs:href" attribute gives the page to fetch
 * @param level not read anywhere in the visible body
 * @throws IOException declared by the signature
 */
private static void parseNextLevel(Element parentElement, int level, List
throws IOException {
try {
Thread.sleep(500);// Pause briefly; otherwise requests may come back with assorted error status codes
} catch (InterruptedException e) {
e.printStackTrace();
}
List
Document doc = connect(parentElement.attr("abs:href"));
// connect(...) returns null when the request failed, so the page is only parsed when present.
if (doc != null) {
Elements newsHeadlines = doc.select("div.list_layout");//
// Walk every matched layout block
for (Element element : newsHeadlines) {
Elements select1 = element.select("a");
for (Element child : select1) {
String text = child.text();
// Only links whose text ends with "行政区划" lead to lower-level data
if (text.endsWith("行政区划")) {
System.out.println(text);
// Assemble the record (fields chosen to suit the author's own needs)
String parentId = UUID.randomUUID().toString();
Map
tmp1.put("regionName", text.replace("行政区划", ""));
tmp1.put("regionId", parentId);
tmp1.put("level", "3");
tmp1.put("type", "1");
tmp1.put("parentId", "710000000000");
resList.add(tmp1);
keyList = new ArrayList
// Query the lower level
Document doc2 = connect(child.attr("abs:href"));
if (doc2 != null) {
// count is only used to skip the first table row.
int count = 1;
Elements elements1 = doc2.select("table").select("tr");
for (Element row : elements1) {
if (count == 1) {
count++;
continue;
}
String[] split = row.select("td").text().split(" ");
String regionName = split[0].replaceAll(" ", "");
// Skip the row whose first token is "合计".
if (regionName.equals("合计")) {
continue;
}
// The region id is taken from the second-to-last token of the row text.
// NOTE(review): a row with fewer than two tokens makes this index negative and
// throws ArrayIndexOutOfBoundsException.
String regionId = split[split.length-2];
Map
tmp.put("regionName", regionName);
tmp.put("regionId", regionId);
tmp.put("level", "4");
tmp.put("type", "1");
tmp.put("parentId", parentId);
keyList.add(tmp);
}
}
// Attach the children only when at least one was collected.
if (keyList != null && !keyList.isEmpty()) {
// System.out.println(keyList.toString());
tmp1.put("childList", keyList);
}
}
}
}
}
}
/**
 * Writes one region record to the output files.
 *
 * A line "regionId,regionName,level,type" is appended to the data file (bufferedWriter) and
 * a line "parentId,regionId" is appended to the relation file (bufferedWriter2). Each writer
 * is flushed after its line. An IOException is caught and its stack trace printed.
 *
 * NOTE(review): the signature is truncated after "Map". The body reads a variable named
 * region, which is presumably the map parameter - TODO confirm. The buffer sb is not
 * declared anywhere in the visible source; presumably a shared StringBuilder - TODO confirm.
 * Values are joined with "," without any quoting or escaping.
 */
private static void printInfo2(Map
try {
String regionId = (String) region.get("regionId");
String regionName = (String) region.get("regionName");
String level = (String) region.get("level");
String type = (String) region.get("type");
// Reset the buffer, then build and write the data line.
sb.setLength(0);
sb.append(regionId).append(",").append(regionName).append(",").append(level).append(",").append(type);
bufferedWriter.write(sb.toString());
bufferedWriter.newLine();
bufferedWriter.flush();
// Reset the buffer again, then build and write the relation line.
sb.setLength(0);
String parentId = (String) region.get("parentId");
sb.append(parentId).append(",").append(regionId);
bufferedWriter2.write(sb.toString());
bufferedWriter2.newLine();
bufferedWriter2.flush();
sb.setLength(0);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Appends one line describing {@code element} to the data file and flushes it.
 * The line has the form: text of the last "div" match, then the level in braces,
 * then the text of the first "div" match in square brackets.
 * An IOException while writing is caught and its stack trace printed.
 *
 * @param element the scraped element whose "div" matches supply the two texts
 * @param level the level number written between the braces
 */
private static void printInfo(Element element, int level) {
    try {
        String lastDivText = element.select("div").last().text();
        String firstDivText = element.select("div").first().text();

        StringBuilder line = new StringBuilder();
        line.append(lastDivText);
        line.append("{").append(level).append("}[");
        line.append(firstDivText).append("]");

        bufferedWriter.write(line.toString());
        bufferedWriter.newLine();
        bufferedWriter.flush();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Downloads and parses the page at {@code url} with jsoup, using a timeout of 100 * 1000.
 *
 * @param url the address to fetch; must be neither null nor empty
 * @return the parsed document, or {@code null} if the request failed with an IOException
 *         (the stack trace is printed in that case)
 * @throws IllegalArgumentException if {@code url} is null or empty
 */
private static Document connect(String url) {
    boolean usable = url != null && !url.isEmpty();
    if (!usable) {
        throw new IllegalArgumentException("The input url('" + url
                + "') is invalid!");
    }

    Document page = null;
    try {
        page = Jsoup.connect(url).timeout(100 * 1000).get();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return page;
}
}
不知道什么原因,国家统计局的行政区划数据里没有港澳台的数据,费了好大的力气才找到这份台湾省的地域数据,可能并不是很全,将就着用吧。代码是在扒取全国地域数据的代码基础上改出来的,勉强能用。