最近有个项目需要拿到百度地图中XXX市所有学校的边界坐标。经过一下午的努力,终于成功拿到了坐标,并对坐标数据进行了还原和验证。特此记录一下具体实现过程和爬取思路。
POI(Point of Interest)可以翻译为“兴趣点”,就是地图上任何非地理意义的有意义的点:比如商店、酒吧、加油站、医院、学校、车站等。不属于POI的是有地理意义的坐标:城市、河流、山峰等。
实现思路一共分三步。
第一步先找到边界坐标存放的JSON请求接口,也就是Request URL。
第二步解析URL,编写程序拿到Response返回值的JSON字符串。
第三步解析JSON字符串拿到解密后的坐标点,对坐标点进行还原和验证。
打开百度地图随便搜索一所学校
可以看到该学院的边界已经自动被画出来了,那么边界坐标信息就肯定存在搜索接口返回的JSON中。
最终我在该URL的JSON里找到了湖南涉外经济学院的坐标点和边界坐标集合。geo:坐标点,profile_geo:坐标边界集合
通过第一步我们拿到了URL。观察发现,“wd=”后面接的是一段经过URL编码(URL Encode)的字符串,解码后发现它就是我们搜索的学校名称。
那我们是不是只需要截取前面一段URL也能拿到目标JSON呢?答案是肯定的。
接下来编写Python程序批量爬取我们需要的含有坐标点和边界坐标集合的JSON
import requests
import random
import json
if __name__ == '__main__':
    # Baidu Maps search endpoint; the school name is substituted for {name}.
    url_template = 'https://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=s&da_src=shareurl&wd={name}'

    # ua.txt holds one User-Agent string per line. Lines are kept whole so that
    # commas inside a User-Agent (e.g. "KHTML, like Gecko") are not stripped.
    with open('ua.txt', 'r') as f:
        ua = [line.strip() for line in f if line.strip()]

    # One school name per line; commas inside a line are dropped, as before.
    wd = []
    with open('schoollist\\XXX市所有学校名单.txt', 'r', encoding='utf-8') as f:
        for line in f:
            name = line.strip().replace(',', '')
            if name:
                wd.append(name)

    for name in wd:
        url = url_template.format(name=name)
        print(url)
        # Rotate the User-Agent on every request.
        heads = {'User-Agent': random.choice(ua)} if ua else {}
        try:
            # headers= must be passed by keyword: the second positional
            # argument of requests.get() is `params`, not `headers`.
            r = requests.get(url, headers=heads, timeout=30)
            response_dict = r.json()
        except (requests.RequestException, ValueError) as e:
            # Keep the batch going when one request fails or returns non-JSON.
            print('failed to fetch {}: {}'.format(name, e))
            continue
        outputname = name + '.json'
        with open('json\\' + outputname, 'w', encoding='utf-8') as f:
            json.dump(response_dict, f)
通过观察发现边界坐标集合在JSON中的节点为JSON-->content-->content[0]-->profile_geo。
并且profile_geo中的边界坐标集合是加密后的墨卡托坐标,所以我们需要将墨卡托坐标转换为百度地图经纬度坐标,转换后再批量抽取关键信息。
接下来编写Java程序批量解析我们刚刚爬取到的JSON数据
public class test {
// Band thresholds compared against the absolute Mercator y value in convertMC2LL:
// they are checked in order and the first threshold that y reaches selects the
// MC2LL row with the same index.
private static final Double[] MCBAND = {12890594.86, 8362377.87, 5591021d, 3481989.83, 1678043.12, 0d};
// One row of ten coefficients per MCBAND entry, consumed by converter():
// [0] and [1] give the linear mapping of x, [2]..[8] are the polynomial
// coefficients applied to y, and [9] is the divisor that scales y beforehand.
private static final Double[][] MC2LL = {{1.410526172116255e-8, 0.00000898305509648872, -1.9939833816331, 200.9824383106796, -187.2403703815547, 91.6087516669843, -23.38765649603339, 2.57121317296198, -0.03801003308653, 17337981.2}, {-7.435856389565537e-9, 0.000008983055097726239, -0.78625201886289, 96.32687599759846, -1.85204757529826, -59.36935905485877, 47.40033549296737, -16.50741931063887, 2.28786674699375, 10260144.86}, {-3.030883460898826e-8, 0.00000898305509983578, 0.30071316287616, 59.74293618442277, 7.357984074871, -25.38371002664745, 13.45380521110908, -3.29883767235584, 0.32710905363475, 6856817.37}, {-1.981981304930552e-8, 0.000008983055099779535, 0.03278182852591, 40.31678527705744, 0.65659298677277, -4.44255534477492, 0.85341911805263, 0.12923347998204, -0.04625736007561, 4482777.06}, {3.09191371068437e-9, 0.000008983055096812155, 0.00006995724062, 23.10934304144901, -0.00023663490511, -0.6321817810242, -0.00663494467273, 0.03430082397953, -0.00466043876332, 2555164.4}, {2.890871144776878e-9, 0.000008983055095805407, -3.068298e-8, 7.47137025468032, -0.00000353937994, -0.02145144861037, -0.00001234426596, 0.00010322952773, -0.00000323890364, 826088.5}};
/**
 * Entry point: parses every scraped JSON file in the input folder and appends the
 * converted coordinates to the output file.
 *
 * @param args optional overrides: {@code args[0]} is the input folder and
 *             {@code args[1]} the output file; anything not supplied falls back
 *             to the original hard-coded path
 */
public static void main(String[] args) {
    int argCount = (args == null) ? 0 : args.length;
    String directoryPath = argCount > 0 ? args[0] : "E:\\pythonProject\\json";
    String outPutPath = argCount > 1 ? args[1] : "E:\\pythonProject\\jsonlist\\output.json";
    bd(directoryPath, outPutPath);
}
/**
 * Appends one line of text, terminated by CRLF, to a file. The file is created
 * if it does not exist yet.
 *
 * <p>Text is written as UTF-8, the same encoding {@code getJsonFile} reads with,
 * so non-ASCII names are stored the same way on every platform. I/O failures
 * are reported on stderr and otherwise ignored (best effort).
 *
 * @param file   path of the file to append to
 * @param conent text to append (without line terminator)
 */
public static void write(String file, String conent) {
    // try-with-resources closes the writer on every path; the previous
    // finally-block called close() on a null writer when opening had failed.
    try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(file, true), "UTF-8"))) {
        out.write(conent + "\r\n");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Recursively collects the absolute paths of everything below a directory.
 *
 * @param directoryPath  directory to walk
 * @param isAddDirectory whether the paths of sub-directories themselves are also
 *                       added to the result (their contents are always visited)
 * @return absolute paths of all files (and, optionally, sub-directories) found;
 *         empty when the path does not exist, is not a directory, or cannot be listed
 */
public static List<String> getAllFile(String directoryPath, boolean isAddDirectory) {
    List<String> list = new ArrayList<>();
    File baseFile = new File(directoryPath);
    if (!baseFile.isDirectory()) {
        return list;
    }
    File[] files = baseFile.listFiles();
    if (files == null) {
        // listFiles() returns null when the directory cannot be read.
        return list;
    }
    for (File file : files) {
        if (file.isDirectory()) {
            if (isAddDirectory) {
                list.add(file.getAbsolutePath());
            }
            list.addAll(getAllFile(file.getAbsolutePath(), isAddDirectory));
        } else {
            list.add(file.getAbsolutePath());
        }
    }
    return list;
}
/**
 * Converts a Baidu geo string holding Mercator coordinates into a
 * semicolon-separated list of {@code lng,lat} pairs in Baidu longitude/latitude.
 *
 * <p>A boundary string has the shape
 * {@code 4|x,y;x,y|1-x1,y1,x2,y2,...;} (the middle field looks like a bounding
 * box and is not used here). Strings of at most 71 characters are treated as a
 * single point and handed to {@code parseJeoPoint}; longer ones are treated as a
 * boundary and handed to {@code parseJeo}. The length test is a heuristic
 * inherited from the original implementation.
 *
 * @param geo raw {@code geo} / {@code profile_geo} value from the scraped JSON
 * @return the converted coordinates joined by {@code ;}, or an empty string when
 *         {@code geo} is null/empty or contains no coordinates
 */
public static String mc2jw(String geo) {
    if (geo == null || geo.isEmpty()) {
        return "";
    }
    List<String> mocatorList;
    if (geo.length() <= 71) {
        // No boundary: parse the single point.
        mocatorList = parseJeoPoint(geo);
    } else {
        // Boundary present.
        mocatorList = parseJeo(geo);
    }
    if (mocatorList == null) {
        return "";
    }
    // Assemble the converted boundary.
    StringBuilder sb = new StringBuilder();
    for (String pair : mocatorList) {
        String[] coordinate = pair.split("\\#");
        // Mercator -> Baidu longitude/latitude.
        Map<String, Double> location =
                convertMC2LL(Double.parseDouble(coordinate[0]), Double.parseDouble(coordinate[1]));
        if (sb.length() > 0) {
            sb.append(";");
        }
        sb.append(location.get("lng")).append(",").append(location.get("lat"));
    }
    return sb.toString();
}
/**
 * Parses every scraped JSON file under a folder and appends each school's
 * coordinates, converted to Baidu longitude/latitude, to text files.
 *
 * <p>Only the first element of the {@code content} array is used:
 * <ul>
 *   <li>no {@code content}: the school name is appended to the "not found" list;</li>
 *   <li>{@code profile_geo} missing or empty: the point in {@code geo} is appended
 *       to the "point only" file;</li>
 *   <li>otherwise the boundary in {@code profile_geo} is appended to
 *       {@code outPutPath} and the file path is printed.</li>
 * </ul>
 * The school name is the file name without its {@code .json} extension.
 *
 * @author Caleb
 * @date 2021/10/27 18:56
 * @param directoryPath folder containing the JSON files to parse
 * @param outPutPath    file that receives the boundary records
 **/
public static void bd(String directoryPath, String outPutPath) {
    final String notFoundLog = "E:\\pythonProject\\log\\没有找到学校名单.txt";
    final String pointOnlyLog = "E:\\pythonProject\\log\\只有学校坐标名单.json";

    List<String> list = getAllFile(directoryPath, false);
    for (String jsonStr : list) {
        // File.getName() honours the platform separator; the previous
        // lastIndexOf("\\") arithmetic threw on paths without a backslash.
        String schoolname = new File(jsonStr).getName();
        if (schoolname.endsWith(".json")) {
            schoolname = schoolname.substring(0, schoolname.length() - ".json".length());
        }

        String readJson = getJsonFile(jsonStr);
        // Parse the whole document first.
        JSONObject outJson = JSONObject.parseObject(readJson);
        if (outJson == null) {
            // Guard against a null parse result (e.g. an empty or unreadable file).
            System.err.println("Skipping file without JSON content: " + jsonStr);
            continue;
        }
        // "content" holds a JSON array of search results.
        JSONArray jsonArray = outJson.getJSONArray("content");
        // An empty array means the school is not marked on the map.
        if (jsonArray == null || jsonArray.size() == 0) {
            write(notFoundLog, schoolname);
            continue;
        }

        JSONObject first = jsonArray.getJSONObject(0);
        String profile_geo = first.getString("profile_geo");
        String geo = first.getString("geo");
        if (profile_geo == null || profile_geo.equals("")) {
            // No boundary available: fall back to the school's point coordinate.
            if (geo == null || geo.equals("")) {
                System.err.println("Skipping result without any coordinates: " + jsonStr);
                continue;
            }
            write(pointOnlyLog, "{\"name\": ");
            write(pointOnlyLog, "\"" + schoolname + "\",");
            String[] split = mc2jw(geo).split(";");
            write(pointOnlyLog, "\"geo\": ");
            for (String strPoint : split) {
                write(pointOnlyLog, "[" + strPoint + "]");
            }
            write(pointOnlyLog, "};");
        } else {
            write(outPutPath, "{\"name\": ");
            write(outPutPath, "\"" + schoolname + "\",");
            String[] split = mc2jw(profile_geo).split(";");
            write(outPutPath, "\"profile_geo\": [");
            for (int index = 0; index < split.length; index++) {
                // Every point except the last is followed by a comma.
                String suffix = (index == split.length - 1) ? "]" : "],";
                write(outPutPath, "[" + split[index] + suffix);
            }
            write(outPutPath, "]\n};");
            System.out.println(jsonStr);
        }
    }
}
/**
 * Reads a text file as UTF-8 and returns its lines concatenated, with the line
 * terminators removed.
 *
 * @param filePath path of the file to read
 * @return the concatenated lines; if an I/O error occurs the error is reported
 *         on stderr and whatever was read so far (possibly {@code ""}) is returned
 */
public static String getJsonFile(String filePath) {
    StringBuilder readJson = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(filePath), "UTF-8"))) {
        String tempString;
        while ((tempString = reader.readLine()) != null) {
            readJson.append(tempString);
        }
    } catch (IOException e) {
        // Previously e.getStackTrace() was called and its result discarded,
        // which hid every read failure.
        e.printStackTrace();
    }
    return readJson.toString();
}
/**
 * Extracts the boundary ring from a Baidu geo string.
 *
 * <p>The input is split on {@code |} into a numeric type, a field that is not
 * used here, and the coordinate section. Only types 2 and 4 are understood:
 * for type 4 the section consists of {@code ;}-separated segments and the last
 * segment of the form {@code 1-x1,y1,x2,y2,...} supplies the coordinates; for
 * type 2 the section is treated as if it already carried the {@code 1-} prefix.
 *
 * @param mocator raw geo string, e.g. {@code 4|x,y;x,y|1-x1,y1,x2,y2;}
 * @return one {@code "x#y"} entry per coordinate pair, in input order; an empty
 *         list when the string has fewer than three fields, has an unsupported
 *         type or contains no ring; {@code null} when {@code mocator} is null
 * @throws NumberFormatException if the type field is not an integer
 */
public static List<String> parseJeo(String mocator) {
    if (null == mocator) {
        return null;
    }
    List<String> mocatorList = new ArrayList<>();
    /* Split the header from the coordinate section. */
    String[] geos = mocator.split("\\|");
    if (geos.length < 3) {
        return mocatorList;
    }
    int n = Integer.parseInt(geos[0]);
    String polylineMoca = geos[2]; // Mercator coordinates
    if (n == 2) {
        polylineMoca = "1-" + polylineMoca;
    }
    /* Locate the Mercator boundary ring. */
    String geo = null;
    if (n == 4 || n == 2) {
        for (String segment : polylineMoca.split("\\;")) {
            String[] geoPaths = segment.split("\\-");
            if (geoPaths.length > 1 && geoPaths[0].equals("1")) {
                geo = geoPaths[1];
            }
        }
    }
    if (geo == null || geo.isEmpty()) {
        // Nothing to parse; the previous fallback array produced thousands of
        // "null#null" entries and then ran past its end.
        return mocatorList;
    }
    // Pair up the comma-separated values; a trailing unpaired value is ignored.
    String[] geoPolyline = geo.split("\\,");
    for (int i = 0; i + 1 < geoPolyline.length; i += 2) {
        mocatorList.add(geoPolyline[i] + "#" + geoPolyline[i + 1]);
    }
    return mocatorList;
}
/**
 * Extracts the point coordinate from a Baidu geo string that has no boundary.
 *
 * <p>The coordinates are taken from the text after the last {@code |}; a single
 * trailing {@code ;} is dropped if present.
 *
 * @param mocatorpoint raw geo string, e.g. {@code 1|x,y;x,y|x,y;}
 * @return one {@code "x#y"} entry per coordinate pair (normally exactly one);
 *         an empty list when no coordinates are present; {@code null} when
 *         {@code mocatorpoint} is null
 */
public static List<String> parseJeoPoint(String mocatorpoint) {
    if (null == mocatorpoint) {
        return null;
    }
    List<String> mocatorList = new ArrayList<>();
    // lastIndexOf yields -1 when there is no '|', which makes this the whole string.
    String tail = mocatorpoint.substring(mocatorpoint.lastIndexOf('|') + 1);
    // Only strip the terminator when it is really there; the last character
    // used to be removed unconditionally, truncating unterminated values.
    if (tail.endsWith(";")) {
        tail = tail.substring(0, tail.length() - 1);
    }
    if (tail.isEmpty()) {
        return mocatorList;
    }
    // Pair up the comma-separated values; a trailing unpaired value is ignored.
    String[] geoPolyline = tail.split("\\,");
    for (int i = 0; i + 1 < geoPolyline.length; i += 2) {
        mocatorList.add(geoPolyline[i] + "#" + geoPolyline[i + 1]);
    }
    return mocatorList;
}
/**
 * Converts a Mercator coordinate to longitude/latitude.
 *
 * <p>The absolute value of {@code y} selects a coefficient row: MCBAND is
 * scanned in order and the first threshold that {@code |y|} reaches picks the
 * MC2LL row with the same index. The signed inputs are then passed to
 * {@code converter}, which restores the sign of each axis. Previously the
 * inputs were overwritten with their absolute values first, so negative
 * coordinates always came back positive.
 *
 * @param x Mercator x
 * @param y Mercator y
 * @return a mutable map with exactly the keys {@code "lng"} and {@code "lat"}
 * @throws IllegalArgumentException if no band matches (e.g. {@code y} is NaN)
 */
public static Map<String, Double> convertMC2LL(Double x, Double y) {
    double absY = Math.abs(y);
    Double[] cF = null;
    for (int cE = 0; cE < MCBAND.length; cE++) {
        if (absY >= MCBAND[cE]) {
            cF = MC2LL[cE];
            break;
        }
    }
    if (cF == null) {
        throw new IllegalArgumentException("Cannot convert Mercator coordinate x=" + x + ", y=" + y);
    }
    Map<?, ?> xy = converter(x, y, cF);
    Map<String, Double> location = new HashMap<>();
    location.put("lng", (Double) xy.get("x"));
    location.put("lat", (Double) xy.get("y"));
    return location;
}
/**
 * Applies one row of conversion coefficients to a coordinate.
 *
 * <p>The x result is {@code cE[0] + cE[1] * |x|}. The y result is a polynomial in
 * {@code |y| / cE[9]} whose constant term is {@code cE[2]} and whose coefficients
 * for powers 1 to 6 are {@code cE[3]} to {@code cE[8]}. Each result takes the sign
 * of the corresponding input.
 *
 * @param x  input x
 * @param y  input y
 * @param cE coefficient row with at least ten entries
 * @return a new map holding the results under the keys {@code "x"} and {@code "y"}
 */
private static Map converter(Double x, Double y, Double[] cE) {
    double xOut = cE[0] + cE[1] * Math.abs(x);

    double ratio = Math.abs(y) / cE[9];
    double yOut = cE[2];
    for (int k = 3; k <= 8; k++) {
        // Build cE[k] * ratio^(k-2) by repeated multiplication, left to right,
        // so the rounding matches the fully written-out expression.
        double term = cE[k];
        for (int p = 2; p < k; p++) {
            term *= ratio;
        }
        yOut += term;
    }

    xOut *= (x < 0 ? -1 : 1);
    yOut *= (y < 0 ? -1 : 1);

    Map<String, Double> location = new HashMap<String, Double>();
    location.put("x", xOut);
    location.put("y", yOut);
    return location;
}
}
最终我们拿到的学校边界经纬度坐标是这个样子的。
当没有边界经纬度坐标时,拿到的学校点的坐标是这个样子的。
百度地图上没有找到的学校名单
通过百度地图坐标拾取系统验证还原,发现坐标点是一致的。至此我们有了这批数据,就可以做后续的处理了!