Downloading boundary files of China's nature reserves with a Python crawler: http://www.papc.cn/html/folder/946895-1.htm
Only the reserves marked with a folder icon in the left-hand list offer a KMZ download; the download procedure is described in the post 《全国自然保护区边界矢量数据下载地址及处理方法》. Clicking the KMZ download link redirects to a URL of the form: http://www.papc.cn/res/papc/kmz/1001.kmz
Fetching this URL with Python and saving the response body as a .kmz file completes the download. The code is as follows:
import os
import shutil
import urllib.request

def text_create(tar_path, name, msg):
    # Save the response body to a text file
    full_path = os.path.join(tar_path, name + ".txt")
    with open(full_path, 'w', encoding="utf-8") as file:
        file.write(msg)

idx = 1001
out_dir = r"S:\cache"  # directory in which to save the kmz files
url = r"http://www.papc.cn/res/papc/kmz/{0}.kmz".format(idx)
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
req = urllib.request.Request(url, headers=header)
response = urllib.request.urlopen(req)
html = response.read().decode("utf-8")  # the response body is KML text despite the .kmz extension
text_create(out_dir, "{0}".format(idx), html)
# The fourth line of the saved file holds the reserve name as "<name>xxx.kmz</name>":
# drop the six characters of "<name>" and keep everything before the first ".".
with open(os.path.join(out_dir, "{0}.txt".format(idx)), encoding="utf-8") as f:
    name = f.readlines()[3].lstrip()[6:].split(".")[0]
shutil.copyfile(os.path.join(out_dir, "%d.txt" % idx), os.path.join(out_dir, "{0}.kmz".format(name)))
print("%d %s success" % (idx, name))
The key to batch downloading is discovering each reserve's URL (http://www.papc.cn/res/papc/kmz/xxxx.kmz). The simplest approach is a brute-force scan over candidate IDs:
import os
import shutil
import time
import urllib.request

def text_create(tar_path, name, msg):
    # Save the response body to a text file
    full_path = os.path.join(tar_path, name + ".txt")
    with open(full_path, 'w', encoding="utf-8") as file:
        file.write(msg)

out_dir = r"S:\cache"  # directory in which to save the kmz files
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
for idx in range(1000, 4000):
    url = r"http://www.papc.cn/res/papc/kmz/{0}.kmz".format(idx)
    try:
        req = urllib.request.Request(url, headers=header)
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        text_create(out_dir, "{0}".format(idx), html)
        with open(os.path.join(out_dir, "{0}.txt".format(idx)), encoding="utf-8") as f:
            name = f.readlines()[3].lstrip()[6:].split(".")[0]
        shutil.copyfile(os.path.join(out_dir, "%d.txt" % idx), os.path.join(out_dir, "{0}.kmz".format(name)))
        print("%d %s success" % (idx, name))
    except Exception as e:
        print(idx, e)  # nonexistent IDs raise HTTP 404; log and move on
    time.sleep(2)  # throttle requests so as not to hammer the server
However, a brute-force scan over a guessed ID range may miss some files. We can instead crawl the whole site with Screaming Frog SEO Spider and filter the discovered URLs down to those ending in .kmz (a pure-Python sketch of this filtering step follows). Inspecting that list reveals the naming pattern of the kmz download URLs.
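If Screaming Frog is not at hand, the filtering step can be approximated in pure Python: fetch a listing page and collect every href ending in .kmz. This is only a sketch under my own assumption that the download links appear as plain href attributes on the folder page; the site's actual markup may differ:

from html.parser import HTMLParser
import urllib.request

class KmzLinkParser(HTMLParser):
    # Collect href attributes that point at .kmz files
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        for key, value in attrs:
            if key == "href" and value and value.endswith(".kmz"):
                self.links.append(value)

req = urllib.request.Request("http://www.papc.cn/html/folder/946895-1.htm",
                             headers={"User-Agent": "Mozilla/5.0"})
page = urllib.request.urlopen(req).read().decode("utf-8", errors="ignore")
parser = KmzLinkParser()
parser.feed(page)
print(parser.links)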
With the pattern known, we only need to adapt the previous code so that idx iterates over the two ranges 1001 to 3307 and 14060469 to 14060744 to download the boundary files of all reserves. The code is as follows:
import itertools
import os
import shutil
import time
import urllib.request

def text_create(tar_path, name, msg):
    # Save the response body to a text file
    full_path = os.path.join(tar_path, name + ".txt")
    with open(full_path, 'w', encoding="utf-8") as file:
        file.write(msg)

out_dir = r"S:\cache"  # directory in which to save the kmz files
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
# The two ID ranges found above: 1001-3307 and 14060469-14060744 (both inclusive)
for idx in itertools.chain(range(1001, 3308), range(14060469, 14060745)):
    url = r"http://www.papc.cn/res/papc/kmz/{0}.kmz".format(idx)
    try:
        req = urllib.request.Request(url, headers=header)
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        text_create(out_dir, "{0}".format(idx), html)
        with open(os.path.join(out_dir, "{0}.txt".format(idx)), encoding="utf-8") as f:
            name = f.readlines()[3].lstrip()[6:].split(".")[0]
        shutil.copyfile(os.path.join(out_dir, "%d.txt" % idx), os.path.join(out_dir, "{0}.kmz".format(name)))
        print("%d %s success" % (idx, name))
    except Exception as e:
        print(idx, e)  # nonexistent IDs raise HTTP 404; log and move on
    time.sleep(2)  # throttle requests so as not to hammer the server
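A KMZ file is by definition a zipped KML, so files saved through the text path above are typically raw KML rather than true KMZ archives. As a final check (my addition, not part of the original workflow), the sketch below inspects each saved file's magic bytes and renames plain-text ones to .kml so that GIS software opens them correctly:

import os

out_dir = r"S:\cache"
for fname in os.listdir(out_dir):
    if not fname.endswith(".kmz"):
        continue
    path = os.path.join(out_dir, fname)
    with open(path, "rb") as f:
        magic = f.read(2)
    # Genuine KMZ files are zip archives and start with the "PK" signature;
    # anything else here is raw KML text and gets a .kml extension instead.
    if magic != b"PK":
        os.rename(path, path[:-4] + ".kml")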