Python programming exercise: scraping a static site to batch-download nature reserve boundary files

Table of Contents

  • 1. Overview
  • 2. Downloading the boundary of a single nature reserve
  • 3. Batch downloading


1. Overview

This exercise uses a Python scraper to download boundary files for China's nature reserves from http://www.papc.cn/html/folder/946895-1.htm


2. Downloading the boundary of a single nature reserve

Only the reserves marked with a folder icon in the left-hand list offer a KMZ download; the download steps are described in 全国自然保护区边界矢量数据下载地址及处理方法. Clicking "下载KMZ文件" (Download KMZ file) redirects to a URL of the form: http://www.papc.cn/res/papc/kmz/1001.kmz
Requesting this URL with Python and saving the response body as a .kmz file completes the download. Note that despite the .kmz extension, the response body here is plain text (it decodes as UTF-8 and can be read line by line), which the name extraction below relies on. The code is as follows:

import urllib.request
import shutil
import os

def text_create(tar_path, name, msg):
    # Save the response text to <tar_path>/<name>.txt
    full_path = os.path.join(tar_path, name + ".txt")
    with open(full_path, 'w') as file:
        file.write(msg)


idx = 1001
out_dir = r"S:\cache"  # directory where the kmz files are saved
url = "http://www.papc.cn/res/papc/kmz/{0}.kmz".format(idx)
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
req = urllib.request.Request(url, headers=header)
response = urllib.request.urlopen(req)
html = response.read().decode("utf-8")
text_create(out_dir, "{0}".format(idx), html)
# Line 4 of the response is expected to look like "<name>xxx.kmz</name>";
# drop the 6-character "<name>" prefix and everything from ".kmz" on.
with open(os.path.join(out_dir, "{0}.txt".format(idx))) as f:
    name = f.readlines()[3].lstrip()[6:].split(".")[0]
# Copy the saved text to <reserve name>.kmz
shutil.copyfile(os.path.join(out_dir, "%d.txt" % idx),
                os.path.join(out_dir, "{0}.kmz".format(name)))
print("%d %s success" % (idx, name))



3. Batch downloading

The key to batch downloading is knowing each reserve's URL (http://www.papc.cn/res/papc/kmz/xxxx.kmz). The simplest approach is a brute-force search over the numeric id:

import urllib.request
import shutil
import os
import time

def text_create(tar_path, name, msg):
    # Save the response text to <tar_path>/<name>.txt
    full_path = os.path.join(tar_path, name + ".txt")
    with open(full_path, 'w') as file:
        file.write(msg)


out_dir = r"S:\cache"  # directory where the kmz files are saved
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
for idx in range(1000, 4000):
    url = "http://www.papc.cn/res/papc/kmz/{0}.kmz".format(idx)
    try:
        req = urllib.request.Request(url, headers=header)
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        text_create(out_dir, "{0}".format(idx), html)
        # Line 4 of the response is expected to look like "<name>xxx.kmz</name>"
        with open(os.path.join(out_dir, "{0}.txt".format(idx))) as f:
            name = f.readlines()[3].lstrip()[6:].split(".")[0]
        shutil.copyfile(os.path.join(out_dir, "%d.txt" % idx),
                        os.path.join(out_dir, "{0}.kmz".format(name)))
        print("%d %s success" % (idx, name))
    except Exception as e:
        print(idx, e)  # most ids in this range do not exist
    time.sleep(2)  # pause between requests to be polite to the server
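Most of the ids probed above do not exist, so the except branch mostly prints HTTP 404 errors. A variant that skips missing ids quietly and only re-raises unexpected failures (a sketch using the standard library's urllib.error) looks like this:

import urllib.error
import urllib.request

def probe(url, header):
    # Return the decoded body if the id exists, or None on a 404,
    # so missing ids can be skipped without cluttering the log.
    req = urllib.request.Request(url, headers=header)
    try:
        with urllib.request.urlopen(req) as response:
            return response.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None
        raise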

However, this brute-force search may miss some files. A more thorough option is to crawl the site with Screaming Frog SEO Spider and filter the discovered URLs for those ending in .kmz, as in the sketch below.
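For example, after exporting the crawl results to CSV, a short filter can pick out the kmz URLs. This is a sketch: the "Address" column name follows Screaming Frog's internal export and may differ between versions.

import csv

def kmz_urls(csv_path):
    # Keep only the crawled URLs whose path ends in .kmz.
    with open(csv_path, newline="", encoding="utf-8") as f:
        return [row["Address"] for row in csv.DictReader(f)
                if row["Address"].lower().endswith(".kmz")]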
Inspecting the filtered URLs reveals the naming pattern of the kmz downloads.
So it suffices to rerun the earlier code with idx iterating over the two ranges 1001 to 3307 and 14060469 to 14060744 to download every reserve boundary file:

import urllib.request
import shutil
import os
import time
import itertools

def text_create(tar_path, name, msg):
    # Save the response text to <tar_path>/<name>.txt
    full_path = os.path.join(tar_path, name + ".txt")
    with open(full_path, 'w') as file:
        file.write(msg)


out_dir = r"S:\cache"  # directory where the kmz files are saved
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
# The two id ranges identified from the Screaming Frog crawl
for idx in itertools.chain(range(1001, 3308), range(14060469, 14060745)):
    url = "http://www.papc.cn/res/papc/kmz/{0}.kmz".format(idx)
    try:
        req = urllib.request.Request(url, headers=header)
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        text_create(out_dir, "{0}".format(idx), html)
        # Line 4 of the response is expected to look like "<name>xxx.kmz</name>"
        with open(os.path.join(out_dir, "{0}.txt".format(idx))) as f:
            name = f.readlines()[3].lstrip()[6:].split(".")[0]
        shutil.copyfile(os.path.join(out_dir, "%d.txt" % idx),
                        os.path.join(out_dir, "{0}.kmz".format(name)))
        print("%d %s success" % (idx, name))
    except Exception as e:
        print(idx, e)
    time.sleep(2)  # pause between requests to be polite to the server

This yields 789 kmz files in total.
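A quick sanity check is to count the kmz files in the output directory:

import glob
import os

print(len(glob.glob(os.path.join(r"S:\cache", "*.kmz"))))  # expect 789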

