目标:爬取该路径下的所有tif数据
https://data.ngdc.noaa.gov/instruments/remote-sensing/passive/spectrometers-radiometers/imaging/viirs/mosaics/
实现思路:
1. 按日期构造所有的文件夹路径
2. 遍历页面中的a标签拿到所有的下载链接,只下载以.tif结尾的文件
3. 由于影像文件相对较大,下载时增加了进度显示
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import datetime
import urllib
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
# Name of the first dated folder to crawl; the main loop appends str(start) to
# rawurl. The value looks like a YYYYMMDD date - confirm against the server listing.
start=20200210
# Root URL of the remote listing; each crawled folder is rawurl + str(start) + "/".
rawurl='https://data.ngdc.noaa.gov/instruments/remote-sensing/passive/spectrometers-radiometers/imaging/viirs/mosaics/'
def downloaddata(baseurl, filename):
    """Fetch ``baseurl + filename`` and store it under the local data prefix.

    The remote URL is printed before the transfer starts. No progress hook
    and no retry are used here.

    :param baseurl: URL of the remote folder, including the trailing slash
    :param filename: name of the file inside that folder
    :return: None
    """
    # Destination is the hard-coded local prefix followed by the remote name.
    local_target = r"D:\nppdata\\s" + filename
    remote_location = baseurl + filename
    print(remote_location)
    urllib.request.urlretrieve(remote_location, local_target)
def callback(a1, a2, a3):
    """Print the progress of a running download.

    The signature matches the ``reporthook`` of
    ``urllib.request.urlretrieve``.

    :param a1: number of data blocks transferred so far
    :param a2: size of one data block, in bytes
    :param a3: total size of the remote file in bytes; zero or negative when
        the size is unknown or the file is empty
    :return: None
    """
    transferred = a1 * a2
    if a3 <= 0:
        # No usable total: a percentage would be negative (a3 == -1) or
        # divide by zero (a3 == 0), so report the byte count instead.
        print("当前已下载: %d 字节" % transferred)
        return
    # The last block is usually partial, so the raw ratio can exceed 100.
    download_pg = min(100.0 * transferred / a3, 100)
    print("当前下载进度为: %.2f%%" % download_pg, )
def download(url, filename, callback, header, max_retries=5):
    """Download ``url`` to ``filename``, retrying a bounded number of times.

    :param url: address to download from
    :param filename: local path the content is written to (overwritten on
        every attempt)
    :param callback: progress hook called as ``callback(block_number,
        block_size, total_size)`` once before the first block and once after
        each block; ``total_size`` is -1 when the server sends no
        Content-Length. May be ``None``.
    :param header: User-Agent value as ``str`` or ``bytes``, a mapping of
        header names to values, or ``None`` for no extra headers
    :param max_retries: number of additional attempts after the first failure
    :return: None
    :raises Exception: the error of the last attempt once all retries are used
    """
    # urlretrieve() has no header parameter (its fourth argument is POST
    # data), so the request is built explicitly to carry the headers.
    if isinstance(header, bytes):
        header = header.decode("latin-1")
    if not header:
        request_headers = {}
    elif isinstance(header, str):
        request_headers = {"User-Agent": header}
    else:
        request_headers = dict(header)

    block_size = 8192
    failures = 0
    while True:
        try:
            request = urllib.request.Request(url, headers=request_headers)
            with urllib.request.urlopen(request) as response:
                info = response.info()
                length = info.get("Content-Length")
                total_size = int(length) if length is not None else -1
                with open(filename, "wb") as out_file:
                    block_number = 0
                    received = 0
                    if callback is not None:
                        callback(block_number, block_size, total_size)
                    while True:
                        chunk = response.read(block_size)
                        if not chunk:
                            break
                        out_file.write(chunk)
                        received += len(chunk)
                        block_number += 1
                        if callback is not None:
                            callback(block_number, block_size, total_size)
            # A connection dropped mid-transfer ends the read loop silently;
            # compare against the announced size to detect a truncated file.
            if 0 <= received < total_size:
                raise urllib.error.ContentTooShortError(
                    "retrieval incomplete: got only %i out of %i bytes"
                    % (received, total_size),
                    (filename, info),
                )
            return
        except Exception as e:
            print(e)
            if failures >= max_retries:
                # Give up instead of retrying forever on a permanent error
                # such as a 404 or an unwritable local path.
                raise
            failures += 1
            print('Network conditions is not good.\nReloading.....')
# Browser-like User-Agent string handed to download() for every file request.
headers = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
# Passed on as bytes: a str in this argument position makes urlretrieve() raise.
header = headers.encode()

# Visit 20 consecutive dated folders beginning at `start`. Real calendar
# arithmetic keeps the folder names valid across month and year boundaries.
first_day = datetime.datetime.strptime(str(start), "%Y%m%d").date()
for day_offset in range(20):
    day = first_day + datetime.timedelta(days=day_offset)
    folder_url = rawurl + day.strftime("%Y%m%d") + "/"
    print(folder_url)
    try:
        # Fetch the HTML listing of the folder and release the connection.
        with urllib.request.urlopen(folder_url) as response:
            content = response.read().decode('utf-8', errors='replace')
    except urllib.error.URLError as e:
        # A folder that is missing or unreachable must not abort the
        # remaining dates; report it and move on.
        print(e)
        continue
    soup = BeautifulSoup(content, 'lxml')
    list_urls = soup.find_all("a")  # the <a> tags hold the file links
    for anchor in list_urls[1:]:
        lianjie = anchor.get('href')
        if not lianjie:
            # Anchors without an href carry nothing to download.
            continue
        if lianjie.startswith('SVDNB') and lianjie.endswith(".tif"):
            print(lianjie, "下载中")
            # Build the file URL from the unchanged folder URL so that every
            # file in the folder gets a correct address, not only the first.
            file_url = folder_url + lianjie
            file_name = r"D:\nppdata\\s" + lianjie
            download(file_url, file_name, callback, header)