本代码用于下载 http://openaccess.thecvf.com/ICCV2017.py 上的论文,并按照论文名将其保存到不同的文件夹下。
首先使用Chrome的调试工具查看该网页的源代码,然后编写python脚本对所有论文进行下载。
import os
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
# Interactive script: fetch a paper index page, then download the 'pdf' and
# 'supp' files of a chosen range of papers into one folder per paper, and
# list the papers that also carry a 'video' link.

# NOTE(review): certificate and hostname verification are switched off for
# the index-page fetch below, which leaves that request open to tampering.
# Kept exactly as the original script had it; prefer a verifying context
# unless the site's certificate is known to be broken.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Prefix prepended to every href taken from the index page.
_BASE_URL = "http://openaccess.thecvf.com/"


def _download_file(file_url, target_path):
    """Stream ``file_url`` into ``target_path`` in 1 KiB chunks.

    Returns True on success. On an HTTP error status or a network failure
    the problem is printed and False is returned, so that one bad link does
    not abort the rest of the batch. A file that fails part-way through is
    left on disk as written so far.
    """
    try:
        # The context manager releases the streamed connection even when
        # writing the file fails.
        with requests.get(file_url, stream=True) as response:
            # Without this check an error page would be saved as the paper.
            response.raise_for_status()
            with open(target_path, "wb") as out_file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        out_file.write(chunk)
    except requests.RequestException as err:
        print("download failed:", file_url, err)
        return False
    return True


# Download the index page.
url = input('Enter Url:')
html = urlopen(url, context=ctx).read()

# Parse the page and collect the nodes used below: every <dt> (its first
# anchor's text is used as the paper title) and every <dd>.
soup = BeautifulSoup(html, "html.parser")
paper_name = soup('dt')
paper_link = soup('dd')

# Directory the per-paper folders are created in; empty input means the
# current working directory.
paper_directory = input('Enter Directory:')
if not paper_directory:
    paper_directory = os.getcwd()

# The range (1-based, inclusive) only limits how much is downloaded in one
# run, so that not too many papers are fetched at once.
print("paper count:", len(paper_name))
begin_index = input('begin index:')
end_index = input('end_index:')

# Clamp the range to the papers that exist: a begin index below 1 would
# otherwise wrap around to the end of the list, and an end index past the
# last paper would raise IndexError.
first_paper = max(int(begin_index), 1)
last_paper = min(int(end_index), len(paper_name))

# Titles of the papers that have a 'video' link (videos are not downloaded).
paper_had_video = list()

for i in range(first_paper - 1, last_paper):
    title = paper_name[i].a.string
    print("downloading:", title)
    # Folder for this paper; created lazily by the first downloadable link.
    file_save_dir = ""
    # The links of paper i are read from the <dd> at index 2*i+1.
    # NOTE(review): presumably each <dt> is followed by two <dd> nodes
    # (authors, then links) -- confirm against the page markup.
    links = paper_link[2 * i + 1].find_all('a')
    # The last anchor is skipped, as in the original loop.
    # NOTE(review): presumably that anchor is the bibtex toggle -- confirm.
    for link in links[:-1]:
        label = link.string
        if label == 'video':
            paper_had_video.append(title)
            continue
        if label not in ('pdf', 'supp'):
            continue
        file_name = os.path.basename(link['href'])
        if not file_save_dir:
            # Folder name: "<1-based index>-<file name up to its first dot>".
            # os.path.join supplies the separator the plain concatenation
            # lacked; exist_ok lets a range be downloaded again. Creating
            # the folder here also covers a 'supp' link that appears
            # without a preceding 'pdf' link.
            file_save_dir = os.path.join(
                paper_directory, str(i + 1) + "-" + file_name.split('.')[0]
            )
            os.makedirs(file_save_dir, exist_ok=True)
        _download_file(
            _BASE_URL + link['href'], os.path.join(file_save_dir, file_name)
        )

# Print every paper whose video still has to be downloaded by hand.
print("Paper need download video:")
for video_paper in paper_had_video:
    print(video_paper)
本博客的一切内容均以学习为目的。