Python: Web Scraping and Resource Downloading

Problem

This script downloads the papers listed at http://openaccess.thecvf.com/ICCV2017.py and saves each paper into its own folder, named after the paper.

Approach

First, use Chrome's developer tools to inspect the page source, then write a Python script that downloads all of the papers.
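
Before writing the full downloader, it helps to confirm in code what the developer tools show. A minimal sketch (it assumes, as the full script below does, that each paper title sits in a <dt> tag on the listing page):

import ssl
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Ignore SSL certificate errors, same as in the full script
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Fetch the ICCV 2017 listing and print the first few paper titles
html = urlopen("http://openaccess.thecvf.com/ICCV2017.py", context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
for dt in soup("dt")[:5]:
    print(dt.a.string)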

Code

import os
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Download the listing page; enter its URL when prompted
url = input('Enter Url:')
html = urlopen(url, context=ctx).read()

# Parse the HTML with BeautifulSoup and pick out the nodes we need:
# each <dt> holds a paper title; every paper is followed by two <dd> tags,
# the first with the authors and the second with the pdf/supp/video links
soup = BeautifulSoup(html, "html.parser")
paper_name = soup('dt')
paper_link = soup('dd')

# Directory where the downloads are saved; default to the current directory
paper_directory = input('Enter Directory:')
if paper_directory == "":
    paper_directory = os.getcwd()

# Pick the range of papers to download (1-based); downloading everything at once is too much
print("paper count:", len(paper_name))
begin_index = input('begin index:')
end_index = input('end_index:')

# Papers that also provide a video link (listed at the end for manual download)
paper_had_video = list()

# Download loop
for i in range(int(begin_index) - 1, int(end_index)):
    print("downloading:", paper_name[i].a.string)

    # Directory for the current paper's files
    file_save_dir = ""

    # The second <dd> of each paper holds the download links
    links = paper_link[2 * i + 1].find_all('a')
    for j in range(len(links)):
        if links[j].string == 'pdf':
            file_save_dir = os.path.join(
                paper_directory,
                str(i + 1) + "-" + os.path.basename(links[j]['href']).split('.')[0])

            # Create the per-paper directory (do not fail if it already exists)
            os.makedirs(file_save_dir, exist_ok=True)

            pdf_url = "http://openaccess.thecvf.com/"+links[j]['href']
            pdf_file = requests.get(pdf_url, stream=True)
            with open(os.path.join(file_save_dir, os.path.basename(links[j]['href'])), "wb") as pdf:
                for pdf_chunk in pdf_file.iter_content(chunk_size=1024):
                    if pdf_chunk:
                        pdf.write(pdf_chunk)
            continue
        if links[j].string == 'supp':
            supp_url = "http://openaccess.thecvf.com/"+links[j]['href']
            supp_file= requests.get(supp_url, stream=True)
            with open(os.path.join(file_save_dir, os.path.basename(links[j]['href'])), "wb") as supplement:
                for supplement_chunk in supp_file.iter_content(chunk_size=1024):
                    if supplement_chunk:
                        supplement.write(supplement_chunk)
            continue
        if links[j].string == 'video':
            # Videos are not downloaded automatically; just record the paper title
            paper_had_video.append(paper_name[i].a.string)
            continue


# Print the papers whose videos still need to be downloaded manually
print("Papers that also have a video to download:")
for title in paper_had_video:
    print(title)
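
For reference, an interactive run might look something like this (the directory and index values are arbitrary examples; each paper ends up in its own numbered folder containing the pdf and, when available, the supplementary file):

Enter Url:http://openaccess.thecvf.com/ICCV2017.py
Enter Directory:/home/user/iccv2017
paper count: 621
begin index:1
end_index:10
downloading: <first paper title>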

Note

Everything in this post is intended for learning purposes only.
