多线程下载CVPR ICCV IJCAI论文

前提:

  1. 科学上网
  2. 谷歌浏览器
  3. python3(需安装第三方库 selenium 和 lxml)

特点:

  1. 自定义多线程下载数目threads_num
  2. 自定义下载路径
  3. 下载的文件均以论文的title命名
  4. 原始title中的非法字符(win10不允许作为文件名的字符)已被处理为下划线或空格

代码如下:

# coding:utf-8
import re
import urllib.request
import os
import threading
import os
from lxml import etree
from selenium import webdriver
import time
import re

# Regex character class matching the characters that may not appear in a
# Windows file name: / \ : * ? " < > |
# The download functions substitute every match with "_" before building
# the local file name.
rstr = r"[\/\\\:\*\?\"\<\>\|]"


def getIJCAIPapers(ctype, year, paper_wrapper, localdir):
    """Download a batch of IJCAI papers as PDF files into ``localdir``.

    Args:
        ctype: Conference type. Unused here; kept so both download
            functions share the same signature.
        year: Proceedings year as a string; appended to the IJCAI
            proceedings base URL.
        paper_wrapper: Sequence of parsed HTML elements, each expected to
            contain a ``div[@class="title"]`` text node and a
            ``div[@class="details"]`` whose first link is the PDF href.
        localdir: Directory the PDF files are written to.

    Each file is named after the paper title, with characters matched by
    ``rstr`` replaced by underscores. Papers whose target file already
    exists are skipped.
    """
    url = 'https://www.ijcai.org/proceedings/' + year
    for paper in paper_wrapper:
        title_temp = paper.xpath('./div[@class="title"]/text()')[0]
        new_title = re.sub(rstr, "_", title_temp)
        id_temp = paper.xpath('./div[@class="details"]/a[1]/@href')[0]
        url_temp = url + '/' + id_temp
        # Use the ``localdir`` argument rather than the script-level global,
        # so the function also works when called from other code.
        path_temp = os.path.join(localdir, new_title + '.pdf')
        if os.path.exists(path_temp):
            print(title_temp + ' has been downloaded before.')
            continue
        try:
            urllib.request.urlretrieve(url_temp, path_temp)
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError. Remove any
            # partial file so a later run retries it instead of reporting
            # it as already downloaded, then continue with the batch.
            if os.path.exists(path_temp):
                os.remove(path_temp)
            print(title_temp + ' failed to download: ' + str(err))
            continue
        print(title_temp + ' has been downloaded now.')


def get_CVPR_ICCV_Papers(ctype, year, paper_wrapper, localdir):
    """Download a batch of CVPR/ICCV papers as PDF files into ``localdir``.

    Args:
        ctype: Conference type. Unused here; kept so both download
            functions share the same signature.
        year: Proceedings year. Unused here; kept for the same reason.
        paper_wrapper: Sequence of parsed ``<a>`` elements whose ``href``
            is a path relative to the openaccess.thecvf.com site root.
        localdir: Directory the PDF files are written to.

    The local file name is the last path component of the href with
    everything up to and including the first underscore removed,
    characters matched by ``rstr`` replaced, and underscores turned into
    spaces. Links whose file name does not look like ``<prefix>_<rest>.pdf``
    are skipped, as are papers whose target file already exists.
    """
    download_url = 'http://openaccess.thecvf.com/'
    for link in paper_wrapper:
        url_suffix = link.xpath('./@href')[0]
        download_url_temp = download_url + url_suffix
        full_file_name = url_suffix.split('/')[-1]
        # The dot before "pdf" is escaped so only a real ".pdf" extension
        # matches (the original pattern accepted any character there).
        file_name_re = re.findall(r'^.*?_(.*\.pdf)', full_file_name)
        if len(file_name_re) != 1:
            continue
        file_name_ = re.sub(rstr, "_", file_name_re[0])
        file_name = file_name_.replace('_', ' ')
        file_path_temp = os.path.join(localdir, file_name)
        if os.path.exists(file_path_temp):
            print(file_name + ' has been downloaded before.')
            continue
        try:
            urllib.request.urlretrieve(download_url_temp, file_path_temp)
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError. Remove any
            # partial file so a later run retries it instead of reporting
            # it as already downloaded, then continue with the batch.
            if os.path.exists(file_path_temp):
                os.remove(file_path_temp)
            print(file_name + ' failed to download: ' + str(err))
            continue
        print(file_name + ' has been downloaded now.')


if __name__ == '__main__':
    # Script entry point: load the proceedings index page in Chrome, collect
    # the paper elements, and download them in parallel batches.

    # Conference type. Supported: ICCV, CVPR, IJCAI; any other conference
    # needs its own site URL and XPath.
    ctype = 'CVPR'
    year = '2019'  # publication year of the proceedings
    # Placeholder download directory; replace the first component with a
    # real path before running.
    localDir = os.path.join('自定义下载目录', ctype + year)
    os.makedirs(localDir, exist_ok=True)
    threads_num = 20  # maximum number of download threads

    if ctype == 'IJCAI':
        url = 'https://www.ijcai.org/proceedings/' + year
        paper_xpath = '//*[@id="subsection0"]/div[@class="paper_wrapper"]'
        download_func = getIJCAIPapers
    else:
        url = 'http://openaccess.thecvf.com/' + ctype + year + '.py'
        paper_xpath = '//*[@id="content"]/dl/dd/a'
        download_func = get_CVPR_ICCV_Papers

    # Start Chrome, grab the rendered page, and always shut the browser
    # down afterwards so no driver process is left behind.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(5)  # wait for the browser to finish loading the page
        pageSource = driver.page_source
    finally:
        driver.quit()

    html = etree.HTML(pageSource)
    paper_wrapper = html.xpath(paper_xpath)
    length = len(paper_wrapper)

    # Ceiling division: every paper lands in some batch. Flooring here
    # would drop the trailing ``length % threads_num`` papers and yield
    # empty batches whenever there are fewer papers than threads.
    one_thread_papers = max(1, (length + threads_num - 1) // threads_num)
    threads = []
    for start in range(0, length, one_thread_papers):
        end = start + one_thread_papers
        t = threading.Thread(
            target=download_func,
            args=(ctype, year, paper_wrapper[start:end], localDir))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('all downloaded finished.')

效果图(下载ing):

多线程下载CVPR ICCV IJCAI论文_第1张图片

多快乐啊2333

你可能感兴趣的:(论文学习)