NIPS 全文下载

有些收集癖,所以这次打算把NIPS上历年的论文都下载下来。尝试通过python直接下载,不过发现很慢,所以想到,那就直接先爬取所有的下载链接,分好文件夹,然后再手动对应不同的年份将其复制到迅雷中下载(目前一共开了29期,所以手动工作量还好)。

代码及解释如下:

# -*- coding: utf-8 -*-
"""
Created on Sat Sep  9 19:10:39 2017

@author: shouhuxianjian
"""
'''导入包'''
import os
import re
#import wget
import os.path as osp
import requests
from bs4 import BeautifulSoup as bs
url0 = 'http://papers.nips.cc/'
# Download the site's front page.  The timeout keeps the script from hanging
# forever on a stalled connection (the per-paper request in the main loop
# already uses the same 30 s limit), and raise_for_status() aborts on an HTTP
# error instead of silently parsing an error page into an empty list.
resp0 = requests.get(url = url0, timeout = 30)
resp0.raise_for_status()
html0 = bs(resp0.text,'html5lib')

gResDir = r'e:\NIPS'  # root directory under which all results are stored
NIPSTimes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
             11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
             21, 22, 23, 24, 25, 26, 27, 28, 29]  # not used by this script
# Collect one (title, relative link) pair per conference volume: every <li>
# whose text matches 'Advances' is taken, together with its first link.
books_hrefs = [(li.text,li.find('a')['href']) for li in
                 html0.find_all('li',text=re.compile('Advances'))]
# Turn each title into a folder name (spaces -> underscores) and each
# relative link into an absolute URL.
books_hrefs = [(book.replace(' ','_'), url0.rstrip('/')+href ) for book,href in
                books_hrefs]
def makedirs(indir):
    """Create directory ``indir`` and any missing parents, best-effort.

    An already existing directory is not an error.  Any other OS-level
    failure (for example a regular file occupying part of the path, or
    missing permissions) is reported on stdout and otherwise ignored, so the
    caller is never interrupted.

    Args:
        indir: path of the directory to create.

    Returns:
        None.
    """
    try:
        # exist_ok=True makes "directory already there" a normal outcome
        # instead of an exception that has to be swallowed.
        os.makedirs(indir, exist_ok=True)
    except OSError as e:
        # Stay best-effort, but do not hide the reason for the failure.
        print('could not create directory [{}]: {}'.format(indir, e))
# Create one output folder per conference volume.  A plain loop is used
# because the calls are made purely for their side effect; building a
# throwaway list of None values served no purpose.
for indir,_ in books_hrefs:
    makedirs(osp.join(gResDir,indir))

# Characters that get replaced by a space in paper titles; the original
# author lists them as the characters Windows rejects in folder names.
# Written as a raw string: the compiled pattern is unchanged, but '\*' inside
# a normal string literal is an invalid escape sequence that newer Python
# versions warn about.
invalidChars = re.compile(r'[;?\*|"<>:/]')

# Zero-based index of the first conference the main loop processes; every
# conference before it is skipped.
# NOTE(review): 14 looks like a leftover from resuming an interrupted run --
# set it to 0 to process every conference.
cur = 14
# Walk every conference volume, collect the links of its papers, and record
# the PDF / supplemental download URLs in per-conference text files:
#   urls.txt           - one download URL per line
#   book.txt           - titles of the papers already handled (resume marker)
#   cannotDownload.txt - titles of the papers that expose no [PDF] link
baseUrl = url0.rstrip('/')  # site root without trailing slash; hrefs are appended to it
for indb,(book,href) in enumerate(books_hrefs):

    # Skip the conferences located before the configured starting index.
    if indb < cur: continue
    # Same 30 s timeout as the per-paper request below, so a stalled
    # connection cannot hang the run.  raise_for_status() stops on an HTTP
    # error rather than silently treating the conference as having no papers.
    resp1 = requests.get(url = href, timeout = 30)
    resp1.raise_for_status()
    html1 = bs(resp1.text,'html5lib')

    # Keep only the list items whose first link points at a paper page.
    # Items without a link (or without an href) are skipped instead of
    # crashing the run.  The title is sanitised and stripped: the resume file
    # is read back with strip(), so an unstripped title could never match its
    # own record and would be fetched again on every run.
    papers_hrefs = []
    for li in html1.find_all('li'):
        anchor = li.find('a')
        if anchor is None or '/paper/' not in anchor.get('href', ''):
            continue
        papers_hrefs.append((invalidChars.sub(' ',li.text).strip(),
                             baseUrl+anchor['href']))
    resPath = osp.join(gResDir,book)
    bookFile = osp.join(resPath,'book.txt')
    urlsFile = osp.join(resPath,'urls.txt')

    # Resume support: load the titles already handled for this conference so
    # they are not fetched again.  The file is opened in a with-block so the
    # handle is closed deterministically.
    papersDi = set()
    if osp.exists(bookFile):
        with open(bookFile,encoding = 'utf-8') as fr:
            papersDi = {line.strip() for line in fr}

    # Visit every paper page and store its PDF and supplemental links.
    for ind,(paper,hrefPapers) in enumerate(papers_hrefs):

        if paper in papersDi: continue
        print('book:[{}/{}]   [{}]'.format(indb+1,len(books_hrefs),book))
        print('paper:[{}/{}]  [{}]'.format(ind+1,len(papers_hrefs),paper))
        print('='*50)

        # 30 s timeout.  A timeout exception is deliberately left unhandled:
        # the run stops and can simply be restarted, already handled papers
        # being skipped through book.txt.
        html2 = requests.get(url = hrefPapers,timeout = 30)
        html2 = bs(html2.text,'html5lib')

        # Papers whose page has no usable [PDF] link are logged to
        # cannotDownload.txt and skipped.  They are not added to book.txt.
        pdfLink = html2.find('a',text = '[PDF]')
        if pdfLink is None or not pdfLink.has_attr('href'):
            with open(osp.join(resPath,'cannotDownload.txt'),'a',encoding = 'utf-8') as fa:
                fa.write(paper+'\n')
            continue
        hrefPDF = pdfLink['href']

        # Not every paper has supplemental material.
        supplemental = html2.find('a',text = '[Supplemental]')
        hrefSupplemental = supplemental.get('href','') if supplemental is not None else ''

        # Downloading the files directly from Python (wget / requests) proved
        # too slow, so only the URLs are collected here and handed to an
        # external download manager afterwards.
        with open(urlsFile,'a',encoding = 'utf-8') as fw:
            fw.write(baseUrl+hrefPDF+'\n')
            if hrefSupplemental:
                fw.write(baseUrl+hrefSupplemental+'\n')

        # Written last: an entry in book.txt means every step above finished
        # for this paper, which is what makes resuming safe.
        with open(bookFile,'a',encoding = 'utf-8') as fa:
            fa.write(paper+'\n')

结果如下图所示:
1 - 按照会议次数创建文件夹

NIPS 全文下载_第1张图片

2 - 每次会议文件夹下生成文件
NIPS 全文下载_第2张图片

3 - urls.txt中下载链接
NIPS 全文下载_第3张图片

已经将1988-2016 共29次会议的下载链接放在这里了

你可能感兴趣的:(machine,learning)