python 批量下载知网(CNKI)论文

1、目的:
朋友找我去知网帮他下载一些相关论文,我发现每篇都要先点开文章、再点击下载,十分麻烦;百度了一下,别人的方法又太复杂,所以自己写了一个 Python 脚本来自动下载知网论文。
2、前期准备
1)安装python 2.7
2)安装 selenium

pip install selenium

3)下载一个chromedriver.exe,放到脚本同一个文件夹内
4)安装chrome浏览器
3、直接撸代码
python 批量下载知网(CNKI)论文_第1张图片
(a)指定关键字下载知网论文

downloadCNKI.py
#!/usr/bin/env python
# coding=utf-8
import  os
from time import sleep
from selenium import webdriver

def browser_init(isWait):
    """Start a Chrome session configured for this script's downloads.

    Chrome is driven through ``chromedriver.exe`` (resolved relative to the
    working directory) and its default download directory is set to the
    hard-coded ``output`` folder below.

    :param isWait: when truthy, element lookups get a 50-second implicit
        wait; otherwise no implicit wait is configured.
    :return: the newly created ``webdriver.Chrome`` instance.
    """
    download_prefs = {
        # NOTE(review): presumably suppresses the download pop-up - confirm.
        'profile.default_content_settings.popups': 0,
        'download.default_directory': 'E:\\PycharmProjects\\downloadCNKI\\output',
    }
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_experimental_option('prefs', download_prefs)

    chrome = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_opts)
    chrome.set_window_size(500,500)
    if not isWait:
        return chrome
    chrome.implicitly_wait(50)
    return chrome

def searchKey(keyword, driver=None):
    """Open the CNKI search page and submit a search for ``keyword``.

    :param keyword: text typed into the search box (element id
        ``txt_1_value1``) before the ``btnSearch`` button is clicked.
    :param driver: WebDriver to operate on.  When omitted, the module-level
        ``browser`` created by the ``__main__`` section is used, so existing
        ``searchKey(keyword)`` calls keep working.
    """
    if driver is None:
        # Legacy call style: fall back to the script's global session.
        driver = browser
    driver.get("http://kns.cnki.net/kns/brief/default_result.aspx")
    driver.find_element_by_id('txt_1_value1').send_keys(keyword)
    driver.find_element_by_id('btnSearch').click()

def switchToFrame(browser):
    """Move the driver's focus into the frame named ``iframeResult``.

    :param browser: WebDriver whose current page contains that frame.
    """
    result_frame = 'iframeResult'
    browser.switch_to.frame(result_frame)

def getDownloadLinks(browser,paper_downloadLinks):
    """Collect a detail-page URL for every result link on the current page.

    Each anchor whose ``href`` starts with ``/kns/detail`` is rewritten: the
    4th to 6th ``&``-separated pieces of its href are kept and appended to the
    KCMS detail URL.
    NOTE(review): those pieces are presumably the file/database identifiers -
    confirm against live links.

    :param browser: WebDriver currently focused on the result list.
    :param paper_downloadLinks: list extended in place with the new URLs.
    """
    detail_prefix = 'http://kns.cnki.net/KCMS/detail/detail.aspx?'
    # Raw string: the backslashes are CSS escapes for "/" and must reach
    # the selector engine unchanged.
    anchors = browser.find_elements_by_css_selector(r'a[href^=\/kns\/detail]')
    paper_downloadLinks.extend(
        detail_prefix + '&'.join(anchor.get_attribute('href').split('&')[3:6])
        for anchor in anchors
    )

def switchToPage(browser,n):
    for link in browser.find_elements_by_css_selector('a[href^=\?curpage]'):
        url=link.get_attribute('href')
        print url
        pageInd='curpage=%d&'%n
        print pageInd
        if pageInd in url:
            print "page: "+url
            link.click()
            break
def switchNextPage(browser):
    """Click the "next page" link, located by its Chinese link text.

    :param browser: WebDriver currently focused on the result list.
    """
    next_link = browser.find_element_by_link_text(u'下一页')
    next_link.click()

def do_download(driver,urls,fail_downLoadUrl):
    for url in urls:
        print url
        driver.get(url)
        paper_title=driver.title
        print "paper title"+paper_title
        if u'中国专利全文数据库' in paper_title:
            continue
        print "try  download :"+paper_title
        try:
            driver.find_element_by_xpath("//a[contains(text(),'PDF下载')]").click()
            print "download success!!!"
        except Exception as e:
            try:
                driver.find_element_by_xpath("//a[contains(text(),'整本下载')]").click()
                print "download success!!!"
            except Exception as e:
                print "download fail!!!"
                fail_downLoadUrl.append(url)

def usage():
    print "example : python downloadCNKI.py -k keyword  -p 1"

if __name__=="__main__":

    keyword=u'三角形'      #论文搜索的关键字
    pageNum = 1     # 下载多少页的论文

    browser=browser_init(True)
    searchKey(keyword)
    switchToFrame(browser)
    paper_downloadLinks = []    #论文下载链接

    curPage=1
    while curPage<=pageNum:
        getDownloadLinks(browser,paper_downloadLinks)

        switchNextPage(browser);
        curPage+=1
    browser.quit()
    print "采集了%d条数据"% len(paper_downloadLinks)
    driver=browser_init(False)
    fail_downLoadUrl=[]         #记录下失败的网站
    do_download(driver,paper_downloadLinks,fail_downLoadUrl)
    print  fail_downLoadUrl
    tryNum=0
    #尝试N次重新下载没有下载的
    while tryNum<5:
        if len(fail_downLoadUrl) !=0:
            paper_downloadLinks=fail_downLoadUrl
            fail_downLoadUrl=[]
            do_download(driver, paper_downloadLinks, fail_downLoadUrl)
            print fail_downLoadUrl
        else:
            break
        tryNum+=1
    sleep(60)
    driver.quit()

(b)指定论文题目下载知网论文
该脚本需要在同目录下新建一个 downfile.txt,每行存放一个需要下载的论文题目(脚本按 GBK 编码读取该文件,并按题目文字精确匹配搜索结果中的链接)。

指定题目到downfile.txt的知网下载.py
#!/usr/bin/env python
# coding=utf-8
import  os
from time import sleep
from selenium import webdriver

def browser_init(isWait):
    """Start a Chrome session configured for this script's downloads.

    Chrome is driven through ``chromedriver.exe`` (resolved relative to the
    working directory) and its default download directory is set to the
    hard-coded ``output`` folder below.

    :param isWait: when truthy, element lookups get a 50-second implicit
        wait; otherwise no implicit wait is configured.
    :return: the newly created ``webdriver.Chrome`` instance.
    """
    download_prefs = {
        # NOTE(review): presumably suppresses the download pop-up - confirm.
        'profile.default_content_settings.popups': 0,
        'download.default_directory': 'E:\\PycharmProjects\\downloadCNKI\\output',
    }
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_experimental_option('prefs', download_prefs)

    chrome = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_opts)
    chrome.set_window_size(500,500)
    if not isWait:
        return chrome
    chrome.implicitly_wait(50)
    return chrome

def searchKey(keyword, driver=None):
    """Open the CNKI search page and submit a search for ``keyword``.

    :param keyword: text typed into the search box (element id
        ``txt_1_value1``) before the ``btnSearch`` button is clicked.
    :param driver: WebDriver to operate on.  When omitted, the module-level
        ``browser`` created by the ``__main__`` section is used, so existing
        ``searchKey(keyword)`` calls keep working.
    """
    if driver is None:
        # Legacy call style: fall back to the script's global session.
        driver = browser
    driver.get("http://kns.cnki.net/kns/brief/default_result.aspx")
    driver.find_element_by_id('txt_1_value1').send_keys(keyword)
    driver.find_element_by_id('btnSearch').click()

def switchToFrame(browser):
    """Move the driver's focus into the frame named ``iframeResult``.

    :param browser: WebDriver whose current page contains that frame.
    """
    result_frame = 'iframeResult'
    browser.switch_to.frame(result_frame)

def getDownloadLinks(browser,paper_downloadLinks):
    """Collect a detail-page URL for every result link on the current page.

    Each anchor whose ``href`` starts with ``/kns/detail`` is rewritten: the
    4th to 6th ``&``-separated pieces of its href are kept and appended to the
    KCMS detail URL.
    NOTE(review): those pieces are presumably the file/database identifiers -
    confirm against live links.

    :param browser: WebDriver currently focused on the result list.
    :param paper_downloadLinks: list extended in place with the new URLs.
    """
    detail_prefix = 'http://kns.cnki.net/KCMS/detail/detail.aspx?'
    # Raw string: the backslashes are CSS escapes for "/" and must reach
    # the selector engine unchanged.
    anchors = browser.find_elements_by_css_selector(r'a[href^=\/kns\/detail]')
    paper_downloadLinks.extend(
        detail_prefix + '&'.join(anchor.get_attribute('href').split('&')[3:6])
        for anchor in anchors
    )

def getKeywordDownloadLink(browser,keyword,paper_downloadLinks):
    """Append the detail-page URL of the result whose link text is ``keyword``.

    The link is located by exact link text; the 4th to 6th ``&``-separated
    pieces of its href are kept and appended to the KCMS detail URL.

    :param browser: WebDriver currently focused on the result list.
    :param keyword: link text to look for.
    :param paper_downloadLinks: list that receives the URL in place.
    """
    href = browser.find_element_by_link_text(keyword).get_attribute('href')
    query = '&'.join(href.split('&')[3:6])
    paper_downloadLinks.append('http://kns.cnki.net/KCMS/detail/detail.aspx?' + query)


def switchToPage(browser,n):
    for link in browser.find_elements_by_css_selector('a[href^=\?curpage]'):
        url=link.get_attribute('href')
        print url
        pageInd='curpage=%d&'%n
        print pageInd
        if pageInd in url:
            print "page: "+url
            link.click()
            break
def switchNextPage(browser):
    """Click the "next page" link, located by its Chinese link text.

    :param browser: WebDriver currently focused on the result list.
    """
    next_link = browser.find_element_by_link_text(u'下一页')
    next_link.click()

def do_download(driver,urls,fail_downLoadUrl):
    for url in urls:
        print url
        driver.get(url)
        paper_title=driver.title
        print "paper title"+paper_title
        if u'数据库' in paper_title:
            continue
        print "try  download :"+paper_title
        try:
            driver.find_element_by_xpath("//a[contains(text(),'PDF下载')]").click()
            print "download success!!!"
        except Exception as e:
            try:
                driver.find_element_by_xpath("//a[contains(text(),'整本下载')]").click()
                print "download success!!!"
            except Exception as e:
                print "download fail!!!"
                fail_downLoadUrl.append(url)

def usage():
    print "example : python downloadCNKI.py -k keyword  -p 1"

if __name__=="__main__":

    paper_downloadLinks = []  # 论文下载链接
    pageNum = 1  # 下载多少页的论文
    browser = browser_init(True)

    file = open("downfile.txt")
    lineDatas = file.readlines();
    for line in lineDatas:
        keyword=line.strip('\n').decode('gbk')
        #keyword=u'三角形'      #论文搜索的关键字
        print u"采集: %s"% keyword
        searchKey(keyword)
        switchToFrame(browser)
        downloadLinks=[]
        getKeywordDownloadLink(browser,keyword,downloadLinks)

        paper_downloadLinks.append(''.join(downloadLinks))
    file.close()
    browser.quit()


    print "采集了%d条数据"% len(paper_downloadLinks)

    driver=browser_init(False)
    fail_downLoadUrl=[]         #记录下失败的网站
    do_download(driver,paper_downloadLinks,fail_downLoadUrl)
    print  fail_downLoadUrl
    tryNum=0
    #尝试N次重新下载没有下载的
    while tryNum<5:
        if len(fail_downLoadUrl) !=0:
            paper_downloadLinks=fail_downLoadUrl
            fail_downLoadUrl=[]
            do_download(driver, paper_downloadLinks, fail_downLoadUrl)
            print "重新下载 ",
            print  fail_downLoadUrl
        else:
            break
        tryNum+=1
    sleep(60)
    driver.quit()

很好用,可以帮同学批量下载知网论文,妈妈再也不用担心我点错了……

你可能感兴趣的:(python)