Scraping CNKI (China Masters' Theses Full-text Database) with Python

A team assignment these past few days required me to write a scraper for CNKI's China Masters' Theses Full-text Database (entry point: http://gb.oversea.cnki.net/kns55/brief/result.aspx?dbPrefix=CMFD). The goal is to crawl each thesis's author, subtitle, degree-award year, school, citation count, and so on. For example, searching for the keyword 医疗保险 (medical insurance):

Everything in the results table can be scraped, and after clicking through to an article its keywords and subtitle can be scraped as well. I did not need to download the full texts this time, so there is no code for that, but the pages are parsed the same way, and you can hunt down the download URL following the same approach used for the keywords. Access to this content does not seem to be throttled very strictly, so I barely bothered with sleep intervals. Here is the code:

[Figure 1: CNKI search results for the keyword 医疗保险]

# -*- coding:utf-8 -*-
# Python 2 script: the reload/setdefaultencoding hack below exists only in Python 2.
__author__ = 'TengYu'
import requests
from lxml import etree
import time
import xlwt
import sys
reload(sys)
sys.setdefaultencoding('utf8')

headers = {
    'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
    'Cookie': 'Your-cookie'  # paste the Cookie header from a logged-in browser session
}
class PageList(object):
    def __init__(self, maxpage):
        self.maxpage = maxpage

    def getUrl(self):
        f1 = open("title_4_4.txt", 'w')
        f2 = open("subtitle.txt", 'w')
        f3 = open("keywords_4_4.txt", 'w')
        excel = xlwt.Workbook(encoding='utf-8')
        sheet = excel.add_sheet('sheet1')
        # Columns 0-5: title, author, school, degree year, downloads, citations.
        sheet.write(0, 0, '论文中文题目')
        sheet.write(0, 1, '论文作者')
        sheet.write(0, 2, '作者学校')
        sheet.write(0, 3, '学位授予年度')
        sheet.write(0, 4, '下载频次')
        sheet.write(0, 5, '引用频次')
        # Columns 6-17 hold up to 12 keywords, columns 18-24 up to 7 subtitle parts.
        begin = 6
        while begin < 18:
            sheet.write(0, begin, '关键词' + str(begin - 6))
            begin += 1
        while begin < 25:
            sheet.write(0, begin, '副标题' + str(begin - 18))
            begin += 1
        firstPage = 200  # start from result page 200 (adjust to your own range)
        try:
            num = 0
            while firstPage < self.maxpage:
                # List-page URL (valid from page 2 onward); QueryID and sKuaKuID
                # can change between sessions -- see the note after the code.
                tempurl = "http://gb.oversea.cnki.net/kns55/brief/brief.aspx?curpage=" + str(firstPage) + "&RecordsPerPage=20&QueryID=2&ID=&turnpage=1&tpagemode=L&dbPrefix=CMFD&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&sKuaKuID=2"
                response = requests.get(tempurl, headers=headers).content
                selector = etree.HTML(response)
                trs = selector.xpath("//tr")
                # The 20 result rows sit at odd <tr> indices 11, 13, ..., 49.
                firstTds = 11
                print("已经抓取" + str(num) + "条数据")
                while firstTds < 51:
                    num = num + 1
                    tr = trs[firstTds]
                    td = tr.xpath(".//td")
                    titletd = td[2]
                    titlehref = titletd.xpath(".//a/@href")
                    href = str(titlehref[0])
                    detailurl = "http://gb.oversea.cnki.net" + href  # link to the detail page
                    print(detailurl)
                    authortd = td[3]   # author
                    schooltd = td[4]   # school
                    yeartd = td[5]     # degree-award year
                    yinyongtd = td[6]  # citation count
                    xiazaitd = td[7]   # download count
                    author = str(authortd.xpath("string(.)").encode('utf-8'))
                    school = str(schooltd.xpath("string(.)").encode('utf-8'))
                    year = str(yeartd.xpath("string(.)").encode('utf-8'))
                    yinyongs = str(yinyongtd.xpath("string(.)").encode('utf-8'))
                    xiazais = str(xiazaitd.xpath("string(.)").encode('utf-8'))
                    yinyong = yinyongs.strip() or "0"  # an empty cell means zero citations
                    xiazai = xiazais.strip() or "0"    # an empty cell means zero downloads

                    firstTds += 2
                    # Fetch the detail page for the title, keywords and subtitle.
                    detail = requests.get(detailurl, headers=headers).content
                    sel = etree.HTML(detail)
                    title = sel.xpath("//div[@id='main']//div[@id='title']")[0]
                    span = title.xpath(".//h1//span")[0]
                    st = str(span.xpath("string(.)").encode('utf-8'))
                    print(st)  # thesis title
                    summary = sel.xpath(".//div[@class='summary pad10']")[0]
                    # Keep the summary text as unicode so the subtitle parsing
                    # below can compare individual characters safely.
                    detailinfo = summary.xpath("string(.)")
                    f1.write(st+"\n")
                    sheet.write(num, 0, st)
                    sheet.write(num, 1, author)
                    sheet.write(num, 2, school)
                    sheet.write(num, 3, year)
                    sheet.write(num, 4, xiazai)
                    sheet.write(num, 5, yinyong)
                    try:
                        # Keywords are the links inside span#ChDivKeyWord.
                        keywordsdiv = sel.xpath(".//div[@class='keywords']")[0]
                        span = keywordsdiv.xpath(".//span[@id='ChDivKeyWord']")[0]
                        hrefs = span.xpath(".//a")
                        first = 6
                        for a in hrefs:
                            words = str(a.xpath("string(.)").encode('utf-8'))
                            print(words)
                            f3.write(words + "\n")
                            sheet.write(num, first, words)
                            first += 1
                    except Exception as es:
                        print(es)
                    try:
                        # The subtitle appears in the summary text as
                        # "【副题名】...【next field】"; pieces are separated
                        # by spaces or commas.
                        marker = u'【副题名】'
                        if marker in detailinfo:
                            loc = detailinfo.find(marker) + len(marker)
                            then = 18
                            while loc < len(detailinfo) and detailinfo[loc] != u'【':
                                subtitle = u""
                                while loc < len(detailinfo) and detailinfo[loc] not in (u' ', u',', u'【'):
                                    subtitle += detailinfo[loc]
                                    loc += 1
                                if subtitle:
                                    sheet.write(num, then, subtitle)
                                    f2.write(subtitle + "\n")
                                    then += 1
                                if loc < len(detailinfo) and detailinfo[loc] != u'【':
                                    loc += 1  # step past the separator
                    except Exception as es:
                        print(es)
                    time.sleep(1)
                firstPage += 1
                time.sleep(5)
        except Exception as es:
            print(es)
        excel.save("filename.xls")
        f1.close()
        f2.close()
        f3.close()

pagelist = PageList(270)
pagelist.getUrl()
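
The script makes two HTTP requests per row with no error handling beyond the outer try, so a single flaky response can end the whole run. If you want some insurance, a small retry wrapper around requests.get helps; this is a minimal sketch (the fetch helper, its retry count, and the timeout are my own additions, not part of the original script), and swapping the requests.get(...) calls in getUrl for fetch(...) leaves the rest of the logic unchanged:

import time
import requests

def fetch(url, headers, retries=3, timeout=10):
    # Retry a GET a few times with a short backoff so a single network
    # hiccup does not abort the whole crawl.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            return resp.content
        except requests.RequestException as e:
            print("attempt %d failed: %s" % (attempt + 1, e))
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s
    raise RuntimeError("all retries failed for " + url)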

From a quick look, the first page of search results seemed to use a different format from the later pages, so I started crawling from page 2; interested readers can handle page 1 themselves (it may well be the same, I just didn't check carefully, since the task was assigned in the morning with the data due by 4 p.m.). After you run a search, the URL the code actually requests (from page 2 onward) is "http://gb.oversea.cnki.net/kns55/brief/brief.aspx?curpage=3&RecordsPerPage=20&QueryID=3&ID=&turnpage=1&tpagemode=L&dbPrefix=CMFD&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&sKuaKuID=3". Working out which request returns this data is straightforward, so I won't cover it here.

One caution: my computer went offline partway through, and afterwards nothing would crawl at all. I first blamed the code, but it turned out that QueryID and sKuaKuID in the URL above had changed from 3 to 2. If this happens to you, be sure to go back to the request that fetches the data, look up the current values, and correct them; the sketch below keeps the two IDs in one place so they are easy to update.

The overall process is not hard to follow. As for locating the keywords and the like on the detail page, I suggest opening the page source of a thesis that has every field populated (keywords, subtitle, and so on), copying it into an HTML development tool, and inspecting the tags; I won't belabor it here.
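As a convenience, the list-page URL can be assembled from a parameter dict instead of one long string, so QueryID and sKuaKuID live in a single spot. A minimal sketch (the parameter names come straight from the URL above; only the helper name build_list_url is mine):

try:
    from urllib import urlencode        # Python 2
except ImportError:
    from urllib.parse import urlencode  # Python 3

BASE = "http://gb.oversea.cnki.net/kns55/brief/brief.aspx"

def build_list_url(curpage, query_id=2, kuaku_id=2):
    # Everything except curpage, QueryID and sKuaKuID stays fixed; if the
    # crawl suddenly returns nothing, re-check the two IDs in the browser's
    # network panel and pass the new values here.
    params = {
        "curpage": curpage,
        "RecordsPerPage": 20,
        "QueryID": query_id,
        "ID": "",
        "turnpage": 1,
        "tpagemode": "L",
        "dbPrefix": "CMFD",
        "Fields": "",
        "DisplayMode": "listmode",
        "PageName": "ASP.brief_result_aspx",
        "sKuaKuID": kuaku_id,
    }
    return BASE + "?" + urlencode(params)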

Please respect the original work. When reposting, credit the source: https://blog.csdn.net/kr2563
