日志

今天中国知网又加强了反爬措施,所以我对爬虫脚本做了一次升级,代码如下:

__author__ = 'TengYu'
import io
import json
import re
import sys
import time

import requests
import xlwt
from lxml import etree
# Python 2 only: force the implicit str/unicode conversion codec to UTF-8.
# On Python 3 `reload` is not a builtin and the default encoding is already
# UTF-8, so the hack is skipped there instead of failing with NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')

# HTTP headers attached to every request the crawler sends.
# NOTE: 'Your-cookie' is a placeholder value as written here.
headers = dict([
    ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0'),
    ('Cookie', 'Your-cookie'),
])


class PageList(object):
    """Crawl CNKI thesis result pages and export the records to disk.

    Result pages ``firstpage`` .. ``maxpage - 1`` are fetched one after
    another.  For every thesis row on a result page the linked detail page
    is fetched too, and one spreadsheet row is written to ``filename.xls``.
    Titles, keywords and subtitles are additionally written, one per line,
    to ``title_4_4.txt``, ``keywords_4_4.txt`` and ``subtitle.txt``.
    """

    # Spreadsheet layout: six fixed columns, then the keyword columns
    # (6..17), then the subtitle columns (18..24).
    _FIXED_HEADERS = (
        '论文中文题目',
        '论文作者',
        '作者学校',
        '学位授予年度',
        '下载频次',
        '引用频次',
    )
    _KEYWORD_COL = 6
    _SUBTITLE_COL = 18
    _END_COL = 25

    # Seconds to wait for the server before giving up on a request.
    _TIMEOUT = 30

    def __init__(self, maxpage, firstpage=200):
        """Store the range of result pages to crawl.

        Args:
            maxpage: Exclusive upper bound of the result-page numbers.
            firstpage: First result page to fetch.  Defaults to 200, the
                start page that used to be hard-coded in ``getUrl``.
        """
        self.maxpage = maxpage
        self.firstpage = firstpage

    @staticmethod
    def _text(node):
        """Return the concatenated text content of an lxml element."""
        return node.xpath("string(.)")

    @staticmethod
    def _clean_count(text):
        """Strip all whitespace from a count cell; blank cells become "0"."""
        return "".join(text.split()) or "0"

    @staticmethod
    def _split_subtitles(detailinfo):
        """Return the entries listed after the '【副题名】' label.

        The entries end at the next '【' (the start of the following
        label) or at the end of the text, and are separated by whitespace
        or commas.  An empty list is returned when the label is absent.
        """
        marker = u'【副题名】'
        start = detailinfo.find(marker)
        if start < 0:
            return []
        rest = detailinfo[start + len(marker):]
        end = rest.find(u'【')
        if end >= 0:
            rest = rest[:end]
        return [part for part in re.split(r"[\s,]+", rest) if part]

    def _fetch(self, url):
        """Download *url* with the module-level headers and parse the HTML."""
        response = requests.get(url, headers=headers, timeout=self._TIMEOUT)
        return etree.HTML(response.content)

    def _write_header(self, sheet):
        """Write the header row (row 0) of the spreadsheet."""
        for col, label in enumerate(self._FIXED_HEADERS):
            sheet.write(0, col, label)
        for col in range(self._KEYWORD_COL, self._SUBTITLE_COL):
            sheet.write(0, col, '关键词' + str(col - self._KEYWORD_COL))
        for col in range(self._SUBTITLE_COL, self._END_COL):
            sheet.write(0, col, '副标题' + str(col - self._SUBTITLE_COL))

    def _scrape_row(self, tr, num, sheet, f1, f2, f3):
        """Export one result-table row and its detail page.

        Args:
            tr: The ``<tr>`` element of the result table.
            num: Spreadsheet row the record is written to.
            sheet: xlwt worksheet receiving the record.
            f1: Text file receiving the thesis title.
            f2: Text file receiving the subtitles.
            f3: Text file receiving the keywords.

        Raises:
            IndexError: If the row or the detail page lacks an expected
                element.  Keyword and subtitle extraction are best-effort
                and only print their errors.
        """
        td = tr.xpath(".//td")
        # Link to the thesis detail page.
        detailurl = "http://gb.oversea.cnki.net" + td[2].xpath(".//a/@href")[0]
        print(detailurl)
        author = self._text(td[3])
        school = self._text(td[4])
        year = self._text(td[5])
        yinyong = self._clean_count(self._text(td[6]))  # citation count
        xiazai = self._clean_count(self._text(td[7]))  # download count

        sel = self._fetch(detailurl)
        # The '//' paths are document-wide; indexing [0] on the containers
        # still makes a page without them fail early.
        main = sel.xpath("//div[@id='main']")[0]
        title = main.xpath("//div[@id='title']")[0]
        st = self._text(title.xpath("//h1//span")[0])
        print(st)  # thesis title
        summary = sel.xpath(".//div[@class='summary pad10']")[0]
        detailinfo = self._text(summary)

        f1.write(st + u"\n")
        sheet.write(num, 0, st)
        sheet.write(num, 1, author)
        sheet.write(num, 2, school)
        sheet.write(num, 3, year)
        sheet.write(num, 4, xiazai)
        sheet.write(num, 5, yinyong)

        try:
            keywordsdiv = sel.xpath(".//div[@class='keywords']")[0]
            span = keywordsdiv.xpath(".//span[@id='ChDivKeyWord']")[0]
            for offset, link in enumerate(span.xpath(".//a")):
                words = self._text(link)
                print(words)
                f3.write(words + u"\n")
                col = self._KEYWORD_COL + offset
                # Extra keywords must not spill into the subtitle columns.
                if col < self._SUBTITLE_COL:
                    sheet.write(num, col, words)
        except Exception as es:
            print(es)

        try:
            for offset, subtitle in enumerate(self._split_subtitles(detailinfo)):
                f2.write(subtitle + u"\n")
                col = self._SUBTITLE_COL + offset
                if col < self._END_COL:
                    sheet.write(num, col, subtitle)
        except Exception as es:
            print(es)

    def getUrl(self):
        """Crawl the configured result pages and write all output files.

        Rows 11, 13, ..., 49 of each result page's ``<tr>`` list are
        treated as thesis rows.  The crawl stops at the first unexpected
        error, which is printed; the workbook is saved in every case so
        the records collected so far are kept.
        """
        excel = xlwt.Workbook(encoding='utf-8')
        sheet = excel.add_sheet('sheet1')
        self._write_header(sheet)
        num = 0
        try:
            with io.open("title_4_4.txt", 'w', encoding='utf-8') as f1, \
                    io.open("subtitle.txt", 'w', encoding='utf-8') as f2, \
                    io.open("keywords_4_4.txt", 'w', encoding='utf-8') as f3:
                for page in range(self.firstpage, self.maxpage):
                    tempurl = "http://gb.oversea.cnki.net/kns55/brief/brief.aspx?curpage=" + str(
                        page) + "&RecordsPerPage=20&QueryID=2&ID=&turnpage=1&tpagemode=L&dbPrefix=CMFD&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&sKuaKuID=2"
                    trs = self._fetch(tempurl).xpath("//tr")
                    print("已经抓取" + str(num) + "条数据")
                    for row in range(11, 51, 2):
                        num += 1
                        self._scrape_row(trs[row], num, sheet, f1, f2, f3)
                        time.sleep(1)  # pause between detail-page requests
                    time.sleep(5)  # longer pause between result pages
        except Exception as es:
            print(es)
        finally:
            excel.save("filename.xls")


# Run the crawl only when executed as a script, so that importing this
# module does not immediately start issuing network requests.
if __name__ == "__main__":
    pagelist = PageList(270)
    pagelist.getUrl()

你可能感兴趣的:(日志)