CNKI Web Crawler

I've only been learning Python for about a week, and I wrote a CNKI crawler that can scrape paper titles, authors, journal names, abstracts, and so on.

If you have a MySQL database installed, the scraped records can also be saved into it.

I'm a complete beginner, so the code is posted mainly for discussion and learning.
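The script needs the third-party packages requests, lxml and mysql-connector-python, all installable with pip; the MySQL package is imported even when the database code stays commented out.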

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# CNKI crawler -- version 4.0: searches CNKI and prints the title, authors, journal and abstract
# of each result, with pagination; fixes the truncated-abstract issue; can store the records
# in a MySQL database.


import requests
import mysql.connector
from lxml import html


KeyWords = '人工智能'       # search keyword (here: "artificial intelligence")
MaxPage = 1                 # number of result pages to crawl
URL = 'https://www.cn-ki.net/'

Num_Paper = 0               # running count of scraped papers

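# Query parameters for the CNKI search request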
data = {
    'keyword': KeyWords,
    'db': 'SCDB'
}


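# Fetch a page; para_data is passed to requests as query-string parameters (may be empty)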
def get_html(url, para_data):
    content = requests.get(url, params=para_data)
    return content

# Connect to the MySQL database (uncomment to enable storage)
# conn = mysql.connector.connect(user='root', password='root', database='db_mycnki')
# cur = conn.cursor()

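# Submit the search and keep the URL of the first results page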
content = get_html(URL+'search', data)
page_url = content.url

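# Walk through up to MaxPage result pages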
page_ii = 0
while page_ii < MaxPage:
    content = get_html(page_url, '')
    tree = html.fromstring(content.text)

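    # Each search result on the page sits in one of these divs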
    e1 = tree.xpath('//div[@class="mdui-col-xs-12 mdui-col-md-9 mdui-typo"]')
    for ei in e1:
        e2_title = ei.xpath('h3/a/text()')
        title = ''.join(e2_title)

        e2_author = ei.xpath('div[1]/span[1]/text()')
        author = ''.join(e2_author)
        e2_JnName = ei.xpath('div[1]/span[3]/text()')
        JnName = ''.join(e2_JnName)
        e2_JnVol = ei.xpath('div[1]/span[4]/text()')
        JnVol = ''.join(e2_JnVol)
        e2_JnType = ei.xpath('div[1]/span[5]/text()')
        JnType = ''.join(e2_JnType)

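        # Build an absolute link to the paper's detail page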
        e2_href = ei.xpath('h3/a/@href')
        href = ''.join(e2_href)
        if URL not in href:
            href = URL + href

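        # Fetch the detail page, which carries the full abstract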
        Jn_content = get_html(href, '')
        Jn_tree = html.fromstring(Jn_content.text)

        # Previous XPath, which missed part of the abstract:
        # e2_abstract = Jn_tree.xpath('//div[@class="mdui-col-md-11 mdui-col-xs-9 mdui-text-color-black-text"]/p/text()')
        # Updated XPath:
        e2_abstract = Jn_tree.xpath('//div[@class="mdui-col-xs-12 mdui-text-color-black-text mdui-typo"]/p/text()')

        abstract = (''.join(e2_abstract)).strip()

        print('********************' + href)

        print('title: >>>>>>>>>>> %s' % title)
        print('href: ----------- %s' % href)
        print('author: >>>>>>>>>>> %s' % author)
        print('JnName: ---- %s ---- %s ---- %s' % (JnName, JnVol, JnType))
        print('>>>>>>>>>>> Abstract <<<<<<<<<<<')
        print('%s' % abstract)

        # Save this record to the database (uncomment to enable)
        # cur.execute('insert into tb_mycnki (id, title, author, JnName, JnVol, JnType, Abstract, href) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
        #             [Num_Paper, title, author, JnName, JnVol, JnType, abstract, href])

        Num_Paper += 1

    page_ii += 1

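    # Look for the '下一页' (next page) link; follow it if present, otherwise stop.
    # The link may be relative, so prefix the site URL when necessary.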
    nextpg = tree.xpath('//div[@class="mdui-col-xs-9 TitleLeftCell mdui-valign"]/a[last()]')
    if len(nextpg) == 0:
        break
    nextpg_text = nextpg[0].text
    if nextpg_text == '下一页':
        page_url = nextpg[0].attrib['href']
    else:
        break

    if URL not in page_url:
        page_url = URL + page_url

print('****************************************************************************')
print('>>>>>>>>>>>>>>>>  Export finished, %d papers scraped in total! <<<<<<<<<<<<<<<<' % Num_Paper)

# Commit the inserts and close the database connection
# conn.commit()
# cur.close()
# conn.close()
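
For the commented-out storage code to work, there has to be a database db_mycnki containing a table tb_mycnki whose columns match the INSERT statement above. As a rough sketch (the column types and lengths below are my own guesses, not part of the original script), the table could be created like this:

import mysql.connector

conn = mysql.connector.connect(user='root', password='root', database='db_mycnki')
cur = conn.cursor()
# Columns mirror the INSERT used by the crawler; adjust types and lengths as needed.
cur.execute('''
    CREATE TABLE IF NOT EXISTS tb_mycnki (
        id       INT PRIMARY KEY,
        title    VARCHAR(512),
        author   VARCHAR(512),
        JnName   VARCHAR(256),
        JnVol    VARCHAR(128),
        JnType   VARCHAR(128),
        Abstract TEXT,
        href     VARCHAR(512)
    ) DEFAULT CHARSET=utf8mb4
''')
conn.commit()
cur.close()
conn.close()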

