刚接触Python一周时间,写了一个CNKI爬虫,可爬取论文题目、作者、期刊名称、摘要等。
如果安装有mysql数据库,可将爬取记录保存至数据库中。
零基础纯小白一个,代码贴出来主要是为了交流学习。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# CNKI crawler -- version 4.0
# Searches a single keyword, prints title / author / journal / abstract for
# each result, follows "next page" links, and can optionally persist the
# records into a MySQL table (the DB code is left commented out).

import requests
import mysql.connector
from lxml import html

KeyWords = '人工智能'  # search keyword
MaxPage = 1            # number of result pages to crawl
URL = 'https://www.cn-ki.net/'
Num_Paper = 0          # running count of papers exported

data = {
    'keyword': KeyWords,
    'db': 'SCDB'
}


def get_html(url, para_data=None, timeout=30):
    """GET *url* with optional query parameters and return the Response.

    para_data: dict of query-string parameters, or None for a plain GET.
    timeout:   seconds before the request is aborted -- without this the
               crawler would hang forever on a stalled server.
    """
    return requests.get(url, params=para_data, timeout=timeout)


# Connect to the MySQL database (enable when a local MySQL is available)
# conn = mysql.connector.connect(user='root', password='root', database='db_mycnki')
# cur = conn.cursor()

# Submit the search form once to obtain the first result-page URL.
content = get_html(URL + 'search', data)
page_url = content.url

page_ii = 0
while page_ii < MaxPage:
    content = get_html(page_url)
    tree = html.fromstring(content.text)

    # One <div> per search hit on the result page.
    e1 = tree.xpath('//div[@class="mdui-col-xs-12 mdui-col-md-9 mdui-typo"]')
    for ei in e1:
        # xpath() returns a list of text fragments; join them into one string.
        title = ''.join(ei.xpath('h3/a/text()'))
        author = ''.join(ei.xpath('div[1]/span[1]/text()'))
        JnName = ''.join(ei.xpath('div[1]/span[3]/text()'))
        JnVol = ''.join(ei.xpath('div[1]/span[4]/text()'))
        JnType = ''.join(ei.xpath('div[1]/span[5]/text()'))

        # Detail-page link may be site-relative; prefix the base URL if so.
        href = ''.join(ei.xpath('h3/a/@href'))
        if not href.startswith(URL):
            href = URL + href

        # Fetch the detail page to get the full abstract (the snippet on the
        # result page is truncated -- this selector fixed that).
        Jn_content = get_html(href)
        Jn_tree = html.fromstring(Jn_content.text)
        e2_abstract = Jn_tree.xpath(
            '//div[@class="mdui-col-xs-12 mdui-text-color-black-text mdui-typo"]/p/text()')
        abstract = (''.join(e2_abstract)).strip()

        print('********************' + href)
        print('title: >>>>>>>>>>> %s' % title)
        print('href: ----------- %s' % href)
        print('author: >>>>>>>>>>> %s' % author)
        print('JnName: ---- %s ---- %s ---- %s' % (JnName, JnVol, JnType))
        print('>>>>>>>>>>> Abstract <<<<<<<<<<<')
        print('%s' % abstract)

        # Persist the record (parameterized query -- safe against SQL injection)
        # cur.execute('insert into tb_mycnki (id, title, author, JnName, JnVol, JnType, Abstract, href) '
        #             'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
        #             [Num_Paper, title, author, JnName, JnVol, JnType, abstract, href])
        Num_Paper += 1

    page_ii += 1

    # The last pager link is "下一页" (next page) when more results exist.
    nextpg = tree.xpath('//div[@class="mdui-col-xs-9 TitleLeftCell mdui-valign"]/a[last()]')
    if not nextpg:
        break
    if nextpg[0].text == '下一页':
        page_url = nextpg[0].attrib['href']
    else:
        break
    if not page_url.startswith(URL):
        page_url = URL + page_url

print('****************************************************************************')
print('>>>>>>>>>>>>>>>> 数据导出结束,共导出 %d 篇文献!<<<<<<<<<<<<<<<<<<<<<<<<<<' % Num_Paper)

# Flush and close the database connection
# conn.commit()
# cur.close()