Scraping Paper Abstracts from CNKI

First, search for relevant papers through CNKI's knowledge search site:
http://search.cnki.net/
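The q= parameter must be percent-encoded when the keyword is Chinese (the Referer header in the second script below shows what an encoded query looks like). Here is a minimal sketch of building the search URL with the standard library; the keyword 深度学习 is only a placeholder of my own:

# -*- coding: utf-8 -*-
from urllib.parse import quote

keyword = "深度学习"  # placeholder; substitute your own search term
# Same query parameters as the crawler below; quote() percent-encodes the keyword
start_url = ("http://search.cnki.net/search.aspx?q=" + quote(keyword)
             + "&rank=relevant&cluster=zyk&val=CDFDTOTAL")
print(start_url)

(If you instead pass the keyword through requests' params= argument, requests performs this encoding for you.)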

Then grab all the result links returned for that keyword:

# -*- coding: utf-8 -*-
import time
import re
import random
import requests

if __name__ == '__main__':

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # depth is the number of result pages to walk; each page holds 15 records,
    # so the p parameter below advances the record offset by 15 per page
    depth = 602
    # "搜索关键字" ("search keyword") is a placeholder; replace it with your
    # own, URL-encoded keyword (see the encoding sketch above)
    start_url = "http://search.cnki.net/search.aspx?q=搜索关键字&rank=relevant&cluster=zyk&val=CDFDTOTAL"
    for i in range(depth):
        try:
            url = start_url + "&p=" + str(15*i)

            response = requests.get(url, headers=headers)
            # print(response.text)

            # Strip whitespace so the regexes can match across line breaks;
            # one of the two space replacements in the original was presumably
            # a non-breaking space. ASCII spaces are removed too, so the
            # patterns below contain no spaces (hence "divclass", "ahref").
            page = response.text.replace('\r\n', '').replace('\t', '').replace(' ', '').replace('\xa0', '')
            # The HTML tags inside these two patterns were lost when this post
            # was archived; the tag and class names below are assumptions
            # about the search page's markup at the time.
            reStr1 = r'<divclass="wz_content">(.*?)</div>'  # one search-result block
            articleList = re.findall(reStr1, page, re.S | re.M)
            reStr2 = r'<h3>.*?<ahref="(.*?)"'  # detail-page link inside a block
            with open('url_results.txt', 'a+') as f:
                for art in articleList:
                    url_list = re.findall(reStr2, art)
                    for url in url_list:
                        f.write(url + '\n')
            print("Scraped page " + str(i) + " successfully!")
            time.sleep(random.randint(1, 3))
        except Exception:
            print("Failed to scrape page " + str(i) + "!")
            continue
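Because url_results.txt is opened in append mode ('a+'), re-running the crawler will append the same links again. A small sketch of my own that deduplicates the file in place before the second stage, preserving the original order:

# Deduplicate url_results.txt in place, keeping the original order
seen = set()
unique = []
with open('url_results.txt') as f:
    for line in f:
        link = line.strip()
        if link and link not in seen:
            seen.add(link)
            unique.append(link)
with open('url_results.txt', 'w') as f:
    f.write('\n'.join(unique) + '\n')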

The code above stores all the links in the file url_results.txt. Next, we fetch each article's details:

# -*- coding: utf-8 -*-
import time
import re
import random
import requests

# Example of a detail-page URL collected by the previous script:
# url = 'http://epub.cnki.net/grid2008/brief/detailj.aspx?filename=1016018279.nh&dbname=CDFDTEMP'

# These headers must be included; otherwise the site redirects the request
# to another page (a quick check follows the script below)
headers = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate, sdch',
    'Accept-Language':'zh-CN,zh;q=0.8',
    'Connection':'keep-alive',
    'Host':'www.cnki.net',
    'Referer':'http://search.cnki.net/search.aspx?q=%E4%BD%9C%E8%80%85%E5%8D%95%E4%BD%8D%3a%E6%AD%A6%E6%B1%89%E5%A4%A7%E5%AD%A6&rank=relevant&cluster=zyk&val=CDFDTOTAL',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

if __name__ == '__main__':
    # Append the scraped results to results.txt
    with open('results.txt', 'a+') as res:
        # Read the links saved by the previous script and fetch each one
        with open('url_results.txt') as f:
            for url in f:
                print("Fetching: " + url)
                response = requests.get(url.strip(), headers=headers)
                # Strip whitespace so the regexes can match across line breaks
                all_content = response.text.replace('\r\n', '').replace('\t', '').replace(' ', '').replace('\xa0', '')
                # As in the first script, the HTML tags inside the patterns
                # below were lost in archiving; the tag and attribute names
                # are assumptions about the CNKI detail-page markup at the
                # time (spaces were stripped above, so the patterns contain
                # none either). The 【...】 labels are the field labels that
                # appear on the page itself.
                re_title = r'<h1>(.*?)</h1>'
                title = re.findall(re_title, all_content, re.S | re.M)
                restr1 = r'<divclass="brief">(.*?)</div>'  # the summary block
                art_result = re.findall(restr1, all_content, re.S | re.M)
                re_author = r'【作者】(.*?)</a>'
                re_teacher = r'【导师】(.*?);'
                re_author_info = r'【作者基本信息】(.*?)</a>(.*?)<'  # (school, degree level)
                re_art_info = r'<spanid="ChDivSummary">(.*?)<'  # abstract text
                if len(art_result) > 0:
                    art_info = art_result[0]
                    res.write("【标题】 " + (title[0] if title else '') + '\n')
                    res.write("【作者】 " + re.findall(re_author, art_info, re.S | re.M)[0] + '\n')
                    res.write("【导师】 " + re.findall(re_teacher, art_info, re.S | re.M)[0] + '\n')
                    res.write("【作者基本信息】 ")
                    for school, lev in re.findall(re_author_info, art_info, re.S | re.M):
                        res.write(school + lev + '\n')
                    res.write("【摘要】 " + '\n')
                    art_det = re.findall(re_art_info, art_info, re.S | re.M)
                    if len(art_det) > 0:
                        res.write(art_det[0] + '\n')
                    else:
                        res.write('\n')
                    res.flush()
                res.write('\n')
                time.sleep(random.randint(1, 3))
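Because the fields are cut out of raw HTML with regular expressions, the captured strings may still carry leftover markup (for example an opening <a ...> tag in front of the author name). A small helper of my own that strips residual tags before a field is written out:

import re

def strip_tags(fragment):
    """Remove any residual HTML tags from a captured field."""
    return re.sub(r'<[^>]+>', '', fragment)

# e.g. res.write("【作者】 " + strip_tags(author_field) + '\n')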
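And the quick check promised above: to see that the headers really matter, compare status codes with and without them. This sketch assumes the headers dict defined in the script above and reuses the example detail-page URL from its comment; allow_redirects=False exposes the redirect instead of following it:

import requests

test_url = 'http://epub.cnki.net/grid2008/brief/detailj.aspx?filename=1016018279.nh&dbname=CDFDTEMP'
bare = requests.get(test_url, allow_redirects=False)                    # no headers
full = requests.get(test_url, headers=headers, allow_redirects=False)  # headers from the script above
print(bare.status_code, full.status_code)  # a 3xx on the first call indicates the redirect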
