python爬虫篇1——爬取中英文论文文献数据

程序运行截图:

python爬虫篇1——爬取中英文论文文献数据_第1张图片

mysql代码:

-- One row per scraped journal article: publication date, volume, authors,
-- and bilingual (English/Chinese) title and abstract.
CREATE TABLE `article` (
  `id` int(11) NOT NULL,                              -- assigned by the scraper via random.randint (no AUTO_INCREMENT; collisions possible)
  `article_time` varchar(50) DEFAULT NULL,            -- publication date string as scraped from the issue page
  `article_volume` varchar(20) DEFAULT NULL,          -- journal volume number
  `article_author` varchar(2000) DEFAULT NULL,        -- author list parsed from the og:title meta tag
  `article_name_english` varchar(2000) DEFAULT NULL,  -- original English title
  `article_name_chinese` varchar(2000) DEFAULT NULL,  -- machine translation of the title
  `article_content_english` varchar(5000) DEFAULT NULL, -- English abstract
  `article_content_chinese` varchar(2000) DEFAULT NULL, -- machine translation of the abstract; NOTE(review): shorter than the English column -- confirm 2000 chars suffices
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
-- NOTE(review): MySQL `utf8` is the 3-byte subset; `utf8mb4` would be safer
-- for arbitrary scraped text, but must be changed together with the
-- charset used by the Python connection.

python代码:

import random
import re
import requests
import pymysql

# Open the database connection used by the whole script (module-level,
# shared by get_data below; closed in the __main__ block).
# NOTE(review): port 8080 is unusual for MySQL (default is 3306) -- confirm.
db = pymysql.connect(host='localhost',
                     port=8080,
                     user='root',
                     passwd='123',
                     db='students',
                     charset='utf8')
# Create the cursor object used for all subsequent SQL statements.
cursor = db.cursor()


# 功能:获取历年的论文文献名中英文,作者名,摘要中英文,时间

# Translation helper: wraps the iciba web API (English -> Chinese).
def translator_chinese(content):
    """Translate an English string into Chinese via the iciba web API.

    Args:
        content: English text to translate.

    Returns:
        The Chinese translation, or "" when the request fails or the
        response contains no translation.
    """
    api_url = "http://fy.iciba.com/ajax.php"
    # Let requests build and URL-encode the query string; the original
    # interpolated raw text into the URL and broke on '&', '#', spaces, etc.
    params = {"a": "fy", "f": "en", "t": "zh-CHS", "w": '"' + content + '"'}
    try:
        # timeout so a stalled API call cannot hang the whole scrape.
        response = requests.get(api_url, params=params, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort: a failed translation must not abort the scraper.
        return ""
    matches = re.findall(r'"out":"(.*?)","ci', response.text, re.S)
    if not matches:
        return ""
    # The API returns \uXXXX escapes; decode them.  'backslashreplace'
    # keeps any literal non-ASCII byte from raising (the original
    # .encode('ascii') would crash on it).  Then strip the smart quotes
    # the API adds because we wrapped the input in '"..."'.
    decoded = matches[0].encode('ascii', 'backslashreplace').decode('unicode_escape')
    return decoded.replace("“", "").replace("”", "")


# for test
# print(translator_chinese(" therefore, be treated as a unity of contradictions."))

# Scrape one year of the journal: for every issue, the publication date,
# volume, and each article's bilingual title/abstract + authors, persisting
# every article into the `article` MySQL table.
def get_data(year):
    """Scrape every article published in *year* and store it in MySQL.

    Args:
        year: publication year to scrape, e.g. 2015.
    """
    issue_list_url = "https://journals.sagepub.com/loi/oss?year=%i" % year
    response = requests.get(issue_list_url, timeout=30)
    print("*" * 300)
    print("开始爬取%s年的文献数据!" % year)
    # Volume number for the year; the original takes the SECOND match ([1]).
    volume = re.findall(r'class="expander".*?data-attr-vol="(.*?)"',
                        response.text, re.S)[1]
    print("卷宗:" + volume)
    # Absolute URLs of the individual issues published that year.
    issue_urls = re.findall(r'class="row js_issue".*?href="(.*?)"',
                            response.text, re.S)
    print("文献目录地址:")
    for index, issue_url in enumerate(issue_urls, start=1):
        print(str(index) + "." + issue_url)
    print("*" * 300)
    for issue_url in issue_urls:
        _scrape_issue(issue_url, volume)


def _scrape_issue(issue_url, volume):
    """Scrape one issue page: its publication date plus every article in it."""
    data = requests.get(issue_url, timeout=30)
    # NOTE(review): this regex was garbled when the code was copied from the
    # blog -- the HTML tags inside the raw string were stripped and only the
    # `\n(.*?)\n` core survived.  TODO: restore the surrounding markup from
    # the live page before trusting it.
    article_time = re.findall(r'\n(.*?)\n', data.text, re.S)
    # Keep everything after the first comma of e.g. "Month DD, YYYY".
    pub_time = article_time[0][article_time[0].index(",") + 1:]
    print("文献时间:" + pub_time)
    # Relative links to the individual articles of this issue.
    article_paths = re.findall(r'class="ref nowrap" href="(.*?)"',
                               data.text, re.S)
    base_url = "https://journals.sagepub.com"
    print("文献列表地址:")
    for index, path in enumerate(article_paths, start=1):
        print(str(index) + "." + path)
    for path in article_paths:
        print("*" * 300)
        # Original behavior: stop this issue at the first article whose page
        # carries no og:title meta tag.
        if not _scrape_article(base_url + path, pub_time, volume):
            break


def _scrape_article(article_url, pub_time, volume):
    """Scrape one article page and insert a row into the database.

    Returns:
        True when the page contained a title and the row was stored,
        False when no og:title was found (caller stops the issue).
    """
    article_data = requests.get(article_url, timeout=30)
    titles = re.findall(r'property="og:title" content="(.*?)"',
                        article_data.text, re.S)
    if not titles:
        return False
    # og:title is "<english title> - <authors>"; split on the first '-'.
    if "-" in titles[0]:
        dash = titles[0].index("-")
        article_author = titles[0][dash + 1:]
        article_name_english = titles[0][:dash]
    else:
        article_author = ""
        article_name_english = titles[0]
    article_name_chinese = translator_chinese(article_name_english)
    print("文献英文名字:" + article_name_english)
    print("文献中文名字:" + article_name_chinese)
    print("作者名字:" + article_author)
    # NOTE(review): the abstract regex was also garbled in the blog copy --
    # the HTML markers around `(.*?)` were lost.  TODO: reconstruct the
    # pattern from the live page; as written it cannot match usefully.
    abstracts = re.findall(r'(.*?)', article_data.text, re.S)
    if abstracts:
        article_content_english = abstracts[0]
        article_content_chinese = translator_chinese(abstracts[0])
    else:
        article_content_english = ""
        article_content_chinese = ""
    print("英文摘要:" + article_content_english)
    print("中文摘要:" + article_content_chinese)
    _save_article(pub_time, volume, article_author, article_name_english,
                  article_name_chinese, article_content_english,
                  article_content_chinese)
    return True


def _save_article(pub_time, volume, author, name_en, name_zh,
                  content_en, content_zh):
    """Insert one article row using a parameterized query."""
    # NOTE(review): a random id can collide on the PRIMARY KEY; an
    # AUTO_INCREMENT column would be safer.  Renamed from `id` to avoid
    # shadowing the builtin.
    article_id = random.randint(0, 999999999)
    sql = ("insert into article(id, article_time, article_volume, "
           "article_author, article_name_english, article_name_chinese, "
           "article_content_english, article_content_chinese) "
           "values(%s, %s, %s, %s, %s, %s, %s, %s)")
    # Parameterized execute replaces the original string-built SQL with
    # hand-quoting and pymysql.escape_string() (removed from the pymysql
    # top level in newer releases) -- and closes the injection hole.
    cursor.execute(sql, (article_id, pub_time, volume, author,
                         name_en, name_zh, content_en, content_zh))
    db.commit()
    print("id:%i数据爬取成功!" % article_id)


# Entry point: scrape 2015 and 2016, then close the shared DB connection.
if __name__ == '__main__':
    for year in range(2015, 2017):
        get_data(year)
    print("数据爬取完成!")
    db.close()

程序可能存在部分bug,欢迎交流指正。

你可能感兴趣的:(python,爬虫,python,mysql,文献)