# Task 1: paper count statistics
"""
Created on Mon Jan 11 16:07:31 2021
@author: Zoey
task1:论文数量统计,即统计2019年全年计算机各个方向论文数量
"""
import seaborn as sns
from bs4 import BeautifulSoup
import re
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
# Load a sample of the metadata file: each line is parsed as one JSON object
# and only the first 1000 lines are read.
data = []
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('F:/qyz/a2sdu/datawhale_20200110/arxiv-metadata-oai-2019.json', 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        if idx >= 1000:
            break
        data.append(json.loads(line))
# Replace the list of parsed records with a DataFrame (one row per record).
data = pd.DataFrame(data)
# Inspection expressions: their values are only shown in an interactive
# session and are discarded when the file runs as a script.
data.shape
data.head()
def readArxivFile(path, columns=('id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
                                 'report-no', 'categories', 'license', 'abstract', 'versions',
                                 'update_date', 'authors_parsed'), count=None):
    """Read a JSON-lines file into a DataFrame, keeping only selected keys.

    Args:
        path: Path of the file to read; every line must be one JSON object.
        columns: Keys to keep from each record (also the resulting columns).
        count: Maximum number of lines to read; ``None`` reads the whole file.

    Returns:
        A pandas DataFrame with one row per line read and the requested
        columns, in the given order.

    Raises:
        KeyError: If a record does not contain one of the requested columns.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    records = []
    # UTF-8 is pinned so decoding does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if count is not None and idx >= count:
                break
            record = json.loads(line)
            records.append({col: record[col] for col in columns})
    # Passing the column names keeps the expected columns even when no line
    # was read.
    return pd.DataFrame(records, columns=list(columns))


# Example usage (left disabled, as it was in the original script):
# data = readArxivFile('F:/qyz/a2sdu/datawhale_20200110/arxiv-metadata-oai-2019.json',
#                      ['id', 'categories', 'update_date'], count=1000)
# Summary of the 'categories' column (value is only shown interactively).
data['categories'].describe()

# Split every 'categories' entry on single spaces and collect the distinct
# tokens across all rows.
unique_categories = {
    code
    for entry in data['categories']
    for code in entry.split(' ')
}
# The expression above is equivalent to the explicit loops:
#     y = []
#     for x in data['categories']:
#         y.append(x.split(' '))
#     i1 = []
#     for l in y:
#         for i in l:
#             i1.append(i)
#     len(set(i1))  # the original author recorded 33 here
# Reference: https://www.cnblogs.com/yyds/p/6281453.html

# Inspection expressions (only displayed in an interactive session).
len(unique_categories)
unique_categories
# Derive a 'year' column from 'update_date', then drop the source column.
update_years = pd.to_datetime(data['update_date']).dt.year
data['year'] = update_years
del data['update_date']

# Keep only rows whose year is 2019 or later and renumber the index from 0.
recent_rows = data['year'] >= 2019
data = data[recent_rows].reset_index(drop=True)
# Download the arXiv category taxonomy page and flatten its heading hierarchy
# (h2 -> h3 -> h4 -> p) into parallel lists with one entry per <p> element.
response = requests.get('https://arxiv.org/category_taxonomy', timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
website_url = response.text
soup = BeautifulSoup(website_url, 'lxml')
root = soup.find('div', {
    'id': 'category_taxonomy_list'})
if root is None:
    raise RuntimeError("element with id 'category_taxonomy_list' not found in the taxonomy page")
tags = root.find_all(['h2', 'h3', 'h4', 'p'], recursive=True)

# Current position in the hierarchy while walking the tags in document order.
level_1_name = ''
level_2_name = ''
level_2_code = ''
# Initialised up front so a <p> seen before any <h4> cannot hit an unbound name.
level_3_code = ''
level_3_name = ''

# Output columns, appended to in lockstep.
level_1_names = []
level_2_codes = []
level_2_names = []
level_3_codes = []
level_3_names = []
level_3_notes = []

# Headings are treated as "<left>(<right>)"; group 1 is the text before the
# parenthesis and group 2 the text inside it.
heading_pattern = re.compile(r'(.*)\((.*)\)')

for t in tags:
    if t.name == 'h2':
        # A top-level heading also resets the second-level fields to its text.
        level_1_name = t.text.strip()
        level_2_code = level_1_name
        level_2_name = level_1_name
    elif t.name == 'h3':
        # Second level: the name comes first, the code is in parentheses.
        raw = t.text
        level_2_code = heading_pattern.sub(r'\2', raw).strip()
        level_2_name = heading_pattern.sub(r'\1', raw).strip()
    elif t.name == 'h4':
        # Third level: the code comes first, the name is in parentheses.
        # Stripping matters: group 1 otherwise keeps the space before "(",
        # and the code would never equal the values it is later joined on.
        raw = t.text
        level_3_code = heading_pattern.sub(r'\1', raw).strip()
        level_3_name = heading_pattern.sub(r'\2', raw).strip()
    elif t.name == 'p':
        if not level_3_code:
            # No third-level heading seen yet: nothing to attach this text to.
            continue
        notes = t.text
        level_1_names.append(level_1_name)
        level_2_names.append(level_2_name)
        level_2_codes.append(level_2_code)
        level_3_names.append(level_3_name)
        level_3_codes.append(level_3_code)
        level_3_notes.append(notes)
# Assemble the taxonomy table from the parallel lists; the pairs below give
# the column name and the list that fills it, in column order.
taxonomy_fields = (
    ('group_name', level_1_names),
    ('archive_name', level_2_names),
    ('archive_id', level_2_codes),
    ('category_name', level_3_names),
    ('categories', level_3_codes),
    ('category_description', level_3_notes),
)
df_taxonomy = pd.DataFrame(dict(taxonomy_fields))

# Inspection expressions (results are discarded outside an interactive session).
df_taxonomy['archive_name']
df_taxonomy.groupby(['group_name', 'archive_name'])
# Count papers per taxonomy group and draw the shares as a pie chart.
# NOTE(review): the left merge only attaches a group to rows whose
# 'categories' value equals a taxonomy code exactly; rows without a match get
# no group_name and so do not contribute to any group's count - confirm this
# is intended.
_df = (
    data.merge(df_taxonomy, on="categories", how="left")
    .drop_duplicates(["id", "group_name"])  # one row per (paper, group)
    .groupby("group_name")
    .agg({"id": "count"})
    .sort_values(by="id", ascending=False)
    .reset_index()
)

fig = plt.figure(figsize=(15, 12))
# Per-wedge offsets. plt.pie requires exactly one offset per wedge, so the
# preferred pattern is truncated or zero-padded to the number of groups found.
explode_pattern = (0, 0, 0, 0.2, 0.3, 0.3, 0.2, 0.1)
explode = tuple(
    explode_pattern[i] if i < len(explode_pattern) else 0
    for i in range(len(_df))
)
plt.pie(_df['id'], labels=_df['group_name'], autopct='%1.2f%%', startangle=160, explode=explode)
plt.tight_layout()
plt.show()
# Restrict to one taxonomy group and tabulate row counts per category and year.
group_name = 'Computer Science'
cats = data.merge(df_taxonomy, on='categories').query('group_name == @group_name')
# GroupBy has no `counts` attribute; `count()` is the aggregation that yields
# the per-(year, category_name) counts that the pivot reads from the 'id' column.
category_counts_by_year = (
    cats.groupby(['year', 'category_name'])
    .count()
    .reset_index()
    .pivot(index='category_name', columns='year', values='id')
)
# Bare expression so the table is still displayed in an interactive session.
category_counts_by_year