API爬虫
数据库存储和读取
数据预处理
数据分析及可视化
# 1. Scrape overseas data.
# First request: list of countries with foreign outbreak data.
url = u'https://view.inews.qq.com/g2/getOnsInfo?name=disease_foreign'
# Browser-like headers to avoid simple anti-scraping checks.
# BUG FIX: the original used the key 'User_Agent' (underscore), which is
# not the real 'User-Agent' HTTP header, so the fake UA was never sent.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/70.0.3538.25 Safari/537.36 '
                         'Core/1.70.3756.400 QQBrowser/10.5.4039.400'}
r = requests.get(url, timeout=5, headers=headers)
data = json.loads(r.text)        # response body is JSON
data = json.loads(data['data'])  # the 'data' field is itself a JSON string
# Second round: fetch each country's daily series, keyed by country name.
all_date = {}
for i in data['foreignList']:
    # i['name'] is the country name.
    url = ('https://api.inews.qq.com/newsqa/v1/automation/foreign/'
           'daily/list?country={}'.format(i['name']))
    r = requests.get(url, headers=headers)
    data = json.loads(r.text)
    data = data['data']
    all_date[i['name']] = data
# From each country's daily series keep five fields — country name, date,
# cumulative confirmed, newly confirmed, cumulative healed and dead — and
# flatten everything into one list of row dicts.
new_data = [
    {'country': country,
     'date': day['date'],
     'confirm_add': day['confirm_add'],
     'confirm': day['confirm'],
     'heal': day['heal'],
     'dead': day['dead']}
    for country, days in all_date.items()
    for day in days
]
# Convert to pandas so it can later be concatenated with the China data.
df1 = pd.DataFrame(new_data)
print(df1.head())
# 2. Scrape domestic (China) data.
url = u'https://view.inews.qq.com/g2/getOnsInfo?name=disease_other'
r = requests.get(url, headers=headers)
data = json.loads(r.text)
data_data = json.loads(data['data'])  # 'data' field is a JSON string
# Cumulative confirmed / healed / dead plus the date live in chinaDayList.
all_data = [
    {'country': '中国',
     'confirm': row['confirm'],
     'date': row['date'],
     'dead': row['dead'],
     'heal': row['heal']}
    for row in data_data['chinaDayList']
]
# Daily new confirmed cases live in a separate list, chinaDayAddList.
daily_data = [
    {'confirm_add': row['confirm'],
     'date': row['date']}
    for row in data_data['chinaDayAddList']
]
# Join the two domestic tables on the date column.
df2 = pd.DataFrame(all_data)
df3 = pd.DataFrame(daily_data)
df4 = pd.merge(df2, df3, on='date')
print(df4)
# 3、国内外数据横向拼接df = pd.concat([df1,df4],axis=0)# 转换成词典格式,方便保存至mysqldf = df.to_dict(orient='records')print(df)
# Persist the combined records to MySQL.
import pymysql

connection = pymysql.connect(host='localhost',
                             user='root',
                             # NOTE(review): hard-coded credential — move to
                             # config/env in real deployments.
                             password='cindy407',
                             charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
create_db = 'Create database if not exists pachong;'
# Drop and recreate the table so a daily rerun fully refreshes the data.
drop_table = 'drop table if exists pachong.disease;'
create_table = ''' Create table pachong.disease
(
country varchar(50),
date varchar(20),
confirm_add varchar(20),
confirm varchar(20),
heal varchar(20),
dead varchar(20)
);
'''
# Force the table itself to utf8 — the connection charset alone proved
# insufficient for Chinese country names.
alter_table = 'alter table pachong.disease convert to character set utf8;'
insert_table = '''insert into pachong.disease(country, date, confirm_add, confirm, heal, dead)
values(%s,%s,%s,%s,%s,%s)'''
try:
    with connection.cursor() as cursor:
        cursor.execute(create_db)
        cursor.execute(drop_table)
        cursor.execute(create_table)
        cursor.execute(alter_table)
        # executemany batches the inserts instead of one round trip per row
        # (the original looped cursor.execute over every record).
        cursor.executemany(insert_table, [
            (item['country'], item['date'], item['confirm_add'],
             item['confirm'], item['heal'], item['dead'])
            for item in df
        ])
    connection.commit()  # make the inserts visible
finally:
    connection.close()   # always release the connection
# Read the stored rows back from MySQL into a DataFrame.
connection = pymysql.connect(host='localhost',
                             user='root',
                             db='pachong',
                             password='cindy407',
                             charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
sql = 'select * from pachong.disease'
data = pd.read_sql(sql, con=connection)
print(data)
# Equivalent cursor-based read:
# with connection.cursor() as cursor:
#     sql = "select * from pachong.disease"
#     cursor.execute(sql)
#     result = cursor.fetchall()
#     print(result)
# Data preprocessing.
# 1. Drop exact duplicate rows (keep the first occurrence).
print(data.duplicated().value_counts())
data.drop_duplicates(keep='first', inplace=True)
# 2. Inspect missing values.
print(data.info())
# 3. Cast the count columns (stored as varchar in MySQL) back to int.
data[['confirm', 'confirm_add', 'heal', 'dead']] = \
    data[['confirm', 'confirm_add', 'heal', 'dead']].astype('int')
# 4. Inspect for outliers.
print(data.describe())
# 5. Normalise dates: 'M.D' -> '2020-M-D'.
data['date'] = data['date'].apply(lambda x: '2020-' + x.replace('.', '-'))
# 6. Label each row domestic vs overseas for later comparisons.
# The original copied the country column and rewrote it with two .loc
# assignments, the first of which ('中国' -> '中国') was a no-op; a single
# mapping is equivalent and clearer.
data['if_foreign'] = data['country'].apply(
    lambda c: '中国' if c == '中国' else '海外')
print(data.head())
处理后数据如下:
# 1. Headline summary: cumulative confirmed, cumulative healed, heal rate,
#    cumulative dead and death rate — for the world, China, and overseas.
# China's data updates one day later than overseas, so analyse a fixed
# recent date rather than data.date.max().
t = '2020-05-09'


def _summary(sub):
    """Return (confirm, heal, heal_rate, dead, death_rate) sums for a slice.

    The original recomputed each .loc[...].sum() expression up to three
    times per group; this helper computes each sum once.
    """
    confirm = sub['confirm'].sum()
    heal = sub['heal'].sum()
    dead = sub['dead'].sum()
    return confirm, heal, heal / confirm, dead, dead / confirm


day = data.loc[data['date'] == t]
a1, b1, c1, d1, e1 = _summary(day)                            # world
a2, b2, c2, d2, e2 = _summary(day[day['country'] == '中国'])  # China
a3, b3, c3, d3, e3 = _summary(day[day['country'] != '中国'])  # overseas

# Show all rows/columns when printing DataFrames.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
print('全球:累计确诊 {:,}人,累计治愈 {:,}人,治愈率 {:0.2%},累计死亡 {:,}人,死亡率 {:0.2%}'.format(a1, b1, c1, d1, e1))
print('中国:累计确诊 {:,}人,累计治愈 {:,}人,治愈率 {:0.2%},累计死亡 {:,}人,死亡率 {:0.2%}'.format(a2, b2, c2, d2, e2))
print('海外:累计确诊 {:,}人,累计治愈 {:,}人,治愈率 {:0.2%},累计死亡 {:,}人,死亡率 {:0.2%}'.format(a3, b3, c3, d3, e3))
结果如下,可以看出海外的死亡率高于中国1.3pct,说明中国的医疗水平以及治疗成果还是较好的
# 2、全球分布情况# map国外名称只能读取英文,所以要换成对应的英文名country_name = pd.read_csv('country_name.csv',header=None,names=['country_en','country'])
data = pd.merge(data,country_name,on = 'country',how='left')# 整理数据源x = list(data.loc[data['date'] ==t]['country_en'])
y = list(data.loc[data['date'] ==t]['confirm'])# 做图world = (
Map(init_opts=opts.InitOpts(theme=ThemeType.WESTEROS))
.add('世界地图',[list(z) for z in zip(x,y)],maptype="world",
is_map_symbol_show=False)
.set_series_opts(label_opts=opts.LabelOpts(is_show=False)) # 不显示国家名.set_global_opts(title_opts=opts.TitleOpts(title="nCoV全球疫情地图"),
visualmap_opts=opts.VisualMapOpts(is_piecewise=True,
pieces= # pieces可以设置不同区间颜色
[
{"min": 500001,"label":'50万以上',"color": '#800000'},{"min": 100001,"max":500000,"label":'10万-50万',"color": '#B22222'},{"min": 10001, "max": 100000, "label": '1万-10万', "color": "#FF8C00"}, # 不指定 max,表示 max 为无限大{"min": 5001, "max": 10000, "label": '5001-10000', "color": "#FA8072"},{"min": 501, "max": 5000, "label": '501-5000', "color": "#E9967A"},{"min": 101, "max": 500, "label": '101-500', "color": "#FFDEAD"},{"mim": 0, "max": 100, "label": '0-100', "color": "#FFF8DC"}
]))
)
world.render(r'C:\Users\cindy407\Desktop\世界地图.html')
# 导出HTML文件,可以在浏览器打开,也可渲染成图片
图片用浏览器打开如下,除了少数几个国家数据不全外,与今日头条还是非常接近的
# 3. Domestic vs overseas comparison: a 2x2 grid of time series
#    (new confirmed, cumulative confirmed, heal rate, death rate).
fig, axes = plt.subplots(2, 2, figsize=(20, 20))
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese glyphs
plt.rcParams['font.serif'] = ['SimHei']


def _plot_panel(pos, series, title):
    """Draw one date-indexed wide DataFrame (columns = 中国/海外) at
    subplot position *pos* of the 2x2 grid.

    Extracted because the original repeated this sequence four times.
    """
    series.index = pd.to_datetime(series.index)
    ax = plt.subplot(2, 2, pos)
    # Format x axis as dates; without this it shows raw ordinals.
    ax.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d'))
    # Tick every month ('M'); 'W' would be weekly, 'Y' yearly, etc.
    plt.xticks(pd.date_range(series.index[0], series.index[-1], freq='M'),
               size=8)
    plt.plot(series.index, series)
    plt.legend(loc='best', labels=series.columns)
    plt.title(title)


# Group once; each panel derives from the same (date, if_foreign) grouping.
grouped = data.groupby(['date', 'if_foreign'])
_plot_panel(1, grouped['confirm_add'].sum().unstack(), '新增确诊人数')
_plot_panel(2, grouped['confirm'].sum().unstack(), '累计确诊人数')
_plot_panel(3, (grouped['heal'].sum() / grouped['confirm'].sum()).unstack(),
            '治愈率')
_plot_panel(4, (grouped['dead'].sum() / grouped['confirm'].sum()).unstack(),
            '死亡率')
plt.show()
结果如下图:
# 4. Confirmed-case trajectories for countries with >= 50,000 cumulative
#    cases, each curve starting from its first day with >= 100 cases.
country = (data[(data['date'] == t) & (data['confirm'] >= 50000)]
           .sort_values('confirm', ascending=False)['country'].values)
data = data[(data['country'].isin(country)) & (data['confirm'] >= 100)]
fig, axes = plt.subplots(1, 2, figsize=(20, 40))
ax = plt.subplot(1, 2, 1)
for name in country:
    series = data[data.country == name]
    plt.plot(list(range(1, len(series) + 1)), series['confirm'])
plt.legend(loc='best', labels=country)
# Same chart with the US excluded, so the other curves become readable.
ax = plt.subplot(1, 2, 2)
country_sub = []
for name in country:
    if name == '美国':
        continue
    series = data[data.country == name]
    plt.plot(list(range(1, len(series) + 1)), series['confirm'])
    country_sub.append(name)
plt.legend(loc='best', labels=country_sub)
plt.show()
从结果来看,美国初期的新增速度与国内差不多,但在后期爆发式增长非常快,且没有一点抑制的状态,说明美国的防疫措施明显没有做到位 # 5、前日新增TOP10国家
# 5. Bar chart of the top-10 countries by yesterday's new confirmed cases.
df = (data[data['date'] == t]
      .sort_values('confirm_add', ascending=False)
      [['country', 'confirm_add']]
      .set_index('country', drop=True))
plt.bar(x=df.index.values[:10],
        height=df['confirm_add'].values[:10],
        width=0.5,
        color='b')
plt.title('昨日新增TOP10国家')
plt.show()
end
福利: 1、关注公众号,回复' 病毒 '即可获得 本文全部代码和全球中英文对应文件 2、回复" 分析 "可以免费获得如下 9本数据分析经典书籍电子版往期推荐
Pandas数据分析实战,让你体会数据的魔力
Excel 控件+Offset组合,让图表真正动起来
掌握数据透视表8大功能,数据分析能力蹭蹭蹭!
SQL窗口函数看不明白?5大应用场景让你一步到位