数据来源:BlankerL
截止日期:4/10
JSON 数据转 CSV
import json
import time
import csv
# Flatten the DXY time-series JSON dump into data.csv, one CSV row per record.

# Columns copied from every JSON record; also the CSV header order.
fieldnames = [
    'updateTime', 'provinceName',
    'currentConfirmedCount', 'confirmedCount',
    'suspectedCount', 'curedCount',
    'deadCount', 'locationId',
    'statisticsData', 'countryName', 'countryEnglishName',
]

# The context manager closes the input handle once the JSON is parsed.
with open('DXYArea-TimeSeries.json', 'r', encoding='utf-8') as file:
    infos = json.load(file)

# Written as utf-8 explicitly because the pandas step that reads data.csv
# decodes it as utf-8; the platform default encoding may differ.
with open('data.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    # In append mode the position starts at the end of the file, so a
    # position of 0 means the file is empty and still needs its header.
    # This keeps repeated runs from injecting header rows between data rows.
    if f.tell() == 0:
        writer.writeheader()
    for info in infos:
        # Copy every column by its own key (suspectedCount included).
        result = {name: info.get(name) for name in fieldnames}
        # Dropping the last three digits presumably converts an epoch
        # timestamp in milliseconds to seconds -- confirm against the feed.
        # The date is rendered in the local timezone.
        seconds = int(str(info.get("updateTime"))[:-3])
        result["updateTime"] = time.strftime("%Y-%m-%d", time.localtime(seconds))
        writer.writerow(result)
数据拆分中国和世界两部分
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the flattened CSV and build the de-duplicated per-province table for China.
filename = r'C:\Users\liuhao\Desktop\新建文件夹\DXY-COVID-19-Data-master\json\data.csv'
data = pd.read_csv(filename, sep=',', encoding='utf-8', header=0)
test = data.copy()
# Drop the leading "YYYY-" so axis labels only show month and day.
test['updateTime'] = test['updateTime'].map(lambda x: x[5:])
china = test[test['countryName'] == '中国']
# NOTE(review): locationId 951001 is presumably the country-level aggregate
# row; it is excluded from the province table -- confirm against the data set.
chn_province = china[china['locationId'] != 951001]
# The feed contains several snapshots per province per day. Sorting in
# descending order and keeping the first row per (province, day) retains the
# snapshot with the largest confirmedCount.
chn_province = chn_province.sort_values(
    ['provinceName', 'confirmedCount', 'updateTime'], ascending=False
)
# Explicit copy: columns are assigned on this frame below, and assigning on a
# frame derived from a slice triggers pandas' SettingWithCopy warning.
chn_province_unique = chn_province.drop_duplicates(
    subset=['provinceName', 'updateTime'], keep='first'
).copy()
# The source column has gaps, so it is recomputed for every row from the
# cumulative counts.
chn_province_unique['currentConfirmedCount'] = (
    chn_province_unique['confirmedCount']
    - chn_province_unique['curedCount']
    - chn_province_unique['deadCount']
)
def func(x):
    """Strip administrative suffixes from a province name.

    Removes every occurrence of '省', '市', '自治区', '壮族', '回族' and
    '维吾尔' so that, for example, '广西壮族自治区' becomes '广西'.

    Args:
        x: Province name. Non-string values (e.g. NaN or None coming from a
            pandas column) are returned unchanged instead of raising.

    Returns:
        The shortened name, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    for token in ('省', '市', '自治区', '壮族', '回族', '维吾尔'):
        x = x.replace(token, '')
    return x
# Shorten province names; the author notes the pyecharts Map does not match
# names that carry these administrative suffixes.
chn_province_unique['provinceName'] = chn_province_unique['provinceName'].map(func)
# Every row that is not in the Chinese province table goes to the "world" set.
world = test.drop(chn_province.index.values)
# Same de-duplication as for the provinces: keep the largest snapshot per
# (provinceName, updateTime).
world = world.sort_values(
    ['provinceName', 'confirmedCount', 'updateTime'], ascending=False
)
# Explicit copy so the column assignment below does not hit pandas'
# SettingWithCopy warning.
world_unique = world.drop_duplicates(
    subset=['provinceName', 'updateTime'], keep='first'
).copy()
world_unique['currentConfirmedCount'] = (
    world_unique['confirmedCount']
    - world_unique['curedCount']
    - world_unique['deadCount']
)
# utf-8-sig writes a BOM at the start of each output file.
world_unique.to_csv(r'C:\Users\liuhao\Desktop\新建文件夹\DXY-COVID-19-Data-master\json\world.csv', sep=',', index=False, encoding='utf-8-sig')
chn_province_unique.to_csv(r'C:\Users\liuhao\Desktop\新建文件夹\DXY-COVID-19-Data-master\json\chn_province.csv', sep=',', index=False, encoding='utf-8-sig')
观察数据发现有些地区的时间序列不全,因此按"日期 × 地区"的完整组合补全缺失的记录
import pandas as pd
import numpy as np
# Read the per-province table written by the previous step. `data` stays as
# the raw load and `test` is the working copy used by the code below.
filename = r'C:\Users\liuhao\Desktop\新建文件夹\DXY-COVID-19-Data-master\json\chn_province.csv'
data = pd.read_csv(
    filename,
    sep=',',
    header=0,
)
test = data.copy(deep=True)
def fill_data(test, first_day=None):
    """Complete the province x date grid and fill the gaps.

    Every (provinceName, updateTime) combination that is absent from ``test``
    is added as an empty row. Missing values on the earliest date become 0,
    and all other missing values take the value of the closest earlier date
    of the same province.

    Args:
        test: DataFrame with at least the columns 'provinceName' and
            'updateTime'. It is not modified.
        first_day: The updateTime value treated as the start of the series.
            Defaults to the smallest updateTime in ``test``. Values are
            compared as-is, so 'MM-DD' strings only order correctly within
            one calendar year.

    Returns:
        A new DataFrame sorted by provinceName and updateTime in descending
        order, with a fresh 0..n-1 index.
    """
    if test.empty:
        return test.copy()

    provinces = test['provinceName'].dropna().unique()
    days = test['updateTime'].dropna().unique()

    # Collect the absent combinations in one pass and add them with a single
    # concat; DataFrame.append was removed in pandas 2.0 and calling it in a
    # loop copied the frame on every iteration.
    seen = set(zip(test['provinceName'], test['updateTime']))
    missing = [(p, d) for p in provinces for d in days if (p, d) not in seen]
    if missing:
        filler = pd.DataFrame(missing, columns=['provinceName', 'updateTime'])
        filled = pd.concat([test, filler], ignore_index=True, sort=False)
    else:
        filled = test.copy()

    # Descending order puts each province's earliest date last in its group,
    # so the back-fill below pulls values from earlier dates.
    filled = filled.sort_values(
        by=['provinceName', 'updateTime'], ascending=False
    ).reset_index(drop=True)

    if first_day is None and len(days):
        first_day = min(days)
    on_first_day = filled['updateTime'] == first_day

    # Zero the gaps on the first day. Because every province now has a
    # first-day row, this also stops the back-fill from leaking values across
    # province boundaries.
    for column in filled.columns:
        gaps = on_first_day & filled[column].isna()
        if not gaps.any():
            continue
        values = filled[column]
        if not pd.api.types.is_numeric_dtype(values):
            # Text columns must be able to hold the integer 0.
            values = values.astype(object)
        filled[column] = values.mask(gaps, 0)

    return filled.bfill()
# Run the gap filling before exporting; without this call the CSV would
# contain the frame exactly as it was loaded.
test = fill_data(test)
test.to_csv(r'C:\Users\liuhao\Desktop\新建文件夹\DXY-COVID-19-Data-master\json\chn_province_new.csv', index=False, encoding='utf-8-sig')
pyecharts画图
import pandas as pd
import numpy as np
from pyecharts.charts import Map,Line,Bar,Timeline
import pyecharts.options as opts
# Load the gap-filled province table. `data` keeps the rows in file order,
# while `test` is a separate frame ordered by date that the chart helpers read.
filename = r'C:\Users\liuhao\Desktop\新建文件夹\DXY-COVID-19-Data-master\json\chn_province_new.csv'
data = pd.read_csv(filename, header=0, sep=',')
test = data.sort_values(by='updateTime', ascending=True)
def day_data(data):
    """Return the per-province counts for one date.

    Args:
        data: The updateTime value to select from the module-level ``test``
            frame.

    Returns:
        The matching rows of ``test`` restricted to provinceName and the four
        count columns.
    """
    wanted = [
        'provinceName',
        'currentConfirmedCount',
        'confirmedCount',
        'curedCount',
        'deadCount',
    ]
    on_day = test['updateTime'] == data
    return test.loc[on_day, wanted]
def city_data(city):
    """Return the full time series for one province.

    Args:
        city: The provinceName value to select from the module-level ``test``
            frame.

    Returns:
        The matching rows of ``test`` restricted to updateTime, provinceName
        and the four count columns.
    """
    wanted = [
        'updateTime',
        'provinceName',
        'currentConfirmedCount',
        'confirmedCount',
        'curedCount',
        'deadCount',
    ]
    in_city = test['provinceName'] == city
    return test.loc[in_city, wanted]
# Distinct dates and provinces, in the order they appear in `test`.
data_series = test['updateTime'].unique()
city_series = test['provinceName'].unique()

# One horizontal stacked bar chart per date, stepped through with a timeline.
tl = Timeline(init_opts=opts.InitOpts(width="1400px", height="800px"))
for day in data_series:
    snapshot = day_data(day)
    # Hubei is left out: its counts are on a different order of magnitude and
    # would flatten every other bar.
    ranked = snapshot[snapshot['provinceName'] != '湖北'].sort_values(
        by='currentConfirmedCount', ascending=True
    )
    caption = "{0}".format(day)
    bar = (
        Bar()
        .add_xaxis(ranked['provinceName'].to_list())
        .add_yaxis('现存确诊', ranked['currentConfirmedCount'].to_list(), stack="stack1")
        .add_yaxis('治愈数', ranked['curedCount'].to_list(), stack="stack1")
        .add_yaxis('死亡数', ranked['deadCount'].to_list(), stack="stack1")
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(is_show=False),
            title_opts=opts.TitleOpts(caption),
        )
        .reversal_axis()
    )
    tl.add(bar, caption)
tl.render('timeline_bar_reversal.html')
# One line chart per province, stepped through with a timeline.
tl1 = Timeline(init_opts=opts.InitOpts(width="1400px", height="800px"))
# (legend name, source column) for every plotted series.
line_series = [
    ('现存确诊数', 'currentConfirmedCount'),
    ('累计确诊数', 'confirmedCount'),
    ('治愈数', 'curedCount'),
    ('死亡数', 'deadCount'),
]
for i in city_series:
    # Sort the whole frame once so the x-axis dates and every y-series come
    # from the same row order and cannot drift apart.
    dta = city_data(i).sort_values(by='updateTime', ascending=True)
    line = Line().add_xaxis(xaxis_data=dta['updateTime'].to_list())
    for series_name, column in line_series:
        line.add_yaxis(
            series_name=series_name,
            y_axis=dta[column].to_list(),
            label_opts=opts.LabelOpts(is_show=False),
        )
    line.set_global_opts(
        title_opts=opts.TitleOpts(title="{0}疫情图".format(i)),
        # The "axis" trigger makes the tooltip list all series for the hovered date.
        tooltip_opts=opts.TooltipOpts(trigger="axis"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
        xaxis_opts=opts.AxisOpts(type_="category", boundary_gap=False),
    )
    tl1.add(line, "{0}".format(i))
tl1.render('timeline_city.html')
# One China map per date, stepped through with a timeline.
tl2 = Timeline(init_opts=opts.InitOpts(width="1400px", height="800px"))
for i in data_series:
    day_dta = day_data(i)
    province = day_dta['provinceName'].to_list()
    # to_list() converts numpy scalars to plain Python numbers, matching the
    # bar and line charts; the chart options are serialised to JSON.
    data_paire = [list(z) for z in zip(province, day_dta['confirmedCount'].to_list())]
    data_paire1 = [list(z) for z in zip(province, day_dta['currentConfirmedCount'].to_list())]
    # Named china_map so the builtin map() is not shadowed at module level.
    china_map = Map()
    china_map.add('累计确诊', data_paire, 'china', is_map_symbol_show=False)
    china_map.add('现存确诊', data_paire1, 'china', is_map_symbol_show=False)
    china_map.set_global_opts(
        tooltip_opts=opts.TooltipOpts(
            trigger="item", formatter="{a}:{c}"
        ),
        visualmap_opts=opts.VisualMapOpts(
            max_=1000000,
            is_piecewise=True,
            # Colour buckets by case count; the labels are the legend text.
            pieces=[
                {"max": 0, "min": 0, "label": "0"},
                {"max": 100, "min": 0, "label": "0-100"},
                {"max": 500, "min": 100, "label": "100-500"},
                {"max": 1000, "min": 500, "label": "500-1000"},
                {"max": 2000, "min": 1000, "label": "1000-2000"},
                {"max": 1000000, "min": 2000, "label": ">2000"},
            ],
            is_calculable=True,
            range_color=["lightskyblue", "yellow", "orangered"],
        ),
    )
    tl2.add(china_map, '{0}'.format(i))
tl2.render('timeline_map.html')
本来想让地图像折线图一样,鼠标悬浮时在提示框中显示该地区所有系列的数据。但是 Map 的 TooltipOpts.formatter
无论怎么设置都达不到效果:模板字符串不行,回调函数也没有写对。
(PS:希望有人能指点一下)