Python项目实战-疫情数据采集, 并做可视化展示
1. 爬虫基本流程
2. json
3. requests 爬虫当中 发送网络请求
4. pandas 表格处理 / 保存数据
5. pyecharts 可视化
环境: Python 3.8(比较稳定的版本); 解释器发行版使用 Anaconda, 在 Jupyter Notebook 里面写数据分析代码更专业
编辑器: PyCharm(专业代码编辑器, 版本号按照年份与月份划分)
一. 明确需求
我们要爬取的内容是什么?
分析数据从哪里来的 https://news.qq.com/zt2020/page/feiyan.htm#/
用开发者工具 进行抓包(数据包)分析
二. 代码流程
1. 发送请求 访问网站 https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&_=1638361138568
2. 获取数据
3. 解析数据
常用解析方式: xpath、css 选择器、re(正则)、json、bs4(较旧, 已过时)
json 数据解析后, 在 Python 语言当中对应字典类型数据: {键1: 值1, 键2: 值2, 键3: 值3}
4. 保存数据
import requests # 发送网络请求模块
import json
import pprint # 格式化输出模块
import pandas as pd # 数据分析当中一个非常重要的模块
# Endpoint queried by this script; the `_` query value is hard-coded here.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&_=1638361138568'

# A timeout keeps the script from hanging forever on a stalled connection.
# TLS certificate verification is left at the library default (enabled).
response = requests.get(url, timeout=10)
# Fail loudly on a non-2xx answer instead of trying to parse an error page.
response.raise_for_status()

# The 'data' field of the outer JSON document is itself a JSON-encoded
# string, so it has to be decoded a second time.
json_data = json.loads(response.json()['data'])

# Children of the first areaTree entry (presumably the provinces of China).
china_data = json_data['areaTree'][0]['children']

# Flatten every region into one record; key order defines the CSV columns.
data_set = []
for i in china_data:
    total = i['total']
    data_dict = {
        'province': i['name'],              # region name
        'nowConfirm': total['nowConfirm'],  # currently confirmed cases
        'dead': total['dead'],              # deaths
        'heal': total['heal'],              # recovered
        'deadRate': total['deadRate'],      # death rate
        'healRate': total['healRate'],      # recovery rate
    }
    data_set.append(data_dict)

# Tabulate the records and write them to data.csv in the working directory.
df = pd.DataFrame(data_set)
df.to_csv('data.csv')
#%% md
#%%
# 导入工具
import time # 时间模块
import json # json包
import requests # 网络请求库 第三方 pip
import pandas as pd # 数据处理 第三库 pip
#%% md
## 1. 目标网址
https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=jQuery35106097998260028255_1617971061475&_=1617971061476
#%%
# Build the request URL; the `_` parameter carries the current Unix time in
# whole milliseconds.
url = f'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&_={int(time.time() * 1000)}'
#%% md
## 2.模拟浏览器发送请求,获取响应
#%%
# Fetch the endpoint. The timeout stops the cell from blocking indefinitely
# on a stalled connection, and raise_for_status() surfaces HTTP errors here
# rather than as a confusing JSON/KeyError further down.
html = requests.get(url, timeout=10)
html.raise_for_status()
#%% md
## 3.解析网页 提取数据
#%% md
- 正则
- xpath
- bs4
#%%
# The 'data' field of the response body is a JSON string nested inside the
# outer JSON document, hence the second decode.
data = json.loads(html.json()['data'])
# Children of the first areaTree entry (presumably the provinces of China).
china_data = data['areaTree'][0]['children']

# Statistics copied from each region's 'total' mapping, in column order.
# nowConfirm is the currently-confirmed count; confirm, dead and heal are
# presumably cumulative totals; deadRate and healRate are the matching rates.
total_fields = ('nowConfirm', 'confirm', 'dead', 'heal', 'deadRate', 'healRate')

data_set = []
for region in china_data:
    # Region name first, then the statistics listed above.
    record = {'province': region['name']}
    for field in total_fields:
        record[field] = region['total'][field]
    data_set.append(record)
#%%
# Tabulate the collected records: one row per entry of data_set, one column
# per dictionary key.
df = pd.DataFrame(data=data_set)
# Bare expression so the notebook renders the table as the cell output.
df
#%% md
## 4. 保存数据
#%%
# Persist the table as data.csv in the current working directory.
df.to_csv(path_or_buf='data.csv')
#%% md
#%% md
- matplotlib
- pyecharts # pip install pyecharts
#%%
from pyecharts import options as opts
from pyecharts.charts import Bar,Line,Pie,Map,Grid
#%%
# Keep the nine rows with the highest currently-confirmed counts.
df2 = df.sort_values(by='nowConfirm', ascending=False).head(9)
# Bare expression so the notebook renders the selection as the cell output.
df2
#%%
# Preview the [province, nowConfirm] pairs built from df2 (notebook output).
[[name, count] for name, count in zip(df2['province'].tolist(), df2['nowConfirm'].tolist())]
#%%
# Ring chart of the currently-confirmed counts for the rows in df2.
# Data items are [province, nowConfirm] pairs.
top_pairs = [
    list(pair)
    for pair in zip(df2['province'].values.tolist(), df2['nowConfirm'].values.tolist())
]

pie = Pie()
# Unnamed series; the radius pair is passed through to the chart unchanged.
pie.add("", top_pairs, radius=["10%", "30%"])
# Vertical legend positioned at 70% from the top and 70% from the left.
pie.set_global_opts(
    legend_opts=opts.LegendOpts(orient="vertical", pos_top="70%", pos_left="70%"),
)
# Label template "{b}: {c}" (ECharts placeholders, presumably name and value).
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
# Last expression of the cell: render the chart inline in the notebook.
pie.render_notebook()
#%%
# Line chart comparing recovery rate and death rate for every row of df.
provinces = list(df['province'].values)
heal_rates = df['healRate'].values.tolist()
dead_rates = df['deadRate'].values.tolist()

line = Line()
line.add_xaxis(provinces)
line.add_yaxis("治愈率", heal_rates)
line.add_yaxis("死亡率", dead_rates)
line.set_global_opts(
    title_opts=opts.TitleOpts(title="死亡率与治愈率"),
)
# Last expression of the cell: render the chart inline in the notebook.
line.render_notebook()
#%%
# Bar chart of deaths and recoveries for the first six rows of df (row order
# as stored in df, not sorted here).
# The title previously announced confirmed-vs-death counts although the two
# plotted series are 'dead' and 'heal'; it now describes what is drawn.
bar = (
    Bar()
    .add_xaxis(list(df['province'].values)[:6])
    .add_yaxis("死亡", df['dead'].values.tolist()[:6])
    .add_yaxis("治愈", df['heal'].values.tolist()[:6])
    .set_global_opts(
        title_opts=opts.TitleOpts(title="各地区死亡人数与治愈人数情况"),
        # Adds a data-zoom control with the library's default settings.
        datazoom_opts=[opts.DataZoomOpts()],
    )
)
# Last expression of the cell: render the chart inline in the notebook.
bar.render_notebook()
#%%
# Map chart of currently-confirmed counts, one [province, nowConfirm] pair
# per row of df, drawn on the "china" map type.
province_counts = [
    list(pair)
    for pair in zip(df['province'].values.tolist(), df['nowConfirm'].values.tolist())
]

china_map = Map()
china_map.add("现有确诊", province_counts, "china")
china_map.set_global_opts(
    title_opts=opts.TitleOpts(title="各地区确诊人数"),
    # Piecewise visual map with its upper bound set to 200.
    visualmap_opts=opts.VisualMapOpts(max_=200, is_piecewise=True),
)
# Write the chart to a.html instead of rendering it inline.
china_map.render('a.html')
#%%