爬取疫情数据

一、主题式网络爬虫设计方案

1.爬虫名称:爬取新冠病毒疫情数据。

2.爬取内容:腾讯新闻网实时疫情数据。

3.网络爬虫设计方案概述:从网页源代码中找出数据对应标签,对数据进行分析和可视化处理。

二、主题页面的结构特征分析

1.主题页面的结构与特征分析:爬取疫情数据_第1张图片

2.HTML页面解析:在页面中按F12查看网页源代码

3.节点(标签)查找方法与遍历方法:在所需数据位置右键→查找,即可找到标签

 

三、网络爬虫程序设计

1.数据爬取与采集:

import requests
import json
import time
import datetime
import matplotlib.pyplot as plt 
import numpy as np

# Tencent News epidemic endpoint; the trailing `_` query parameter carries the
# current time in milliseconds.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d'%int(time.time()*1000)
# Fail fast: a timeout keeps the script from hanging on a stalled connection,
# and raise_for_status() reports an HTTP error instead of trying to parse it.
response = requests.get(url=url, timeout=10)
response.raise_for_status()
# The 'data' field of the JSON response is itself a JSON-encoded string,
# hence the second decoding step.
data = json.loads(response.json()['data'])
# Children of the first areaTree entry. The code that follows reads each
# child's 'name' and its own 'children' list (presumably provinces and their
# cities — the CSV header written later labels the names as provinces).
num = data['areaTree'][0]['children']

爬取疫情数据_第2张图片

2.数据分析与可视化:

def _sum_by_region(regions, section, field):
    """Sum one counter over every child of each region.

    Args:
        regions: iterable of dicts, each with a 'name' and a 'children' list.
        section: key of the per-child sub-dict to read ('total' or 'today').
        field: counter inside that sub-dict, e.g. 'confirm'.

    Returns:
        dict mapping region name to the integer sum over its children, in
        first-seen order. A region without children maps to 0.

    Raises:
        KeyError: if a region or child lacks one of the expected keys.
    """
    totals = {}
    for region in regions:
        region_name = region['name']
        # Repeated names accumulate into the same entry.
        totals.setdefault(region_name, 0)
        for child in region['children']:
            totals[region_name] += int(child[section][field])
    return totals


# Cumulative confirmed cases
total_data = _sum_by_region(num, 'total', 'confirm')

# Cumulative suspected cases
total_suspect_data = _sum_by_region(num, 'total', 'suspect')

# Cumulative deaths
total_dead_data = _sum_by_region(num, 'total', 'dead')

# Cumulative recoveries
total_heal_data = _sum_by_region(num, 'total', 'heal')

# Newly confirmed cases (read from each child's 'today' record)
total_new_data = _sum_by_region(num, 'today', 'confirm')


# Collect the per-region figures as parallel lists. All five dicts are built
# from the same source in the same order, so index i refers to the same region
# in every list.
names = list(total_data.keys())
num1 = list(total_data.values())
num2 = list(total_suspect_data.values())  # gathered but not written to the CSV
num3 = list(total_dead_data.values())
num4 = list(total_heal_data.values())
num5 = list(total_new_data.values())


today=datetime.date.today()
# The `with` block closes the file as soon as the rows are written, so the
# CSV is complete on disk before the (blocking) plot window opens.
with open('./疫情-%s.csv'%(today),'w',encoding='utf-8') as f:
    f.write('省份,确诊人数,死亡人数,治愈人数,新增确诊\n')
    for name, confirmed, dead, healed, new in zip(names, num1, num3, num4, num5):
        f.write(name+','+str(confirmed)+','+str(dead)+','+str(healed)+','+str(new)+'\n')




# Draw the four bar charts

def _draw_bar_panel(position, data, ylabel, color, width=0.5, xlabel=None):
    """Draw one annotated bar chart into a subplot slot of the current figure.

    Args:
        position: subplot specifier passed to plt.subplot (e.g. 221).
        data: dict mapping region name to the value to plot.
        ylabel: y-axis label text.
        color: bar colour.
        width: bar width.
        xlabel: optional x-axis label; omitted when None.

    Returns:
        The axes object returned by plt.subplot.
    """
    # plt.subplot makes the new axes current, so the pyplot calls below all
    # target this panel.
    axes = plt.subplot(position)
    print(data.keys())
    print(data.values())
    # Materialise the dict views once so bar(), xticks() and the annotation
    # loop all receive plain lists.
    labels = list(data.keys())
    heights = list(data.values())
    plt.bar(labels, heights, width=width, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90,size=50)
    plt.xticks(labels, rotation=-60, size=50)
    # Write each value just above its bar.
    for a, b in zip(labels, heights):
        plt.text(a, b, b, ha='center', va='bottom', size=35)
    return axes


plt.figure(figsize=[100,60])
# SimHei provides the glyphs needed for the Chinese axis labels; with a
# non-default font the minus sign is switched to the ASCII hyphen.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Confirmed cases
p1 = _draw_bar_panel(221, total_data, "确诊人数", 'green')
print(total_data)

# Newly confirmed cases
p2 = _draw_bar_panel(222, total_new_data, "新增确诊人数", 'yellow')

# Deaths
p3 = _draw_bar_panel(223, total_dead_data, "死亡人数", 'blue', xlabel="地区")

# Recoveries
p4 = _draw_bar_panel(224, total_heal_data, "治愈人数", 'red', width=0.3, xlabel="地区")
plt.show()

爬取疫情数据_第3张图片

3.将以上数据汇总:

  1 import requests
  2 import json
  3 import time
  4 import datetime
  5 import matplotlib.pyplot as plt 
  6 import numpy as np
  7 
  8 url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d'%int(time.time()*1000)
  9 data = json.loads(requests.get(url=url).json()['data'])
 10 num = data['areaTree'][0]['children']
 11 
 12 # 解析确诊数据
 13 total_data = {}
 14 for item in num:
 15     if item['name'] not in total_data:
 16         total_data.update({item['name']:0})
 17     for city_data in item['children']:
 18         total_data[item['name']] +=int(city_data['total']['confirm'])    
 19 
 20 # 解析疑似数据
 21 total_suspect_data = {}
 22 for item in num:
 23     if item['name'] not in total_suspect_data:
 24         total_suspect_data.update({item['name']:0})
 25     for city_data in item['children']:
 26         total_suspect_data[item['name']] +=int(city_data['total']['suspect'])    
 27 
 28 
 29 # 解析死亡数据
 30 total_dead_data = {}
 31 for item in num:
 32     if item['name'] not in total_dead_data:
 33         total_dead_data.update({item['name']:0})
 34     for city_data in item['children']:
 35         total_dead_data[item['name']] +=int(city_data['total']['dead'])    
 36 
 37 # 解析治愈数据
 38 total_heal_data = {}
 39 for item in num:
 40     if item['name'] not in total_heal_data:
 41         total_heal_data.update({item['name']:0})
 42     for city_data in item['children']:
 43         total_heal_data[item['name']] +=int(city_data['total']['heal'])    
 44 
 45 # 解析新增确诊数据
 46 total_new_data = {}
 47 for item in num:
 48     if item['name'] not in total_new_data:
 49         total_new_data.update({item['name']:0})
 50     for city_data in item['children']:
 51         total_new_data[item['name']] +=int(city_data['today']['confirm'])     
 52 
 53 
 54 #统计数据并输出
 55 names = list(total_data.keys())
 56 num1 = list(total_data.values())
 57 num2 = list(total_suspect_data.values())
 58 num3 = list(total_dead_data.values())
 59 num4 = list(total_heal_data.values())
 60 num5 = list(total_new_data.values())
 61 
 62 
 63 today=datetime.date.today()
 64 f=open('./疫情-%s.csv'%(today),'w',encoding='utf-8')
 65 f.write('省份,确诊人数,死亡人数,治愈人数,新增确诊\n')
 66 i = 0
 67 while i<len(names):
 68     f.write(names[i]+','+str(num1[i])+','+str(num3[i])+','+str(num4[i])+','+str(num5[i])+'\n')
 69     i = i + 1
 70 
 71 
 72 
 73 
 74 #绘制柱形图
 75 
 76 plt.figure(figsize=[100,60])
 77 plt.rcParams['font.sans-serif'] = ['SimHei']
 78 plt.rcParams['axes.unicode_minus'] = False    
 79 
 80 #绘制确诊数据
 81 p1 = plt.subplot(221)
 82 names = total_data.keys()
 83 nums = total_data.values()
 84 print(names)
 85 print(nums)
 86 print(total_data)
 87 plt.bar(names, nums, width=0.5, color='green')
 88 plt.ylabel("确诊人数", rotation=90,size=50)
 89 plt.xticks(list(names), rotation=-60, size=50)
 90 for a, b in zip(list(names), list(nums)):
 91     plt.text(a, b, b, ha='center', va='bottom', size=35)
 92 plt.sca(p1)
 93 
 94 #绘制新增确诊数据
 95 p2 = plt.subplot(222)
 96 names = total_new_data.keys()
 97 nums = total_new_data.values()
 98 print(names)
 99 print(nums)
100 plt.bar(names, nums, width=0.5, color='yellow')
101 plt.ylabel("新增确诊人数", rotation=90,size=50)
102 plt.xticks(list(names), rotation=-60, size=50)
103 for a, b in zip(list(names), list(nums)):
104     plt.text(a, b, b, ha='center', va='bottom', size=35)
105 plt.sca(p2)
106 
107 #绘制死亡数据
108 p3 = plt.subplot(223)
109 names = total_dead_data.keys()
110 nums = total_dead_data.values()
111 print(names)
112 print(nums)
113 plt.bar(names, nums, width=0.5, color='blue')
114 plt.xlabel("地区")
115 plt.ylabel("死亡人数", rotation=90,size=50)
116 plt.xticks(list(names), rotation=-60, size=50)
117 for a, b in zip(list(names), list(nums)):
118     plt.text(a, b, b, ha='center', va='bottom', size=35)
119 plt.sca(p3)
120 
121 #绘制治愈数据
122 p4 = plt.subplot(224)
123 names = total_heal_data.keys()
124 nums = total_heal_data.values()
125 print(names)
126 print(nums)
127 plt.bar(names, nums, width=0.3, color='red')
128 plt.xlabel("地区")
129 plt.ylabel("治愈人数", rotation=90,size=50)
130 plt.xticks(list(names), rotation=-60, size=50)
131 for a, b in zip(list(names), list(nums)):
132     plt.text(a, b, b, ha='center', va='bottom', size=35)
133 plt.sca(p4)
134 plt.show()

爬取疫情数据_第4张图片

爬取疫情数据_第5张图片

四、结论

1.确诊人数,死亡人数和治愈人数湖北均较高,新增确诊人数陕西较多

2.本次作业了解到自身短板过多,很多知识没有学习透彻,导致在过程中遇到很多问题,并且不能完整地完成本次作业

你可能感兴趣的:(爬取疫情数据)