一、主题式网络爬虫设计方案
1.爬虫名称:爬取新冠病毒疫情数据。
2.爬取内容:腾讯新闻网实时疫情数据。
3.网络爬虫设计方案概述:从网页源代码中找出数据对应标签,对数据进行分析和可视化处理。
二、主题页面的结构特征分析
2.HTML页面解析:在页面中按F12查看网页源代码
3.节点(标签)查找方法与遍历方法:在所需数据位置右键→查找,即可找到标签
三、网络爬虫程序设计
1.数据爬取与采集:
import datetime
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import requests

# Tencent News epidemic endpoint; the trailing `_` query parameter is filled
# with the current time in milliseconds.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# A timeout keeps the script from waiting forever on an unresponsive server,
# and raise_for_status() reports an HTTP error status directly instead of
# letting it surface later as a JSON decoding failure.
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The response body is JSON whose 'data' field is itself a JSON-encoded
# string, hence the second decoding step.
data = json.loads(response.json()['data'])

# Children of the first 'areaTree' entry: the per-region records that the rest
# of the script aggregates.
num = data['areaTree'][0]['children']
2.数据分析与可视化:
def _sum_by_province(provinces, section, field):
    """Add up one city-level counter for every province.

    Args:
        provinces: Iterable of records that each have a 'name' and a
            'children' list; every child holds its counters under
            child[section][field].
        section: Counter group to read from each child ('total' or 'today').
        field: Counter name inside that group (e.g. 'confirm').

    Returns:
        Dict mapping province name to the summed counter, in first-seen order.
        A province without children maps to 0.
    """
    totals = {}
    for province in provinces:
        name = province['name']
        totals.setdefault(name, 0)
        for city in province['children']:
            totals[name] += int(city[section][field])
    return totals


def _draw_bar_panel(position, totals, color, ylabel, width=0.5, xlabel=None):
    """Draw one bar chart of `totals` in subplot slot `position` of the current figure.

    Args:
        position: Three-digit subplot code passed to plt.subplot (e.g. 221).
        totals: Dict mapping province name to the value to plot.
        color: Bar colour.
        ylabel: Label for the y axis.
        width: Bar width.
        xlabel: Optional label for the x axis; omitted when None.
    """
    # Echo the plotted data to stdout, as the script has always done
    print(totals.keys())
    print(totals.values())

    plt.subplot(position)
    names = list(totals.keys())
    values = list(totals.values())
    plt.bar(names, values, width=width, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90, size=50)
    plt.xticks(names, rotation=-60, size=50)
    # Write each value just above its bar
    for name, value in zip(names, values):
        plt.text(name, value, value, ha='center', va='bottom', size=35)


# Aggregate the city-level counters per province
total_data = _sum_by_province(num, 'total', 'confirm')          # confirmed cases
# Suspected cases are aggregated but not written to the CSV or plotted below
total_suspect_data = _sum_by_province(num, 'total', 'suspect')
total_dead_data = _sum_by_province(num, 'total', 'dead')        # deaths
total_heal_data = _sum_by_province(num, 'total', 'heal')        # recoveries
total_new_data = _sum_by_province(num, 'today', 'confirm')      # newly confirmed today

# Export the per-province statistics to a CSV file named after today's date.
# The with-block closes (and therefore flushes) the file before plt.show()
# is reached, so the CSV is complete even while the chart is being displayed.
today = datetime.date.today()
with open('./疫情-%s.csv' % (today), 'w', encoding='utf-8') as f:
    f.write('省份,确诊人数,死亡人数,治愈人数,新增确诊\n')
    # All dicts were built from the same records, so they share the same keys
    for name in total_data:
        row = [name, total_data[name], total_dead_data[name], total_heal_data[name], total_new_data[name]]
        f.write(','.join(str(cell) for cell in row) + '\n')

# Draw the four bar charts in a 2x2 grid
plt.figure(figsize=[100, 60])
# The labels are Chinese text, so select the SimHei font; also use the ASCII
# hyphen for minus signs instead of the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

_draw_bar_panel(221, total_data, 'green', "确诊人数")
print(total_data)
_draw_bar_panel(222, total_new_data, 'yellow', "新增确诊人数")
_draw_bar_panel(223, total_dead_data, 'blue', "死亡人数", xlabel="地区")
_draw_bar_panel(224, total_heal_data, 'red', "治愈人数", width=0.3, xlabel="地区")
plt.show()
3.将以上代码汇总:
import datetime
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import requests


def _sum_by_province(provinces, section, field):
    """Add up one city-level counter for every province.

    Args:
        provinces: Iterable of records that each have a 'name' and a
            'children' list; every child holds its counters under
            child[section][field].
        section: Counter group to read from each child ('total' or 'today').
        field: Counter name inside that group (e.g. 'confirm').

    Returns:
        Dict mapping province name to the summed counter, in first-seen order.
        A province without children maps to 0.
    """
    totals = {}
    for province in provinces:
        name = province['name']
        totals.setdefault(name, 0)
        for city in province['children']:
            totals[name] += int(city[section][field])
    return totals


def _draw_bar_panel(position, totals, color, ylabel, width=0.5, xlabel=None):
    """Draw one bar chart of `totals` in subplot slot `position` of the current figure.

    Args:
        position: Three-digit subplot code passed to plt.subplot (e.g. 221).
        totals: Dict mapping province name to the value to plot.
        color: Bar colour.
        ylabel: Label for the y axis.
        width: Bar width.
        xlabel: Optional label for the x axis; omitted when None.
    """
    # Echo the plotted data to stdout, as the script has always done
    print(totals.keys())
    print(totals.values())

    plt.subplot(position)
    names = list(totals.keys())
    values = list(totals.values())
    plt.bar(names, values, width=width, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90, size=50)
    plt.xticks(names, rotation=-60, size=50)
    # Write each value just above its bar
    for name, value in zip(names, values):
        plt.text(name, value, value, ha='center', va='bottom', size=35)


# Tencent News epidemic endpoint; the trailing `_` query parameter is filled
# with the current time in milliseconds.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# A timeout keeps the script from waiting forever on an unresponsive server,
# and raise_for_status() reports an HTTP error status directly instead of
# letting it surface later as a JSON decoding failure.
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The response body is JSON whose 'data' field is itself a JSON-encoded
# string, hence the second decoding step.
data = json.loads(response.json()['data'])
num = data['areaTree'][0]['children']

# Aggregate the city-level counters per province
total_data = _sum_by_province(num, 'total', 'confirm')          # confirmed cases
# Suspected cases are aggregated but not written to the CSV or plotted below
total_suspect_data = _sum_by_province(num, 'total', 'suspect')
total_dead_data = _sum_by_province(num, 'total', 'dead')        # deaths
total_heal_data = _sum_by_province(num, 'total', 'heal')        # recoveries
total_new_data = _sum_by_province(num, 'today', 'confirm')      # newly confirmed today

# Export the per-province statistics to a CSV file named after today's date.
# The with-block closes (and therefore flushes) the file before plt.show()
# is reached, so the CSV is complete even while the chart is being displayed.
today = datetime.date.today()
with open('./疫情-%s.csv' % (today), 'w', encoding='utf-8') as f:
    f.write('省份,确诊人数,死亡人数,治愈人数,新增确诊\n')
    # All dicts were built from the same records, so they share the same keys
    for name in total_data:
        row = [name, total_data[name], total_dead_data[name], total_heal_data[name], total_new_data[name]]
        f.write(','.join(str(cell) for cell in row) + '\n')

# Draw the four bar charts in a 2x2 grid
plt.figure(figsize=[100, 60])
# The labels are Chinese text, so select the SimHei font; also use the ASCII
# hyphen for minus signs instead of the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

_draw_bar_panel(221, total_data, 'green', "确诊人数")
print(total_data)
_draw_bar_panel(222, total_new_data, 'yellow', "新增确诊人数")
_draw_bar_panel(223, total_dead_data, 'blue', "死亡人数", xlabel="地区")
_draw_bar_panel(224, total_heal_data, 'red', "治愈人数", width=0.3, xlabel="地区")
plt.show()
四、结论
1.湖北的确诊人数,死亡人数和治愈人数均较高,新增确诊人数陕西较多
2.通过本次作业了解到自身短板过多,很多知识没有学习透彻,导致在过程中遇到很多问题,并且未能完整地完成本次作业