请注意:本文采用的可视化工具为 pyecharts 0.1.9.4;下文代码使用的是该旧版接口(如 `from pyecharts import Bar`),与 pyecharts 1.x 不兼容。
首先是数据获取,贴出爬虫demo,如果有需要数据的请评论
# -*- coding:utf-8 -*-
import time
import requests
import pandas as pd
from lxml import etree
class AQI(object):
    """
    Crawler for the real-time city AQI table published at
    http://datacenter.mee.gov.cn/aqiweb2/.
    """

    # Page that carries both the publication timestamp and the AQI table.
    URL = 'http://datacenter.mee.gov.cn/aqiweb2/'
    # Seconds to wait for the server before a request is abandoned.
    TIMEOUT = 30
    # Column names assigned to the scraped table, in page order.
    HEADER = ['city', 'AQI', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'main_pollution']
    # File the scraped rows are appended to.
    OUTPUT_PATH = 'data2.csv'

    def __init__(self):
        """
        Initialise the crawler.

        :attr encoding: page encoding; None until get_encoding() has run
        """
        self.encoding = None

    def get_encoding(self):
        """
        Detect the encoding of the AQI page and store it in ``self.encoding``.

        :return: None
        :raises requests.RequestException: on network failure or an HTTP error status
        """
        res = requests.get(self.URL, timeout=self.TIMEOUT)
        res.raise_for_status()
        self.encoding = res.apparent_encoding
        print('Successfully crawled encoding!')
        time.sleep(2)

    @staticmethod
    def _format_timestamp(raw):
        """Turn the page's '…年…月…日…时' label into a '-'/':'-separated timestamp string."""
        return raw.replace('年', '-').replace('月', '-').replace('日', ' ').replace('时', ':00:00')

    def crawl_aqi(self, sleep_time=3600):
        """
        Scrape the AQI table of all monitored cities in an endless loop and
        append each snapshot to ``data2.csv``.

        :param sleep_time: pause between two rounds in seconds, default 3600
        :return: None (the loop only ends through an exception or interruption)
        :raises requests.RequestException: on network failure or an HTTP error status
        """
        # Local imports: only this method needs them.
        import io
        import os

        while True:
            res = requests.get(self.URL, timeout=self.TIMEOUT)
            res.raise_for_status()
            # Decode with the detected encoding so the Chinese date markers
            # survive; fall back to a fresh guess if get_encoding() was skipped.
            res.encoding = self.encoding or res.apparent_encoding
            html = res.text

            parsed_text = etree.HTML(html)
            timestamp = self._format_timestamp(
                parsed_text.xpath('/html/body/div[3]/p/i/text()')[0])
            print('Successfully crawled timestamp!')

            # Parse the table from the response already in hand, so timestamp
            # and rows are guaranteed to come from the same page load.
            data = pd.read_html(io.StringIO(html))[0]
            data.columns = self.HEADER
            data['time'] = timestamp

            # Write the header only into a new or empty file; a restarted
            # crawler must not insert a second header row in append mode.
            write_header = (not os.path.exists(self.OUTPUT_PATH)
                            or os.path.getsize(self.OUTPUT_PATH) == 0)
            data.to_csv(self.OUTPUT_PATH, index=False, mode='a', header=write_header)
            print('Successfully crawled data of {} and saved it to file!'.format(timestamp))
            time.sleep(sleep_time)
if __name__ == '__main__':
    # Detect the page encoding once, then poll the AQI table until interrupted.
    crawler = AQI()
    crawler.get_encoding()
    crawler.crawl_aqi()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyecharts import Line
data = pd.read_csv('AQI_Data.csv')
# Print basic information. DataFrame.info() writes its report itself and
# returns None, so it is called directly instead of being wrapped in print().
data.info()
print(data.head())
# Data cleaning
data['time'] = pd.to_datetime(data['time'])
# Keep every record up to the end of 2019-02-11.
data = data[data['time'] <= pd.to_datetime('2019-02-11 23:59:59')]
# Printing the raw data shows many '—' (em dash) placeholders; turn them into NaN.
data = data.replace('—', np.nan)
for col in ['AQI', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']:
    data[col] = data[col].astype(float)
data.info()  # confirm the column dtypes
print(len(data['city'].unique()))  # number of distinct cities
data.to_csv('data_clean.csv', index=False)  # optionally keep the cleaned data
共有68155条数据,时间为2月4日到2月11日,每隔1小时收集一次,囊括全国367个城市
包含字段:城市、空气质量指数、PM2.5指数、PM10指数、SO2指数、NO2指数、CO指数、O3指数、主要污染源、时间
请注意:2019年2月4日是除夕夜。
问题:
1 燃放烟花爆竹真的对空气质量有影响吗?
2 烟花爆竹对空气质量的影响体现在哪些指标上?
3 烟花爆竹对空气质量的污染程度有多大?
4 哪些区域污染最严重(轻)?
5 哪些城市属于"一秒破功"型(初一凌晨2点AQI低于100,但初一18点至初二期间AQI超过200)?
6 除夕中午到初一中午,每小时有多少个城市空气质量超标(AQI>100)?
7 除夕到初七,每天有多少个城市空气质量超标(日均AQI>100)?
# Using the AQI as reference, average over all cities for every timestamp
# up to the end of 2019-02-06.
AQI_total_mean = (
    data.loc[data['time'] <= pd.to_datetime('2019-02-06 23:59:59')]
    .groupby(['time'])['AQI']
    .mean()
)
# Plot the overall trend and mark the maximum.
line = Line(title="全国春节期间空气质量指数总体趋势", subtitle="2019年除夕到初二", width=800)
line.add(
    "",
    AQI_total_mean.index,
    np.round(AQI_total_mean.values, 0),
    mark_point=['max'],
)
line.render()  # writes the chart to a local HTML file
# Plot the AQI curve of individual cities
def city_AQI(data, cities, name=None):
    """
    Draw one smoothed AQI line per city for the period up to the end of 2019-02-06.

    :param data: DataFrame with the columns 'city', 'time' (datetime) and 'AQI'
    :param cities: iterable of city names as they appear in the 'city' column
    :param name: optional output file name; when omitted the chart is rendered
        to the library's default file, as before
    :return: the Line chart object
    """
    cutoff = pd.to_datetime('2019-02-06 23:59:59')
    line = Line("春节期间空气质量指数", "2019年除夕到初二", width=800)
    for city in cities:
        city_aqi = data[(data['city'] == city) & (data['time'] <= cutoff)]
        # Use the observation time as x-axis. The frame's row index is not a
        # time axis and differs from city to city, so the series would not
        # line up on a shared axis.
        x_axis = city_aqi['time'].dt.strftime('%Y-%m-%d %H:%M').tolist()
        line.add(city, x_axis, np.round(city_aqi['AQI'].values, 0),
                 is_smooth=True, mark_point=["max"])
    if name is None:
        line.render()
    else:
        line.render(name)
    return line
# One chart per city group, drawn in the listed order.
for city_group in (['北京市', '天津市'], ['长沙市', '南宁市']):
    city_AQI(data=data, cities=city_group)
# Mean of every numeric indicator over all cities, per timestamp, up to the
# end of 2019-02-06. numeric_only=True keeps the text columns out of the
# mean; without it pandas >= 2.0 raises TypeError on them.
data_total_idx = (
    data[data['time'] <= pd.to_datetime('2019-02-06 23:59:59')]
    .groupby(['time'])
    .mean(numeric_only=True)
)
print(data_total_idx.columns)  # inspect the available indicators
line = Line("全国春节期间空气质量各指标(2019年除夕到初二)", "单位:μg/m3(CO为mg/m3)", width=800)
# Plot every indicator except the composite AQI; selecting by name does not
# depend on the column order of the CSV.
for idx in [col for col in data_total_idx.columns if col != 'AQI']:
    line.add(idx, data_total_idx.index, np.round(data_total_idx[idx].values, 0), is_smooth=True,
             legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
             yaxis_min=8, legend_top=30)
line.render('各种指标.html')
# Tianjin's rows up to the end of 2019-02-06.
tianjin_idx = data[(data['city'] == '天津市') & (data['time'] <= pd.to_datetime('2019-02-06 23:59:59'))]
line = Line("天津市春节期间空气质量各指标(2019年除夕到初二)", "单位:μg/m3(CO为mg/m3)", width=800)
# Use the observation time as x-axis; the frame's row index is not a time axis.
tianjin_times = tianjin_idx['time'].dt.strftime('%Y-%m-%d %H:%M').tolist()
# Columns 2..7 — presumably PM2.5 through O3 if the CSV keeps the crawler's
# column order; verify against the file.
for idx in tianjin_idx.columns[2:8]:
    line.add(idx, tianjin_times, np.round(tianjin_idx[idx].values, 0), is_smooth=True,
             legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
             yaxis_min=8, legend_top=30)
line.render('天津.html')
# Count the rows per primary pollutant at 02:00 on 2019-02-05.
at_2am = data['time'] == pd.to_datetime('2019-02-05 02:00:00')
main_pol = data.loc[at_2am, 'main_pollution'].value_counts()
print(main_pol)
from pyecharts import Bar
bar = Bar("全国城市首要污染物", "2019年2月5日凌晨2点", width=600)
bar.add(
    "",
    main_pol.index,
    main_pol.values,
    is_stack=True,
    is_label_show=True,
    bar_category_gap='40%',
    label_color=['#130f40'],
    label_text_size=18,
    legend_text_size=18,
    xaxis_label_textsize=18,
    yaxis_label_textsize=18,
)
bar.render('main.html')
# Restrict the data to the period up to the end of 2019-02-05.
data2 = data.loc[data['time'] <= pd.to_datetime('2019-02-05 23:59:59')]
city_aqi_groups = data2.groupby('city')['AQI']
data_AQI_min = city_aqi_groups.min()
data_AQI_max = city_aqi_groups.max()
# Pollution growth factor: per-city maximum divided by per-city minimum,
# rounded to one decimal.
data_AQI_times = (data_AQI_max / data_AQI_min).round(1)
data_AQI_times_top10 = data_AQI_times.nlargest(10)
print(data_AQI_times_top10)
bar = Bar("全国除夕和春节期间空气质量最高最低比Top10城市", "时间:2019年除夕至初一", width=600)
bar.add(
    "",
    data_AQI_times_top10.index,
    data_AQI_times_top10.values,
    is_stack=True,
    is_label_show=True,
)
bar.render()
def city_AQI(data, cities, name):
    """
    Draw one smoothed AQI line per city for the period up to the end of
    2019-02-06 and render the chart to ``name``.

    :param data: DataFrame with the columns 'city', 'time' (datetime) and 'AQI'
    :param cities: iterable of city names as they appear in the 'city' column
    :param name: output file name passed to ``Line.render``
    :return: the Line chart object
    """
    cutoff = pd.to_datetime('2019-02-06 23:59:59')
    line = Line("春节期间空气质量指数", "2019年除夕到初二", width=800)
    for city in cities:
        city_aqi = data[(data['city'] == city) & (data['time'] <= cutoff)]
        # Use the observation time as x-axis. The frame's row index is not a
        # time axis and differs from city to city, so the series would not
        # line up on a shared axis.
        x_axis = city_aqi['time'].dt.strftime('%Y-%m-%d %H:%M').tolist()
        line.add(city, x_axis, np.round(city_aqi['AQI'].values, 0),
                 is_smooth=True, mark_point=["max"])
    line.render(name)
    return line
# Look at the top-10 cities in two separate charts.
for city_group, out_file in (
    (['伊春市', '鸡西市', '大兴安岭地区'], '3.html'),
    (['盘锦市', '葫芦岛市', '瓦房店市', '锦州市'], '4.html'),
):
    city_AQI(data=data, cities=city_group, name=out_file)
# Count the cities per ratio band. value_counts() orders its result by
# frequency by default, so sort by bin to keep the counts aligned with the
# band labels below.
data_AQI_times_counts = data_AQI_times.value_counts(bins=[1, 5, 10, 15, 20, 25]).sort_index()
print(data_AQI_times_counts)
bar = Bar("全国除夕和春节期间空气质量最高最低比的城市数量", "时间:2019年除夕至初一", width=600)
# One label per bin, in ascending bin order.
times = ['1-5倍', '5-10倍', '10-15倍', '15-20倍', '20-25倍']
bar.add("", times, data_AQI_times_counts.values, is_stack=True,
        is_label_show=True, bar_category_gap='40%', label_color=['#130f40'], label_text_size=18,
        legend_text_size=18, xaxis_label_textsize=18, yaxis_label_textsize=18)
bar.render()
# Mean AQI per city between 20:00 on 2019-02-04 and 04:00 on 2019-02-05 (inclusive).
start_time = pd.to_datetime('2019-02-04 20:00:00')
end_time = pd.to_datetime('2019-02-05 04:00:00')
in_window = (data['time'] >= start_time) & (data['time'] <= end_time)
AQI_by_city = data.loc[in_window].groupby('city')['AQI'].mean()
# Visualisation on a map — the original author notes that it does not seem to work.
from pyecharts import Geo
geo_style = dict(
    title_color="#fff",
    title_pos="center",
    width=800,
    height=500,
    background_color="#404a59",
)
geo = Geo("全国各城市空气质量指数", "除夕晚上20点至初一凌晨4点平均值", **geo_style)
attr = AQI_by_city.index
value = AQI_by_city.values
visual_options = dict(
    visual_range=[0, 500],
    visual_text_color="#fff",
    symbol_size=10,
    is_visualmap=True,
    is_piecewise=True,
    visual_split_number=10,
)
geo.add("", attr, value, **visual_options)
geo.render()
def city_AQI(data, cities, name):
    """
    Draw one smoothed AQI line per city for the period up to the end of
    2019-02-06 and render the chart to ``name``.

    :param data: DataFrame with the columns 'city', 'time' (datetime) and 'AQI'
    :param cities: iterable of city names as they appear in the 'city' column
    :param name: output file name passed to ``Line.render``
    :return: the Line chart object
    """
    cutoff = pd.to_datetime('2019-02-06 23:59:59')
    line = Line("春节期间空气质量指数", "2019年除夕到初二", width=800)
    for city in cities:
        city_aqi = data[(data['city'] == city) & (data['time'] <= cutoff)]
        # Use the observation time as x-axis. The frame's row index is not a
        # time axis and differs from city to city, so the series would not
        # line up on a shared axis.
        x_axis = city_aqi['time'].dt.strftime('%Y-%m-%d %H:%M').tolist()
        line.add(city, x_axis, np.round(city_aqi['AQI'].values, 0),
                 is_smooth=True, mark_point=["max"])
    line.render(name)
    return line
# Ten cities with the highest mean AQI in the window.
print('污染最重top10:')
most_polluted = AQI_by_city.nlargest(10)
print(most_polluted)
city_AQI(data, ['呼和浩特市', '包头市', '咸阳市'], 'lagest.html')
# Ten cities with the lowest mean AQI in the window.
print('污染最轻top10:')
least_polluted = AQI_by_city.nsmallest(10)
print(least_polluted)
# To list every city at or below 50 instead, print AQI_by_city[AQI_by_city <= 50].
# Cities whose AQI was below 100 at 02:00 on 2019-02-05 ...
two_am = pd.to_datetime('2019-02-05 02:00:00')
city1 = data.loc[(data['time'] == two_am) & (data['AQI'] < 100), 'city']
# ... and above 200 at some point between 18:00 on 2019-02-05 and the end of 2019-02-06.
late_start = pd.to_datetime('2019-02-05 18:00:00')
late_end = pd.to_datetime('2019-02-06 23:59:59')
data3 = data[(data['time'] >= late_start) & (data['time'] <= late_end)]
city2 = data3.loc[data3['AQI'] > 200, 'city']
# Cities that satisfy both conditions.
print(list(set(city1) & set(city2)))
# Window from noon on 2019-02-04 to noon on 2019-02-05, both ends inclusive.
start_time = pd.to_datetime('2019-02-04 12:00:00')
end_time = pd.to_datetime('2019-02-05 12:00:00')
in_window = (data['time'] >= start_time) & (data['time'] <= end_time)
over_limit = data.loc[in_window & (data['AQI'] > 100)]
# Rows with AQI above 100 per timestamp, drawn as a line plot.
# NOTE: city_over_new_year holds whatever .plot() returns, not the counts themselves.
city_over_new_year = over_limit.groupby('time')['AQI'].count().plot(title='The number of city')
plt.show()
# Index the frame by time so it can be resampled. Passing the Series (rather
# than the column label) leaves the 'time' column in place.
data = data.set_index(data['time'])
# Daily mean AQI per city, flagged where it exceeds 100, then summed per day.
daily_mean = data.groupby('city')['AQI'].resample('D').mean()
city_over_count = (daily_mean > 100).unstack(level=-1).sum()
print(city_over_count)
bar = Bar("全国除夕至初七期间空气质量超标城市数量", "时间: 2019年 总城市数: 367", width=800)
# Hard-coded day labels; assumes the resampled result covers exactly these
# eight days in order — TODO confirm against the data.
date = [
    '02-04(除夕)', '02-05(初一)', '02-06(初二)', '02-07(初三)',
    '02-08(初四)', '02-09(初五)', '02-10(初六)', '02-11(初七)',
]
bar.add(
    "",
    date,
    city_over_count.values,
    is_stack=True,
    is_label_show=True,
    bar_category_gap='40%',
    label_color=['#130f40'],
    label_text_size=18,
    legend_text_size=18,
    xaxis_label_textsize=18,
    yaxis_label_textsize=18,
)
bar.render('1.html')
line = Line("", width=800)
# NOTE(review): the line is drawn 5 above the real counts; the purpose of the
# offset is not evident from the code — confirm it is intended.
line.add("", date, city_over_count.values + 5, is_smooth=True)
line.render('2.html')