使用JupyterLab进行的数据分析
数据集下载
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib as mpl
import matplotlib.pyplot as plt
# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)
# `_rebuild` is a private Matplotlib helper that is not present in newer
# releases, so importing it unconditionally can abort the whole script with
# an ImportError. Refresh the font cache only when the helper exists.
try:
    from matplotlib.font_manager import _rebuild
except ImportError:
    _rebuild = None
if _rebuild is not None:
    _rebuild()
# Chinese text support.
mpl.rcParams['font.sans-serif'] = [u'SimHei']  # render Chinese labels correctly
# Render the minus sign correctly; otherwise '-' shows up as a box in saved
# images (the alternative is converting the minus sign to a string).
mpl.rcParams['axes.unicode_minus'] = False
# Load the air-quality records from the working directory.
data = pd.read_csv('air.csv')
# Quick structural checks of the loaded frame.
data.shape
data.columns
data.dtypes
# Share of records in each AQI range across the whole data set.
AQI_min, AQI_max = data.AQI.min(), data.AQI.max()
display(AQI_min, AQI_max)
# pd.cut builds right-closed bins by default, so a value equal to AQI_min
# itself falls outside the first bin.
AQI_cut = pd.cut(data.AQI, bins=[AQI_min, *(50, 100, 150, 200, 300), AQI_max])
AQI_count = AQI_cut.value_counts()
AQI_count
def func1():
    """Draw a bar chart of the record count per AQI interval and save it.

    Reads the module-level ``AQI_count`` series. Every bar is annotated with
    its share of all counted records, formatted as a percentage.
    """
    positions = np.arange(len(AQI_count))
    shares = AQI_count / AQI_count.sum()
    plt.figure(figsize=(8, 6))
    plt.bar(positions, AQI_count, color='steelblue', alpha=0.8)
    plt.title('AQI分布图')
    plt.xlabel('AQI区间')
    plt.ylabel('2014-2018年AQI天数')
    plt.xticks(positions, AQI_count.index, rotation=30)
    plt.ylim([0, 320000])
    # Write each bar's percentage slightly above its top edge.
    for pos, height, share in zip(positions, AQI_count, shares):
        plt.text(pos - 0.2, height + 5000, str(round(share * 100, 2)) + '%')
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\1.png')


func1()
# Nationwide pollution-level pie chart
def func2():
    """Draw and save a pie chart of the nationwide AQI level shares.

    Reads the module-level ``AQI_count`` series (record counts per AQI
    interval). Slice labels, colours and the highlighted slice are derived
    from the intervals themselves, so they stay correct whatever order
    ``value_counts()`` returned the intervals in.
    """
    # (level name, slice colour) in ascending AQI order, paired below with
    # the bins in the same ascending order.
    levels = [
        ('优', '#FFDAB9'),
        ('良', '#32CD32'),
        ('轻度污染', '#8A2BE2'),
        ('中度污染', '#2442aa'),
        ('重度污染', '#dd5555'),
        ('严重污染', '#FFFF00'),
    ]
    style = dict(zip(AQI_count.index.categories, levels))
    labels, colors, explode = [], [], []
    for interval in AQI_count.index:
        name, color = style[interval]
        # The label shows the real bin edges instead of hand-typed numbers.
        labels.append('%s(%g,%g]' % (name, interval.left, interval.right))
        colors.append(color)
        # Pull the "优" slice out of the pie to highlight it.
        explode.append(0.1 if name == '优' else 0)
    shares = list(AQI_count / AQI_count.sum())
    # Start a fresh figure so the pie is not drawn on top of an earlier plot.
    plt.figure()
    plt.pie(
        x=shares,  # data to plot
        labels=labels,  # slice labels
        colors=colors,  # slice colours
        autopct='%.1f%%',  # percentage format
        startangle=180,  # starting angle
        explode=explode,  # highlighted slice
        radius=2,  # pie radius
    )
    # A radius-2 pie extends past the default canvas; the tight bounding box
    # keeps the saved image from being cropped.
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\2.png', bbox_inches='tight')


func2()
# Relationship between AQI and a single pollutant
def func3(pollutant, num1, num2):
    """Scatter-plot one pollutant against AQI and save the figure.

    Args:
        pollutant: name of the pollutant column in the module-level ``data``
            frame; also used for the y-axis label, the title and the output
            file name.
        num1: exclusive upper bound applied to the pollutant values.
        num2: exclusive upper bound applied to the AQI values.

    Rows whose pollutant value or AQI equals 0 are left out as well.
    """
    # One boolean row mask instead of four successive filtered copies.
    keep = (
        (data[pollutant] < num1)
        & (data[pollutant] != 0)
        & (data['AQI'] < num2)
        & (data['AQI'] != 0)
    )
    data2 = data[keep]
    # Start a fresh figure so repeated calls do not pile their points onto
    # the same axes.
    plt.figure()
    plt.scatter(data2.AQI, data2[pollutant], s=5)
    plt.xlabel('AQI')
    plt.ylabel(pollutant)
    plt.title('AQI与%s的关系' % pollutant)
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\%s.png' % pollutant)
func3('PM2.5', num1=700, num2=500)
# AQI vs. PM10
func3('PM10', num1=1000, num2=500)
# AQI vs. SO2
func3('SO2', num1=800, num2=500)
# AQI vs. NO2
func3('NO2', num1=300, num2=500)
# AQI vs. CO
func3('CO', num1=25, num2=500)
# Mean-AQI ranking of first-tier and new first-tier cities
def func4():
    """Draw and save a horizontal bar chart of mean AQI per selected city.

    Reads the module-level ``data`` frame, keeps the rows of the listed
    cities, averages AQI per city and plots the means in descending order.
    """
    cities = [
        '北京', '上海', '广州', '深圳', '成都', '杭州', '重庆', '武汉',
        '苏州', '西安', '天津', '南京', '郑州', '长沙', '沈阳', '青岛',
        '宁波', '东莞', '无锡',
    ]
    selected = data[data.city.isin(cities)]
    ranking = selected.groupby("city")["AQI"].mean().sort_values(ascending=False)
    rows = np.arange(len(ranking))
    plt.figure(figsize=(12, 8))
    plt.barh(rows, ranking, color='#FF0000')
    plt.yticks(rows, ranking.index)
    plt.xlabel('AQI')
    plt.ylabel('城市')
    plt.title('全国一线城市及新一线城市AQI平均值排名')
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\3.png')


func4()
# Add an integer `month` column taken from characters 5-6 of the `time`
# string (presumably a 'YYYY-MM-...' layout — TODO confirm against air.csv).
data['month'] = data.time.str[5:7].astype('int')
# Nationwide monthly mean of each pollutant
def func5():
    """Draw and save line plots of the monthly mean of every pollutant.

    Reads the module-level ``data`` frame, which must already carry the
    ``month`` column.
    """
    pollutants = ["AQI", "PM2.5", "PM10", "SO2", "NO2", "CO"]
    # Pick the pollutant columns before averaging: averaging the whole frame
    # would also hit the text columns (city, time), which pandas 2.x rejects
    # with a TypeError. groupby already returns the months in sorted order.
    country_city2 = data.groupby('month')[pollutants].mean()
    plt.figure(figsize=(12, 8))
    # One line per pollutant column, plotted against the month number.
    plt.plot(country_city2.index, country_city2.values,
             marker="o", mec="k", mfc="w", mew=0.5)
    plt.legend(pollutants)
    plt.xticks(np.arange(1, 13))
    plt.xlim([1, 12])
    plt.xlabel('月份')
    plt.ylabel('污染物')
    plt.title('全国污染物平均值走势')
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\4.png')


func5()
# Representative coastal cities vs. representative inland cities
def func6():
    """Draw and save the monthly mean AQI of two groups of cities.

    Reads the module-level ``data`` frame (with its ``month`` column) and
    plots one line for the coastal group and one for the inland group.
    """
    coastal = ['珠海', '深圳', '广州', '东莞']
    inland = ['洛阳', '新乡', '开封', '周口']
    monthly_means = [
        data[data.city.isin(cities)].groupby("month")["AQI"].mean()
        for cities in (coastal, inland)
    ]
    plt.figure(figsize=(12, 8))
    for series in monthly_means:
        plt.plot(series.index, series, "-", marker="o", mec="k", mfc="w", mew=0.5)
    plt.legend(['沿海代表城市', '内地代表城市'])
    plt.xticks(np.arange(1, 13))
    plt.xlim([1, 12])
    plt.xlabel('月份')
    plt.ylabel('AQI平均值')
    plt.title('沿海代表城市与内地代表城市AQI平均值走势')
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\5.png')


func6()
# Add an integer `year` column taken from the first four characters of the
# `time` string (presumably a 'YYYY-...' layout — TODO confirm against air.csv).
data['year'] = data.time.str[:4].astype('int')
# Yearly pollutant bar chart
def func7_1(year):
    """Return the mean AQI, PM2.5, PM10, SO2 and NO2 of one year.

    Args:
        year: value matched against the ``year`` column of the module-level
            ``data`` frame.

    Returns:
        list: the five column means, in the order AQI, PM2.5, PM10, SO2, NO2.
    """
    columns = ['AQI', 'PM2.5', 'PM10', 'SO2', 'NO2']
    # Filter the frame once rather than once per column.
    return list(data.loc[data.year == year, columns].mean())
def func7():
    """Draw and save a grouped bar chart of yearly pollutant means.

    For each year from 2014 to 2018 the five means returned by ``func7_1``
    are drawn as one bar series, shifted sideways by one bar width per year.
    """
    plt.figure(figsize=(12, 8))
    labels = ["AQI", "PM2.5", "PM10", "SO2", "NO2"]
    # Width of a single bar.
    bar_width = 0.15
    base = np.arange(5)
    for offset, year in enumerate([2014, 2015, 2016, 2017, 2018]):
        plt.bar(base + offset * bar_width, func7_1(year),
                label=year, alpha=0.8, width=bar_width)
    plt.legend()
    plt.ylim([0, 110])
    plt.xticks([0.295, 1.295, 2.295, 3.295, 4.295], labels)
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\6.png')


func7()
# Per-city pollution-level pie charts (Beijing, Shanghai)
def func8(city):
    """Draw and save a pie chart of one city's AQI level shares.

    Args:
        city: value matched against the ``city`` column of the module-level
            ``data`` frame; also used as the output file name.

    Raises:
        ValueError: if the city has no AQI value greater than 0.
    """
    city_aqi = data[data.city == city].AQI
    AQI_max = city_aqi.max()
    if not AQI_max > 0:
        # Also true when the maximum is NaN, i.e. the city has no AQI values.
        raise ValueError('no positive AQI values for city %r' % (city,))
    # Use only the breakpoints below the city's maximum so the bin edges stay
    # strictly increasing; pd.cut rejects non-increasing edges, which made
    # this function fail for any city whose maximum AQI is 300 or less.
    edges = [edge for edge in (0, 50, 100, 150, 200, 300) if edge < AQI_max]
    edges.append(AQI_max)
    AQI_cut = pd.cut(city_aqi, bins=edges)
    AQI_count = AQI_cut.value_counts()
    # (level name, slice colour) in ascending AQI order, paired with the bins
    # in the same order so labels do not depend on the value_counts() order.
    levels = [
        ('优', '#FFDAB9'),
        ('良', '#32CD32'),
        ('轻度污染', '#8A2BE2'),
        ('中度污染', '#2442aa'),
        ('重度污染', '#dd5555'),
        ('严重污染', '#FFFF00'),
    ]
    style = dict(zip(AQI_cut.cat.categories, levels))
    labels, colors, explode = [], [], []
    for interval in AQI_count.index:
        name, color = style[interval]
        labels.append('%s(%g,%g]' % (name, interval.left, interval.right))
        colors.append(color)
        # Pull the "优" slice out of the pie to highlight it.
        explode.append(0.1 if name == '优' else 0)
    shares = list(AQI_count / AQI_count.sum())
    # Start a fresh figure so consecutive calls do not overlay their pies.
    plt.figure()
    plt.pie(
        x=shares,  # data to plot
        labels=labels,  # slice labels
        colors=colors,  # slice colours
        autopct='%.1f%%',  # percentage format
        startangle=180,  # starting angle
        explode=explode,  # highlighted slice
        radius=1.5,  # pie radius
    )
    # The enlarged pie extends past the default canvas; the tight bounding
    # box keeps the saved image from being cropped.
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\%s.png' % city, bbox_inches='tight')


func8('北京')
func8('上海')
# Guangzhou pollution-level pie chart
def func9():
    """Draw and save a pie chart of Guangzhou's AQI level shares.

    Reads the module-level ``data`` frame and shows the city's minimum and
    maximum AQI before plotting.

    Raises:
        ValueError: if the city has no AQI value greater than 0.
    """
    city_aqi = data[data.city == '广州'].AQI
    AQI_min = city_aqi.min()
    AQI_max = city_aqi.max()
    display(AQI_min, AQI_max)
    if not AQI_max > 0:
        # Also true when the maximum is NaN, i.e. the city has no AQI values.
        raise ValueError('no positive AQI values for city %r' % ('广州',))
    # Bins start at 0 rather than at the minimum: bins are right-closed, so
    # starting at AQI_min would drop the minimum records. The top edge is the
    # computed maximum instead of a hand-typed number, and only breakpoints
    # below it are used so the edges stay strictly increasing.
    edges = [edge for edge in (0, 50, 100, 150, 200, 300) if edge < AQI_max]
    edges.append(AQI_max)
    AQI_cut = pd.cut(city_aqi, bins=edges)
    AQI_count = AQI_cut.value_counts()
    # (level name, slice colour) in ascending AQI order, paired with the bins
    # in the same order so labels do not depend on the value_counts() order.
    levels = [
        ('优', '#FFDAB9'),
        ('良', '#32CD32'),
        ('轻度污染', '#8A2BE2'),
        ('中度污染', '#2442aa'),
        ('重度污染', '#dd5555'),
        ('严重污染', '#FFFF00'),
    ]
    style = dict(zip(AQI_cut.cat.categories, levels))
    labels, colors, explode = [], [], []
    for interval in AQI_count.index:
        name, color = style[interval]
        labels.append('%s(%g,%g]' % (name, interval.left, interval.right))
        colors.append(color)
        # Pull the "优" slice out of the pie to highlight it.
        explode.append(0.1 if name == '优' else 0)
    shares = list(AQI_count / AQI_count.sum())
    # Start a fresh figure so the pie is not drawn on top of an earlier plot.
    plt.figure()
    plt.pie(
        x=shares,  # data to plot
        labels=labels,  # slice labels
        colors=colors,  # slice colours
        autopct='%.1f%%',  # percentage format
        startangle=180,  # starting angle
        explode=explode,  # highlighted slice
        radius=1.5,  # pie radius
    )
    # The enlarged pie extends past the default canvas; the tight bounding
    # box keeps the saved image from being cropped.
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\7.png', bbox_inches='tight')


func9()
# Shenzhen pollution-level pie chart
def func10():
    """Draw and save a pie chart of Shenzhen's AQI level shares.

    Reads the module-level ``data`` frame and shows the city's minimum and
    maximum AQI before plotting.

    Raises:
        ValueError: if the city has no AQI value greater than 0.
    """
    city_aqi = data[data.city == '深圳'].AQI
    AQI_min = city_aqi.min()
    AQI_max = city_aqi.max()
    display(AQI_min, AQI_max)
    if not AQI_max > 0:
        # Also true when the maximum is NaN, i.e. the city has no AQI values.
        raise ValueError('no positive AQI values for city %r' % ('深圳',))
    # The top edge is the computed maximum instead of a hand-typed number, so
    # no record is lost if the data changes. Only breakpoints below it are
    # used so the edges stay strictly increasing.
    edges = [edge for edge in (0, 50, 100, 150, 200, 300) if edge < AQI_max]
    edges.append(AQI_max)
    AQI_cut = pd.cut(city_aqi, bins=edges)
    AQI_count = AQI_cut.value_counts()
    # (level name, slice colour) in ascending AQI order, paired with the bins
    # in the same order so labels do not depend on the value_counts() order.
    levels = [
        ('优', '#FFDAB9'),
        ('良', '#32CD32'),
        ('轻度污染', '#8A2BE2'),
        ('中度污染', '#2442aa'),
        ('重度污染', '#dd5555'),
        ('严重污染', '#FFFF00'),
    ]
    style = dict(zip(AQI_cut.cat.categories, levels))
    labels, colors, explode = [], [], []
    for interval in AQI_count.index:
        name, color = style[interval]
        # The label shows the real bin edges, so the last slice no longer
        # claims a range wider than the bin it was counted from.
        labels.append('%s(%g,%g]' % (name, interval.left, interval.right))
        colors.append(color)
        # Pull the "优" slice out of the pie to highlight it.
        explode.append(0.1 if name == '优' else 0)
    shares = list(AQI_count / AQI_count.sum())
    # Start a fresh figure so the pie is not drawn on top of an earlier plot.
    plt.figure()
    plt.pie(
        x=shares,  # data to plot
        labels=labels,  # slice labels
        colors=colors,  # slice colours
        autopct='%.1f%%',  # percentage format
        startangle=180,  # starting angle
        explode=explode,  # highlighted slice
        radius=1.5,  # pie radius
    )
    # The enlarged pie extends past the default canvas; the tight bounding
    # box keeps the saved image from being cropped.
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\8.png', bbox_inches='tight')


func10()
# AQI trend of Xinxiang in 2017
def func11():
    """Draw and save a line plot of Xinxiang's AQI over 2017.

    Reads the module-level ``data`` frame, which must already carry the
    ``year`` column. The x axis is labelled 0-12 in thirteen evenly spaced
    ticks, which presumes the records are spread evenly over the year.
    """
    # Filter once and take both columns from the same selection.
    records = data[(data.city == '新乡') & (data.year == 2017)]
    count = len(records)
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111)  # axes object of the figure
    ax.plot(records["time"], records["AQI"], "-")
    # No legend: the single line carries no label, so a legend call would
    # only emit a warning.
    # Span the x axis over however many records were found instead of a
    # fixed count, so the whole year is always visible.
    ax.axis([0, count, 0, 400])  # axis limits
    month = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    plt.xticks(np.linspace(0, count, len(month)), month)  # one tick per month boundary
    plt.xlabel('月份')
    plt.ylabel('AQI')
    plt.title('新乡市2017年AQI走势')
    plt.savefig('C:\\Users\\Administrator\\Desktop\\数据分析图\\9.png')


func11()