美国城市主要空气污染物数据报告

# Load the data file and preprocess it
import pandas as pd
import numpy as np
import datetime
# Path of the pollution CSV export (Windows drive path).
file='D:\\pollution_us_2000_2016.csv'
# read_csv already returns a DataFrame, so no extra DataFrame() wrap is needed.
df = pd.read_csv(file)

# Back-fill the gaps in the two AQI columns. The result is assigned back to
# the frame: calling fillna(inplace=True) on a column selection works on an
# intermediate object and is not guaranteed to update df itself (it does not
# under pandas copy-on-write), and fillna(method=...) is deprecated.
for aqi_column in ('SO2 AQI', 'CO AQI'):
    df[aqi_column] = df[aqi_column].bfill()

# Drop the columns that are not analysed. The rest of this script addresses
# the remaining columns by position, so this set must stay exactly as it is.
df.drop(
    columns=[
        'Unnamed: 0',
        'State Code',
        'County Code',
        'Site Num',
        'NO2 Units',
        'O3 Units',
        'SO2 Units',
        'CO Units',
    ],
    inplace=True,
)

# Monthly mean of each pollutant for Florida: the total of the per-record
# means in a month divided by the number of records in that month.
#
# Slot k of every list stands for month k counted from January 2000,
# k = (year - 2000) * 12 + month - 1; the 195 slots end at March 2016.
#   list0          number of records per month
#   list1..list4   monthly mean of positional columns 5, 9, 13 and 17
#                  (plotted further down as NO2, O3, SO2 and CO)
#
# The aggregation is done with a grouped mean instead of a row-by-row loop:
# - rows dated outside the 195 slots are ignored rather than raising
#   IndexError (or silently wrapping to the end of the list for negative k);
# - a month without any record yields NaN rather than a ZeroDivisionError;
# - no Series is built per row, which made the loop extremely slow.
# Note: mean() skips NaN readings instead of turning the whole month into NaN.
state_rows = df[df.iloc[:, 1] == 'Florida']   # positional column 1 holds the state name
stamps = pd.to_datetime(state_rows.iloc[:, 4])  # positional column 4 holds the date
month_slot = (stamps.dt.year - 2000) * 12 + stamps.dt.month - 1
in_range = month_slot.between(0, 194)
state_rows = state_rows[in_range]
month_slot = month_slot[in_range]
# To inspect which slots the state actually covers: sorted(set(month_slot))

list0 = month_slot.value_counts().reindex(range(195), fill_value=0).tolist()
monthly = (
    state_rows.iloc[:, [5, 9, 13, 17]]
    .groupby(month_slot)
    .mean()
    .reindex(range(195))   # months without data become NaN rows
)
list1, list2, list3, list4 = (monthly.iloc[:, k].tolist() for k in range(4))


# Scatter plot of the monthly pollutant means for Florida.
# pyplot is imported as plt here because the labelling calls below use that
# name; the wildcard import on its own does not define plt.
import matplotlib.pyplot as plt
from matplotlib.pyplot import *  # kept: other parts of this script call figure()/show() unqualified

# x axis: month slots 0..194 (months since January 2000).
x = list(range(195))

# Create a new figure; subplot(2, 3, 6) draws in the last cell of a 2x3 grid.
plt.figure(figsize=(50,25))
plt.subplot(2,3,6)
# One colour per pollutant, in the same order as the legend entries.
for series, colour in ((list1, 'r'), (list2, 'b'), (list3, 'g'), (list4, 'c')):
    plt.scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2","O3","SO2","CO"])
plt.title("Scatter diagram of main pollutants in Florida")
plt.show()


# Line chart of the monthly pollutant means for Florida.
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np

fig = plt.figure(figsize=(15,10))
ax = plt.gca()

# Time axis: the first day of each of the 195 months from January 2000 to
# March 2016, so point k sits exactly on the month slot k was aggregated
# from. (A fixed 30.5-day step drifts about two weeks off by the last slot.)
start = datetime.datetime(2000,1,1)
stop = datetime.datetime(2016,3,31)
dates = [datetime.datetime(start.year + k // 12, k % 12 + 1, 1) for k in range(195)]

# Axes.plot accepts datetime values directly; Axes.plot_date is deprecated.
# marker='o' keeps the point markers that plot_date drew by default.
for series in (list1, list2, list3, list4):
    ax.plot(dates, series, marker='o', ls='-')
date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2","O3","SO2","CO"])
plt.title("Broken line diagram of main pollutants in Florida")
# Slant the date labels so they do not overlap.
fig.autofmt_xdate()
plt.show()
import pandas as pd
import numpy as np
import datetime
# Path of the pollution CSV export (Windows drive path).
file='D:\\pollution_us_2000_2016.csv'
# read_csv already returns a DataFrame, so no extra DataFrame() wrap is needed.
df = pd.read_csv(file)
# df.head()   # quick preview when run interactively

# Back-fill the gaps in the two AQI columns. The result is assigned back to
# the frame: calling fillna(inplace=True) on a column selection works on an
# intermediate object and is not guaranteed to update df itself (it does not
# under pandas copy-on-write), and fillna(method=...) is deprecated.
for aqi_column in ('SO2 AQI', 'CO AQI'):
    df[aqi_column] = df[aqi_column].bfill()

# Drop the columns that are not analysed. The rest of this script addresses
# the remaining columns by position, so this set must stay exactly as it is.
df.drop(
    columns=[
        'Unnamed: 0',
        'State Code',
        'County Code',
        'Site Num',
        'NO2 Units',
        'O3 Units',
        'SO2 Units',
        'CO Units',
    ],
    inplace=True,
)

# Monthly mean of each pollutant for Arizona: the total of the per-record
# means in a month divided by the number of records in that month.
#
# Slot k of every list stands for month k counted from January 2000,
# k = (year - 2000) * 12 + month - 1; the 195 slots end at March 2016.
#   list0          number of records per month
#   list1..list4   monthly mean of positional columns 5, 9, 13 and 17
#                  (plotted further down as NO2, O3, SO2 and CO)
#
# The aggregation is done with a grouped mean instead of a row-by-row loop:
# - rows dated outside the 195 slots are ignored rather than raising
#   IndexError (or silently wrapping to the end of the list for negative k);
# - a month without any record yields NaN rather than a ZeroDivisionError;
# - no Series is built per row, which made the loop extremely slow.
# Note: mean() skips NaN readings instead of turning the whole month into NaN.
state_rows = df[df.iloc[:, 1] == 'Arizona']   # positional column 1 holds the state name
stamps = pd.to_datetime(state_rows.iloc[:, 4])  # positional column 4 holds the date
month_slot = (stamps.dt.year - 2000) * 12 + stamps.dt.month - 1
in_range = month_slot.between(0, 194)
state_rows = state_rows[in_range]
month_slot = month_slot[in_range]
# To inspect which slots the state actually covers: sorted(set(month_slot))

list0 = month_slot.value_counts().reindex(range(195), fill_value=0).tolist()
monthly = (
    state_rows.iloc[:, [5, 9, 13, 17]]
    .groupby(month_slot)
    .mean()
    .reindex(range(195))   # months without data become NaN rows
)
list1, list2, list3, list4 = (monthly.iloc[:, k].tolist() for k in range(4))


# Scatter plot of the monthly pollutant means for Arizona.
import matplotlib.pyplot as plt

# x axis: month slots 0..194 (months since January 2000).
x = list(range(195))

# New figure; subplot(2, 3, 6) draws in the last cell of a 2x3 grid.
plt.figure(figsize=(50,25))
plt.subplot(2,3,6)
# One colour per pollutant, in the same order as the legend entries.
for series, colour in zip((list1, list2, list3, list4), 'rbgc'):
    plt.scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2","O3","SO2","CO"])
plt.title("Scatter diagram of main pollutants in Arizona")
plt.show()



# Line chart of the monthly pollutant means for Arizona.
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np

fig = plt.figure(figsize=(15,10))
ax = plt.gca()

# Time axis: the first day of each of the 195 months from January 2000 to
# March 2016, so point k sits exactly on the month slot k was aggregated
# from. (A fixed 30.5-day step drifts about two weeks off by the last slot.)
start = datetime.datetime(2000,1,1)
stop = datetime.datetime(2016,3,31)
dates = [datetime.datetime(start.year + k // 12, k % 12 + 1, 1) for k in range(195)]

# Axes.plot accepts datetime values directly; Axes.plot_date is deprecated.
# marker='o' keeps the point markers that plot_date drew by default.
for series in (list1, list2, list3, list4):
    ax.plot(dates, series, marker='o', ls='-')
date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2","O3","SO2","CO"])
plt.title("Broken line diagram of main pollutants in Arizona")
# Slant the date labels so they do not overlap.
fig.autofmt_xdate()
plt.show()
import pandas as pd
import numpy as np
import datetime
# Path of the pollution CSV export (Windows drive path).
file='D:\\pollution_us_2000_2016.csv'
# read_csv already returns a DataFrame, so no extra DataFrame() wrap is needed.
df = pd.read_csv(file)
# df.head()   # quick preview when run interactively

# Back-fill the gaps in the two AQI columns. The result is assigned back to
# the frame: calling fillna(inplace=True) on a column selection works on an
# intermediate object and is not guaranteed to update df itself (it does not
# under pandas copy-on-write), and fillna(method=...) is deprecated.
for aqi_column in ('SO2 AQI', 'CO AQI'):
    df[aqi_column] = df[aqi_column].bfill()

# Drop the columns that are not analysed. The rest of this script addresses
# the remaining columns by position, so this set must stay exactly as it is.
df.drop(
    columns=[
        'Unnamed: 0',
        'State Code',
        'County Code',
        'Site Num',
        'NO2 Units',
        'O3 Units',
        'SO2 Units',
        'CO Units',
    ],
    inplace=True,
)

# Monthly mean of each pollutant for California: the total of the per-record
# means in a month divided by the number of records in that month.
#
# Slot k of every list stands for month k counted from January 2000,
# k = (year - 2000) * 12 + month - 1; the 195 slots end at March 2016.
#   list0            number of records per month
#   list1..list4     monthly mean of positional columns 5, 9, 13 and 17
#                    (plotted further down as NO2, O3, SO2 and CO)
#   list11..list44   independent copies of list1..list4 used by the plots
#
# Rows dated outside the 195 slots are filtered out explicitly, so the lists
# no longer need 1000 over-sized slots to avoid an IndexError, and no raw,
# never-averaged sums are left behind past slot 194. A month without any
# record yields NaN rather than a ZeroDivisionError, and the grouped mean
# replaces the very slow row-by-row loop.
# Note: mean() skips NaN readings instead of turning the whole month into NaN.
state_rows = df[df.iloc[:, 1] == 'California']  # positional column 1 holds the state name
stamps = pd.to_datetime(state_rows.iloc[:, 4])    # positional column 4 holds the date
month_slot = (stamps.dt.year - 2000) * 12 + stamps.dt.month - 1
in_range = month_slot.between(0, 194)
state_rows = state_rows[in_range]
month_slot = month_slot[in_range]

list0 = month_slot.value_counts().reindex(range(195), fill_value=0).tolist()
monthly = (
    state_rows.iloc[:, [5, 9, 13, 17]]
    .groupby(month_slot)
    .mean()
    .reindex(range(195))   # months without data become NaN rows
)
list1, list2, list3, list4 = (monthly.iloc[:, k].tolist() for k in range(4))

# Separate 195-entry copies, as consumed by the California plots.
list11 = list(list1)
list22 = list(list2)
list33 = list(list3)
list44 = list(list4)

# Scatter plot of the monthly pollutant means for California.
from matplotlib.pyplot import *

# x axis: month slots 0..194 (months since January 2000).
x = list(range(195))

# New figure; subplot(2, 3, 6) draws in the last cell of a 2x3 grid.
figure(figsize=(50,25))
subplot(2,3,6)
# One colour per pollutant, in the same order as the legend entries.
for series, colour in zip((list11, list22, list33, list44), 'rbgc'):
    scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2","O3","SO2","CO"])
plt.title("Scatter diagram of main pollutants in California")
show()



# Line chart of the monthly pollutant means for California.
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np

fig = plt.figure(figsize=(15,10))
ax = plt.gca()

# Time axis: the first day of each of the 195 months from January 2000 to
# March 2016, so point k sits exactly on the month slot k was aggregated
# from. (A fixed 30.5-day step drifts about two weeks off by the last slot.)
start = datetime.datetime(2000,1,1)
stop = datetime.datetime(2016,3,31)
dates = [datetime.datetime(start.year + k // 12, k % 12 + 1, 1) for k in range(195)]

# Axes.plot accepts datetime values directly; Axes.plot_date is deprecated.
# marker='o' keeps the point markers that plot_date drew by default.
for series in (list11, list22, list33, list44):
    ax.plot(dates, series, marker='o', ls='-')
date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2","O3","SO2","CO"])
plt.title("Broken line diagram of main pollutants in California")
# Slant the date labels so they do not overlap.
fig.autofmt_xdate()
plt.show()
# Monthly averages of eight "1st Max" measurements for Arizona.
#
# Month slot k = (year - 2000) * 12 + month - 1, 195 slots up to March 2016.
#   b[k]      number of records in slot k
#   a[k]      the eight monthly averages of positional columns
#             6, 7, 10, 11, 14, 15, 18, 19 (in that order)
#   data[c]   column c of a as a 195-entry list, i.e. the transpose of a;
#             data[1], data[3], data[5], data[7] are plotted further down as
#             the NO2 / O3 / SO2 / CO "1st Max Hour" series
#             (presumably the even entries are the matching max values;
#             verify against the CSV header)
#
# Rows dated outside the 195 slots are ignored instead of raising IndexError,
# a month without records yields NaN instead of a ZeroDivisionError, and the
# grouped mean replaces the very slow row-by-row df.loc[i][k] loop.
# Note: mean() skips NaN readings instead of turning the whole month into NaN.
state_rows = df[df.iloc[:, 1] == 'Arizona']   # positional column 1 holds the state name
stamps = pd.to_datetime(state_rows.iloc[:, 4])  # positional column 4 holds the date
month_slot = (stamps.dt.year - 2000) * 12 + stamps.dt.month - 1
in_range = month_slot.between(0, 194)
state_rows = state_rows[in_range]
month_slot = month_slot[in_range]

b = month_slot.value_counts().reindex(range(195), fill_value=0).tolist()
monthly = (
    state_rows.iloc[:, [6, 7, 10, 11, 14, 15, 18, 19]]
    .groupby(month_slot)
    .mean()
    .reindex(range(195))   # months without data become NaN rows
)
a = monthly.values.tolist()        # 195 rows of 8 averages
data = monthly.T.values.tolist()   # 8 series of 195 months
        
# Line chart of the monthly average "1st Max Hour" per pollutant for Arizona.
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np

fig = plt.figure(figsize=(15,10))
ax = plt.gca()

# Time axis: the first day of each of the 195 months from January 2000 to
# March 2016, so point k sits exactly on the month slot k was aggregated
# from. (A fixed 30.5-day step drifts about two weeks off by the last slot.)
start = datetime.datetime(2000,1,1)
stop = datetime.datetime(2016,3,31)
dates = [datetime.datetime(start.year + k // 12, k % 12 + 1, 1) for k in range(195)]

# Axes.plot accepts datetime values directly; Axes.plot_date is deprecated.
# marker='o' keeps the point markers that plot_date drew by default.
# Odd entries of data are the series named in the legend below.
for series in (data[1], data[3], data[5], data[7]):
    ax.plot(dates, series, marker='o', ls='-')
date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2 1st Max Hour","O3 1st Max Hour","SO2 1st Max Hour","CO 1st Max Hour"])
plt.title("Broken line diagram of  1st Max Hour in Arizona")
# Slant the date labels so they do not overlap.
fig.autofmt_xdate()
plt.show()



# Scatter plot of the monthly average "1st Max Hour" per pollutant for Arizona.
import matplotlib.pyplot as plt

# x axis: month slots 0..194 (months since January 2000).
x = list(range(195))

# New figure; subplot(2, 3, 6) draws in the last cell of a 2x3 grid.
plt.figure(figsize=(50,25))
plt.subplot(2,3,6)
# Odd entries of data, one colour each, in legend order.
for column, colour in zip((1, 3, 5, 7), 'rbgc'):
    plt.scatter(x, data[column], c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Time')
plt.legend(["NO2 1st Max Hour","O3 1st Max Hour","SO2 1st Max Hour","CO 1st Max Hour"])
plt.title("Maximum time of Arizona pollutant value Scatter plot")
plt.show()

# Monthly averages of eight "1st Max" measurements for California.
#
# Month slot k = (year - 2000) * 12 + month - 1, 195 slots up to March 2016.
#   b[k]      number of records in slot k (1000 entries long, as before;
#             entries from 195 on are always 0)
#   a[k]      the eight monthly averages of positional columns
#             6, 7, 10, 11, 14, 15, 18, 19 (in that order)
#   data[c]   column c of a as a 195-entry list, i.e. the transpose of a;
#             data[1], data[3], data[5], data[7] are plotted further down as
#             the NO2 / O3 / SO2 / CO "1st Max Hour" series
#             (presumably the even entries are the matching max values;
#             verify against the CSV header)
#
# Rows dated outside the 195 slots are ignored, a month without records
# yields NaN instead of a ZeroDivisionError, and the grouped mean replaces
# the very slow row-by-row df.loc[i][k] loop.
# Note: mean() skips NaN readings instead of turning the whole month into NaN.
state_rows = df[df.iloc[:, 1] == 'California']  # positional column 1 holds the state name
stamps = pd.to_datetime(state_rows.iloc[:, 4])    # positional column 4 holds the date
month_slot = (stamps.dt.year - 2000) * 12 + stamps.dt.month - 1
in_range = month_slot.between(0, 194)
state_rows = state_rows[in_range]
month_slot = month_slot[in_range]

b = month_slot.value_counts().reindex(range(1000), fill_value=0).tolist()
monthly = (
    state_rows.iloc[:, [6, 7, 10, 11, 14, 15, 18, 19]]
    .groupby(month_slot)
    .mean()
    .reindex(range(195))   # months without data become NaN rows
)
a = monthly.values.tolist()        # 195 rows of 8 averages
data = monthly.T.values.tolist()   # 8 series of 195 months
# Line chart of the monthly average "1st Max Hour" per pollutant for California.
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np

fig = plt.figure(figsize=(15,10))
ax = plt.gca()

# Time axis: the first day of each of the 195 months from January 2000 to
# March 2016, so point k sits exactly on the month slot k was aggregated
# from. (A fixed 30.5-day step drifts about two weeks off by the last slot.)
start = datetime.datetime(2000,1,1)
stop = datetime.datetime(2016,3,31)
dates = [datetime.datetime(start.year + k // 12, k % 12 + 1, 1) for k in range(195)]

# Axes.plot accepts datetime values directly; Axes.plot_date is deprecated.
# marker='o' keeps the point markers that plot_date drew by default.
# Odd entries of data are the series named in the legend below.
for series in (data[1], data[3], data[5], data[7]):
    ax.plot(dates, series, marker='o', ls='-')
date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2 1st Max Hour","O3 1st Max Hour","SO2 1st Max Hour","CO 1st Max Hour"])
plt.title("Broken line diagram of  1st Max Hour in California")
# Slant the date labels so they do not overlap.
fig.autofmt_xdate()
plt.show()


# Scatter plot of the monthly average "1st Max Hour" per pollutant for California.
import matplotlib.pyplot as plt

# x axis: month slots 0..194 (months since January 2000).
x = list(range(195))

# New figure; subplot(2, 3, 6) draws in the last cell of a 2x3 grid.
plt.figure(figsize=(50,25))
plt.subplot(2,3,6)
# Odd entries of data, one colour each, in legend order.
for column, colour in zip((1, 3, 5, 7), 'rbgc'):
    plt.scatter(x, data[column], c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Time')
plt.legend(["NO2 1st Max Hour","O3 1st Max Hour","SO2 1st Max Hour","CO 1st Max Hour"])
plt.title("Maximum time of California pollutant value Scatter plot")
plt.show()

# Monthly averages of eight "1st Max" measurements for Florida.
#
# Month slot k = (year - 2000) * 12 + month - 1, 195 slots up to March 2016.
#   b[k]      number of records in slot k (1000 entries long, as before;
#             entries from 195 on are always 0)
#   a[k]      the eight monthly averages of positional columns
#             6, 7, 10, 11, 14, 15, 18, 19 (in that order)
#   data[c]   column c of a as a 195-entry list, i.e. the transpose of a;
#             data[1], data[3], data[5], data[7] are plotted further down as
#             the NO2 / O3 / SO2 / CO "1st Max Hour" series
#             (presumably the even entries are the matching max values;
#             verify against the CSV header)
#
# Rows dated outside the 195 slots are ignored, a month without records
# yields NaN instead of a ZeroDivisionError, and the grouped mean replaces
# the very slow row-by-row df.loc[i][k] loop.
# Note: mean() skips NaN readings instead of turning the whole month into NaN.
state_rows = df[df.iloc[:, 1] == 'Florida']   # positional column 1 holds the state name
stamps = pd.to_datetime(state_rows.iloc[:, 4])  # positional column 4 holds the date
month_slot = (stamps.dt.year - 2000) * 12 + stamps.dt.month - 1
in_range = month_slot.between(0, 194)
state_rows = state_rows[in_range]
month_slot = month_slot[in_range]

b = month_slot.value_counts().reindex(range(1000), fill_value=0).tolist()
monthly = (
    state_rows.iloc[:, [6, 7, 10, 11, 14, 15, 18, 19]]
    .groupby(month_slot)
    .mean()
    .reindex(range(195))   # months without data become NaN rows
)
a = monthly.values.tolist()        # 195 rows of 8 averages
data = monthly.T.values.tolist()   # 8 series of 195 months




# Line chart of the monthly average "1st Max Hour" per pollutant for Florida.
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np

fig = plt.figure(figsize=(15,10))
ax = plt.gca()

# Time axis: the first day of each of the 195 months from January 2000 to
# March 2016, so point k sits exactly on the month slot k was aggregated
# from. (A fixed 30.5-day step drifts about two weeks off by the last slot.)
start = datetime.datetime(2000,1,1)
stop = datetime.datetime(2016,3,31)
dates = [datetime.datetime(start.year + k // 12, k % 12 + 1, 1) for k in range(195)]

# Axes.plot accepts datetime values directly; Axes.plot_date is deprecated.
# marker='o' keeps the point markers that plot_date drew by default.
# Odd entries of data are the series named in the legend below.
for series in (data[1], data[3], data[5], data[7]):
    ax.plot(dates, series, marker='o', ls='-')
date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2 1st Max Hour","O3 1st Max Hour","SO2 1st Max Hour","CO 1st Max Hour"])
plt.title("Broken line diagram of  1st Max Hour in Florida")
# Slant the date labels so they do not overlap.
fig.autofmt_xdate()
plt.show()


# Scatter plot of the monthly average "1st Max Hour" per pollutant for Florida.
import matplotlib.pyplot as plt

# x axis: month slots 0..194 (months since January 2000).
x = list(range(195))

# New figure; subplot(2, 3, 6) draws in the last cell of a 2x3 grid.
plt.figure(figsize=(50,25))
plt.subplot(2,3,6)
# Odd entries of data, one colour each, in legend order.
for column, colour in zip((1, 3, 5, 7), 'rbgc'):
    plt.scatter(x, data[column], c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Time')
plt.legend(["NO2 1st Max Hour","O3 1st Max Hour","SO2 1st Max Hour","CO 1st Max Hour"])
plt.title("Maximum time of Florida pollutant value Scatter plot")
plt.show()






pandas数据预处理思维图
# Pairwise correlation between the numeric columns. The text columns are
# dropped explicitly: recent pandas versions raise on them instead of
# skipping them silently. The matrix is computed once and reused below.
corr = df.select_dtypes(include='number').corr()
corr
# Flag the strongly correlated column pairs: coefficient above 0.8,
# excluding entries equal to 1 (the diagonal of the matrix).
[(corr > 0.8) & (corr != 1)]
相关性
强相关

不同城市的对比:

# Line chart comparing one monthly series per state on a shared date axis.
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np
fig = figure(figsize=(15,10))
ax = gca()


# Date axis: steps of 30.5 days starting at 2000-01-01 and stopping before
# 2016-03-31 (presumably tuned to give one point per 195 monthly slots).
start = datetime.datetime(2000,1,1)
stop = datetime.datetime(2016,3,31)
delta = datetime.timedelta(days =30.5)
dates = mpl.dates.drange(start,stop,delta)
# NOTE(review): list1 was last assigned in the California section (with 1000
# entries there), while the legend below labels the first curve as Arizona.
# Confirm which state's series is meant and that it has one value per date.
ax.plot_date(dates, list1, ls='-')
# NOTE(review): lis1 and li1 are not assigned anywhere in the visible part of
# this script; they must be defined before this section can run. TODO confirm
# where they come from (presumably the per-state monthly means).
ax.plot_date(dates, lis1, ls='-')
ax.plot_date(dates, li1, ls='-')
# Show the tick labels as full dates.
date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
# NOTE(review): the third label says SO2 while the first two say NO2;
# verify which pollutant li1 actually holds.
plt.legend(["Arizona NO2 Mean","Florida NO2 Mean","California SO2 Mean"])
plt.title("Broken line diagram")
# Slant the date labels so they do not overlap.
fig.autofmt_xdate()
show()


你可能感兴趣的:(美国城市主要空气污染物数据报告)