#导入文件并进行预处理
import pandas as pd
import numpy as np
import datetime
# Load the pollution CSV and clean it up for the analysis below.
file = 'D:\\pollution_us_2000_2016.csv'
df = pd.read_csv(file)

# Back-fill the gaps in the two AQI columns.  The result is assigned back to
# the frame: calling fillna(..., inplace=True) on a selected column is not
# guaranteed to modify `df`, and fillna(method=...) is deprecated.
for aqi_column in ('SO2 AQI', 'CO AQI'):
    df[aqi_column] = df[aqi_column].bfill()

# Remove the unnamed index column, the code/site columns and the unit
# columns.  The positional column numbers used later in this script refer to
# the layout that remains after this removal.
df = df.drop(columns=[
    'Unnamed: 0',
    'State Code',
    'County Code',
    'Site Num',
    'NO2 Units',
    'O3 Units',
    'SO2 Units',
    'CO Units',
])
# Monthly average of four pollutant columns for Florida: the values of every
# row in a month are averaged over the rows recorded in that month.
#
# Months are numbered from January 2000 (index 0); 195 buckets reach
# March 2016.  Rows outside that window are ignored, and a month without any
# row yields NaN instead of a division by zero.  NaN cells are skipped by the
# mean rather than propagated.
n_months = 195

# Column 1 is compared against the state name, column 4 is parsed as a date.
state_rows = df[df.iloc[:, 1] == 'Florida']
when = pd.to_datetime(state_rows.iloc[:, 4])
bucket = (when.dt.year - 2000) * 12 + when.dt.month - 1
keep = (bucket >= 0) & (bucket < n_months)

# Columns 5, 9, 13 and 17 are the series plotted as NO2, O3, SO2 and CO.
grouped = state_rows.loc[keep].iloc[:, [5, 9, 13, 17]].groupby(bucket[keep])

# list0: number of rows per month; list1..list4: monthly mean per pollutant.
list0 = grouped.size().reindex(range(n_months), fill_value=0).tolist()
monthly = grouped.mean().reindex(range(n_months))
list1, list2, list3, list4 = (monthly.iloc[:, k].tolist() for k in range(4))
#画出散点图
from matplotlib.pyplot import *
# Scatter plot of the four Florida monthly series against the month index.
# `plt` is imported here because the star import above does not bind it and
# the first `import matplotlib.pyplot as plt` only appears further down.
import matplotlib.pyplot as plt

x = list(range(195))
# create new figure
plt.figure(figsize=(50, 25))
plt.subplot(2, 3, 6)
for series, colour in zip((list1, list2, list3, list4), 'rbgc'):
    plt.scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2", "O3", "SO2", "CO"])
plt.title("Scatter diagram of main pollutants in Florida")
plt.show()
#画出折线图
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np
# Line chart of the four Florida monthly series on a calendar axis.
fig, ax = plt.subplots(figsize=(15, 10))

# One x position per data point: the first day of each month, counted from
# January 2000 (index 0 of the monthly lists).  Building the dates from the
# list length keeps x and y the same size by construction.
dates = [datetime.datetime(2000 + m // 12, m % 12 + 1, 1)
         for m in range(len(list1))]

for series in (list1, list2, list3, list4):
    # Round markers joined by a solid line, i.e. what the deprecated
    # plot_date(dates, series, ls='-') call drew.
    ax.plot(dates, series, 'o', ls='-')

date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2", "O3", "SO2", "CO"])
plt.title("Broken line diagram of main pollutants in Florida")
fig.autofmt_xdate()
plt.show()
import pandas as pd
import numpy as np
import datetime
# Reload the pollution CSV and apply the same clean-up as at the top of the
# script.
file = 'D:\\pollution_us_2000_2016.csv'
df = pd.read_csv(file)

# Back-fill the gaps in the two AQI columns.  The result is assigned back to
# the frame: calling fillna(..., inplace=True) on a selected column is not
# guaranteed to modify `df`, and fillna(method=...) is deprecated.
for aqi_column in ('SO2 AQI', 'CO AQI'):
    df[aqi_column] = df[aqi_column].bfill()

# Remove the unnamed index column, the code/site columns and the unit
# columns.  The positional column numbers used later in this script refer to
# the layout that remains after this removal.
df = df.drop(columns=[
    'Unnamed: 0',
    'State Code',
    'County Code',
    'Site Num',
    'NO2 Units',
    'O3 Units',
    'SO2 Units',
    'CO Units',
])
# Monthly average of four pollutant columns for Arizona: the values of every
# row in a month are averaged over the rows recorded in that month.
#
# Months are numbered from January 2000 (index 0); 195 buckets reach
# March 2016.  Rows outside that window are ignored, and a month without any
# row yields NaN instead of a division by zero.  NaN cells are skipped by the
# mean rather than propagated.
n_months = 195

# Column 1 is compared against the state name, column 4 is parsed as a date.
state_rows = df[df.iloc[:, 1] == 'Arizona']
when = pd.to_datetime(state_rows.iloc[:, 4])
bucket = (when.dt.year - 2000) * 12 + when.dt.month - 1
keep = (bucket >= 0) & (bucket < n_months)

# Columns 5, 9, 13 and 17 are the series plotted as NO2, O3, SO2 and CO.
grouped = state_rows.loc[keep].iloc[:, [5, 9, 13, 17]].groupby(bucket[keep])

# list0: number of rows per month; list1..list4: monthly mean per pollutant.
list0 = grouped.size().reindex(range(n_months), fill_value=0).tolist()
monthly = grouped.mean().reindex(range(n_months))
list1, list2, list3, list4 = (monthly.iloc[:, k].tolist() for k in range(4))
import matplotlib.pyplot as plt
# Scatter plot of the four Arizona monthly series against the month index.
x = list(range(195))
# create new figure
plt.figure(figsize=(50, 25))
plt.subplot(2, 3, 6)
for series, colour in zip((list1, list2, list3, list4), 'rbgc'):
    plt.scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2", "O3", "SO2", "CO"])
plt.title("Scatter diagram of main pollutants in Arizona")
plt.show()
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np
# Line chart of the four Arizona monthly series on a calendar axis.
fig, ax = plt.subplots(figsize=(15, 10))

# One x position per data point: the first day of each month, counted from
# January 2000 (index 0 of the monthly lists).  Building the dates from the
# list length keeps x and y the same size by construction.
dates = [datetime.datetime(2000 + m // 12, m % 12 + 1, 1)
         for m in range(len(list1))]

for series in (list1, list2, list3, list4):
    # Round markers joined by a solid line, i.e. what the deprecated
    # plot_date(dates, series, ls='-') call drew.
    ax.plot(dates, series, 'o', ls='-')

date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2", "O3", "SO2", "CO"])
plt.title("Broken line diagram of main pollutants in Arizona")
fig.autofmt_xdate()
plt.show()
import pandas as pd
import numpy as np
import datetime
# Reload the pollution CSV and apply the same clean-up as at the top of the
# script.
file = 'D:\\pollution_us_2000_2016.csv'
df = pd.read_csv(file)

# Back-fill the gaps in the two AQI columns.  The result is assigned back to
# the frame: calling fillna(..., inplace=True) on a selected column is not
# guaranteed to modify `df`, and fillna(method=...) is deprecated.
for aqi_column in ('SO2 AQI', 'CO AQI'):
    df[aqi_column] = df[aqi_column].bfill()

# Remove the unnamed index column, the code/site columns and the unit
# columns.  The positional column numbers used later in this script refer to
# the layout that remains after this removal.
df = df.drop(columns=[
    'Unnamed: 0',
    'State Code',
    'County Code',
    'Site Num',
    'NO2 Units',
    'O3 Units',
    'SO2 Units',
    'CO Units',
])
# Monthly average of four pollutant columns for California: the values of
# every row in a month are averaged over the rows recorded in that month.
#
# Months are numbered from January 2000 (index 0); only the first 195 months
# (through March 2016) are kept, which is the range the plots below use.
# Rows outside that window are ignored explicitly instead of being collected
# in oversized lists, and a month without any row yields NaN instead of a
# division by zero.  NaN cells are skipped by the mean rather than propagated.
n_months = 195

# Column 1 is compared against the state name, column 4 is parsed as a date.
state_rows = df[df.iloc[:, 1] == 'California']
when = pd.to_datetime(state_rows.iloc[:, 4])
bucket = (when.dt.year - 2000) * 12 + when.dt.month - 1
keep = (bucket >= 0) & (bucket < n_months)

# Columns 5, 9, 13 and 17 are the series plotted as NO2, O3, SO2 and CO.
grouped = state_rows.loc[keep].iloc[:, [5, 9, 13, 17]].groupby(bucket[keep])

# list0: number of rows per month; list1..list4: monthly mean per pollutant.
list0 = grouped.size().reindex(range(n_months), fill_value=0).tolist()
monthly = grouped.mean().reindex(range(n_months))
list1, list2, list3, list4 = (monthly.iloc[:, k].tolist() for k in range(4))

# The California plots read these 195-element copies.
list11 = list(list1)
list22 = list(list2)
list33 = list(list3)
list44 = list(list4)
from matplotlib.pyplot import *
# Scatter plot of the four California monthly series against the month index.
x = list(range(195))
# create new figure
plt.figure(figsize=(50, 25))
plt.subplot(2, 3, 6)
for series, colour in zip((list11, list22, list33, list44), 'rbgc'):
    plt.scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2", "O3", "SO2", "CO"])
plt.title("Scatter diagram of main pollutants in California")
plt.show()
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np
# Line chart of the four California monthly series on a calendar axis.
fig, ax = plt.subplots(figsize=(15, 10))

# One x position per data point: the first day of each month, counted from
# January 2000 (index 0 of the monthly lists).  Building the dates from the
# list length keeps x and y the same size by construction.
dates = [datetime.datetime(2000 + m // 12, m % 12 + 1, 1)
         for m in range(len(list11))]

for series in (list11, list22, list33, list44):
    # Round markers joined by a solid line, i.e. what the deprecated
    # plot_date(dates, series, ls='-') call drew.
    ax.plot(dates, series, 'o', ls='-')

date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2", "O3", "SO2", "CO"])
plt.title("Broken line diagram of main pollutants in California")
fig.autofmt_xdate()
plt.show()
# Monthly average of eight per-pollutant columns for Arizona.
#
# Months are numbered from January 2000 (index 0); 195 buckets reach
# March 2016.  Rows outside that window are ignored, and a month without any
# row yields NaN instead of a division by zero.  NaN cells are skipped by the
# mean rather than propagated.
n_months = 195

# Column 1 is compared against the state name, column 4 is parsed as a date.
state_rows = df[df.iloc[:, 1] == 'Arizona']
when = pd.to_datetime(state_rows.iloc[:, 4])
bucket = (when.dt.year - 2000) * 12 + when.dt.month - 1
keep = (bucket >= 0) & (bucket < n_months)

# Columns 7, 11, 15 and 19 are the ones plotted as "1st Max Hour" below.
# NOTE(review): columns 6, 10, 14 and 18 are presumably the matching
# "1st Max Value" columns - confirm against the CSV header.
picked = [6, 7, 10, 11, 14, 15, 18, 19]
grouped = state_rows.loc[keep].iloc[:, picked].groupby(bucket[keep])

# b: number of rows per month.
b = grouped.size().reindex(range(n_months), fill_value=0).tolist()
monthly = grouped.mean().reindex(range(n_months))

# data[k][m]: mean of the k-th picked column in month m.
data = [monthly.iloc[:, k].tolist() for k in range(len(picked))]
# a[m][k]: the same means, one row per month.
a = [list(month_row) for month_row in zip(*data)]
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np
# Line chart of the monthly "1st Max Hour" averages for Arizona.
fig, ax = plt.subplots(figsize=(15, 10))

# One x position per data point: the first day of each month, counted from
# January 2000 (index 0 of the monthly lists).  Building the dates from the
# list length keeps x and y the same size by construction.
dates = [datetime.datetime(2000 + m // 12, m % 12 + 1, 1)
         for m in range(len(data[1]))]

for series in (data[1], data[3], data[5], data[7]):
    # Round markers joined by a solid line, i.e. what the deprecated
    # plot_date(dates, series, ls='-') call drew.
    ax.plot(dates, series, 'o', ls='-')

date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2 1st Max Hour", "O3 1st Max Hour", "SO2 1st Max Hour", "CO 1st Max Hour"])
plt.title("Broken line diagram of 1st Max Hour in Arizona")
fig.autofmt_xdate()
plt.show()
import matplotlib.pyplot as plt
# Scatter plot of the monthly "1st Max Hour" averages for Arizona against
# the month index.
x = list(range(195))
# create new figure
plt.figure(figsize=(50, 25))
plt.subplot(2, 3, 6)
for series, colour in zip((data[1], data[3], data[5], data[7]), 'rbgc'):
    plt.scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Time')
plt.legend(["NO2 1st Max Hour", "O3 1st Max Hour", "SO2 1st Max Hour", "CO 1st Max Hour"])
plt.title("Maximum time of Arizona pollutant value Scatter plot")
plt.show()
# Monthly average of eight per-pollutant columns for California.
#
# Months are numbered from January 2000 (index 0); 195 buckets reach
# March 2016.  Rows outside that window are ignored, and a month without any
# row yields NaN instead of a division by zero.  NaN cells are skipped by the
# mean rather than propagated.
n_months = 195

# Column 1 is compared against the state name, column 4 is parsed as a date.
state_rows = df[df.iloc[:, 1] == 'California']
when = pd.to_datetime(state_rows.iloc[:, 4])
bucket = (when.dt.year - 2000) * 12 + when.dt.month - 1
keep = (bucket >= 0) & (bucket < n_months)

# Columns 7, 11, 15 and 19 are the ones plotted as "1st Max Hour" below.
# NOTE(review): columns 6, 10, 14 and 18 are presumably the matching
# "1st Max Value" columns - confirm against the CSV header.
picked = [6, 7, 10, 11, 14, 15, 18, 19]
grouped = state_rows.loc[keep].iloc[:, picked].groupby(bucket[keep])

# b: number of rows per month.
b = grouped.size().reindex(range(n_months), fill_value=0).tolist()
monthly = grouped.mean().reindex(range(n_months))

# data[k][m]: mean of the k-th picked column in month m.
data = [monthly.iloc[:, k].tolist() for k in range(len(picked))]
# a[m][k]: the same means, one row per month.
a = [list(month_row) for month_row in zip(*data)]
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np
# Line chart of the monthly "1st Max Hour" averages for California.
fig, ax = plt.subplots(figsize=(15, 10))

# One x position per data point: the first day of each month, counted from
# January 2000 (index 0 of the monthly lists).  Building the dates from the
# list length keeps x and y the same size by construction.
dates = [datetime.datetime(2000 + m // 12, m % 12 + 1, 1)
         for m in range(len(data[1]))]

for series in (data[1], data[3], data[5], data[7]):
    # Round markers joined by a solid line, i.e. what the deprecated
    # plot_date(dates, series, ls='-') call drew.
    ax.plot(dates, series, 'o', ls='-')

date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2 1st Max Hour", "O3 1st Max Hour", "SO2 1st Max Hour", "CO 1st Max Hour"])
plt.title("Broken line diagram of 1st Max Hour in California")
fig.autofmt_xdate()
plt.show()
import matplotlib.pyplot as plt
# Scatter plot of the monthly "1st Max Hour" averages for California against
# the month index.
x = list(range(195))
# create new figure
plt.figure(figsize=(50, 25))
plt.subplot(2, 3, 6)
for series, colour in zip((data[1], data[3], data[5], data[7]), 'rbgc'):
    plt.scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Time')
plt.legend(["NO2 1st Max Hour", "O3 1st Max Hour", "SO2 1st Max Hour", "CO 1st Max Hour"])
plt.title("Maximum time of California pollutant value Scatter plot")
plt.show()
# Monthly average of eight per-pollutant columns for Florida.
#
# Months are numbered from January 2000 (index 0); 195 buckets reach
# March 2016.  Rows outside that window are ignored, and a month without any
# row yields NaN instead of a division by zero.  NaN cells are skipped by the
# mean rather than propagated.
n_months = 195

# Column 1 is compared against the state name, column 4 is parsed as a date.
state_rows = df[df.iloc[:, 1] == 'Florida']
when = pd.to_datetime(state_rows.iloc[:, 4])
bucket = (when.dt.year - 2000) * 12 + when.dt.month - 1
keep = (bucket >= 0) & (bucket < n_months)

# Columns 7, 11, 15 and 19 are the ones plotted as "1st Max Hour" below.
# NOTE(review): columns 6, 10, 14 and 18 are presumably the matching
# "1st Max Value" columns - confirm against the CSV header.
picked = [6, 7, 10, 11, 14, 15, 18, 19]
grouped = state_rows.loc[keep].iloc[:, picked].groupby(bucket[keep])

# b: number of rows per month.
b = grouped.size().reindex(range(n_months), fill_value=0).tolist()
monthly = grouped.mean().reindex(range(n_months))

# data[k][m]: mean of the k-th picked column in month m.
data = [monthly.iloc[:, k].tolist() for k in range(len(picked))]
# a[m][k]: the same means, one row per month.
a = [list(month_row) for month_row in zip(*data)]
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np
# Line chart of the monthly "1st Max Hour" averages for Florida.
fig, ax = plt.subplots(figsize=(15, 10))

# One x position per data point: the first day of each month, counted from
# January 2000 (index 0 of the monthly lists).  Building the dates from the
# list length keeps x and y the same size by construction.
dates = [datetime.datetime(2000 + m // 12, m % 12 + 1, 1)
         for m in range(len(data[1]))]

for series in (data[1], data[3], data[5], data[7]):
    # Round markers joined by a solid line, i.e. what the deprecated
    # plot_date(dates, series, ls='-') call drew.
    ax.plot(dates, series, 'o', ls='-')

date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
plt.legend(["NO2 1st Max Hour", "O3 1st Max Hour", "SO2 1st Max Hour", "CO 1st Max Hour"])
plt.title("Broken line diagram of 1st Max Hour in Florida")
fig.autofmt_xdate()
plt.show()
import matplotlib.pyplot as plt
# Scatter plot of the monthly "1st Max Hour" averages for Florida against
# the month index.
x = list(range(195))
# create new figure
plt.figure(figsize=(50, 25))
plt.subplot(2, 3, 6)
for series, colour in zip((data[1], data[3], data[5], data[7]), 'rbgc'):
    plt.scatter(x, series, c=colour, marker='+')
plt.xlabel('Months since 2000')
plt.ylabel('Time')
plt.legend(["NO2 1st Max Hour", "O3 1st Max Hour", "SO2 1st Max Hour", "CO 1st Max Hour"])
plt.title("Maximum time of Florida pollutant value Scatter plot")
plt.show()
# Correlation between the numeric columns.  The text columns are excluded
# explicitly because corr() raises on them in recent pandas versions, and the
# matrix is computed once and reused.
corr_matrix = df.select_dtypes(include='number').corr()
print(corr_matrix)

# Show only the strongly correlated pairs: coefficients above 0.8, with the
# entries equal to 1 (each column against itself) blanked out as NaN.
strong_corr = corr_matrix.where((corr_matrix > 0.8) & (corr_matrix != 1))
print(strong_corr)
# Comparison across states (Arizona, Florida, California):
import matplotlib.pyplot as plt
from pylab import *
import datetime
import numpy as np
# Compare one monthly series across Arizona, Florida and California.
#
# The series are computed here from `df`: the names this section used to read
# (`lis1`, `li1`) are not defined anywhere in the script, and `list1` holds
# whichever state was processed last rather than Arizona.
n_months = 195  # months from January 2000 (index 0) through March 2016

# Column 4 is parsed as the observation date, column 1 is compared against
# the state name, and column 5 is the series the earlier plots label "NO2".
when = pd.to_datetime(df.iloc[:, 4])
bucket = (when.dt.year - 2000) * 12 + when.dt.month - 1
in_window = (bucket >= 0) & (bucket < n_months)

# One x position per month: the first day of that month.
dates = [datetime.datetime(2000 + m // 12, m % 12 + 1, 1)
         for m in range(n_months)]

fig, ax = plt.subplots(figsize=(15, 10))
for state in ("Arizona", "Florida", "California"):
    rows = in_window & (df.iloc[:, 1] == state)
    # Months without any row for the state become NaN and are not drawn.
    no2_mean = (df.loc[rows].iloc[:, 5]
                .groupby(bucket[rows]).mean()
                .reindex(range(n_months)))
    ax.plot(dates, no2_mean.tolist(), 'o', ls='-')

date_format = mpl.dates.DateFormatter('%Y-%m-%d')
ax.xaxis.set_major_formatter(date_format)
plt.xlabel('Months since 2000')
plt.ylabel('Numerical value')
# NOTE(review): the third entry used to read "California SO2 Mean"; all three
# curves are the same (NO2) column, so it is assumed to be a typo - confirm.
plt.legend(["Arizona NO2 Mean", "Florida NO2 Mean", "California NO2 Mean"])
plt.title("Broken line diagram")
fig.autofmt_xdate()
plt.show()