import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import lag_plot
%matplotlib inline
plt.rcParams['font.sans-serif'] = 'SimHei' #显示中文
plt.rcParams['axes.unicode_minus'] = False #显示负号
# Beijing PM2.5 dataset for 2010-2014, downloadable from the UCI Machine Learning Repository.
path = 'PRSA_data_2010.1.1-2014.12.31.csv'

# Raw file exactly as stored on disk, for a first look at its columns.
data = pd.read_csv(path)
data.tail()

# Build the working frame from the raw one instead of parsing the file twice.
# Columns 1-4 of the file hold the year, month, day and hour of each record.
# The earlier read_csv(parse_dates={...}, date_parser=pd.datetime.strptime)
# call relied on pd.datetime, which no longer exists in current pandas, so
# the timestamp is assembled with pd.to_datetime instead.
date_cols = list(data.columns[1:5])
date_parts = data[date_cols].copy()
# pd.to_datetime assembles timestamps from columns carrying these exact names.
date_parts.columns = ['year', 'month', 'day', 'hour']

df = data.drop(columns=date_cols).set_index('No')
# .values drops the RangeIndex of `date_parts` so rows are matched by
# position rather than aligned against the 'No' index.
df.insert(0, 'datetime', pd.to_datetime(date_parts).values)
df.head()

df = df.set_index('datetime')
df.info()
# Reference output of df.info() recorded from an earlier run:
#
# DatetimeIndex: 43824 entries, 2010-01-01 00:00:00 to 2014-12-31 23:00:00
# Data columns (total 8 columns):
# pm2.5 41757 non-null float64
# DEWP 43824 non-null int64
# TEMP 43824 non-null float64
# PRES 43824 non-null float64
# cbwd 43824 non-null object
# Iws 43824 non-null float64
# Is 43824 non-null int64
# Ir 43824 non-null int64
# dtypes: float64(4), int64(3), object(1)
# memory usage: 3.0+ MB
df.head()
# All plots in this step draw the hourly temperature column.
temperature = df['TEMP']

# Line plot on a 20x10 figure with enlarged tick labels.
plt.figure(figsize=(20, 10))
line_ax = temperature.plot()
line_ax.tick_params(labelsize=20)
line_ax.grid()

# Same series drawn as black dots.
dot_ax = temperature.plot(style='.k', figsize=(20, 10))
dot_ax.tick_params(labelsize=20)
dot_ax.grid()

# Histogram with 100 bins (alternative spelling: temperature.hist(bins=100)).
hist_ax = temperature.plot.hist(bins=100)
hist_ax.grid()

# Kernel density estimate of the same values.
temperature.plot.kde()
# Single-column frame holding only the temperature readings.
temp = df['TEMP'].to_frame()
temp.head()

# Split the readings by calendar year and keep the raw values of each year.
groups = temp.groupby(pd.Grouper(freq='Y'))['TEMP']
yearly = {name.year: group.values for name, group in groups}

# One column per year, rows numbered by position within the year.
# Building the frame from a dict of Series takes the union of the row
# indexes, so years of different length are padded with NaN. Assigning the
# columns one at a time onto an initially empty frame would instead align
# every year to the length of the first one and silently cut off the extra
# rows of any longer (leap) year.
years = pd.DataFrame(
    {year: pd.Series(values) for year, values in yearly.items()}
)
years.head()
# Wide default figure size for the year-by-year comparisons.
plt.style.use({'figure.figsize': (50, 15)})

# All years overlaid on a single axes.
overlay_ax = years.plot()
overlay_ax.tick_params(labelsize=20)
overlay_ax.legend(fontsize=20, markerscale=20)
overlay_ax.grid()

# One subplot per year, stacked in a tall figure.
years.plot(subplots=True, figsize=(50, 15 * 5))
plt.show()

# Distribution of each year's values as box plots.
years.boxplot(figsize=(50, 15))

# Years as rows, position within the year as columns.
years_as_rows = years.T
sns.heatmap(years_as_rows)
plt.matshow(years_as_rows, interpolation=None, aspect='auto')

# Per-year histograms with 100 bins each.
years.hist(bins=100)
# Temperature readings of 2014, grouped by calendar month.
# Rows are selected with .loc: selecting rows of a DataFrame by a date
# string through plain [] is not supported by current pandas.
group_month = df.loc['2014'].groupby(pd.Grouper(freq='M'))['TEMP']

# One column per month; shorter months are padded with NaN by the concat.
month = pd.concat(
    [pd.Series(readings.values) for _, readings in group_month],
    axis=1,
)
month.columns = range(1, 13)

month.boxplot(figsize=(20, 5))
plt.title('北京2014年每月温度分布')

plt.style.use({'figure.figsize': (20, 5)})
sns.violinplot(data=month, linewidth=2)
plt.title('北京2014年每月温度分布小提琴图')
# Lags to inspect: powers of two from 1 to 256.
lag_list = [2**i for i in range(9)]
plt.style.use({'figure.figsize': (30, 30)})

# One lag plot per cell of a 3x3 grid.
# NOTE(review): the data passed in is `month` (the per-month table), while
# the title speaks of an hourly temperature lag; confirm whether the hourly
# `temp` series was intended here.
for position, lag in enumerate(lag_list, start=1):
    ax = plt.subplot(3, 3, position)
    # Draw on the subplot just created rather than relying on the implicit
    # current axes.
    lag_plot(month, lag=lag, ax=ax)
    # Only this title is set: an earlier 't vs t+N' title on the same axes
    # was overwritten immediately and never visible.
    ax.set_title('气温滞后{}小时散点图'.format(lag))
# Square default figure for the autocorrelation plot.
plt.style.use({'figure.figsize': (10, 10)})
from pandas.plotting import autocorrelation_plot

# Autocorrelation of the temperature frame; the function returns the axes
# it drew on, which is then used for the tick formatting.
acf_ax = autocorrelation_plot(temp)
acf_ax.tick_params(labelsize=10)
# 20 evenly spaced tick positions from -1 to 1.
acf_ax.set_yticks(np.linspace(-1, 1, 20))