# Predict the next day's PM2.5 level from the features below
import torch
# import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
data = pd.read_csv('./PRSA_data_2010.1.1-2014.12.31.csv')
data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 43824 entries, 0 to 43823
# Data columns (total 13 columns):
#  #   Column  Non-Null Count  Dtype
# ---  ------  --------------  -----
#  0   No      43824 non-null  int64
#  1   year    43824 non-null  int64
#  2   month   43824 non-null  int64
#  3   day     43824 non-null  int64
#  4   hour    43824 non-null  int64
#  5   pm2.5   41757 non-null  float64  (2000+ rows are NaN)
#  6   DEWP    43824 non-null  int64
#  7   TEMP    43824 non-null  float64
#  8   PRES    43824 non-null  float64
#  9   cbwd    43824 non-null  object
#  10  Iws     43824 non-null  float64
#  11  Is      43824 non-null  int64
#  12  Ir      43824 non-null  int64
# dtypes: float64(4), int64(8), object(1)
# memory usage: 4.3+ MB
data['pm2.5'].isna()  # returns a boolean Series: True where pm2.5 is NaN
data['pm2.5'].isna().sum()
# 2067
data[data['pm2.5'].isna()]  # the rows where pm2.5 is NaN
data = data.iloc[24:].copy()  # drop the first 24 rows (the first day), whose pm2.5 values are all NaN, so forward fill has a valid starting value
data.ffill(inplace=True)  # forward fill: each remaining NaN takes the most recent earlier observation
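# A minimal illustration of forward fill on toy values (not from the dataset):
_demo = pd.Series([1.0, np.nan, np.nan, 4.0])
_demo.ffill()  # -> 1.0, 1.0, 1.0, 4.0: each NaN is replaced by the last valid value before it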
data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 43800 entries, 24 to 43823
# Data columns (total 13 columns):
#  #   Column  Non-Null Count  Dtype
# ---  ------  --------------  -----
#  0   No      43800 non-null  int64
#  1   year    43800 non-null  int64
#  2   month   43800 non-null  int64
#  3   day     43800 non-null  int64
#  4   hour    43800 non-null  int64
#  5   pm2.5   43800 non-null  float64
#  6   DEWP    43800 non-null  int64
#  7   TEMP    43800 non-null  float64
#  8   PRES    43800 non-null  float64
#  9   cbwd    43800 non-null  object
#  10  Iws     43800 non-null  float64
#  11  Is      43800 non-null  int64
#  12  Ir      43800 non-null  int64
# dtypes: float64(4), int64(8), object(1)
# memory usage: 4.3+ MB
# Target to predict: the pm2.5 concentration
# Features used: dew point / temperature / pressure, etc.; the timestamp serves only as the index
data.drop('No', axis=1, inplace=True)
import datetime
# Merge the separate time columns into a single datetime column
data['time'] = data.apply(lambda x: datetime.datetime(year=x['year'],
month=x['month'],
day=x['day'],
hour=x['hour']),
axis=1)
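# A vectorized alternative to the row-wise apply above: pd.to_datetime can assemble
# datetimes from DataFrame columns named year/month/day/hour (this just recomputes
# the same 'time' column):
data['time'] = pd.to_datetime(data[['year', 'month', 'day', 'hour']])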
data.set_index('time', inplace=True)
data.drop(columns=['year', 'month', 'day', 'hour'], inplace=True)  # drop the now-redundant time columns
data.head()
data.columns = ['pm2.5', 'dew', 'temp', 'press', 'cbwd', 'iws', 'snow', 'rain']
data.cbwd.unique()
# array(['SE', 'cv', 'NW', 'NE'], dtype=object)
data = data.join(pd.get_dummies(data.cbwd))  # one-hot encode the wind-direction categories
del data['cbwd']  # drop cbwd: its information now lives in the four new one-hot columns
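# The join + del pair above can also be a single call; a sketch (commented out,
# since it would have to replace the two lines above, and cbwd is already gone here):
# data = pd.get_dummies(data, columns=['cbwd'], prefix='', prefix_sep='')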
data.info()
# <class 'pandas.core.frame.DataFrame'>
# DatetimeIndex: 43800 entries, 2010-01-02 00:00:00 to 2014-12-31 23:00:00
# Data columns (total 11 columns):
#  #   Column  Non-Null Count  Dtype
# ---  ------  --------------  -----
#  0   pm2.5   43800 non-null  float64
#  1   dew     43800 non-null  int64
#  2   temp    43800 non-null  float64
#  3   press   43800 non-null  float64
#  4   iws     43800 non-null  float64
#  5   snow    43800 non-null  int64
#  6   rain    43800 non-null  int64
#  7   NE      43800 non-null  uint8
#  8   NW      43800 non-null  uint8
#  9   SE      43800 non-null  uint8
#  10  cv      43800 non-null  uint8
# dtypes: float64(4), int64(3), uint8(4)
# memory usage: 4.1 MB
# data['pm2.5'][-1000:].plot()  # last 1000 pm2.5 observations
# data['temp'][-1000:].plot()  # last 1000 temperature observations
data.head(3)
sequence_length = 5*24  # input window: the 5 days (120 hourly observations) before the current time point
delay = 24  # prediction horizon: pm2.5 one day (24 hours) after the input window ends
data_ = []
for i in range(len(data) - sequence_length - delay):  # slide a window over every start position from 0 to len(data) - sequence_length - delay
    data_.append(data.iloc[i: i + sequence_length + delay])  # each sample spans 6 consecutive days (input window + horizon)
data_ = np.array([df.values for df in data_])
data_.shape  # 43656 samples, each 144 time steps long, with 11 features per step
# (43656, 144, 11)
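# The Python loop above materializes 43k DataFrame slices; numpy's sliding_window_view
# (numpy >= 1.20) builds the same windows as a cheap view. A sketch; note it yields one
# more window (43657) than the loop, whose range stops one start position short:
from numpy.lib.stride_tricks import sliding_window_view
windows = sliding_window_view(data.values, sequence_length + delay, axis=0)  # (43657, 11, 144)
windows = windows.transpose(0, 2, 1)  # reorder to (samples, time steps, features)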
np.random.shuffle(data_)  # shuffle the samples
x = data_[:, :-delay, :]  # inputs: the first 5*24 = 120 time steps of each sample
y = data_[:, -1, 0]  # targets: pm2.5 (feature 0) at the last time step of each sample
# x.shape
# (43656, 120, 11)
# y.shape
# (43656,)
x = x.astype(np.float32)
y = y.astype(np.float32)
split_boundary = int(data_.shape[0] * 0.8)  # 80/20 train/test split
train_x = x[: split_boundary]
test_x = x[split_boundary:]
train_y = y[: split_boundary]
test_y = y[split_boundary:]
train_x.shape, test_x.shape, train_y.shape, test_y.shape
# ((34924, 120, 11), (8732, 120, 11), (34924,), (8732,))
mean = train_x.mean(axis=0)  # compute standardization statistics on the training set only, never on the full data: test-set statistics would leak information from the "future"
std = train_x.std(axis=0)
mean.shape
# (120, 11)
# Standardization is optional for classification but strongly recommended for regression; standardized inputs make backpropagation much better behaved
train_x = (train_x - mean)/std
test_x = (test_x - mean)/std
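# Note that mean/std above are computed per (time step, feature), shape (120, 11).
# A common alternative pools over samples and time steps for a single statistic per
# feature; a sketch (commented out because it would change the results below):
# mean = train_x.mean(axis=(0, 1))  # shape (11,)
# std = train_x.std(axis=(0, 1))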
class Mydataset(torch.utils.data.Dataset):
def __init__(self, features, labels):
self.features = features
self.labels = labels
def __getitem__(self, index):
feature = self.features[index]
label = self.labels[index]
return feature, label
def __len__(self):
return len(self.features)
train_ds = Mydataset(train_x, train_y)
test_ds = Mydataset(test_x, test_y)
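# Since the features and labels are already plain arrays, PyTorch's built-in
# TensorDataset is an equivalent, shorter option; a sketch:
# train_ds = torch.utils.data.TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
# test_ds = torch.utils.data.TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))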
BATCH_SIZE = 128
train_dl = torch.utils.data.DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True
)
test_dl = torch.utils.data.DataLoader(
    test_ds,
    batch_size=BATCH_SIZE
)
hidden_size = 64
# For features with sequential structure, extract them with an LSTM or a GRU
class Net(nn.Module):
def __init__(self, hidden_size):
super(Net, self).__init__()
        self.rnn = nn.LSTM(train_x.shape[-1],
                           hidden_size,
                           batch_first=True)  # PyTorch RNNs default to (seq, batch, feature); batch_first=True puts batch on the first dimension
self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, 1)  # fc2's input size matches fc1's output size, 128
    def forward(self, inputs):
        _, (h_n, _) = self.rnn(inputs)  # h_n is the final hidden state, shape (num_layers, batch, hidden_size)
        h = h_n[-1]  # hidden state of the last layer: (batch, hidden_size)
        x = F.dropout(F.relu(self.fc1(h)), training=self.training)  # training= ties dropout to model.train()/model.eval()
        x = self.fc2(x)  # shape: (batch, 1)
        return torch.squeeze(x)  # torch.squeeze removes the size-1 dimension -> (batch,)
model = Net(hidden_size)
model
# Net(
# (rnn): LSTM(11, 64, batch_first=True)
# (fc1): Linear(in_features=64, out_features=128, bias=True)
# (fc2): Linear(in_features=128, out_features=1, bias=True)
# )
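# The comment above the class notes that an LSTM or a GRU both fit. A sketch of the
# GRU variant (GRUNet is a hypothetical name; nn.GRU returns (output, h_n) directly,
# with no cell state):
class GRUNet(nn.Module):
    def __init__(self, hidden_size):
        super(GRUNet, self).__init__()
        self.rnn = nn.GRU(train_x.shape[-1], hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, 1)
    def forward(self, inputs):
        _, h_n = self.rnn(inputs)  # h_n: (num_layers, batch, hidden_size)
        x = F.dropout(F.relu(self.fc1(h_n[-1])), training=self.training)
        return torch.squeeze(self.fc2(x))  # (batch,)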
if torch.cuda.is_available():
model.to('cuda')
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
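# fit() below carries a commented-out exp_lr_scheduler.step() call, but no scheduler
# is ever defined. If you want learning-rate decay, define one first; a sketch matching
# the name (gamma is an assumed value, and enabling it will change the losses below):
# exp_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)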
def fit(epoch, model, trainloader, testloader):
total = 0
running_loss = 0
model.train()
for x, y in trainloader:
if torch.cuda.is_available():
x, y = x.to('cuda'), y.to('cuda')
y_pred = model(x)
loss = loss_fn(y_pred, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
with torch.no_grad():
total += y.size(0)
running_loss += loss.item()
# exp_lr_scheduler.step()
epoch_loss = running_loss / len(trainloader.dataset)
test_total = 0
test_running_loss = 0
model.eval()
with torch.no_grad():
for x, y in testloader:
if torch.cuda.is_available():
x, y = x.to('cuda'), y.to('cuda')
y_pred = model(x)
loss = loss_fn(y_pred, y)
test_total += y.size(0)
test_running_loss += loss.item()
epoch_test_loss = test_running_loss / len(testloader.dataset)
print('epoch: ', epoch,
'loss: ', round(epoch_loss, 3),
'test_loss: ', round(epoch_test_loss, 3),
)
return epoch_loss, epoch_test_loss
epochs = 100
train_loss = []
test_loss = []
for epoch in range(epochs):
epoch_loss, epoch_test_loss = fit(epoch,
model,
train_dl,
test_dl)
train_loss.append(epoch_loss)
test_loss.append(epoch_test_loss)
# epoch: 0 loss: 68.588 test_loss: 54.964
# epoch: 1 loss: 55.669 test_loss: 53.593
# epoch: 2 loss: 54.41 test_loss: 52.06
# epoch: 3 loss: 53.641 test_loss: 51.423
# epoch: 4 loss: 52.42 test_loss: 49.987
# epoch: 5 loss: 51.155 test_loss: 48.557
# epoch: 6 loss: 50.076 test_loss: 47.17
# epoch: 7 loss: 48.91 test_loss: 45.625
# epoch: 8 loss: 47.173 test_loss: 43.359
# epoch: 9 loss: 45.375 test_loss: 42.027
# epoch: 10 loss: 43.466 test_loss: 39.962
# epoch: 11 loss: 41.474 test_loss: 39.458
# epoch: 12 loss: 39.881 test_loss: 38.783
# epoch: 13 loss: 38.193 test_loss: 35.53
# epoch: 14 loss: 37.687 test_loss: 33.928
# epoch: 15 loss: 34.739 test_loss: 34.505
# epoch: 16 loss: 35.562 test_loss: 31.928
# epoch: 17 loss: 33.771 test_loss: 28.281
# epoch: 18 loss: 31.609 test_loss: 34.744
# epoch: 19 loss: 30.227 test_loss: 27.182
# epoch: 20 loss: 30.243 test_loss: 29.004
# epoch: 21 loss: 27.315 test_loss: 25.241
# epoch: 22 loss: 27.382 test_loss: 24.698
# epoch: 23 loss: 28.096 test_loss: 25.942
# epoch: 24 loss: 26.235 test_loss: 24.247
# epoch: 25 loss: 24.684 test_loss: 23.874
# epoch: 26 loss: 24.026 test_loss: 22.576
# epoch: 27 loss: 23.737 test_loss: 21.454
# epoch: 28 loss: 21.713 test_loss: 20.979
# epoch: 29 loss: 21.546 test_loss: 21.355
# epoch: 30 loss: 36.789 test_loss: 31.965
# epoch: 31 loss: 30.321 test_loss: 28.344
# epoch: 32 loss: 28.257 test_loss: 27.48
# epoch: 33 loss: 25.646 test_loss: 24.348
# epoch: 34 loss: 23.097 test_loss: 21.579
# epoch: 35 loss: 22.617 test_loss: 22.492
# epoch: 36 loss: 25.101 test_loss: 22.987
# epoch: 37 loss: 21.623 test_loss: 21.107
# epoch: 38 loss: 20.209 test_loss: 19.845
# epoch: 39 loss: 21.194 test_loss: 19.686
# epoch: 40 loss: 23.428 test_loss: 21.376
# epoch: 41 loss: 23.757 test_loss: 26.797
# epoch: 42 loss: 25.559 test_loss: 24.227
# epoch: 43 loss: 21.455 test_loss: 19.811
# epoch: 44 loss: 19.294 test_loss: 19.272
# epoch: 45 loss: 20.36 test_loss: 21.111
# epoch: 46 loss: 18.837 test_loss: 17.935
# epoch: 47 loss: 17.786 test_loss: 18.672
# epoch: 48 loss: 17.368 test_loss: 16.897
# epoch: 49 loss: 16.809 test_loss: 19.155
# epoch: 50 loss: 17.707 test_loss: 17.095
# epoch: 51 loss: 16.12 test_loss: 16.193
# epoch: 52 loss: 16.583 test_loss: 18.015
# epoch: 53 loss: 15.847 test_loss: 15.568
# epoch: 54 loss: 15.263 test_loss: 15.603
# epoch: 55 loss: 14.526 test_loss: 15.696
# epoch: 56 loss: 17.524 test_loss: 17.718
# epoch: 57 loss: 15.436 test_loss: 15.057
# epoch: 58 loss: 14.649 test_loss: 17.361
# epoch: 59 loss: 15.266 test_loss: 14.205
# epoch: 60 loss: 14.246 test_loss: 16.719
# epoch: 61 loss: 14.612 test_loss: 17.773
# epoch: 62 loss: 20.967 test_loss: 20.753
# epoch: 63 loss: 18.532 test_loss: 17.17
# epoch: 64 loss: 15.993 test_loss: 15.616
# epoch: 65 loss: 14.917 test_loss: 14.723
# epoch: 66 loss: 14.126 test_loss: 13.853
# epoch: 67 loss: 13.362 test_loss: 13.607
# epoch: 68 loss: 14.241 test_loss: 17.603
# epoch: 69 loss: 14.564 test_loss: 14.097
# epoch: 70 loss: 12.993 test_loss: 13.177
# epoch: 71 loss: 13.712 test_loss: 15.795
# epoch: 72 loss: 14.401 test_loss: 16.922
# epoch: 73 loss: 13.66 test_loss: 13.42
# epoch: 74 loss: 12.441 test_loss: 13.071
# epoch: 75 loss: 12.037 test_loss: 12.624
# epoch: 76 loss: 11.686 test_loss: 12.377
# epoch: 77 loss: 14.484 test_loss: 13.065
# epoch: 78 loss: 13.982 test_loss: 12.742
# epoch: 79 loss: 11.726 test_loss: 13.673
# epoch: 80 loss: 11.266 test_loss: 12.257
# epoch: 81 loss: 10.888 test_loss: 11.531
# epoch: 82 loss: 10.796 test_loss: 12.417
# epoch: 83 loss: 12.716 test_loss: 12.989
# epoch: 84 loss: 10.872 test_loss: 11.606
# epoch: 85 loss: 11.043 test_loss: 11.456
# epoch: 86 loss: 10.294 test_loss: 15.043
# epoch: 87 loss: 12.387 test_loss: 12.805
# epoch: 88 loss: 10.822 test_loss: 12.145
# epoch: 89 loss: 12.467 test_loss: 13.053
# epoch: 90 loss: 11.999 test_loss: 11.433
# epoch: 91 loss: 10.385 test_loss: 11.115
# epoch: 92 loss: 11.655 test_loss: 10.805
# epoch: 93 loss: 9.786 test_loss: 10.904
# epoch: 94 loss: 9.642 test_loss: 10.536
# epoch: 95 loss: 10.273 test_loss: 10.703
# epoch: 96 loss: 11.223 test_loss: 12.448
# epoch: 97 loss: 10.086 test_loss: 10.532
# epoch: 98 loss: 9.199 test_loss: 10.796
# epoch: 99 loss: 9.668 test_loss: 17.349
# Loss curves and a prediction demo
plt.plot(range(1, len(train_loss) + 1), train_loss, label='train_loss')
plt.plot(range(1, len(test_loss) + 1), test_loss, label='test_loss')
plt.legend()
test_x.shape  # (8732, 120, 11)
# Prediction
model.eval()  # disable dropout for inference
with torch.no_grad():
    pred_test_x = model(torch.from_numpy(test_x).to('cuda' if torch.cuda.is_available() else 'cpu'))
pred_test_x.shape # torch.Size([8732])
pred_test_x[:4]
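# A quick visual sanity check: overlay the first 100 test predictions on the ground
# truth (pm2.5 is in raw units, since only the inputs were standardized)
plt.plot(test_y[:100], label='true')
plt.plot(pred_test_x.cpu().numpy()[:100], label='pred')
plt.legend()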