# !pip install --upgrade pandas
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import paddle
import os
# Copy the dataset into the working directory
!mkdir -p work/hw1_data
!cp data/data74756/train.csv data/data74756/test.csv work/hw1_data/
df = pd.read_csv('work/hw1_data/train.csv', encoding='big5')
print(df.shape)
(4320, 27)
df.head()  # preview the first five rows of the CSV
| | 日期 | 測站 | 測項 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014/1/1 | 豐原 | AMB_TEMP | 14 | 14 | 14 | 13 | 12 | 12 | 12 | ... | 22 | 22 | 21 | 19 | 17 | 16 | 15 | 15 | 15 | 15 |
1 | 2014/1/1 | 豐原 | CH4 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | ... | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 |
2 | 2014/1/1 | 豐原 | CO | 0.51 | 0.41 | 0.39 | 0.37 | 0.35 | 0.3 | 0.37 | ... | 0.37 | 0.37 | 0.47 | 0.69 | 0.56 | 0.45 | 0.38 | 0.35 | 0.36 | 0.32 |
3 | 2014/1/1 | 豐原 | NMHC | 0.2 | 0.15 | 0.13 | 0.12 | 0.11 | 0.06 | 0.1 | ... | 0.1 | 0.13 | 0.14 | 0.23 | 0.18 | 0.12 | 0.1 | 0.09 | 0.1 | 0.08 |
4 | 2014/1/1 | 豐原 | NO | 0.9 | 0.6 | 0.5 | 1.7 | 1.8 | 1.5 | 1.9 | ... | 2.5 | 2.2 | 2.5 | 2.3 | 2.1 | 1.9 | 1.5 | 1.6 | 1.8 | 1.5 |
5 rows × 27 columns
df.tail()
| | 日期 | 測站 | 測項 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4315 | 2014/12/20 | 豐原 | THC | 1.8 | 1.8 | 1.8 | 1.8 | 1.8 | 1.7 | 1.7 | ... | 1.8 | 1.8 | 2 | 2.1 | 2 | 1.9 | 1.9 | 1.9 | 2 | 2 |
4316 | 2014/12/20 | 豐原 | WD_HR | 46 | 13 | 61 | 44 | 55 | 68 | 66 | ... | 59 | 308 | 327 | 21 | 100 | 109 | 108 | 114 | 108 | 109 |
4317 | 2014/12/20 | 豐原 | WIND_DIREC | 36 | 55 | 72 | 327 | 74 | 52 | 59 | ... | 18 | 311 | 52 | 54 | 121 | 97 | 107 | 118 | 100 | 105 |
4318 | 2014/12/20 | 豐原 | WIND_SPEED | 1.9 | 2.4 | 1.9 | 2.8 | 2.3 | 1.9 | 2.1 | ... | 2.3 | 2.6 | 1.3 | 1 | 1.5 | 1 | 1.7 | 1.5 | 2 | 2 |
4319 | 2014/12/20 | 豐原 | WS_HR | 0.7 | 0.8 | 1.8 | 1 | 1.9 | 1.7 | 2.1 | ... | 1.3 | 1.7 | 0.7 | 0.4 | 1.1 | 1.4 | 1.3 | 1.6 | 1.8 | 2 |
5 rows × 27 columns
df['測項'][:20]
0 AMB_TEMP
1 CH4
2 CO
3 NMHC
4 NO
5 NO2
6 NOx
7 O3
8 PM10
9 PM2.5
10 RAINFALL
11 RH
12 SO2
13 THC
14 WD_HR
15 WIND_DIREC
16 WIND_SPEED
17 WS_HR
18 AMB_TEMP
19 CH4
Name: 測項, dtype: object
We can see that there are 18 features (measurement types) in total.
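As a quick optional check, the number of distinct measurement types can be confirmed directly:

print(df['測項'].nunique())  # expect 18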
data = df.iloc[:, 3:].copy()   # .copy() avoids the SettingWithCopyWarning on the next line
data[data == 'NR'] = 0         # 'NR' (no rainfall) is treated as 0
raw_data = data.to_numpy().astype('float32')
raw_data.shape #(20*18*12, 24)
(4320, 24)
Reshape the raw data into 12 months, each an 18 (features) × 480 (hours) matrix:
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
    month_data[month] = sample
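A minimal sanity check on the reshaped data (optional):

# every month should now be an 18-feature × 480-hour matrix
assert len(month_data) == 12
assert month_data[0].shape == (18, 480)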
Each month has 480 hours, and every 9 consecutive hours form one sample, so each month yields 480 − 9 = 471 samples, for 471 × 12 samples in total. Each sample has 9 × 18 features (18 features per hour × 9 hours), and there are correspondingly 471 × 12 targets (the PM2.5 value of the 10th hour).
x = np.empty([12 * 471, 18 * 9], dtype = float)
y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
    for day in range(20):
        for hour in range(24):
            if day == 19 and hour > 14:
                continue
            x[month * 471 + day * 24 + hour, :] = month_data[month][:, day * 24 + hour : day * 24 + hour + 9].reshape(1, -1)  # vector dim: 18*9
            y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9]  # PM2.5 of the 10th hour
print(x.shape) # (471*12, 18*9)
print(y.shape)
(5652, 162)
(5652, 1)
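A spot check that the windowing matches the description above (optional):

# the first sample is hours 0-8 of month 0, flattened feature-major
assert np.allclose(x[0], month_data[0][:, 0:9].reshape(-1))
# its target is the PM2.5 reading (feature row 9) at hour 9
assert y[0, 0] == month_data[0][9, 9]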
mean_x = np.mean(x, axis=0) # 18 * 9
std_x = np.std(x, axis=0) # 18 * 9
# Normalization
def _normalize(X, train = True, specified_column = None, X_mean = None, X_std = None):
    # This function normalizes specific columns of X.
    # The mean and standard deviation of the training data are reused when processing testing data.
    #
    # Arguments:
    #     X: data to be processed
    #     train: 'True' when processing training data, 'False' for testing data
    #     specified_column: indexes of the columns that will be normalized. If 'None', all columns
    #         will be normalized.
    #     X_mean: mean value of training data, used when train = 'False'
    #     X_std: standard deviation of training data, used when train = 'False'
    # Outputs:
    #     X: normalized data
    #     X_mean: computed mean value of training data
    #     X_std: computed standard deviation of training data
    if specified_column is None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)
    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
    return X, X_mean, X_std
# Normalize the training data (note: _normalize modifies x in place)
X_train, X_mean, X_std = _normalize(x, train = True)
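The train=False branch is not exercised below (the test set is normalized with an explicit loop instead); reusing the training statistics would look like this, where test_x is hypothetical at this point:

# test_x, _, _ = _normalize(test_x, train=False, X_mean=X_mean, X_std=X_std)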
Split the training data into "train_set" and "validation_set"
def _train_dev_split(X, Y, dev_ratio = 0.25):
    # This function splits data into a training set and a development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]
def _shuffle(X, Y):
    # This function shuffles two equal-length arrays, X and Y, with the same permutation.
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])
# Split data into training set and development set (9:1)
dev_ratio = 0.1
# shuffle into new variables so the original (x, y) pairing stays aligned for the Paddle section below
X_shuffled, y_shuffled = _shuffle(X_train, y)
x_train, y_train, x_eval, y_eval = _train_dev_split(X_shuffled, y_shuffled, dev_ratio = dev_ratio)
print("train data:", x_train.shape)
print("train labels:", y_train.shape)
print("eval data:", x_eval.shape)
print("eval labels:", y_eval.shape)
train data: (5086, 162)
train labels: (5086, 1)
eval data: (566, 162)
eval labels: (566, 1)
Adagrad optimizer
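The update rule implemented in the training loop below: Adagrad accumulates the squared gradients and divides each step by their root, so the effective learning rate decays per parameter:

$$w^{t+1} = w^{t} - \frac{\eta}{\sqrt{\sum_{i=0}^{t}\left(g^{i}\right)^{2} + \epsilon}}\, g^{t}$$

This corresponds to `adagrad += gradient ** 2` followed by `w -= learning_rate * gradient / np.sqrt(adagrad + eps)` in the code.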
A 9:1 train/validation split is used, and training proceeds with mini-batches.
!mkdir -p work/checkpoint
n = 471 * 12          # total number of samples
dim = 18 * 9 + 1      # 162 weights + 1 bias
w = np.zeros([dim, 1])
# Some parameters for training
EPOCH = 1000
batch_size = 512
learning_rate = 100   # a large initial step is fine: Adagrad shrinks it as squared gradients accumulate
adagrad = np.zeros([dim, 1])
eps = 1e-9
# Keep the loss at every epoch for plotting
train_loss = []
eval_loss = []
x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1).astype('float32')  # prepend a bias column of ones
x_eval = np.concatenate((np.ones((x_eval.shape[0], 1)), x_eval), axis=1).astype('float32')     # prepend a bias column of ones
for epoch in range(EPOCH):
    x_train, y_train = _shuffle(x_train, y_train)
    # Mini-batch training (the last incomplete batch is dropped)
    steps = int(np.floor(x_train.shape[0] / batch_size))
    for idx in range(steps):
        X = x_train[idx * batch_size:(idx + 1) * batch_size]
        Y = y_train[idx * batch_size:(idx + 1) * batch_size]
        loss_train = np.sqrt(np.sum(np.power(np.dot(X, w) - Y, 2)) / batch_size)  # RMSE
        # gradient of the squared error w.r.t. w
        gradient = 2 * np.dot(X.transpose(), np.dot(X, w) - Y)  # dim * 1
        # Adagrad update
        adagrad += gradient ** 2
        w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
    train_loss.append(loss_train)
    loss_eval = np.sqrt(np.sum(np.power(np.dot(x_eval, w) - y_eval, 2)) / x_eval.shape[0])  # RMSE
    eval_loss.append(loss_eval)
    if epoch % 50 == 0:
        print(f'Epoch {epoch}/{EPOCH}: train_loss = {loss_train}, eval_loss = {loss_eval}')
        np.save(f'work/checkpoint/weight_epoch{epoch}.npy', w)
print('Training loss: {}'.format(train_loss[-1]))
print('Eval loss: {}'.format(eval_loss[-1]))
np.save('work/weight.npy', w)
Epoch 0/1000: train_loss = 159.4215807237326, eval_loss = 160.03964096127643
Epoch 50/1000: train_loss = 10.710251205011959, eval_loss = 13.633207160142092
Epoch 100/1000: train_loss = 8.940740484311618, eval_loss = 9.910586871289684
Epoch 150/1000: train_loss = 7.205304188039285, eval_loss = 8.512948578348558
Epoch 200/1000: train_loss = 6.576684152794168, eval_loss = 7.444865293517468
Epoch 250/1000: train_loss = 7.977637180300461, eval_loss = 6.880208266650443
Epoch 300/1000: train_loss = 6.266144801748074, eval_loss = 6.546815159494726
Epoch 350/1000: train_loss = 6.430904115053692, eval_loss = 6.285806046257601
Epoch 400/1000: train_loss = 5.916425468087467, eval_loss = 6.195045554834926
Epoch 450/1000: train_loss = 7.15277059658265, eval_loss = 6.083480618594979
Epoch 500/1000: train_loss = 5.82765440672093, eval_loss = 5.979611716769195
Epoch 550/1000: train_loss = 5.91141506637541, eval_loss = 5.966073299482594
Epoch 600/1000: train_loss = 5.435582187224593, eval_loss = 5.948395937855693
Epoch 650/1000: train_loss = 5.019412332964305, eval_loss = 5.924666273559038
Epoch 700/1000: train_loss = 6.035308988695684, eval_loss = 5.91948853709894
Epoch 750/1000: train_loss = 5.602453209142315, eval_loss = 5.872485524056799
Epoch 800/1000: train_loss = 5.416152523981437, eval_loss = 5.897742449011183
Epoch 850/1000: train_loss = 5.489958216577146, eval_loss = 5.900870747279584
Epoch 900/1000: train_loss = 6.006171906853285, eval_loss = 5.867429846520514
Epoch 950/1000: train_loss = 5.896218141280272, eval_loss = 5.903525465262537
Training loss: 5.908377606781291
Eval loss: 5.8538184824615795
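The per-epoch losses were collected for plotting; a minimal sketch using the matplotlib import from the top of the notebook:

plt.plot(train_loss, label='train RMSE')
plt.plot(eval_loss, label='eval RMSE')
plt.xlabel('epoch')
plt.ylabel('RMSE')
plt.legend()
plt.show()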
testdata = pd.read_csv('work/hw1_data/test.csv', header = None, encoding = 'big5')
test_data = testdata.iloc[:, 2:].copy()   # .copy() avoids the SettingWithCopyWarning
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy()
test_x = np.empty([240, 18 * 9], dtype = float)
for i in range(240):
    test_x[i, :] = test_data[18 * i: 18 * (i + 1), :].reshape(1, -1)
# normalize with the training-set statistics
for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_x[j] != 0:
            test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
test_x = np.concatenate((np.ones([240, 1]), test_x), axis = 1).astype(float)
test_x.shape
(240, 163)
w = np.load('work/weight.npy')
predict_y = np.dot(test_x, w)
predict_y.shape
(240, 1)
import csv
with open('work/submit1.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'value']
    csv_writer.writerow(header)
    for i in range(240):
        row = ['id_' + str(i), predict_y[i][0]]
        csv_writer.writerow(row)
Implementation with Paddle 2.0:
model = paddle.nn.Linear(162, 1)
model = paddle.Model(model)
model.summary((1, 1, 162))
---------------------------------------------------------------------------
Layer (type) Input Shape Output Shape Param #
===========================================================================
Linear-1 [[1, 1, 162]] [1, 1, 1] 163
===========================================================================
Total params: 163
Trainable params: 163
Non-trainable params: 0
---------------------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
---------------------------------------------------------------------------
{'total_params': 163, 'trainable_params': 163}
We can see that the bias is already included: 18 × 9 = 162 weights plus 1 bias gives 163 parameters.
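The count can also be confirmed programmatically; a sketch using a fresh Linear layer (since `model` was wrapped by paddle.Model above):

linear = paddle.nn.Linear(162, 1)
print(sum(int(np.prod(p.shape)) for p in linear.parameters()))  # 162 weights + 1 bias = 163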
# Normalize
# x was already normalized in place by _normalize above, so it is reused directly here;
# re-normalizing would also overwrite mean_x / std_x (the raw-data statistics needed for the test set below).
x_tensor = paddle.to_tensor(x.astype('float32'))
y_tensor = paddle.to_tensor(y.astype('float32'))
print("data shape:", x_tensor.shape)
print("label shape:", y_tensor.shape)
data shape: [5652, 162]
label shape: [5652, 1]
# Build the model
model = paddle.nn.Linear(in_features=162, out_features=1)
smoothL1loss = paddle.nn.SmoothL1Loss()
optimizer = paddle.optimizer.Adam(learning_rate=2e-2, parameters=model.parameters())
for step in range(20000):
    y_predict = model(x_tensor)
    loss = smoothL1loss(y_predict, y_tensor)
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    if step % 2000 == 0:
        # print("lr:", optimizer.get_lr())
        print(f"step {step} loss {loss.numpy()}")
# print("\n param:", model.parameters())
print("\n final loss:", loss.numpy())
step 0 loss [20.950909]
step 2000 loss [11.723677]
step 4000 loss [11.721383]
step 6000 loss [11.721346]
step 8000 loss [11.721402]
step 10000 loss [11.721395]
step 12000 loss [11.721331]
step 14000 loss [11.721308]
step 16000 loss [11.7213]
step 18000 loss [11.721315]
final loss: [11.721434]
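Note that with the default delta = 1.0, SmoothL1Loss computes

$$\ell(z)=\begin{cases}0.5\,z^{2} & |z|<1\\ |z|-0.5 & \text{otherwise}\end{cases},\qquad z=\hat{y}-y$$

so the values above are on a different scale from the RMSE reported in the NumPy section, and the two losses are not directly comparable.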
df_test = pd.read_csv('work/hw1_data/test.csv', encoding='big5')  # without header=None, the first data row is consumed as the header, hence 4319 rows below
df_test.shape
(4319, 11)
df_test.head(10)
| | id_0 | AMB_TEMP | 21 | 21.1 | 20 | 20.1 | 19 | 19.1 | 19.2 | 18 | 17 |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | id_0 | CH4 | 1.7 | 1.7 | 1.7 | 1.7 | 1.7 | 1.7 | 1.7 | 1.7 | 1.8 |
1 | id_0 | CO | 0.39 | 0.36 | 0.36 | 0.4 | 0.53 | 0.55 | 0.34 | 0.31 | 0.23 |
2 | id_0 | NMHC | 0.16 | 0.24 | 0.22 | 0.27 | 0.27 | 0.26 | 0.27 | 0.29 | 0.1 |
3 | id_0 | NO | 1.3 | 1.3 | 1.3 | 1.3 | 1.4 | 1.6 | 1.2 | 1.1 | 0.9 |
4 | id_0 | NO2 | 17 | 14 | 13 | 14 | 18 | 21 | 8.9 | 9.4 | 5 |
5 | id_0 | NOx | 18 | 16 | 14 | 15 | 20 | 23 | 10 | 10 | 5.8 |
6 | id_0 | O3 | 32 | 31 | 31 | 26 | 16 | 12 | 27 | 20 | 26 |
7 | id_0 | PM10 | 62 | 50 | 44 | 39 | 38 | 32 | 48 | 36 | 25 |
8 | id_0 | PM2.5 | 33 | 39 | 39 | 25 | 18 | 18 | 17 | 9 | 4 |
9 | id_0 | RAINFALL | NR | NR | NR | NR | NR | NR | NR | NR | NR |
testdata = pd.read_csv('work/hw1_data/test.csv', header = None, encoding = 'big5')
test_data = testdata.iloc[:, 2:].copy()   # .copy() avoids the SettingWithCopyWarning
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy()
test_x = np.empty([240, 18 * 9], dtype = float)
for i in range(240):
    test_x[i, :] = test_data[18 * i: 18 * (i + 1), :].reshape(1, -1)
# normalize with the training-set statistics
for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_x[j] != 0:
            test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
test_x = paddle.to_tensor(test_x.astype('float32'))
test_x.shape
[240, 162]
predict = model(test_x)
# Option 1: write the submission with the csv module
import csv
with open('work/submit2.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'value']
    csv_writer.writerow(header)
    for i in range(240):
        row = ['id_' + str(i), float(predict[i][0].numpy())]
        csv_writer.writerow(row)
# Option 2: write the submission with pandas
predict_pd = pd.DataFrame(predict.numpy())
predict_pd.columns = ["value"]
predict_pd.index = ['id_' + str(i) for i in range(240)]
predict_pd.to_csv('work/submit3.csv', index=True, header=True)
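As an optional format check, the written submission can be read back:

check = pd.read_csv('work/submit3.csv', index_col=0)
print(check.shape)  # expect (240, 1)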
References:
- Adagrad: https://youtu.be/yKKNr-QKz2Q?list=PLJV_el3uVTsPy9oCRY30oBPNLCo89yu49&t=705
- RMSprop: https://www.youtube.com/watch?v=5Yt-obwvMHI
- Adam: https://www.youtube.com/watch?v=JXQT_vxqwIs