import sys
import pandas as pd
import numpy as np
#from google.colab import drive-保存在云端之类的
#data = pd.read_csv('./train.csv', encoding = 'big5')
import torch
from torch.utils import data
from torch import nn
# Data preprocessing, part 1: data used to train the model.
# x and y (built further below) form the paired training set.
data_csv = pd.read_csv('./train.csv', encoding='big5')
# Drop the first three columns and keep everything from column index 3 on.
# The explicit copy makes the masked assignment below write to an independent
# frame rather than to a slice of the frame returned by read_csv, which would
# otherwise trigger pandas' SettingWithCopyWarning.
data_csv = data_csv.iloc[:, 3:].copy()
data_csv[data_csv == 'NR'] = 0  # replace every 'NR' entry with 0
raw_data = data_csv.to_numpy()  # convert to a NumPy array
# Training data layout: 12 months, 20 days per month, 24 hours per day,
# 18 features measured per day block.
# Group the data by month into a dict keyed 0..11; each value is an
# 18 x 480 array (18 feature rows, 20 * 24 hourly columns).
month_data = {}
for month in range(12):
    hourly = np.empty([18, 480])
    for day in range(20):
        # Each day occupies 18 consecutive rows of raw_data and 24 columns
        # of the per-month array.
        row_start = 18 * (20 * month + day)
        col_start = 24 * day
        hourly[:, col_start:col_start + 24] = raw_data[row_start:row_start + 18, :]
    month_data[month] = hourly
# Split the data as the task requires: 9 consecutive hours are the input and
# the 10th hour is the target. A month has 480 hours, so it yields
# 480 - 9 = 471 samples; 12 * 471 samples in total, each with 18 * 9 values.
x = np.empty([12 * 471, 18 * 9], dtype=float)
y = np.empty([12 * 471, 1], dtype=float)
for month in range(12):
    # offset is the first hour of the 9-hour window within the month; it is
    # the same value as day * 24 + hour with the last 9 hours skipped.
    for offset in range(471):
        # Each month owns 471 consecutive rows. The stride must be 471 (not
        # 417), otherwise months overwrite each other and the tail of x / y
        # is never filled.
        row = month * 471 + offset
        # Flatten the 18 x 9 window into a single row of 162 values.
        x[row, :] = month_data[month][:, offset:offset + 9].reshape(1, -1)
        # Target: feature row 9 at the hour right after the window
        # (presumably PM2.5 -- confirm against the row order of train.csv).
        y[row, 0] = month_data[month][9, offset + 9]
# Standardisation: per-column mean and standard deviation over all samples.
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
# Normalise every column whose standard deviation is non-zero in one
# vectorised step; columns with zero deviation are left untouched, exactly as
# the element-wise loop did.
nonconstant = std_x != 0
x[:, nonconstant] = (x[:, nonconstant] - mean_x[nonconstant]) / std_x[nonconstant]
# The arrays are cast to float32 before being wrapped as torch tensors
# (original note: torch needs 32-bit floats here).
x = torch.from_numpy(x.astype(np.float32)).reshape(-1, 162)
y = torch.from_numpy(y.astype(np.float32)).reshape(-1, 1)
import math
# The first 3768 rows are the training split; the remaining rows are held out
# as the validation split.
x_train_set, x_val = x[:3768, :], x[3768:, :]
y_train_set, y_val = y[:3768, :], y[3768:, :]
# Mini-batching: batch size 12 -> 314 training batches, 157 validation batches.
def load_array(data_arrays, batch_size, is_train=True):
    """Wrap tensors in a DataLoader.

    Args:
        data_arrays: Sequence of tensors unpacked into a TensorDataset.
        batch_size: Number of samples per batch.
        is_train: Passed to the DataLoader as ``shuffle``.

    Returns:
        A ``torch.utils.data.DataLoader`` over the given tensors.
    """
    return data.DataLoader(
        data.TensorDataset(*data_arrays),
        batch_size=batch_size,
        shuffle=is_train,
    )
batch_size = 12
# Build the training and validation loaders with the same batch size.
# NOTE(review): is_train is left at its default (True) for both calls, so the
# validation loader shuffles as well.
data_train, data_val = (
    load_array(split, batch_size)
    for split in ((x_train_set, y_train_set), (x_val, y_val))
)
# Model: a single linear layer mapping the 162 inputs to one output.
net = nn.Sequential(nn.Linear(in_features=162, out_features=1))
# Initialisation: weights drawn from N(0, 0.01), bias set to zero.
with torch.no_grad():
    net[0].weight.normal_(0, 0.01)
    net[0].bias.zero_()
# Loss function: mean squared error.
loss = nn.MSELoss()
# Optimiser: plain stochastic gradient descent.
trainer = torch.optim.SGD(params=net.parameters(), lr=0.005)
# Train for a fixed number of epochs with mini-batch SGD.
num_epochs = 30
for epoch in range(num_epochs):
    for X, Y in data_train:
        batch_loss = loss(net(X), Y)
        trainer.zero_grad()  # clear gradients left over from the previous step
        batch_loss.backward()
        trainer.step()
    # Progress report only: no gradients are needed, so skip building the
    # autograd graph for this forward pass.
    # NOTE(review): the loss is measured on the full x / y tensors, i.e. the
    # training and validation rows together.
    with torch.no_grad():
        epoch_loss = loss(net(x), y)
    print(f' epoch {epoch + 1}, loss {epoch_loss: f}')
# Print the learned parameters of the linear layer.
w, b = net[0].weight.data, net[0].bias.data
print('w:', w)
print('b:', b)
# Load and preprocess the test data: 240 samples in total.
testdata = pd.read_csv('./test.csv', header=None, encoding='big5')
# Drop the first two columns. The explicit copy makes the masked assignment
# below write to an independent frame instead of a slice of testdata, which
# would otherwise trigger pandas' SettingWithCopyWarning.
test_data = testdata.iloc[:, 2:].copy()
test_data[test_data == 'NR'] = 0  # replace every 'NR' entry with 0
test_data = test_data.to_numpy()
test_x = np.empty([240, 18 * 9], dtype=float)
for i in range(240):
    # Each sample is a block of 18 consecutive rows flattened into one row.
    test_x[i, :] = test_data[18 * i:18 * (i + 1), :].reshape(1, -1)
# Apply the training-set statistics (mean_x / std_x) to the test features;
# columns whose training standard deviation is zero are left unchanged.
nonconstant = std_x != 0
test_x[:, nonconstant] = (test_x[:, nonconstant] - mean_x[nonconstant]) / std_x[nonconstant]
test_x = torch.from_numpy(test_x.astype(np.float32)).reshape(-1, 162)
# Inference only: no gradients are needed for the predictions.
with torch.no_grad():
    test_y = net(test_x)
test_y1 = test_y.detach().numpy()
# The only goal here is to obtain the predicted PM2.5 values for the test set;
# the test set has no targets, so no loss (i.e. test performance) can be
# computed.
import csv
with open('submit.csv', mode='w', newline='') as submit_file:
    writer = csv.writer(submit_file)
    columns = ['id', 'value']
    print(columns)
    writer.writerow(columns)
    # One row per test sample: its id and the single predicted value.
    for idx in range(240):
        record = [f'id_{idx}', test_y1[idx, 0]]
        writer.writerow(record)
        print(record)
# 初次编写DL code,有错误欢迎指正。
# 这里记录学习《李宏毅2020机器学习深度学习》课程中作业部分的一些代码整理。此为作业1的尝试。
# 在助教给的参考代码基础上进行修改。跟着李沐大神的《动手学深度学习V2》学了pytorch框架的linear regression模型的实现。