hw1:使用 PyTorch 实现线性回归(linear regression)

import sys
import pandas as pd
import numpy as np
#from google.colab import drive-保存在云端之类的
#data = pd.read_csv('./train.csv', encoding = 'big5')
import torch
from torch.utils import data
from torch import nn

# Data preprocessing, part 1: data used to train the model.
# x and y (built further below) are the paired training inputs/targets.
# Keep only the columns from index 3 onward.
data_csv = pd.read_csv('./train.csv', encoding = 'big5').iloc[:, 3:]
data_csv[data_csv == 'NR'] = 0  # replace the 'NR' marker with 0
raw_data = data_csv.to_numpy()   # convert to a numpy array
# Layout of the training data: 12 months, 20 days per month, 24 hours per
# day, 18 features per day block (18 consecutive rows per day).
# Collect the data month by month into a dict keyed 0..11, where each value
# is an (18 features, 480 hours) matrix.
month_data = {}
rows_per_month = 18 * 20
for month in range(12):
    month_rows = raw_data[rows_per_month * month: rows_per_month * (month + 1), :]
    # (20 days, 18 features, 24 hours) -> (18 features, 20 days * 24 hours)
    sample = month_rows.astype(float).reshape(20, 18, 24).transpose(1, 0, 2).reshape(18, 480)
    month_data[month] = sample
    
# Split each month into sliding windows: 9 consecutive hours of all 18
# features form one input sample and the value at the 10th hour is the target.
# A month has 480 hours, so it yields 480 - 9 = 471 samples; over 12 months
# that is 12 * 471 samples with 18 * 9 features each.
x = np.empty([12 * 471, 18 * 9], dtype = float)
y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
    for start in range(471):
        # The row offset must use the same 471 samples/month that sizes x and y;
        # any other stride makes months overwrite each other and leaves the
        # tail of the np.empty buffers uninitialised.
        row = month * 471 + start
        # Flatten the (18, 9) window into a single row of 162 values.
        x[row, :] = month_data[month][:, start: start + 9].reshape(1, -1)
        # Target: feature row 9 (presumably PM2.5 -- confirm against the
        # dataset layout) at the hour right after the window.
        y[row, 0] = month_data[month][9, start + 9]

# Standardisation: per-column mean and standard deviation over all samples.
mean_x = np.mean(x, axis = 0)  # mean across rows, one value per feature column
std_x = np.std(x, axis = 0)
# Columns with zero standard deviation are left untouched to avoid dividing
# by zero; every other column becomes (value - mean) / std. A single masked
# NumPy expression replaces the element-by-element Python loop.
nonconstant = std_x != 0
x[:, nonconstant] = (x[:, nonconstant] - mean_x[nonconstant]) / std_x[nonconstant]

# The arrays must be converted to torch tensors before use; they are cast to
# float32 first.
x = torch.from_numpy(x.astype(np.float32)).reshape(-1, 162)
y = torch.from_numpy(y.astype(np.float32)).reshape(-1, 1)

import math
# The first 3768 rows are the training set, the remaining rows the validation set.
x_train_set, x_val = x[: 3768, :], x[3768: , :]
y_train_set, y_val = y[: 3768, :], y[3768: , :]

# Mini-batching: with batch size 12 the 3768 training rows give 314 batches
# and the remaining validation rows give 157 batches.
def load_array(data_arrays, batch_size, is_train=True):
    """Wrap tensors in a DataLoader that yields mini-batches.

    Args:
        data_arrays: sequence of tensors sharing the same first dimension;
            they are unpacked into a ``TensorDataset``.
        batch_size: number of samples per batch.
        is_train: when true, the loader shuffles the samples.

    Returns:
        A ``torch.utils.data.DataLoader`` over the given tensors.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(tensor_dataset, batch_size, shuffle=is_train)
    return loader

batch_size = 12
# Training batches are shuffled (load_array's default).
data_train = load_array((x_train_set, y_train_set), batch_size)
# Validation data is only ever evaluated, so keep its order fixed instead of
# reshuffling it on every pass.
data_val = load_array((x_val, y_val), batch_size, is_train=False)

# Model: a single linear layer mapping the 162 input features to 1 output,
# so only the number of weights has to be specified.
net = nn.Sequential(nn.Linear(162, 1))
# Initialisation: weights drawn from N(0, 0.01^2), bias set to zero.
nn.init.normal_(net[0].weight, mean=0, std=0.01)
nn.init.zeros_(net[0].bias)
# Loss function: mean squared error.
loss = nn.MSELoss()
# Optimiser: plain SGD over all model parameters.
trainer = torch.optim.SGD(params=net.parameters(), lr=0.005)

num_epochs = 30
for epoch in range(num_epochs):
    # One optimiser step per mini-batch of the training loader.
    for X, Y in data_train:
        batch_loss = loss(net(X), Y)
        trainer.zero_grad()   # clear gradients left over from the previous step
        batch_loss.backward()
        trainer.step()
    # Progress report: loss over the complete x / y tensors (training and
    # validation rows together). This is evaluation only, so skip autograd
    # bookkeeping instead of building a graph over the whole dataset.
    with torch.no_grad():
        l = loss(net(x), y)
    print(f' epoch {epoch + 1}, loss {l: f}')

# Print the learned parameters of the linear layer.
w, b = net[0].weight.data, net[0].bias.data
print('w:', w)
print('b:', b)

# Load and preprocess the test data: 240 samples in total, each stored as
# 18 consecutive rows (one per feature) of 9 hourly values.
testdata = pd.read_csv('./test.csv', header = None, encoding = 'big5')
# Work on an explicit copy so the masked assignment below does not write
# into a slice of testdata.
test_data = testdata.iloc[:, 2:].copy()
test_data[test_data == 'NR'] = 0  # replace the 'NR' marker with 0
test_data = test_data.to_numpy()
# Each block of 18 rows x 9 columns is flattened row-major into one sample
# of 162 features; only the first 240 blocks are used.
test_x = test_data[: 240 * 18, :].astype(float).reshape(240, 18 * 9)
# Apply the training-set statistics; columns whose training standard
# deviation is zero are left unchanged to avoid dividing by zero.
scaled = std_x != 0
test_x[:, scaled] = (test_x[:, scaled] - mean_x[scaled]) / std_x[scaled]


test_x = torch.from_numpy(test_x.astype(np.float32)).reshape(-1, 162)
# Inference only: no gradients are needed for the predictions.
with torch.no_grad():
    test_y = net(test_x)
test_y1 = test_y.detach().numpy()
# The only goal here is to obtain the predicted PM2.5 values for the test
# set; it has no targets, so no loss (i.e. no measure of test performance)
# can be computed.
import csv
# Write the 240 predictions to submit.csv as (id, value) rows, echoing every
# written row to stdout.
header = ['id', 'value']
with open('submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    print(header)
    csv_writer.writerow(header)
    for i in range(240):
        sample_id = 'id_' + str(i)
        row = [sample_id, test_y1[i][0]]
        csv_writer.writerow(row)
        print(row)

初次编写DL code,有错误欢迎指正。

这里记录学习《李宏毅2020机器学习深度学习》课程中作业部分的一些代码整理。此为作业1的尝试。

本文代码在助教提供的参考代码基础上修改而成;其中 PyTorch 框架下 linear regression 模型的实现,是跟着李沐大神的《动手学深度学习 V2》学习的。

你可能感兴趣的:(深度学习,python)