题记:今天记录一篇错误代码(当时未找到错误原因)。网上关于房价预测问题的教程套路都差不多,于是参照其中一篇完成了几乎一样的代码,但首先是 k 折交叉验证有问题,其次是预测结果有问题。
更新:重新手敲了一遍代码发现还是有问题,于是猜测肯定是自己的惯性思维导致的问题,自己可能很难发现了。于是找了一份网上的代码直接复制,发现能跑,又校对好几遍,最终发现原来问题在交叉验证上:`fold_size = x.shape[0]//k` 这一行里,我以为 `//k` 是注释,每次手写的时候都把它删掉了。可恶啊~
另外,数据迭代器也可以用 d2l 模块的 `load_array` 代替,方案可行,网上有教程;注意 d2l 中是小写字母 l,不是数字 1。
# -*- coding: utf-8 -*-
"""
@Time : 2023/4/4 14:02
@Auth : zyt
@File :02regression.py
@IDE :PyCharm
@Motto:ABC(Always Be Coding)
"""
'''
问题描述:波士顿房价预测问题
'''
import pandas as pd
from torch.utils import data
import torch.nn as nn
import torch
def load_array(data_arrays, batch_size, is_train=False):
    """Wrap tensors in a PyTorch DataLoader that yields minibatches.

    Args:
        data_arrays: tuple of tensors (e.g. features, labels) sharing dim 0.
        batch_size: number of samples per minibatch.
        is_train: when True, reshuffle the samples every epoch.

    Returns:
        A torch.utils.data.DataLoader over the zipped tensors.
    """
    dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(dataset, batch_size, shuffle=is_train)
    return loader
def get_net(in_features):
    """Build a single-layer linear regression network.

    Args:
        in_features: number of input features per sample.

    Returns:
        An nn.Sequential mapping (batch, in_features) -> (batch, 1).
    """
    return nn.Sequential(nn.Linear(in_features, 1))
def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels as a float.

    Predictions are clamped to [1, +inf) so that log() stays finite.
    Relies on the module-level `loss` (an nn.MSELoss instance).
    """
    raw_preds = net(features)
    safe_preds = torch.clamp(raw_preds, 1, float('inf'))
    log_mse = loss(torch.log(safe_preds), torch.log(labels))
    return torch.sqrt(log_mse).item()
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train `net` with Adam and record per-epoch log-RMSE curves.

    Relies on the module-level `loss` (nn.MSELoss). When `test_labels` is
    None the test curve is left empty.

    Returns:
        (train_ls, test_ls): lists of log-RMSE values, one per epoch.
    """
    train_ls, test_ls = [], []
    # BUG FIX: the original call omitted is_train, and this file's
    # load_array defaults to shuffle=False, so the training stream was
    # never reshuffled. Minibatch SGD/Adam expects shuffled epochs
    # (d2l's reference implementation passes is_train=True here).
    train_iter = load_array((train_features, train_labels), batch_size,
                            is_train=True)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)
    for epoch in range(num_epochs):
        for x, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(x), y)
            l.backward()
            optimizer.step()
        # Log the full-set metric once per epoch, after all updates.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
def get_k_fold_data(k, i, x, y):
    """Return the i-th cross-validation split of (x, y).

    The rows are cut into k contiguous folds of size floor(n/k); any
    remainder rows past k*fold_size are ignored. Fold i is the validation
    set; the other k-1 folds are concatenated into the training set.

    Returns:
        (x_train, y_train, x_valid, y_valid)
    """
    assert k > 1
    fold_size = x.shape[0] // k
    train_xs, train_ys = [], []
    x_valid = y_valid = None
    for j in range(k):
        sel = slice(j * fold_size, (j + 1) * fold_size)
        if j == i:
            x_valid, y_valid = x[sel, :], y[sel]
        else:
            train_xs.append(x[sel, :])
            train_ys.append(y[sel])
    x_train = torch.cat(train_xs, 0)
    y_train = torch.cat(train_ys, 0)
    return x_train, y_train, x_valid, y_valid
def k_fold(k, x_train, y_train, num_epochs, learning_rate, weight_dacay, batch_size):
    """Run k-fold cross-validation; return mean train/valid log-RMSE.

    Relies on the module-level `in_features`. Trains a fresh net per fold
    and averages the final-epoch metrics.

    NOTE(review): `weight_dacay` is a typo for `weight_decay`; kept as-is
    because parameter names are part of the public interface.
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        # FIX: the original bound this to the name `data`, shadowing the
        # `torch.utils.data` module imported at file level — confusing and
        # fragile if this body ever needs the module itself.
        fold_data = get_k_fold_data(k, i, x_train, y_train)
        net = get_net(in_features)
        train_ls, valid_ls = train(net, *fold_data, num_epochs,
                                   learning_rate, weight_dacay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        print(f'折{i+1},训练log rmse{float(train_ls[-1]):f},'
              f'验证log rmse{float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k
def train_and_pred(train_features, test_features, train_labels, data_test,
                   num_epochs, lr, weight_decay, batch_size):
    """Fit a fresh net on the full training set and write a submission CSV.

    Relies on the module-level `in_features`. Side effects: adds a
    'SalePrice' column to `data_test` and writes
    /kaggle/working/submission.csv.
    """
    net = get_net(in_features)
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    print(f'训练log rmse:{float(train_ls[-1]):f}')
    predictions = net(test_features).detach().numpy()
    # 导出数据: flatten the (n, 1) prediction column into a 1-D series.
    data_test['SalePrice'] = pd.Series(predictions.reshape(1, -1)[0])
    submission = pd.concat([data_test['Id'], data_test['SalePrice']], axis=1)
    submission.to_csv('/kaggle/working/submission.csv', index=False)
if __name__ == '__main__':
    # Load the Kaggle house-prices data.
    data_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
    data_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
    # --- Data preprocessing ---
    # Drop the Id column (and the training label) and stack train+test so
    # standardization / one-hot encoding see the same columns for both.
    all_features = pd.concat((data_train.iloc[:, 1:-1], data_test.iloc[:, 1:]), ignore_index=True)
    # Columns with non-object dtype, i.e. the numeric attributes.
    numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
    # Standardize each numeric column to zero mean / unit variance.
    all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
    # After standardization the mean is 0, so missing values can be set to 0.
    all_features[numeric_features] = all_features[numeric_features].fillna(0)
    # One-hot encode categorical columns; NaN gets its own indicator column.
    all_features = pd.get_dummies(all_features, dummy_na=True)
    # FIX: pandas >= 2.0 emits bool-dtype dummy columns, so the mixed
    # bool/float frame's .values is an object ndarray that torch.tensor()
    # rejects. The frame is fully numeric at this point, so casting it to
    # float32 fixes that and is a no-op on older pandas.
    all_features = all_features.astype('float32')
    # --- Convert to tensors ---
    n_train = data_train.shape[0]
    train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)  # training set
    test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)   # test set
    train_labels = torch.tensor(data_train.SalePrice.values.reshape(-1, 1), dtype=torch.float32)  # training labels
    # --- Training ---
    in_features = train_features.shape[1]
    loss = nn.MSELoss()  # MSE loss; read as a global by train() and log_rmse()
    k, num_epochs, lr, weight_dacay, batch_size = 5, 1000, 5, 0, 64
    train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_dacay, batch_size)
    print(f'{k}-折验证,平均训练log rmse{float(train_l):f},'f'平均验证log rmse{float(valid_l):f}')
    # --- Prediction / submission ---
    train_and_pred(train_features, test_features, train_labels, data_test, num_epochs, lr, weight_dacay, batch_size)