Machine Learning in Practice: House Price Prediction (with Python Code)

Preface: today I am writing up a piece of buggy code (at the time I had not found the cause of the error). The online tutorials for the house price prediction problem all follow more or less the same recipe, so I worked from one of them and wrote nearly identical code, yet I first ran into a problem with k-fold cross-validation and then with the prediction results.

Update: I retyped the code by hand and it still failed, so I suspected the problem came from my own ingrained habits and that I would have a hard time spotting it myself. I copied a version from the web, found that it ran, compared it against mine several times, and finally discovered that the problem was in the cross-validation. Argh!

  1. Problem: why does the RMSE come out as nan?
    Answer: a careless mistake in the k-fold cross-validation code (fold_size = x.shape[0]//k). I took the //k for a comment and deleted it every time I typed the code by hand, so fold_size became the full dataset size, every fold slice after the first was empty, and evaluating the loss on an empty fold produced nan (see the short sketch after this list).
  2. Problem: to_csv('submission.csv')
    Answer: in fact, if you do not use the d2l module at all, this line raises no error whatsoever.
  3. Problem: I did not feel like downloading the d2l module.
    Answer: replace it with a self-written load_array function; this works, and there are tutorials for it online. Also note that it is a lowercase L, not the digit 1.
  4. Problem: Kaggle's online platform
    It is quite painful to use; inexplicable issues show up from time to time, and creating a fresh notebook often makes them go away.
  5. Problem: submission.csv
    RMSE score: 0.17294
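
As a minimal sketch of problem 1 (made-up shapes, only to show what dropping the //k does):

import torch

x = torch.randn(10, 3)            # pretend 10 samples with 3 features
k = 5
fold_size_wrong = x.shape[0]      # buggy version: the "comment" //k was deleted
fold_size_right = x.shape[0] // k

# With the buggy fold size, every fold after the first is an empty slice,
# so training or evaluating on it yields nan.
print(x[slice(1 * fold_size_wrong, 2 * fold_size_wrong)].shape)   # torch.Size([0, 3])
# With the correct fold size, each fold holds x.shape[0] // k samples.
print(x[slice(1 * fold_size_right, 2 * fold_size_right)].shape)   # torch.Size([2, 3])

The full script, as run in a Kaggle notebook, follows.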
# -*- coding: utf-8 -*-
"""
@Time : 2023/4/4 14:02
@Auth : zyt
@File :02regression.py
@IDE :PyCharm
@Motto:ABC(Always Be Coding)
"""

'''
Problem description: Kaggle house price prediction (the House Prices - Advanced Regression Techniques competition)
'''
import pandas as pd
from torch.utils import data
import torch.nn as nn
import torch

def load_array(data_arrays, batch_size, is_train=False):
    """Build a PyTorch data iterator; a drop-in replacement for d2l.load_array (problem 3 above).

    Note that d2l.load_array defaults to is_train=True, i.e. shuffling; here the default is False.
    """
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)
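# A quick usage sketch for the load_array above (made-up tensors), mirroring how
# d2l.load_array is normally called:
#     features, labels = torch.randn(64, 8), torch.randn(64, 1)
#     for xb, yb in load_array((features, labels), batch_size=16, is_train=True):
#         print(xb.shape, yb.shape)   # torch.Size([16, 8]) torch.Size([16, 1])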

def get_net(in_features):
    # Linear regression baseline: a single fully connected layer mapping the
    # feature vector to one predicted SalePrice
    net = nn.Sequential(nn.Linear(in_features, 1))
    return net

def log_rmse(net, features, labels):
    # Clamp predictions to [1, inf) so that taking the logarithm is safe
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))  # `loss` is the global nn.MSELoss()
    return rmse.item()
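# Written out, log_rmse above computes
#     sqrt( mean( (log(clamp(y_hat, min=1)) - log(y)) ** 2 ) )
# so errors are measured on log prices (relative errors), which matches the metric
# the Kaggle competition scores submissions with (see the 0.17294 score above).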

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels), batch_size)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    for epoch in range(num_epochs):
        for x, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(x), y)
            l.backward()
            optimizer.step()
        # Record log RMSE on the full training (and, if given, validation) set once per epoch
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

def get_k_fold_data(k, i, x, y):
    assert k > 1
    fold_size = x.shape[0] // k  # integer division: the //k here is NOT a comment (see problem 1)
    x_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        x_part, y_part = x[idx, :], y[idx]
        if j == i:
            # the i-th slice becomes the validation fold
            x_valid, y_valid = x_part, y_part
        elif x_train is None:
            x_train, y_train = x_part, y_part
        else:
            # all remaining slices are concatenated into the training fold
            x_train = torch.cat([x_train, x_part], 0)
            y_train = torch.cat([y_train, y_part], 0)

    return x_train, y_train, x_valid, y_valid
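# With the competition's 1460 training rows and k = 5, each fold above holds
# 1460 // 5 = 292 samples: 292 go to validation and the remaining 4 * 292 = 1168
# to training on every round.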

def k_fold(k, x_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        # fold_data avoids shadowing the torch.utils.data module imported above
        fold_data = get_k_fold_data(k, i, x_train, y_train)
        net = get_net(in_features)
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate, weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')

    return train_l_sum / k, valid_l_sum / k

def train_and_pred(train_features, test_features, train_labels, data_test,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net(in_features)
    # Retrain on the full training set (no held-out validation fold this time)
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    print(f'train log rmse: {float(train_ls[-1]):f}')
    preds = net(test_features).detach().numpy()
    # Export the predictions in the (Id, SalePrice) format Kaggle expects
    data_test['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([data_test['Id'], data_test['SalePrice']], axis=1)
    submission.to_csv('/kaggle/working/submission.csv', index=False)


if __name__ == '__main__':
    # Load the data
    data_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
    data_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
    '''
    Data preprocessing
    '''
    # Concatenate the features of the train and test sets (drop the Id column and the SalePrice label)
    all_features = pd.concat((data_train.iloc[:, 1:-1], data_test.iloc[:, 1:]), ignore_index=True)
    # Select the columns with numeric dtypes
    numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
    # Standardize to zero mean and unit variance
    all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
    # After standardization every numeric column has mean 0, so missing values can be set to 0
    all_features[numeric_features] = all_features[numeric_features].fillna(0)
    # One-hot encode the categorical columns (dummy_na=True adds an indicator column for missing values)
    all_features = pd.get_dummies(all_features, dummy_na=True)
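    # Note (environment-dependent assumption, not part of the original run): newer pandas
    # releases make get_dummies return bool columns, which torch.tensor below cannot convert
    # from the resulting mixed-dtype array; passing dtype=float to get_dummies, or casting
    # with all_features = all_features.astype(float) here, avoids that.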
    # Convert to tensors
    n_train = data_train.shape[0]
    train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)  # training features
    test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)  # test features
    train_labels = torch.tensor(data_train.SalePrice.values.reshape(-1, 1), dtype=torch.float32)  # training labels
    '''
    Training
    '''
    in_features = train_features.shape[1]
    loss = nn.MSELoss()  # mean squared error loss, used globally by train() and log_rmse()
    k, num_epochs, lr, weight_decay, batch_size = 5, 1000, 5, 0, 64
    train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
    print(f'{k}-fold validation: average train log rmse {float(train_l):f}, '
          f'average valid log rmse {float(valid_l):f}')
    '''
    Prediction
    '''
    train_and_pred(train_features, test_features, train_labels, data_test, num_epochs, lr, weight_decay, batch_size)
