DNN二手车价格预测完整代码

前言

最近在学习深度学习,就用 DNN 试着跑了一下天池二手车价格预测赛题。特征沿用之前集成模型所用的特征,通过不断调试学习率、隐藏层数量、神经元数量、优化器、激活函数、迭代次数、batch size 以及 KFold 折数,最终达到了与之前集成模型差不多的分数(评价指标为 MAE,越低越好),而训练时间比 CatBoost 和 LightGBM 短很多,毕竟只用很少的迭代次数就能达到差不多的效果。接着再与集成模型进行融合,将之前的成绩从 422 提高到 406,算是一次 DNN 的练习吧,毕竟模型的上限还是取决于特征工程。下面附上 DNN 完整代码,经过交叉验证取平均,线上可以达到 428 左右,需要的朋友自取。


import pandas as pd
import numpy as np
import Meancoder 
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
from torch.autograd import Variable
# Load the training and test tables; both files are space-delimited.
df, test = (pd.read_csv(path, sep=' ') for path in ('/train.csv', '/test.csv'))

def date_process(x):
    """Parse a ``YYYYMMDD``-style value into a ``datetime``.

    ``x`` is stringified and sliced positionally: characters 0-3 are the
    year, 4-5 the month and 6-7 the day.  A month below 1 (e.g. ``"00"``)
    is replaced by January so that ``datetime`` accepts it.

    Raises:
        ValueError: if a slice is not an integer or the resulting date is
            invalid (propagated from ``int`` / ``datetime``).
    """
    digits = str(x)
    year, month, day = int(digits[:4]), int(digits[4:6]), int(digits[6:8])
    # Clamp an out-of-range low month to 1 instead of failing.
    return datetime(year, max(month, 1), day)
def _prepare_frame(frame):
    """Add date-derived features to ``frame`` and clean its raw columns, in place.

    The same steps are applied to the training and the test table so the two
    cannot drift apart:

    * parse ``regDate`` / ``creatDate`` with ``date_process`` and expand each
      into ``_year`` / ``_month`` / ``_day`` columns;
    * derive ``car_age_day`` (creatDate - regDate, in days) and
      ``car_age_year`` (days / 365, rounded to one decimal);
    * map the ``'-'`` placeholder in ``notRepairedDamage`` to 0 and cast the
      column to float64;
    * cap ``power`` to [1, 600], ``v_13`` to at most 6 and ``v_14`` to at most 4;
    * fill missing ``fuelType`` / ``gearbox`` / ``bodyType`` / ``model`` with 0.

    Args:
        frame: DataFrame holding the raw columns listed above; it is modified
            in place.
    """
    for col in ('regDate', 'creatDate'):
        frame[col] = frame[col].apply(date_process)
        frame[col + '_year'] = frame[col].dt.year
        frame[col + '_month'] = frame[col].dt.month
        frame[col + '_day'] = frame[col].dt.day
    frame['car_age_day'] = (frame['creatDate'] - frame['regDate']).dt.days
    frame['car_age_year'] = (frame['car_age_day'] / 365).round(1)

    frame['notRepairedDamage'] = (
        frame['notRepairedDamage'].replace('-', 0.0).astype('float64')
    )

    # Whole-column assignment with clip() instead of chained indexing
    # (frame['power'][mask] = ...): the chained form emits
    # SettingWithCopyWarning and does nothing under pandas Copy-on-Write.
    frame['power'] = frame['power'].clip(lower=1, upper=600)
    frame['v_13'] = frame['v_13'].clip(upper=6)
    frame['v_14'] = frame['v_14'].clip(upper=4)

    for col in ('fuelType', 'gearbox', 'bodyType', 'model'):
        frame[col] = frame[col].fillna(0)


_prepare_frame(df)
_prepare_frame(test)

# Hand-crafted interaction features over the anonymised v_* columns.
# Identical columns are added, in the same order, to both df and test.
num_cols = [0, 2, 3, 6, 8, 10, 12, 14]
num_cols1 = [3, 5, 1, 11]

# Ordered pairs (a, b) with a listed before b.
full_pairs = [(a, b) for pos, a in enumerate(num_cols) for b in num_cols[pos + 1:]]
diff_pairs = [(a, b) for pos, a in enumerate(num_cols1) for b in num_cols1[pos + 1:]]

for frame in (df, test):
    # Product, sum and difference for every pair drawn from num_cols.
    for a, b in full_pairs:
        left, right = frame['v_{}'.format(a)], frame['v_{}'.format(b)]
        frame['new{}*{}'.format(a, b)] = left * right
        frame['new{}+{}'.format(a, b)] = left + right
        frame['new{}-{}'.format(a, b)] = left - right
    # Each of v_0 .. v_14 multiplied by the car age in years.
    for k in range(15):
        frame['new{}*year'.format(k)] = frame['v_{}'.format(k)] * frame['car_age_year']
    # Difference only for every pair drawn from num_cols1.
    for a, b in diff_pairs:
        frame['new{}-{}'.format(a, b)] = frame['v_{}'.format(a)] - frame['v_{}'.format(b)]

# Separate the target from the features and discard columns that are not
# fed to the model (the same set for train and test, plus 'price' for train).
unused_cols = ['SaleID', 'seller', 'offerType', 'name', 'creatDate', 'regionCode', 'regDate']
Y = df['price']
X = df.drop(columns=['price'] + unused_cols)
test = test.drop(columns=unused_cols)

# Columns handed to the mean encoder.
# Candidates left out (from the original trailing note): date_cols, 'v_6', 'v_10', 'v_14', 'v_2'.
class_list = ['model','brand','power','v_0','v_3','v_8','v_12']
MeanEnocodeFeature = class_list   
# MeanEncoder comes from the project-local Meancoder module; presumably it
# encodes each listed column by target statistics - confirm against that module.
ME = Meancoder.MeanEncoder(MeanEnocodeFeature,target_type='regression') 
# Fit on the training features/target only, then apply the fitted encoder to test.
X = ME.fit_transform(X,Y)   
test = ME.transform(test)

# Standardise train and test together, then split them back apart.
# The split point is taken from the training frame itself rather than a
# hard-coded 150000, so the script still works on a different row count.
# NOTE(review): the scaler is fitted on train + test combined, so test-set
# statistics influence the training features - confirm this is intended.
n_train = len(X)
df_concat = pd.concat([X, test], ignore_index=True)
df_concat = StandardScaler().fit_transform(df_concat)  # returns a NumPy array
X1 = df_concat[:n_train]
test1 = df_concat[n_train:]

# Model and training settings.
# The input width is read from the scaled feature matrix instead of being
# hard-coded, so adding or removing a feature cannot cause a shape mismatch
# in the first linear layer.
input_size = X1.shape[1]
hidden_size = 320
num_classes = 1      # single regression output (price)
batch_size = 2048
learning_rate = 0.05

# Tensors consumed by the training loop.  torch.autograd.Variable is a
# deprecated no-op wrapper, so plain tensors are used; the target is
# reshaped to (n, 1) to match the network output.
x = torch.tensor(X1, dtype=torch.float32)
y = torch.tensor(Y.to_numpy(), dtype=torch.float32).view(-1, 1)
test = torch.tensor(test1, dtype=torch.float32)

class Net(nn.Module):
    """Fully connected network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features.
        hidden_size: number of units in the hidden layer.
        num_classes: width of the output layer.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the output of the second linear layer for input batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Instantiate the network and show its layer layout.
net = Net(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes)
print(net)

# L1 loss (mean absolute error) optimised with Adam.
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate)

# K-fold cross-validated training.  Each fold trains its own network,
# scores it on the held-out part and predicts the test set; the test
# predictions of all folds are averaged at the end.
result = []
mean_score = 0
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)
for train_index, test_index in kf.split(x):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]

    # Fresh model and optimizer for every fold.  Re-using one network across
    # folds means each later fold is validated on rows the network has
    # already been trained on, which leaks the validation data and makes the
    # reported score over-optimistic.
    net = Net(input_size, hidden_size, num_classes)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    net.train()
    for i in range(2000):
        # Sequential mini-batches; slicing clamps the final, shorter batch.
        for start in range(0, len(x_train), batch_size):
            xx = x_train[start:start + batch_size]
            yy = y_train[start:start + batch_size]
            loss = criterion(net(xx), yy)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Inference only: no autograd graph is needed, and skipping it avoids
    # keeping one graph per fold alive inside `result`.
    net.eval()
    with torch.no_grad():
        y_pred = net(x_test)
        loss1 = criterion(y_pred, y_test)
        test_pred = net(test)
    mean_score += loss1.item() / n_folds
    print('验证集loss:{}'.format(loss1.item()))
    result.append(test_pred)

# Model evaluation.  The value printed here is the mean validation L1 loss
# (MAE) over the folds, despite the "Auc" wording in the message.
print('mean 验证集Auc:{}'.format(mean_score))
cat_pre = sum(result) / n_folds
cat_pre = cat_pre.detach().numpy()
ret = pd.DataFrame(cat_pre, columns=['price'])
ret.to_csv('/DNN.csv')

你可能感兴趣的:(python,数据挖掘,深度学习)