# Same approach as the previous XGBoost write-up, using the same dataset; the only change is that the model is swapped for LightGBM.
import pandas as pd
# Read the sales table and the search table; the first row of each file is
# used as the header.
train_sales = pd.read_csv('C:\\Train\\train_sales_data.csv', header=0)
train_search = pd.read_csv('C:\\Train\\train_search_data.csv', header=0)

# Inner-join on the four shared key columns, so only key combinations that
# appear in both tables survive.
data = pd.merge(
    train_sales,
    train_search,
    how='inner',
    on=("adcode", "model", "regYear", "regMonth"),
)

# 'province_x' / 'province_y' are presumably the two suffixed copies of a
# 'province' column that both tables carry; neither is kept as a feature.
data = data.drop(columns=['province_x', 'province_y'])
print(data)
import copy
# One-hot encode the categorical columns.  Each listed column is replaced by
# its indicator columns (first level dropped), which are appended to the
# right-hand side of the frame in the order the columns are listed here.
categoricals = ['model', 'adcode', 'bodyType']
for feature in categoricals:
    # The dtype is pinned so the indicators are numeric on every pandas
    # release: newer releases default to bool, and bool columns mixed with
    # integer columns make the later ``data.values`` an object array.
    df = pd.get_dummies(data[feature], drop_first=True, dtype='uint8')
    data = pd.concat([data, df], axis=1)
    data = data.drop(columns=feature)
print(data.head())
def to_supervised(data, rows_per_period=1320, n_periods=24, horizon=4,
                  target_col=2):
    """Build feature/target arrays for predicting ``horizon`` periods ahead.

    The frame is treated as ``n_periods`` consecutive chunks of
    ``rows_per_period`` rows each.  Feature row ``r`` is paired with the
    value of column ``target_col`` located ``horizon`` chunks later, i.e. at
    row ``r + horizon * rows_per_period``.  The defaults reproduce the
    original hard-coded layout (24 chunks of 1320 rows, 4 chunks ahead,
    positional column 2 as the target).

    Args:
        data: DataFrame whose rows are ordered chunk after chunk.
        rows_per_period: Number of rows in one chunk.
        n_periods: Number of chunks expected in ``data``.
        horizon: How many chunks ahead the target lies.
        target_col: Positional index of the target column.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the 2-D array of all columns for the
        first ``n_periods - horizon`` chunks and ``y`` is the 1-D array of
        the target column for the last ``n_periods - horizon`` chunks.

    Raises:
        ValueError: If the layout arguments are inconsistent or ``data`` has
            fewer than ``rows_per_period * n_periods`` rows (which would
            silently yield ``x`` and ``y`` of different lengths).
    """
    if rows_per_period < 0 or not 0 <= horizon <= n_periods:
        raise ValueError(
            "need rows_per_period >= 0 and 0 <= horizon <= n_periods"
        )
    if len(data) < rows_per_period * n_periods:
        raise ValueError(
            "data has {} rows, expected at least {}".format(
                len(data), rows_per_period * n_periods
            )
        )

    n_rows = rows_per_period * (n_periods - horizon)
    offset = rows_per_period * horizon
    x = data.iloc[:n_rows, :].values
    # Same number of rows as x, shifted forward by `horizon` chunks.
    y = data.iloc[offset:offset + n_rows, target_col].values
    return x, y
# Build the aligned feature/target arrays and report their shapes.
data_x, data_y = to_supervised(data)
print(data_x.shape)
print(data_y.shape)

# Positional hold-out split: the first 1320*16 rows are used for training and
# the rows from 1320*16 up to (but excluding) 1320*20 for testing.
train_x, train_y = (arr[:1320 * 16] for arr in (data_x, data_y))
test_x, test_y = (arr[1320 * 16:1320 * 20] for arr in (data_x, data_y))
from numpy import nan
from numpy import isnan
from pandas import read_csv
from pandas import to_numeric
from sklearn.metrics import r2_score
import lightgbm as lgb
# multivariate multi-step encoder-decoder lstm
from math import sqrt
from numpy import split
from numpy import array
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from numpy.random import seed
import numpy as np
import xgboost as xgb
import pandas as pd
#from sklearn.metrics import roc_auc_score
from sklearn.metrics import explained_variance_score
import matplotlib.pyplot as plt
from hyperopt import STATUS_OK,STATUS_RUNNING, fmin, hp, tpe,space_eval, partial
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
## Training parameters
# Used as the default value of ``optimize``'s ``random_state`` argument.
SEED = 314159265
# NOTE(review): not referenced anywhere in the visible part of this file.
VALID_SIZE = 0.25
def model_run(params):
    """Hyperopt objective: train LightGBM with ``params`` and score it.

    Trains on the module-level ``train_data`` using ``test_data`` as the
    validation set (stopping early after 30 rounds without improvement),
    predicts ``test_x`` and scores the predictions against ``test_y`` with
    ``get_score``.

    Args:
        params: LightGBM parameter dict sampled from the search space.

    Returns:
        dict: Result record for ``hyperopt.fmin``.  ``loss`` is the value to
        minimise and ``status`` is ``STATUS_OK``; ``stats_running`` is an
        extra entry kept unchanged from the original record.
    """
    print("Training with params: ")
    print(params)
    print("training...")
    model_lgb = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[test_data],
        # Early stopping goes through the callback API: the
        # ``early_stopping_rounds`` keyword was removed from ``lgb.train``
        # in LightGBM 4.0, while the callback works on older releases too.
        callbacks=[lgb.early_stopping(stopping_rounds=30)],
    )

    print("Validating...")
    check = model_lgb.predict(test_x)

    print("explained_variance_score...")
    # get_score is declared as (pre, real): predictions first, ground truth
    # second.  The order matters because only the first argument is rounded
    # and the error is normalised by the mean of the second.
    score = get_score(check, test_y)
    print("pr...")
    # The label says "mse", but the value printed is whatever get_score
    # returns.
    print('The mse of prediction is: {:.6f}'.format(score))

    return {
        'loss': score,
        'status': STATUS_OK,
        'stats_running': STATUS_RUNNING
    }
def optimize(random_state=SEED):
    """Search LightGBM hyper-parameters with hyperopt and return the best set.

    Runs ``fmin`` over ``model_run`` for 1000 evaluations using the TPE
    suggester.

    Args:
        random_state: Kept for interface compatibility.
            NOTE(review): the value is not forwarded to ``fmin``, so the
            search is not seeded by it; the type ``fmin`` expects for its
            ``rstate`` argument appears to differ between hyperopt releases,
            confirm before wiring it through.

    Returns:
        dict: The best parameter set, with hyperopt's internal encoding
        resolved back to concrete values by ``space_eval``.
    """
    # Search space, keyed by LightGBM's own parameter names:
    #  * 'metric' replaces the XGBoost-style 'eval_metric', which LightGBM
    #    does not recognise as a training parameter.
    #  * the XGBoost-style 'eta' entry is gone: in LightGBM it is an alias of
    #    'learning_rate', so it only conflicted with the entry below.
    #  * 'n_iter' is an alias of LightGBM's number-of-iterations parameter,
    #    so the sampled value takes the place of the ``num_boost_round``
    #    passed to ``lgb.train``.
    space = {
        "n_iter": hp.choice("n_iter", range(50, 200)),
        'metric': 'rmse',
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'learning_rate': hp.quniform("learning_rate", 0.05, 0.3, 0.02),
        'num_leaves': 6,
        'max_depth': 4,
        'min_child_weight': hp.quniform("min_child_weight", 2, 10, 1),
    }
    print("---------开始训练参数----------")
    print("------------partial-------------")
    # TPE configured with n_startup_jobs=1 (presumably a single random trial
    # before the model-based suggestions start; confirm in hyperopt's docs).
    algo = partial(tpe.suggest, n_startup_jobs=1)
    print("----------fmin---------------")
    best = fmin(model_run, space, algo=algo, max_evals=1000,
                pass_expr_memo_ctrl=None)
    print("-------------------------")
    # fmin returns the raw sampled labels (e.g. an index for hp.choice);
    # space_eval maps them back onto the space.
    best_params = space_eval(space, best)
    print("BEST PARAMETERS: " + str(best_params))
    return best_params
## Scoring function
def get_score(pre, real, n_groups=60, group_size=22, n_periods=4,
              period_stride=None):
    """Return the normalised RMSE averaged over groups.

    Both inputs are flat sequences laid out as ``n_periods`` consecutive
    chunks of ``period_stride`` values; inside each chunk, group ``i``
    occupies positions ``[group_size * i, group_size * (i + 1))``.  For every
    group, its values from all periods are gathered, the RMSE between the
    rounded predictions and the true values is divided by the mean of the
    true values, and the per-group ratios are averaged.

    The defaults reproduce the original hard-coded layout (60 groups of 22
    values in chunks of 1320, over 4 periods); presumably that is 60 car
    models x 22 regions x 4 months, confirm against the data ordering.

    Args:
        pre: Predictions; rounded to the nearest integer before scoring.
        real: Ground-truth values, aligned element-wise with ``pre``.
        n_groups: Number of groups to score.
        group_size: Number of values per group inside one period.
        n_periods: Number of periods covered by the inputs.
        period_stride: Offset between the starts of two consecutive periods;
            defaults to ``n_groups * group_size``.

    Returns:
        float: Mean over groups of ``RMSE(group) / mean(real of group)``.

    Raises:
        ValueError: If the layout arguments are not positive or the inputs
            are too short for the requested layout.
    """
    if period_stride is None:
        period_stride = n_groups * group_size
    if n_groups <= 0 or group_size <= 0 or n_periods <= 0:
        raise ValueError("n_groups, group_size and n_periods must be positive")

    pre = np.asarray(pre).round().astype(int)
    real = np.asarray(real)

    needed = period_stride * (n_periods - 1) + n_groups * group_size
    if len(pre) < needed or len(real) < needed:
        raise ValueError(
            "inputs need at least {} values, got {} and {}".format(
                needed, len(pre), len(real)
            )
        )

    ratios = []
    for i in range(n_groups):
        # Gather only THIS group's values across the periods.  The selection
        # is rebuilt for every group so that earlier groups do not leak into
        # the RMSE and the mean of later ones.
        spans = [
            slice(period_stride * j + group_size * i,
                  period_stride * j + group_size * (i + 1))
            for j in range(n_periods)
        ]
        pre_g = np.concatenate([pre[s] for s in spans]).astype(float)
        real_g = np.concatenate([real[s] for s in spans]).astype(float)
        rmse = np.sqrt(np.mean((pre_g - real_g) ** 2))
        ratios.append(rmse / np.mean(real_g))
    return sum(ratios) / n_groups
print("---------DMatrix----------")
# Wrap the arrays in LightGBM datasets; both are also read as module-level
# names by model_run during the hyper-parameter search.
train_data = lgb.Dataset(data=train_x, label=train_y)
test_data = lgb.Dataset(data=test_x, label=test_y)

print("---------开始优化参数----------")
best_params = optimize()
print("---------优化完成----------")
print(best_params)

## Train the final model
print("---------正式训练模型----------")
model_lgb = lgb.train(
    best_params,
    train_data,
    num_boost_round=300,
    valid_sets=[test_data],
    # The ``early_stopping_rounds`` keyword was removed from ``lgb.train``
    # in LightGBM 4.0; the callback form works on older releases too.
    callbacks=[lgb.early_stopping(stopping_rounds=30)],
)

print("---------正式预测模型----------")
print("Predict test set...")
# Rows 1320*20 .. 1320*24 are the ones to_supervised leaves out of the
# feature array; their predictions are what gets written to disk.
test_prediction = model_lgb.predict(data[1320*20:1320*24])
# Predictions on the held-out rows, used only for the score printed below.
test_prediction1 = model_lgb.predict(test_x)
print("---------预测完成----------")
print(best_params)
print(test_prediction.shape)

# Persist one rounded prediction per line.  The context manager guarantees
# the file is closed even if a write fails.
test_prediction = test_prediction.round().astype(int)
with open('C:\\car_re_lgb.txt', 'w') as f:
    f.writelines(str(value) + '\n' for value in test_prediction[:1320 * 4])
print("持久化完成")

test_prediction1 = test_prediction1.round().astype(int)
# get_score is declared as (pre, real): predictions first, ground truth
# second, because the error is normalised by the mean of the second.
score = get_score(test_prediction1, test_y)
print(1 - score)