from hyperopt import fmin, tpe, hp, partial
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, zero_one_loss
import xgboost as xgb
import pandas as pd
def GetNewDataByPandas(path="../data/wine.csv"):
    """Load the wine-quality CSV and add two engineered features.

    Args:
        path: Location of the CSV file. It must contain the columns
            ``alcohol``, ``volatile acidity`` and ``quality``. Defaults to
            the path the script has always used.

    Returns:
        A tuple ``(X, y, columns)`` where ``X`` is the feature matrix (every
        column except ``quality``, including the two engineered ones), ``y``
        is the ``quality`` column, and ``columns`` holds the names of ALL
        columns of the frame -- note that this still includes ``quality``,
        so it is not index-aligned with the columns of ``X``.
    """
    wine = pd.read_csv(path)
    # Engineered features: squared alcohol and an alcohol/acidity interaction.
    wine['alcohol**2'] = pow(wine["alcohol"], 2)
    wine['volatileAcidity*alcohol'] = wine["alcohol"] * wine['volatile acidity']
    y = np.array(wine.quality)
    X = np.array(wine.drop("quality", axis=1))
    columns = np.array(wine.columns)
    return X, y, columns
# Read the wine-quality data (features, target, column names) from file.
X, y, wineNames = GetNewDataByPandas()
# Two-stage split: 10% of the data is held out as the final "predict" set,
# then the remaining 90% is split 80/20 into training and evaluation sets.
x_train_all, x_predict, y_train_all, y_predict = train_test_split(X, y, test_size=0.10, random_state=100)
x_train, x_test, y_train, y_test = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=100)
# -999.0 is the sentinel that marks missing values in both matrices.
dtrain = xgb.DMatrix(data=x_train, label=y_train, missing=-999.0)
dtest = xgb.DMatrix(data=x_test, label=y_test, missing=-999.0)
# xgb.train uses the LAST entry of the watchlist for early stopping, so the
# evaluation set must come last; with the training set last, early stopping
# would monitor train-rmse, which keeps improving while the model overfits.
evallist = [(dtrain, 'train'), (dtest, 'eval')]
# Custom hyperopt search space. hp.randint(label, n) samples an integer in
# [0, n); these raw samples are shifted/scaled into the real XGBoost ranges
# by argsDict_tranform before training.
space = {"max_depth": hp.randint("max_depth", 15),
         "n_estimators": hp.randint("n_estimators", 300),
         'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
         "subsample": hp.randint("subsample", 5),
         "min_child_weight": hp.randint("min_child_weight", 6),
         }
def argsDict_tranform(argsDict, isPrint=False):
    """Map raw hyperopt samples onto the actual XGBoost parameter ranges.

    The input mapping is NOT modified: the transformation is applied to a
    shallow copy, so calling this function twice on the same sample cannot
    apply the offsets twice.

    Args:
        argsDict: Mapping with the keys ``max_depth``, ``n_estimators``,
            ``learning_rate``, ``subsample`` and ``min_child_weight``
            holding the raw sampled values. Any other keys are carried over
            unchanged.
        isPrint: When true, print the transformed parameters.

    Returns:
        A new dict holding the transformed parameters.
    """
    transformed = dict(argsDict)
    transformed["max_depth"] = argsDict["max_depth"] + 5
    transformed['n_estimators'] = argsDict['n_estimators'] + 150
    transformed["learning_rate"] = argsDict["learning_rate"] * 0.02 + 0.05
    transformed["subsample"] = argsDict["subsample"] * 0.1 + 0.5
    transformed["min_child_weight"] = argsDict["min_child_weight"] + 1
    if isPrint:
        print(transformed)
    return transformed
def xgboost_factory(argsDict):
    """Objective function for hyperopt: train one booster and score it.

    The raw sample is first mapped to real parameter values with
    argsDict_tranform, a booster is trained on ``dtrain`` while reporting
    RMSE on the sets in ``evallist``, and the score computed by
    get_tranformer_score is returned for hyperopt to minimise.

    Args:
        argsDict: Raw hyperparameter sample as produced from ``space``.

    Returns:
        The score returned by get_tranformer_score for the trained booster.
    """
    argsDict = argsDict_tranform(argsDict)
    params = {'nthread': -1,  # number of threads
              'max_depth': argsDict['max_depth'],  # maximum tree depth
              'n_estimators': argsDict['n_estimators'],  # number of trees (used below as the boosting-round count)
              'eta': argsDict['learning_rate'],  # learning rate
              'subsample': argsDict['subsample'],  # row subsampling ratio
              'min_child_weight': argsDict['min_child_weight'],  # minimum sum of instance weight in a child
              'objective': 'reg:linear',
              'silent': 0,  # whether to print training messages
              'gamma': 0,  # minimum loss reduction required to split (pruning)
              'colsample_bytree': 0.7,  # column subsampling ratio per tree
              'alpha': 0,  # L1 regularisation
              'lambda': 0,  # L2 regularisation
              'scale_pos_weight': 0,  # values > 0 help convergence on imbalanced data
              'seed': 100,  # random seed
              'missing': -999,  # sentinel used for missing values
              }
    params['eval_metric'] = ['rmse']
    # Train for at most n_estimators rounds; stop early once the monitored
    # metric has not improved for 100 consecutive rounds.
    xrf = xgb.train(params, dtrain, params['n_estimators'], evallist, early_stopping_rounds=100)
    return get_tranformer_score(xrf)
def get_tranformer_score(tranformer):
    """Score a trained booster on the held-out prediction set.

    Args:
        tranformer: A trained booster exposing ``predict`` and
            ``best_ntree_limit`` (as returned by ``xgb.train``).

    Returns:
        The mean squared error between ``y_predict`` and the booster's
        predictions for ``x_predict``. This is the MSE, not the RMSE.
    """
    # Use the same missing-value sentinel as the training/evaluation matrices
    # so that -999.0 is interpreted identically at prediction time.
    dpredict = xgb.DMatrix(x_predict, missing=-999.0)
    # Predict with the best iteration found during training.
    prediction = tranformer.predict(dpredict, ntree_limit=tranformer.best_ntree_limit)
    return mean_squared_error(y_predict, prediction)
# Start the automatic hyperparameter search with hyperopt: TPE suggestions
# (n_startup_jobs=1), 20 evaluations of the xgboost_factory objective.
algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(
    fn=xgboost_factory,
    space=space,
    algo=algo,
    max_evals=20,
    pass_expr_memo_ctrl=None,
)
# Output (training log, abridged):
# [15:23:32] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=10
# [0] eval-rmse:5.03273 train-rmse:4.90203
# Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.
# Will train until train-rmse hasn't improved in 100 rounds.
# [15:23:32] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=10
# [1] eval-rmse:4.77384 train-rmse:4.64767
# [15:24:04] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned nodes, max_depth=15
# [299] eval-rmse:0.570382 train-rmse:0.000749
# Retrain with the best raw sample found by hyperopt. A copy is passed so
# that `best` itself keeps the raw values and is never transformed twice.
# NOTE: despite its name, RMSE holds the value returned by xgboost_factory;
# the square root is taken when printing below.
RMSE = xgboost_factory(dict(best))
print('best :', best)
print('best param after transform :')
# Show the parameter values actually used for training.
print(argsDict_tranform(dict(best)))
print('rmse of the best xgboost:', np.sqrt(RMSE))
# Output (training log, abridged):
# [15:24:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 428 extra nodes, 0 pruned nodes, max_depth=14
# [0] eval-rmse:5.02286 train-rmse:4.89385
# Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.
# Will train until train-rmse hasn't improved in 100 rounds.
# [15:24:52] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 680 extra nodes, 0 pruned nodes, max_depth=14
# [1] eval-rmse:4.75938 train-rmse:4.63251
# [298] eval-rmse:0.583923 train-rmse:0.000705
# [15:24:54] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=7
# [299] eval-rmse:0.583926 train-rmse:0.000704
# best : {'learning_rate': 0.05385158551863543, 'max_depth': 14, 'min_child_weight': 2, 'n_estimators': 173, 'subsample': 0.8}
# best param after transform :
# {'learning_rate': 0.051077031710372714, 'max_depth': 19, 'min_child_weight': 3, 'n_estimators': 323, 'subsample': 0.5800000000000001}
# rmse of the best xgboost: 0.5240080946197716