After positive/negative sampling at the data level, business-driven data filtering, and outlier handling, we move on to model training, where the hyperparameters need to be tuned to improve model accuracy. Below I summarize several existing hyperparameter-tuning frameworks.
First up is Bayesian optimization with the bayes_opt package:

pip install bayesian-optimization

The workflow has four steps:
1. Build an objective function LGB_bayesian.
2. Set the parameter search ranges bounds_LGB.
3. Initialize the Bayesian optimizer: lgb_opt = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=42).
4. Run the optimization: lgb_opt.maximize(init_points=5, n_iter=5, acq='ucb', xi=0.0, alpha=1e-6).
import lightgbm as lgb
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error

def LGB_bayesian(
    num_leaves,
    min_data_in_leaf,
    min_sum_hessian_in_leaf,
    feature_fraction,
    lambda_l1,
    lambda_l2,
    min_gain_to_split
):
    # bayes_opt proposes floats, so integer-valued parameters must be rounded back
    num_leaves = int(np.round(num_leaves))
    min_data_in_leaf = int(np.round(min_data_in_leaf))
    param = {
        'num_leaves': num_leaves,
        'max_bin': 128,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': 0.01,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'bagging_seed': 66,
        'feature_fraction': feature_fraction,
        'feature_fraction_seed': 66,
        # regularization
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'min_gain_to_split': min_gain_to_split,
        # constraint on leaf-wise greedy growth
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        # objective and metric
        'objective': 'regression',
        'metric': 'rmse',
        'n_jobs': 25,
        'boosting_type': 'gbdt',
        'verbose': 1,
        'early_stopping_rounds': 50,
        'n_estimators': 500
    }
    # train on log1p-transformed targets; train_df / val_df / used_features / target
    # come from the surrounding scope
    lgb_train = lgb.Dataset(train_df[used_features], label=np.log1p(train_df[target]))
    lgb_valid = lgb.Dataset(val_df[used_features], label=np.log1p(val_df[target]))
    lgb_estimator = lgb.train(param, lgb_train, valid_sets=[lgb_train, lgb_valid], verbose_eval=200)
    pred_ = lgb_estimator.predict(val_df[used_features], num_iteration=lgb_estimator.best_iteration)
    # invert log1p before scoring on the original scale
    loss = np.sqrt(mean_squared_error(val_df[target].values, np.round(np.expm1(pred_))))
    # BayesianOptimization maximizes, so return the negative RMSE
    return -loss
bounds_LGB = {
    'num_leaves': (10, 30),
    'min_data_in_leaf': (5, 30),
    'min_sum_hessian_in_leaf': (0, 5),
    'feature_fraction': (0.55, 1),
    'lambda_l1': (0, 3),
    'lambda_l2': (0, 3),
    'min_gain_to_split': (0, 1)
}
lgb_opt = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=42)
print(lgb_opt.space.keys)
print('=='*30)
lgb_opt.maximize(init_points=5, n_iter=5, acq='ucb', xi=0.0, alpha=1e-6)
# ---------------------------------------------------------
print(lgb_opt.max['target'])
rest_dict = lgb_opt.max['params']
# rebuild the final parameter dict, rounding the integer-valued parameters
lgb_param = {
    'num_leaves': int(np.round(rest_dict['num_leaves'])),
    'max_bin': 128,
    'min_data_in_leaf': int(np.round(rest_dict['min_data_in_leaf'])),
    'learning_rate': 0.01,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'bagging_seed': 66,
    'feature_fraction': rest_dict['feature_fraction'],
    'feature_fraction_seed': 66,
    # regularization
    'lambda_l1': rest_dict['lambda_l1'],
    'lambda_l2': rest_dict['lambda_l2'],
    'min_gain_to_split': rest_dict['min_gain_to_split'],
    # constraint on leaf-wise greedy growth
    'min_sum_hessian_in_leaf': rest_dict['min_sum_hessian_in_leaf'],
    # objective and metric
    'objective': 'regression',
    'metric': 'rmse',
    'n_jobs': 25,
    'boosting_type': 'gbdt',
    'verbose': 1,
    'early_stopping_rounds': 50,
    'n_estimators': 500
}
print(lgb_param)
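With lgb_param assembled, one would typically retrain a final model. A minimal sketch, reusing the train_df / val_df variables from above (final_model and final_pred are names introduced here for illustration):

# Minimal sketch: retrain a final model with the tuned parameters.
# train_df / val_df / used_features / target are the same variables used above.
lgb_train = lgb.Dataset(train_df[used_features], label=np.log1p(train_df[target]))
lgb_valid = lgb.Dataset(val_df[used_features], label=np.log1p(val_df[target]))
final_model = lgb.train(lgb_param, lgb_train, valid_sets=[lgb_train, lgb_valid], verbose_eval=200)
# predictions back on the original scale
final_pred = np.expm1(final_model.predict(val_df[used_features], num_iteration=final_model.best_iteration))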
Next, random search with scikit-learn's RandomizedSearchCV:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
import lightgbm as lgb
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    max_bin=100,
    n_estimators=500,
    learning_rate=0.01,
    bagging_fraction=0.95,
    bagging_freq=5,
    bagging_seed=66,
    feature_fraction_seed=66,
    boosting='gbdt',
    n_jobs=25,
    verbose=0,
    early_stopping_rounds=50
)
param_dict = {
    'num_leaves': sp_randint(5, 40),
    'min_data_in_leaf': sp_randint(5, 64),
    'min_sum_hessian_in_leaf': np.linspace(0, 10, 30),
    'feature_fraction': np.linspace(0.55, 1, 30),
    'lambda_l1': np.linspace(0, 10, 30),
    'lambda_l2': np.linspace(0, 10, 30),
    'min_gain_to_split': np.linspace(0., 1, 30)
}
random_search = RandomizedSearchCV(
    lgb_model,
    param_distributions=param_dict,
    n_iter=15,
    cv=3
)
reg_cv = random_search.fit(
    train_df[used_features], np.log1p(train_df[target]),
    eval_set=[(train_df[used_features], np.log1p(train_df[target])),
              (val_df[used_features], np.log1p(val_df[target]))],
    verbose=200
)
reg_cv.best_params_
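The best sampled combination can then be pushed back into the base estimator for a final fit. A minimal sketch using scikit-learn's standard set_params (final_model is an illustrative name; note that with the default refit=True, reg_cv.best_estimator_ already holds a refit copy):

# Minimal sketch: apply the best sampled parameters and refit
# (train_df / val_df / used_features / target as above).
final_model = lgb_model.set_params(**reg_cv.best_params_)
final_model.fit(
    train_df[used_features], np.log1p(train_df[target]),
    eval_set=[(val_df[used_features], np.log1p(val_df[target]))],
    verbose=200
)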
Optuna is an automatic hyperparameter optimization framework designed for machine learning. It exposes an imperative, define-by-run user API: code written with Optuna is highly modular, and users can construct the hyperparameter search space dynamically. See the Optuna GitHub repository for details.
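To make "define-by-run" concrete: the search space is declared inside the objective with ordinary Python control flow, so later suggestions can depend on earlier ones. A minimal standalone sketch (the toy objective and its parameters are illustrative, not part of the tuning code below; it uses the same suggest_* API vintage as that code):

import optuna

def objective(trial):
    # define-by-run: the space is built as the function executes,
    # so this branch only exists when 'booster' comes out as 'dart'
    booster = trial.suggest_categorical('booster', ['gbdt', 'dart'])
    lr = trial.suggest_loguniform('learning_rate', 1e-3, 0.3)
    penalty = 0.0
    if booster == 'dart':
        # conditional parameter: only sampled on the 'dart' branch
        penalty = trial.suggest_uniform('drop_rate', 0.05, 0.5)
    # toy score so the sketch runs standalone
    return (lr - 0.05) ** 2 + 0.01 * penalty

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
print(study.best_trial.params)

The LightGBM tuning code: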
import optuna
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error
from functools import partial

def lgb_optuna(trial, train_x, train_y, test_x, test_y):
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 5, 40),
        'max_bin': 100,
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 64),
        'learning_rate': 0.01,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'bagging_seed': 66,
        'feature_fraction': trial.suggest_loguniform('feature_fraction', 0.55, 0.99),
        'feature_fraction_seed': 66,
        # regularization
        'lambda_l1': trial.suggest_discrete_uniform('lambda_l1', 0.0, 10.0, 0.1),
        'lambda_l2': trial.suggest_discrete_uniform('lambda_l2', 0.0, 10.0, 0.1),
        'min_gain_to_split': trial.suggest_uniform('min_gain_to_split', 0.0, 1.0),
        # constraint on leaf-wise greedy growth
        'min_sum_hessian_in_leaf': trial.suggest_discrete_uniform('min_sum_hessian_in_leaf', 0.55, 20.0, 0.1),
        # objective and metric
        'objective': 'regression',
        'metric': 'rmse',
        'n_jobs': 25,
        'boosting': 'gbdt',
        'verbose': 1,
        'early_stopping_rounds': 50,
        'n_estimators': 500
    }
    model = lgb.LGBMRegressor(**param)
    # fit on log1p-transformed targets, consistent with the other frameworks
    model.fit(train_x, np.log1p(train_y),
              eval_set=[
                  (train_x, np.log1p(train_y)), (test_x, np.log1p(test_y))
              ],
              early_stopping_rounds=50,
              verbose=200)
    pred_ = model.predict(test_x)
    # score against the true targets on the original scale
    loss = np.sqrt(mean_squared_error(test_y, np.round(np.expm1(pred_))))
    return loss
study = optuna.create_study(direction='minimize')
lgb_op_partial = partial(
    lgb_optuna,
    train_x=train_df[used_features],
    train_y=train_df[target].values,
    test_x=val_df[used_features],
    test_y=val_df[target].values
)
study.optimize(lgb_op_partial, n_trials=15)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
Hyperopt is a Python library for "distributed asynchronous algorithm configuration / hyperparameter optimization". With it we can escape the tedium of manual hyperparameter search and obtain good hyperparameters automatically. Broadly speaking, a model with hyperparameters can be viewed as a non-convex function of those hyperparameters, so Hyperopt can fairly reliably reach settings at least as reasonable as hand tuning. Especially for models that are hard to tune, it typically reaches final performance well beyond manual tuning, and far faster.
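Its core entry point is fmin, which minimizes a user-supplied objective over a declared search space. A minimal standalone sketch on a toy quadratic:

from hyperopt import fmin, tpe, hp

# toy objective: minimize (x - 1)^2 over x in [-5, 5]
best = fmin(
    fn=lambda x: (x - 1.0) ** 2,
    space=hp.uniform('x', -5, 5),
    algo=tpe.suggest,
    max_evals=50
)
print(best)  # e.g. {'x': 0.99...}

Applied to LightGBM: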
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from functools import partial
def lgb_hyp_opt(hpy_space_dict, train_x, train_y, test_x, test_y):
    param = {
        'num_leaves': hpy_space_dict['num_leaves'],
        'max_bin': 100,
        'min_data_in_leaf': hpy_space_dict['min_data_in_leaf'],
        'learning_rate': 0.01,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'bagging_seed': 66,
        'feature_fraction': hpy_space_dict['feature_fraction'],
        'feature_fraction_seed': 66,
        # regularization
        'lambda_l1': hpy_space_dict['lambda_l1'],
        'lambda_l2': hpy_space_dict['lambda_l2'],
        'min_gain_to_split': hpy_space_dict['min_gain_to_split'],
        # constraint on leaf-wise greedy growth
        'min_sum_hessian_in_leaf': hpy_space_dict['min_sum_hessian_in_leaf'],
        # objective and metric
        'objective': 'regression',
        'metric': 'rmse',
        'n_jobs': 25,
        'boosting': 'gbdt',
        'verbose': 1,
        'early_stopping_rounds': 50,
        'n_estimators': 500
    }
    model = lgb.LGBMRegressor(**param)
    model.fit(train_x, np.log1p(train_y),
              eval_set=[
                  (train_x, np.log1p(train_y)), (test_x, np.log1p(test_y))
              ],
              early_stopping_rounds=50,
              verbose=200)
    pred_ = model.predict(test_x)
    loss = np.sqrt(mean_squared_error(test_y, np.round(np.expm1(pred_))))
    # hyperopt minimizes the returned 'loss' and expects a status flag
    return {"loss": loss, "status": STATUS_OK}
space = {
    'num_leaves': hp.randint(
        'num_leaves', 5, 40
    ),
    'min_data_in_leaf': hp.randint(
        'min_data_in_leaf', 5, 64
    ),
    'feature_fraction': hp.uniform(
        'feature_fraction', 0.55, 0.99
    ),
    # regularization
    'lambda_l1': hp.loguniform(
        'lambda_l1', -2.5, 2.5
    ),
    'lambda_l2': hp.loguniform(
        'lambda_l2', -2.5, 2.5
    ),
    # hp.loguniform takes log-scale bounds, so this samples in [1e-3, 1.0]
    'min_gain_to_split': hp.loguniform(
        'min_gain_to_split', np.log(1e-3), np.log(1.0)
    ),
    # constraint on leaf-wise greedy growth
    'min_sum_hessian_in_leaf': hp.loguniform(
        'min_sum_hessian_in_leaf', -2.5, 3
    )
}
lgb_opt = partial(
    lgb_hyp_opt,
    train_x=train_df[used_features],
    train_y=train_df[target].values,
    test_x=val_df[used_features],
    test_y=val_df[target].values
)
algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(lgb_opt, space, algo=algo,
            max_evals=20,
            pass_expr_memo_ctrl=None)
print(best)
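Note that fmin returns the best raw sampled values keyed by label; hyperopt's space_eval maps them back through the space definition, which matters once the space contains nested or conditional expressions:

from hyperopt import space_eval

# recover the actual parameter values implied by the best raw sample
best_params = space_eval(space, best)
print(best_params)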
In general, optuna and bayes_opt are the recommended choices: compared with the other frameworks, they tend to yield higher model accuracy.

What the frameworks share: each decomposes into parameter-space construction, a search evaluator, and a search optimizer.

Parameter-space construction (this is where an understanding of how the algorithm converges can guide the ranges); each framework enumerates a distribution for every searched parameter:
- bayes_opt: bounds_LGB
- RandomizedSearchCV: param_dict
- Optuna: the trial.suggest_* calls inside the objective
- Hyperopt: space, built from hp expressions

Search-evaluator construction:
- bayes_opt: LGB_bayesian
- random search: RandomizedSearchCV wraps the estimator and CV scoring
- Optuna: lgb_optuna(trial, train_x, train_y, test_x, test_y)
- Hyperopt: lgb_hyp_opt(hpy_space_dict, train_x, train_y, test_x, test_y)

Search optimizer:
- bayes_opt: BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=42), driven by lgb_opt.maximize(init_points=5, n_iter=5, acq='ucb', xi=0.0, alpha=1e-6)
- random search: the fit method of RandomizedSearchCV
- Optuna: optuna.create_study(direction='minimize') plus study.optimize(lgb_op_partial, n_trials=15, n_jobs=2), where n_jobs enables parallel trials
- Hyperopt: best = fmin(lgb_opt, space, algo=algo, max_evals=20, pass_expr_memo_ctrl=None), with algo = partial(tpe.suggest, n_startup_jobs=1)
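Seen abstractly, all four frameworks run the same loop. A minimal sketch of the shared pattern (random proposals stand in here for each framework's smarter optimizer; every name is illustrative):

import random

# 1. parameter-space construction: one range per searched parameter
space = {'num_leaves': (10, 30), 'lambda_l1': (0.0, 3.0)}

# 2. search evaluator: train with a candidate and return a validation loss
def evaluate(params):
    # a toy loss so the sketch runs standalone; a real evaluator trains a model
    return (params['num_leaves'] - 20) ** 2 + params['lambda_l1']

# 3. search optimizer: propose -> evaluate -> keep the best
best, best_loss = None, float('inf')
for _ in range(15):
    cand = {k: random.uniform(lo, hi) for k, (lo, hi) in space.items()}
    loss = evaluate(cand)
    if loss < best_loss:
        best, best_loss = cand, loss
print(best, best_loss)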