max_depth : maximum depth of each tree
num_leaves : maximum number of leaves per tree (as a rule of thumb, keep this below 2^max_depth; see the sanity check after this list)
objective : objective function
min_data_in_leaf : minimum number of samples in one leaf
learning_rate : learning rate
feature_fraction : fraction of features randomly sampled for each tree
bagging_fraction : fraction of the data randomly selected without resampling
bagging_freq : bagging frequency; a value of k means bagging is performed every k iterations
metric : evaluation metric(s)
is_unbalance : whether the labels are unbalanced
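Per the rule of thumb in the LightGBM documentation, num_leaves should stay below 2^max_depth; otherwise the depth limit has little effect and the model overfits more easily. A minimal sanity-check sketch for the values used in the code below (the standalone variables here are illustrative):

max_depth, num_leaves = 8, 128
# A depth-8 tree can hold at most 2**8 = 256 leaves, so 128 keeps
# leaf-wise growth within the depth limit.
assert num_leaves < 2 ** max_depth, "num_leaves should stay below 2**max_depth"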
import lightgbm as lgb

def print_feature_importance_lgb(gbm):
    # Print gain-based feature importance in descending order
    print(80 * '*')
    print(31 * '*' + 'Feature Importance' + 31 * '*')
    print(80 * '.')
    print("\n".join((".%50s => %9.5f" % x) for x in sorted(
        zip(gbm.feature_name(), gbm.feature_importance("gain")),
        key=lambda x: x[1],
        reverse=True)))
    print(80 * '.')
def fit_lgb(X_tr, y_tr, X_va, y_va, cates_cols):
    params = {
        'max_depth': 8,
        'num_leaves': 128,
        'objective': 'binary',
        'min_data_in_leaf': 20,
        'learning_rate': 0.01,
        'feature_fraction': 0.9,
        # 'subsample' is an alias of 'bagging_fraction', so only one of them is set
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'random_state': 2019,
        'metric': ['binary_logloss'],
        'num_threads': 16,
        'is_unbalance': True
    }
    MAX_ROUNDS = 10000
    dtr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cates_cols)
    dva = lgb.Dataset(X_va, label=y_va, categorical_feature=cates_cols, reference=dtr)
    cls = lgb.train(
        params,
        dtr,
        num_boost_round=MAX_ROUNDS,
        valid_sets=(dva, dtr),
        valid_names=['valid', 'train'],
        # Since LightGBM 4.0, early stopping and evaluation logging are passed as callbacks
        callbacks=[lgb.early_stopping(stopping_rounds=125),
                   lgb.log_evaluation(period=50)])
    print_feature_importance_lgb(cls)
    lgb.plot_importance(cls, importance_type='gain', figsize=(11, 12),
                        max_num_features=50, grid=False)
    return cls
gbm = fit_lgb(X_tr, y_tr, X_va, y_va, nominal_cate_cols)
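After training, the returned booster can be used for prediction. A minimal usage sketch, assuming X_va is the validation frame from above; with the binary objective, predict returns positive-class probabilities, and num_iteration=gbm.best_iteration evaluates the model at its early-stopped best round:

# Positive-class probabilities at the best early-stopped iteration
p_va = gbm.predict(X_va, num_iteration=gbm.best_iteration)
pred = (p_va > 0.5).astype(int)  # the 0.5 threshold is an illustrative choice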