【模型调参】lgb的参数调节

Table of Contents

  • 1  数据导入
  • 2  模型挑选
  • 3  模型调参
    • 3.1  设立初始参数
    • 3.2  调解n_estimators
    • 3.3  max_depth/num_leaves
    • 3.4  min_child_samples/min_child_weight
    • 3.5  subsample/colsample_bytree(0.6,1)
    • 3.6  reg_alpha/reg_lamb
    • 3.7  学习率
  • 4  测试集生成结果
  • 5  特征选择
import numpy as np              # 导入numpy库
import pandas as pd             # 导入pandas库
import matplotlib as mpl        # 导入matplotlib库
import matplotlib.pyplot as plt 
import seaborn as sns           # 导入seaborn库
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['KaiTi'] # 指定默认字体
plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import StandardScaler

数据导入

op = pd.read_csv('op_done.csv',index_col='user')
base = pd.read_csv('base_done.csv',index_col='user')
tr = pd.read_csv('tr_done.csv',index_col='user')
label = pd.read_csv('train_label.csv',index_col='user')
sumbit = pd.read_csv('submit_example.csv',index_col='user')
train = label.join(base).join(op).join(tr)
train.fillna(0,inplace=True)
train.head()
label sex age provider level verified using_time regist_type card_a_cnt card_b_cnt card_c_cnt agreement1 op1_cnt op2_cnt card_d_cnt agreement_total service1_cnt service1_amt service2_cnt agreement2 agreement3 agreement4 acc_count login_cnt_period1 login_cnt_period2 ip_cnt login_cnt_avg login_days_cnt province city balance balance_avg balance1 balance1_avg balance2 balance2_avg service3 service3_level product1_amount product2_amount product3_amount product4_amount product5_amount product6_amount product7_cnt product7_fail_cnt op_time is_all 116a2503b987ea81 b131ac74aa38a121 b2e7fa260df4998d type4 op_type0 op_type1 op_type2 op_type3 op_type4 mode0 mode1 mode2 mode3 mode4 channel0 channel1 channel2 channel3 channel4 ip_num platform_0 platform_1 platform_2 platform_3 platform_4 platform_5 tunnel_in_0 tunnel_in_1 tunnel_in_2 tunnel_in_3 tunnel_in_4 tunnel_out_0 tunnel_out_1 tunnel_out_2 tunnel_out_3 type1_0 type1_1 type1_2 type1_3 type1_4 type1_5 type1_6 type1_7 type1_8 type1_9 type1_10 type1_11 type1_12 type1_13 type1_14 type1_15 type1_16 type1_17 type1_18 type1_19 type2_0 type2_1 type2_2 type2_3 type2_4 type2_5 type2_6 type2_7 type2_8 type2_9 type2_10 type2_11 type2_12 type2_13 tr_time mean_amount ip_ture
user
Train_00000 0 0 24871 0 1 0 24712 1 24712 24712 24706 0 24731 24719 24706 24743 24706 24706 24706 1 0 0 24737 25041 24938 24719 24737 24749 1 0 14 14 1 1 16 5 0 4 1 1 1 0 0 1 24712 24706 102.0 0.705882 1.0 1.0 28.0 72.0 12.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 12.0 0.0 0.0 0.0 0.0 2.0 0.0 4.0 6.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 13.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 13.0 53330.307692 0.384615
Train_00001 1 0 24889 0 2 0 24716 1 24719 24719 24706 0 24712 24712 24706 24755 24706 24706 24706 1 0 0 24737 25443 24931 24731 24749 24737 2 1 3 8 6 6 1 5 0 4 2 3 1 0 0 6 24712 24706 18.0 0.333333 11.0 0.0 1.0 6.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 1.0 0.0 0.0 2.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 36098.000000 0.500000
Train_00002 0 0 24963 0 2 0 24736 7 24712 24712 24706 0 24712 24712 24706 24743 24706 24706 24706 1 0 0 24731 26584 26524 24774 24774 24859 2 0 8 8 1 1 9 5 0 4 1 1 1 0 0 1 24719 24719 8.0 0.125000 0.0 5.0 2.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2.0 0.0 1.0 5.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 12.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.0 57329.583333 0.500000
Train_00003 0 0 24840 2 2 0 24719 3 24712 24712 24706 0 24719 24706 24706 24737 24706 24706 24706 0 1 0 24712 25571 25529 24908 24737 24846 2 1 1 1 1 1 1 1 0 4 1 1 1 0 0 1 24712 24706 108.0 0.472222 0.0 4.0 46.0 58.0 24.0 0.0 14.0 0.0 0.0 0.0 0.0 0.0 24.0 0.0 0.0 0.0 0.0 6.0 0.0 19.0 2.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 0.0 11.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 3.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 11.0 61652.454545 0.727273
Train_00004 0 0 24871 2 1 0 24707 3 24712 24712 24706 0 24706 24706 24706 24725 24706 24706 24706 0 0 0 24712 25838 25838 24755 24816 24767 1 0 9 6 1 1 10 3 0 4 1 1 1 0 0 1 24706 24706 5.0 0.200000 0.0 4.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 36689.000000 0.000000
train_X = train.iloc[:,1:].values
train_y = train.iloc[:,0].values
stand = StandardScaler()
train_X = stand.fit_transform(train_X)
test = sumbit.join(base).join(op).join(tr)
test.fillna(0,inplace =True)
test_X = test.iloc[:,1:].values
test_X = stand.fit_transform(test_X)

模型挑选

lr = LogisticRegression(random_state=2018)  # 逻辑回归模型
svm = SVC(probability=True,random_state=2018)  # SVM模型
forest=RandomForestClassifier(n_estimators=100,random_state=2018) # 随机森林
Gbdt=GradientBoostingClassifier(random_state=2018) #CBDT
Xgbc=XGBClassifier(random_state=2018)  #Xgbc
gbm=lgb.LGBMClassifier(random_state=2018)  #lgb
model_name=["lr","svm","forest","Gbdt","Xgbc","gbm"]
def muti_score(model):
    auc = cross_val_score(model, train_X, train_y, scoring='roc_auc', cv=3)
    return auc.mean()
scores = []
for name in model_name:
    model = eval(name)
    socre = muti_score(model)
    scores.append((name,socre))
scores
[('lr', 0.6374291913334925),
 ('svm', 0.42584336157620334),
 ('forest', 0.6732019222635085),
 ('Gbdt', 0.6995580705824883),
 ('Xgbc', 0.6890128512134231),
 ('gbm', 0.7027585172289985)]

模型调参

经过对比 gbdt和gbm效果较好 
调参顺序 n_estimators -- max_depth/num_leaves -- min_child_samples/min_child_weight -- subsample/colsample_bytree --reg_alpha/reg_lambda -- 学习率

设立初始参数

params = {'boosting_type':'gbdt','objective': 'binary','subsample': 0.8,'colsample_bytree': 0.8}
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

调解n_estimators

param_1 = {'n_estimators':range(50,150,5)}
cv = GridSearchCV(gbm,param_grid=param_1,scoring='roc_auc',cv=5)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
plt.plot(result['param_n_estimators'],result['mean_test_score'])
0.718750789233066 {'n_estimators': 80}





[]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hqXl8ymd-1597326586451)(output_17_2.png)]

params.update(grid_result.best_params_)
gbm=lgb.LGBMClassifier(**params)

max_depth/num_leaves

param_2 = {'max_depth':range(5,9),'num_leaves ':range(20,50,2)}
cv = GridSearchCV(gbm,param_grid=param_2,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'max_depth': 8, 'num_leaves ': 20}
params.update({'max_depth': 8, 'num_leaves ': 20})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 80,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'num_leaves ': 20}

min_child_samples/min_child_weight

param_3 = {'min_child_samples':range(10,30,2),'min_child_weight':[i/1000 for i in range(0,20,2)]}
cv = GridSearchCV(gbm,param_grid=param_3,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'min_child_samples': 20, 'min_child_weight': 0.0}
params.update({'min_child_samples': 20, 'min_child_weight': 0.0})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_samples': 20,
 'min_child_weight': 0.0,
 'min_split_gain': 0.0,
 'n_estimators': 80,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'num_leaves ': 20}

subsample/colsample_bytree(0.6,1)

param_4 = {'subsample':[i/10 for i in range(6,10,1)],'colsample_bytree':[i/10 for i in range(6,10,1)]}
cv = GridSearchCV(gbm,param_grid=param_4,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.6}

reg_alpha/reg_lamb

param_5 = {'subsample':[i/10 for i in range(10)],'colsample_bytree':[i/10 for i in range(10)]}
cv = GridSearchCV(gbm,param_grid=param_5,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.1}

学习率

param_6 = {'learning_rate':[i/100 for i in range(20)]}
cv = GridSearchCV(gbm,param_grid=param_6,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
0.7191457708890046 {'learning_rate': 0.1}

测试集生成结果

gbm.fit(train_X, train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
               n_estimators=80, num_leaves =20, objective='binary',
               subsample=0.8)
gbm.feature_importances_
array([ 17,  82,  16,  13,   8,  86,  16,  12,  13,  15,   0,  25,  33,
        12,  25,   0,   6,   1,  11,  12,   6,  49,  87,  61,  56,  51,
        58,  36,  21,  20,  38,  20,  24,   7,  17,  14,   7,   2,  34,
         5,   0,   0,  39,  60, 121,  41,  67,  35,  30,  68,  66,  53,
        12,  49,   3,  12,   1,  18,   2,   9,   1,  24,  26,   8,  13,
         1,  28,  18,  24,   9,   0,   5,   1,   0,   0,   0,   1,  23,
        11,  19,   5,   0,   8,   5,  31,  11,   4,   6,   7,  92,  26,
         0,   0,  12,   0,   0,   0,   1,   0,  16,   0,   0,   0,  35,
         0,   5,   0,  10,   9,  16,   0,   0,   0,   3,   5,   0,  23,
       107,  49])
train.iloc[:,1:].columns
Index(['sex', 'age', 'provider', 'level', 'verified', 'using_time',
       'regist_type', 'card_a_cnt', 'card_b_cnt', 'card_c_cnt',
       ...
       'type2_7', 'type2_8', 'type2_9', 'type2_10', 'type2_11', 'type2_12',
       'type2_13', 'tr_time', 'mean_amount', 'ip_ture'],
      dtype='object', length=119)
feature_importance = pd.DataFrame({'feature':train.iloc[:,1:].columns,'importance':gbm.feature_importances_})
feature_importance.sort_values(by='importance',ascending=False).head(20)
feature importance
44 product7_fail_cnt 121
117 mean_amount 107
89 type1_7 92
22 login_cnt_period1 87
5 using_time 86
1 age 82
49 b2e7fa260df4998d 68
46 is_all 67
50 type4 66
23 login_cnt_period2 61
43 product7_cnt 60
26 login_days_cnt 58
24 ip_cnt 56
51 op_type0 53
25 login_cnt_avg 51
118 ip_ture 49
21 acc_count 49
53 op_type2 49
45 op_time 41
42 product6_amount 39
y_pre = gbm.predict(train_X)
y_pre = gbm.predict_proba(train_X)
roc_auc_score(train_y,y_pre[:,1])
0.7876456295250149
y = gbm.predict_proba(test_X)
y[:,1]
array([0.02967902, 0.44846496, 0.02377314, ..., 0.21914047, 0.28423991,
       0.16758796])
test['prob'] = y[:,1]
pd.DataFrame(test.iloc[:,0]).to_csv('result.csv')
auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=10)

array([0.73246514, 0.72179747, 0.72288483, 0.72767674, 0.72240485,
       0.72194103, 0.71986724, 0.71257605, 0.71155348, 0.7186749 ])
auc.mean()
0.7211841722940205

特征选择

list_feature = feature_importance.sort_values(by='importance',ascending=False)['feature'].to_list()
list_socre = []
for i in range(50,120,10):
    fearture = list_feature[:i]
    train_X = stand.fit_transform(train.loc[:,fearture].values)
    auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=5)
    list_socre.append((i,auc.mean()))
list_socre
[(50, 0.7164324796787287),
 (60, 0.7178106094882282),
 (70, 0.7200468611823796),
 (80, 0.7193456575143582),
 (90, 0.7190751868013574),
 (100, 0.7190497035344566),
 (110, 0.7182153617821309)]
test_X = stand.fit_transform(test.loc[:,list_feature[:70]].values)
train_X = stand.fit_transform(train.loc[:,list_feature[:70]].values)
gbm.fit(train_X,train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
               n_estimators=80, num_leaves =20, objective='binary',
               subsample=0.8)

你可能感兴趣的:(机器学习鸭)