Kaggle in Practice (4): XGBoost Parameter Tuning

Using the data from Kaggle's 2015 Otto Group Product Classification Challenge, this post walks through tuning XGBoost hyperparameters.

Competition page: https://www.kaggle.com/c/otto-group-product-classification-challenge/data

# import modules and read the data
from xgboost import XGBClassifier
import xgboost as xgb

import pandas as pd 
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

dpath = 'F:/Python_demo/XGBoost/data/'
train = pd.read_csv(dpath +"Otto_train.csv")
train.head()

1. Check whether the class distribution is balanced

sns.countplot(train.target);
pyplot.xlabel('target');
pyplot.ylabel('Number of occurrences');

[Figure 1: count of samples per target class]

The classes are not evenly distributed, so cross-validation should sample each class proportionally.
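
A quick numeric check of the same thing (a minimal sketch, using the train DataFrame loaded above):

print(train['target'].value_counts())  # per-class sample counts, largest class first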

2. Extract the features and labels

y_train = train['target']
y_train = y_train.map(lambda s: s[6:])  # map applies the function element-wise; 'Class_1' -> '1' (the digit starts at s[6:])
y_train = y_train.map(lambda s: int(s)-1)  # shift the labels so they start at 0
train = train.drop(["id", "target"], axis=1)  # drop the id and target columns
X_train = np.array(train)  # feature matrix

3. Set up cross-validation. Because the classes are imbalanced, use StratifiedKFold so that each fold samples every class in proportion.

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

4. Set the default XGBoost parameters and start by tuning the number of trees (n_estimators)

def modelfit(alg, X_train, y_train, useTrainCV=True, cv_folds=None, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = 9  # number of classes
        
        xgtrain = xgb.DMatrix(X_train, label=y_train)  # DMatrix is xgboost's internal data container
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], folds=cv_folds,
             metrics='mlogloss', early_stopping_rounds=early_stopping_rounds, verbose_eval=10)
        # num_boost_round is the maximum number of boosting rounds; training stops early if the metric
        # has not improved for early_stopping_rounds rounds; verbose_eval prints the loss every 10 rounds
        n_estimators = cvresult.shape[0]  # best number of trees found by cv
        alg.set_params(n_estimators=n_estimators)  # write it back into the classifier
        
        print(cvresult)  # cv returns a DataFrame by default
        cvresult.to_csv('my_preds_4_1.csv', index_label='n_estimators')  # save the cv history
        
        # plot train/test mlogloss against the number of trees
        test_means = cvresult['test-mlogloss-mean']
        test_stds = cvresult['test-mlogloss-std']
        
        train_means = cvresult['train-mlogloss-mean']
        train_stds = cvresult['train-mlogloss-std']

        x_axis = range(0, n_estimators)
        pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
        pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
        pyplot.title("XGBoost n_estimators vs Log Loss")
        pyplot.xlabel('n_estimators')
        pyplot.ylabel('Log Loss')
        pyplot.legend()
        pyplot.savefig('n_estimators.png')
    
    # Fit the algorithm on the data
    alg.fit(X_train, y_train, eval_metric='mlogloss')
        
    # Predict on the training set
    train_predprob = alg.predict_proba(X_train)
    logloss = log_loss(y_train, train_predprob)

    # Print model report
    print("logloss of train:")
    print(logloss)


#params = {"objective": "multi:softprob", "eval_metric":"mlogloss", "num_class": 9}

xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,  # a large cap is fine; xgb.cv returns the appropriate n_estimators
        max_depth=5,
        min_child_weight=1,  # minimum sum of instance weight needed in a leaf
        gamma=0,  # minimum loss reduction required to make a split
        silent=0,  # 0 = print runtime messages
        subsample=0.3,  # fraction of samples used per tree
        colsample_bytree=0.8,  # fraction of features used per tree
        colsample_bylevel=0.7,  # fraction of features used per tree level
        objective= 'multi:softprob',  # multi-class classification
        seed=3)


modelfit(xgb1, X_train, y_train, cv_folds = kfold)

5. Tune the tree parameters: max_depth & min_child_weight

# use the best n_estimators found above (699); keep the other parameters unchanged
# search ranges: max_depth is usually tried in 3-10; a rule of thumb is min_child_weight = 1/sqrt(rare event rate) ≈ 5.5

max_depth = range(3,10,2)
min_child_weight = range(1,6,2)
param_test2_1 = dict(max_depth=max_depth, min_child_weight=min_child_weight)
{'max_depth': range(3, 10, 2), 'min_child_weight': range(1, 6, 2)}

Note: when evaluating model performance with cross-validation, the scoring parameter defines the metric. GridSearchCV always maximizes the score, so loss functions must be negated, e.g. neg_log_loss, neg_mean_squared_error. See the sklearn docs: http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss
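
As a tiny self-contained illustration of the sign convention (toy numbers, not the Otto data): log_loss returns a positive value, while the neg_log_loss scoring used by GridSearchCV reports its negative, so best_score_ must be negated to read it as a log loss.

import numpy as np
from sklearn.metrics import log_loss

y_true = [0, 1, 1]
y_prob = np.array([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7]])  # predicted class probabilities
ll = log_loss(y_true, y_prob)   # positive, lower is better
print(ll, -ll)                  # scoring='neg_log_loss' reports the second (negated) value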

# grid search over max_depth and min_child_weight
xgb2_1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=699,  # best n_estimators from the first tuning round
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.3,
        colsample_bytree=0.8,
        colsample_bylevel=0.7,
        objective= 'multi:softprob',
        seed=3)


gsearch2_1 = GridSearchCV(xgb2_1, param_grid=param_test2_1, scoring='neg_log_loss',
                          n_jobs=-1, cv=kfold, return_train_score=True)
# GridSearchCV(estimator, parameter grid, scoring metric, parallel jobs, cross-validation splitter);
# return_train_score=True is needed because the train scores are read from cv_results_ below
gsearch2_1.fit(X_train, y_train)

Inspect the results:

gsearch2_1.best_params_, gsearch2_1.best_score_, gsearch2_1.cv_results_
# best parameter combination, best score, and the per-candidate cross-validation results
# (grid_scores_ was removed in scikit-learn 0.20; cv_results_ replaces it)

Detailed parameter and attribute descriptions: https://blog.51cto.com/emily18/2088128?utm_source=oschina-app

Save the cross-validation results and plot them:

# summarize results
print("Best: %f using %s" % (gsearch2_1.best_score_, gsearch2_1.best_params_))
test_means = gsearch2_1.cv_results_[ 'mean_test_score' ]
test_stds = gsearch2_1.cv_results_[ 'std_test_score' ]
train_means = gsearch2_1.cv_results_[ 'mean_train_score' ]
train_stds = gsearch2_1.cv_results_[ 'std_train_score' ]

pd.DataFrame(gsearch2_1.cv_results_).to_csv('my_preds_maxdepth_min_child_weights_1.csv')

# plot results
test_scores = np.array(test_means).reshape(len(max_depth), len(min_child_weight))
train_scores = np.array(train_means).reshape(len(max_depth), len(min_child_weight))

for i, value in enumerate(max_depth):
    pyplot.plot(min_child_weight, -test_scores[i], label='test_max_depth:' + str(value))

pyplot.legend()
pyplot.xlabel('min_child_weight')  # x-axis is min_child_weight; one curve per max_depth value
pyplot.ylabel('Log Loss')
pyplot.savefig('max_depth_vs_min_child_weight_1.png')

6. After tuning max_depth & min_child_weight, re-tune the number of weak learners (n_estimators); a sketch is shown below.
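
No code accompanied this step in the original write-up. A minimal sketch of what it would look like, assuming the tuned values max_depth=6 and min_child_weight=4 that appear in xgb3_1 below (the name xgb2_2 is hypothetical), is to rebuild the classifier and let the xgb.cv call inside modelfit pick n_estimators again:

xgb2_2 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,   # generous cap; xgb.cv with early stopping picks the actual value
        max_depth=6,         # tuned value carried into xgb3_1 below
        min_child_weight=4,  # tuned value carried into xgb3_1 below
        gamma=0,
        subsample=0.3,
        colsample_bytree=0.8,
        colsample_bylevel=0.7,
        objective= 'multi:softprob',
        seed=3)

modelfit(xgb2_2, X_train, y_train, cv_folds=kfold)  # this cv run is where a value like n_estimators=645 (used below) would come from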

7. Tune subsample (the fraction of samples used per tree) and colsample_bytree (the fraction of features used per tree); the procedure mirrors the max_depth & min_child_weight search.

subsample = [i/10.0 for i in range(3,9)]
colsample_bytree = [i/10.0 for i in range(6,10)]
param_test3_1 = dict(subsample=subsample, colsample_bytree=colsample_bytree)
{'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
 'subsample': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}

Grid search:

xgb3_1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=645,  # best n_estimators from the second tuning round
        max_depth=6,
        min_child_weight=4,
        gamma=0,
        subsample=0.3,
        colsample_bytree=0.8,
        colsample_bylevel=0.7,
        objective= 'multi:softprob',
        seed=3)


gsearch3_1 = GridSearchCV(xgb3_1, param_grid=param_test3_1, scoring='neg_log_loss',
                          n_jobs=-1, cv=kfold, return_train_score=True)
gsearch3_1.fit(X_train, y_train)

gsearch3_1.best_params_, gsearch3_1.best_score_, gsearch3_1.cv_results_

Plot the tuning results:

print("Best: %f using %s" % (gsearch3_1.best_score_, gsearch3_1.best_params_))
test_means = gsearch3_1.cv_results_[ 'mean_test_score' ]
test_stds = gsearch3_1.cv_results_[ 'std_test_score' ]
train_means = gsearch3_1.cv_results_[ 'mean_train_score' ]
train_stds = gsearch3_1.cv_results_[ 'std_train_score' ]

pd.DataFrame(gsearch3_1.cv_results_).to_csv('my_preds_subsampleh_colsample_bytree_1.csv')

# plot results
test_scores = np.array(test_means).reshape(len(colsample_bytree), len(subsample))
train_scores = np.array(train_means).reshape(len(colsample_bytree), len(subsample))

for i, value in enumerate(colsample_bytree):
    pyplot.plot(subsample, -test_scores[i], label='test_colsample_bytree:' + str(value))

pyplot.legend()
pyplot.xlabel('subsample')
pyplot.ylabel('Log Loss')
pyplot.savefig('subsample_vs_colsample_bytree1.png')
Best: -1.813067 using {'subsample': 0.8, 'colsample_bytree': 0.9}

[Figure 2: test log loss vs subsample, one curve per colsample_bytree value]

8. Tune the regularization parameters: reg_alpha (the L1 penalty coefficient) and reg_lambda (the L2 penalty coefficient)

reg_alpha = [1.5, 2]      # default = 0; values 0.1, 1, 1.5, 2 were tried
reg_lambda = [0.5, 1, 2]  # default = 1; values 0.1, 0.5, 1, 2 were tried

param_test5_1 = dict(reg_alpha=reg_alpha, reg_lambda=reg_lambda)

xgb5_1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=645,  # best n_estimators from the second tuning round
        max_depth=6,
        min_child_weight=4,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.6,
        colsample_bylevel=0.7,
        objective= 'multi:softprob',
        seed=3)


gsearch5_1 = GridSearchCV(xgb5_1, param_grid=param_test5_1, scoring='neg_log_loss',
                          n_jobs=-1, cv=kfold, return_train_score=True)
gsearch5_1.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (gsearch5_1.best_score_, gsearch5_1.best_params_))
test_means = gsearch5_1.cv_results_[ 'mean_test_score' ]
test_stds = gsearch5_1.cv_results_[ 'std_test_score' ]
train_means = gsearch5_1.cv_results_[ 'mean_train_score' ]
train_stds = gsearch5_1.cv_results_[ 'std_train_score' ]

pd.DataFrame(gsearch5_1.cv_results_).to_csv('my_preds_reg_alpha_reg_lambda_1.csv')

# plot results
test_scores = np.array(test_means).reshape(len(reg_alpha), len(reg_lambda))
train_scores = np.array(train_means).reshape(len(reg_alpha), len(reg_lambda))

for i, value in enumerate(reg_alpha):
    pyplot.plot(reg_lambda, -test_scores[i], label='reg_alpha:' + str(value))

pyplot.legend()
pyplot.xlabel('reg_lambda')  # x-axis is reg_lambda; one curve per reg_alpha value
pyplot.ylabel('Log Loss')
pyplot.savefig('reg_alpha_vs_reg_lambda1.png')

9. Finally, re-tune the number of weak learners once more, typically together with a lower learning rate; a sketch follows.
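
No code was given here either. One possible sketch (the name xgb_final is hypothetical, and the regularization values are placeholders that should be replaced with whatever gsearch5_1.best_params_ reports) is to fix all tuned parameters, lower the learning rate, and let xgb.cv choose the matching number of trees:

xgb_final = XGBClassifier(
        learning_rate=0.01,   # smaller step size usually needs (and can afford) more trees
        n_estimators=5000,    # generous cap; early stopping inside modelfit decides the real value
        max_depth=6,
        min_child_weight=4,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.6,
        colsample_bylevel=0.7,
        reg_alpha=1.5,        # placeholder; use gsearch5_1.best_params_['reg_alpha']
        reg_lambda=0.5,       # placeholder; use gsearch5_1.best_params_['reg_lambda']
        objective= 'multi:softprob',
        seed=3)

modelfit(xgb_final, X_train, y_train, cv_folds=kfold, early_stopping_rounds=50)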
