(六)XGBoost使用交叉验证

import numpy as np
import xgboost as xgb

### Load the training data and set up the booster parameters.
# NOTE(review): `basePath` is not defined in this snippet; presumably it is
# set earlier in the session -- confirm before running.
# NOTE(review): newer XGBoost releases replace 'silent' with 'verbosity';
# verify against the installed version.
dtrain = xgb.DMatrix(basePath + 'data/agaricus.txt.train')
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}
num_round = 2

# Banner announcing the first cross-validation run.
print('running cross validation')
running cross validation
# Run 5-fold cross-validation; each round is printed as
#   [iteration]  metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric across folds.
# metrics: evaluation metrics computed on the validation folds. By default
# rmse is used for regression and error for classification.
# NOTE(review): print_evaluation is the legacy callback factory; newer
# XGBoost releases expose EvaluationMonitor instead -- confirm against the
# installed version.
stdv_printer = xgb.callback.print_evaluation(show_stdv=True)
xgb.cv(
    param,
    dtrain,
    num_boost_round=num_round,
    nfold=5,
    metrics={'error'},
    seed=0,
    callbacks=[stdv_printer],
)
[0] train-error:0.0506682+0.009201 test-error:0.0557316+0.0158887
[1] train-error:0.0213034+0.00205561 test-error:0.0211884+0.00365323
   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.055732        0.015889          0.050668         0.009201
1         0.021188        0.003653          0.021303         0.002056
# Banner announcing the cross-validation run that hides the standard deviation.
print('running cross validation, disable standard deviation display')
running cross validation, disable standard deviation display
# Run 5-fold cross-validation; each round is printed as
#   [iteration]  metric_name:mean_value
# num_boost_round=10: the number of boosting iterations.
progress_printer = xgb.callback.print_evaluation(show_stdv=False)
# Early-stopping condition: training continues until test-error hasn't
# improved in 3 rounds.
early_stopper = xgb.callback.early_stop(3)
res = xgb.cv(
    param,
    dtrain,
    num_boost_round=10,
    nfold=5,
    metrics={'error'},
    seed=0,
    callbacks=[progress_printer, early_stopper],
)
[0] train-error:0.0506682 test-error:0.0557316
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.
Will train until test-error hasn't improved in 3 rounds.
[1] train-error:0.0213034 test-error:0.0211884
[2] train-error:0.0099418 test-error:0.0099786
[3] train-error:0.0141256 test-error:0.0144336
[4] train-error:0.0059878 test-error:0.0062948
[5] train-error:0.0020344 test-error:0.0016886
[6] train-error:0.0012284 test-error:0.001228
[7] train-error:0.0012284 test-error:0.001228
[8] train-error:0.0009212 test-error:0.001228
[9] train-error:0.0006142 test-error:0.001228
Stopping. Best iteration:
[6] train-error:0.0012284+0.000260265 test-error:0.001228+0.00104094
# Print the value returned by the xgb.cv call above.
print(res)
   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.055732        0.015889          0.050668         0.009201
1         0.021188        0.003653          0.021303         0.002056
2         0.009979        0.004828          0.009942         0.006076
3         0.014434        0.003517          0.014126         0.001706
4         0.006295        0.003123          0.005988         0.001878
5         0.001689        0.000574          0.002034         0.001470
6         0.001228        0.001041          0.001228         0.000260
# Banner announcing the cross-validation run that uses a preprocessing function.
print('running cross validation, with preprocessing function')
running cross validation, with preprocessing function
# Define the preprocessing function.
# It receives (dtrain, dtest, param) and returns the (possibly transformed)
# training data, test data, and parameters. It can be used for weight
# rescaling and similar adjustments; as an example, scale_pos_weight is set.
def fpreproc(dtrain, dtest, param):
    """Set ``scale_pos_weight`` from the label balance of ``dtrain``.

    The labels are read via ``dtrain.get_label()``; the number of labels
    equal to 0 is divided by the number equal to 1 and stored under
    ``param['scale_pos_weight']``. ``param`` is modified in place.

    Returns:
        The tuple ``(dtrain, dtest, param)``; ``dtrain`` and ``dtest`` are
        passed through untouched.
    """
    labels = dtrain.get_label()
    negatives = float(np.sum(labels == 0))
    positives = np.sum(labels == 1)
    # scale_pos_weight controls the balance of positive and negative weights,
    # which is useful for imbalanced classes. A typical value to consider:
    # sum(negative instances) / sum(positive instances).
    param['scale_pos_weight'] = negatives / positives
    return dtrain, dtest, param
# Cross-validate with per-fold preprocessing: for every fold, the fold's
# dtrain, dtest and param are handed to fpreproc, and whatever fpreproc
# returns is used to produce that fold's results.
# auc: area under the curve.
xgb.cv(
    param,
    dtrain,
    num_boost_round=num_round,
    nfold=5,
    metrics={'auc'},
    seed=0,
    fpreproc=fpreproc,
)
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.958232      0.005778        0.958228       0.001442
1       0.981431      0.002595        0.981414       0.000647
### Using a custom loss function
# You can also do cross validation with a customized loss function.
# See custom_objective.py
##
# NOTE: the message below is runtime output and is kept exactly as written.
print('running cross validation, with cutomsized loss function')
running cross validation, with cutomsized loss function
def logregobj(preds, dtrain):
    """Custom logistic-loss objective returning gradient and hessian.

    ``preds`` is passed through the sigmoid function to obtain
    probabilities; the labels come from ``dtrain.get_label()``.

    Returns:
        A ``(grad, hess)`` pair where ``grad`` is probability minus label
        and ``hess`` is probability times (1 - probability).
    """
    targets = dtrain.get_label()
    # Sigmoid: map the incoming scores to probabilities.
    prob = 1.0 / (1.0 + np.exp(-preds))
    gradient = prob - targets
    hessian = prob * (1.0 - prob)
    return gradient, hessian
def evalerror(preds, dtrain):
    """Custom evaluation metric: fraction of misclassified instances.

    A prediction counts as positive when ``preds > 0.0``. That threshold
    corresponds to a probability of 0.5 if ``preds`` are raw margin scores,
    which is how ``logregobj`` treats them. Labels are read from
    ``dtrain.get_label()``.

    Returns:
        A ``(name, value)`` pair: the metric name ``'error'`` and the error
        rate as a Python ``float``.

    Raises:
        ZeroDivisionError: if there are no labels.
    """
    labels = dtrain.get_label()
    predicted_positive = preds > 0.0
    # Count mismatches with a vectorized NumPy call; the builtin sum() would
    # walk the boolean array element by element in Python.
    n_wrong = np.count_nonzero(labels != predicted_positive)
    return 'error', float(n_wrong) / len(labels)
# Cross-validate with the customized objective and evaluation metric.
# The parameter dict carries no 'objective' entry here; the objective is
# supplied through obj= and the metric through feval=.
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
}
xgb.cv(
    param,
    dtrain,
    num_boost_round=num_round,
    nfold=5,
    seed=0,
    obj=logregobj,
    feval=evalerror,
)
   test-error-mean  test-error-std  test-rmse-mean  test-rmse-std  train-error-mean  train-error-std  train-rmse-mean  train-rmse-std
0         0.055732        0.015889        1.598043       0.012826          0.050668         0.009201         1.595072        0.003868
1         0.021188        0.003653        2.449282       0.080900          0.021303         0.002056         2.442600        0.076834
#rmse: root mean square error
#mae: mean absolute error

你可能感兴趣的:(XGBoost学习)