【xgboost】Tuning XGBoost with the Native API

  • 1. Data
  • 2. Set Parameters & Train
  • 3. Grid Search Tuning
    • 3.1 Wrapping in the sklearn API ⭐⭐⭐
    • 3.2 Custom Scoring Function
    • 3.3 Set the Search Grid & First Search
    • 3.4 Fix Some Parameters & Search Further
  • 4. Loop & Cross-Validation Tuning

1. Data

import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
print(data['data'].shape, data['target'].shape)
x_data = data.data
y_data = data.target
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=666)

# DMatrix is XGBoost's internal data structure for the native API
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
'''
(569, 30) (569,)
'''

2. Set Parameters & Train

import xgboost as xgb


params = {'objective': 'binary:logistic',  # binary classification, predicts probabilities
          'max_depth': 5,                  # max tree depth
          'min_child_weight': 0.8,         # min sum of instance weight in a child
          'verbosity': 0,                  # silent
          'subsample': 0.8,                # row sampling ratio per tree
          'colsample_bytree': 0.8,         # column sampling ratio per tree
          'gamma': 0.1,                    # min loss reduction required to split
          'lambda': 0.8,                   # L2 regularization on weights
          'eta': 1}                        # learning rate (shrinkage)

num_round = 50
watch_list = [(dtrain, 'train'), (dtest, 'test')]  # evaluated and printed every round
evals_result = {}  # xgb.train fills this with the metric history

xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watch_list,
    early_stopping_rounds=10,
    evals_result=evals_result
)
'''
[0]	train-logloss:0.19172	test-logloss:0.30370
[1]	train-logloss:0.09322	test-logloss:0.22359
[2]	train-logloss:0.05241	test-logloss:0.19594
[3]	train-logloss:0.03034	test-logloss:0.21439
[4]	train-logloss:0.02134	test-logloss:0.20752
[5]	train-logloss:0.01631	test-logloss:0.20046
[6]	train-logloss:0.01476	test-logloss:0.21509
[7]	train-logloss:0.01407	test-logloss:0.20678
[8]	train-logloss:0.01231	test-logloss:0.20136
[9]	train-logloss:0.01197	test-logloss:0.20550
[10]	train-logloss:0.01053	test-logloss:0.19675
[11]	train-logloss:0.00992	test-logloss:0.20746
'''
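With early_stopping_rounds=10, training stops once test-logloss has not improved for 10 consecutive rounds, and the best round is recorded on the booster. The evals_result dict filled in above holds the full metric history; a minimal sketch for inspecting both:

# best round according to the last entry of the watch list (the test set)
print(xgb_model.best_iteration, xgb_model.best_score)

# full per-round logloss curves collected via evals_result
train_hist = evals_result['train']['logloss']
test_hist = evals_result['test']['logloss']
print(len(test_hist), min(test_hist))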
preds = xgb_model.predict(dtest)
labels = dtest.get_label()
labels, preds
'''
array([0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0.,
       0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
       0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
       0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0.], dtype=float32)
array([2.3674986e-03, 9.9861741e-01, 9.9975663e-01, 2.1583840e-01,
       1.1854486e-03, 6.3544436e-04, 1.2252343e-04, 9.9900925e-01,
       6.5811968e-05, 9.9750012e-01, 9.9970168e-01, 9.9982613e-01,
       1.4022045e-01, 9.8603803e-01, 9.6359092e-01, 9.9929202e-01,
       9.9949872e-01, 9.9832839e-01, 9.9978691e-01, 9.9982613e-01,
       6.3544436e-04, 6.2098140e-03, 9.9957401e-01, 9.9969757e-01,
       9.9982423e-01, 9.9957675e-01, 9.9975401e-01, 1.4727229e-04,
       9.9970168e-01, 1.3158246e-04, 1.0680066e-03, 9.9991202e-01,
       9.9991202e-01, 1.0036851e-01, 1.4891458e-04, 6.8655396e-01,
       9.9900925e-01, 6.5541302e-04, 8.1419773e-02, 9.9970526e-01,
       6.5962277e-02, 9.9671751e-01, 1.5763033e-04, 9.9976224e-01,
       1.8256794e-01, 1.4891458e-04, 9.9982613e-01, 9.9935108e-01,
       6.6271797e-04, 9.9606222e-01, 9.9767417e-01, 6.6271797e-04,
       6.6271797e-04, 9.9754131e-01, 9.9975401e-01, 7.4481402e-05,
       9.0036017e-01, 3.5478315e-01, 9.9987698e-01, 1.1854486e-03,
       9.9930811e-01, 3.8913928e-02, 9.9082899e-01, 7.0148520e-04,
       9.9900925e-01, 9.9870503e-01, 9.9982613e-01, 9.6617037e-01,
       1.5763033e-04, 1.8701397e-01, 9.9964190e-01, 9.9982613e-01,
       2.0063692e-03, 9.9939549e-01, 8.6724478e-01, 9.9991202e-01,
       9.9935108e-01, 9.7720438e-01, 2.3587022e-03, 1.5763033e-04,
       5.4037949e-04, 4.3376945e-03, 7.8841185e-05, 9.9982423e-01,
       3.1165761e-04, 9.6175414e-01, 9.9949872e-01, 1.4727229e-04,
       9.9987698e-01, 9.9982613e-01, 9.9797589e-01, 9.9478018e-01,
       9.4439185e-01, 1.3158246e-04, 1.5761836e-01, 9.9597138e-01,
       9.6866369e-01, 9.9949872e-01, 7.1160225e-03, 9.9847311e-01,
       9.9786395e-01, 5.4037949e-04, 1.1854486e-03, 9.9964190e-01,
       1.8933583e-02, 9.9413067e-01, 7.8841185e-05, 1.3096399e-03,
       9.9991202e-01, 9.9705601e-01, 8.6442098e-02, 9.3267983e-01,
       9.9922729e-01, 1.6699207e-04], dtype=float32)

'''
error = sum([1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]]) / float(len(preds))

print(f'error={error: .2f}')
'''
error= 0.07
'''
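The same error rate can be obtained with sklearn by thresholding the predicted probabilities at 0.5:

from sklearn.metrics import accuracy_score

error = 1 - accuracy_score(labels, (preds > 0.5).astype(int))
print(f'error={error: .2f}')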
'''
Regression:
'objective': 'reg:squarederror',  # squared-error objective
np.sqrt(mean_squared_error(preds, labels))  # RMSE
'''
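A minimal regression sketch along the lines of the note above, using sklearn's diabetes dataset purely for illustration (the parameter values are placeholders, not tuned):

from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb

x_reg, y_reg = load_diabetes(return_X_y=True)
dreg = xgb.DMatrix(x_reg, y_reg)

reg_params = {'objective': 'reg:squarederror', 'eta': 0.3, 'max_depth': 5}
reg_model = xgb.train(params=reg_params, dtrain=dreg, num_boost_round=50)

reg_preds = reg_model.predict(dreg)
print(np.sqrt(mean_squared_error(y_reg, reg_preds)))  # training RMSE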

3. Grid Search Tuning

3.1 Wrapping in the sklearn API ⭐⭐⭐

import xgboost as xgb


# Custom wrapper class so the native API can plug into sklearn's search tools
class MyXGBoost:
    def __init__(self, **params):
        self.params = params
        # num_boost_round is an argument of xgb.train, not a booster
        # parameter; pop it out so it is not passed inside params
        self.num_boost_round = self.params.pop('num_boost_round', 10)

        # regression by default; change objective for classification
        self.params.update({'verbosity': 1,
                            'objective':'reg:squarederror',
                            'seed':0})
        self.bst = None
    
    def fit(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, y_train)
        self.bst = xgb.train(params=self.params
                             ,dtrain=dtrain
                             ,num_boost_round=self.num_boost_round
                             )
        return self
        
    def predict(self, x_pred):
        dpred = xgb.DMatrix(x_pred)
        return self.bst.predict(dpred)

    def kfold(self, x_train, y_train, nfold=5):
        dtrain = xgb.DMatrix(x_train, y_train)
        cv_round = xgb.cv(params=self.params
                          ,dtrain=dtrain
                          ,num_boost_round=self.num_boost_round
                          ,nfold=nfold
                          ,early_stopping_rounds=10
                          )
        
        return cv_round.iloc[-1, :]
    
    def get_params(self, deep=True):
        # expose num_boost_round too, so GridSearchCV clones keep it
        return {**self.params, 'num_boost_round': self.num_boost_round}
    
    def set_params(self, **params):
        self.num_boost_round = params.pop('num_boost_round', self.num_boost_round)
        self.params.update(params)
        return self
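A quick smoke test of the wrapper on the data from section 1 (a sketch; note the wrapper defaults to reg:squarederror, so on this 0/1 target the predictions are raw regression outputs, not probabilities):

model = MyXGBoost(num_boost_round=20, max_depth=5)
model.fit(x_train, y_train)
print(model.predict(x_test)[:5])      # raw regression outputs
print(model.kfold(x_train, y_train))  # last-round CV metrics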

3.2 Custom Scoring Function

from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
import numpy as np

# note: make_scorer calls score_func(y_true, y_pred); RMSE is symmetric
# in its arguments, so the naming order here is harmless
def score_fn(y_pred, y_true):
    return np.sqrt(mean_squared_error(y_pred, y_true))

# smaller RMSE = better fit; greater_is_better=False makes the scorer
# return the negated RMSE, since GridSearchCV always maximizes
cv_score_fn = make_scorer(score_func=score_fn
                          ,greater_is_better=False)
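A quick check of the scorer with cross_val_score (it works because MyXGBoost implements fit/predict/get_params/set_params):

from sklearn.model_selection import cross_val_score

scores = cross_val_score(MyXGBoost(num_boost_round=20), x_train, y_train,
                         scoring=cv_score_fn, cv=3)
print(scores)  # negative values; closer to 0 is better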

3.3 Set the Search Grid & First Search

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


params_grid = {
    'max_depth': [4, 5, 6, 7]
    ,'min_child_weight': [1, 5, 9]
    ,'subsample': [0.6, 0.8, 1]
    ,'colsample_bytree': [0.6, 0.8, 1]
}

xgb_model = MyXGBoost(num_boost_round=20)

grid = GridSearchCV(estimator=xgb_model
                    ,param_grid=params_grid
                    ,scoring=cv_score_fn
                    ,cv=3
                    ,n_jobs=-1)
grid.fit(x_train, y_train)

print(grid.best_params_)

The warning Parameters: { "num_boost_round" } are not used (visible in the log below) appears when num_boost_round is left inside self.params and passed to xgb.train: it is an argument of the training function, not a booster parameter, so XGBoost ignores it. Popping it out in __init__, as the class above does, avoids the warning.

'''
[17:04:14] WARNING: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\learner.cc:767: 
Parameters: { "num_boost_round" } are not used.

{'colsample_bytree': 1, 'max_depth': 5, 'min_child_weight': 9, 'subsample': 1}
'''
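Besides best_params_, the fitted grid also exposes the best score (negated RMSE here) and, since refit=True by default, an estimator already refit on all of x_train:

print(grid.best_score_)            # negated RMSE of the best combination
best_model = grid.best_estimator_  # refit on the full training set
print(best_model.predict(x_test)[:5])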

3.4 Fix Some Parameters & Search Further

params = {
    'max_depth': 5
    ,'min_child_weight': 9
    ,'subsample': 1
    ,'colsample_bytree':1
    ,'num_boost_round': 20
}

xgb_model = MyXGBoost(**params)

params_grid = {
    'gamma': [0, 0.1, 0.5]
    ,'lambda': [1, 1.5]
    ,'eta': [0.3, 0.5, 1]
}
grid = GridSearchCV(estimator=xgb_model
                    ,param_grid=params_grid
                    ,scoring=cv_score_fn
                    ,cv=3
                    ,n_jobs=-1)
grid.fit(x_train, y_train)

print(grid.best_params_)
'''
[17:04:15] WARNING: C:\Users\dev-admin\croot2\xgboost-split_1675461376218\work\src\learner.cc:767: 
Parameters: { "num_boost_round" } are not used.

{'eta': 0.3, 'gamma': 0, 'lambda': 1}
'''
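Combining the two search rounds, the final booster can be trained directly with the native API (a sketch; the parameter values come from the searches above):

final_params = {'objective': 'reg:squarederror', 'seed': 0,
                'max_depth': 5, 'min_child_weight': 9,
                'subsample': 1, 'colsample_bytree': 1,
                'gamma': 0, 'lambda': 1, 'eta': 0.3}
final_model = xgb.train(params=final_params,
                        dtrain=xgb.DMatrix(x_train, y_train),
                        num_boost_round=20)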

4. Loop & Cross-Validation Tuning

import xgboost as xgb

params = {
    'colsample_bytree': 1,
    'max_depth': 7,
    'min_child_weight': 5,
    'subsample': 0.8
}
etas = [0.3, 0.5, 1]
num_boost_rounds = [10, 15, 20]
nfold = 3

best_eta, best_round = 0, 0
best_score = float('inf')

for eta in etas:
    for num_boost_round in num_boost_rounds:
        params.update({'eta': eta})
        dtrain = xgb.DMatrix(x_train, y_train)

        cv_rounds = xgb.cv(params=params
                           ,dtrain=dtrain
                           ,num_boost_round=num_boost_round
                           ,nfold=nfold
                           ,early_stopping_rounds=10
                           )
        # xgb.cv returns per-round train-rmse-mean, train-rmse-std,
        # test-rmse-mean, test-rmse-std; take the last round's test-rmse-mean
        score = cv_rounds.iloc[-1, -2]

        if score < best_score:
            best_score = score
            best_eta = eta
            best_round = num_boost_round

print(f'eta: {best_eta}, num_boost_round: {best_round}')
'''
eta: 0.5, num_boost_round: 15
'''
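The nested loops can be flattened with itertools.product, and the DMatrix only needs to be built once; a sketch of the same search:

from itertools import product

dtrain = xgb.DMatrix(x_train, y_train)  # build once, outside the loop
best_eta, best_round, best_score = 0, 0, float('inf')

for eta, num_boost_round in product(etas, num_boost_rounds):
    params.update({'eta': eta})
    cv_rounds = xgb.cv(params=params, dtrain=dtrain,
                       num_boost_round=num_boost_round,
                       nfold=nfold, early_stopping_rounds=10)
    score = cv_rounds['test-rmse-mean'].iloc[-1]  # select column by name
    if score < best_score:
        best_score, best_eta, best_round = score, eta, num_boost_round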
