import xgboost as xgb  # must be imported before xgb.DMatrix is used below
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
print(data['data'].shape, data['target'].shape)
'''
(569, 30) (569,)
'''
x_data = data.data
y_data = data.target
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=666)
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
params = {'objective': 'binary:logistic',
          'max_depth': 5,
          'min_child_weight': 0.8,
          'verbosity': 0,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
          'gamma': 0.1,
          'lambda': 0.8,
          'eta': 1}
num_round = 50
watch_list = [(dtrain, 'train'), (dtest, 'test')]
evals_result = {}
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watch_list,
    early_stopping_rounds=10,
    evals_result=evals_result
)
'''
[0] train-logloss:0.19172 test-logloss:0.30370
[1] train-logloss:0.09322 test-logloss:0.22359
[2] train-logloss:0.05241 test-logloss:0.19594
[3] train-logloss:0.03034 test-logloss:0.21439
[4] train-logloss:0.02134 test-logloss:0.20752
[5] train-logloss:0.01631 test-logloss:0.20046
[6] train-logloss:0.01476 test-logloss:0.21509
[7] train-logloss:0.01407 test-logloss:0.20678
[8] train-logloss:0.01231 test-logloss:0.20136
[9] train-logloss:0.01197 test-logloss:0.20550
[10] train-logloss:0.01053 test-logloss:0.19675
[11] train-logloss:0.00992 test-logloss:0.20746
'''
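# A quick sketch of inspecting the early-stopping result above. When
# early_stopping_rounds is given, xgb.train records the best round on the last
# eval set in the list (here 'test') on the returned booster:
print(xgb_model.best_iteration, xgb_model.best_score)
# evals_result keeps the full per-round history, keyed by the names in watch_list:
print(evals_result['test']['logloss'][:5])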
preds = xgb_model.predict(dtest)
labels = dtest.get_label()
labels, preds
'''
array([0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0.,
0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0.], dtype=float32)
array([2.3674986e-03, 9.9861741e-01, 9.9975663e-01, 2.1583840e-01,
1.1854486e-03, 6.3544436e-04, 1.2252343e-04, 9.9900925e-01,
6.5811968e-05, 9.9750012e-01, 9.9970168e-01, 9.9982613e-01,
1.4022045e-01, 9.8603803e-01, 9.6359092e-01, 9.9929202e-01,
9.9949872e-01, 9.9832839e-01, 9.9978691e-01, 9.9982613e-01,
6.3544436e-04, 6.2098140e-03, 9.9957401e-01, 9.9969757e-01,
9.9982423e-01, 9.9957675e-01, 9.9975401e-01, 1.4727229e-04,
9.9970168e-01, 1.3158246e-04, 1.0680066e-03, 9.9991202e-01,
9.9991202e-01, 1.0036851e-01, 1.4891458e-04, 6.8655396e-01,
9.9900925e-01, 6.5541302e-04, 8.1419773e-02, 9.9970526e-01,
6.5962277e-02, 9.9671751e-01, 1.5763033e-04, 9.9976224e-01,
1.8256794e-01, 1.4891458e-04, 9.9982613e-01, 9.9935108e-01,
6.6271797e-04, 9.9606222e-01, 9.9767417e-01, 6.6271797e-04,
6.6271797e-04, 9.9754131e-01, 9.9975401e-01, 7.4481402e-05,
9.0036017e-01, 3.5478315e-01, 9.9987698e-01, 1.1854486e-03,
9.9930811e-01, 3.8913928e-02, 9.9082899e-01, 7.0148520e-04,
9.9900925e-01, 9.9870503e-01, 9.9982613e-01, 9.6617037e-01,
1.5763033e-04, 1.8701397e-01, 9.9964190e-01, 9.9982613e-01,
2.0063692e-03, 9.9939549e-01, 8.6724478e-01, 9.9991202e-01,
9.9935108e-01, 9.7720438e-01, 2.3587022e-03, 1.5763033e-04,
5.4037949e-04, 4.3376945e-03, 7.8841185e-05, 9.9982423e-01,
3.1165761e-04, 9.6175414e-01, 9.9949872e-01, 1.4727229e-04,
9.9987698e-01, 9.9982613e-01, 9.9797589e-01, 9.9478018e-01,
9.4439185e-01, 1.3158246e-04, 1.5761836e-01, 9.9597138e-01,
9.6866369e-01, 9.9949872e-01, 7.1160225e-03, 9.9847311e-01,
9.9786395e-01, 5.4037949e-04, 1.1854486e-03, 9.9964190e-01,
1.8933583e-02, 9.9413067e-01, 7.8841185e-05, 1.3096399e-03,
9.9991202e-01, 9.9705601e-01, 8.6442098e-02, 9.3267983e-01,
9.9922729e-01, 1.6699207e-04], dtype=float32)
'''
error = sum([1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]]) / float(len(preds))
print(f'error={error: .2f}')
'''
error= 0.07
'''
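# The same check via scikit-learn, as a cross-check (a sketch; thresholds the
# predicted probabilities at 0.5, same as above):
from sklearn.metrics import accuracy_score
pred_labels = (preds > 0.5).astype(int)
print(f'accuracy={accuracy_score(labels, pred_labels):.2f}')  # equals 1 - error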
'''
Regression
'objective': 'reg:squarederror', # squared-error objective
np.sqrt(mean_squared_error(preds, labels)) # RMSE
'''
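# A minimal, self-contained regression sketch along the lines of the note above;
# make_regression is used purely for illustration and the settings are arbitrary.
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_reg, y_reg = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)
reg_bst = xgb.train(params={'objective': 'reg:squarederror', 'eta': 0.3},
                    dtrain=xgb.DMatrix(Xr_train, yr_train),
                    num_boost_round=50)
reg_preds = reg_bst.predict(xgb.DMatrix(Xr_test))
print(np.sqrt(mean_squared_error(yr_test, reg_preds)))  # RMSE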
import xgboost as xgb

# Custom wrapper class exposing a scikit-learn-compatible API
class MyXGBoost:
    def __init__(self, **params):
        self.params = params
        # pop num_boost_round so it is not forwarded to xgb.train's params dict,
        # which would trigger the 'Parameters: { "num_boost_round" } are not used' warning
        self.num_boost_round = self.params.pop('num_boost_round', 10)
        # regression by default; pass a different objective for classification
        self.params.setdefault('objective', 'reg:squarederror')
        self.params.update({'verbosity': 1, 'seed': 0})
        self.bst = None

    def fit(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, y_train)
        self.bst = xgb.train(params=self.params,
                             dtrain=dtrain,
                             num_boost_round=self.num_boost_round)

    def predict(self, x_pred):
        dpred = xgb.DMatrix(x_pred)
        return self.bst.predict(dpred)

    def kfold(self, x_train, y_train, nfold=5):
        dtrain = xgb.DMatrix(x_train, y_train)
        cv_round = xgb.cv(params=self.params,
                          dtrain=dtrain,
                          num_boost_round=self.num_boost_round,
                          nfold=nfold,
                          early_stopping_rounds=10)
        # last row: metrics at the final boosting round
        return cv_round.iloc[-1, :]

    # get_params/set_params make the class cloneable by GridSearchCV
    def get_params(self, deep=True):
        return {**self.params, 'num_boost_round': self.num_boost_round}

    def set_params(self, **params):
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        self.params.update(params)
        return self
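# Quick smoke test of the wrapper (a sketch; the hyperparameters are arbitrary).
# Note it regresses on the 0/1 targets here, since reg:squarederror is the default.
model = MyXGBoost(max_depth=5, num_boost_round=20)
model.fit(x_train, y_train)
print(model.predict(x_test)[:5])
print(model.kfold(x_train, y_train, nfold=5))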
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# make_scorer passes (y_true, y_pred) to score_func, in that order
def score_fn(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# RMSE: smaller is better, hence greater_is_better=False
cv_score_fn = make_scorer(score_func=score_fn, greater_is_better=False)
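# Because greater_is_better=False, scikit-learn negates the score internally, so
# grid.best_score_ below will be a negative RMSE (closer to 0 is better).
# A quick sanity check (sketch):
from sklearn.model_selection import cross_val_score
print(cross_val_score(MyXGBoost(num_boost_round=20), x_train, y_train,
                      scoring=cv_score_fn, cv=3))  # negative RMSE per fold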
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
params_grid = {
    'max_depth': [4, 5, 6, 7],
    'min_child_weight': [1, 5, 9],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1],
}
xgb_model = MyXGBoost(num_boost_round=20)
grid = GridSearchCV(estimator=xgb_model,
                    param_grid=params_grid,
                    scoring=cv_score_fn,
                    cv=3,
                    n_jobs=-1)
grid.fit(x_train, y_train)
print(grid.best_params_)
# XGBoost originally warned 'Parameters: { "num_boost_round" } are not used' here:
# num_boost_round was left inside self.params and forwarded to xgb.train's params
# dict, which XGBoost warns about. Popping it in __init__ (as above) avoids the warning.
'''
{'colsample_bytree': 1, 'max_depth': 5, 'min_child_weight': 9, 'subsample': 1}
'''
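# RandomizedSearchCV is imported above but never used; for reference, a sketch of
# the equivalent randomized search over the same grid (n_iter and random_state
# are illustrative choices):
rand = RandomizedSearchCV(estimator=MyXGBoost(num_boost_round=20),
                          param_distributions=params_grid,
                          n_iter=10,
                          scoring=cv_score_fn,
                          cv=3,
                          n_jobs=-1,
                          random_state=0)
rand.fit(x_train, y_train)
print(rand.best_params_)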
params = {
    'max_depth': 5,
    'min_child_weight': 9,
    'subsample': 1,
    'colsample_bytree': 1,
    'num_boost_round': 20,
}
xgb_model = MyXGBoost(**params)
params_grid = {
    'gamma': [0, 0.1, 0.5],
    'lambda': [1, 1.5],
    'eta': [0.3, 0.5, 1],
}
grid = GridSearchCV(estimator=xgb_model,
                    param_grid=params_grid,
                    scoring=cv_score_fn,
                    cv=3,
                    n_jobs=-1)
grid.fit(x_train, y_train)
print(grid.best_params_)
'''
{'eta': 0.3, 'gamma': 0, 'lambda': 1}
'''
import xgboost as xgb

params = {
    'colsample_bytree': 1,
    'max_depth': 7,
    'min_child_weight': 5,
    'subsample': 0.8,
}
etas = [0.3, 0.5, 1]
num_boost_rounds = [10, 15, 20]
nfold = 3
best_eta, best_round = 0, 0
best_score = float('inf')
dtrain = xgb.DMatrix(x_train, y_train)  # build once, outside the loop
for eta in etas:
    for num_boost_round in num_boost_rounds:
        params.update({'eta': eta})
        cv_rounds = xgb.cv(params=params,
                           dtrain=dtrain,
                           num_boost_round=num_boost_round,
                           nfold=nfold,
                           early_stopping_rounds=10)
        # xgb.cv returns one row per round with columns
        # train-rmse-mean, train-rmse-std, test-rmse-mean, test-rmse-std;
        # score on the last round's test-rmse-mean
        score = cv_rounds['test-rmse-mean'].iloc[-1]
        if score < best_score:
            best_score = score
            best_eta = eta
            best_round = num_boost_round
print(f'eta: {best_eta}, num_boost_round: {best_round}')
'''
eta: 0.5, num_boost_round: 15
'''
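# Final step (a sketch): retrain once on the full training split with the
# selected eta and round count, then evaluate on the held-out test set.
params.update({'eta': best_eta})
final_bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=best_round)
final_preds = final_bst.predict(xgb.DMatrix(x_test))
print(np.sqrt(mean_squared_error(y_test, final_preds)))  # test RMSE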