from catboost.datasets import titanic
import numpy as np
train_df, test_df = titanic()
train_df.head()
# Count missing values per column
null_value_stats = train_df.isnull().sum()
null_value_stats[null_value_stats != 0]
# Fill missing values with a sentinel value
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)
# Separate features and label
X = train_df.drop('Survived', axis=1)
y = train_df.Survived
# Treat every non-float column as categorical and record its index
cat_fea_idx = np.where(X.dtypes != float)[0]  # np.float was removed in NumPy 1.24; the builtin float is equivalent here
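# Optional sanity check: list the columns that will be treated as categorical
print(X.columns[cat_fea_idx].tolist())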
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)
X_test = test_df
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
model = CatBoostClassifier(
    custom_loss=['Accuracy'],  # report Accuracy in addition to the default 'Logloss' objective
    random_seed=42,
    logging_level='Silent'
)
model.fit(
X_train, y_train,
cat_features=cat_fea_idx,
eval_set=(X_validation, y_validation),
plot=True
)
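# Note: plot=True draws an interactive learning-curve widget, so it requires a
# Jupyter environment with the ipywidgets extension enabled.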
cv_params = model.get_params()
cv_params.update({
'loss_function': 'Logloss'
})
cv_data = cv(
Pool(X, y, cat_features=cat_fea_idx),
cv_params,
plot=True
)
print('best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
np.max(cv_data['test-Accuracy-mean']),
cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
np.argmax(cv_data['test-Accuracy-mean'])
))
# best validation accuracy score: 0.83±0.02 on step 543
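# In recent CatBoost versions cv() returns a pandas DataFrame with one row per
# iteration; a quick look at its columns shows the metric series indexed above:
print(cv_data.columns.tolist())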
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test) # probability of each class
print(predictions[:10])
print(predictions_probs[:10])
model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent')
model_without_seed.fit(X, y, cat_features=cat_fea_idx)
print('random seed assigned for this model: {}'.format(model_without_seed.random_seed_))
params = {
'iterations': 500,
'learning_rate': 0.1,
'eval_metric': 'Accuracy',
'random_seed': 42,
'logging_level': 'Silent',
'use_best_model': False
}
train_pool = Pool(X_train, y_train, cat_features=cat_fea_idx)
validate_pool = Pool(X_validation, y_validation, cat_features=cat_fea_idx)
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)
best_model_params = params.copy()
best_model_params.update({
'use_best_model': True
})
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool)
print('simple model validation accuracy: {:.4}'.format(
accuracy_score(y_validation, model.predict(X_validation))))
print('-----')
print('best model validation accuracy: {:.4}'.format(
accuracy_score(y_validation, best_model.predict(X_validation))
))
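# use_best_model=True shrinks the saved ensemble to the iteration that scored
# best on eval_set, which the tree counts make visible:
print('simple model tree count: {}'.format(model.tree_count_))
print('best model tree count: {}'.format(best_model.tree_count_))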
%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)
# Wall time: 8.61 s
%%time
earlystop_params = params.copy()
earlystop_params.update({
'od_type': 'Iter',
'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool)
# Wall time: 1.49 s
print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
accuracy_score(y_validation, model.predict(X_validation))
))
print('----')
print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
accuracy_score(y_validation, earlystop_model.predict(X_validation))
))
# Simple model tree count: 500
# Simple model validation accuracy: 0.7982
# ----
# Early-stopped model tree count: 82
# Early-stopped model validation accuracy: 0.8072
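# A sketch of the same idea via a fit() argument: recent CatBoost versions
# accept early_stopping_rounds, which configures the Iter overfitting detector.
es_model = CatBoostClassifier(**params)
es_model.fit(train_pool, eval_set=validate_pool, early_stopping_rounds=40)
print('early_stopping_rounds model tree count: {}'.format(es_model.tree_count_))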
current_params = params.copy()
current_params.update({
'iterations': 10
})
model = CatBoostClassifier(**current_params).fit(X_train, y_train, cat_features=cat_fea_idx)
# Get the baseline margins (must use prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
model.fit(X_train, y_train, cat_features=cat_fea_idx, baseline=baseline)
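# The baseline is the raw-score margin the new trees are fitted on top of, so
# this second fit continues boosting from the first model's predictions
# rather than from zero.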
params_with_snapshot = params.copy()
params_with_snapshot.update({
'iterations': 5,
'learning_rate': 0.5,
'logging_level': 'Verbose'
})
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool,
save_snapshot=True)
params_with_snapshot.update({
'iterations': 10,
'learning_rate': 0.1
})
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool,
save_snapshot=True)
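# A sketch with explicit snapshot settings (the file name is arbitrary):
# snapshot_file sets where progress is stored and snapshot_interval how often
# it is written (in seconds), so an interrupted run can resume from disk.
model = CatBoostClassifier(**params_with_snapshot).fit(
    train_pool, eval_set=validate_pool,
    save_snapshot=True, snapshot_file='titanic.snapshot', snapshot_interval=60)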
class LoglossObjective(object):
def calc_ders_range(self, approxes, targets, weights):
assert len(approxes) == len(targets)
if weights is not None:
assert len(weights) == len(approxes)
result = []
for index in range(len(targets)):
e = np.exp(approxes[index])
p = e / (1 + e)
der1 = (1 - p) if targets[index] > 0 else -p
der2 = -p * (1 - p)
if weights is not None:
der1 *= weights[index]
der2 *= weights[index]
result.append((der1, der2))
return result
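# Why these derivatives: with p = sigmoid(approx), the per-object log-likelihood
# is t*log(p) + (1-t)*log(1-p); its first derivative w.r.t. approx is (t - p),
# i.e. (1 - p) for t = 1 and -p for t = 0, and its second derivative is
# -p * (1 - p). CatBoost maximizes the objective, hence these signs.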
model = CatBoostClassifier(
iterations=10,
random_seed=42,
loss_function=LoglossObjective(),
eval_metric='Logloss'
)
model.fit(train_pool)
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')
class LoglossMetric(object):
def get_final_error(self, error, weight):
return error / (weight + 1e-38)
def is_max_optimal(self):
return False
def evaluate(self, approxes, target, weight):
assert len(approxes) == 1
assert len(target) == len(approxes[0])
approx = approxes[0]
error_sum = 0.0
weight_sum = 0.0
for i in range(len(approx)):
w = 1.0 if weight is None else weight[i]
weight_sum += w
error_sum += -w * (target[i] * approx[i] - np.log(1 + np.exp(approx[i])))
return error_sum, weight_sum
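# The custom-metric protocol: evaluate() accumulates an (error, weight) pair
# over the objects, get_final_error() reduces it to the reported value (a
# weighted mean here), and is_max_optimal() == False marks lower as better.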
model = CatBoostClassifier(
iterations=10,
random_seed=42,
loss_function='Logloss',
eval_metric=LoglossMetric()
)
model.fit(train_pool)
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(
train_pool)
ntree_start, ntree_end, eval_period = 3, 9, 2
predictions_iterator = model.staged_predict(validate_pool, 'Probability',
ntree_start, ntree_end, eval_period)
for preds, tree_count in zip(predictions_iterator, range(ntree_start, ntree_end, eval_period)):
print('first class probability using the first {} trees: {}'.format(tree_count, preds[:5, 1]))
# first class probability using the first 3 trees: [0.53597869 0.41039128 0.42057479 0.64281031 0.46576685]
# first class probability using the first 5 trees: [0.63722688 0.42492029 0.46209302 0.70926021 0.44280772]
# first class probability using the first 7 trees: [0.66964764 0.42409144 0.46124982 0.76101033 0.47205986]
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(
train_pool)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
print('{}:{}'.format(name, score))
# Sex:59.004092014268586
# Pclass:16.340887169747035
# Ticket:6.028107169932204
# Cabin:3.8347242202560192
# Fare:3.712969667934384
# Age:3.484451204182482
# Parch:3.378089740355865
# Embarked:2.3139994072899555
# SibSp:1.9026794060334504
# PassengerId:0.0
# Name:0.0
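# get_feature_importance also supports other importance types; a sketch using
# SHAP values, which attribute each individual prediction to the features
# (the extra last column is the expected value of the model):
shap_values = model.get_feature_importance(train_pool, type='ShapValues')
print(shap_values.shape)  # (n_objects, n_features + 1)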
model = CatBoostClassifier(iterations=50,
                           random_seed=42,
                           logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=True)
model1 = CatBoostClassifier(iterations=10,
depth=1,
train_dir='model_depth_1/',
logging_level='Silent'
)
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=10,
depth=5,
train_dir='model_depth_5/',
logging_level='Silent'
)
model2.fit(train_pool, eval_set=validate_pool)
from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()
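# MetricVisualizer overlays the learning curves logged under the listed
# train_dir folders, so the depth-1 and depth-5 runs can be compared in one
# chart (Jupyter only).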
model = CatBoostClassifier(iterations=10, random_seed=42,
logging_level='Silent').fit(train_pool)
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump')
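# save_model also accepts an explicit format; a sketch with the native binary
# 'cbm' and a human-readable 'json' dump (file names are arbitrary):
model.save_model('catboost_model.cbm', format='cbm')
model.save_model('catboost_model.json', format='json')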
import hyperopt
def hyperopt_obj(params):
    # Objective for hyperopt: run cross-validation and return a value to minimize
model = CatBoostClassifier(
l2_leaf_reg=int(params['l2_leaf_reg']),
learning_rate=params['learning_rate'],
iterations=500,
eval_metric='Accuracy',
random_seed=42,
verbose=False,
loss_function='Logloss'
)
cv_data = cv(
Pool(X, y, cat_features=cat_fea_idx),
model.get_params()
)
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    return 1 - best_accuracy  # hyperopt minimizes, so return 1 - accuracy
from numpy.random import RandomState
params_space = {
'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}
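# hp.qloguniform('l2_leaf_reg', 0, 2, 1) samples exp(u) with u ~ Uniform(0, 2),
# rounded to a multiple of 1, i.e. integers roughly in [1, 7];
# hp.uniform draws the learning rate uniformly from [0.001, 0.5].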
trials = hyperopt.Trials()
best = hyperopt.fmin(
hyperopt_obj,
space=params_space,
algo=hyperopt.tpe.suggest,
max_evals=50,
trials=trials,
    rstate=RandomState(123)  # newer hyperopt versions expect a numpy Generator here, e.g. np.random.default_rng(123)
)
print(best)
model = CatBoostClassifier(
l2_leaf_reg=int(best['l2_leaf_reg']),
learning_rate=best['learning_rate'],
iterations=500,
eval_metric='Accuracy',
random_seed=42,
verbose=False,
loss_function='Logloss'
)
cv_data = cv(
Pool(X, y, cat_features=cat_fea_idx), model.get_params())
print('precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))
# precise validation accuracy score: 0.8338945005611672