catboost 学习案例

from catboost.datasets import titanic
import numpy as np

# Load the Titanic train and test frames shipped with catboost.datasets.
train_df, test_df = titanic()
# Preview the first rows of the training frame (head() defaults to 5 rows).
train_df.head(n=5)

1.1 Feature Preparation

# Count missing values per column and show only the columns that have any.
null_value_stats = train_df.isnull().sum()
null_value_stats[null_value_stats != 0]

# Replace every missing value with the sentinel -999, in place, in both frames.
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

# Separate the 'Survived' label column from the feature columns.
y = train_df['Survived']
X = train_df.drop(columns='Survived')

# Positional indices of the feature columns whose dtype is not float; they are
# later passed to CatBoost as the categorical features.
# The builtin `float` is used because the `np.float` alias (which was the very
# same object) was deprecated in NumPy 1.20 and removed in NumPy 1.24.
cat_fea_idx = np.where(X.dtypes != float)[0]

1.2 Data Splitting

from sklearn.model_selection import train_test_split

# Keep 75% of the labelled rows for training and the rest for validation;
# the fixed random_state keeps the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

# The test frame is used as-is as the test feature set.
X_test = test_df

2. catboost basics

from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

2.1 model training

# Classifier that additionally tracks Accuracy during training, with a fixed
# seed and silenced logging. No loss_function is passed here.
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    logging_level='Silent',
)

# Fit on the training split, declaring the non-float columns as categorical
# and using the validation split as the evaluation set. plot=True requests
# the training plot.
model.fit(
    X_train,
    y_train,
    cat_features=cat_fea_idx,
    eval_set=(X_validation, y_validation),
    plot=True,
)

catboost 学习案例_第1张图片

2.2 Model Cross-Validation

# Start from the parameters of the model above and set the loss function
# explicitly for the cross-validation run.
cv_params = model.get_params()
cv_params['loss_function'] = 'Logloss'

# Cross-validate on the full labelled data (features X, labels y).
cv_data = cv(
    Pool(X, y, cat_features=cat_fea_idx),
    cv_params,
    plot=True,
)

catboost 学习案例_第2张图片

# Report the best mean cross-validated accuracy, its standard deviation and
# the step at which it was reached. The arg-max is computed once and reused
# for all three values.
best_step = np.argmax(cv_data['test-Accuracy-mean'])
print('best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    cv_data['test-Accuracy-mean'][best_step],
    cv_data['test-Accuracy-std'][best_step],
    best_step
))

# best validation accuracy score: 0.83±0.02 on step 543

2.3 Model Applying

# Predicted class labels and per-class probabilities for the test set.
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)

# Print the first ten entries of each result.
for preview in (predictions, predictions_probs):
    print(preview[:10])

3. catboost features

# Fit a small model without passing a seed, then read back the seed it was
# assigned.
model_without_seed = CatBoostClassifier(
    iterations=10,
    logging_level='Silent',
)
model_without_seed.fit(X, y, cat_features=cat_fea_idx)

assigned_seed = model_without_seed.random_seed_
print('random seed assigned for this model: {}'.format(assigned_seed))


# Training parameters shared by the experiments in this section.
params = dict(
    iterations=500,
    learning_rate=0.1,
    eval_metric='Accuracy',
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
)

# Wrap each split in a Pool that also carries the categorical column indices.
validate_pool = Pool(X_validation, y_validation, cat_features=cat_fea_idx)
train_pool = Pool(X_train, y_train, cat_features=cat_fea_idx)

3.1 using the best model

# Reference model: trained with the shared params (use_best_model=False).
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

# Same params with use_best_model switched on.
best_model_params = dict(params, use_best_model=True)
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool)

# Compare both models by accuracy on the validation split.
simple_accuracy = accuracy_score(y_validation, model.predict(X_validation))
print('simple model validation accuracy: {:.4}'.format(simple_accuracy))
print('-----')

best_accuracy = accuracy_score(y_validation, best_model.predict(X_validation))
print('best model validation accuracy: {:.4}'.format(best_accuracy))

3.2 Early Stopping

%%time
# Reference run for the timing comparison: shared params, no overfitting
# detector options.
model = CatBoostClassifier(**params)
model.fit(X=train_pool, eval_set=validate_pool)

# Wall time: 8.61 s


%%time
# Shared params plus the overfitting detector settings: type 'Iter' with a
# wait of 40 iterations.
earlystop_params = dict(params, od_type='Iter', od_wait=40)
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool)

# Wall time: 1.49 s


# Tree count and validation accuracy of the reference model ...
print('Simple model tree count: {}'.format(model.tree_count_))
simple_accuracy = accuracy_score(y_validation, model.predict(X_validation))
print('Simple model validation accuracy: {:.4}'.format(simple_accuracy))
print('----')

# ... and of the early-stopped model.
print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
earlystop_accuracy = accuracy_score(
    y_validation, earlystop_model.predict(X_validation))
print('Early-stopped model validation accuracy: {:.4}'.format(earlystop_accuracy))

# Simple model tree count: 500
# Simple model validation accuracy: 0.7982
# ----
# Early-stopped model tree count: 82
# Early-stopped model validation accuracy: 0.8072

3.3 using baseline

# Train a short 10-iteration model whose output will serve as the baseline.
current_params = dict(params, iterations=10)
model = CatBoostClassifier(**current_params)
model.fit(X_train, y_train, cat_features=cat_fea_idx)

# The baseline is taken from the raw prediction type ('RawFormulaVal') only.
baseline = model.predict(X_train, prediction_type='RawFormulaVal')

# Fit again on the same data, this time passing the baseline values.
model.fit(X_train, y_train, cat_features=cat_fea_idx, baseline=baseline)

3.4 Snapshot Support

# First run: 5 iterations at learning rate 0.5 with verbose logging, saving a
# snapshot while training.
params_with_snapshot = dict(
    params,
    iterations=5,
    learning_rate=0.5,
    logging_level='Verbose',
)
model = CatBoostClassifier(**params_with_snapshot)
model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)

# Second run: 10 iterations at learning rate 0.1, again with save_snapshot on.
params_with_snapshot['iterations'] = 10
params_with_snapshot['learning_rate'] = 0.1
model = CatBoostClassifier(**params_with_snapshot)
model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)

catboost 学习案例_第3张图片

3.5 User-defined objective function

class LoglossObjective(object):
    """User-defined Logloss objective, passed as ``loss_function``."""

    def calc_ders_range(self, approxes, targets, weights):
        """Return first and second derivatives for every object.

        Args:
            approxes: indexable sequence of raw (pre-sigmoid) scores.
            targets: indexable sequence of labels, same length as
                ``approxes``; a value > 0 is treated as the positive class.
            weights: indexable sequence of per-object weights with the same
                length, or ``None`` for unweighted objects.

        Returns:
            A list of ``(der1, der2)`` tuples, one per object, scaled by the
            object's weight when weights are given.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            approx = approxes[index]
            # Numerically stable sigmoid: never exponentiate a large positive
            # number, which would overflow to inf and turn p into NaN.
            if approx >= 0:
                p = 1.0 / (1.0 + np.exp(-approx))
            else:
                e = np.exp(approx)
                p = e / (1.0 + e)

            der1 = (1 - p) if targets[index] > 0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result
# Train a 10-iteration model that optimizes the user-defined objective while
# 'Logloss' is given as the evaluation metric.
model = CatBoostClassifier(
    loss_function=LoglossObjective(),
    eval_metric='Logloss',
    iterations=10,
    random_seed=42,
)
model.fit(train_pool)

# Raw scores for the test set.
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

3.6 User-defined metric function

class LoglossMetric(object):
    """User-defined Logloss metric, passed as ``eval_metric``."""

    def get_final_error(self, error, weight):
        """Return the weighted mean error from the accumulated sums.

        The tiny constant in the denominator avoids a division by zero when
        the total weight is 0.
        """
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """Return False: smaller values of this metric are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted Logloss over all objects.

        Args:
            approxes: sequence holding exactly one indexable sequence of raw
                (pre-sigmoid) scores.
            target: indexable sequence of labels, same length as the scores.
            weight: indexable sequence of per-object weights, or ``None`` to
                weight every object by 1.0.

        Returns:
            A ``(error_sum, weight_sum)`` tuple.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            # log(1 + exp(a)) computed as logaddexp(0, a), which stays finite
            # for large a where exp(a) alone would overflow to inf.
            log_one_plus_exp = np.logaddexp(0.0, approx[i])
            error_sum += -w * (target[i] * approx[i] - log_one_plus_exp)

        return error_sum, weight_sum
# Train a 10-iteration Logloss model that is evaluated with the user-defined
# metric object.
model = CatBoostClassifier(
    eval_metric=LoglossMetric(),
    loss_function='Logloss',
    iterations=10,
    random_seed=42,
)
model.fit(train_pool)

# Raw scores for the test set.
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

3.7 staged predict

# Small silent model used to demonstrate staged prediction.
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent')
model.fit(train_pool)

# Request probabilities for tree counts from 3 up to (but excluding) 9, in
# steps of 2.
ntree_start, ntree_end, eval_period = 3, 9, 2
predictions_iterator = model.staged_predict(
    validate_pool,
    prediction_type='Probability',
    ntree_start=ntree_start,
    ntree_end=ntree_end,
    eval_period=eval_period,
)

# Pair each staged result with its tree count and print column 1 of the
# first five rows.
tree_counts = range(ntree_start, ntree_end, eval_period)
for preds, tree_count in zip(predictions_iterator, tree_counts):
    print('first class probability using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

# first class probability using the first 3 trees: [0.53597869 0.41039128 0.42057479 0.64281031 0.46576685]
# first class probability using the first 5 trees: [0.63722688 0.42492029 0.46209302 0.70926021 0.44280772]
# first class probability using the first 7 trees: [0.66964764 0.42409144 0.46124982 0.76101033 0.47205986]

3.8 feature importances

# Fit a 50-iteration silent model on the training pool.
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent')
model.fit(train_pool)

# Pair every importance score with its column name and print them from the
# highest score to the lowest.
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
ranked = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked:
    print('{}:{}'.format(name, score))

# Sex:59.004092014268586
# Pclass:16.340887169747035
# Ticket:6.028107169932204
# Cabin:3.8347242202560192
# Fare:3.712969667934384
# Age:3.484451204182482
# Parch:3.378089740355865
# Embarked:2.3139994072899555
# SibSp:1.9026794060334504
# PassengerId:0.0
# Name:0.0

3.9 eval metrics

# Fit a 50-iteration silent model, then compute AUC on the validation pool
# with plotting enabled.
model = CatBoostClassifier(
    iterations=50,
    random_seed=42,
    logging_level='Silent',
)
model.fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=True)

catboost 学习案例_第4张图片

3.10 learning processes comparison

# Two 10-iteration models that differ only in tree depth; each one writes its
# training output to its own train_dir.
model1 = CatBoostClassifier(
    iterations=10,
    depth=1,
    train_dir='model_depth_1/',
    logging_level='Silent',
)
model1.fit(train_pool, eval_set=validate_pool)

model2 = CatBoostClassifier(
    iterations=10,
    depth=5,
    train_dir='model_depth_5/',
    logging_level='Silent',
)
model2.fit(train_pool, eval_set=validate_pool)

# Start a visualizer over both training directories.
from catboost import MetricVisualizer
train_dirs = ['model_depth_1', 'model_depth_5']
widget = MetricVisualizer(train_dirs)
widget.start()

catboost 学习案例_第5张图片

3.11 model saving

# Train a small model and write it to disk.
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent')
model.fit(train_pool)
model.save_model('catboost_model.dump')

# Load the saved file back into a freshly constructed classifier.
model = CatBoostClassifier()
model.load_model('catboost_model.dump')

4 Parameters Tuning

import hyperopt

def hyperopt_obj(params):
    """Objective for the hyperparameter search.

    Builds a classifier from the sampled ``l2_leaf_reg`` (truncated to int)
    and ``learning_rate``, cross-validates its parameters on the full data
    (module-level ``X``, ``y``, ``cat_fea_idx``), and returns
    ``1 - best mean test accuracy`` so that a lower value is better.
    """
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )

    full_pool = Pool(X, y, cat_features=cat_fea_idx)
    cv_result = cv(full_pool, candidate.get_params())

    # Turn "maximize accuracy" into a quantity to minimize.
    return 1 - np.max(cv_result['test-Accuracy-mean'])
from numpy.random import RandomState

# Search space: a quantized log-uniform l2_leaf_reg and a uniform
# learning_rate between 1e-3 and 5e-1.
params_space = dict(
    l2_leaf_reg=hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    learning_rate=hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
)

# Collects the result of every evaluation.
trials = hyperopt.Trials()

# Run 50 TPE evaluations of the objective with a seeded random state.
# NOTE(review): newer hyperopt releases appear to expect a
# numpy.random.Generator (np.random.default_rng) for `rstate` -- confirm
# against the installed hyperopt version.
best = hyperopt.fmin(
    fn=hyperopt_obj,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123),
)

print(best)
# Rebuild the classifier with the best hyperparameters found by the search.
model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)

# Cross-validate those parameters on the full data and report the best mean
# test accuracy.
cv_data = cv(
    Pool(X, y, cat_features=cat_fea_idx),
    model.get_params(),
)
best_mean_accuracy = np.max(cv_data['test-Accuracy-mean'])
print('precise validation accuracy score: {}'.format(best_mean_accuracy))

# precise validation accuracy score: 0.8338945005611672

 

你可能感兴趣的:(机器学习,python,数据分析,python,机器学习,catboost,数据分析)