快速掌握CatBoost基本用法

简介

CatBoost是一款高性能机器学习开源库,基于GBDT,由俄罗斯搜索巨头Yandex在2017年开源。

CatBoost特点有:

  1. 无需调参即可获得高质量的结果
  2. 支持类别特征
  3. 训练速度快,并支持GPU
  4. 准确性更高
  5. 预测速度快

回归

CatBoostRegressor类使用类似数组的数据

from catboost import CatBoostRegressor

# Toy dataset: three training rows and two evaluation rows, four numeric
# features each.
train_data = [[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]]
train_labels = [10, 20, 30]
eval_data = [[2, 4, 6, 8], [1, 4, 50, 60]]

# Define the model: two boosting iterations with tree depth 2.
model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2)

# Train on the plain Python lists.
model.fit(train_data, train_labels)

# Predict on the evaluation rows.
preds = model.predict(eval_data)
print(preds)  # output recorded by the author: [15.625 18.125]

在GPU上训练

CatBoost的GPU训练开箱即用,但系统编译器必须与CUDA Toolkit兼容。
若报错,请参照官方文档"Build from source on Windows"自行编译CatBoost。

from catboost import CatBoostClassifier

# Toy dataset: four rows with two numeric features and binary labels.
train_data = [[0, 3], [4, 1], [8, 1], [9, 1]]
train_labels = [0, 0, 1, 1]

# Define the model; task_type="GPU" requests GPU training.
# NOTE(review): devices='0:1' presumably selects GPU devices 0 and 1 --
# confirm against the CatBoost `devices` parameter documentation.
model = CatBoostClassifier(iterations=1000, task_type="GPU", devices='0:1')

# Train without printing per-iteration progress.
model.fit(train_data, train_labels, verbose=False)

二分类

CatBoostClassifier类使用类似数组的数据

from catboost import CatBoostClassifier

# Dataset: the first two columns hold string categories, the rest are numeric.
cat_features = [0, 1]  # column indices of the categorical features
train_data = [["a", "b", 1, 4, 5, 6], 
              ["a", "b", 4, 5, 6, 7],
              ["c", "d", 30, 40, 50, 60]]
train_labels = [1, 1, -1]
eval_data = [["a", "b", 2, 4, 6, 8], 
             ["a", "d", 1, 4, 50, 60]]

# Define the model.
model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2)

# Train; the categorical column indices are passed as the third argument.
model.fit(train_data, train_labels, cat_features)

# Predict class labels.
preds_class = model.predict(eval_data)

# Predict the probability of each class.
preds_proba = model.predict_proba(eval_data)

# Predict the raw formula value (prediction_type='RawFormulaVal').
preds_raw = model.predict(eval_data, prediction_type='RawFormulaVal')

print(preds_class)
print(preds_proba)
print(preds_raw)
# The string below is the output recorded by the author for the three prints.
'''
[1 1]
[[0.37014499 0.62985501]
 [0.4641579  0.5358421 ]]
[0.53159487 0.14361474]
'''

使用CatBoost自带的数据集处理Pool类进行训练

from catboost import CatBoostClassifier, Pool

# Dataset wrapped in a Pool together with its labels and per-object weights.
train_data = Pool(data=[[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]],
                  label=[1, 1, -1],
                  weight=[0.1, 0.2, 0.3])

# Define the model.
model = CatBoostClassifier(iterations=10)

# Train; the labels travel inside the Pool, so no separate y is passed.
model.fit(train_data)

# Predict on the training Pool itself.
preds_class = model.predict(train_data)
print(preds_class)  # output recorded by the author: [ 1  1 -1]

多分类

定义模型时指定loss_function='MultiClass'

from catboost import Pool, CatBoostClassifier

# Dataset: column 0 is a string category; the labels are three string classes.
cat_features = [0]  # column indices of the categorical features
train_data = [["summer", 1924, 44],
              ["summer", 1932, 37],
              ["winter", 1980, 37],
              ["summer", 2012, 204]]
train_label = ["France", "USA", "USA", "UK"]
eval_data = [["winter", 1996, 197],
             ["winter", 1968, 37],
             ["summer", 2002, 77],
             ["summer", 1948, 59]]
eval_label = ["USA", "France", "USA", "UK"]

# Wrap both splits in Pools so the categorical columns are declared once.
train_dataset = Pool(data=train_data, label=train_label, cat_features=cat_features)
eval_dataset = Pool(data=eval_data, label=eval_label, cat_features=cat_features)

# Define the model; loss_function='MultiClass' selects multiclass training.
model = CatBoostClassifier(iterations=10,
                           learning_rate=1,
                           depth=2,
                           loss_function='MultiClass')
# Train.
model.fit(train_dataset)

# Predict class labels, class probabilities and raw formula values.
preds_class = model.predict(eval_dataset)
preds_proba = model.predict_proba(eval_dataset)
preds_raw = model.predict(eval_dataset, prediction_type='RawFormulaVal')
print(preds_class)
print(preds_proba)
print(preds_raw)
# The string below is the output recorded by the author for the three prints.
'''
[['USA']
 ['USA']
 ['UK']
 ['USA']]
[[0.20060959 0.2862616  0.51312881]
 [0.07388963 0.06071726 0.86539311]
 [0.27590481 0.46474219 0.259353  ]
 [0.2580995  0.1213261  0.6205744 ]]
[[-0.43157053 -0.07602515  0.50759567]
 [-0.75475564 -0.95110009  1.70585572]
 [-0.15318701  0.36823989 -0.21505288]
 [-0.04081236 -0.7956756   0.83648797]]
 '''

最优结果

调用get_best_score()

from catboost import CatBoostClassifier, Pool

# Toy dataset with a separate labelled evaluation split.
train_data = [[0, 3], [4, 1], [8, 1], [9, 1]]
train_labels = [0, 0, 1, 1]
eval_data = [[2, 1], [3, 1], [9, 0], [5, 3]]
eval_labels = [0, 1, 1, 0]

eval_dataset = Pool(eval_data, eval_labels)

# Define the model with two extra metrics to track. The
# 'hints=skip_train~false' suffix asks for AUC on the training data as well;
# the recorded output below indeed contains a 'learn' AUC entry.
model = CatBoostClassifier(
    learning_rate=0.03,
    custom_metric=['Logloss', 'AUC:hints=skip_train~false'])

# Train while evaluating on eval_dataset.
model.fit(train_data, train_labels, eval_set=eval_dataset, verbose=False)

# Best metric values reached during training, per dataset.
print(model.get_best_score())
# Output recorded by the author:
# {'learn': {'Logloss': 0.005758294697120604, 'AUC': 1.0}, 'validation': {'Logloss': 0.5366281810311608, 'AUC': 1.0}}

最优迭代

返回达到最优结果时的迭代轮数

from catboost import CatBoostClassifier, Pool

# Toy dataset with a separate labelled evaluation split.
train_data = [[0, 3], [4, 1], [8, 1], [9, 1]]
train_labels = [0, 0, 1, 1]
eval_data = [[2, 1], [3, 1], [9, 0], [5, 3]]
eval_labels = [0, 1, 1, 0]

eval_dataset = Pool(eval_data, eval_labels)

# Define the model; AUC is the metric evaluated on the eval set.
model = CatBoostClassifier(learning_rate=0.03, eval_metric='AUC')

# Train while evaluating on eval_dataset.
model.fit(train_data, train_labels, eval_set=eval_dataset, verbose=False)

# Iteration at which the best evaluation result was reached.
print(model.get_best_iteration())  # output recorded by the author: 8

加载数据集

带类别特征

from catboost import Pool

# Column indices to treat as categorical (two string columns and the first
# integer column).
cat_features = [0, 1, 2]
data = [["a", "b", 1, 4, 5, 6], 
        ["a", "b", 4, 5, 6, 7],
        ["c", "d", 30, 40, 50, 60]]
label = [1, 1, -1]

# Positional arguments are data, label and cat_features, in that order.
dataset = Pool(data, label, cat_features)

无类别特征

from catboost import Pool

# Purely numeric feature rows with one label per row.
data = [[1, 4, 5, 6],
        [4, 5, 6, 7],
        [30, 40, 50, 60]]
label = [1, 1, -1]
dataset = Pool(data, label)

不带标签(用于预测)

from catboost import Pool

# Feature rows only: a Pool built without labels, intended for prediction.
data = [[1, 4, 5, 6],
        [4, 5, 6, 7],
        [30, 40, 50, 60]]
dataset = Pool(data)

从文件加载数据集

无类别特征

新建文件data.tsv(以tab分隔),输入以下内容:

1    1935    01
1    1958    08
0    1969    09

当没有指定参数column_description时,Pool类默认第1列为标签值,其余列为数值特征。

from catboost import Pool

# Build the Pool directly from the file path; no column description is given.
dataset = Pool("data.tsv")
print(dataset.get_label())  # output recorded by the author: ['1', '1', '0']

带类别特征

新建文件data_with_cat_features.tsv(以tab分隔),输入以下内容:

1935    born    1
1958    deceased    1
1969    born    0

新建文件data_with_cat_features.cd,输入以下内容:

1    Categ
2    Label

Pool类指定参数column_description进行带类别特征加载数据集

from catboost import Pool

# Load the file together with its column-description (.cd) file, which marks
# the categorical and label columns.
dataset = Pool("data_with_cat_features.tsv",
               column_description="data_with_cat_features.cd")
print(dataset.get_label())  # output recorded by the author: ['1', '1', '0']

libsvm格式

新建文件data.libsvm,输入以下内容:

1 1:0.1 3:2.2 4:3
0 2:0.22 3:0.82
0 1:0.02 4:0.61
1 3:0.72 4:0.5

Pool类加载libsvm格式数据集时需要加入前缀libsvm://

from catboost import Pool

# The libsvm:// prefix tells Pool to parse the file as libsvm format.
dataset = Pool("libsvm://data.libsvm")
print(dataset.get_label())  # output recorded by the author: ['1.0' '0.0' '0.0' '1.0']

带类别特征的libsvm格式

新建文件data_with_cat_features.libsvm,输入以下内容:

1 1:0.1 3:small 4:3 5:Male
0 2:0.22 3:small 5:Female
0 1:0.02 4:0.61 5:Female
1 3:large 4:0.5 5:Male

新建文件data_with_cat_features_for_libsvm.cd,输入以下内容:

0    Label
1    Num
2    Num
3    Categ
4    Num
5    Categ

Pool类加载

from catboost import Pool

# libsvm file plus a column description that marks features 3 and 5 as
# categorical.
dataset = Pool("libsvm://data_with_cat_features.libsvm", 
                column_description="data_with_cat_features_for_libsvm.cd")
print(dataset.get_label())  # output recorded by the author: ['1.0' '0.0' '0.0' '1.0']

从scipy.sparse加载数据集

加载scipy.sparse.csr_matrix

import numpy as np
import scipy.sparse
from catboost import Pool


# Coordinate triplets: data[k] is the value stored at (row[k], col[k]).
row = np.array([0, 0, 1, 2, 2, 2, 3, 3, 4])
col = np.array([0, 2, 2, 0, 1, 2, 0, 2, 2])
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
# Assemble a 5x3 CSR matrix from the triplets.
X = scipy.sparse.csr_matrix((data, (row, col)), shape=(5, 3))
y = np.array([0, 1, 0, 1, 0])

# The sparse matrix is passed to Pool directly as the feature data.
dataset = Pool(X, y)

加载pandas.SparseDataFrame

import numpy as np
import pandas as pd
from catboost import Pool

# pandas.SparseDataFrame was removed in pandas 1.0. The replacement
# documented by pandas is a regular DataFrame whose columns are sparse
# arrays, which also works on the older pandas versions (>= 0.24) that still
# shipped SparseDataFrame.
X = pd.DataFrame({
    'a': pd.arrays.SparseArray([1, 4, 0, 0, 1]),
    'b': pd.arrays.SparseArray([0, 0, 1, 7, 8]),
    'c': pd.arrays.SparseArray([30, 0, 0, 50, 0]),
})
y = np.array([0, 1, 0, 1, 1])

# The sparse-column DataFrame is passed to Pool directly as the feature data.
dataset = Pool(X, y)

加载含稀疏列的pandas.DataFrame

import numpy as np
import pandas as pd
from catboost import Pool

# The top-level alias pd.SparseArray was deprecated in pandas 1.0 and later
# removed; pd.arrays.SparseArray is the stable location of the same class.
X = pd.DataFrame({
    'a': pd.arrays.SparseArray([1, 4, 0, 0, 1]),
    'b': pd.arrays.SparseArray([0.0, 0.0, 1.0, 7.0, 8.0]),
    'c': pd.arrays.SparseArray([30, 0, 0, 50, 0]),
    # String column: the empty string is the value that is not stored.
    'd': pd.arrays.SparseArray(['a', 'b', '', 'c', ''], fill_value=''),
})
y = np.array([0, 1, 0, 1, 1])

# Column 'd' is declared categorical by name.
dataset = Pool(X, y, cat_features=['d'])

Pool切片

from catboost import Pool

# Five rows with two numeric features each.
data = [[1, 3],
        [0, 4],
        [1, 7],
        [6, 4],
        [5, 3]]

dataset = Pool(data)
print(dataset.num_row())  # output recorded by the author: 5

# slice() builds a new Pool from the rows at the given indices.
dataset_part = dataset.slice([0, 1, 2])  # take rows 0, 1 and 2
print(dataset_part.num_row())  # output recorded by the author: 3

交叉验证

指定权重

权重参与计算损失函数和指标,权重默认设为1

import numpy as np
from catboost import Pool, CatBoostClassifier

# Random dataset: 100 rows x 10 integer features in [1, 100), binary labels,
# and one random weight in [0, 1) per row.
train_data = np.random.randint(1, 100, size=(100, 10))
train_labels = np.random.randint(2, size=(100))
train_weight = np.random.random(100)

# Attach the per-object weights when building the Pool.
train_dataset = Pool(train_data,
                     train_labels,
                     weight=train_weight)

# Define the model.
model = CatBoostClassifier(iterations=10)

# Train on the weighted Pool.
model.fit(train_dataset)

最优模型

模型训练时提供eval_set同时指定use_best_model=True

from catboost import Pool, CatBoostClassifier

# Dataset: column 0 is a string category.
cat_features = [0]
train_data = [["France", 1924, 44], ["USA", 1932, 37], ["USA", 1980, 37]]
train_label = [1, 1, 0]

eval_data = [["USA", 1996, 197], ["France", 1968, 37], ["USA", 2002, 77]]
eval_label = [0, 0, 1]

train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)
eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

# Define the model.
model = CatBoostClassifier(iterations=100)

# Train with an eval set and use_best_model=True so the final model is cut
# at the best iteration found on that eval set.
model.fit(train_dataset, eval_set=eval_dataset, use_best_model=True)

print("Count of trees in model = {}".format(model.tree_count_))
# The string below is the tail of the training log recorded by the author;
# the best iteration was the last one, so all 100 trees were kept.
'''
bestTest = 0.6655278162
bestIteration = 99
Count of trees in model = 100
'''

保存并加载模型

from catboost import CatBoostClassifier

# Toy dataset.
train_data = [[1, 3], [0, 4], [1, 7]]
train_labels = [1, 0, 1]

# Define the model.
model = CatBoostClassifier(learning_rate=0.03)

# Train without printing per-iteration progress.
model.fit(train_data, train_labels, verbose=False)

# Save the trained model to the file "model".
model.save_model("model")

# Load it back into a fresh classifier instance.
from_file = CatBoostClassifier()
from_file.load_model("model")

输出中间过程

可以输出只使用前n棵树时的预测值(对应下面代码中的ntree_end=n),n的取值范围为[1, 树的总数]

from catboost import Pool, CatBoostClassifier

# Dataset: column 0 is a string category.
cat_features = [0]
train_data = [["France", 1924, 44], ["USA", 1932, 37], ["USA", 1980, 37]]
train_label = [1, 1, 0]
eval_data = [["USA", 1996, 197], ["France", 1968, 37], ["USA", 2002, 77]]
eval_label = [0, 0, 1]

train_dataset = Pool(data=train_data, label=train_label, cat_features=cat_features)
# Note: eval_data is rebound from the raw list to the Pool built from it.
eval_data = Pool(data=eval_data, label=eval_label, cat_features=cat_features)

# Define the model.
model = CatBoostClassifier(iterations=100)

# Train silently.
model.fit(train_dataset, silent=True)

# Collect the intermediate (staged) predictions.
staged_predictions_brute_force = []

# Approach 1: loop, re-predicting with ntree_end=i for i = 1..tree_count_.
for i in range(1, model.tree_count_ + 1):
    staged_predictions_brute_force.append(model.predict(eval_data, ntree_end=i))
# print(staged_predictions_brute_force)

# Approach 2 (faster, per the author): staged_predict() yields the staged
# predictions from a single call.
staged_predictions = list(model.staged_predict(eval_data))
# print(staged_predictions)

使用预训练结果(baseline)

预训练结果(不是可能性和类别,只能是原始值)可以设为新模型的baseline,baseline的格式取决于具体任务:

  • 多分类:二维数组
  • 回归、二分类、排序:一维数组

以下代码将会下载msrank数据集,约173MB,适用于排序任务。
建议手动下载到项目根目录\catboost_cached_datasets\msrank并解压。

import numpy as np
from catboost.datasets import msrank
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Dataset: msrank() returns the train and test DataFrames.
# Column 0 is used as the target; columns 0 and 1 are dropped from the
# features (NOTE(review): column 1 is presumably the query/group id -- confirm).
train_df, test_df = msrank()
X_train, y_train = train_df.drop([0, 1], axis=1).values, train_df[0].values
X_test, y_test = test_df.drop([0, 1], axis=1).values, test_df[0].values

# Split the training data into two halves:
# the first half is used for the baseline model,
# the second half for the main (CatBoost) model.
splitted_data = train_test_split(X_train, y_train, test_size=0.5)
X_train_first, X_train_second, y_train_first, y_train_second = splitted_data
catboost_model = CatBoostRegressor(iterations=200, verbose=False)

##########################################
############ baseline model ##############
##########################################

# Constant baseline: the mean target of the first half, repeated per object.
baseline_value = y_train_first.mean()
train_baseline = np.array([baseline_value] * y_train_second.shape[0])
test_baseline = np.array([baseline_value] * y_test.shape[0])

# Attach the baselines to the Pools.
train_pool = Pool(X_train_second, y_train_second, baseline=train_baseline)
test_pool = Pool(X_test, y_test, baseline=test_baseline)

# Train.
catboost_model.fit(train_pool, eval_set=test_pool)

# Predict on the Pool that carries the baseline.
preds1 = catboost_model.predict(test_pool)

# Baseline plus the prediction on the raw features (no baseline attached).
preds2 = test_baseline + catboost_model.predict(X_test)

# Check that both ways of predicting give the same values.
assert (np.abs(preds1 - preds2) < 1e-6).all()

print(mean_squared_error(y_test, preds1))  # output recorded by the author: 0.5628231839885657

##########################################
########## linear baseline model #########
##########################################

# Train the baseline model (ridge regression) on the first half.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so this line fails on recent versions -- confirm the
# target scikit-learn version before running.
baseline_model = Ridge(alpha=3e3, normalize=True)
baseline_model.fit(X_train_first, y_train_first)

# Prepare the baselines from the linear model's predictions.
train_baseline = baseline_model.predict(X_train_second)
test_baseline = baseline_model.predict(X_test)

# Attach the baselines to the Pools.
train_pool = Pool(X_train_second, y_train_second, baseline=train_baseline)
test_pool = Pool(X_test, y_test, baseline=test_baseline)

# Train.
catboost_model.fit(train_pool, eval_set=test_pool)

# Predict on the Pool that carries the baseline.
preds1 = catboost_model.predict(test_pool)

# Baseline plus the prediction on the raw features (no baseline attached).
preds2 = baseline_model.predict(X_test) + catboost_model.predict(X_test)

# Check that both ways of predicting give the same values.
assert (np.abs(preds1 - preds2) < 1e-6).all()

print(mean_squared_error(y_test, preds1))  # output recorded by the author: 0.564365366004454

接着训练

from catboost import CatBoostRegressor

# Toy dataset.
train_data = [[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]]
eval_data = [[2, 4, 6, 8], [1, 4, 50, 60]]
train_labels = [10, 20, 30]

# Define the first model.
model1 = CatBoostRegressor(iterations=2, learning_rate=0.2, depth=2)

# Initial training.
model1.fit(train_data, train_labels)

# Continue training with the same parameters; the result is stored back
# into model1 (per the author).
model1.fit(train_data, train_labels, init_model=model1)

# Continue training with different parameters: model2 starts from model1,
# while model1 itself is left unchanged (per the author).
model2 = CatBoostRegressor(iterations=4, learning_rate=0.1, depth=4)
model2.fit(train_data, train_labels, init_model=model1)
print(model1.get_best_score())  # output recorded by the author: {'learn': {'RMSE': 6.714527844157103}}
print(model2.get_best_score())  # output recorded by the author: {'learn': {'RMSE': 6.113864540316429}}

导出模型为Apple CoreML格式

此处本人未实验,故略,可点击标题查看原文。

数据相关性

调用get_object_importance(),计算训练数据集中的每个对象对输入数据集上优化指标值的影响,数值越大表示越相关。
更多内容查看get_object_importance()和论文Finding Influential Training Samples for Gradient Boosted Decision Trees(为GBDT寻找有影响力的训练样本)

from catboost import Pool, CatBoost

# Dataset: column 0 is a string category.
cat_features = [0]
train_data = [["France", 1924, 44], ["USA", 1932, 37], ["USA", 1980, 37]]
train_label = [1, 1, 0]
data = [["USA", 1996, 197], ["France", 1968, 37], ["USA", 2002, 77]]
label = [0, 0, 1]

train_dataset = Pool(data=train_data, label=train_label, cat_features=cat_features)
dataset = Pool(data=data, cat_features=cat_features, label=label)

# Define the model through the generic CatBoost class and a parameter dict.
cb = CatBoost({'iterations': 10})

# Train.
cb.fit(train_dataset)

# Object importance: the input Pool comes first, the training Pool second.
# The call returns the indices and the matching scores.
indices, scores = cb.get_object_importance(dataset, train_dataset, top_size=100)
print(indices)  # output recorded by the author: [0, 1, 2]
print(scores)  # output recorded by the author: [0. 0. 0.]

自定义损失函数

自定义损失函数时,需要提供一个实现了以下接口的对象:

class UserDefinedObjective(object):
    """Interface stub for a custom objective with per-object scalar approximations."""

    def calc_ders_range(self, approxes, targets, weights):
        """Compute the loss derivatives for the subset currently being processed.

        ``approxes``, ``targets`` and ``weights`` are indexed containers of
        floats (containers that define ``__len__`` and ``__getitem__``);
        ``weights`` may be ``None``.

        To understand the arguments, assume some subset of the dataset is
        being processed: ``approxes`` holds the current predictions for that
        subset and ``targets`` holds the target values supplied with the
        dataset.

        An implementation should return a list of ``(der1, der2)`` pairs,
        where ``der1`` is the first derivative of the loss with respect to
        the predicted value and ``der2`` is the second derivative. This stub
        returns ``None``.
        """
        return None
    
class UserDefinedMultiClassObjective(object):
    """Interface stub for a custom multi-dimensional (multiclass) objective."""

    def calc_ders_multi(self, approxes, target, weight):
        """Compute the loss derivatives for a single object.

        ``approxes`` is an indexed container of floats holding the prediction
        for each dimension of one object, ``target`` is that object's single
        expected value, and ``weight`` is the object's weight.

        An implementation should return a tuple ``(der1, der2)``, where
        ``der1`` is a list-like object with the first derivative of the loss
        with respect to the prediction in each dimension and ``der2`` is the
        matrix of second derivatives. This stub returns ``None``.
        """
        return None

例子:

  • Logloss
class LoglossObjective(object):
    """Custom Logloss objective for binary classification.

    For every object the predicted probability is ``p = sigmoid(approx)``;
    the returned derivatives are ``der1 = target - p`` and
    ``der2 = -p * (1 - p)``, each multiplied by the object's weight when
    weights are given.
    """

    @staticmethod
    def _sigmoid(value):
        """Return ``1 / (1 + exp(-value))`` without overflowing.

        The naive form ``exp(x) / (1 + exp(x))`` overflows to ``inf / inf``
        (NaN) for large positive ``x``; each branch below only ever
        exponentiates a non-positive number.
        """
        if value >= 0:
            return 1.0 / (1.0 + np.exp(-value))
        e = np.exp(value)
        return e / (1.0 + e)

    def calc_ders_range(self, approxes, targets, weights):
        """Return a list of ``(der1, der2)`` pairs, one per object.

        ``approxes``, ``targets`` and ``weights`` are indexed containers of
        floats of equal length; ``weights`` may be ``None``.

        Raises ``AssertionError`` when the container lengths disagree.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        # Index-based access on purpose: the containers are only documented
        # to provide __len__ and __getitem__.
        for index in range(len(targets)):
            p = self._sigmoid(approxes[index])
            der1 = targets[index] - p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

# Pass an instance of the custom objective as loss_function.
model = CatBoostClassifier(loss_function=LoglossObjective())
  • RMSE
class RmseObjective(object):
    """Custom RMSE-style objective for regression.

    For every object ``der1 = target - approx`` and ``der2 = -1``; both are
    multiplied by the object's weight when weights are given.
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Return a list of ``(der1, der2)`` pairs, one per object.

        ``approxes``, ``targets`` and ``weights`` are indexed containers of
        equal length; ``weights`` may be ``None``.
        """
        assert len(approxes) == len(targets)
        count = len(targets)

        # Unweighted case: the second derivative stays the plain constant -1.
        if weights is None:
            return [(targets[pos] - approxes[pos], -1) for pos in range(count)]

        assert len(weights) == len(approxes)
        pairs = []
        for pos in range(count):
            residual = targets[pos] - approxes[pos]
            pairs.append((residual * weights[pos], -1 * weights[pos]))
        return pairs
    
# Pass an instance of the custom objective as loss_function.
model = CatBoostRegressor(loss_function=RmseObjective())
  • MultiClass
class MultiClassObjective(object):
    """Custom softmax-based multiclass objective."""

    def calc_ders_multi(self, approx, target, weight):
        """Return ``(grad, hess)`` for a single object.

        ``approx`` holds the object's prediction in every dimension,
        ``target`` is its class index and ``weight`` its weight. ``grad`` is
        a list with one first derivative per dimension and ``hess`` is a list
        of rows forming the matrix of second derivatives; every entry is
        multiplied by ``weight``.
        """
        # Subtract the maximum before exponentiating so exp() cannot overflow.
        shifted = np.array(approx) - max(approx)
        exps = np.exp(shifted)
        total = exps.sum()
        dims = range(len(shifted))

        grad = []
        hess = []
        for row in dims:
            first = -exps[row] / total
            if row == target:
                first += 1

            second_row = []
            for col in dims:
                second = exps[row] * exps[col] / (total**2)
                if col == row:
                    second -= exps[row] / total
                second_row.append(second * weight)

            grad.append(first * weight)
            hess.append(second_row)
        return (grad, hess)
    
# Pass an instance of the custom objective as loss_function.
model = CatBoostClassifier(loss_function=MultiClassObjective())

用于过拟合检测器和最优模型选择的自定义指标

需要提供一个实现了以下接口的对象:

class UserDefinedMetric(object):
    """Interface stub for a custom evaluation metric."""

    def is_max_optimal(self):
        """Tell whether a larger metric value means a better result.

        This stub returns ``None``.
        """
        return None

    def get_final_error(self, error, weight):
        """Return the final metric value computed from the error and the weight.

        This stub returns ``None``.
        """
        return None

    def evaluate(self, approxes, target, weight):
        """Accumulate the metric over a set of objects.

        ``approxes`` is a list of indexed containers (containers that define
        ``__len__`` and ``__getitem__``), one container per approximation
        dimension, each holding floats. ``target`` and ``weight`` are
        one-dimensional indexed containers; ``weight`` may be ``None``.

        An implementation should return the pair ``(error, weight sum)``.
        This stub returns ``None``.
        """
        return None

例子:

  • Logloss
class LoglossMetric(object):
    """Custom Logloss evaluation metric for binary classification."""

    def is_max_optimal(self):
        """Return False: a smaller Logloss is better."""
        return False

    def get_final_error(self, error, weight):
        """Return the weighted mean loss.

        The tiny constant in the denominator guards against division by zero
        when the total weight is 0.
        """
        return error / (weight + 1e-38)

    def evaluate(self, approxes, target, weight):
        """Return ``(weighted loss sum, weight sum)`` over all objects.

        ``approxes`` must contain exactly one container (a single
        approximation dimension) with as many entries as ``target``;
        ``weight`` may be ``None``, in which case every object weighs 1.0.

        Raises ``AssertionError`` when those shape requirements are not met.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            # With p = sigmoid(a), the loss
            #   -(t * log(p) + (1 - t) * log(1 - p))
            # equals log(1 + exp(a)) - t * a. np.logaddexp evaluates the
            # first term without overflow and never takes log(0), so
            # saturated predictions no longer produce NaN.
            error_sum += w * (np.logaddexp(0.0, approx[i]) - target[i] * approx[i])

        return error_sum, weight_sum
    
# Pass an instance of the custom metric as eval_metric.
model = CatBoostClassifier(eval_metric=LoglossMetric())
  • RMSE
class RmseMetric(object):
    """Custom RMSE evaluation metric for regression."""

    def is_max_optimal(self):
        """Return False: a smaller RMSE is better."""
        return False

    def get_final_error(self, error, weight):
        """Return the square root of the weighted mean squared error.

        The tiny constant in the denominator guards against division by zero
        when the total weight is 0.
        """
        mean_squared = error / (weight + 1e-38)
        return np.sqrt(mean_squared)

    def evaluate(self, approxes, target, weight):
        """Return ``(weighted squared-error sum, weight sum)``.

        ``approxes`` must contain exactly one container with as many entries
        as ``target``; ``weight`` may be ``None``, in which case every object
        weighs 1.0.
        """
        assert len(approxes) == 1
        predictions = approxes[0]
        assert len(target) == len(predictions)

        squared_total = 0.0
        weight_total = 0.0

        for pos in range(len(predictions)):
            if weight is None:
                w = 1.0
            else:
                w = weight[pos]
            residual = predictions[pos] - target[pos]
            weight_total += w
            squared_total += w * (residual**2)

        return squared_total, weight_total
    
# Pass an instance of the custom metric as eval_metric.
model = CatBoostRegressor(eval_metric=RmseMetric())
  • Accuracy
class AccuracyMetric(object):
    """Custom accuracy evaluation metric."""

    def is_max_optimal(self):
        """Return True: a larger accuracy is better."""
        return True

    def get_final_error(self, error, weight):
        """Return the weighted share of correctly classified objects.

        The tiny constant in the denominator guards against division by zero
        when the total weight is 0.
        """
        return error / (weight + 1e-38)

    def evaluate(self, approxes, target, weight):
        """Return ``(weighted count of correct predictions, weight sum)``.

        ``approxes`` holds one container per approximation dimension; the
        predicted class of an object is the dimension with the largest
        value. ``weight`` may be ``None``, in which case every object weighs
        1.0.
        """
        # argmax over axis 0 selects, per object, the winning dimension.
        predicted = np.argmax(approxes, axis=0)

        correct_total = 0
        weight_total = 0

        for pos in range(len(target)):
            if weight is None:
                w = 1.0
            else:
                w = weight[pos]
            is_hit = predicted[pos] == target[pos]
            weight_total += w
            correct_total += w * is_hit

        return correct_total, weight_total
    
# Pass an instance of the custom metric as eval_metric.
model = CatBoostClassifier(eval_metric=AccuracyMetric())

你可能感兴趣的:(python,机器学习)