GBDT二元分类算法Python实现

GBDT二元分类算法

  • GBDT(梯度提升树)二元分类
    • 1.什么是GBDT(梯度提升树)?
    • 2.GBDT处理二元分类详解
    • 3.GBDT二元分类算法具体实现
      • 3.1构造CART回归树
      • 3.2GBDT具体实现
    • 4.数据集

GBDT(梯度提升树)二元分类

1.什么是GBDT(梯度提升树)?

如果你还不是很熟悉GBDT的基本原理,请参考以下两篇博文
1.GBDT(梯度提升树)基本原理及python实现
2.GBDT原理详解

2.GBDT处理二元分类详解

这篇博文讲的很详细,深入理解GBDT二分类算法

3.GBDT二元分类算法具体实现

GBDT二元分类算法Python实现_第1张图片
GBDT二元分类算法Python实现_第2张图片

3.1构造CART回归树

关键:
1.叶节点生成方式

'''
说明:为了便于实现,dataSet类型ndarray,
dataSet: [X,Y,y_res]  #dataSet的组成结构 m*n
X:样本训练集 m*(n-2)
Y:样本标签  m*1
y_res: 残差 m*1
''' 
def leaf(dataSet):
    """计算节点的数值
    :param dataSet: {ndarray}训练样本
    :return: 均值
    """
    '''
    生成叶子节点
    '''
    return np.sum(dataSet[:, -1]) / (np.sum((dataSet[:, -2] - dataSet[:, -1]) * (1 - dataSet[:, -2] + dataSet[:, -1])))

构造CART回归树

import numpy as np
import copy

import os
os.environ['PATH'] = os.pathsep + 'C:\\Program Files\\Graphviz 2.44.1\\bin'

class Node:
    """
    树的节点类
    """

    def __init__(self, feature=-1, split_val=None, results=None, left=None, right=None):
        """
        :param feature: 用于切分数据集的特征索引
        :param split_val: 设置切分的值
        :param results: 存储节点的值
        :param left: 左子树
        :param right: 右子树
        """

        self.feature = feature
        self.split_val = split_val
        self.results = results
        self.left = left
        self.right = right


'''
说明:为了便于实现,dataSet类型ndarray,
dataSet: [X,Y,y_res]  #dataSet的组成结构
X:样本训练集
Y:样本标签
y_res: 残差
'''
def leaf(dataSet):
    """计算节点的数值
    :param dataSet: {ndarray}训练样本
    :return: 均值
    """
    '''
    生成叶子节点
    '''
    return np.sum(dataSet[:, -1]) / (np.sum((dataSet[:, -2] - dataSet[:, -1]) * (1 - dataSet[:, -2] + dataSet[:, -1])))


def err_cnt(dataSet):
    """计算误差
    :param dataSet: {ndarray}训练数据
    :return: 总方差
    """

    return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]


def split_tree(dataSet, feature, split_val):
    """根据特征feature中的值split_val将数据集data划分为左右子树
    :param data: {list}训练样本
    :param feature: {int}需要划分的特征索引
    :param split_val: {float}指定的划分值
    :return:(set_1, set_2): {tuple} 左右子树的集合
    """

    set_L = dataSet[np.nonzero(dataSet[:, feature] <= split_val)[0], :]
    set_R = dataSet[np.nonzero(dataSet[:, feature] > split_val)[0], :]
    return set_L, set_R


class CART_regression(object):
    """
    CART算法类
    """

    def __init__(self, X, Y, min_sample, min_err, max_height=20):
        """
        :param X: 回归样本数据的特征
        :param Y: 回归样本数据的标签
        :param min_sample: 每个叶节点最少样本数
        :param min_err: 最小损失
        """
        self.X = X
        self.Y = Y
        self.min_sample = min_sample
        self.min_err = min_err
        self.max_height = max_height

    def fit(self):
        """
        构建树
        input:data{list} -- 训练样本
              min_sample{int} -- 叶子节点中最少样本数
              min_err{float} -- 最小的error
        output: node:树的根节点
        """
        # 将样本特征与样本标签合成完整的样本
        # X存放带样本标签的数据集,Y存放第i次的残差
        data = np.c_[self.X, self.Y]
        # 初始化
        best_err = err_cnt(data)
        # 存储最佳切分属性及最佳切分点
        bestCriteria = None
        # 存储切分后的两个数据集
        bestSets = None
        # 构建决策树,返回该决策树的根节点
        if np.shape(data)[0] <= self.min_sample or self.max_height == 1 or best_err <= self.min_err :
            return Node(results=leaf(data))

        # 开始构建CART回归树
        num_feature = np.shape(data[0])[0] - 2
        for feat in range(num_feature):
            val_feat = np.unique(data[:, feat])
            for val in val_feat:
                # 尝试划分
                set_L, set_R = split_tree(data, feat, val)
                if np.shape(set_L)[0] < 2 or np.shape(set_R)[0] < 2:
                    continue
                # 计算划分后的error值
                err_now = err_cnt(set_L) + err_cnt(set_R)
                # 更新最新划分
                if err_now < best_err:
                    best_err = err_now
                    bestCriteria = (feat, val)
                    bestSets = (set_L, set_R)
        # 生成左右子树
        left = CART_regression(bestSets[0][:, :-1], bestSets[0][:, -1], self.min_sample, self.min_err, self.max_height-1).fit()
        right = CART_regression(bestSets[1][:, :-1], bestSets[1][:, -1], self.min_sample, self.min_err, self.max_height-1).fit()
        return Node(feature=bestCriteria[0], split_val=bestCriteria[1], left=left, right=right)



def predict(sample, tree):
    f"""对每一个样本sample进行预测
    :param sample: {list}:样本
    :param tree: 训练好的CART回归模型
    :return: results{float} :预测值
    """

    # 叶子节点
    if tree.results is not None:
        return tree.results
    else:
        # 不是叶节点
        val_sample = sample[tree.feature]
        branch = None
        # 选择右子树
        if val_sample > tree.split_val:
            branch = tree.right
        else:
            branch = tree.left
        return predict(sample, branch)


def test(X, tree):
    """评估CART回归模型
    :param X: {list} 测试样本
    :param Y: {list} 测试标签
    :param tree: 训练好的CART回归树模型
    :return:  均方误差
    """

    m = np.shape(X)[0]
    y_hat = []
    for i in range(m):
        pre = predict(X[i], tree)
        y_hat.append(pre)
    return y_hat


def numLeaf(tree):
    if tree.results is not None:
        return 1
    else:
        return numLeaf(tree.left) + numLeaf(tree.right)


def heightTree(tree):
    if tree.results is not None:
        return 1
    else:
        heightL = heightTree(tree.left)
        heughtR = heightTree(tree.right)
        if heightL > heughtR:
            return heightL + 1
        else:
            return heughtR + 1
def showTree(tree):
    node = {}

    if tree.results is None:
        node['feat'] = tree.feature
        node['splitVal'] = tree.split_val
        print(node)
        showTree(tree.left)
        showTree(tree.right)
    else:
        node['value'] = tree.results
        print(node)

3.2GBDT具体实现

import CART_regression_tree
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn import tree
import sys
import numpy as np


def load_data(data_file):
    """导入训练数据
    :param data_file: {string} 保存训练数据的文件
    :return: {list} 训练数据
    """
    X, Y = [], []
    f = open(data_file)
    for line in f.readlines():
        sample = []
        lines = line.strip().split('\t')
        Y.append(lines[-1])
        for i in range(len(lines) - 1):
            sample.append(float(lines[i]))
        X.append(sample)
    return X, Y


# 二分类输出非线性映射
def sigmoid(x):
    if x >= 0:
        return 1 / (1 + np.exp(-x))
    else:
        return np.exp(x) / (1 + np.exp(x))


class GBDT_RT(object):
    """
    GBDT回归算法类
    """
    def __init__(self):
        self.trees = None
        self.learn_rate = None
        self.init_val = None

    def get_init_value(self, y):
        """计算初始值的平均值
        :param y: {ndarray} 样本标签列表
        :return: average:{float} 样本标签的平均值
        """
        """
        初始化:
        F0(x)=log(p1/(1 - p1))
        """
        p = np.count_nonzero(y)
        n = np.shape(y)[0]
        return np.log(p / (n-p))

    def get_residuals(self, y, y_hat):
        """
        计算样本标签域预测列表的残差
        :param y: {ndarray} 样本标签列表
        :param y_hat: {ndarray} 预测标签列表
        :return: y_residuals {list} 样本标签与预测标签列表的残差
        """

        y_residuals = []
        for i in range(len(y)):
            y_residuals.append(y[i] - y_hat[i])
        return y_residuals

    def fit(self, X, Y, n_estimates, learn_rate, min_sample, min_err, max_height):
        """
        训练GDBT模型
        :param X: {list} 样本特征
        :param Y: {list} 样本标签
        :param n_estimates: {int} GBDT中CART树的个数
        :param learn_rate: {float} 学习率
        :param min_sample: {int} 学习CART时叶节点最小样本数
        :param min_err: {float} 学习CART时最小方差
        """

        # 初始化预测标签和残差
        self.init_val = self.get_init_value(Y)

        n = np.shape(Y)[0]
        F = np.array([self.init_val] * n)
        y_hat = np.array([sigmoid(self.init_val)] * n)
        y_residuals = Y - y_hat
        y_residuals = np.c_[Y, y_residuals]

        self.trees = []
        self.learn_rate = learn_rate
        # 迭代训练GBDT
        for j in range(n_estimates):
            tree = CART_regression_tree.CART_regression(X, y_residuals, min_sample, min_err, max_height).fit()
            for k in range(n):
                res_hat = CART_regression_tree.predict(X[k], tree)
                # 计算此时的预测值等于原预测值加残差预测值
                F[k] += self.learn_rate * res_hat
                y_hat[k] = sigmoid(F[k])
            y_residuals = Y - y_hat
            y_residuals = np.c_[Y, y_residuals]
            self.trees.append(tree)

    def GBDT_predicts(self, X_test):
        """
        预测多个样本
        :param X_test: {list} 测试集
        :return: predicts {list} 预测的结果
        """
        predicts = []
        for i in range(np.shape(X_test)[0]):
            pre_y = self.init_val
            for tree in self.trees:
                pre_y += self.learn_rate * CART_regression_tree.predict(X_test[i], tree)
            if sigmoid(pre_y) >= 0.5:
                predicts.append(1)
            else:
                predicts.append(0)
        return predicts

    def cal_error(self, Y_test, predicts):
        """
        计算预测误差
        :param Y_test: {测试样本标签列表}
        :param predicts: {list} 测试样本预测列表
        :return: error {float} 均方误差
        """

        y_test = np.array(Y_test)
        y_predicts = np.array(predicts)
        error = np.square(y_test - y_predicts).sum() / len(Y_test)
        return error


if __name__ == '__main__':
    # name of features
    featName = ['Number', 'Plasma', 'Diastolic', 'Triceps', '2-Hour', 'Body', 'Diabetes', 'Age', 'Class']
    path = "D:\\YSA\\dataFile\\xg.csv"
    # read data file
    data = pd.read_csv(path, sep=',', header=0, names=featName)
    # set random seed
    np.random.seed(123)
    # split data into train and test
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1].values, data.iloc[:, -1].values,
                                                        test_size=0.2, random_state=123)
    # normalize train
    # X_train = MinMaxScaler().fit_transform(X_train)
    # normalize test
    # X_test = MinMaxScaler().fit_transform(X_test)
    gbdtTrees = GBDT_RT()
    gbdtTrees.fit(X_train, y_train, n_estimates=50, learn_rate=0.2, min_sample=30, min_err=0.3, max_height=4)
    for i in range(2):
        print(CART_regression_tree.numLeaf(gbdtTrees.trees[i]))
        print(CART_regression_tree.heightTree(gbdtTrees.trees[i]))
        CART_regression_tree.showTree(gbdtTrees.trees[i])
        print('--------------------------------------------')
    y_hat = gbdtTrees.GBDT_predicts(X_test)
    print(y_hat)
    acc = accuracy_score(y_test, y_hat)
    print(acc)

4.数据集

百度云链接:
链接:https://pan.baidu.com/s/1ae7X0xLj83zWYrgh7UQZtw
提取码:wno3

你可能感兴趣的:(推荐系统经典模型,算法,机器学习)