机器学习07-GBDT-python

文章目录

    • 一. 原理推导
    • 二. python实现
      • 1. 回归
        • 1. 计算f初始值
        • 2. 计算负梯度
        • 3. 更新叶子节点值
        • 4. 更新f值
        • 5. 训练模型
        • 6. 预测
      • 2. 二分类gbdt
        • 1.标签为 {0,1}的分类问题
          • 1. 初始化f0值
          • 2. 计算负梯度
          • 3. 拟合叶子节点
          • 4. 更新f值
          • 5. 训练模型
          • 6. 预测
        • 2.标签为{-1,1}的分类问题
          • 1. 初始化f0值
          • 2. 计算负梯度
          • 3. 拟合叶子节点
          • 4. 更新f值
          • 5. 训练模型
          • 6. 预测
    • 三. sklearn

github简单实现

一. 原理推导

机器学习07-GBDT-python_第1张图片机器学习07-GBDT-python_第2张图片机器学习07-GBDT-python_第3张图片

二. python实现

1. 回归

1. 计算f初始值

# 步骤1:初始化f(x) 的初始值
    def initFValue(self, targets):
        """Set the starting model output f0 for every training sample.

        Every sample starts at 0. The original author notes that for
        squared loss the mean of ``targets`` could be used instead.

        :param targets: training targets; only their count is used
        """
        zeros = []
        for _ in targets:
            zeros.append(0)
        self.fValue = zeros

2. 计算负梯度

    def calaResidual(self, yReal, fValue):
        """Return the negative gradient of the squared loss for each sample.

        For squared loss the negative gradient is the plain residual
        ``y - f``. The original computed this list but never returned it,
        so callers received ``None``.

        :param yReal: true target values
        :param fValue: current model outputs, aligned with ``yReal``
        :return: list with one residual per (yReal, fValue) pair
        """
        return [y - f for y, f in zip(yReal, fValue)]

3. 更新叶子节点值

    def updateLeafValue(self, tree, dataSet, target):
        """Leaf-refit hook; deliberately a no-op for regression.

        The leaf values of the CART regression tree are reused as-is:
        a regression-tree leaf already holds the mean of its samples.

        :param tree: tree fitted in the current round (left untouched)
        :param dataSet: training rows (unused)
        :param target: current residuals (unused)
        :return: None
        """
        return None

4. 更新f值

    def updateFValue(self, dataSet, tree):
        """Add the new tree's output for each training row to its f value.

        Called once per boosting round, so ``self.fValue`` accumulates the
        leaf values of every tree fitted so far.

        :param dataSet: training rows, indexed in step with ``self.fValue``
        :param tree: tree fitted in the current round
        """
        for row in range(len(self.fValue)):
            self.fValue[row] += cart.cartReg().predict0(dataSet[row], tree)

5. 训练模型

    def fit(self, dataSet, target):
        """Train the boosted ensemble, fitting one regression tree per round.

        :param dataSet: training rows
        :param target: training targets, aligned with ``dataSet``
        :return: self
        """
        # 1. initialise f0
        self.initFValue(target)

        for _ in range(self.nTrees):
            # 2. negative gradient of the loss at the current f values
            newTarget = self.calaResidual(target, self.fValue)
            # Debug trace of the first residual. The call form of print
            # behaves the same on Python 2 and Python 3, whereas the
            # statement form is a syntax error on Python 3.
            print(newTarget[0])
            # 3. build a regression tree on the negative gradient
            tree = cart.cartReg().buildTree(dataSet, newTarget, self.depth, self.leafSize)
            # 4. refit the leaf values
            self.updateLeafValue(tree, dataSet, newTarget)
            # 5. fold the new tree into the running f values
            self.updateFValue(dataSet, tree)

            self.allTrees.append(tree)
            self.lost.append(self.calaLost(target))
        return self

6. 预测

    def predict(self, predList):
        """Predict the target for one sample.

        The prediction is the sum, starting from 0.0, of the leaf value
        that each fitted tree assigns to the sample.

        :param predList: feature vector of the sample
        :return: accumulated prediction
        """
        total = 0.0
        for fitted_tree in self.allTrees:
            leaf_value = cart.cartReg().predict0(predList, fitted_tree)
            total = total + leaf_value
        return total

2. 二分类gbdt

1.标签为 {0,1}的分类问题

1. 初始化f0值
    def init_f_value(self, target):
        '''
        Initialise f0 for every sample to the log-odds of the positive class.

        :param target: class labels, each 0 or 1
        :return: self
        :raises ZeroDivisionError: if ``target`` contains no 0 labels
        :raises ValueError: if ``target`` contains no 1 labels (log of 0)
        '''
        positives = sum(target)
        negatives = sum(1 - y for y in target)
        # float() forces true division: with integer labels under Python 2
        # the ratio would be floored (to 0 whenever negatives outnumber
        # positives), which makes math.log fail.
        z = float(positives) / negatives
        log_odds = math.log(z)
        self.fValue = [log_odds for _ in target]

        return self
2. 计算负梯度
@staticmethod
def sigmod(z):
    return 1.0 / (1.0 + (float)(math.exp(-z)))

def computer_residual(self, target):
    """Return the negative gradient for each sample: label minus probability.

    The probability is ``self.sigmod`` applied to the sample's current
    f value.

    :param target: 0/1 labels, aligned with ``self.fValue``
    :return: list of residuals
    """
    residuals = []
    for label, score in zip(target, self.fValue):
        residuals.append(label - self.sigmod(score))
    return residuals
3. 拟合叶子节点
    def calculate_leaf_value(self, target, index):
        """
        Compute the value of one leaf and add it to the f values of its samples.

        The leaf value is sum(y - p) / sum(p * (1 - p)) over the samples in
        the leaf, where p = sigmod(f) is the current predicted probability.

        :param target: 0/1 labels of the training set
        :param index: positions (into ``target`` and ``self.fValue``) of the
            samples that fall into this leaf
        :return: the leaf value
        """
        p = [self.sigmod(self.fValue[i]) for i in index]
        residual = sum(target[i] - pi for i, pi in zip(index, p))
        prob = sum(pi * (1 - pi) for pi in p)
        # The denominator is zero for an empty leaf and when every
        # probability has saturated at 0 or 1. Use a zero step in that case
        # instead of raising ZeroDivisionError.
        if abs(prob) < 1e-150:
            value = 0.0
        else:
            value = residual / prob
        # fold the leaf value into the f values of the samples in this leaf
        self.update_f_value(value, index)
        return value
   def update_leaf_value(self, tree, data_set, target, index=None):
        """
        Walk the tree and replace every leaf with its recomputed value.

        ``self.spilt_data`` returns positions relative to the ``data_set`` it
        is given, while ``calculate_leaf_value`` indexes the full ``target``
        and ``self.fValue``. Below the root those differ, so the original
        positions of the rows are carried along in ``index``.

        :param tree: node dict with keys 'spIndex', 'spValue', 'left', 'right'
        :param data_set: the rows that reach this node
        :param target: labels of the whole training set
        :param index: original position of each row of ``data_set``; defaults
            to 0..len(data_set)-1, which is correct for the root call
        """
        if index is None:
            index = list(range(len(data_set)))
        spilt_index = tree['spIndex']
        spilt_value = tree['spValue']
        local_left, local_right = self.spilt_data(data_set, spilt_index, spilt_value)
        for side, local in (('left', local_left), ('right', local_right)):
            # translate positions within data_set to whole-training-set positions
            global_index = [index[i] for i in local]
            if isinstance(tree[side], dict):
                sub_data = [data_set[i] for i in local]
                self.update_leaf_value(tree[side], sub_data, target, global_index)
            else:
                tree[side] = self.calculate_leaf_value(target, global_index)
4. 更新f值
    def update_f_value(self, value, index):
        """Add a leaf value to the f value of every sample in that leaf.

        :param value: leaf value to add
        :param index: positions in ``self.fValue`` to update
        """
        for position in index:
            self.fValue[position] = self.fValue[position] + value
5. 训练模型
    def fit(self, data_set, target):
        """Train the {0,1} classifier, fitting one regression tree per round.

        :param data_set: training rows
        :param target: 0/1 labels, aligned with ``data_set``
        :return: self, matching the other ``fit`` methods so calls can be chained
        """
        self.init_f_value(target)
        for _ in range(self.n_trees):
            # the negative gradient becomes the regression target of this round
            new_target = self.computer_residual(target)
            tree = cart.cartReg().buildTree(data_set, new_target, self.depth, self.leaf_size)
            # refit the leaves; this also updates self.fValue
            self.update_leaf_value(tree, data_set, target)
            self.all_trees.append(tree)
            lost = self.compute_lost(target)
            self.losts.append(lost)
        return self
6. 预测
    def predict_prob(self, predict_vector):
        """Return the predicted probability of class 1 for one sample.

        The outputs of all fitted trees are summed, starting from 0.0, and
        passed through ``self.sigmod``.

        NOTE(review): training starts f at the log-odds set by
        ``init_f_value``, but this sum starts at 0.0 — confirm whether the
        initial value should be added here.

        :param predict_vector: feature vector of the sample
        :return: ``self.sigmod`` of the summed tree outputs
        """
        score = 0.0
        for fitted_tree in self.all_trees:
            leaf_value = cart.cartReg().predict0(predict_vector, fitted_tree)
            score = score + leaf_value
        return self.sigmod(score)
    
    def predict(self, predict_vector, thr=0.5):
        """Classify one sample as 1 or 0.

        :param predict_vector: feature vector of the sample
        :param thr: probability that must be strictly exceeded to return 1
        :return: 1 if the predicted probability is above ``thr``, else 0
        """
        return 1 if self.predict_prob(predict_vector) > thr else 0

2.标签为{-1,1}的分类问题

1. 初始化f0值
    def initFValue(self, target):
        """Set the starting f value of each sample to t / (|t| * (2 - |t|)).

        NOTE(review): for labels of exactly -1 or 1 this expression equals
        the label itself, so every sample starts at its own label — confirm
        this is the intended f0.

        :param target: class labels
        """
        start = []
        for t in target:
            magnitude = abs(t)
            start.append(t / (magnitude * (2 - magnitude)))
        self.fValue = start
2. 计算负梯度
    def calaResidual(self, yReal, fValue):
        """Return the negative gradient 2y / (1 + exp(2 * y * f)) per sample.

        :param yReal: class labels
        :param fValue: current model outputs, aligned with ``yReal``
        :return: list of residuals
        """
        target = []
        for y, f in zip(yReal, fValue):
            try:
                residual = 2.0 * y / (1 + math.exp(2 * y * f))
            except OverflowError:
                # exp() overflows once 2*y*f is very large; the residual
                # tends to zero there, so use that limit instead of failing.
                residual = 0.0
            target.append(residual)
        return target
3. 拟合叶子节点
    def calaLeafValue(self, target):
        """Return the value of one leaf: sum(t) / sum(|t| * (2 - |t|)).

        :param target: residuals of the samples that fall into the leaf
        :return: the leaf value, or 0.0 when the denominator is zero
        """
        sum0 = 0.0
        for t in target:
            sum0 += abs(t) * (2 - abs(t))
        # The original tested for a zero denominator inside the loop but did
        # nothing about it, so an empty leaf (or residuals that are all 0 or
        # +/-2) raised ZeroDivisionError. Use a zero step instead.
        if sum0 == 0:
            return 0.0
        return sum(target) / sum0
    
    def updateLeafValue(self, tree, dataSet, target):
        """Walk the tree and replace every leaf with its recomputed value.

        Rows and their residuals are split together using the node's split
        rule, so each subtree only sees the samples that reach it.

        :param tree: node dict with keys 'spIndex', 'spValue', 'left', 'right'
        :param dataSet: the rows that reach this node
        :param target: residuals aligned with ``dataSet``
        """
        subLeft, subRight, tarLeft, tarRight = cart.cartReg.spiltData(
            dataSet, target, tree['spIndex'], tree['spValue'])
        branches = (('left', subLeft, tarLeft), ('right', subRight, tarRight))
        for side, rows, residuals in branches:
            child = tree[side]
            if isinstance(child, dict):
                # internal node: keep descending with this branch's samples
                self.updateLeafValue(child, rows, residuals)
            else:
                # leaf: overwrite with the value computed from its residuals
                tree[side] = self.calaLeafValue(residuals)
4. 更新f值
    def updateFValue(self, dataSet, tree):
        """Add the new tree's output for each training row to its f value.

        :param dataSet: training rows, indexed in step with ``self.fValue``
        :param tree: tree fitted in the current round
        """
        for row in range(len(self.fValue)):
            self.fValue[row] += cart.cartReg().predict0(dataSet[row], tree)
5. 训练模型
    def fit(self, dataSet, target):
        """Train the {-1,1} classifier, fitting one regression tree per round.

        :param dataSet: training rows
        :param target: class labels, aligned with ``dataSet``
        :return: self
        """
        # 1. initialise f0
        self.initFValue(target)

        for _ in range(self.nTrees):
            # 2. negative gradient of the loss at the current f values
            newTarget = self.calaResidual(target, self.fValue)
            # Debug trace of the first residual. The call form of print
            # behaves the same on Python 2 and Python 3, whereas the
            # statement form is a syntax error on Python 3.
            print(newTarget[0])
            # 3. build a regression tree on the negative gradient
            tree = cart.cartReg().buildTree(dataSet, newTarget, self.depth, self.leafSize)
            # 4. refit the leaf values
            self.updateLeafValue(tree, dataSet, newTarget)
            # 5. fold the new tree into the running f values
            self.updateFValue(dataSet, tree)

            self.allTrees.append(tree)
            self.lost.append(self.calaLost(target))
        return self
6. 预测
    def predict(self, predList, threshold = 0.5):
        """Classify one sample as 1 or -1.

        :param predList: feature vector of the sample
        :param threshold: probability at or above which 1 is returned
        :return: 1 if the predicted probability reaches ``threshold``, else -1
        """
        return 1 if self.predict_prob(predList) >= threshold else -1
    
    def predict_prob(self, predList):
        """Return the predicted probability of class +1 for one sample.

        The residual used in training, 2y / (1 + exp(2yF)), is the negative
        gradient of log(1 + exp(-2yF)). Under that loss F estimates half the
        log-odds, so the probability is 1 / (1 + exp(-2F)); the original
        used exp(-F).

        :param predList: feature vector of the sample
        :return: probability in [0.0, 1.0]
        """
        value = super(GBDTClissity, self).predict(predList)
        try:
            return 1 / (1 + math.exp(-2 * value))
        except OverflowError:
            # exp() overflows for a very negative score; the probability
            # tends to zero there.
            return 0.0

三. sklearn

你可能感兴趣的:(算法,python,机器学习)