GBDT手动实现,纯代码

本人刚开始学习机器学习,在GBDT的实现上,总想着重现GBDT官方接口的结果,但由于没找到有用的资料,硬生生纠结了一个星期
GBDT回归就是简单地用CART回归树去迭代拟合(负梯度)残差,今天这里要实现的是GBDT二分类,代码如下:

# -- encoding:utf-8 --
"""
Create by ibf on 2019/11/18
"""

import numpy as np
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import r2_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn import tree
import pydotplus

#适应于gbdt的cart树,输出为f的更新值,而不是概率的预测值
class Try_GBDT():
    """Hand-written gradient boosting for binary classification (log-loss).

    Every boosting round fits a regression stump to the current residuals
    ``y_true - sigmoid(haty)`` and stores it in ``ensemble``. A tree is either
    a bare leaf value (when no valid split exists) or a nested dict of the form
    ``{feature: {"<=thr": left_child, ">thr": right_child}}``. Leaves hold the
    update to the raw score (log-odds), not a probability.

    Attributes:
        num_round: number of boosting rounds.
        eta: learning rate applied to every tree's output.
        ensemble: list of fitted trees, one per round.
        haty: raw scores (log-odds) of the training samples after fitting.
        f: leaf value assigned to every training sample by the latest tree.
        init_score: prior log-odds derived from ``y_true`` during ``fit``.
        y_true: binary (0/1) training labels as a NumPy array.
    """

    # Trees are stumps. NOTE: _Gain uses the variance of *all* residuals as
    # its baseline, which is only the node variance at the root; raising this
    # limit requires changing _Gain to use the node's own variance.
    _MAX_DEPTH = 1
    # Leaf denominators smaller than this are treated as zero.
    _TINY = 1e-150

    def __init__(self, num_round=10, eta=0.1, gamma=0, Lambda=1, scoring="mse", y_true=None):
        """Store the hyper-parameters and the training labels.

        ``gamma``, ``Lambda`` and ``scoring`` are accepted only to keep the
        signature stable; nothing in this class reads them.
        ``y_true`` defaults to an empty array (a fresh one per instance).
        """
        self.num_round = num_round
        self.eta = eta
        self.ensemble = []
        self.haty = None
        self.f = None
        self.init_score = None
        self.y_true = np.asarray([] if y_true is None else y_true)

    def _Gain(self, listL, listR, X_train, y_train):
        """Return the variance reduction of splitting into ``listL`` / ``listR``.

        ``listL`` and ``listR`` are index arrays into ``y_train``; the child
        variances are weighted by their share of ``X_train``'s rows. The
        baseline is the variance of the whole ``y_train``.
        """
        n_rows = X_train.shape[0]
        left_share = len(listL) / n_rows
        right_share = len(listR) / n_rows
        return (np.var(y_train)
                - left_share * np.var(y_train[listL])
                - right_share * np.var(y_train[listR]))

    def _w(self, indexlist, y_train):
        """Return the leaf value ``sum(residual) / sum(p * (1 - p))``.

        ``y_train`` holds the residuals ``y_true - p``, so the probabilities
        are recovered as ``y_true - residual``.
        """
        residual = y_train[indexlist]
        prob = self.y_true[indexlist] - residual
        denominator = prob.dot(1 - prob)
        if abs(denominator) < self._TINY:
            # Saturated probabilities: avoid dividing by zero, make no update.
            return 0.0
        return np.sum(residual) / denominator

    def _prior_log_odds(self):
        """Return log(#positives / #negatives) computed from ``y_true``.

        Each count is floored at a tiny positive value so that a single-class
        label vector yields a large finite score instead of +/-inf.
        """
        n_samples = len(self.y_true)
        floor = n_samples * float(np.finfo(np.float32).eps)
        positives = float(np.sum(self.y_true))
        negatives = n_samples - positives
        return np.log(max(positives, floor) / max(negatives, floor))

    def _best_split(self, X_train, y_train, indexlist):
        """Find the split with the largest positive gain.

        Candidate thresholds are the midpoints between consecutive distinct
        values of each feature. A split is only accepted when both children
        keep more than one sample. ``indexlist[i]`` is the position in
        ``y_train`` of row ``i`` of ``X_train``.

        Returns:
            ``(feature, threshold)``, or ``(None, None)`` if no split qualifies.
        """
        best_gain = 0
        best_feature = None
        best_value = None
        for feature in range(X_train.shape[1]):
            column = X_train[:, feature]
            values = np.unique(column)  # sorted distinct values
            for threshold in (values[:-1] + values[1:]) / 2:
                goes_left = column <= threshold
                left = indexlist[goes_left]
                right = indexlist[~goes_left]
                if len(left) <= 1 or len(right) <= 1:
                    continue
                gain = self._Gain(left, right, X_train, y_train)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_value = threshold
        return best_feature, best_value

    def _build_tree(self, X_train, y_train, indexlist, depth):
        """Grow one tree and write each sample's leaf value into ``self.f``.

        Returns a leaf value, or ``{feature: {"<=thr": left, ">thr": right}}``.
        """
        if depth < self._MAX_DEPTH:
            feature, value = self._best_split(X_train, y_train, indexlist)
        else:
            # Depth limit reached: skip the split search, this node is a leaf.
            feature = value = None
        if feature is None:
            w = self._w(indexlist, y_train)
            self.f[indexlist] = w
            return w
        goes_left = X_train[:, feature] <= value
        left_child = self._build_tree(
            X_train[goes_left], y_train, indexlist[goes_left], depth + 1)
        right_child = self._build_tree(
            X_train[~goes_left], y_train, indexlist[~goes_left], depth + 1)
        return {feature: {"<={}".format(value): left_child,
                          ">{}".format(value): right_child}}

    def fit(self, X_train):
        """Fit ``num_round`` trees on ``X_train`` against ``self.y_true``.

        Any previously fitted ensemble is discarded.

        Args:
            X_train: 2-D array-like of shape (n_samples, n_features).

        Returns:
            ``(haty, y_train, newtree)``: the final raw training scores, the
            residuals used in the last round and the last tree. The latter two
            are ``None`` when ``num_round`` is 0.

        Raises:
            ValueError: if ``X_train`` is empty or its length differs from
                ``y_true``.
        """
        X_train = np.asarray(X_train)
        self.y_true = np.asarray(self.y_true)
        n_samples = len(X_train)
        if n_samples == 0 or len(self.y_true) != n_samples:
            raise ValueError(
                "X_train and y_true must be non-empty and of equal length "
                "(got {} and {})".format(n_samples, len(self.y_true)))

        # Start from the prior log-odds of the labels instead of a constant
        # that is only right for a 6:4 class ratio.
        self.init_score = self._prior_log_odds()
        self.ensemble = []
        self.haty = np.zeros(n_samples) + self.init_score
        all_rows = np.arange(n_samples)
        y_train = None
        newtree = None
        for _ in range(self.num_round):
            self.f = np.empty(n_samples)
            # Negative gradient of the log-loss: label minus predicted probability.
            y_train = self.y_true - 1 / (1 + np.exp(-self.haty))
            newtree = self._build_tree(X_train, y_train, all_rows, 0)
            self.ensemble.append(newtree)
            self.haty = self.haty + self.eta * self.f
        return self.haty, y_train, newtree

    def draw_one_tree(self, index):
        """Render ``self.ensemble[index]`` with graphviz and open the viewer.

        Internal nodes are labelled ``feature:<i>``, leaves with their value
        and edges with the branch condition. Requires the ``graphviz`` package.
        """
        from graphviz import Digraph

        nodes = []
        edges = []
        last_id = [1]  # the root is node "1"; children take the following ids

        def walk(node, node_id):
            if not isinstance(node, dict):
                nodes.append((str(node_id), str(node)))
                return
            feature = next(iter(node))
            nodes.append((str(node_id), "feature:{}".format(feature)))
            for condition, child in node[feature].items():
                last_id[0] += 1
                child_id = last_id[0]
                edges.append((str(node_id), str(child_id), str(condition)))
                walk(child, child_id)

        walk(self.ensemble[index], last_id[0])
        dot = Digraph()
        for node_id, label in nodes:
            dot.node(node_id, label)
        for tail, head, label in edges:
            dot.edge(tail, head, label)

        dot.view()

    def predict(self, X_test):
        """Return the predicted probability of class 1 for every row of ``X_test``."""
        return np.array([self._predict(test) for test in X_test])

    @staticmethod
    def _leaf_value(tree, sample):
        """Walk ``sample`` down ``tree`` and return the leaf value it lands in.

        Branch keys are parsed ("<=thr" / ">thr") rather than passed to
        ``eval``, so no text is ever executed as code.

        Raises:
            ValueError: if the feature value satisfies neither branch (NaN).
        """
        node = tree
        while isinstance(node, dict):
            feature = next(iter(node))
            value = sample[feature]
            for condition, child in node[feature].items():
                if condition.startswith("<="):
                    matched = value <= float(condition[2:])
                else:
                    matched = value > float(condition[1:])
                if matched:
                    node = child
                    break
            else:
                raise ValueError(
                    "feature {} value {!r} matches no branch".format(feature, value))
        return node

    def _predict(self, test):
        """Predict the probability of class 1 for a single sample.

        Raises:
            AssertionError: if the model has no fitted trees.
        """
        if not self.ensemble:
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError("fit before predict")
        res = 0
        for tree in self.ensemble:
            res += self._leaf_value(tree, test) * self.eta
        res += self.init_score
        return 1 / (1 + np.exp(-res))
def main():
    """Compare the hand-written GBDT with scikit-learn on a tiny random dataset.

    Prints the class-1 probabilities from ``Try_GBDT`` and the class
    probabilities from ``GradientBoostingClassifier``, and writes one
    ``class<j>.png`` per scikit-learn boosting stage to the working directory.
    """
    # Prepare the data: 10 random 2-D points, the first 6 labelled 1.
    np.random.seed(28)
    x = np.random.randn(10, 2) * 5
    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    y = np.array([1] * 6 + [0] * 4).astype(int)
    y_true = y
    n = 4

    # Hand-written GBDT binary classifier.
    # Call reg.draw_one_tree(i) afterwards to render the i-th tree with graphviz.
    reg = Try_GBDT(num_round=n, y_true=y_true)
    reg.fit(x)
    print('gbdtself_predict', reg.predict(x))

    # Reference implementation from scikit-learn, also restricted to stumps.
    algox = GradientBoostingClassifier(n_estimators=n, max_depth=1)
    algox.fit(x, y_true)
    for j, stage in enumerate(algox.estimators_):
        dot_data = tree.export_graphviz(decision_tree=stage[0], out_file=None)
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_png('class%s.png' % (j))
    print('gbdtoffi_predict', algox.predict_proba(x))
# Run the demo only when executed as a script, so importing this module
# does not train models or write image files.
if __name__ == "__main__":
    main()

这里的大部分代码来自:xgboost手动实现 https://blog.csdn.net/weixin_44264662/article/details/100896521

你可能感兴趣的:(机器学习)