本人刚开始学习机器学习,在关于gbdt的实现上,总是想着重现gbdt的官方接口,但由于没找到有用的资料,硬生生纠结了一个星期。
gbdt回归就是简单地用cart回归树去迭代拟合(负梯度)残差,今天这里要实现的是gbdt二分类,代码如下:
# -- encoding:utf-8 --
"""
Create by ibf on 2019/11/18
"""
import numpy as np
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import r2_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn import tree
import pydotplus
#适应于gbdt的cart树,输出为f的更新值,而不是概率的预测值
class Try_GBDT():
    """Hand-written gradient boosting for binary classification (log-loss).

    Each boosting round fits a small regression tree to the negative gradient
    of the log-loss (``y - p``) and stores one Newton-step value per leaf:
    ``sum(residual) / sum(p * (1 - p))``.  A fitted tree is either a bare leaf
    value (when no valid split exists) or a nested dict of the form
    ``{feature: {"<=thr": subtree_or_leaf, ">thr": subtree_or_leaf}}``.

    Attributes:
        num_round: number of boosting rounds run by ``fit``.
        eta: learning rate applied to every tree's output.
        max_depth: maximum number of split levels per tree (1 = stumps).
        ensemble: list of fitted trees, one per round.
        haty: raw (log-odds) scores of the training samples after ``fit``.
        f: leaf value assigned to each training sample by the latest tree.
        y_true: training labels as a float array.
        init_score: prior log-odds used as the starting score (set by ``fit``).
    """

    # Leaf denominators smaller than this are treated as zero so that a
    # saturated leaf (all p equal to 0 or 1) yields 0.0 instead of inf/nan.
    _MIN_DENOMINATOR = 1e-150
    # Clipping bound for the positive rate when only one class is present.
    _PRIOR_EPS = 1e-12

    def __init__(self, num_round=10, eta=0.1, gamma=0, Lambda=1,
                 scoring="mse", y_true=None, max_depth=1):
        """Store the hyper-parameters and the training labels.

        Args:
            num_round: number of boosting rounds.
            eta: learning rate.
            gamma: accepted for interface compatibility; not used.
            Lambda: accepted for interface compatibility; not used.
            scoring: accepted for interface compatibility; not used.
            y_true: 0/1 training labels, one per row later passed to ``fit``.
            max_depth: maximum number of split levels per tree.
        """
        self.num_round = num_round
        self.eta = eta
        self.max_depth = max_depth
        self.ensemble = []
        self.haty = None
        self.f = None
        self.init_score = None
        # Always keep an ndarray: the leaf computation fancy-indexes the labels.
        self.y_true = np.asarray([] if y_true is None else y_true, dtype=float)

    @staticmethod
    def _sigmoid(score):
        """Map raw log-odds scores to probabilities."""
        return 1 / (1 + np.exp(-score))

    def _prior_log_odds(self):
        """Return log(#positive / #negative) of the training labels.

        When only one class is present the positive rate is clipped away from
        0 and 1 so that the result stays finite.
        """
        n_samples = self.y_true.shape[0]
        positives = np.sum(self.y_true)
        negatives = n_samples - positives
        if positives > 0 and negatives > 0:
            return np.log(positives / negatives)
        rate = np.clip(positives / n_samples,
                       self._PRIOR_EPS, 1 - self._PRIOR_EPS)
        return np.log(rate / (1 - rate))

    def _Gain(self, listL, listR, X_train, y_train):
        """Return the variance reduction achieved by a candidate split.

        Args:
            listL: sample indices sent to the left child.
            listR: sample indices sent to the right child.
            X_train: unused; kept so the signature stays unchanged.
            y_train: residuals of all training samples.

        Returns:
            Variance of the node's residuals minus the size-weighted variances
            of the two children.
        """
        left = np.asarray(listL, dtype=np.intp)
        right = np.asarray(listR, dtype=np.intp)
        n_node = left.size + right.size
        if n_node == 0:
            return 0.0
        # The parent impurity is measured on this node's samples only, so the
        # gain stays meaningful below the root.
        gain = np.var(y_train[np.concatenate((left, right))])
        for child in (left, right):
            if child.size:
                gain -= child.size / n_node * np.var(y_train[child])
        return gain

    def _w(self, indexlist, y_train):
        """Return the Newton-step leaf value for the samples in ``indexlist``.

        ``y_train`` holds the residuals ``y - p``, so ``y_true - y_train``
        recovers the current probabilities ``p``.
        """
        residual = y_train[indexlist]
        prob = self.y_true[indexlist] - residual
        denominator = np.sum(prob.dot(1 - prob))
        if abs(denominator) < self._MIN_DENOMINATOR:
            return 0.0
        return np.sum(residual) / denominator

    def _best_split(self, X_train, y_train, indexlist):
        """Find the best (feature, threshold) for the node, or (None, None).

        Candidate thresholds are midpoints between consecutive distinct
        feature values; a split is accepted only if both children keep more
        than one sample and the gain is strictly positive.
        """
        best_gain = 0
        best_feature = None
        best_threshold = None
        for feature in range(X_train.shape[1]):
            column = X_train[indexlist, feature]
            distinct = np.unique(column)  # sorted unique values
            for low, high in zip(distinct[:-1], distinct[1:]):
                threshold = (low + high) / 2
                goes_left = column <= threshold
                left = indexlist[goes_left]
                right = indexlist[~goes_left]
                if len(left) <= 1 or len(right) <= 1:
                    continue
                gain = self._Gain(left, right, X_train, y_train)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _grow(self, X_train, y_train, indexlist, depth):
        """Recursively build one tree and record leaf values in ``self.f``."""
        feature = threshold = None
        if depth < self.max_depth:
            # Only search for a split when the node is still allowed to split.
            feature, threshold = self._best_split(X_train, y_train, indexlist)
        if feature is None:
            leaf_value = self._w(indexlist, y_train)
            self.f[indexlist] = leaf_value
            return leaf_value
        goes_left = X_train[indexlist, feature] <= threshold
        left_child = self._grow(X_train, y_train,
                                indexlist[goes_left], depth + 1)
        right_child = self._grow(X_train, y_train,
                                 indexlist[~goes_left], depth + 1)
        return {feature: {"<={}".format(threshold): left_child,
                          ">{}".format(threshold): right_child}}

    def fit(self, X_train):
        """Fit ``num_round`` trees on ``X_train`` against ``self.y_true``.

        Args:
            X_train: array-like of shape (n_samples, n_features).

        Returns:
            Tuple ``(haty, y_train, newtree)``: the final raw scores of the
            training samples, the residuals used by the last round, and the
            last tree (``None`` when ``num_round`` is 0).

        Raises:
            ValueError: if ``X_train`` is not 2-D, is empty, or its row count
                differs from the number of labels.
        """
        X_train = np.asarray(X_train)
        if X_train.ndim != 2:
            raise ValueError("X_train must be 2-D (n_samples, n_features)")
        n_samples = X_train.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        if self.y_true.shape[0] != n_samples:
            raise ValueError(
                "y_true has {} labels but X_train has {} rows".format(
                    self.y_true.shape[0], n_samples))

        # Start from the prior log-odds of the actual labels.
        self.init_score = self._prior_log_odds()
        # Refitting starts from scratch instead of stacking onto old trees.
        self.ensemble = []
        self.haty = np.full(n_samples, self.init_score, dtype=float)

        all_rows = np.arange(n_samples)
        y_train = self.y_true - self._sigmoid(self.haty)
        newtree = None
        for _ in range(self.num_round):
            self.f = np.empty(n_samples)
            # Negative gradient of the log-loss w.r.t. the raw score.
            y_train = self.y_true - self._sigmoid(self.haty)
            newtree = self._grow(X_train, y_train, all_rows, 0)
            self.ensemble.append(newtree)
            self.haty = self.haty + self.eta * self.f
        return self.haty, y_train, newtree

    def draw_one_tree(self, index):
        """Render tree number ``index`` of the ensemble with graphviz.

        Nodes are numbered in pre-order starting at 1.  Calls ``dot.view()``,
        which renders the graph and opens it in a viewer.
        """
        from graphviz import Digraph

        text_node = []
        text_edge = []
        last_id = [1]

        def collect(subtree, node_id):
            if not isinstance(subtree, dict):
                # A tree without any split is a single leaf.
                text_node.append([str(node_id), str(subtree)])
                return
            feature = next(iter(subtree))
            text_node.append([str(node_id), "feature:{}".format(feature)])
            for condition, child in subtree[feature].items():
                last_id[0] += 1
                child_id = last_id[0]
                text_edge.append([str(node_id), str(child_id), str(condition)])
                collect(child, child_id)

        collect(self.ensemble[index], last_id[0])
        dot = Digraph()
        for node_id, label in text_node:
            dot.node(node_id, label)
        for parent_id, child_id, label in text_edge:
            dot.edge(parent_id, child_id, label)
        dot.view()

    def predict(self, X_test):
        """Return the positive-class probability for every row of ``X_test``."""
        return np.array([self._predict(test) for test in X_test])

    @staticmethod
    def _tree_output(tree, test):
        """Walk one tree and return the leaf value reached by ``test``.

        Raises:
            ValueError: if a feature value satisfies neither branch (NaN).
        """
        node = tree
        while isinstance(node, dict):
            feature = next(iter(node))
            value = test[feature]
            for condition, child in node[feature].items():
                # Branch keys look like "<=1.5" / ">1.5"; the threshold is
                # parsed directly so no string is ever passed to eval().
                if condition.startswith("<="):
                    matched = value <= float(condition[2:])
                else:
                    matched = value > float(condition[1:])
                if matched:
                    node = child
                    break
            else:
                raise ValueError(
                    "feature {} value {!r} matches no branch".format(
                        feature, value))
        return node

    def _predict(self, test):
        """Return the positive-class probability for a single sample.

        Raises:
            AssertionError: if the ensemble is empty (``fit`` not run yet).
        """
        # Raised explicitly so the check survives ``python -O``.
        if not self.ensemble:
            raise AssertionError("fit before predict")
        res = 0
        for tree in self.ensemble:
            res += self._tree_output(tree, test) * self.eta
        res += self.init_score
        return self._sigmoid(res)
def main():
    """Compare the hand-written GBDT classifier with scikit-learn's.

    Builds a 10-sample, 2-feature toy dataset (6 positives, 4 negatives),
    fits both models with 4 depth-1 trees, prints their predicted
    probabilities, and writes every scikit-learn tree to ``class<j>.png``
    in the current working directory.
    """
    # Prepare the data.
    np.random.seed(28)
    x = np.random.randn(10, 2) * 5
    # The ``np.int`` alias was removed from NumPy; the builtin ``int`` is the
    # supported spelling.
    y = np.array([1] * 6 + [0] * 4).astype(int)
    y_true = y
    n = 4

    # Hand-written GBDT binary classifier.
    reg = Try_GBDT(num_round=n, y_true=y_true)
    reg.fit(x)
    # reg.draw_one_tree(3)  # uncomment to render the last tree with graphviz
    print('gbdtself_predict', reg.predict(x))

    # Reference implementation from scikit-learn.
    algox = GradientBoostingClassifier(n_estimators=n, max_depth=1)
    algox.fit(x, y_true)
    from sklearn import tree
    import pydotplus
    for j, stage in enumerate(algox.estimators_):
        # Each stage holds one regression tree for binary classification.
        dot_data = tree.export_graphviz(decision_tree=stage[0], out_file=None)
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_png('class%s.png' % (j))
    print('gbdtoffi_predict', algox.predict_proba(x))
main()
这里的大部分代码来自:xgboost手动实现 https://blog.csdn.net/weixin_44264662/article/details/100896521