文章目录
-
- 一. 原理推导
- 二. python实现
-
- 1. 回归
-
- 1. 计算f初始值
- 2. 计算负梯度
- 3. 更新叶子节点值
- 4. 更新f值
- 5. 训练模型
- 6. 预测
- 2. 二分类gbdt
-
- 1.标签为 {0,1}的分类问题
-
- 1. 初始化f0值
- 2. 计算负梯度
- 3. 拟合叶子节点
- 4. 更新f值
- 5. 训练模型
- 6. 预测
- 2.标签为{-1,1}的分类问题
-
- 1. 初始化f0值
- 2. 计算负梯度
- 3. 拟合叶子节点
- 4. 更新f值
- 5. 训练模型
- 6. 预测
- 三. sklearn
GitHub 简单实现
一. 原理推导
二. python实现
1. 回归
1. 计算f初始值
def initFValue(self, targets):
    """Reset the running model output to zero for every training sample.

    :param targets: iterable with one entry per training sample; only the
        number of entries is used, the values themselves are ignored.
    :return: None. The zero list is stored in ``self.fValue``.
    """
    zeros = []
    for _ in targets:
        zeros.append(0)
    self.fValue = zeros
2. 计算负梯度
def calaResidual(self, yReal, fValue):
    """Return the element-wise residual ``yReal - fValue``.

    :param yReal: observed target values.
    :param fValue: current model outputs, paired with ``yReal`` by position.
    :return: list of ``real - predicted`` differences. Pairing stops at the
        shorter of the two inputs because ``zip`` is used.
    """
    # The residual list must be returned: the training loop uses it as the
    # regression target of the next tree.
    return [real - predicted for real, predicted in zip(yReal, fValue)]
3. 更新叶子节点值
def updateLeafValue(self, tree, dataSet, target):
    """Leaf-refinement hook that deliberately does nothing.

    All arguments are accepted and ignored, so ``tree`` is left exactly as
    it was passed in.

    :param tree: fitted tree (unused).
    :param dataSet: training rows (unused).
    :param target: training targets (unused).
    :return: None.
    """
    return None
4. 更新f值
def updateFValue(self, dataSet, tree):
    """Add ``tree``'s prediction for every training row to ``self.fValue``.

    :param dataSet: training rows, indexed in step with ``self.fValue``.
    :param tree: tree structure handed to ``cart.cartReg().predict0``.
    :return: None. ``self.fValue`` is updated in place.
    """
    for pos, _ in enumerate(self.fValue):
        row = dataSet[pos]
        # A new cartReg instance is created for each row, as before.
        increment = cart.cartReg().predict0(row, tree)
        self.fValue[pos] += increment
5. 训练模型
def fit(self, dataSet, target):
    """Train the boosted ensemble, adding ``self.nTrees`` trees in sequence.

    Each round computes the residual targets from the current
    ``self.fValue``, builds a tree on them, lets ``updateLeafValue`` adjust
    that tree, folds the tree into ``self.fValue`` and records the tree and
    the current loss.

    :param dataSet: training rows.
    :param target: training targets, one per row.
    :return: ``self``, so calls can be chained.
    """
    self.initFValue(target)
    for _ in range(self.nTrees):
        newTarget = self.calaResidual(target, self.fValue)
        # Progress trace of the first residual. Written with parentheses so
        # the line is valid on both Python 2 and Python 3.
        print(newTarget[0])
        tree = cart.cartReg().buildTree(dataSet, newTarget, self.depth, self.leafSize)
        self.updateLeafValue(tree, dataSet, newTarget)
        self.updateFValue(dataSet, tree)
        self.allTrees.append(tree)
        self.lost.append(self.calaLost(target))
    return self
6. 预测
def predict(self, predList):
    """Return the summed output of every stored tree for one sample.

    :param predList: feature vector handed to ``cart.cartReg().predict0``.
    :return: float sum over ``self.allTrees``; ``0.0`` when no tree is stored.
    """
    return sum(
        (cart.cartReg().predict0(predList, tree) for tree in self.allTrees),
        0.0,
    )
2. 二分类gbdt
1.标签为 {0,1}的分类问题
1. 初始化f0值
def init_f_value(self, target):
    """Initialise ``self.fValue`` with the log-odds of the positive class.

    Every sample receives the same starting score
    ``log(sum(target) / sum(1 - y))``.

    :param target: sequence of 0/1 labels.
    :return: ``self``.
    :raises ZeroDivisionError: if ``sum(1 - y)`` is zero (for example no
        label equals 0, or ``target`` is empty).
    :raises ValueError: if the ratio is not positive (for example no label
        equals 1), because its logarithm is undefined.
    """
    # Force float division: with integer labels on Python 2 the ratio would
    # otherwise be truncated, usually to 0, and log(0) would fail.
    positives = float(sum(target))
    negatives = sum(1 - y for y in target)
    log_odds = math.log(positives / negatives)
    # The value is identical for all samples, so it is computed once.
    self.fValue = [log_odds for _ in target]
    return self
2. 计算负梯度
@staticmethod
def sigmod(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of ``z``.

    :param z: real-valued score.
    :return: float in ``[0.0, 1.0]``.
    """
    # Only ever exponentiate a non-positive number: math.exp raises
    # OverflowError for large positive arguments, which the single-formula
    # version hit whenever z was strongly negative.
    if z >= 0:
        return 1.0 / (1.0 + math.exp(-z))
    grown = math.exp(z)
    return grown / (1.0 + grown)
def computer_residual(self, target):
    """Return ``y - sigmod(f)`` for each label and its current score.

    :param target: labels, paired by position with ``self.fValue``.
    :return: list of residuals, one per paired element.
    """
    residuals = []
    for label, score in zip(target, self.fValue):
        residuals.append(label - self.sigmod(score))
    return residuals
3. 拟合叶子节点
def calculate_leaf_value(self, target, index):
    """Compute one leaf's value and apply it to the samples in that leaf.

    The value is ``sum(y - p) / sum(p * (1 - p))`` over the leaf's samples,
    where ``p = sigmod(f)`` for each sample's current score. The value is
    then added to ``self.fValue`` for those samples via ``update_f_value``.

    :param target: labels for the whole training set.
    :param index: positions of the samples that fall into this leaf.
    :return: the leaf value; ``0.0`` when the denominator vanishes.
    """
    numerator = 0.0
    denominator = 0.0
    for i in index:
        p = self.sigmod(self.fValue[i])
        numerator += target[i] - p
        denominator += p * (1 - p)
    # An empty leaf, or probabilities saturated at 0/1, make the denominator
    # zero. Use a neutral step instead of raising ZeroDivisionError.
    if abs(denominator) < 1e-150:
        value = 0.0
    else:
        value = numerator / denominator
    self.update_f_value(value, index)
    return value
def update_leaf_value(self, tree, data_set, target, index=None):
    """Recompute every leaf of ``tree`` from the samples routed to it.

    The tree is walked recursively. At each node ``self.spilt_data`` splits
    the rows that reached the node; a child that is a ``dict`` is descended
    into, any other child is a leaf and is overwritten with
    ``calculate_leaf_value``.

    :param tree: node dict with keys ``'spIndex'``, ``'spValue'``,
        ``'left'`` and ``'right'``; modified in place.
    :param data_set: rows that reached this node.
    :param target: labels for the whole training set (never subset).
    :param index: position in the original training set of each row of
        ``data_set``. Defaults to ``0..len(data_set)-1``, which is correct
        for the top-level call on the full data set.
    :return: None.
    """
    if index is None:
        index = list(range(len(data_set)))
    # spilt_data returns positions relative to the rows it was given (they
    # are used below to index data_set).
    local_left, local_right = self.spilt_data(data_set, tree['spIndex'], tree['spValue'])
    for side, local in (('left', local_left), ('right', local_right)):
        rows = [data_set[i] for i in local]
        # Map subset positions back to original positions. target and
        # self.fValue are indexed over the whole training set, so using the
        # subset positions directly would pick the wrong samples below the
        # root.
        original = [index[i] for i in local]
        if isinstance(tree[side], dict):
            self.update_leaf_value(tree[side], rows, target, original)
        else:
            tree[side] = self.calculate_leaf_value(target, original)
4. 更新f值
def update_f_value(self, value, index):
    """Add ``value`` to ``self.fValue`` at each position listed in ``index``.

    :param value: increment to apply.
    :param index: iterable of positions; a repeated position is incremented
        once per occurrence.
    :return: None. ``self.fValue`` is updated in place.
    """
    for pos in index:
        self.fValue[pos] = self.fValue[pos] + value
5. 训练模型
def fit(self, data_set, target):
    """Train the {0,1} classifier, adding ``self.n_trees`` trees in sequence.

    Each round fits a tree to the current residuals, lets
    ``update_leaf_value`` overwrite its leaves, stores the tree and appends
    the result of ``compute_lost`` to ``self.losts``.

    :param data_set: training rows.
    :param target: 0/1 labels, one per row.
    :return: ``self``, so calls can be chained.
    """
    self.init_f_value(target)
    for _ in range(self.n_trees):
        new_target = self.computer_residual(target)
        tree = cart.cartReg().buildTree(data_set, new_target, self.depth, self.leaf_size)
        # The original labels are passed here, not the residuals.
        self.update_leaf_value(tree, data_set, target)
        self.all_trees.append(tree)
        lost = self.compute_lost(target)
        self.losts.append(lost)
    return self
6. 预测
def predict_prob(self, predict_vector):
    """Return ``sigmod`` of the summed tree outputs for one sample.

    :param predict_vector: feature vector handed to
        ``cart.cartReg().predict0``.
    :return: whatever ``self.sigmod`` returns for the accumulated score.
    """
    # NOTE(review): the sum starts from 0.0, so the constant starting score
    # that init_f_value puts into fValue is not included here. Confirm
    # whether that offset should be added at prediction time.
    score = sum(
        (cart.cartReg().predict0(predict_vector, tree) for tree in self.all_trees),
        0.0,
    )
    return self.sigmod(score)
def predict(self, predict_vector, thr=0.5):
    """Classify one sample as 1 or 0 by thresholding its probability.

    :param predict_vector: feature vector forwarded to ``predict_prob``.
    :param thr: decision threshold; the comparison is strict, so a
        probability equal to ``thr`` yields 0.
    :return: ``1`` if the probability exceeds ``thr``, otherwise ``0``.
    """
    return 1 if self.predict_prob(predict_vector) > thr else 0
2.标签为{-1,1}的分类问题
1. 初始化f0值
def initFValue(self, target):
    """Set ``self.fValue`` to ``t / (|t| * (2 - |t|))`` for each label ``t``.

    For labels of magnitude 1 the expression equals the label itself.

    :param target: iterable of labels.
    :return: None. The result is stored in ``self.fValue``.
    """
    # NOTE(review): this gives every sample its own starting score derived
    # from its label rather than one shared constant. Confirm it is intended.
    start = []
    for label in target:
        size = abs(label)
        start.append(label / (size * (2 - size)))
    self.fValue = start
2. 计算负梯度
def calaResidual(self, yReal, fValue):
    """Return ``2*y / (1 + exp(2*y*f))`` for each label/score pair.

    :param yReal: labels.
    :param fValue: current scores, paired with ``yReal`` by position.
    :return: list of floats, one per paired element.
    """
    residuals = []
    for y, f in zip(yReal, fValue):
        margin = 2 * y * f
        if margin > 0:
            # Same quantity rewritten with exp(-margin): math.exp raises
            # OverflowError for large positive arguments, while this form
            # underflows harmlessly to 0.
            shrunk = math.exp(-margin)
            residuals.append(2.0 * y * shrunk / (1 + shrunk))
        else:
            residuals.append(2.0 * y / (1 + math.exp(margin)))
    return residuals
3. 拟合叶子节点
def calaLeafValue(self, target):
    """Return the leaf value ``sum(t) / sum(|t| * (2 - |t|))``.

    :param target: values of the samples that fall into the leaf.
    :return: the ratio above, or ``0.0`` when the denominator is zero (for
        example an empty leaf).
    """
    denominator = 0.0
    for t in target:
        denominator += abs(t) * (2 - abs(t))
    # The zero check used to fall through to the division and raise
    # ZeroDivisionError. A zero denominator now yields a neutral leaf.
    if denominator == 0:
        return 0.0
    return sum(target) / denominator
def updateLeafValue(self, tree, dataSet, target):
    """Recompute every leaf of ``tree`` from the targets routed to it.

    The rows and targets are split with ``cart.cartReg.spiltData`` at this
    node. A child that is a ``dict`` is descended into with its share of the
    data; any other child is a leaf and is overwritten with
    ``calaLeafValue`` of its share of the targets.

    :param tree: node dict with keys ``'spIndex'``, ``'spValue'``,
        ``'left'`` and ``'right'``; modified in place.
    :param dataSet: rows that reached this node.
    :param target: targets paired with ``dataSet``.
    :return: None.
    """
    subLeft, subRight, tarLeft, tarRight = cart.cartReg.spiltData(
        dataSet, target, tree['spIndex'], tree['spValue'])
    branches = (('left', subLeft, tarLeft), ('right', subRight, tarRight))
    for side, rows, values in branches:
        if isinstance(tree[side], dict):
            self.updateLeafValue(tree[side], rows, values)
        else:
            tree[side] = self.calaLeafValue(values)
4. 更新f值
def updateFValue(self, dataSet, tree):
    """Fold ``tree`` into the running scores held in ``self.fValue``.

    For each position in ``self.fValue`` the value returned by
    ``cart.cartReg().predict0`` for the matching row is added.

    :param dataSet: training rows, indexed in step with ``self.fValue``.
    :param tree: tree structure handed to ``predict0``.
    :return: None. ``self.fValue`` is updated in place.
    """
    count = len(self.fValue)
    for pos in range(count):
        # A new cartReg instance is created for each row, as before.
        self.fValue[pos] += cart.cartReg().predict0(dataSet[pos], tree)
5. 训练模型
def fit(self, dataSet, target):
    """Train the {-1,1} classifier, adding ``self.nTrees`` trees in sequence.

    Each round computes new targets from the current ``self.fValue``, builds
    a tree on them, lets ``updateLeafValue`` overwrite its leaves, folds the
    tree into ``self.fValue`` and records the tree and the current loss.

    :param dataSet: training rows.
    :param target: labels, one per row.
    :return: ``self``, so calls can be chained.
    """
    self.initFValue(target)
    for _ in range(self.nTrees):
        newTarget = self.calaResidual(target, self.fValue)
        # Progress trace of the first value. Written with parentheses so the
        # line is valid on both Python 2 and Python 3.
        print(newTarget[0])
        tree = cart.cartReg().buildTree(dataSet, newTarget, self.depth, self.leafSize)
        self.updateLeafValue(tree, dataSet, newTarget)
        self.updateFValue(dataSet, tree)
        self.allTrees.append(tree)
        self.lost.append(self.calaLost(target))
    return self
6. 预测
def predict(self, predList, threshold=0.5):
    """Classify one sample as 1 or -1 by thresholding its probability.

    :param predList: feature vector forwarded to ``predict_prob``.
    :param threshold: decision threshold; the comparison is inclusive, so a
        probability equal to ``threshold`` yields 1.
    :return: ``1`` if the probability is at least ``threshold``, else ``-1``.
    """
    return 1 if self.predict_prob(predList) >= threshold else -1
def predict_prob(self, predList):
    """Return the logistic sigmoid of the parent class's raw prediction.

    :param predList: feature vector forwarded to the parent ``predict``.
    :return: ``1 / (1 + exp(-value))`` as a float in ``[0.0, 1.0]``.
    """
    value = super(GBDTClissity, self).predict(predList)
    # NOTE(review): calaResidual uses exp(2*y*f) but no factor 2 is applied
    # here. Confirm the intended probability scale.
    # Only ever exponentiate a non-positive number: math.exp raises
    # OverflowError for large positive arguments, which happened whenever
    # value was strongly negative.
    if value >= 0:
        return 1 / (1 + math.exp(-value))
    grown = math.exp(value)
    return grown / (1 + grown)
三. sklearn