\quad \quad 单层决策树(decision stump),也称决策树桩,它是一种简单的决策树,通过给定的阈值,进行分类。如下图所示(仅对 petal length 进行了判断):
优化目标:最低错误率
分类函数 stumpClassify():
\quad \quad 通过阈值比较对数据进行分类 。所有在阈值一边的数据会分到类别-1, 而在 另外一边的数据分到类别-1。
决策树buildStump():数据集上最佳决策树
\quad \quad 通过遍历,改变不同的阈值,计算最终的分类误差,找到分类误差最小的分类方式,即为我们要找的最佳单层决策树。
伪代码:
- 将最小错误率minError设为Inf(正无穷)
- 对于数据集每一个特征:(第一层循环)
- 对于数据集每一个步长:(第二层循环)
- 对每个不等号:(第三层循环)
建立一颗决策树并用加权数据集对它进行测试
如果错误率低于minError,将当前决策树设为最佳单层决策树- 返回最佳单层决策树
# 单层决策树分类函数,与阈值进行比较
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1
if threshIneq == 'lt':#如果分类标志为'lt',则分类规则:小于等于阈值,归类到-1;将不等号在大 、小 之间切换。
retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1
else:
retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1
return retArray
"""
Parameters:
dataArr - 数据矩阵
classLabels - 数据标签
D - 样本权重
Returns:
bestStump - 最佳单层决策树信息
minError - 最小误差
bestClasEst - 最佳的分类结果
"""
# 找到数据集上最佳的单层决策树
def buildStump(dataArr,classLabels,D):
dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T
m,n = np.shape(dataMatrix)
numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1)))#初始化步长,最佳单层决策树,最佳分类结果
minError = float('inf') #最小误差初始化为正无穷大
for i in range(n): #遍历所有特征
rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值
stepSize = (rangeMax - rangeMin) / numSteps #计算步长
for j in range(-1, int(numSteps) + 1):
for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than
threshVal = (rangeMin + float(j) * stepSize) #计算阈值
predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果
errArr = np.mat(np.ones((m,1))) #初始化误差矩阵
errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0
weightedError = D.T * errArr #计算误差
print("split: dim %d, threshVal %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
if weightedError < minError: #找到误差最小的分类方式
minError = weightedError
bestClasEst = predictedVals.copy()
bestStump['dim'] = i
bestStump['thresh'] = threshVal
bestStump['ineq'] = inequal
return bestStump,minError,bestClasEst
举一个简单的例子:
1、创建数据集
# 创建单层决策树的数据集
def loadSimpData():
datMat = np.matrix([[ 1. , 2.1],
[ 1.5, 1.6],
[ 1.3, 1. ],
[ 1. , 1. ],
[ 2. , 1. ]])
classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
return datMat,classLabels
2、基于上述数据集创建单层决策树
dataArr,classLabels = loadSimpData()
D = np.mat(np.ones((5, 1)) / 5)
bestStump,minError,bestClasEst = buildStump(dataArr,classLabels,D)
print('bestStump:\n', bestStump)
print('minError:\n', minError)
print('bestClasEst:\n', bestClasEst)
split: dim 0, threshVal 0.90, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 0.90, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 1.00, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 1.00, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 1.10, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 1.10, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 1.20, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 1.20, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 1.30, thresh ineqal: lt, the weighted error is 0.200
split: dim 0, threshVal 1.30, thresh ineqal: gt, the weighted error is 0.800
split: dim 0, threshVal 1.40, thresh ineqal: lt, the weighted error is 0.200
split: dim 0, threshVal 1.40, thresh ineqal: gt, the weighted error is 0.800
split: dim 0, threshVal 1.50, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 1.50, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 1.60, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 1.60, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 1.70, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 1.70, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 1.80, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 1.80, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 1.90, thresh ineqal: lt, the weighted error is 0.400
split: dim 0, threshVal 1.90, thresh ineqal: gt, the weighted error is 0.600
split: dim 0, threshVal 2.00, thresh ineqal: lt, the weighted error is 0.600
split: dim 0, threshVal 2.00, thresh ineqal: gt, the weighted error is 0.400
split: dim 1, threshVal 0.89, thresh ineqal: lt, the weighted error is 0.400
split: dim 1, threshVal 0.89, thresh ineqal: gt, the weighted error is 0.600
split: dim 1, threshVal 1.00, thresh ineqal: lt, the weighted error is 0.200
split: dim 1, threshVal 1.00, thresh ineqal: gt, the weighted error is 0.800
split: dim 1, threshVal 1.11, thresh ineqal: lt, the weighted error is 0.200
split: dim 1, threshVal 1.11, thresh ineqal: gt, the weighted error is 0.800
split: dim 1, threshVal 1.22, thresh ineqal: lt, the weighted error is 0.200
split: dim 1, threshVal 1.22, thresh ineqal: gt, the weighted error is 0.800
split: dim 1, threshVal 1.33, thresh ineqal: lt, the weighted error is 0.200
split: dim 1, threshVal 1.33, thresh ineqal: gt, the weighted error is 0.800
split: dim 1, threshVal 1.44, thresh ineqal: lt, the weighted error is 0.200
split: dim 1, threshVal 1.44, thresh ineqal: gt, the weighted error is 0.800
split: dim 1, threshVal 1.55, thresh ineqal: lt, the weighted error is 0.200
split: dim 1, threshVal 1.55, thresh ineqal: gt, the weighted error is 0.800
split: dim 1, threshVal 1.66, thresh ineqal: lt, the weighted error is 0.400
split: dim 1, threshVal 1.66, thresh ineqal: gt, the weighted error is 0.600
split: dim 1, threshVal 1.77, thresh ineqal: lt, the weighted error is 0.400
split: dim 1, threshVal 1.77, thresh ineqal: gt, the weighted error is 0.600
split: dim 1, threshVal 1.88, thresh ineqal: lt, the weighted error is 0.400
split: dim 1, threshVal 1.88, thresh ineqal: gt, the weighted error is 0.600
split: dim 1, threshVal 1.99, thresh ineqal: lt, the weighted error is 0.400
split: dim 1, threshVal 1.99, thresh ineqal: gt, the weighted error is 0.600
split: dim 1, threshVal 2.10, thresh ineqal: lt, the weighted error is 0.600
split: dim 1, threshVal 2.10, thresh ineqal: gt, the weighted error is 0.400
bestStump:
{‘dim’: 0, ‘thresh’: 1.3, ‘ineq’: ‘lt’}
minError:
[[0.2]]
bestClasEst:
[[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]]
\quad \quad 经过遍历,我们找到,训练好的最佳单层决策树的最小分类误差为0.2,就是对于该数据集,无论用什么样的单层决策树,分类误差最小就是0.2。
完整代码:
import numpy as np
"""
Parameters:
无
Returns:
dataMat - 数据矩阵
classLabels - 数据标签
"""
# 创建单层决策树的数据集
def loadSimpData():
datMat = np.matrix([[ 1. , 2.1],
[ 1.5, 1.6],
[ 1.3, 1. ],
[ 1. , 1. ],
[ 2. , 1. ]])
classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
return datMat,classLabels
"""
Parameters:
dataMatrix - 数据矩阵
dimen - 第dimen列,也就是第几个特征
threshVal - 阈值
threshIneq - 标志:有'lt':表示小于等于;'gt':表示大于等于
Returns:
retArray - 分类结果
"""
# 单层决策树分类函数,与阈值进行比较
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1
if threshIneq == 'lt':#如果分类标志为'lt',则分类规则:小于等于阈值,归类到-1;将不等号在大 、小 之间切换。
retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1
else:
retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1
return retArray
"""
Parameters:
dataArr - 数据矩阵
classLabels - 数据标签
D - 样本权重
Returns:
bestStump - 最佳单层决策树信息
minError - 最小误差
bestClasEst - 最佳的分类结果
"""
# 找到数据集上最佳的单层决策树
def buildStump(dataArr,classLabels,D):
dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T
m,n = np.shape(dataMatrix)
numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1)))#初始化步长,最佳单层决策树,最佳分类结果
minError = float('inf') #最小误差初始化为正无穷大
for i in range(n): #遍历所有特征
rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值
stepSize = (rangeMax - rangeMin) / numSteps #计算步长
for j in range(-1, int(numSteps) + 1):
for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than
threshVal = (rangeMin + float(j) * stepSize) #计算阈值
predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果
errArr = np.mat(np.ones((m,1))) #初始化误差矩阵
errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0
weightedError = D.T * errArr #计算误差
print("split: dim %d, threshVal %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
if weightedError < minError: #找到误差最小的分类方式
minError = weightedError
bestClasEst = predictedVals.copy()
bestStump['dim'] = i
bestStump['thresh'] = threshVal
bestStump['ineq'] = inequal
return bestStump,minError,bestClasEst
if __name__ == '__main__':
dataArr,classLabels = loadSimpData()
D = np.mat(np.ones((5, 1)) / 5)
bestStump,minError,bestClasEst = buildStump(dataArr,classLabels,D)
print('bestStump:\n', bestStump)
print('minError:\n', minError)
print('bestClasEst:\n', bestClasEst)
参考资料:
机器学习实战