This implementation follows the exposition in Li Hang's 《统计学习方法》 (Statistical Learning Methods) rather than the watermelon book (周志华《机器学习》), so it does not cover the watermelon book's "pre-pruning" and "post-pruning". The pruning procedure is Algorithm 5.4 of 《统计学习方法》; when comparing loss functions it recomputes the loss of the whole tree directly instead of performing the book's "local" computation.
The tree-generation part references the post 决策树python源码实现(含预剪枝和后剪枝) (a Python decision-tree implementation with pre- and post-pruning).
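For reference, the loss function that the pruning step compares is the book's: for a tree T with leaves t = 1, ..., |T|, where leaf t holds N_t samples with empirical entropy H_t(T),

C_\alpha(T) = \sum_{t=1}^{|T|} N_t H_t(T) + \alpha |T|

The GetLoss method below evaluates exactly this quantity over the leaves of a (sub)tree; a larger \alpha penalizes trees with more leaves, which is why a high \alpha triggers pruning in the demo at the bottom.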
"""
A decision-tree model: tree generation via ID3 and C4.5.
The pruning algorithm is the basic one, i.e. comparing loss functions before and after a cut.
ID3 and C4.5 require discrete features; generation, pruning and prediction are demonstrated
on the textbook (loan) dataset and the watermelon dataset.
"""
import json
import numpy as np
from collections import Counter
import copy
# Create the dataset of Table 5.1 (loan application data) in Li Hang's 《统计学习方法》
def createDataLH():
    data = np.array([['青年', '否', '否', '一般']
                     , ['青年', '否', '否', '好']
                     , ['青年', '是', '否', '好']
                     , ['青年', '是', '是', '一般']
                     , ['青年', '否', '否', '一般']
                     , ['中年', '否', '否', '一般']
                     , ['中年', '否', '否', '好']
                     , ['中年', '是', '是', '好']
                     , ['中年', '否', '是', '非常好']
                     , ['中年', '否', '是', '非常好']
                     , ['老年', '否', '是', '非常好']
                     , ['老年', '否', '是', '好']
                     , ['老年', '是', '否', '好']
                     , ['老年', '是', '否', '非常好']
                     , ['老年', '否', '否', '一般']])
    label = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'])
    name = np.array(['年龄', '有工作', '有房子', '信贷情况'])
    return data, label, name
# Create watermelon dataset 2.0 from the watermelon book
def createDataXG20():
data = np.array([['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
, ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
, ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
, ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
, ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
, ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
, ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘']
, ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑']
, ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑']
, ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘']
, ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑']
, ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘']
, ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑']
, ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑']
, ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
, ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑']
, ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑']])
label = np.array(['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否'])
name = np.array(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'])
return data, label, name
class Node():
    def __init__(self, label, FeatureName, ChildrenNodeDic, Data, FatherNode):
        """
        :param label: the label this node predicts
        :param FeatureName: the feature used to split into children; None for a leaf
        :param ChildrenNodeDic: dict of {feature value that produced the child: child node}; None for a leaf
        :param Data: the data subset held by this node
        :param FatherNode: parent node; None for the root
        """
        self.label = label
        self.FeatureName = FeatureName
        self.ChildrenNodeDic = ChildrenNodeDic
        self.Data = Data
        self.FatherNode = FatherNode
        self.RefreshVisualData()
    def RefreshVisualData(self):
        # Rebuild the dict that __repr__ dumps as JSON.
        # Data must not be included, or default=lambda obj: obj.VisualData would fail on it;
        # FatherNode must not be included either, or the dump would recurse forever.
        self.VisualData = {
            "predicted label of this node:": self.label,
            "feature used to split this node's children:": self.FeatureName,
            "children of this node:": self.ChildrenNodeDic,
        }
    def DeleteChildren(self):
        """
        Delete this node's children and refresh the visualization data.
        (An earlier version kept forgetting to refresh it, which was maddening to debug.)
        """
        self.ChildrenNodeDic = None
        self.RefreshVisualData()
    def __repr__(self):
        return json.dumps(self, indent=3, default=lambda obj: obj.VisualData, ensure_ascii=False)
class ClassificationDecisionTree():
    def __repr__(self):
        return str(self.TreeRoot)
    def __init__(self, X, y, names, method, epsilon, alpha, Pruning=True):
        """
        :param X: feature data
        :param y: label data
        :param names: feature names
        :param method: tree-building method, "ID3" or "C4.5"
        :param epsilon: information-gain threshold; below it a node becomes a leaf
        :param alpha: penalty parameter for pruning; any value will do when not pruning
        :param Pruning: whether to prune
        """
        X = np.asarray(X)
        y = np.asarray(y).reshape(-1, 1)
        # concatenate features and labels column-wise so the array matches the book's notion of "the dataset"
        self.OriginalData = np.concatenate([X, y], axis=1)
        self.Originalnames = names
        self.TreeRoot = self.BuildTree(self.OriginalData, self.Originalnames, method, epsilon)
        # feature names make the visualization readable; feature indices are what predict() needs
        self.FeatureIndexDic = {FeatureName: FeatureIndex for FeatureIndex, FeatureName in
                                enumerate(names)}
        if Pruning:
            self.TreeRoot = self.DecidePruning(self.TreeRoot, alpha, float("inf"))  # prune, with the initial Loss set to infinity
    def BuildTree(self, Data, names, method, epsilon):
        # entry point of the recursion: build the root, whose dataset is the full dataset and whose parent is None
        return self.CreateNode(Data, names, method, epsilon, None)
    @staticmethod
    def GetEntropy(Data):
        """
        Compute the empirical entropy of a dataset
        :param Data: dataset
        :return: empirical entropy
        """
        data_size = Data.shape[0]
        y = Data[:, -1]
        labels, labels_count = np.unique(y, return_counts=True)
        # compute |C_k| / |D|
        P_iList = labels_count / data_size
        return -np.sum(P_iList * np.log2(P_iList))
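    # A quick sanity check (my addition, following 例5.2 in the book): the loan data has
    # 9 positive and 6 negative labels, so
    #     H(D) = -(9/15) * log2(9/15) - (6/15) * log2(6/15) ≈ 0.971,
    # which is what GetEntropy returns on the full loan dataset.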
    @staticmethod
    def GetFeatureEntropy(FeatureIndex, Data):
        """
        Compute the entropy of the dataset with respect to a feature, which is simply the
        entropy of the feature column (column FeatureIndex of the dataset). The parameters are
        set up this way to match the book; the function could be merged with GetEntropy.
        :param FeatureIndex: feature index
        :param Data: dataset
        :return: entropy of the dataset with respect to the feature
        """
        data_size = Data.shape[0]
        FeatureVector = Data[:, FeatureIndex]
        FeatureValues, FeatureValuesCount = np.unique(FeatureVector, return_counts=True)
        # compute |D_i| / |D|, i.e. P_i
        P_iList = FeatureValuesCount / data_size
        return -np.sum(P_iList * np.log2(P_iList))
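    # For example (my addition): the loan data's 年龄 feature takes 3 values with 5 samples each, so
    #     H_A(D) = -3 * (5/15) * log2(5/15) = log2(3) ≈ 1.585,
    # the denominator C4.5 uses when turning the gain on 年龄 into a gain ratio.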
    def GetConditionalEntropy(self, FeatureIndex, Data):
        """
        Compute the conditional entropy of the dataset given a feature
        :param FeatureIndex: index of the feature in the dataset
        :param Data: dataset
        :return: conditional entropy
        """
        data_size = Data.shape[0]
        FeatureVector = Data[:, FeatureIndex]  # the feature column
        FeatureValues = set(FeatureVector)
        ConditionalEntropy = 0
        for FeatureValue in FeatureValues:
            # D_i: the subset whose feature takes this value
            Data_i = Data[FeatureVector == FeatureValue, :]
            ConditionalEntropy += (Data_i.shape[0] / data_size) * self.GetEntropy(Data_i)
        return ConditionalEntropy
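    # Worked example (my addition, matching 例5.2): splitting the loan data on 年龄 gives three
    # subsets of 5 samples with entropies ≈ 0.971, 0.971 and 0.722, so
    #     H(D|A) = (5/15) * (0.971 + 0.971 + 0.722) ≈ 0.888,
    # and the information gain is g(D, A) = 0.971 - 0.888 ≈ 0.083.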
    @staticmethod
    def SplitData(Data, FeatureIndex):
        """
        Delete column FeatureIndex from the dataset (important!) and split the dataset by the
        feature's values. Note that the feature column must be read before it is deleted.
        :param Data: dataset
        :param FeatureIndex: feature index
        :return: the split datasets as a dict of {feature value: sub-dataset for that value}
        """
        FeatureVector = Data[:, FeatureIndex]
        Data = np.delete(Data, FeatureIndex, axis=1)
        FeatureValues = set(FeatureVector)
        SplitedDataDic = {}
        for FeatureValue in FeatureValues:
            SplitedDataDic[FeatureValue] = Data[FeatureVector == FeatureValue, :]
        return SplitedDataDic
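    # For instance (my addition): SplitData(loan data, 0) returns a dict keyed by
    # {'青年', '中年', '老年'}, each value a 5-row array in which the 年龄 column is gone,
    # so the recursion below never reconsiders an already-used feature.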
    def CreateNode(self, Data, FeatureNames, method, epsilon, FatherNode):
        """
        Create a node from a dataset
        :param Data: dataset; the first n-1 columns are features, the last column is the label
        :param FeatureNames: names of the candidate features
        :param method: node-building method, "ID3" or "C4.5"
        :param epsilon: information-gain threshold
        :param FatherNode: parent of the node being created; keeping this pointer makes it easy to walk upward during pruning
        :return: the created node
        """
        if len(set(Data[:, -1])) == 1:  # all labels identical
            return Node(Data[0, -1], None, None, Data, FatherNode)
        if FeatureNames is None or len(FeatureNames) == 0:  # the candidate feature set is empty
            return Node(Counter(Data[:, -1]).most_common(1)[0][0], None, None, Data, FatherNode)
        # select the best feature
        BestFeatureIndex = -1
        BestGain = -1
        Entropy = self.GetEntropy(Data)  # the dataset's entropy does not depend on the feature, so compute it once
        for FeatureIndex in range(Data.shape[1] - 1):  # number of feature columns = columns of the dataset minus the label column
            ConditionalEntropy = self.GetConditionalEntropy(FeatureIndex, Data)
            Gain = Entropy - ConditionalEntropy
            if method == "ID3":  # compare information gain
                TmpGain = Gain
            elif method == "C4.5":  # compare information gain ratio
                FeatureEntropy = self.GetFeatureEntropy(FeatureIndex, Data)
                # a single-valued feature has zero entropy (and zero gain); skip it to avoid dividing by zero
                if FeatureEntropy == 0:
                    continue
                TmpGain = Gain / FeatureEntropy
            if TmpGain > BestGain:
                BestGain = TmpGain
                BestFeatureIndex = FeatureIndex
        if BestGain < epsilon:
            return Node(Counter(Data[:, -1]).most_common(1)[0][0], None, None, Data, FatherNode)
        BestFeatureName = FeatureNames[BestFeatureIndex]
        FeatureNames = np.delete(FeatureNames, BestFeatureIndex)
        SplitedDataDic = self.SplitData(Data, BestFeatureIndex)
        ChildrenNodeDic = {}
        ThisNode = Node(Counter(Data[:, -1]).most_common(1)[0][0], BestFeatureName, ChildrenNodeDic, Data, FatherNode)
        for FeatureValue in SplitedDataDic.keys():
            # the parent of each child is the node built in this call
            node = self.CreateNode(SplitedDataDic[FeatureValue], FeatureNames, method, epsilon, ThisNode)
            ChildrenNodeDic[FeatureValue] = node
        return ThisNode
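    # On the loan data this reproduces the tree of 例5.3 in the book (my addition): 有房子 has
    # the largest gain (0.420), so it becomes the root split, and within the no-house branch
    # 有工作 is chosen next (gain 0.918), after which every node is pure.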
    def predict(self, x):
        """
        Predict the label from the features
        :param x: feature vector
        :return: label
        """
        # walk down from the root following the features until a leaf, then output the leaf's label
        node = self.TreeRoot
        while node.ChildrenNodeDic is not None:
            # map the FeatureName back to the feature's index in the original sample
            FeatureIndex = self.FeatureIndexDic[node.FeatureName]
            if x[FeatureIndex] in node.ChildrenNodeDic.keys():
                node = node.ChildrenNodeDic[x[FeatureIndex]]
            else:
                return node.label  # a feature value never seen in training: return this node's prediction directly
        return node.label
    def score(self, X, y):
        """
        Accuracy of the tree on a labelled dataset
        :param X: feature data
        :param y: label data
        :return: fraction of samples predicted correctly
        """
        X = np.asarray(X)
        y = np.asarray(y)
        cnt = 0
        for index, x in enumerate(X):
            if self.predict(x) == y[index]:
                cnt += 1
        return cnt / (X.shape[0])
    def GetLeafNodes(self, tree, LeafNodeList):
        """
        Pass in an empty list; on return it holds the leaf nodes of tree
        :param tree: the tree to search
        :param LeafNodeList: empty list that will receive the leaf nodes
        :return: nothing; LeafNodeList is modified in place
        """
        if tree:
            if tree.ChildrenNodeDic is None:
                LeafNodeList.append(tree)
            else:
                for ChildrenNode in tree.ChildrenNodeDic.values():
                    self.GetLeafNodes(ChildrenNode, LeafNodeList)
    def GetLoss(self, node, alpha):
        """
        Loss of the subtree rooted at node, matching the book's formula
        C_alpha(T) = sum_t N_t * H_t(T) + alpha * |T|
        :param node: subtree root
        :param alpha: penalty parameter
        :return: loss value
        """
        LeafNodeList = []
        self.GetLeafNodes(node, LeafNodeList)
        NumOfLeafNode = len(LeafNodeList)
        Loss = 0
        for ThisNode in LeafNodeList:
            H_t = self.GetEntropy(ThisNode.Data)
            N_t = ThisNode.Data.shape[0]
            Loss += N_t * H_t
        Loss += alpha * NumOfLeafNode
        return Loss
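    # Sanity check (my addition): collapsing the loan tree all the way to a single leaf gives
    #     Loss = 15 * 0.971 + alpha * 1 ≈ 14.57 + alpha,
    # which is the kind of value pruning compares against the loss of the expanded tree.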
    def GetPruningTree(self, tree, Data):
        """
        Find the node of tree that stores Data and delete its children
        :param tree: the tree to search
        :param Data: the dataset that identifies the node
        :return: the tree after the node storing Data has had its children deleted
        """
        # The node may fail to be found: while iterating over parent nodes, an earlier cut may
        # already have removed it. I have not found a good way to make the pass visit leaves
        # strictly bottom-up, so a cut may end up being tested from higher in the tree.
        if tree is None:
            return None
        if tree.Data.shape == Data.shape and (tree.Data == Data).all():
            # found the parent whose children are to be deleted
            tree.DeleteChildren()
        elif tree.ChildrenNodeDic is not None:
            # neither the target node nor a leaf: keep searching downward, hanging each
            # (possibly pruned) subtree back onto the tree; non-matching leaves are returned unchanged
            for ChildrenNodeKey in tree.ChildrenNodeDic.keys():
                tree.ChildrenNodeDic[ChildrenNodeKey] = self.GetPruningTree(tree.ChildrenNodeDic[ChildrenNodeKey], Data)
        return tree
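    # Example (my addition): calling GetPruningTree with Data equal to the loan tree's
    # 有工作 node's 9-sample subset collapses that node into a leaf predicting 否,
    # the majority of its 3 是 / 6 否 samples.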
    def DecidePruning(self, tree, alpha, Loss):
        """
        Starting from the leaves, retract upward and compare the loss before and after each
        retraction to decide whether to prune
        :param tree: the tree to consider pruning; also the best tree obtained so far
        :param alpha: penalty parameter
        :param Loss: loss after the previous pruning pass. Since retraction goes bottom-up, if
                     the loss after this pass equals the previous one, none of this pass's leaves
                     were pruned, i.e. tree is already optimal and any further cut would increase
                     the loss, so pruning stops
        :return: the pruned tree
        """
        LeafNodeList = []
        self.GetLeafNodes(tree, LeafNodeList)
        if len(LeafNodeList) == 1:
            # only a single node: no leaves left to prune
            return tree
        # collect the distinct parents of the leaves; pruning happens at a leaf's parent
        FatherNodeSet = set()
        for ThisNode in LeafNodeList:
            FatherNodeSet.add(ThisNode.FatherNode)
        # try pruning at each group of leaves (retracting them into their parent), compare the
        # loss functions, and decide whether to keep the cut
        ThisTimeBestLoss = -1  # the smallest loss reached in this pass
        for FatherNode in FatherNodeSet:
            SameTree = copy.deepcopy(tree)
            # each node stores a unique dataset, so the dataset identifies the node to process
            Data = FatherNode.Data
            PruningTree = self.GetPruningTree(SameTree, Data)
            # compare the loss functions
            PruningLoss = self.GetLoss(PruningTree, alpha)
            BeforeLoss = self.GetLoss(tree, alpha)
            print("considering node {}\nloss after pruning: PruningLoss={}, loss before pruning: BeforeLoss={}".format(FatherNode, PruningLoss, BeforeLoss))
            if PruningLoss <= BeforeLoss:
                # the cut does not increase the loss: keep the pruned tree and record the current best loss
                print("pruning happened")
                tree = PruningTree
                print("tree after this cut = {}".format(PruningTree))
                ThisTimeBestLoss = PruningLoss
        if ThisTimeBestLoss == Loss:
            # the best loss of this pass equals the previous one, so nothing changed: already optimal
            return tree
        else:
            # prune the pruned tree again
            return self.DecidePruning(tree, alpha, ThisTimeBestLoss)
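    # Worked decision (my addition, for the loan tree with alpha = 2): all three leaves are
    # pure (H_t = 0), so BeforeLoss = 3 * alpha = 6; collapsing the 有工作 node gives
    # 9 * 0.918 + 2 * alpha ≈ 12.26 > 6, so that cut is rejected.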
lhData, lhLabel, lhName = createDataLH()
xgData, xgLabel, xgName = createDataXG20()
alpha = 2  # penalty parameter
# pruning kicks in when the penalty parameter is set high; when it is low the result equals the unpruned tree
Pruninglhtree = ClassificationDecisionTree(lhData, lhLabel, lhName,
                                           "ID3", epsilon=0.4, alpha=alpha, Pruning=True)
NotPruninglhtree = ClassificationDecisionTree(lhData, lhLabel, lhName, "ID3",
                                              epsilon=0.4, alpha=None, Pruning=False)
print("unpruned lhtree={}".format(NotPruninglhtree))
print("pruned lhtree={}".format(Pruninglhtree))
# the watermelon tree needs a low gain threshold (epsilon=0 here), otherwise only the root remains;
# the penalty parameter is set relatively high to force pruning, so the contrast with the unpruned tree shows
Pruningxgtree = ClassificationDecisionTree(xgData, xgLabel, xgName,
                                           "ID3", epsilon=0, alpha=alpha, Pruning=True)
NotPruningxgtree = ClassificationDecisionTree(xgData, xgLabel, xgName, "ID3",
                                              epsilon=0, alpha=None, Pruning=False)
print("unpruned xgtree={}".format(NotPruningxgtree))
print("pruned xgtree={}".format(Pruningxgtree))