Decision Tree Implementation: ID3/C4.5 Generation, Pruning, and Prediction

This implementation follows the treatment in Li Hang's Statistical Learning Methods rather than the watermelon book, so it does not cover the watermelon book's "pre-pruning" and "post-pruning". The pruning algorithm is Algorithm 5.4 of Statistical Learning Methods; when evaluating the loss function, the code computes it for the entire tree rather than implementing the "local" computation the book mentions.
The tree-generation part draws on the post 决策树python源码实现(含预剪枝和后剪枝) (a Python decision-tree implementation with pre- and post-pruning).
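
For reference, pruning minimizes the book's loss function, which the code evaluates over the whole tree (with base-2 logarithms):

$$C_\alpha(T) = \sum_{t=1}^{|T|} N_t H_t(T) + \alpha|T|, \qquad H_t(T) = -\sum_{k} \frac{N_{tk}}{N_t} \log_2 \frac{N_{tk}}{N_t}$$

where $|T|$ is the number of leaves, $N_t$ is the number of training samples at leaf $t$, and $N_{tk}$ is the number of those samples in class $k$. A retraction is kept whenever the pruned tree's loss is no larger than the original tree's.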

"""
实现决策树模型,生成决策树包括ID3,C4.5
剪枝算法实现一般剪枝,即比较损失函数

ID3和C4.5决策树要求数据集是离散的,以课本数据集和西瓜数据集展示生成,剪枝和预测


"""
import json
import numpy as np
from collections import Counter
import copy


# Create the dataset: the loan-application data of Table 5.1 in Li Hang's Statistical Learning Methods
def createDataLH():
	data = np.array([['青年', '否', '否', '一般']
						, ['青年', '否', '否', '好']
						, ['青年', '是', '否', '好']
						, ['青年', '是', '是', '一般']
						, ['青年', '否', '否', '一般']
						, ['中年', '否', '否', '一般']
						, ['中年', '否', '否', '好']
						, ['中年', '是', '是', '好']
						, ['中年', '否', '是', '非常好']
						, ['中年', '否', '是', '非常好']
						, ['老年', '否', '是', '非常好']
						, ['老年', '否', '是', '好']
						, ['老年', '是', '否', '好']
						, ['老年', '是', '否', '非常好']
						, ['老年', '否', '否', '一般']])
	label = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'])
	name = np.array(['年龄', '有工作', '有房子', '信贷情况'])
	return data, label, name


# Create watermelon dataset 2.0 from the watermelon book
def createDataXG20():
	data = np.array([['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
						, ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
						, ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
						, ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑']
						, ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']
						, ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
						, ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘']
						, ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑']
						, ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑']
						, ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘']
						, ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑']
						, ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘']
						, ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑']
						, ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑']
						, ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘']
						, ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑']
						, ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑']])
	label = np.array(['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否'])
	name = np.array(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'])
	return data, label, name



class Node():
	def __init__(self, label, FeatureName, ChildrenNodeDic, Data, FatherNode):
		"""
		:param label: 该结点所预测的label
		:param FeatureName: 生成子结点依据的特征是哪个,叶子结点是None
		:param ChildrenNodeDic: 字典,内容是{生成这个子结点的依据特征取值:子结点},叶子结点是None
		:param Data: 该叶结点包含的数据集
		"""
		self.label = label
		self.FeatureName = FeatureName
		self.ChildrenNodeDic = ChildrenNodeDic
		self.Data = Data
		self.FatherNode = FatherNode
		self.VisualData = {
			"predicted label": self.label,
			# Data can't go in here: numpy arrays would make default=lambda obj: obj.VisualData raise
			"feature used to split the children": self.FeatureName,
			"children": self.ChildrenNodeDic,
			# FatherNode can't go in here either: the circular reference would make serialization recurse forever
		}

	def DeleteChildren(self):
		"""
		删除此结点的子结点,并更新可视化数据,之前实现是一直忘记更新可视化数据,一度崩溃.
		:return:
		"""
		self.ChildrenNodeDic = None
		self.VisualData = {
			"此结点预测标签:": self.label,
			# "此结点存储数据:":self.Data, 这条也不能写,不然 default=lambda obj: obj.VisualData 会报错
			"此结点孩子结点的划分特征:": self.FeatureName,
			"此结点的孩子结点:": self.ChildrenNodeDic,
			# 不能写FatherNode,不然会循环调用
		}

	def __repr__(self):
		return json.dumps(self, indent=3, default=lambda obj: obj.VisualData, ensure_ascii=False)


class ClassificationDecisionTree():
	def __repr__(self):
		return str(self.TreeRoot)

	def __init__(self, X, y, names, method, epsilon, alpha, Purning=True):
		"""
		:param X: 特征数据集
		:param y: 标签数据集
		:param method: 构建决策树的方法
		:param names: 特征名称
		:param epsilon: 信息熵阈值
		:param alpha: 剪枝的惩罚参数,如果不剪枝可以随便取值
		:param Purning: 是否剪枝
		"""
		X = np.asarray(X)
		y = np.asarray(y).reshape(-1, 1)
		self.OriginalData = np.concatenate([X, y], axis=1)  # column-concatenate features and labels into the single "dataset" the book's algorithms operate on
		self.Originalnames = names
		self.TreeRoot = self.BuildTree(self.OriginalData, self.Originalnames, method, epsilon)
		self.FeatureIndexDic = {FeatureName: FeatureIndex for FeatureIndex, FeatureName in
								enumerate(names)}  # feature names are for visualization; feature indices are for prediction
		if Purning:
			self.TreeRoot = self.DecidePurning(self.TreeRoot, alpha, float("inf"))  # prune; the initial Loss is set to infinity

	def BuildTree(self, Data, names, method, epsilon):
		# entry point of the recursion: build the root, whose dataset is the full dataset and whose parent is None
		return self.CreateNode(Data, names, method, epsilon, None)

	@staticmethod
	def GetHeatFunction(Data):
		"""
		计算数据集的经验熵
		:param Data: 数据集
		:return: 经验熵
		"""
		data_size = Data.shape[0]
		y = Data[:, -1]
		labels, labels_count = np.unique(y, return_counts=True)
		# 计算C_k/D
		P_iList = labels_count / data_size
		return -np.sum(P_iList * np.log2(P_iList))
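	# Sanity check against the book's worked example: Table 5.1 has 9 '是' and
	# 6 '否' labels, so H(D) = -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971.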

	@staticmethod
	def GetFeatureHeatFunction(FeatureIndex, Data):
		"""
		计算数据集关于某个特征的熵,实际上就是FectureVector的熵,FectureVector是数据集的第FeatureIndex列
		这里函数参数设置为了与书契合而这样做,实际上可以将此函数与计算熵的函数合并
		:param FeatureIndex: 特征索引
		:param Data: 数据集
		:return: 数据集关于特征的熵
		"""
		data_size = Data.shape[0]
		FectureVector = Data[:, FeatureIndex]
		FeatureValues, FeatureValuesCount = np.unique(FectureVector, return_counts=True)
		# 计算D_i/D,即P_i
		P_iList = FeatureValuesCount / data_size
		return -np.sum(P_iList * np.log2(P_iList))
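	# For example, on Table 5.1 the feature 年龄 takes each of its three values
	# 5 times out of 15, so H_A(D) = log2(3) ≈ 1.585; C4.5 divides the information
	# gain 0.083 by this, giving a gain ratio of about 0.052.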

	def GetConditionalHeatFunction(self, FeatureIndex, Data):
		"""
		计算某个特征对数据集的条件熵
		:param FeatureIndex: 特征在数据集中的索引
		:param Data: 数据集
		:return: 条件熵
		"""
		data_size = Data.shape[0]
		FeatureVector = Data[:, FeatureIndex]  # 特征列
		FeatureValues = set(FeatureVector)
		# print(FeatureValues)
		ConditionalHeatFunction = 0
		for FeatureValue in FeatureValues:
			# FeatureVector==FeatureValue)
			Data_i = Data[FeatureVector == FeatureValue, :]
			ConditionalHeatFunction += (Data_i.shape[0] / data_size) * self.GetHeatFunction(Data_i)
		return ConditionalHeatFunction
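	# Worked example from the book (Table 5.1): splitting on 年龄 gives
	# H(D|A) ≈ 0.888 and hence gain 0.971 - 0.888 ≈ 0.083; the gains for 有工作,
	# 有房子 and 信贷情况 come out to 0.324, 0.420 and 0.363, so ID3 splits on
	# 有房子 first.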

	@staticmethod
	def SplitData(Data, FeatureIndex):
		"""
		删除数据集中的FeatureIndex列(重要!!!),并且根据特征分割数据集,注意这里要先取特征列,再删除
		:param Data: 数据集
		:param FeatureIndex:特征
		:return: 分割后的数据集,结构是字典,结构是{特征取值:这个特征取值的子数据集}
		"""

		FeatureVector = Data[:, FeatureIndex]
		Data = np.delete(Data, FeatureIndex, axis=1)
		# print("FeatureVector={}".format(FeatureVector))
		FeatureValues = set(FeatureVector)
		SplitedDataDic = {}
		for FeatureValue in FeatureValues:
			SplitedDataDic[FeatureValue] = Data[FeatureVector == FeatureValue, :]
		return SplitedDataDic
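	# For instance, splitting Table 5.1 on 年龄 returns {'青年': 5x4 sub-dataset,
	# '中年': 5x4 sub-dataset, '老年': 5x4 sub-dataset}: the 年龄 column is removed
	# and the label column travels with each sub-dataset.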

	def CreateNode(self, Data, FeatureNames, method, epsilon, FatherNode):
		"""
		依据数据集创建结点
		:param Data: 数据集,前n-1列是特征数据集,最后一列是标签数据集
		:param FeatureNames: 特征集名称
		:param method: 构建结点方法,ID3,C4.5
		:param epsilon: 信息熵阈值
		:param FatherNode: 本次创建结点的父结点,有父结点这一属性,方便剪枝时向上走
		:return: 创建的结点
		"""

		if len(set(Data[:, -1])) == 1:  # 标签唯一
			return Node(Data[0, -1], None, None, Data, FatherNode)
		if FeatureNames is None:  # 待选特征集是空集
			return Node(Counter(Data[:, -1][0]).most_common(1)[0][0], None, None, Data, FatherNode)
		# select the optimal feature
		BestFeatureIndex = -1
		BestGain = -1
		TmptGain = -1
		HeatFunction = self.GetHeatFunction(Data)  # empirical entropy; it does not depend on the feature, so compute it once
		for FeatureIndex in range(Data.shape[1] - 1):  # number of feature columns = dataset columns minus the label column
			ConditionalHeatFunction = self.GetConditionalHeatFunction(FeatureIndex, Data)
			Gain = HeatFunction - ConditionalHeatFunction
			if method == "ID3":  # compare information gain
				TmptGain = Gain
			elif method == "C4.5":  # compare information gain ratio
				FeatureHeatFunction = self.GetFeatureHeatFunction(FeatureIndex, Data)
				if FeatureHeatFunction == 0:  # the feature takes a single value here, so splitting on it is useless (and would divide by zero)
					continue
				TmptGain = Gain / FeatureHeatFunction
			if TmptGain > BestGain:
				BestGain = TmptGain
				BestFeatureIndex = FeatureIndex
		if BestGain < epsilon:
			return Node(Counter(Data[:, -1]).most_common(1)[0][0], None, None, Data, FatherNode)
		BestFeatureName = FeatureNames[BestFeatureIndex]
		# print("BestFeatureName={}".format(BestFeatureName))
		FeatureNames = np.delete(FeatureNames, BestFeatureIndex)
		SplitedDataDic = self.SplitData(Data, BestFeatureIndex)
		# print(SplitedDataDic)
		ChildrenNodeDic = {}
		ThisNode = Node(Counter(Data[:, -1]).most_common(1)[0][0], BestFeatureName, ChildrenNodeDic, Data, FatherNode)
		for FeatureValue in SplitedDataDic.keys():
			# the parent of each of these children is the node built in this call
			node = self.CreateNode(SplitedDataDic[FeatureValue], FeatureNames, method, epsilon, ThisNode)
			ChildrenNodeDic[FeatureValue] = node
		return ThisNode

	def predict(self, x):
		"""
		根据特征,预测label
		:param x: 特征
		:return: label
		"""
		# 根据特征一直走到叶结点,输出叶结点label
		node = self.TreeRoot
		while node.ChildrenNodeDic is not None:
			# 根据FeatureName得到原始样本的FeatureIndex
			FeatureIndex = self.FeatureIndexDic[node.FeatureName]
			if x[FeatureIndex] in node.ChildrenNodeDic.keys():
				node = node.ChildrenNodeDic[x[FeatureIndex]]
			else:
				return node.label  	# 出现没有在训练集中出现的特征取值,直接返回这个结点的预测
		return node.label

	def score(self, X, y):
		X = np.asarray(X)
		y = np.asarray(y)
		cnt = 0
		for index, x in enumerate(X):
			if self.predict(x) == y[index]:
				cnt += 1
		return cnt / (X.shape[0])

	def GetLeaveNodes(self, tree, LeaveNodeList):
		"""
		将空列表作为参数传入,结束之后传入的列表内有tree的叶子结点
		:param tree:待查树
		:param LeaveNodeList:存储叶子结点的空列表
		:return:无,直接修改参数LeaveNodeList
		"""
		if tree:
			if tree.ChildrenNodeDic is None:
				LeaveNodeList.append(tree)
			# print(tree)
			else:
				for ChildrenNode in tree.ChildrenNodeDic.values():
					self.GetLeaveNodes(ChildrenNode, LeaveNodeList)

	def GetLoss(self, node, alpha):
		"""
		得到node为根的子树的损失函数值,与书中的公式对应
		:param node: 子树根
		:param alpha: 惩罚参数
		:return: 损失函数值
		"""
		LeaveNodeList = []
		self.GetLeaveNodes(node, LeaveNodeList)
		NumOfLeaveNode = len(LeaveNodeList)
		Loss = 0
		for ThisNode in LeaveNodeList:
			H_t = self.GetHeatFunction(ThisNode.Data)
			N_t = ThisNode.Data.shape[0]
			Loss += N_t * H_t
		Loss += alpha * NumOfLeaveNode
		return Loss
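	# A rough check of the pruning economics on Table 5.1: the unpruned ID3 tree
	# has 3 pure leaves, so its loss is 0 + 3*alpha. Retracting the 有工作 node
	# (9 samples, 3 '是' / 6 '否', entropy ≈ 0.918) leaves 2 leaves with loss
	# ≈ 9*0.918 + 2*alpha ≈ 8.26 + 2*alpha, so that prune only pays off once
	# alpha exceeds roughly 8.26; with alpha=2 below, the pruned and unpruned
	# trees for this dataset should come out identical.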

	def GetPurningTree(self, tree, Data):
		"""
		找到tree中存储Data的那个结点,将其子结点删除
		:param tree: 待查找的tree
		:param Data: 用于标明结点身份的信息,数据集
		:return: 删除数据集是Data的那个结点后的树
		"""
		# 在剪枝时可能因为遍历找父结点时已经进行了剪枝,而发生找不到的情况,
		# 暂时想不到好的办法使得剪枝时的叶结点从高层向低层遍历,因此剪枝时可能会有直接从上面开始判断剪不剪的情况
		if tree is None:
			# print("找不到要删除子结点的父结点")
			pass
		elif tree.Data.shape == Data.shape and (tree.Data == Data).all():
			# print("找到了要删除子结点的父结点")
			# print("删除前tree={}".format(tree))
			tree.DeleteChildren()
			# print("删除后tree={}".format(tree))
			return tree

		elif tree.ChildrenNodeDic is not None:
			# 还不是叶结点并且此结点也不是要删除的结点,继续往下查找
			ChildrenNodeDic = tree.ChildrenNodeDic
			for ChildrenNodeKey in ChildrenNodeDic.keys():
				# 如果值不相等,说明不是要找的结点,对树的子结点进行查找删除,并将删除后的子树挂到原来树上
				tree.ChildrenNodeDic[ChildrenNodeKey] = self.GetPurningTree(ChildrenNodeDic[ChildrenNodeKey], Data)
		return tree

	def DecidePurning(self, tree, alpha, Loss):
		"""
		从叶子结点出发,向上回缩,判断回缩前和回缩后的损失函数,来判断是否剪枝
		:param tree: 待决定剪枝的树,也是当前能够获得的最优的树
		:param alpha: 惩罚参数
		:param Loss: 上一次剪枝完成后的损失,由于是从下往上回缩,如果
		上一次剪枝完成后的损失和这次剪枝完成后的损失一样,即此次的叶子结点
		都没有被剪枝,即此次的tree已经是最优的,再剪叶子结点就会增大损失,此时停止剪枝
		:return: 剪枝后的树
		"""
		LeaveNodeList = []
		self.GetLeaveNodes(tree, LeaveNodeList)
		if len(LeaveNodeList) == 1:
			# only a single node remains; nothing left to prune
			return tree
		# find the distinct parents of the leaves: pruning happens at a leaf's parent
		FatherNodeSet = set()
		for ThisNode in LeaveNodeList:
			FatherNodeSet.add(ThisNode.FatherNode)

		# try retracting each group of leaves into its parent and compare the loss
		# functions to decide whether to keep the change
		ThisTimeBestLoss = -1  # smallest loss reached in this pruning pass
		for FatherNode in FatherNodeSet:
			SameTree = copy.deepcopy(tree)
			# each node's dataset is unique, so the dataset identifies the node to process
			Data = FatherNode.Data
			PurningTree = self.GetPurningTree(SameTree, Data)
			# compare the loss functions
			PurningLoss = self.GetLoss(PurningTree, alpha)
			BeforeLoss = self.GetLoss(tree, alpha)
			print("considering node {}\nloss after pruning: PurningLoss={}, loss before pruning: BeforeLoss={}".format(FatherNode, PurningLoss, BeforeLoss))
			if PurningLoss <= BeforeLoss:
				# pruning does not increase the loss: keep the pruned tree and record the current smallest loss
				print("pruning happened")
				tree = PurningTree
				print("tree after this prune = {}".format(PurningTree))
				ThisTimeBestLoss = PurningLoss
		if ThisTimeBestLoss == Loss:
			# this pass's best loss equals the previous one: nothing was changed, the tree is optimal
			return tree
		else:
			# prune the pruned tree again
			return self.DecidePurning(tree, alpha, ThisTimeBestLoss)


lhData, lhLabel, lhName = createDataLH()
xgData, xgLabel, xgName = createDataXG20()


alpha = 2  # penalty parameter
Purninglhtree = ClassificationDecisionTree(lhData, lhLabel, lhName,
										   "ID3", epsilon=0.4, alpha=alpha, Purning=True)  # pruning happens when the penalty is set high; with a low penalty the result equals the unpruned tree
NotPurninglhtree = ClassificationDecisionTree(lhData, lhLabel, lhName, "ID3",
											  epsilon=0.4, alpha=None, Purning=False)
print("Unpruned lhtree={}".format(NotPurninglhtree))
print("Pruned lhtree={}".format(Purninglhtree))

# the watermelon tree needs a small epsilon (0 here); a large threshold would stop splitting at the root
Purningxgtree = ClassificationDecisionTree(xgData, xgLabel, xgName,
										   "ID3", epsilon=0, alpha=alpha, Purning=True)  # the penalty is set high enough that pruning occurs, to contrast with the unpruned tree
NotPurningxgtree = ClassificationDecisionTree(xgData, xgLabel, xgName, "ID3",
											  epsilon=0, alpha=None, Purning=False)
print("Unpruned xgtree={}".format(NotPurningxgtree))
print("Pruned xgtree={}".format(Purningxgtree))

