import numpy as np
import pandas as pd


def createDataSet():
    """Return the classic 14-row 'play tennis' dataset and its column names.

    Returns:
        (dataSet, features): list of rows and the list of column names;
        the last column ('play') is the class label.
    """
    dataSet = [['sunny', 'hot', 'high', 'False', 'no'],
               ['sunny', 'hot', 'high', 'True', 'no'],
               ['overcast', 'hot', 'high', 'False', 'yes'],
               ['rain', 'mild', 'high', 'False', 'yes'],
               ['rain', 'cool', 'normal', 'False', 'yes'],
               ['rain', 'cool', 'normal', 'True', 'no'],
               ['overcast', 'cool', 'normal', 'True', 'yes'],
               ['sunny', 'mild', 'high', 'False', 'no'],
               ['sunny', 'cool', 'normal', 'False', 'yes'],
               ['rain', 'mild', 'normal', 'False', 'yes'],
               ['sunny', 'mild', 'normal', 'True', 'yes'],
               ['overcast', 'mild', 'high', 'True', 'yes'],
               ['overcast', 'hot', 'normal', 'False', 'yes'],
               ['rain', 'mild', 'high', 'True', 'no']]
    features = ['outlook', 'temperature', 'humidity', 'windy', 'play']
    return dataSet, features


def calcEnt(dataSet):
    """Shannon entropy of the last (label) column of *dataSet*.

    Args:
        dataSet: DataFrame whose last column holds class labels.
    Returns:
        float entropy, -sum(p * log2(p)) over the label distribution.
    """
    labels = dataSet.iloc[:, -1].to_list()  # label column as a plain list
    total = len(labels)
    entropy = 0.0
    for value in np.unique(labels):
        p = labels.count(value) / total  # fraction of samples with this label
        entropy -= p * np.log2(p)
    return entropy


def chooseBestFeatureToSplit(df):
    """Pick the feature (any column but the last) with maximal information gain.

    Args:
        df: DataFrame; last column is the class label.
    Returns:
        Name of the best feature, or '' when no feature yields a strictly
        positive information gain (callers must handle this sentinel).
    """
    baseEntropy = calcEnt(df)
    bestInfoGain = 0
    bestFeature = ''
    for feature in df.columns[:-1]:
        # Weighted entropy after splitting on this feature (conditional entropy).
        newEntropy = 0.0
        for branch in df[feature].unique():
            subdf = df[df[feature] == branch]  # rows falling into this branch
            newEntropy += len(subdf) / len(df) * calcEnt(subdf)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:  # keep the strictly best feature so far
            bestInfoGain = infoGain
            bestFeature = feature
    return bestFeature


def majorityCnt(classList):
    """Majority vote over *classList*.

    Ties break toward the lexicographically smallest label, matching a
    first-strictly-greater scan over the sorted unique labels.
    """
    return max(sorted(set(classList)), key=classList.count)


def createTree(dataSet):
    """Recursively build an ID3 decision tree as nested dicts.

    Tree shape: {feature: {branch_value: subtree_or_label, ...}}; a leaf is
    a bare class label string.

    Args:
        dataSet: DataFrame; last column is the class label, the rest are
            candidate split features.
    Returns:
        Nested-dict tree, or a label string for a leaf.
    """
    classList = dataSet.iloc[:, -1].to_list()
    class_val = np.unique(classList)
    if len(class_val) == 1:
        return class_val[0]  # pure node: every sample has the same label
    # Only the label column remains -> no features left to split on; vote.
    # (BUGFIX: was `len(columns) - 1 == 1`, which voted while one usable
    # feature still remained and so never split on the last feature.)
    if len(dataSet.columns) == 1:
        return majorityCnt(classList)
    bestFeature = chooseBestFeatureToSplit(dataSet)
    if bestFeature == '':
        # No feature gives positive information gain; a split would be
        # useless (and indexing df[''] would raise KeyError) -> vote.
        return majorityCnt(classList)
    myTree = {bestFeature: {}}  # chosen feature becomes the decision node
    # Recurse into each branch with the used feature column removed.
    for branch in dataSet[bestFeature].unique():
        df = dataSet[dataSet[bestFeature] == branch]
        subdf = df.drop(bestFeature, axis='columns')
        myTree[bestFeature][branch] = createTree(subdf)
    return myTree


if __name__ == '__main__':
    dataSet, features = createDataSet()  # build the demo dataset
    df = pd.DataFrame(data=dataSet, columns=features)
    print(createTree(df))  # print the learned decision tree