Here we implement decision-tree construction for discrete-valued features only (plain ID3).
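For reference, these are the standard ID3 quantities that the code below computes: at each node it measures the Shannon entropy of the current samples and greedily splits on the feature with the largest information gain,

$$\mathrm{Ent}(D) = -\sum_{k} p_k \log_2 p_k, \qquad \mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v),$$

where $p_k$ is the fraction of samples in class $k$ and $D^v$ is the subset of $D$ whose feature $a$ takes value $v$. The functions calcShannonEnt, chooseBestFeatureToSplit, and splitDataSet below implement exactly these pieces. The full listing: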
from math import log
import operator  # needed by majorityCnt below
def creatDataSet():
    # 17 training samples: six discrete features (色泽, 根蒂, 敲声, 纹理, 脐部, 触感)
    # plus a yes/no label ('是' = good melon, '否' = not).
    dataset = [
        ['青绿','蜷缩','浊响','清晰','凹陷','硬滑','是'],
        ['乌黑','蜷缩','沉闷','清晰','凹陷','硬滑','是'],
        ['乌黑','蜷缩','浊响','清晰','凹陷','硬滑','是'],
        ['青绿','蜷缩','沉闷','清晰','凹陷','硬滑','是'],
        ['浅白','蜷缩','浊响','清晰','凹陷','硬滑','是'],
        ['青绿','稍蜷','浊响','清晰','稍凹','软粘','是'],
        ['乌黑','稍蜷','浊响','稍糊','稍凹','软粘','是'],
        ['乌黑','稍蜷','浊响','清晰','稍凹','硬滑','是'],
        ['乌黑','稍蜷','沉闷','稍糊','稍凹','硬滑','否'],
        ['青绿','硬挺','清脆','清晰','平坦','软粘','否'],
        ['浅白','硬挺','清脆','模糊','平坦','硬滑','否'],
        ['浅白','蜷缩','浊响','模糊','平坦','软粘','否'],
        ['青绿','稍蜷','浊响','稍糊','凹陷','硬滑','否'],
        ['浅白','稍蜷','沉闷','稍糊','凹陷','硬滑','否'],
        ['乌黑','稍蜷','浊响','清晰','稍凹','软粘','否'],
        ['浅白','蜷缩','浊响','模糊','平坦','硬滑','否'],
        ['青绿','蜷缩','沉闷','稍糊','稍凹','硬滑','否'],
    ]
    labels = ['色泽','根蒂','敲声','纹理','脐部','触感']
    return dataset, labels
def createTree(dataset, lables, featLables):
    # Collect the class label (last column, '是'/'否') of every sample.
    classList = [example[-1] for example in dataset]
    # If every remaining sample has the same class, return that class as a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # If only the label column is left (all features used up), return the majority class.
    if len(dataset[0]) == 1:
        return majorityCnt(classList)
    # Pick the feature with the highest information gain as this node's split.
    bestFeat = chooseBestFeatureToSplit(dataset)
    bestFeatlable = lables[bestFeat]   # its name, e.g. 色泽
    featLables.append(bestFeatlable)   # record the order in which features are chosen
    myTree = {bestFeatlable: {}}
    del lables[bestFeat]   # the chosen feature is no longer a candidate below this node
    # All values this feature takes in the current data, deduplicated.
    featValue = [example[bestFeat] for example in dataset]
    uniqueVals = set(featValue)
    for value in uniqueVals:
        sublables = lables[:]   # copy, so sibling branches do not interfere
        # Grow one subtree per feature value.
        myTree[bestFeatlable][value] = createTree(splitDataSet(dataset, bestFeat, value), sublables, featLables)
    return myTree
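# Note (added for clarity, not in the original post): the tree is stored as nested
# dicts of the form {feature_name: {feature_value: subtree_or_leaf}}. On this data
# 纹理 has the largest gain at the root, so the top level looks like
# {'纹理': {'清晰': {...}, '稍糊': {...}, '模糊': '否'}} -- all three 模糊 samples
# are '否', so that branch is already a leaf.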
def majorityCnt(classList):
    # Return the most common class label ('是' or '否') at this node.
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0   # first time we see this label
        classCount[vote] += 1
    sortedclassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclassCount[0][0]
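# Quick check (not in the original post): majorityCnt(['是', '是', '否']) returns '是'.
# sorted() is stable, so on a tie the label counted first wins.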
def calcShannonEnt(dataset):
    # Shannon entropy of the class labels in dataset.
    numexamples = len(dataset)
    labelCounts = {}
    for featVec in dataset:
        currentlabel = featVec[-1]
        if currentlabel not in labelCounts.keys():
            labelCounts[currentlabel] = 0   # first occurrence of this label
        labelCounts[currentlabel] += 1
    shannonEnt = 0
    for key in labelCounts:
        prop = float(labelCounts[key]) / numexamples   # probability of this label
        shannonEnt -= prop * log(prop, 2)              # Ent(D) = -sum p * log2(p)
    return shannonEnt
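# Worked check (derived from the data above): the full set has 8 '是' and 9 '否'
# samples, so calcShannonEnt(dataset) = -(8/17)*log2(8/17) - (9/17)*log2(9/17) ≈ 0.998.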
def chooseBestFeatureToSplit(dataset):
    # Return the index of the feature with the largest information gain.
    numFeatures = len(dataset[0]) - 1        # last column is the label
    baseEntropy = calcShannonEnt(dataset)    # entropy before splitting
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataset]   # this feature's column
        uniqueVals = set(featList)                       # its distinct values
        newEntropy = 0
        # Weighted entropy after splitting on feature i.
        for val in uniqueVals:
            subDataSet = splitDataSet(dataset, i, val)
            prob = len(subDataSet) / float(len(dataset))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
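# Sanity check (worked out by hand, not quoted from the original post): on the full
# dataset the gain is largest for 纹理 (index 3, gain ≈ 0.998 - 0.617 ≈ 0.381), so
# the root split should be on 纹理.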
def splitDataSet(dataset, axis, val):
    # Return the samples whose feature `axis` equals `val`, with that column removed.
    retDataSet = []
    for featVec in dataset:
        if featVec[axis] == val:
            reducedfeatVec = featVec[:axis]          # everything before the column...
            reducedfeatVec.extend(featVec[axis+1:])  # ...plus everything after it
            retDataSet.append(reducedfeatVec)
    return retDataSet
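# Example (derived from the data above): splitDataSet(dataset, 3, '清晰') returns the
# nine samples whose 纹理 is 清晰, each reduced to the remaining five features plus
# the label.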
if __name__ == '__main__':
    dataset, labels = creatDataSet()
    featLables = []
    myTree = createTree(dataset, labels, featLables)
    print(myTree)
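Once the tree is built, classifying a new sample is just a walk down the nested dicts. The helper below is not part of the original post; it is a minimal sketch that assumes you kept an unmodified copy of the feature-name list (createTree deletes entries from the list it is given) and that every value met along the path occurred in the training data, otherwise the dict lookup raises KeyError:

def classify(tree, featNames, sample):
    # Descend until we hit a leaf label ('是' or '否').
    node = tree
    while isinstance(node, dict):
        feat = next(iter(node))   # feature tested at this node
        node = node[feat][sample[featNames.index(feat)]]   # follow the matching branch
    return node

# Example: the first training sample should come back as '是', since ID3 on
# consistent data fits its training set exactly.
# classify(myTree, ['色泽','根蒂','敲声','纹理','脐部','触感'],
#          ['青绿','蜷缩','浊响','清晰','凹陷','硬滑'])   # -> '是'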
Running the script prints the tree as a nested dictionary: