from math import log
import operator


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in dataSet.

    Each element of dataSet is a list whose LAST item is the class label.
    Returns the entropy in bits.
    """
    numEntries = len(dataSet)  # number of samples
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # class label is the last column
        # count how many samples fall in each class
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # P(class)
        shannonEnt -= prob * log(prob, 2)  # accumulate -p * log2(p)
    return shannonEnt


def createDataSet1():
    """Create the sample weather dataset.

    Returns (dataSet, labels): 14 samples, each with four feature values
    (weather, temperature, humidity, wind) followed by a class label,
    plus the list of the four feature names.
    """
    dataSet = [['晴天', '高温', '中湿', '无风', '不宜'],
               ['晴天', '高温', '中湿', '有风', '不宜'],
               ['多云', '高温', '低湿', '无风', '适宜'],
               ['雨天', '低温', '高湿', '无风', '适宜'],
               ['雨天', '低温', '低湿', '无风', '适宜'],
               ['雨天', '低温', '低湿', '有风', '不宜'],
               ['多云', '低温', '低湿', '有风', '适宜'],
               ['晴天', '中温', '高湿', '无风', '不宜'],
               ['晴天', '低温', '低湿', '无风', '适宜'],
               ['雨天', '中温', '低湿', '无风', '适宜'],
               ['晴天', '中温', '低湿', '有风', '适宜'],
               ['多云', '中温', '中湿', '有风', '适宜'],
               ['多云', '高温', '低湿', '无风', '适宜'],
               ['雨天', '中温', '低湿', '有风', '不宜']]
    labels = ['天气', '温度', '湿度', '风况']  # the four feature names
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Return the subset of dataSet whose feature at index `axis` equals
    `value`, with that feature column removed from every row."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])  # drop the used column
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain
    (ID3 criterion), or -1 if no split reduces the entropy."""
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            # weighted entropy after splitting on feature i
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # entropy reduction for feature i
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class label in classList (majority vote)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    The tree maps feature name -> {feature value -> subtree or class label}.
    NOTE: mutates `labels` in place (deletes the chosen feature's name).
    """
    classList = [example[-1] for example in dataSet]  # all class labels
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all samples share one class: leaf node
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)  # best feature index
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # tree stored as nested dicts
    del labels[bestFeat]
    featValues = set(example[bestFeat] for example in dataSet)
    for value in featValues:
        subLabels = labels[:]  # copy so sibling branches are unaffected
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


if __name__ == '__main__':
    dataSet, labels = createDataSet1()  # build the sample data
    print(createTree(dataSet, labels))  # print the decision-tree model