Python 决策树预测模型：用 Python 实现天气决策树（ID3）模型

from math import log
import operator


def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the class labels in dataSet.

    Each row is a list whose last element is the class label; entropy is
    computed over the label distribution: H = -sum(p * log2(p)).
    """
    numEntries = len(dataSet)  # number of rows
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # class label is the last column
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # count occurrences of each class
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # class probability
        shannonEnt -= prob * log(prob, 2)  # accumulate -p * log2(p)
    return shannonEnt

def createDataSet1():
    """Build the sample weather dataset.

    Returns:
        dataSet: 14 rows; the first four columns are feature values
            (weather, temperature, humidity, wind) and the last column
            is the class label.
        labels: the four feature names, in column order.
    """
    dataSet = [['晴天', '高温', '中湿', '无风', '不宜'],
               ['晴天', '高温', '中湿', '有风', '不宜'],
               ['多云', '高温', '低湿', '无风', '适宜'],
               ['雨天', '低温', '高湿', '无风', '适宜'],
               ['雨天', '低温', '低湿', '无风', '适宜'],
               ['雨天', '低温', '低湿', '有风', '不宜'],
               ['多云', '低温', '低湿', '有风', '适宜'],
               ['晴天', '中温', '高湿', '无风', '不宜'],
               ['晴天', '低温', '低湿', '无风', '适宜'],
               ['雨天', '中温', '低湿', '无风', '适宜'],
               ['晴天', '中温', '低湿', '有风', '适宜'],
               ['多云', '中温', '中湿', '有风', '适宜'],
               ['多云', '高温', '低湿', '无风', '适宜'],
               ['雨天', '中温', '低湿', '有风', '不宜']]
    labels = ['天气', '温度', '湿度', '风况']  # the four feature names
    return dataSet, labels

def splitDataSet(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that column removed.

    Rows are copied (slice + extend), so the input dataset is not mutated.
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # drop column `axis` from the matching row
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the largest information gain (ID3).

    Returns -1 when no feature yields a positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)  # distinct values of feature i
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            # weighted entropy of the partition induced by feature i
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain of feature i
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Ties are broken by first appearance order (sorted() is stable).
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # label with the highest count

def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree stored as nested dicts.

    The tree has the form {featureName: {featureValue: subtree_or_label}}.

    NOTE(review): this mutates `labels` (the chosen feature name is
    deleted); callers that reuse the list should pass a copy.
    """
    classList = [example[-1] for example in dataSet]  # class labels
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all rows share one class: leaf node
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # tree is stored as nested dicts
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subLabels = labels[:]  # copy so sibling branches see the same labels
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

if __name__ == '__main__':
    # Build the sample weather dataset and print the learned decision tree.
    dataSet, labels = createDataSet1()
    print(createTree(dataSet, labels))

相关主题：Python 决策树预测模型