# Split the data set at a threshold: keep the rows at or below it.
def splitDataSet_bydata_font(dataSet, axis, value):
    """Return copies of the rows whose feature ``axis`` is ``<= value``.

    Args:
        dataSet: Rows to partition. A ``list`` is used as is; any other
            object is converted through its ``tolist()`` method when it has
            one, otherwise through ``list()``.
        axis: Index of the feature column that is compared.
        value: Threshold; a row is kept when ``row[axis] <= value``.

    Returns:
        A new list of new row lists. The compared column is NOT removed
        from the rows, so the same feature can be split on again later.
    """
    if not isinstance(dataSet, list):
        to_list = getattr(dataSet, "tolist", None)
        dataSet = to_list() if callable(to_list) else list(dataSet)
    # list(row) copies the full row, so callers never share row objects with
    # the result, and rows given as tuples are accepted as well.
    return [list(row) for row in dataSet if row[axis] <= value]
# Split the data set at a threshold: keep the rows strictly above it.
def splitDataSet_bydata_back(dataSet, axis, value):
    """Return copies of the rows whose feature ``axis`` is ``> value``.

    Args:
        dataSet: Rows to partition. A ``list`` is used as is; any other
            object is converted through its ``tolist()`` method when it has
            one, otherwise through ``list()``.
        axis: Index of the feature column that is compared.
        value: Threshold; a row is kept when ``row[axis] > value``.

    Returns:
        A new list of new row lists. The compared column is NOT removed
        from the rows, so the same feature can be split on again later.
    """
    if not isinstance(dataSet, list):
        to_list = getattr(dataSet, "tolist", None)
        dataSet = to_list() if callable(to_list) else list(dataSet)
    # list(row) copies the full row, so callers never share row objects with
    # the result, and rows given as tuples are accepted as well.
    return [list(row) for row in dataSet if row[axis] > value]
# Choose the best (feature, threshold) pair to split on.
def chooseBestData(dataset):
    """Find the feature index and threshold with the largest information gain.

    Each distinct value found in a feature column is tried as a threshold:
    the rows are split into ``<= value`` and ``> value`` and the weighted sum
    of ``calcShannonEnt`` over the two parts is subtracted from
    ``calcShannonEnt(dataset)``. ``calcShannonEnt`` is defined outside this
    block; the original comments describe its result as the information
    entropy.

    Args:
        dataset: Non-empty sequence of rows; every column except the last is
            treated as a feature, the last column as the class.

    Returns:
        ``(bestFeature, bestdata)``: the column index and threshold of the
        first candidate with the strictly largest gain, or ``(-1, 0)`` when
        no candidate has a gain above zero.
    """
    num = len(dataset[0]) - 1  # the last column is the class, not a feature
    baseEnt = calcShannonEnt(dataset)
    print("原本的信息熵", baseEnt)
    total = float(len(dataset))
    bestGain = 0.0
    bestFeature = -1
    bestdata = 0
    for i in range(num):
        # A repeated value yields exactly the same split as its first
        # occurrence and therefore can never beat the running best (the
        # comparison below is strict), so each value is evaluated only once,
        # in first-seen order. NOTE(review): relies on calcShannonEnt being
        # deterministic and on the feature values being hashable.
        seen = set()
        for example in dataset:
            value = example[i]
            if value in seen:
                continue
            seen.add(value)
            subDataSet_font = splitDataSet_bydata_font(dataset, i, value)
            subDataSet_back = splitDataSet_bydata_back(dataset, i, value)
            # Weight each side's entropy by its share of the rows.
            prob_font = len(subDataSet_font) / total
            prob_back = len(subDataSet_back) / total
            newEnt = (prob_font * calcShannonEnt(subDataSet_font)
                      + prob_back * calcShannonEnt(subDataSet_back))
            inforGain = baseEnt - newEnt
            if inforGain > bestGain:
                print("当前信息熵增益为:", inforGain, "当前最优特征为", i, "划分值为:", value)
                bestGain = inforGain
                bestFeature = i
                bestdata = value
    return bestFeature, bestdata
# Majority vote over class labels.
def majorityCnt(classList):
    """Return the label that occurs most often in ``classList``.

    Ties are resolved in favour of the label that appears first in
    ``classList``. The full ``(label, count)`` ranking is printed before
    returning.

    Args:
        classList: Non-empty iterable of hashable class labels.

    Returns:
        The most frequent label.

    Raises:
        IndexError: If ``classList`` is empty.
    """
    # Imported locally so the function does not depend on a module-level
    # import being present elsewhere in the file.
    from collections import Counter

    # most_common() sorts by count, descending, keeping first-seen order
    # among labels with equal counts.
    sortedClassCount = Counter(classList).most_common()
    print(sortedClassCount)
    return sortedClassCount[0][0]
def createTree(dataSet, labels, depth=None):
    """Recursively build a binary threshold decision tree as nested dicts.

    This single definition replaces two same-named functions (one without a
    depth limit, one with it); the later one shadowed the earlier, so the
    two-argument call form was unusable.

    Args:
        dataSet: Non-empty list of rows; the last element of each row is the
            class.
        labels: Feature names, indexed by feature column.
        depth: Maximum number of further split levels. ``0`` stops
            immediately with a majority vote; ``None`` (the default) means
            no limit.

    Returns:
        Either a class label (leaf) or ``{feature_label: {"<=T": subtree,
        ">T": subtree}}`` where ``T`` is ``str()`` of the chosen threshold.
    """
    classList = [example[-1] for example in dataSet]
    # Depth budget exhausted: stop with a majority vote.
    if depth == 0:
        return majorityCnt(classList)
    # All rows share one class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is present: no feature to split on.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat, bestData = chooseBestData(dataSet)
    # chooseBestData reports -1 when no split has a positive gain. Indexing
    # labels with -1 would silently pick the last label and produce a node
    # with no branches, so fall back to a majority-vote leaf instead.
    if bestFeat == -1:
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    print(bestFeat, bestData)
    newDataSet_font = splitDataSet_bydata_font(dataSet, bestFeat, bestData)
    newDataSet_back = splitDataSet_bydata_back(dataSet, bestFeat, bestData)
    print(newDataSet_font)

    newDepth = None if depth is None else depth - 1
    branches = {}
    if newDataSet_font:
        branches["<=" + str(bestData)] = createTree(newDataSet_font, labels, newDepth)
    if newDataSet_back:
        # The upper branch holds rows strictly above the threshold, so its
        # key is ">" rather than ">=".
        branches[">" + str(bestData)] = createTree(newDataSet_back, labels, newDepth)
    return {bestFeatLabel: branches}
if __name__ == '__main__':
    # Script entry point: build a tree with a depth budget of 3 from the
    # module-level `data` and `labels` (defined outside this section), then
    # pass the result to createPlot.
    mytree = createTree(data, labels, depth=3)
    createPlot(mytree)
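# Error reported when this script is run under Python 3:
#   AttributeError: 'dict' object has no attribute 'iteritems'
# dict.iteritems() exists only in Python 2; in Python 3 use dict.items() instead.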