西瓜数据集.jpg
@生成分类字典
# -*- coding: UTF-8 -*-
#设置默认编码,否则中文会乱码
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from math import log
#1、获取样例集和属性列表
def filetodataset(filename):
    """Load a comma-separated sample file into a dataset and a feature list.

    The first line of the file is treated as a header. Its last column is
    dropped from the returned names, while every data row is kept whole, so
    each sample still carries its last column (used as the class label by
    the other functions in this script).

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(dataset, featname)`` tuple. ``dataset`` is a list of rows, each
        a list of strings; ``featname`` is the header without its last
        column.

    Raises:
        IndexError: If the file contains no lines at all.
    """
    # The context manager closes the file even when reading fails; the
    # previous version never closed the handle.
    with open(filename, 'r') as fr:
        all_lines = fr.readlines()
    featname = all_lines[0].strip().split(',')[:-1]
    # Blank lines (e.g. a trailing newline at the end of the file) would
    # otherwise turn into bogus one-element samples.
    dataset = [line.strip().split(',') for line in all_lines[1:] if line.strip()]
    return dataset, featname
#2、计算香农熵
def calcent(dataset):
    """Return the base-2 Shannon entropy of the class labels in ``dataset``.

    The class label of a sample is its last element.

    Args:
        dataset: Sequence of samples (indexable sequences).

    Returns:
        The entropy as a float; ``0.0`` for an empty dataset.
    """
    # Local import so the function does not depend on a file-level import.
    from collections import Counter

    num = len(dataset)
    counts = Counter(sample[-1] for sample in dataset)
    shannon = 0.0
    for count in counts.values():
        prob = float(count) / num
        shannon -= prob * log(prob, 2)
    return shannon
#3、按指定属性的指定取值筛选样例,并去掉该属性列,得到的子数据集留待计算香农熵
def splitdataset(dataset, axis, value):
    """Select the samples whose column ``axis`` equals ``value``.

    The matching column is removed from every returned sample, so the rows
    of the result are one element shorter than the input rows.

    Args:
        dataset: List of samples, each a list.
        axis: Index of the column to test and drop.
        value: Value the column must equal for a sample to be kept.

    Returns:
        A new list of new row lists; the input is not modified.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataset
        if row[axis] == value
    ]
#4、选择最佳的划分属性
def choosebestfeaturetosplit(dataset):
    """Return the index of the feature column with the largest information gain.

    For every column except the last one (the class column), the dataset is
    split by each distinct value of that column and the size-weighted
    entropy of the parts is subtracted from the entropy of the whole set.

    Args:
        dataset: Non-empty list of samples whose last element is the class.

    Returns:
        The column index with the highest gain, or ``-1`` when no column
        yields a gain strictly greater than zero.
    """
    column_count = len(dataset[0])
    base_entropy = calcent(dataset)
    total = float(len(dataset))
    best_gain, best_index = 0.0, -1
    # The last column holds the class label, so it is never a candidate.
    for index in range(column_count - 1):
        conditional_entropy = 0
        for value in set([row[index] for row in dataset]):
            subset = splitdataset(dataset, index, value)
            subset_entropy = calcent(subset)
            weight = len(subset) / total
            conditional_entropy += weight * subset_entropy
        gain = base_entropy - conditional_entropy
        if gain > best_gain:
            best_gain, best_index = gain, index
    return best_index
#5、返回样例中类最多的那个类别
def majorclass(data):
    """Return the class label that occurs most often in ``data``.

    The class label of a sample is its last element. The previous version
    returned the whole sorted list of ``(label, count)`` pairs, which ended
    up as a leaf value in the tree instead of a single label.

    Args:
        data: Sequence of samples (indexable sequences).

    Returns:
        The most frequent label, or ``None`` when ``data`` is empty.
    """
    # Local import so the function does not depend on a file-level import.
    from collections import Counter

    labels = [sample[-1] for sample in data]
    if not labels:
        return None
    # most_common(1) yields [(label, count)]; only the label is wanted.
    return Counter(labels).most_common(1)[0][0]
#6、生成决策树
def createtree(mydata, labels):
    """Recursively build a decision tree as nested dictionaries.

    Each internal node has the form ``{feature_label: {value: subtree}}``;
    a leaf is whatever is returned for a finished branch (a class label, or
    the result of ``majorclass``).

    Args:
        mydata: Non-empty list of samples whose last element is the class.
        labels: Feature labels, one per feature column of ``mydata``. The
            list is not modified (the previous version deleted entries from
            the caller's list).

    Returns:
        A class label when the branch is finished, otherwise a nested dict.
    """
    # Case 1: every sample has the same class -> that class is the leaf.
    class_labels = [sample[-1] for sample in mydata]
    if len(set(class_labels)) == 1:
        return class_labels[0]
    # Case 2: only the class column is left -> fall back to the majority.
    if len(mydata[0]) == 1:
        return majorclass(mydata)
    # Case 3: split on the feature with the best information gain.
    bestfeature = choosebestfeaturetosplit(mydata)
    if bestfeature == -1:
        # No feature improves on the current entropy. Indexing with -1
        # would split on the class column itself and create a bogus node,
        # so stop here with the majority class instead.
        return majorclass(mydata)
    bestfeaturelabel = labels[bestfeature]
    # Build a new label list without the chosen feature instead of deleting
    # from the caller's list.
    sublabels = labels[:bestfeature] + labels[bestfeature + 1:]
    mytree = {bestfeaturelabel: {}}
    for value in set(sample[bestfeature] for sample in mydata):
        mytree[bestfeaturelabel][value] = createtree(
            splitdataset(mydata, bestfeature, value), sublabels)
    return mytree
if __name__ == '__main__':
    import json

    # Dataset path: the first command-line argument when given, otherwise
    # the original hard-coded location.
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = '/Users/enniu/Desktop/jqxx/xiguaset.txt'
    mydata, featname = filetodataset(filename)
    mytree = createtree(mydata, featname)
    # ensure_ascii=False keeps the Chinese keys and values readable instead
    # of emitting \uXXXX escapes. The parenthesised single-argument print
    # works on both Python 2 and Python 3.
    print(json.dumps(mytree, ensure_ascii=False))
结果如下:
{"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
说明
1、在从根结点到叶结点的同一条路径上(即递归的上下游),每个属性只会出现一次,因为划分之后该属性会从子数据集和属性列表中剔除;但同一个属性可以出现在不同的分支上
2、与机器学习书相比P78,少了个色泽浅白为好瓜的判断。原因:createtree只对当前子数据集中实际出现过的属性取值生成分支,而该分支的子数据集中没有色泽为浅白的样例;书中的算法会为这种空分支生成叶结点,并标记为父结点样例中最多的类别
@绘制树形图
# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import matplotlib.pyplot as plt
import json
#mytree={"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
# Small sample tree in the nested-dict form {feature: {value: subtree_or_leaf}}.
anothertree={'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
#anothertree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}
#print json.dumps(mytree,ensure_ascii=False)
#计算叶节点数目
def calculateleaf(mytree):
    """Count the leaf nodes of a tree stored as nested dictionaries.

    Args:
        mytree: A node of the form ``{feature: {value: child, ...}}``. Only
            the first key of ``mytree`` is inspected; a child that is a
            dict is a subtree, anything else is a leaf.

    Returns:
        The number of leaves below this node.
    """
    # next(iter(...)) gets the first key on both Python 2 and Python 3;
    # dict.keys() is not indexable on Python 3.
    firststr = next(iter(mytree))
    numleaf = 0
    for child in mytree[firststr].values():
        if isinstance(child, dict):
            numleaf += calculateleaf(child)
        else:
            numleaf += 1
    return numleaf
#计算数的层数
def calculatedepth(mytree):
    """Return the number of decision levels in a nested-dict tree.

    Args:
        mytree: A node of the form ``{feature: {value: child, ...}}``. Only
            the first key of ``mytree`` is inspected; a child that is a
            dict is a subtree, anything else is a leaf.

    Returns:
        ``1`` for a node whose children are all leaves, plus one for every
        further level of nested subtrees on the deepest branch; ``0`` when
        the node has no children.
    """
    # next(iter(...)) gets the first key on both Python 2 and Python 3;
    # dict.keys() is not indexable on Python 3.
    firststr = next(iter(mytree))
    maxdepth = 0
    for child in mytree[firststr].values():
        if isinstance(child, dict):
            numdepth = 1 + calculatedepth(child)
        else:
            # A leaf directly below this node contributes one level.
            numdepth = 1
        if numdepth > maxdepth:
            maxdepth = numdepth
    return maxdepth
def plotmidtext(cntrpt, parentpt, txtstring):
    """Write ``txtstring`` halfway between a node and its parent.

    Args:
        cntrpt: ``(x, y)`` position of the child node.
        parentpt: ``(x, y)`` position of the parent node.
        txtstring: Text to place at the midpoint.
    """
    child_x, child_y = cntrpt[0], cntrpt[1]
    # Midpoint expressed as "child plus half of the distance to the parent".
    xmid = (parentpt[0] - child_x) / 2.0 + child_x
    ymid = (parentpt[1] - child_y) / 2.0 + child_y
    createplot.ax1.text(xmid, ymid, txtstring)
# Text-box styles passed as ``bbox`` when annotating decision nodes and leaves.
decisionnode = dict(boxstyle="sawtooth", fc="0.8")
leafnode = dict(boxstyle="round4", fc="0.8")
# Arrow style passed as ``arrowprops`` when annotating.
# NOTE(review): the original line was cut off after `arrowstyle="`, leaving an
# unterminated string; "<-" is the presumed lost value (it looks like it was
# stripped as markup) -- confirm against the original script.
arrow_args = dict(arrowstyle="<-")
def plotnode(nodetext, centerpt, parentpt, nodetype):
    """Draw one boxed node and the arrow that links it to its parent.

    Args:
        nodetext: Text shown inside the box.
        centerpt: ``(x, y)`` position of the box, in axes-fraction coordinates.
        parentpt: ``(x, y)`` position of the parent, the other end of the arrow.
        nodetype: Box style dict passed as ``bbox`` (``decisionnode`` or
            ``leafnode``).
    """
    annotate_options = dict(
        xy=parentpt,
        xytext=centerpt,
        arrowprops=arrow_args,
        xycoords='axes fraction',
        va='center',
        ha='center',
        bbox=nodetype,
    )
    createplot.ax1.annotate(nodetext, **annotate_options)
def _trace(*parts):
    """Print debug values separated by spaces, on Python 2 and Python 3 alike."""
    print(' '.join(str(part) for part in parts))


def plottree(mytree, parentpt, nodetxt):
    """Recursively draw ``mytree`` below the point ``parentpt``.

    Layout state lives in function attributes that ``createplot`` sets
    before the first call: ``plottree.totalw`` (leaf count of the whole
    tree), ``plottree.totald`` (its depth), and the running cursor
    ``plottree.xoff`` / ``plottree.yoff``.

    Args:
        mytree: Node of the form ``{feature: {value: child, ...}}``.
        parentpt: ``(x, y)`` position of the parent node.
        nodetxt: Label written on the edge coming from the parent.
    """
    numleafs = calculateleaf(mytree)
    # next(iter(...)) gets the first key on both Python 2 and Python 3.
    firststr = next(iter(mytree))
    # Centre this node horizontally over the leaves of its own subtree.
    cntrpt = (plottree.xoff + (1.0 + float(numleafs)) / 2.0 / plottree.totalw,
              plottree.yoff)
    _trace('子节点坐标:', cntrpt)
    plotmidtext(cntrpt, parentpt, nodetxt)
    plotnode(firststr, cntrpt, parentpt, decisionnode)
    _trace('绘制连接箭头', cntrpt, parentpt)
    seconddict = mytree[firststr]
    # Step one level down before drawing the children.
    plottree.yoff = plottree.yoff - 1.0 / (1.0 * plottree.totald)
    _trace('y轴值:', plottree.yoff)
    for key in seconddict:
        child = seconddict[key]
        if isinstance(child, dict):
            _trace('***sandy***', plottree.xoff)
            plottree(child, cntrpt, str(key))
            _trace('***lam***', plottree.xoff)
        else:
            # Each leaf takes the next horizontal slot of width 1/totalw.
            plottree.xoff = plottree.xoff + 1.0 / plottree.totalw
            plotnode(child, (plottree.xoff, plottree.yoff), cntrpt, leafnode)
            _trace('灯灯hoho', (plottree.xoff, plottree.yoff), cntrpt)
            plotmidtext((plottree.xoff, plottree.yoff), cntrpt, str(key))
    # Step back up so the caller's remaining children are drawn at the
    # caller's level. Without this, every sibling drawn after a subtree
    # ends up one level too low.
    plottree.yoff = plottree.yoff + 1.0 / plottree.totald
def createplot(intree):
    """Open figure 1, draw ``intree`` on it and show the window.

    The axes are stored on ``createplot.ax1`` and the layout state on
    ``plottree`` attributes so that the drawing helpers can reach them.

    Args:
        intree: Tree in the nested-dict form ``{feature: {value: child}}``.
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    # Explicit tick positions on both axes (the ticks stay visible).
    tick_options = {
        'xticks': [0, 0.2, 0.4, 0.6, 0.8, 1],
        'yticks': [0, 0.2, 0.4, 0.6, 0.8, 1],
    }
    createplot.ax1 = plt.subplot(111, frameon=True, **tick_options)

    # Overall size of the tree, used to scale the horizontal and vertical steps.
    plottree.totalw = float(calculateleaf(intree))
    plottree.totald = float(calculatedepth(intree))
    # Start half a slot left of the axes: plottree advances the cursor by
    # 1/totalw before placing each leaf.
    plottree.xoff = -0.5 / plottree.totalw
    plottree.yoff = 1.0

    root_position = (0.5, 1.0)
    plottree(intree, root_position, '')
    plt.show()
if __name__ == '__main__':
    # Script entry point: draw the sample tree.
    createplot(anothertree)
@@递归探讨
当碰到递归时,程序会先沿着递归调用一路深入,直到到达终止条件(即不再继续递归的地方),然后再逐层返回,依次执行各上层调用中剩余的语句
# -*- coding: UTF-8 -*-
def calculatedepth(mytree):
    """Return the depth of a nested-dict tree, printing a trace on the way.

    The trace shows each key as it is visited, which branch of the
    subtree/leaf test was taken, and the running ``(numdepth, maxdepth)``
    pair, so the order in which the recursion unwinds can be followed.

    Args:
        mytree: Node of the form ``{feature: {value: child, ...}}``. Only
            the first key of ``mytree`` is inspected.

    Returns:
        The number of decision levels on the deepest branch.
    """
    maxdepth = 0
    # next(iter(...)) gets the first key on both Python 2 and Python 3;
    # dict.keys() is not indexable on Python 3.
    firststr = next(iter(mytree))
    seconddict = mytree[firststr]
    for key in seconddict:
        print(key)
        child = seconddict[key]
        if isinstance(child, dict):
            print('**')
            numdepth = 1 + calculatedepth(child)
            # %-formatting gives the same "label value" line on Python 2 and 3.
            print('%s %s' % ('第1种情况', numdepth))
        else:
            # A leaf directly below this node contributes one level.
            numdepth = 1
            print('%s %s' % ('第2种情况', numdepth))
        if numdepth > maxdepth:
            maxdepth = numdepth
        # Double parentheses print the pair as a tuple on both versions.
        print((numdepth, maxdepth))
    return maxdepth
# Sample tree fed to the traced depth calculation.
mytree = {
    'no surfacing': {
        1: {'flippers': {0: 'no', 1: 'yes'}},
        0: 'no',
    },
}

if __name__ == '__main__':
    a = calculatedepth(mytree)
隐形眼镜数据集.png