西瓜数据集3.0 python_决策树对西瓜数据集2.0二分类

西瓜数据集.jpg

@生成分类字典

# -*- coding: UTF-8 -*-

#设置默认编码,否则中文会乱码

import sys

reload(sys)

sys.setdefaultencoding('utf-8')

from math import log

#1、获取样例集和属性列表

def filetodataset(filename):

fr=open(filename,'r')

all_lines=fr.readlines() #list形式,每行为1个str

featname=all_lines[0].strip().split(',') #list形式

featname=featname[:-1]

dictcategory={}

dataset=[]

for sample in all_lines[1:]:

sample=sample.strip().split(',') #以逗号为分割符拆分列表

dataset.append(sample)

return dataset,featname

#2、计算香农商

def calcent(dataset):

dictcategory={}

for i in dataset:

category=i[-1]

if category not in dictcategory:

dictcategory[category]=0

dictcategory[category]+=1

num=len(dataset)

shannon=0

for i in dictcategory:

prob=float(dictcategory[i])/num

shannon-=prob*log(prob,2)

return shannon

#3、对特定属性选择特定取值后,将满足该条件的剩余数据集组合留待计算香农商

def splitdataset(dataset,axis,value):

subdataset=[]

for sample in dataset:

if sample[axis]==value:

reducedfeatvec=sample[:axis]

reducedfeatvec.extend(sample[axis+1:])

subdataset.append(reducedfeatvec)

return subdataset

#4、选择最佳的划分属性

def choosebestfeaturetosplit(dataset):

attrnum=len(dataset[0]) #计算属性个数

baseshannon=calcent(dataset) #计算整个样本集的香农商

bestinfogain=0.0 ; bestfeature=-1

for i in range(attrnum-1):

featlist=[example[i] for example in dataset] #取出特定属性的所有值。dataset包含了类,但不影响,因为取不到

unifeat=set(featlist) #每个属性所含的值

attrshannon=0

for value in unifeat:

subdataset=splitdataset(dataset,i,value)

shannon=calcent(subdataset) #每个属性值取每个值的香农商

prob=len(subdataset)/float(len(dataset))

attrshannon+=prob*shannon

infogain=baseshannon-attrshannon

if infogain>bestinfogain:

bestinfogain=infogain

bestfeature=i

return bestfeature

#5、返回样例中类最多的那个类别

def majorclass(data):

aa=[sample[-1] for sample in data] #获取每个样例最后的类别

bb={}

for i in aa:

bb[i]=aa.count(i)

#将字典bb降序排列,书中用的另一种方式

bb= sorted(bb.iteritems(), key=lambda d:d[1], reverse = True)

return bb

#6、生成决策树

def createtree(mydata,labels): #labels为属性标签

#情况1、当所有样例的类别一致时,返回类别

samplelabel=[sample[-1] for sample in mydata]

usamplelabel=list(set(samplelabel))

if len(usamplelabel)==1:

return usamplelabel[0]

#情况2、当属性已经用完,则选择类别最多的显示

if len(mydata[0])==1:

return majorclass(mydata)

#情况3:选择最佳划分属性进行划分

bestfeature=choosebestfeaturetosplit(mydata)

bestfeaturelabel=labels[bestfeature]

mytree={bestfeaturelabel:{}}

del labels[bestfeature]

featurevalue=[sample[bestfeature] for sample in mydata]

ufeaturevalue=set(featurevalue)

for value in ufeaturevalue:

sublabels=labels[:]

mytree[bestfeaturelabel][value]=createtree(splitdataset(mydata,bestfeature,value),sublabels)

return mytree

if __name__=='__main__':

import json

filename='/Users/enniu/Desktop/jqxx/xiguaset.txt'

mydata,featname=filetodataset(filename)

#shannon=calcent(mydata)

#choosebestfeaturetosplit(mydata)

mytree=createtree(mydata,featname)

print json.dumps(mytree, ensure_ascii=False) #直接打印字典,里面含有中文,控制台信息输出窗口按照ascii编码输出utf8编码的字符串。

结果如下:

{"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}

说明

1、在结点上下游(递归)属性只出现一次,因为后面算法会剔除掉。同个属性可能出现在不同分叉路

2、与机器学习书相比P78,少了个色泽浅白为好瓜的判断

@绘制树形图

# -*- coding:utf-8 -*-

import sys

reload(sys)

sys.setdefaultencoding('utf-8')

import matplotlib.pyplot as plt

import json

#mytree={"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}

anothertree={'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

#anothertree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}

#print json.dumps(mytree,ensure_ascii=False)

#计算叶节点数目

def calculateleaf(mytree):

numleaf=0

firststr=mytree.keys()[0] #获取字典第一个键值

seconddict=mytree[firststr]

for key in seconddict.keys():

if type(seconddict[key]).__name__=='dict':

numleaf+= calculateleaf(seconddict[key])

else:

numleaf+=1

return numleaf

#计算数的层数

def calculatedepth(mytree):

maxdepth=0

firststr=mytree.keys()[0]

seconddict=mytree[firststr]

for key in seconddict.keys():

#print key,

if type(seconddict[key]).__name__=='dict':

numdepth=1+calculatedepth(seconddict[key])

else:

numdepth=1 #到叶节点后,计算树深度的变量+1

if numdepth>maxdepth:

maxdepth=numdepth

#print numdepth,maxdepth

return maxdepth

def plotmidtext(cntrpt,parentpt,txtstring):

xmid=(parentpt[0]-cntrpt[0])/2.0+cntrpt[0]

ymid=(parentpt[1]-cntrpt[1])/2.0+cntrpt[1]

createplot.ax1.text(xmid,ymid,txtstring)

decisionnode=dict(boxstyle="sawtooth",fc="0.8")

leafnode=dict(boxstyle="round4",fc="0.8")

arrow_args=dict(arrowstyle="

def plotnode(nodetext,centerpt,parentpt,nodetype):

createplot.ax1.annotate(nodetext,xy=parentpt,xytext=centerpt,arrowprops=arrow_args,\

xycoords='axes fraction',va='center',ha='center',bbox=nodetype)

def plottree(mytree,parentpt,nodetxt):

numleafs=calculateleaf(mytree)

depth=calculatedepth(mytree)

firststr=mytree.keys()[0]

cntrpt=(plottree.xoff+(1.0+float(numleafs))/2.0/plottree.totalw,plottree.yoff)

print '子节点坐标:',cntrpt

plotmidtext(cntrpt,parentpt,nodetxt) #自定义函数

plotnode(firststr,cntrpt,parentpt,decisionnode) #刚开始根节点与子节点是连在一起的?

print '绘制连接箭头',cntrpt,parentpt

seconddict=mytree[firststr]

plottree.yoff=plottree.yoff-1.0/(1.0*plottree.totald) #控制宽度

print 'y轴值:',plottree.yoff

for key in seconddict.keys():

if type(seconddict[key]).__name__=='dict':

print '***sandy***',plottree.xoff #经过else的判断后已变为1/6

plottree(seconddict[key],cntrpt,str(key))

print '***lam***',plottree.xoff

else:

plottree.xoff=plottree.xoff+1.0/plottree.totalw

plotnode(seconddict[key],(plottree.xoff,plottree.yoff),cntrpt,leafnode)

print '灯灯hoho',(plottree.xoff,plottree.yoff),cntrpt

plotmidtext((plottree.xoff,plottree.yoff),cntrpt,str(key))

#plottree.yoff=plottree.yoff+1.0/plottree.totald

def createplot(intree):

fig=plt.figure(1,facecolor='white')

fig.clf()

axprops=dict(xticks=[0,0.2,0.4,0.6,0.8,1],yticks=[0,0.2,0.4,0.6,0.8,1])

createplot.ax1=plt.subplot(111,frameon=True,**axprops) #把**axprops去掉亦可,默认显示刻度

plottree.totalw=float(calculateleaf(intree))

plottree.totald=float(calculatedepth(intree))

plottree.xoff=-0.5/plottree.totalw

plottree.yoff=1.0

plottree(intree,(0.5,1.0),'')

plt.show()

if __name__=='__main__':

createplot(anothertree)

@@递归探讨

当碰到递归时,沿着递归执行到最终结果(即最后停止递归的地方),然后再依次往上层执行

# -*- coding: UTF-8 -*-

def calculatedepth(mytree):

maxdepth=0

firststr=mytree.keys()[0]

seconddict=mytree[firststr]

for key in seconddict.keys():

print key

if type(seconddict[key]).__name__=='dict':

print '**'

numdepth=1+calculatedepth(seconddict[key])

print '第1种情况',numdepth

else:

numdepth=1 #到叶节点后,计算树深度的变量+1

print '第2种情况',numdepth

if numdepth>maxdepth:

maxdepth=numdepth

print (numdepth,maxdepth)

return maxdepth

mytree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}

if __name__=='__main__':

a=calculatedepth(mytree)

隐形眼镜数据集.png

你可能感兴趣的:(西瓜数据集3.0,python)