As a complete beginner I finally got this working after three days, which feels like a small victory. Borrowing some of the style of Machine Learning in Action, I built a decision tree that handles continuous-valued features.
import math
import numpy as np
import operator
# The quantized dataset (watermelon data 3.0): six discrete features encoded as
# 0/1/2, then two continuous features (density, sugar content), then the class label
dataset = np.array([[0,0,0,0,0,0,0.697,0.460,1],
                    [1,0,1,0,0,0,0.774,0.376,1],
                    [1,0,0,0,0,0,0.634,0.264,1],
                    [0,0,1,0,0,0,0.608,0.318,1],
                    [2,0,0,0,0,0,0.556,0.215,1],
                    [0,1,0,0,1,1,0.403,0.237,1],
                    [1,1,0,1,1,1,0.481,0.149,1],
                    [1,1,0,0,1,0,0.437,0.211,1],
                    [1,1,1,1,1,0,0.666,0.091,0],
                    [0,2,2,0,2,1,0.243,0.267,0],
                    [2,2,2,2,2,0,0.245,0.057,0],
                    [2,0,0,2,2,1,0.343,0.099,0],
                    [0,1,0,1,0,0,0.639,0.161,0],
                    [2,1,1,1,0,0,0.657,0.198,0],
                    [1,1,0,0,1,1,0.360,0.370,0],
                    [2,0,0,2,2,0,0.593,0.042,0],
                    [0,0,1,1,1,0,0.719,0.103,0]])
# Section 4.3: growing a decision tree from information entropy, version 2.0
def calshonnonEnt(dataset):  # information entropy of the class labels in dataset
    shononent = 0.
    numdataset = len(dataset)
    label_dict = {}
    for ai_dataset in dataset:
        ai_label = ai_dataset[-1]
        if ai_label not in label_dict.keys():  # idiom worth remembering: create the counter key on first sight
            label_dict[ai_label] = 0
        label_dict[ai_label] += 1
    for key in label_dict.keys():
        num = label_dict[key]
        prob = float(num) / numdataset
        shononent -= prob * math.log(prob, 2)
    return shononent
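# A quick check of the formula Ent(D) = -sum_k p_k * log2(p_k): this dataset has
# 8 positive and 9 negative samples, so calshonnonEnt(dataset) should return
# -(8/17)*log2(8/17) - (9/17)*log2(9/17), which is about 0.998.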
def nonFeqGain(dataset, i, baseEnt):  # information gain of discrete feature i
    dataset = np.array(dataset)
    a_dataset = dataset[:, i]
    values = list(set(a_dataset))
    shonono = 0
    for ai in values:
        ai_dataset = []
        for j in range(len(dataset)):
            if ai == a_dataset[j]:
                ai_dataset.append(dataset[j])
        num_ai = len(ai_dataset)
        shon = calshonnonEnt(ai_dataset)
        shonono += (float(num_ai) / len(dataset)) * shon  # weighted child entropy
    gain = baseEnt - shonono
    return gain
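# This implements the discrete-feature gain Gain(D,a) = Ent(D) - sum_v (|D^v|/|D|) * Ent(D^v).
# As a rough check (assuming I have the textbook's numbers right), feature 0,
# the color column, should give a gain of about 0.109 on the full dataset:
# print(nonFeqGain(dataset, 0, calshonnonEnt(dataset)))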
def FreqAbestGain(dataset, i, baseEnt):  # best split point t and information gain for continuous feature i
    dataset = np.array(dataset)
    a_dataset = dataset[:, i]
    a_values_list = sorted(list(set(a_dataset)))
    t_list = []
    gain_dict = {}
    for j in range(len(a_values_list) - 1):  # candidate thresholds: midpoints of adjacent sorted values
        tj = (a_values_list[j] + a_values_list[j + 1]) / 2
        t_list.append(tj)
    for t in t_list:
        a_P_dataset = []
        a_N_dataset = []
        for m in range(len(dataset)):
            if a_dataset[m] <= t:
                a_N_dataset.append(dataset[m])
            else:
                a_P_dataset.append(dataset[m])
        numN = len(a_N_dataset)
        numP = len(a_P_dataset)
        probN = float(numN) / (numN + numP)
        probP = float(numP) / (numN + numP)
        shon = probN * calshonnonEnt(a_N_dataset) + probP * calshonnonEnt(a_P_dataset)
        gain = baseEnt - shon
        gain_dict[t] = gain
    gain_list = sorted(gain_dict.items(), key=lambda x: x[1], reverse=True)[0]  # (best t, best gain)
    ai = gain_list[0]
    ai_gain = gain_list[1]
    print(ai, ai_gain)  # debug output
    return ai, ai_gain
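# This follows the book's bi-partition rule for continuous attributes:
#   Gain(D,a) = max over t of  Ent(D) - |D_t^-|/|D| * Ent(D_t^-) - |D_t^+|/|D| * Ent(D_t^+)
# with t taken from the midpoints of adjacent sorted values. If I read the book's
# numbers right, density (column 6) should split best at t = 0.381 with gain about
# 0.262, and sugar (column 7) at t = 0.126 with gain about 0.349, which the
# print above should roughly reproduce.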
def choosenbestA(dataset, baseEnt):  # choose the best feature to split on
    a_gain_dict = {}
    numA = len(dataset[0])
    list_lianxu = []    # indices of continuous features
    list_non_freq = []  # indices of discrete features
    t_dict = {}
    for n in range(numA - 1):
        # crude type test: discrete features here only take the values 0/1/2,
        # so anything else marks the column as continuous
        if dataset[0][n] not in [0., 1., 2.]:
            list_lianxu.append(n)
        else:
            list_non_freq.append(n)
    for i in list_non_freq:
        a_gain_dict[i] = nonFeqGain(dataset, i, baseEnt)
    for j in list_lianxu:
        t, b = FreqAbestGain(dataset, j, baseEnt)
        a_gain_dict[j] = b
        t_dict[j] = t
    best_i_gain = list(sorted(a_gain_dict.items(), key=lambda x: x[1], reverse=True)[0])
    print(best_i_gain)  # debug: the winning [feature index, gain]
    best_i = best_i_gain[0]
    best_gain = best_i_gain[1]
    if best_i in list_non_freq:
        best_t = 0  # t = 0 is the sentinel for "a discrete feature was chosen"
        return best_i, best_gain, best_t
    else:
        best_t = t_dict[best_i]
        return best_i, best_gain, best_t
baseEnt = calshonnonEnt(dataset)
choosenbestA(dataset, baseEnt)  # smoke test: prints each continuous feature's best split and the overall winner
def splitdataset(dataset, axis, value):  # rows where discrete feature axis == value, with that column removed
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            reducefeatvec = np.hstack((featvec[:axis], featvec[axis + 1:]))
            retdataset.append(reducefeatvec)
    return retdataset
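# For example, splitdataset(dataset, 0, 1.) keeps the rows whose color equals 1
# and drops the color column, ready for the recursive call one level down.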
def splitFreqXIAOYUDataset(dataset, axis, t):  # rows where continuous feature axis <= t
    # The feature column is kept here, because a continuous feature may be split
    # again deeper in the tree, and keeping it leaves dataset aligned with labels
    # (TREE never deletes the label of a continuous feature).
    retdataset = []
    for featvec in dataset:
        if featvec[axis] <= t:
            retdataset.append(featvec)
    return retdataset
def splitFreqDAYUDataset(dataset, axis, t):  # rows where continuous feature axis > t
    retdataset = []
    for featvec in dataset:
        if featvec[axis] > t:
            retdataset.append(featvec)
    return retdataset
# This function is taken from Machine Learning in Action, but I don't quite see
# what it is for, so I'm leaving it in for now and hoping someone can explain.
# Its trigger condition is len(dataset[0]) == 1, i.e. the splits have consumed
# every feature column and only the good/bad label column is left,
# but surely that can't happen if the data is complete?
def majoritycnt(classlist):
    classcount = {}
    for vote in classlist:
        if vote not in classcount.keys():
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]  # the most common label
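# One case where majoritycnt really is needed: two samples that agree on every
# feature but disagree on the label. The splits then use up all the feature
# columns while the leaf is still impure, and the tie has to be broken by
# majority vote. A made-up two-row illustration (hypothetical, not from the melon data):
# conflicted = np.array([[0, 1], [0, 0]])  # identical single feature, labels 1 and 0
# TREE(conflicted, ['only_feature'])       # reaches majoritycnt([1.0, 0.0])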
labels = ['color', 'root', 'sound', 'texture', 'belly', 'feel', 'density', 'sugar']  # feature names, in column order
def TREE(dataset, labels):
    classlist = [example[-1] for example in dataset]
    if classlist.count(classlist[0]) == len(classlist):  # pure leaf: every sample has the same label
        return classlist[0]
    if len(dataset[0]) == 1:  # only the label column is left, so vote
        return majoritycnt(classlist)
    baseEnt = calshonnonEnt(dataset)
    bestA, bestGain, t = choosenbestA(dataset, baseEnt)
    bestfeatlabel = labels[bestA]
    mytree = {bestfeatlabel: {}}
    if t == 0:  # a discrete feature was chosen, so its label is consumed at this node
        del(labels[bestA])
    featvalues = [example[bestA] for example in dataset]
    if featvalues[0] in [0., 1., 2.]:  # discrete branch: one subtree per value
        uniquevals = set(featvalues)
        for value in uniquevals:
            sublabels = labels[:]  # copy, so one branch's deletions don't leak into its siblings
            mytree[bestfeatlabel][value] = TREE(splitdataset(dataset, bestA, value), sublabels)
    else:
        # continuous branch: binary split at threshold t; the feature stays reusable
        # further down, and each side gets its own copy of labels so a discrete split
        # in one subtree cannot shorten the list under the other subtree
        mytree[bestfeatlabel]['<=%s' % t] = TREE(splitFreqXIAOYUDataset(dataset, bestA, t), labels[:])
        mytree[bestfeatlabel]['>%s' % t] = TREE(splitFreqDAYUDataset(dataset, bestA, t), labels[:])
    return mytree
a = TREE(dataset,labels)
print(a)
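# If the bookkeeping is right, the root of the printed tree should be 'texture'
# (the book reports it as the highest-gain feature on this data, roughly 0.381),
# and 'density' or 'sugar' may show up deeper down as '<=t' / '>t' binary splits.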