周志华机器学习第四章例题4.3

作为一个萌新花了三天终于做出来了,普大喜奔,借鉴了《机器学习实战》的部分写法,把带有连续型特征值的决策树做了出来。

import math
import operator
from collections import Counter

import numpy as np
#量化的数据
dataset = np.array([[0,0,0,0,0,0,0.697,0.460,1],
                    [1,0,1,0,0,0,0.774,0.376,1],
                    [1,0,0,0,0,0,0.634,0.264,1],
                    [0,0,1,0,0,0,0.608,0.318,1],
                    [2,0,0,0,0,0,0.556,0.215,1],
                    [0,1,0,0,1,1,0.403,0.237,1],
                    [1,1,0,1,1,1,0.481,0.149,1],
                    [1,1,0,0,1,0,0.437,0.211,1],
                    [1,1,1,1,1,0,0.666,0.091,0],
                    [0,2,2,0,2,1,0.243,0.267,0],
                    [2,2,2,2,2,0,0.245,0.057,0],
                    [2,0,0,2,2,1,0.343,0.099,0],
                    [0,1,0,1,0,0,0.639,0.161,0],
                    [2,1,1,1,0,0,0.657,0.198,0],
                    [1,1,0,0,1,1,0.360,0.370,0],
                    [2,0,0,2,2,0,0.593,0.042,0],
                    [0,0,1,1,1,0,0.719,0.103,0]]
)

#4.3 信息熵生成决策树2.0

def calshonnonEnt(dataset): #计算特征值的信息熵
    shononent=0.
    numdataset = len(dataset)
    label_dict = {
     }
    for ai_dataset in dataset:
        ai_label = ai_dataset[-1]
        if ai_label not in label_dict.keys(): #!!!!!!!!!!这个写法要学会
            label_dict[ai_label] = 0
        label_dict[ai_label] +=1
    for key in label_dict.keys():
        num = label_dict[key]
        prob = float(num)/numdataset
        shononent -=prob*math.log(prob,2)
    return shononent


def nonFeqGain(dataset,i,baseEnt): #计算特征的信息熵
    dataset =np.array(dataset)
    a_dataset = dataset[:,i]
    values = list(set(a_dataset))
    shonono =0
    for ai in values:
        ai_dataset = []
        for j in range(len(dataset)):
            if ai == a_dataset[j]:
               ai_dataset.append(dataset[j])
        num_ai=len(ai_dataset)
        shon = calshonnonEnt(ai_dataset)
        shonono += (float(num_ai)/len(dataset))*shon
    gain = baseEnt-shonono
    return gain


def FreqAbestGain(dataset,i,baseEnt): #计算连续值特征的信息增益
    numA = len(dataset[0])
    dataset = np.array(dataset)
    a_dataset = dataset[:,i]
    a_values_list = sorted(list(set(a_dataset)))
    t_list = []
    gain_dict = {
     }
    for j in range(len(a_values_list)-1):
        tj = (a_values_list[j]+a_values_list[j+1])/2
        t_list.append(tj)
    for t in t_list:
        a_P_dataset =[]
        a_N_dataset= []
        for m in range(len(dataset)):
            if a_dataset[m] <= t:
                a_N_dataset.append(dataset[m])
            else:a_P_dataset.append(dataset[m])
        numN = len(a_N_dataset)
        numP = len(a_P_dataset)
        probN = float(numN)/(numN+numP)
        probP = float(numP)/(numN+numP)
        shon = probN*calshonnonEnt(a_N_dataset)+probP*calshonnonEnt(a_P_dataset)
        gain = baseEnt-shon
        gain_dict[t] = gain
    gain_list = sorted(gain_dict.items(),key=lambda x:x[1],reverse=True)[0]
    ai = gain_list[0]
    ai_gain = gain_list[1]
    print(ai,ai_gain)
    return ai,ai_gain


def choosenbestA(dataset,baseEnt): #选择最好的特征
    a_gain_dict = {
     }
    numA =len(dataset[0])
    list_lianxu = []
    list_non_freq=[]
    t_dict ={
     }
    for n in range(numA-1):
        if dataset[0][n] not in [0.,1.,2.]: #如果是不是连续
            list_lianxu.append(n)
        else:list_non_freq.append(n) #如果是连续
    for i in list_non_freq:
        a_gain_dict[i] = nonFeqGain(dataset,i,baseEnt)
    for j in list_lianxu:
        t,b =FreqAbestGain(dataset,j,baseEnt)
        a_gain_dict[j] = b
        t_dict[j] = t
    best_i_gain=list(sorted(a_gain_dict.items(),key=lambda x:x[1],reverse=True)[0])
    print(best_i_gain)
    best_i= best_i_gain[0]
    best_gain =best_i_gain[1]
    if best_i in list_non_freq:
        best_t=0
        return best_i,best_gain,best_t
    else:
        best_t = t_dict[best_i]
        return best_i,best_gain,best_t

baseEnt = calshonnonEnt(dataset)
choosenbestA(dataset,baseEnt)



def splitdataset(dataset,axis,value): #去除父节点的特征数据
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            reducefeatvec = np.hstack((featvec[:axis],featvec[axis+1:]))
            retdataset.append(reducefeatvec)
    return retdataset


def splitFreqXIAOYUDataset(dataset,axis,t): #去除父节点的特征数据
    retdataset = []
    for featvec in dataset:
        if featvec[axis] <= t :
            reducefeatvec = np.hstack((featvec[:axis],featvec[axis+1:]))
            retdataset.append(reducefeatvec)
    return retdataset

def splitFreqDAYUDataset(dataset,axis,t): #去除父节点的数据
    retdataset = []
    for featvec in dataset:
        if featvec[axis] > t :
            reducefeatvec = np.hstack((featvec[:axis],featvec[axis+1:]))
            retdataset.append(reducefeatvec)
    return retdataset


#这个函数是机器学习实践上写的,但是我觉得好像没有什么用处,暂且先放着,希望有人能解答一下
#因为使用条件要求是len(dataset[0])==1,就是分到最后只剩最后一列就是好坏那列
#但是如果数据是完整的应该不会出现这种状况吧
def majoritycnt(classlist):
    classcount:{
     }
    for vote in classlist:
        if vote not in classlist.keys():
            classcount[vote] = 0
            classcount[vote] += 1
    sortedclasscount = sorted(classcount.iteritems(),
                              key = operator.itemgetter(1),reverse=True)
    return sortedclasscount[0][0]

labels = ['color','root','sound','textile','belly','feel','density','sugar']
def TREE(dataset,labels):
    classlist = [example[-1] for example in dataset]
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    if len(dataset[0]) == 1:
        return majoritycnt(classlist)
    baseEnt = calshonnonEnt(dataset)
    bestA,bestGain,t = choosenbestA(dataset,baseEnt)
    bestfeatlabel  = labels[bestA]
    mytree = {
     bestfeatlabel:{
     }}
    if t==0:
        del(labels[bestA])
    featvalues = [example[bestA] for example in dataset]
    if featvalues[0]  in [0.,1.,2.]:
        uniquevals = set(featvalues)
        for value in uniquevals:
            sublabels = labels[:]
            mytree[bestfeatlabel][value] = TREE(splitdataset(dataset,bestA,value),sublabels)
    else:
            mytree[bestfeatlabel]['<%s'%t] = TREE(splitFreqXIAOYUDataset(dataset,bestA,t),labels)
            mytree[bestfeatlabel]['>%s'%t] = TREE(splitFreqDAYUDataset(dataset,bestA,t),labels)
    return mytree
a = TREE(dataset,labels)
print(a)

你可能感兴趣的:(算法,数据分析,python,决策树)