西瓜书 (Watermelon Book): Decision Tree

This post implements decision-tree learning for discrete-valued attributes only, ID3-style: at each node, the tree splits on the attribute with the largest information gain. Continuous attributes are not handled.
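
For reference, the two quantities the code computes are the information entropy and the information gain (Eqs. 4.1 and 4.2 in the book):

$$\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k \qquad \mathrm{Gain}(D,a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v)$$

For the 17-sample watermelon dataset used here (8 good melons, 9 bad), the entropy at the root is $-\frac{8}{17}\log_2\frac{8}{17} - \frac{9}{17}\log_2\frac{9}{17} \approx 0.998$, matching the book's worked example. The full script: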

from math import log
import operator  # used by majorityCnt to sort class counts

def createDataSet():
    # Watermelon dataset 2.0 from the 西瓜书: six discrete attributes per
    # example, with the class label in the last column ('是' = good melon,
    # '否' = bad melon).
    dataset = [
        ['青绿','蜷缩','浊响','清晰','凹陷','硬滑','是'],
        ['乌黑','蜷缩','沉闷','清晰','凹陷','硬滑','是'],
        ['乌黑','蜷缩','浊响','清晰','凹陷','硬滑','是'],
        ['青绿','蜷缩','沉闷','清晰','凹陷','硬滑','是'],
        ['浅白','蜷缩','浊响','清晰','凹陷','硬滑','是'],
        ['青绿','稍蜷','浊响','清晰','稍凹','软粘','是'],
        ['乌黑','稍蜷','浊响','稍糊','稍凹','软粘','是'],
        ['乌黑','稍蜷','浊响','清晰','稍凹','硬滑','是'],
        ['乌黑','稍蜷','沉闷','稍糊','稍凹','硬滑','否'],
        ['青绿','硬挺','清脆','清晰','平坦','软粘','否'],
        ['浅白','硬挺','清脆','模糊','平坦','硬滑','否'],
        ['浅白','蜷缩','浊响','模糊','平坦','软粘','否'],
        ['青绿','稍蜷','浊响','稍糊','凹陷','硬滑','否'],
        ['浅白','稍蜷','沉闷','稍糊','凹陷','硬滑','否'],
        ['乌黑','稍蜷','浊响','清晰','稍凹','软粘','否'],
        ['浅白','蜷缩','浊响','模糊','平坦','硬滑','否'],
        ['青绿','蜷缩','沉闷','稍糊','稍凹','硬滑','否'],
    ]
    # Attribute names: color, root, knock sound, texture, navel, touch.
    labels = ['色泽','根蒂','敲声','纹理','脐部','触感']
    return dataset, labels

def createTree(dataset, labels, featLabels):
    # Class label ('是'/'否') of every example at this node.
    classList = [example[-1] for example in dataset]

    # Base case 1: all remaining examples share one class -> return it as a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # Base case 2: no attributes left to split on (only the class column
    # remains) -> return the majority class at this node.
    if len(dataset[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataset)  # index of the attribute with the largest information gain
    bestFeatLabel = labels[bestFeat]              # its name, e.g. '色泽'
    featLabels.append(bestFeatLabel)              # record the order in which attributes are chosen

    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]  # the chosen attribute is consumed at this node
    featValues = [example[bestFeat] for example in dataset]
    uniqueVals = set(featValues)  # distinct values the attribute takes here
    for value in uniqueVals:
        subLabels = labels[:]  # copy, so one branch's recursion cannot affect its siblings
        # Recurse on the subset of examples with this attribute value.
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataset, bestFeat, value), subLabels, featLabels)

    return myTree

def majorityCnt(classList):
    # Return the most frequent class label among the examples at this node.
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

     
def calcShannonEnt(dataset):
    # Information entropy of the class label: Ent(D) = -sum_k p_k * log2(p_k).
    numExamples = len(dataset)
    labelCounts = {}
    for featVec in dataset:  # count the examples in each class
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1

    shannonEnt = 0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numExamples  # class frequency p_k
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def chooseBestFeatureToSplit(dataset):
    # Return the index of the attribute with the largest information gain.
    numFeatures = len(dataset[0]) - 1      # the last column is the class label
    baseEntropy = calcShannonEnt(dataset)  # Ent(D) before splitting
    bestInfoGain = 0
    bestFeature = -1  # stays -1 if no attribute gives a positive gain
    for i in range(numFeatures):
        featList = [example[i] for example in dataset]  # column i
        uniqueVals = set(featList)  # distinct values of attribute i
        newEntropy = 0
        for val in uniqueVals:
            # Weighted entropy of the subset D^v where attribute i equals val.
            subDataSet = splitDataSet(dataset, i, val)
            prob = len(subDataSet) / float(len(dataset))
            newEntropy += prob * calcShannonEnt(subDataSet)

        # Gain(D, a) = Ent(D) - sum_v |D^v|/|D| * Ent(D^v)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
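
# For the full dataset, the 西瓜书's worked example (Section 4.2.1) reports
#   Gain(D,色泽)=0.109, Gain(D,根蒂)=0.143, Gain(D,敲声)=0.141,
#   Gain(D,纹理)=0.381, Gain(D,脐部)=0.289, Gain(D,触感)=0.006,
# so the first call to chooseBestFeatureToSplit should return 3 (纹理).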

            
def splitDataSet(dataset, axis, val):
    # Return the examples whose attribute `axis` equals `val`, with that
    # attribute's column removed (it has been consumed by the split).
    retDataSet = []
    for featVec in dataset:
        if featVec[axis] == val:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
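
# Example: splitDataSet(dataset, 0, '青绿') returns the six 青绿 examples
# with the 色泽 column removed.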


if __name__ == '__main__':
    dataset, labels = createDataSet()
    featLabels = []
    myTree = createTree(dataset, labels, featLabels)
    print(myTree)
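    # Sanity check (not in the original post; expected values are from the
    # 西瓜书's worked example, Section 4.2.1). createTree mutated `labels`
    # above, so fetch a fresh copy of the data first.
    data, names = createDataSet()
    print(calcShannonEnt(data))                    # expect ≈ 0.998
    print(names[chooseBestFeatureToSplit(data)])   # expect 纹理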
    
    

Output: print(myTree) displays the learned tree as a nested dictionary, one level per chosen attribute.
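
The script only builds the tree; it cannot yet classify a new melon. Below is a minimal sketch of such a helper, assuming the nested-dict format produced by createTree above (the classify function and its names are illustrative additions, not part of the original post):

def classify(tree, featNames, testVec):
    # Walk the nested dict: a leaf is a bare class label ('是'/'否');
    # an internal node looks like {attributeName: {value: subtree, ...}}.
    if not isinstance(tree, dict):
        return tree
    featName = next(iter(tree))            # attribute tested at this node
    branches = tree[featName]
    value = testVec[featNames.index(featName)]
    if value not in branches:              # value never seen during training
        return None
    return classify(branches[value], featNames, testVec)

Because createTree deletes entries from the labels list it is given, pass classify a fresh copy of the attribute names, e.g.

classify(myTree, ['色泽','根蒂','敲声','纹理','脐部','触感'],
         ['青绿','蜷缩','浊响','清晰','凹陷','硬滑'])

which should return '是' (that test vector is the first training sample).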

 
