【机器学习】02 决策树C4.5代码

C4.5

    • 1.引入库
    • 2.读入数据
    • 3.找到样本最多的类
    • 4.计算香农熵
    • 5.划分数据集
    • 6.找出信息增益率最大的值
    • 7.创建树
    • 8.运行结果


1.引入库

import math
import operator

2.读入数据

def createDataset():
    """Return the sample watermelon dataset and its feature names.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a list of 17 rows; each
        row holds six categorical feature values followed by the class label
        ('好瓜' or '坏瓜') as its last element. ``labels`` lists the six
        feature names in column order.
    """
    dataSet = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
    ]

    # Feature names, one per feature column (the class-label column has none).
    labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']

    return dataSet, labels

3.找到样本最多的类

def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    Ties are resolved in favour of the label that appears first in
    ``classList``.

    Args:
        classList: A non-empty sequence of hashable class labels.

    Returns:
        The most frequent label.

    Raises:
        IndexError: If ``classList`` is empty.
    """
    # Function-scope import keeps this helper self-contained.
    from collections import Counter

    # most_common orders equal counts by first occurrence, so the winner
    # among tied labels is the one encountered first.
    return Counter(classList).most_common(1)[0][0]

4.计算香农熵

def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in dataSet.

    The class label of every row is read from its last element. An empty
    dataSet yields 0.
    """
    total = len(dataSet)

    # Tally how many rows carry each class label.
    tally = {}
    for row in dataSet:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    # Accumulate -p * log2(p) over the observed labels.
    entropy = 0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * math.log(p, 2)
    return entropy

5.划分数据集

def splitDataSet(dataSet, axis, value):
    """Return the rows whose column ``axis`` equals ``value``, minus that column.

    Args:
        dataSet: A sequence of rows (lists or tuples).
        axis: Index of the column to test and drop. Negative indices count
            from the end of each row.
        value: The value the column must equal for a row to be kept.

    Returns:
        A new list of new rows; ``dataSet`` and its rows are not modified.
    """
    # ``row[axis:][1:]`` (rather than ``row[axis + 1:]``) drops exactly the
    # tested column even when ``axis`` is negative: with axis == -1,
    # ``row[axis + 1:]`` would be ``row[0:]`` and duplicate the whole row.
    return [
        featVec[:axis] + featVec[axis:][1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]

6.找出信息增益率最大的值

def chooseBestFeatureToSplit(dataSet, useGainRatio=False):
    """Return the index of the feature that best splits dataSet.

    Every column except the last (the class label) is a candidate feature.
    By default candidates are scored by information gain: the entropy of
    dataSet minus the weighted entropy of the subsets obtained by splitting
    on the feature.

    Args:
        dataSet: A non-empty list of rows whose last element is the class
            label.
        useGainRatio: When true, score candidates by gain ratio instead
            (information gain divided by the entropy of the split itself),
            which penalises features with many distinct values. A feature
            that takes a single value in dataSet has zero split entropy and
            is skipped.

    Returns:
        The column index of the highest-scoring feature; among equal scores
        the lowest index wins. Returns -1 when no feature has a score
        greater than zero.
    """
    numFeatures = len(dataSet[0]) - 1
    numRows = float(len(dataSet))
    # Entropy of the whole dataset before any split.
    baseEntropy = calcShannonEnt(dataSet)
    bestScore = 0.0    # best score seen so far
    bestFeature = -1   # index of the feature that achieved bestScore

    for i in range(numFeatures):
        # Distinct values taken by feature i.
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0   # weighted entropy after splitting on feature i
        splitInfo = 0.0    # entropy of the partition sizes themselves

        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / numRows
            newEntropy += prob * calcShannonEnt(subDataSet)
            splitInfo -= prob * math.log(prob, 2)

        score = baseEntropy - newEntropy
        if useGainRatio:
            if splitInfo == 0:
                # Single-valued feature: the ratio is undefined and the
                # split would not partition the data at all.
                continue
            score /= splitInfo

        if score > bestScore:
            bestScore = score
            bestFeature = i

    return bestFeature

7.创建树

def createTree(dataSet, labels):
    """Recursively build a decision tree from dataSet.

    Args:
        dataSet: A non-empty list of rows; the last element of each row is
            the class label and the preceding elements are feature values.
        labels: Feature names, aligned with the feature columns of dataSet.
            The list is not modified.

    Returns:
        A class label when the node is a leaf, otherwise a nested dict of
        the form ``{featureName: {featureValue: subtree, ...}}``.
    """
    # Class label of every sample.
    classList = [example[-1] for example in dataSet]

    # All samples share one class: return it as a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No feature columns left to split on: fall back to the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no feature improves on the unsplit data. Using it as an index
    # would select the class-label column, so stop with a majority-class leaf.
    if bestFeat < 0:
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}

    # Feature names for the children: everything except the feature just
    # used. Built as a new list so the caller's ``labels`` is left intact.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    # One branch per distinct value of the chosen feature.
    for value in set(example[bestFeat] for example in dataSet):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels
        )

    return myTree

8.运行结果

# Build the sample dataset, grow the decision tree, then plot and print it.
dataSet,labels=createDataset()
myTree=createTree(dataSet,labels)
# NOTE(review): TreePlotter is not imported or defined in the visible code;
# presumably a separate plotting helper module — confirm it is importable
# before running this line.
TreePlotter.createPlot(myTree)
print(myTree)

{'纹理': {'稍糊': {'触感': {'软粘': '好瓜', '硬滑': '坏瓜'}}, '模糊': '坏瓜', '清晰': {'根蒂': {'稍蜷': {'色泽': {'青绿': '好瓜', '乌黑': {'触感': {'软粘': '坏瓜', '硬滑': '好瓜'}}}}, '蜷缩': '好瓜', '硬挺': '坏瓜'}}}}

你可能感兴趣的:(机器学习,python,机器学习,决策树)