Decision Tree Python Source Code and Custom Module

Python decision tree source code:
#!/usr/bin/python
# -*- coding:utf-8 -*-


from math import log
import operator




def calShannonEnt(dataSet):  # compute the Shannon entropy of the data set
    numEntries = len(dataSet)
    labelCounts = {}  # dictionary counting how many times each class label occurs
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
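
# Quick sanity check (hypothetical usage, not part of the original listing):
# with 2 'yes' and 3 'no' samples, H = -(2/5)log2(2/5) - (3/5)log2(3/5) ≈ 0.9710.
if __name__ == '__main__':
    print(calShannonEnt([[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]))  # 0.9709505944546686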




def createDataSet():  # create the sample data set
    dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels




def splitDataSet(dataSet, axis, value):  # collect the samples whose feature at `axis` equals `value`, with that feature removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet  # the subset of samples whose feature `axis` equals `value`
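
# Example (hypothetical usage): keep only the samples whose feature 0 equals 1,
# dropping that feature from each of them.
if __name__ == '__main__':
    myDat, myLabels = createDataSet()
    print(splitDataSet(myDat, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]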




def chooseBestFeatureToSplit(dataSet):  # pick the feature with the largest information gain
    numFeatures = len(dataSet[0]) - 1  # total number of features
    baseEntropy = calShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # values of feature i across all samples
        uniqueVals = set(featList)  # all distinct values of feature i
        newEntropy = 0.0
        for value in uniqueVals:  # sum the weighted entropy of each subset obtained by splitting on feature i
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calShannonEnt(subDataSet)  # entropy after splitting on feature i
        infoGain = baseEntropy - newEntropy  # information gain of feature i
        if infoGain > bestInfoGain:  # keep the index of the feature with the largest gain
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
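
# Example (hypothetical usage): on the sample data, feature 0 ('no surfacing')
# has the larger information gain (about 0.42 vs 0.17), so index 0 is returned.
if __name__ == '__main__':
    myDat, myLabels = createDataSet()
    print(chooseBestFeatureToSplit(myDat))  # 0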




def majorityCnt(classList):  # majority vote over a list of class labels
    classCount = {}  # count of each class label
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # sort by count, descending
    return sortedClassCount[0][0]
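
# Example (hypothetical usage): 'no' wins the vote 2 to 1.
if __name__ == '__main__':
    print(majorityCnt(['yes', 'no', 'no']))  # no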




def createTree(dataSet, labels):  # build the decision tree recursively
    classList = [example[-1] for example in dataSet]  # class labels of all samples
    if classList.count(classList[0]) == len(classList):  # stop condition 1: all samples share one class, return it
        return classList[0]
    if len(dataSet[0]) == 1:  # stop condition 2: no features left (only the class column remains), so take a majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best splitting feature
    bestFeatLabel = labels[bestFeat]  # its human-readable label
    myTree = {bestFeatLabel: {}}  # the (sub)tree to return
    del (labels[bestFeat])  # remove the chosen feature from the label list
    featValues = [example[bestFeat] for example in dataSet]  # values of the splitting feature across all samples
    uniqueVals = set(featValues)  # all distinct values of the splitting feature
    for value in uniqueVals:  # one branch per distinct value of the splitting feature
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
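
# Example (hypothetical usage; createTree mutates its label list, so pass a copy):
if __name__ == '__main__':
    myDat, myLabels = createDataSet()
    print(createTree(myDat, myLabels[:]))
    # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}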




def classify(inputTree, featLabels, testVec):  # classify a test vector with a built tree
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # index of the feature tested at this node
    for key in secondDict.keys():
        if testVec[featIndex] == key:  # find the branch matching the test vector's value for this feature
            if type(secondDict[key]).__name__ == 'dict':  # an inner node: keep descending
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:  # a leaf: this is the final class label
                classLabel = secondDict[key]
    return classLabel
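
# Example (hypothetical usage): classify two test vectors with the sample tree.
if __name__ == '__main__':
    myDat, myLabels = createDataSet()
    myTree = createTree(myDat, myLabels[:])  # copy keeps myLabels intact for classify
    print(classify(myTree, myLabels, [1, 0]))  # no
    print(classify(myTree, myLabels, [1, 1]))  # yes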




def storeTree(inputTree, filename):  # save a built decision tree to disk
    import pickle
    with open(filename, 'wb') as fw:  # pickle requires binary mode
        pickle.dump(inputTree, fw)




def grabTree(filename):  # load a stored decision tree from disk
    import pickle
    with open(filename, 'rb') as fr:  # pickle requires binary mode
        return pickle.load(fr)
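
# Example (hypothetical usage; the filename is arbitrary): a store/load round trip.
if __name__ == '__main__':
    myDat, myLabels = createDataSet()
    myTree = createTree(myDat, myLabels[:])
    storeTree(myTree, 'classifierStorage.txt')
    print(grabTree('classifierStorage.txt'))  # same dict as myTree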




# File that plots the decision tree
# treePlotter.py


#!/usr/bin/python
# -*- coding:utf-8 -*-

import matplotlib.pyplot as plt


decisionNode = dict(boxstyle="sawtooth", fc="0.8")  # node box style and fill color (0-1 grayscale; larger is lighter)
leafNode = dict(boxstyle="round4", fc="0.8")
arrow = dict(arrowstyle="<-")  # style of the connecting arrows




def plotNode(nodeTxt, centerPt, parentPt, nodeType):  # draw a node and the arrow linking it to its parent
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', xytext=centerPt,
                            textcoords='axes fraction', va="center", ha="center", bbox=nodeType, arrowprops=arrow)




def createPlot(inTree):  # set up the figure and draw the whole tree
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[0.5, 1], yticks=[0.5])  # which tick values to show on the axes
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))  # total width of the tree (number of leaves)
    plotTree.totalD = float(getTreeDepth(inTree))  # total depth of the tree
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()




def getNumLeafs(myTree):  # count the leaf nodes
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs




def getTreeDepth(myTree):  # compute the depth of the tree
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth




def retrieveTree(i):  # return a pre-built sample tree for testing
    listOfTrees = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                   {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
                   ]
    return listOfTrees[i]
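
# Example (hypothetical usage): the first sample tree has 3 leaves and depth 2.
if __name__ == '__main__':
    print(getNumLeafs(retrieveTree(0)))  # 3
    print(getTreeDepth(retrieveTree(0)))  # 2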




def plotMidText(cntrPt, parentPt, txtString):  # label the midpoint of the edge between two nodes
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)




def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)  # center this node horizontally over its leaves
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)  # draw the current decision node
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # an inner node: recurse into the subtree
            plotTree(secondDict[key], cntrPt, str(key))
        else:  # a leaf: draw it directly
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
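
# Example (hypothetical usage, assuming matplotlib is installed): draw the first
# sample tree in a window.
if __name__ == '__main__':
    createPlot(retrieveTree(0))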

To adapt the source code to a particular project, first turn it into a custom module that the decision tree program can call. The specific changes are:

(1) Save the source code above as a module named decisiontree and place it in the same directory as the decision tree program.

(2) Import this module at the top of the decision tree program (line 4 of the snippet below).

#!/usr/bin/python
# -*- coding:utf-8 -*-

import decisiontree
import numpy as np

(3) In the decision tree program, replace the library classifier DecisionTreeClassifier with the custom decisiontree. For the call below to work, the decisiontree module must expose a callable with the same interface; a sketch of such a wrapper follows at the end.

Before the change:

model = DecisionTreeClassifier(criterion='entropy', max_depth=6)

After the change:

model = decisiontree(criterion='entropy', max_depth=6)

After these modifications, the decision tree program runs successfully.
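
For reference, here is a minimal sketch of what the decisiontree module could expose so that the call above works. All names and parameters in it are assumptions for illustration, not part of the original program: the wrapper simply forwards to the createTree()/classify() functions defined earlier, and it accepts criterion and max_depth only for interface compatibility, since the code above always splits on entropy and does not limit depth.

# Hypothetical wrapper (a sketch, appended to the decisiontree module):
class decisiontree(object):
    def __init__(self, criterion='entropy', max_depth=None):
        # Accepted for interface compatibility with DecisionTreeClassifier;
        # the functions above always use entropy and ignore max_depth.
        self.criterion = criterion
        self.max_depth = max_depth
        self.tree = None
        self.labels = None

    def fit(self, dataSet, labels):
        self.labels = labels[:]  # keep a copy: createTree mutates its label list
        self.tree = createTree(dataSet, labels[:])
        return self

    def predict(self, testVec):
        return classify(self.tree, self.labels, testVec)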
