Decision Tree from Scratch

A minimal ID3 decision tree built by hand: information entropy, information gain, and recursive tree construction, demonstrated on the classic play-tennis dataset.

import numpy as np
import pandas as pd
def createDataSet():
    dataSet = [['sunny', 'hot', 'high', 'False', 'no'],
               ['sunny', 'hot', 'high', 'True', 'no'],
               ['overcast', 'hot', 'high', 'False', 'yes'],
               ['rain', 'mild', 'high', 'False', 'yes'],
               ['rain', 'cool', 'normal', 'False', 'yes'],
               ['rain', 'cool', 'normal', 'True', 'no'],
               ['overcast', 'cool', 'normal', 'True', 'yes'],
               ['sunny', 'mild', 'high', 'False', 'no'],
               ['sunny', 'cool', 'normal', 'False', 'yes'],
               ['rain', 'mild', 'normal', 'False', 'yes'],
               ['sunny', 'mild', 'normal', 'True', 'yes'],
               ['overcast', 'mild', 'high', 'True', 'yes'],
               ['overcast', 'hot', 'normal', 'False', 'yes'],
               ['rain', 'mild', 'high', 'True', 'no']]
    features = ['outlook', 'temperature', 'humidity', 'windy', 'play']

    return dataSet,features

# Entropy of the current node: Ent(D) = -sum(p_k * log2(p_k)) over the class labels.
def calcEnt(dataSet):
    label = dataSet.iloc[:,-1].to_list() # the last column holds the class labels
    class_values = np.unique(label) # distinct class values
    shannonEnt = 0
    for class_val in class_values:
        num = label.count(class_val) # number of samples with this label
        shannonEnt += (num/len(label)) * np.log2(num/len(label)) # accumulate p * log2(p) for each class
    return -shannonEnt
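
A quick sanity check (a standalone sketch reusing the functions above; demo_df is an illustrative name, not part of the original script): the full table holds 9 "yes" and 5 "no" rows, so Ent = -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.9403.

data, cols = createDataSet()
demo_df = pd.DataFrame(data=data, columns=cols)
print(calcEnt(demo_df))  # ≈ 0.9403, the entropy of the 9-yes / 5-no label column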

# Compare the conditional entropy of splitting on each feature and pick the one
# with the largest information gain as the decision node.
def chooseBestFeatureToSplit(df):
    baseEntropy = calcEnt(df)
    bestInfoGain = 0
    bestFeature = df.columns[0]  # fall back to the first feature if no split reduces entropy
    # entropy after splitting on each feature column (the label column is excluded)
    for feature in df.columns[:-1]:
        branchs = df[feature].unique() # the branches (distinct values) of this feature
        newEntropy = 0
        for branch in branchs:
            subdf = df[df[feature] == branch] # sub-table of rows on this branch
            newEntropy += len(subdf)/len(df) * calcEnt(subdf) # weighted entropy of the branch
        infoGain = baseEntropy - newEntropy  # information gain: base entropy minus conditional entropy
        if (infoGain > bestInfoGain):  # keep the largest gain seen so far and its feature
            bestInfoGain = infoGain
            bestFeature = feature
    return bestFeature
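
Hand-computing the gains on this table gives roughly 0.247 for outlook, 0.152 for humidity, 0.048 for windy and 0.029 for temperature, so the root split should be outlook. A standalone check sketch:

data, cols = createDataSet()
demo_df = pd.DataFrame(data=data, columns=cols)
print(chooseBestFeatureToSplit(demo_df))  # expected: 'outlook', the largest information gain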

# Majority vote: return the most frequent class label.
def majorityCnt(classList):
    max_num = 0
    max_class = ''
    class_val = np.unique(classList)
    for class_ in class_val:
        num = classList.count(class_)
        if num > max_num:
            max_num = num
            max_class = class_
    return max_class
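
The same vote can also be written with the standard library. A one-line alternative sketch (majorityCnt_alt is a name introduced here; note its tie-breaking follows first-seen order rather than the sorted order np.unique gives above):

from collections import Counter

def majorityCnt_alt(classList):
    # most_common(1) returns [(label, count)] for the most frequent label
    return Counter(classList).most_common(1)[0][0]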


def createTree(dataSet):
    classList = dataSet.iloc[:,-1].to_list()  # the label column: play or not
    class_val = np.unique(classList)
    if len(class_val)==1:
        return class_val[0] # all samples share one label: return it as a leaf
    if len(dataSet.columns) == 1: # no feature columns left to split on: fall back to a majority vote
        return majorityCnt(classList)
    bestFeature = chooseBestFeatureToSplit(dataSet)  # pick the best decision node for the current data
    # e.g. 'outlook' at the root
    myTree = {bestFeature: {}} # the best feature becomes the current decision node
    branchs = dataSet[bestFeature].unique() # branches of this feature
    # recurse into each branch's sub-table to keep growing the tree
    for branch in branchs:
        df = dataSet[dataSet[bestFeature] == branch] # rows that fall on this branch
        subdf = df.drop(bestFeature, axis='columns') # a feature already used as a decision node is dropped from later splits
        myTree[bestFeature][branch] = createTree(subdf)
    return myTree
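
createTree returns a nested dict whose keys are decision-node features and branch values, with class labels at the leaves. The original post stops at building the tree; a minimal prediction sketch (classify and sample are hypothetical names, not part of the original) shows how to walk it:

def classify(tree, sample):
    if not isinstance(tree, dict):
        return tree  # reached a leaf: it is the class label itself
    feature = next(iter(tree))  # the feature tested at this decision node
    branch = sample[feature]    # the sample's value for that feature
    return classify(tree[feature][branch], sample)

# e.g. classify(myTree, {'outlook': 'rain', 'windy': 'True'}) -> 'no'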


if __name__ == '__main__':
    dataSet, features = createDataSet()  # build the sample dataset
    df = pd.DataFrame(data=dataSet, columns=features)
    print(createTree(df))  # print the resulting decision tree
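
With this data the script should print the classic play-tennis tree shown below (leaf labels come from np.unique, so on NumPy 2.x the repr may read np.str_('yes') rather than 'yes'):

# {'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}},
#              'overcast': 'yes',
#              'rain': {'windy': {'False': 'yes', 'True': 'no'}}}}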
