ID3算法实现决策树(python)

注意这里的代码只适用于结果为0或1的二分类问题:
使用的数据:
(插图1:使用的数据示例)
输出的结果:
(插图2:输出的结果示例)
代码:

import numpy as np

data = []
# Feature names for columns 1..4 of each data row.
# Column 0 of a row holds the sample count; the last column is the
# binary class label '0'/'1'.
# NOTE: np.str was deprecated in NumPy 1.20 and removed in 1.24 —
# use the builtin str as the dtype instead.
labels = np.array(["年龄", "收入", "学生", "信用"], dtype=str)

# Each line of data1.txt: count feat1 feat2 feat3 feat4 class,
# separated by single spaces.
with open("data1.txt") as datafile:
    for line in datafile:
        tokens = line.strip().split(' ')  # strip trailing newline, split fields
        data.append(list(tokens))

x = np.array(data)  # full data matrix, all values kept as strings



# Conditional entropy of the class given one feature column.
def my_cor(dataArr, aimEle):
    """Compute the conditional entropy H(class | feature) for a binary class.

    Each row of ``dataArr`` is ``[count, feat_1, ..., feat_k, class]`` stored
    as strings: column 0 is how many identical samples the row represents and
    the last column is the class label ``'0'`` or ``'1'``.

    Parameters
    ----------
    dataArr : np.ndarray of str
        The data rows described above.
    aimEle : array-like of str
        The feature column to condition on (same row order as ``dataArr``).

    Returns
    -------
    float
        The count-weighted conditional entropy H(class | feature).

    Notes
    -----
    The original implementation built the value index with
    ``np.zeros(n).astype(np.str)`` — ``np.str`` was removed in NumPy 1.24,
    so it crashes on modern NumPy; it also collided with the ``'0.0'``
    placeholder strings and shadowed the builtin ``sum``.  A plain dict
    grouping is equivalent and robust.
    """
    # counts[value] -> [total weight of class-0 rows, total weight of class-1 rows]
    counts = {}
    total = 0.0
    for row, value in zip(dataArr, aimEle):
        weight = float(row[0])          # column 0: sample count for this row
        cls = int(row[-1])              # last column: class label 0 or 1
        cell = counts.setdefault(value, [0.0, 0.0])
        cell[cls] += weight
        total += weight

    h_cond = 0.0
    for c0, c1 in counts.values():
        group_total = c0 + c1
        group_entropy = 0.0
        for c in (c0, c1):
            if c > 0:                   # 0 * log(0) is treated as 0
                p = c / group_total
                group_entropy -= p * np.log2(p)
        h_cond += (group_total / total) * group_entropy
    return h_cond


# Information gain: pick the best feature to split on.
def myGs(tagname, dataArr):
    """Return the index (into ``tagname``) of the feature with the largest
    information gain.

    tagname : array of feature names; feature i lives in column i + 1 of
              ``dataArr`` (column 0 is the per-row sample count).
    dataArr : rows of ``[count, features..., class]`` as strings, class is
              ``'0'`` or ``'1'``.
    """
    # Count-weighted totals per class for the overall entropy H(S).
    total = 0.0
    weight0 = 0.0
    weight1 = 0.0
    for row in dataArr:
        w = float(row[0])
        total += w
        if row[-1] == '0':
            weight0 += w
        else:
            weight1 += w

    p0 = weight0 / total
    p1 = weight1 / total
    HS = -p0 * np.log2(p0) - p1 * np.log2(p1)

    # Gain for each feature: H(S) - H(S | feature).
    gains = np.zeros(len(tagname), float)
    for idx in range(len(tagname)):
        column = np.array(dataArr[:, idx + 1])
        cond_entropy = my_cor(dataArr, column)
        gains[idx] = HS - cond_entropy
        print(tagname[idx] + " 条件熵是:", cond_entropy, "信息增益是", gains[idx])

    # np.where returns a tuple of index arrays; take the first maximum.
    return np.where(gains == np.max(gains))[0][0]


def splitArr(index, dataArr, ag_val):
    """Partition the data on one feature value.

    index   : column index of the feature being split on
    dataArr : data array (rows of strings)
    ag_val  : the feature value selecting which rows to keep
    return  : rows whose column ``index`` equals ``ag_val``, with that
              column removed.
    """
    kept = [np.delete(row, index) for row in dataArr if row[index] == ag_val]
    return np.array(kept)


def build_tree(tagname, dataArr):
    """Recursively build an ID3 decision tree.

    tagname : array of remaining feature names (feature i is column i + 1
              of ``dataArr``; column 0 is the per-row sample count).
    dataArr : rows of ``[count, features..., class]`` as strings.
    return  : a nested dict ``{feature: {value: subtree}}``, a leaf class
              label (``'0'``/``'1'``), a majority-class int when features
              are exhausted, or None for an empty partition.
    """
    if dataArr.size == 0:
        return None  # empty partition — nothing to decide on
    if len(set(dataArr[:, -1])) == 1:
        return dataArr[0, -1]  # pure node: only one class left
    if tagname.size == 1:
        # Features exhausted: return the majority class.
        # BUG FIX: np.bincount requires integers, but the labels are the
        # strings '0'/'1' — the original raised TypeError here.
        # NOTE(review): this majority vote counts rows, not the per-row
        # sample counts in column 0 — confirm that is intended.
        return np.argmax(np.bincount(dataArr[:, -1].astype(int)))

    best_tag_index = myGs(tagname, dataArr)
    best_tag = tagname[best_tag_index]
    myTree = {best_tag: {}}
    # Feature columns are offset by 1 because column 0 holds the counts.
    featValues = dataArr[:, best_tag_index + 1]
    uniqueVals = set(featValues)
    subLabels = np.delete(tagname, best_tag_index)  # drop the chosen feature
    for value in uniqueVals:
        myTree[best_tag][value] = build_tree(
            subLabels, splitArr(best_tag_index + 1, dataArr, value))
    return myTree


# Train on the full data set and print the resulting nested-dict tree.
print(build_tree(labels, x))

参考blog:https://blog.csdn.net/colourful_sky/article/details/82056125

你可能感兴趣的:(机器学习)