决策树算法实现(ID3分类树)

此处为ID3决策树

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Toy "is it a fish?" data set. The last column ('fish') is the class label;
# the other columns are binary features.
data = DataFrame({
    'no surfacing': [1, 1, 1, 0, 0],
    'flippers': [1, 1, 0, 1, 1],
    'fish': ['yes', 'yes', 'no', 'no', 'no'],
})
# Encode the label numerically: 'yes' -> 1, anything else -> 0.
data['fish'] = data['fish'].eq('yes').astype(int)


def get_entropy(dataset):
    '''
    Compute the Shannon entropy (in bits) of the label column.

    The last column of ``dataset`` is treated as the class label.

    :param dataset: DataFrame whose last column holds the class labels
    :return: entropy of the label distribution; 0 for a pure or empty dataset
    '''
    label_sum = dataset.shape[0]  # number of labels == number of rows
    label_series = dataset.iloc[:, -1].value_counts()  # rows per class
    p = label_series / label_sum
    # A categorical label column reports unobserved categories with a count
    # of 0; 0 * log2(0) would evaluate to NaN, so drop those entries (their
    # contribution to the entropy is 0 by convention).
    p = p[p > 0]
    entropy = (-p * np.log2(p)).sum()
    return entropy


def best_split(dataset):
    '''
    Pick the feature column with the highest information gain (ID3 criterion).

    The last column of ``dataset`` is treated as the class label; every other
    column is a candidate feature.

    :param dataset: DataFrame whose last column holds the class labels
    :return: positional index of the best feature column, or -1 when there
             is no feature column to evaluate
    '''
    label_entropy = get_entropy(dataset)  # label entropy before any split
    total_rows = dataset.shape[0]
    best_info_gain = -1  # best information gain seen so far
    best_column = -1  # position of the column that produced it
    # Every column except the trailing label column is a candidate.
    for column in range(dataset.shape[1] - 1):
        feature = dataset.iloc[:, column]
        conditional_entropy = 0
        # Weighted sum of the label entropy inside each value's subset.
        for value in feature.value_counts().index:
            subset = dataset[feature == value]
            weight = subset.shape[0] / total_rows
            conditional_entropy += weight * get_entropy(subset)
        # The gain is only meaningful once ALL values of the column have
        # been accumulated; evaluating it on a partial sum overstates the
        # gain and can select the wrong column.
        info_gain = label_entropy - conditional_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_column = column
    return best_column


def split_by_column(dataset, column, value):
    '''
    Return the rows where one column equals ``value``, without that column.

    :param dataset: DataFrame to split
    :param column: positional index of the column to filter on
    :param value: value the selected column must equal
    :return: the matching rows with the selected column dropped
    '''
    target = dataset.columns[column]
    # Boolean mask of the rows that belong to this branch.
    matching_rows = dataset[target] == value
    branch_rows = dataset.loc[matching_rows, :]
    # The column has been consumed by the current node, so remove it.
    return branch_rows.drop(columns=target)


def create_tree(dataset):
    '''
    Recursively build an ID3 decision tree as nested dicts.

    The last column of ``dataset`` is treated as the class label.

    :param dataset: DataFrame whose last column holds the class labels
    :return: a class label for a leaf, otherwise a dict of the form
             ``{feature_name: {feature_value: subtree_or_label}}``
    '''
    feature_list = list(dataset.columns)
    # value_counts() sorts by frequency, so position 0 is the majority class.
    class_list = dataset.iloc[:, -1].value_counts()
    # Recursion ends when only the label column is left (majority vote) or
    # when every row carries the same label. ``.iloc[0]`` is required here:
    # plain ``class_list[0]`` is a LABEL lookup when the classes are integers
    # and raises KeyError for a pure subset whose class is not 0.
    if dataset.shape[1] == 1 or class_list.iloc[0] == dataset.shape[0]:
        return class_list.index[0]
    best_column = best_split(dataset)  # position of the feature to split on
    best_feature = feature_list[best_column]
    my_tree = {best_feature: {}}
    value_list = set(dataset.iloc[:, best_column])
    # One branch per distinct value of the chosen feature.
    for value in value_list:
        temp = split_by_column(dataset, best_column, value)
        my_tree[best_feature][value] = create_tree(temp)
    return my_tree


def save_tree(tree, path='./my_tree.npy'):
    '''
    Persist a decision tree to disk with ``np.save``.

    The tree is a plain dict, so NumPy stores it as a pickled object array.
    Read it back with ``np.load(path, allow_pickle=True).item()``, and only
    load files from a trusted source, because unpickling can execute code.

    :param tree: the tree to store
    :param path: destination file; defaults to ``./my_tree.npy``
    :return: None
    '''
    np.save(path, tree)


def classify(tree, all_columns, test_data):
    '''
    Classify one sample by walking a trained decision tree.

    :param tree: nested dict ``{feature_name: {feature_value: subtree_or_label}}``
    :param all_columns: list of column names; the position of a feature name
                        in this list is the position of its value in
                        ``test_data``
    :param test_data: feature values of one sample (a Series or any sequence
                      indexable by position)
    :return: the predicted class label, or None when the sample holds a
             feature value for which the tree has no branch
    '''
    # The first key of a node is the name of the feature it tests.
    current_node = next(iter(tree))
    branches = tree[current_node]
    feature_index = all_columns.index(current_node)
    # Read the value by POSITION. Integer keys on a Series with a non-integer
    # index only work through a deprecated fallback, so prefer ``.iloc``
    # whenever the sample provides it.
    if hasattr(test_data, 'iloc'):
        sample_value = test_data.iloc[feature_index]
    else:
        sample_value = test_data[feature_index]

    for branch_value, subtree in branches.items():
        if sample_value == branch_value:
            # An inner node is a dict: keep descending. Anything else is a
            # leaf holding the class label.
            if isinstance(subtree, dict):
                return classify(subtree, all_columns, test_data)
            return subtree
    # No branch matches this value (it was not seen while training).
    return None


def score(train_data, test_data):
    '''
    Train a tree on ``train_data`` and report its accuracy on ``test_data``.

    Both frames must share the same column layout, with the class label in
    the last column. The accuracy is printed and also returned.

    :param train_data: DataFrame used to build the tree
    :param test_data: DataFrame to evaluate on
    :return: fraction of test rows whose predicted label equals the true one
    '''
    tree = create_tree(train_data)
    all_columns = list(train_data.columns)
    # Classify every row from its feature values (all but the last column).
    result = [
        classify(tree, all_columns, test_data.iloc[row, :-1])
        for row in range(test_data.shape[0])
    ]
    actual = test_data.iloc[:, -1]
    # Reuse the index of the true labels: comparing two Series whose indexes
    # differ raises ValueError, which a default RangeIndex would trigger for
    # any test frame that is not labelled 0..n-1.
    predicted = Series(result, index=actual.index)
    accuracy = (actual == predicted).mean()
    print(accuracy)
    return accuracy


if __name__ == '__main__':
    # Demo run: train on the whole toy data set, then evaluate on its first
    # three rows.
    train_data, test_data = data, data.head(3)
    score(train_data=train_data, test_data=test_data)

你可能感兴趣的:(机器学习算法)