ID3算法的python实现

数据集

  • 数据集:MNIST数据,图片大小是28×28的,10个类别,使用数据的原始特征,所以每个样本有28×28=784个特征。
  • 图片中的每个元素值都经过二值化
  • 剪枝使用的是预剪枝。

代码

import cv2
import time
import logging
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# 二值化
def binaryzation(img, threshold=50):
    """Binarize pixel rows in place with an inverted threshold.

    Every value is first cast to ``uint8`` (as the OpenCV-based version did)
    and then mapped to 1 when it is ``<= threshold`` and to 0 otherwise, which
    is the mapping of ``cv2.THRESH_BINARY_INV`` with a maximum value of 1.

    Args:
        img: 2-D ``numpy.ndarray`` of pixel rows, or a mutable sequence of
            1-D pixel arrays. It is modified in place.
        threshold: Cut-off value; defaults to 50.

    Returns:
        None.
    """
    if isinstance(img, np.ndarray):
        # One vectorized pass instead of one library call per image row.
        img[...] = img.astype(np.uint8) <= threshold
        return
    # Generic mutable sequence of rows: replace each row by its binary form.
    for i, row in enumerate(img):
        pixels = np.asarray(row).astype(np.uint8)
        img[i] = (pixels <= threshold).astype(np.uint8)

# 树节点类
class Tree(object):
    """Decision-tree node.

    Instance fields:
        node_type: Tag supplied by the caller; the string 'leaf' ends prediction.
        Child: Dict mapping a feature value to the sub-tree for that value.
        Class: Label returned when this node is a leaf.
        feature: Index into a sample used to choose the child to descend into.
    """

    def __init__(self, node_type, Class=None, feature=None):
        self.node_type = node_type
        self.Child = dict()
        self.Class, self.feature = Class, feature

    def add_tree(self, val, tree):
        """Register ``tree`` as the child reached when the split feature equals ``val``."""
        self.Child[val] = tree

    def predict(self, features):
        """Walk from this node down to a leaf and return that leaf's ``Class``.

        At each non-leaf node the child is looked up by
        ``features[node.feature]``; a value with no registered child raises
        ``KeyError``.
        """
        node = self
        while node.node_type != 'leaf':
            node = node.Child[features[node.feature]]
        return node.Class

# Compute the empirical entropy H(x)
def calc_ent(x):
    """Return the empirical Shannon entropy (base 2) of the values in ``x``.

    Args:
        x: 1-D array-like of discrete values (labels or feature values).

    Returns:
        The entropy in bits as a float; 0.0 for empty input.
    """
    x = np.asarray(x)
    if x.size == 0:
        return 0.0
    # np.unique counts every distinct value in one pass, replacing one
    # boolean-mask scan of ``x`` per distinct value.
    _, counts = np.unique(x, return_counts=True)
    probs = counts / x.size
    # Written as ``0.0 - sum`` so a single-valued input yields 0.0, not -0.0.
    return 0.0 - float(np.sum(probs * np.log2(probs)))


# 计算条件熵H(y|x)
def calc_condition_ent(x, y):
    """Return the conditional entropy H(y|x).

    ``y`` is partitioned by the distinct values found in ``x``. The entropy of
    each partition (computed by ``calc_ent``) is weighted by that partition's
    share of ``y``, and the weighted terms are summed.
    """
    def weighted_entropy(part):
        # Fraction of samples in this partition times the partition's entropy.
        return (float(part.shape[0]) / y.shape[0]) * calc_ent(part)

    return sum((weighted_entropy(y[x == value]) for value in set(x)), 0.0)


def Create_tree(train_set, train_label, features, epsilon):
    """Recursively build an ID3 decision tree with pre-pruning.

    Args:
        train_set: 2-D array of samples; columns are addressed by the entries
            of ``features``.
        train_label: 1-D array of labels aligned with the rows of ``train_set``.
        features: List of column indices still available for splitting. It is
            not modified; each split works on its own copy.
        epsilon: Pair ``(min_gain, min_samples)``. A node becomes a leaf when
            its best information gain is below ``min_gain`` or when it holds
            at most ``min_samples`` samples.

    Returns:
        The root ``Tree`` node of the (sub)tree.
    """
    LEAF = 'leaf'
    INTERNAL = 'internal'

    # Step 1: every sample has the same label -> leaf with that label.
    label_set = list(set(train_label))
    if len(label_set) == 1:
        return Tree(LEAF, Class=label_set[-1])

    # Step 2: no feature left, or too few samples (pre-pruning) -> majority leaf.
    max_class, _ = Counter(list(train_label)).most_common(1)[0]
    if len(features) == 0 or len(train_label) <= epsilon[1]:
        return Tree(LEAF, Class=max_class)

    # Step 3: pick the feature with the largest information gain.
    # ``None`` (not 0) marks "no feature with positive gain" so that column 0
    # is never selected by accident.
    max_feature = None
    max_inf_gain = 0
    HD = calc_ent(train_label)  # entropy of the labels at this node
    for feature in features:
        gda = HD - calc_condition_ent(train_set[:, feature], train_label)
        if gda > max_inf_gain:
            max_inf_gain, max_feature = gda, feature

    # Step 4: no informative feature, or gain below the threshold -> majority leaf.
    if max_feature is None or max_inf_gain < epsilon[0]:
        return Tree(LEAF, Class=max_class)

    # Step 5: split on the chosen feature; each feature is used at most once
    # along any root-to-leaf path.
    sub_features = features.copy()
    sub_features.remove(max_feature)
    tree = Tree(INTERNAL, feature=max_feature)

    split_column = train_set[:, max_feature]
    # ``tolist()`` turns the values into plain Python scalars for the child keys.
    for feature_value in set(split_column.tolist()):
        mask = split_column == feature_value  # computed once, used for rows and labels
        sub_tree = Create_tree(train_set[mask], train_label[mask], sub_features, epsilon)
        tree.add_tree(feature_value, sub_tree)
    return tree


def predict(test_set, tree):
    """Classify every sample in ``test_set`` with ``tree``.

    Returns a NumPy array holding ``tree.predict(sample)`` for each sample,
    in iteration order.
    """
    return np.array([tree.predict(sample) for sample in test_set])


if __name__ == '__main__':
    # Script entry point: load the CSV, binarize the pixels, split the data,
    # train the tree, predict on the held-out part and report the accuracy.
    print('Start read data')
    started = time.time()
    data = pd.read_csv('../data/train.csv').values  # full table as an array
    print("data shape:", data.shape)
    # Column 0 holds the label; the remaining columns are the pixel features.
    labels = data[:, 0]
    imgs = data[:, 1:]
    binaryzation(imgs)  # modifies ``imgs`` in place
    print("imgs shape:", imgs.shape)
    print("labels shape:", labels.shape)

    # Hold out a 0.33 share of the samples for testing; fixed seed for repeatability.
    split = train_test_split(imgs, labels, test_size=0.33, random_state=23323)
    train_features, test_features, train_labels, test_labels = split
    print("train data count :%d" % len(train_labels))
    print("test data count :%d" % len(test_labels))

    print('read data cost ', time.time() - started, ' second')

    print('Start training')
    started = time.time()
    feature_ids = list(range(784))
    pruning = (0.1, 10)  # (minimum information gain, minimum sample count)
    tree = Create_tree(train_features, train_labels, feature_ids, pruning)
    print('training cost ', time.time() - started, ' second')

    print('Start predicting')
    started = time.time()
    test_predict = predict(test_features, tree)
    print('predicting cost ', time.time() - started, ' second')

    score = accuracy_score(test_labels, test_predict)
    print("The accruacy socre is ", score)

你可能感兴趣的:(ID3算法的python实现)