CART算法

        CART(Classification And Regression Tree,分类与回归树)算法既可以用于分类问题,也可以用于回归问题。CART假设决策树是二叉树,内部节点对特征的判断结果为“是”或者“否”:左边的分支对应“是”,右边的分支对应“否”。对于分类树,CART采用基尼指数(Gini index)来选择最优特征及其最优切分点;对于回归树,则采用平方误差最小化准则。本文实现的是分类树。

 

import cv2
import time
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
import struct

total_class = 10

# 这里选用了一个比较小的数据集,因为过大的数据集会导致栈溢出

# 二值化
def binaryzation(img):
    """Return an inverted 0/1 ``uint8`` copy of ``img``.

    The input is first cast to ``np.uint8`` (which yields a new array, so the
    caller's data is left untouched). The threshold is then applied in place
    on that copy with ``cv2.THRESH_BINARY_INV``: values above 50 become 0 and
    every other value becomes 1.
    """
    pixels = img.astype(np.uint8)
    # Source and destination are the same buffer, so the result keeps the
    # exact shape of the cast copy.
    cv2.threshold(pixels, 50, 1, cv2.THRESH_BINARY_INV, dst=pixels)
    return pixels


#@log
def binaryzation_features(trainset):
    """Binarize every sample of ``trainset`` and return them as flat rows.

    Each sample is reshaped to a 28x28 ``uint8`` image, passed through
    ``binaryzation`` and collected; the stacked result is flattened back so
    that every row holds 784 binary values.
    """
    binarized = [
        binaryzation(np.reshape(sample, (28, 28)).astype(np.uint8))
        for sample in trainset
    ]
    return np.reshape(np.array(binarized), (-1, 784))

def load_mnist(path, kind='train'):
    """Load one MNIST split stored as IDX files inside ``path``.

    :param path: directory holding ``<kind>-labels.idx1-ubyte`` and
        ``<kind>-images.idx3-ubyte``
    :param kind: file-name prefix of the split (``'train'`` by default)
    :return: ``(images, labels)`` where ``images`` is the output of
        ``binaryzation_features`` for the raw pixels and ``labels`` is a
        ``uint8`` array read from the label file
    """
    label_file = os.path.join(path, '%s-labels.idx1-ubyte' % kind)
    image_file = os.path.join(path, '%s-images.idx3-ubyte' % kind)

    with open(label_file, 'rb') as handle:
        # The label file starts with an 8-byte header (two big-endian
        # unsigned 32-bit integers). Its values are not needed, but the bytes
        # must be consumed before the payload is read.
        struct.unpack('>II', handle.read(8))
        labels = np.fromfile(handle, dtype=np.uint8)  # one byte per label

    with open(image_file, 'rb') as handle:
        # The image file starts with a 16-byte header (four big-endian
        # unsigned 32-bit integers), likewise skipped.
        struct.unpack('>IIII', handle.read(16))
        raw_pixels = np.fromfile(handle, dtype=np.uint8)

    # One row of 784 pixels per label; the reshape fails if the byte count
    # after the header is not exactly len(labels) * 784.
    images = binaryzation_features(raw_pixels.reshape(len(labels), 784))
    return images, labels
    # NOTE (original author): the training image file was observed to be
    # 9912422 bytes for 60000 samples, which does not match the sample count,
    # presumably because that file was compressed. This loader reads the
    # bytes as-is and performs no decompression.


class TreeNode(object):
    """A node of the binary decision tree."""

    def __init__(self, **kwargs):
        """Create a node from optional keyword arguments.

        Recognised keywords (each defaults to ``None`` when omitted):

        attr_index:  index of the feature this node splits on
        attr:        feature value that sends a sample to the left child
        label:       class stored in the node (set on leaves)
        left_child:  left sub-tree
        right_child: right sub-tree

        Any other keyword is silently ignored.
        """
        for field in ('attr_index', 'attr', 'label',
                      'left_child', 'right_child'):
            setattr(self, field, kwargs.get(field))


# 计算数据集的基尼指数
def gini_train_set(train_label):
    """Return the Gini index of a label array.

    For every distinct label the share ``p`` of samples carrying it is
    computed and ``p * (1 - p)`` is accumulated. An empty array yields
    ``0.0``. ``train_label`` must support element-wise ``==`` (a NumPy
    array).
    """
    total = len(train_label)
    impurity = 0.0
    for cls in set(train_label):  # each distinct class once
        share = float(np.count_nonzero(train_label == cls)) / total
        impurity += share * (1 - share)
    return impurity


# 计算一个特征不同切分点的基尼指数,并返回最小的
def gini_feature(train_feature, train_label):
    """Find the value of one feature whose "equal / not equal" split is purest.

    For every distinct value ``v`` of ``train_feature`` the samples are split
    into those equal to ``v`` and those different from ``v``; the Gini index
    of the split is the size-weighted sum of the two label subsets' Gini
    indexes.

    :param train_feature: one feature column (NumPy array)
    :param train_label: labels aligned with ``train_feature`` (NumPy array)
    :return: ``(smallest weighted Gini index, value that produced it)``;
        ``(inf, 0)`` when the column is empty
    """
    best_gini = float('inf')
    best_value = 0
    sample_count = len(train_feature)
    for candidate in set(train_feature):
        is_candidate = train_feature == candidate
        weight_equal = float(np.count_nonzero(is_candidate)) / sample_count
        weight_other = 1 - weight_equal
        split_gini = (weight_equal * gini_train_set(train_label[is_candidate])
                      + weight_other * gini_train_set(train_label[~is_candidate]))
        # Strict comparison: on ties the first candidate encountered wins.
        if split_gini < best_gini:
            best_gini = split_gini
            best_value = candidate
    return best_gini, best_value


def get_best_index(train_set, train_label, feature_indexes):
    '''
    Find the best splitting feature and its best splitting value among the
    features that are still available.

    :param train_set: 2-D sample array (one row per sample); must contain at
        least one row because the column count is read from ``train_set[0]``
    :param train_label: labels aligned with the rows of ``train_set``
    :param feature_indexes: collection of column indexes that may still be
        used for splitting
    :return: ``(feature_index, feature_value)`` - the column with the
        smallest weighted Gini index and the value of that column to split
        on; ``(0, 0)`` if no column of ``train_set`` is in ``feature_indexes``
    '''
    # Build the lookup once: testing membership in a list is a linear scan,
    # and it used to be repeated for every column.
    available = set(feature_indexes)
    min_gini = float('inf')
    feature_index = 0         # best splitting feature (column index)
    return_feature_value = 0  # best splitting value of that feature
    # Columns are visited in ascending order so that ties are resolved in
    # favour of the lowest column index.
    for i in range(len(train_set[0])):
        if i not in available:
            continue
        gini, feature_value = gini_feature(train_set[:, i], train_label)
        if gini < min_gini:
            min_gini = gini
            feature_index = i
            return_feature_value = feature_value
    return feature_index, return_feature_value


# 根据最优特征和最优切分点(最优特征中的某一个特征值)划分数据集
def divide_train_set(train_set, train_label, feature_index, feature_value):
    """Split samples and labels by whether a feature equals a given value.

    :param train_set: samples to split (one row per sample)
    :param train_label: labels aligned with ``train_set``
    :param feature_index: column used for the split
    :param feature_value: rows whose column equals this value go left,
        every other row goes right
    :return: ``(left, right, left_label, right_label)`` as NumPy arrays; a
        side that receives no sample is an empty 1-D array
    """
    left_rows, left_tags = [], []
    right_rows, right_tags = [], []
    for position, sample in enumerate(train_set):
        if sample[feature_index] == feature_value:
            rows, tags = left_rows, left_tags
        else:
            rows, tags = right_rows, right_tags
        rows.append(sample)
        tags.append(train_label[position])
    return (np.array(left_rows), np.array(right_rows),
            np.array(left_tags), np.array(right_tags))

n = 0  # leftover debugging counter; kept at module level for compatibility
def build_tree(train_set, train_label, feature_indexes):
    """Recursively build a CART classification tree.

    :param train_set: 2-D sample array (one row per sample)
    :param train_label: NumPy array of labels aligned with ``train_set``
    :param feature_indexes: column indexes still available for splitting;
        this list is NOT modified
    :return: root ``TreeNode`` of the (sub-)tree, or ``None`` when
        ``train_label`` is empty. A split on a feature whose value is the
        same for every sample sends all samples to one side, so an internal
        node may have a ``None`` child.
    """
    distinct_labels = set(train_label)
    # All samples share one class: nothing left to split.
    if len(distinct_labels) == 1:
        return TreeNode(label=train_label[0])
    # No samples at all.
    if len(distinct_labels) == 0:
        return None
    # No feature left, or the node is already pure enough: emit a leaf
    # labelled with the most frequent class (first one wins on ties).
    if len(feature_indexes) == 0 or gini_train_set(train_label) < 0.1:
        majority = max(
            distinct_labels,
            key=lambda candidate: np.count_nonzero(train_label == candidate),
        )
        return TreeNode(label=majority)

    # Choose the best feature / value pair and split the data on it.
    feature_index, feature_value = get_best_index(train_set, train_label, feature_indexes)
    left, right, left_label, right_label = divide_train_set(train_set, train_label, feature_index, feature_value)

    # Drop the used feature in a fresh list. Removing it in place would share
    # one list between both branches (and the caller), so every feature
    # consumed while building the left subtree would be unavailable to the
    # right subtree.
    remaining = [index for index in feature_indexes if index != feature_index]

    # Recurse; neither call mutates ``remaining``, so both see the same set.
    left_branch = build_tree(left, left_label, remaining)
    right_branch = build_tree(right, right_label, remaining)
    return TreeNode(left_child=left_branch,
                    right_child=right_branch,
                    attr_index=feature_index,
                    attr=feature_value)

def predict_one(node, test):
    """Classify a single sample by walking the tree from ``node``.

    At every internal node the sample goes to the left child when its value
    at ``attr_index`` equals ``attr`` and to the right child otherwise.

    :param node: root of the tree (may be ``None``)
    :param test: indexable sample
    :return: label of the leaf that is reached, or ``10`` when the walk ends
        on a missing (``None``) node
    """
    current = node
    while True:
        if current is None:
            return 10
        if current.label is not None:
            return current.label
        goes_left = test[current.attr_index] == current.attr
        current = current.left_child if goes_left else current.right_child


#@log
def predict(tree, test_set):
    """Return a list with the predicted label of every sample in ``test_set``.

    Each sample is classified independently with ``predict_one`` using
    ``tree`` as the root.
    """
    return [predict_one(tree, sample) for sample in test_set]


if __name__ == '__main__':
    # Script entry point: load MNIST from ./data, train a CART tree on the
    # first 1000 training samples, classify the first 1000 test samples and
    # print the timing of each stage plus the accuracy.
    print("CART")
    print("Start read data...")

    t1 = time.time()
    train_features, train_labels = load_mnist("data", kind='train')
    train_features = train_features[0:1000]
    train_labels = train_labels[0:1000]
    test_features, test_labels = load_mnist("data", kind='t10k')
    test_features = test_features[0:1000]
    test_labels = test_labels[0:1000]
    t2 = time.time()
    print("读取数据用时:" + str((t2-t1)))

    print('Start training...')
    # All 784 pixel columns are available at the root.
    tree = build_tree(train_features, train_labels, list(range(784)))
    t3 = time.time()
    print("训练数据用时:" + str((t3-t2)))

    print('Start predicting...')
    # predict returns a plain Python list with one label per test sample.
    test_predict = predict(tree, test_features)
    t4 = time.time()
    print("预测结果用时:" + str((t4-t3)))

    # Count the predictions that match the ground truth.
    correct = sum(
        1
        for predicted, actual in zip(test_predict, test_labels)
        if predicted is not None and predicted == actual
    )
    # Guard against an empty test set instead of dividing by zero.
    score = float(correct) / len(test_predict) if test_predict else 0.0
    print("The accuracy score is %f" % score)


参考链接:

1.https://github.com/Dod-o/Statistical-Learning-Method_Code

你可能感兴趣的:(CART算法)