CART(Classification And Regression Tree,分类与回归树)算法既可以用于分类问题,也可以用于回归问题。CART假设决策树是二叉树,内部节点对特征的判断结果为“是”或“否”:左分支对应判断为“是”的样本,右分支对应判断为“否”的样本。对于分类树,CART采用基尼指数(Gini index)来选择最优特征及其最优切分点;对于回归树,则采用平方误差最小化准则。
import logging
import os
import struct
import time

import cv2
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Number of classes (digits 0-9). predict_one() also returns this value as an
# "unknown" marker, because it lies outside the valid label range.
total_class = 10
# NOTE (original author): the script below only uses a small subset of the
# data, because the recursive tree construction overflowed the stack on
# larger sets.


def binaryzation(img):
    """Return an inverted binary (0/1) uint8 copy of ``img``.

    Pixels with a value greater than 50 become 0, all other pixels become 1
    (``cv2.THRESH_BINARY_INV`` with ``maxval=1``). The input is not modified.
    """
    cv_img = img.astype(np.uint8)  # astype() copies, so the caller's data is safe
    cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
    return cv_img


# @log
def binaryzation_features(trainset):
    """Binarize every 784-pixel sample of ``trainset``.

    :param trainset: iterable of samples, each reshapeable to 28x28
    :return: array of shape (n_samples, 784) holding 0/1 values
    """
    features = [binaryzation(np.reshape(img, (28, 28))) for img in trainset]
    return np.reshape(np.array(features), (-1, 784))


def load_mnist(path, kind='train'):
    """Load binarized MNIST images and their labels from idx files.

    :param path: directory containing ``<kind>-labels.idx1-ubyte`` and
        ``<kind>-images.idx3-ubyte``
    :param kind: file name prefix, e.g. ``'train'`` or ``'t10k'``
    :return: ``(images, labels)``; images are binarized, one 784-long row
        per sample, labels are uint8
    """
    labels_path = os.path.join(path, '%s-labels.idx1-ubyte' % kind)
    images_path = os.path.join(path, '%s-images.idx3-ubyte' % kind)
    with open(labels_path, 'rb') as lbpath:
        # Each 'I' is 4 bytes: the 8-byte header holds the magic number and
        # the item count.
        _magic, _count = struct.unpack('>II', lbpath.read(8))
        # The rest of the file is one unsigned byte per label.
        labels = np.fromfile(lbpath, dtype=np.uint8)
    with open(images_path, 'rb') as imgpath:
        # 16-byte header: magic number, image count, rows, columns.
        _magic, _num, _rows, _cols = struct.unpack('>IIII', imgpath.read(16))
        # Everything after the header is one unsigned byte per pixel.
        raw = np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), 784)
        images = binaryzation_features(raw)
    return images, labels


# NOTE (original author): the training image download is 9912422 bytes for
# 60000 samples, which is less than the raw pixel count, so that archive is
# presumably compressed.


class TreeNode(object):
    """A node of the binary decision tree.

    Keyword arguments (all optional, missing ones default to ``None``):
        attr_index: index of the feature tested at this node
        attr: feature value that sends a sample to the left child
        label: class label (set on leaves only)
        left_child: subtree for samples whose feature equals ``attr``
        right_child: subtree for all other samples
    """

    def __init__(self, **kwargs):
        self.attr_index = kwargs.get('attr_index')
        self.attr = kwargs.get('attr')
        self.label = kwargs.get('label')
        self.left_child = kwargs.get('left_child')
        self.right_child = kwargs.get('right_child')


def gini_train_set(train_label):
    """Return the Gini index ``sum_k p_k * (1 - p_k)`` of a label array.

    An empty label array has a Gini index of 0.0.
    """
    train_label = np.asarray(train_label)
    total = len(train_label)
    if total == 0:
        return 0.0
    _, counts = np.unique(train_label, return_counts=True)
    gini = 0.0
    for count in counts:
        pk = float(count) / total
        gini += pk * (1 - pk)
    return gini


def gini_feature(train_feature, train_label):
    """Find the best "equals value / differs from value" split of one feature.

    :param train_feature: 1-D array with the feature value of every sample
    :param train_label: 1-D array with the label of every sample
    :return: ``(min_gini, value)``, the smallest weighted Gini index and the
        feature value achieving it; ``(inf, 0)`` for empty input
    """
    train_feature = np.asarray(train_feature)
    train_label = np.asarray(train_label)
    total = len(train_feature)
    min_gini = float('inf')
    return_feature_value = 0
    for value in np.unique(train_feature):
        # CART splits on a single value: samples equal to it go to one side,
        # all remaining samples to the other.
        equal_mask = train_feature == value
        d1 = float(np.count_nonzero(equal_mask)) / total
        d2 = 1 - d1
        gini = (d1 * gini_train_set(train_label[equal_mask])
                + d2 * gini_train_set(train_label[~equal_mask]))
        if min_gini > gini:
            min_gini = gini
            return_feature_value = value
    return min_gini, return_feature_value


def get_best_index(train_set, train_label, feature_indexes):
    """Pick the feature and split value with the smallest Gini index.

    :param train_set: 2-D array, one sample per row
    :param train_label: labels matching the rows of ``train_set``
    :param feature_indexes: indexes of the features still available
    :return: ``(feature_index, feature_value)``; ``(0, 0)`` when there is no
        usable candidate
    """
    feature_index = 0
    return_feature_value = 0
    if len(train_set) == 0:
        return feature_index, return_feature_value
    n_features = len(train_set[0])
    min_gini = float('inf')
    # Walk the candidates in ascending order so that ties are resolved in
    # favour of the smallest feature index.
    for i in sorted(set(feature_indexes)):
        if not 0 <= i < n_features:
            continue
        gini, feature_value = gini_feature(train_set[:, i], train_label)
        if gini < min_gini:
            min_gini = gini
            feature_index = i
            return_feature_value = feature_value
    return feature_index, return_feature_value


def divide_train_set(train_set, train_label, feature_index, feature_value):
    """Split samples by whether ``feature_index`` equals ``feature_value``.

    :return: ``(left, right, left_label, right_label)``; the left part holds
        the samples whose feature equals ``feature_value``, the right part
        holds all other samples
    """
    train_set = np.asarray(train_set)
    train_label = np.asarray(train_label)
    if len(train_set) == 0:
        return train_set, train_set, train_label, train_label
    goes_left = train_set[:, feature_index] == feature_value
    return (train_set[goes_left], train_set[~goes_left],
            train_label[goes_left], train_label[~goes_left])


# Kept for backward compatibility: the original code used this as a debug
# counter of build_tree() calls.
n = 0


def _majority_label(train_label):
    """Return the most frequent label of a non-empty label array."""
    classes, counts = np.unique(train_label, return_counts=True)
    return classes[np.argmax(counts)]


def build_tree(train_set, train_label, feature_indexes):
    """Recursively build a CART classification tree.

    :param train_set: 2-D array, one sample per row
    :param train_label: labels matching the rows of ``train_set``
    :param feature_indexes: indexes of the features that may still be used;
        this list is NOT modified
    :return: root ``TreeNode``, or ``None`` when there are no samples
    """
    train_set = np.asarray(train_set)
    train_label = np.asarray(train_label)

    if len(train_label) == 0:
        return None
    if len(np.unique(train_label)) == 1:
        # Every sample has the same class: nothing left to split.
        return TreeNode(label=train_label[0])
    if len(feature_indexes) == 0 or gini_train_set(train_label) < 0.1:
        # No feature left, or the node is pure enough: label the leaf with
        # the most frequent class.
        return TreeNode(label=_majority_label(train_label))

    feature_index, feature_value = get_best_index(
        train_set, train_label, feature_indexes)
    if feature_index not in feature_indexes:
        # No usable candidate feature; stop here instead of recursing forever.
        return TreeNode(label=_majority_label(train_label))

    left, right, left_label, right_label = divide_train_set(
        train_set, train_label, feature_index, feature_value)

    # Build a NEW list without the chosen feature. Removing it in place would
    # change the caller's list and would let the left subtree consume
    # features that the right subtree still needs.
    remaining = [f for f in feature_indexes if f != feature_index]

    left_branch = build_tree(left, left_label, remaining)
    right_branch = build_tree(right, right_label, remaining)

    # If every sample has the same value for the chosen feature, one side is
    # empty. Give that side a majority-class leaf so that unseen feature
    # values still get a valid prediction.
    if left_branch is None:
        left_branch = TreeNode(label=_majority_label(train_label))
    if right_branch is None:
        right_branch = TreeNode(label=_majority_label(train_label))

    return TreeNode(left_child=left_branch, right_child=right_branch,
                    attr_index=feature_index, attr=feature_value)


def predict_one(node, test):
    """Classify one sample by walking the tree from ``node`` down to a leaf.

    :return: the label of the reached leaf, or ``total_class`` (10, not a
        valid digit) when the walk runs into a missing node
    """
    while node is not None and node.label is None:
        if test[node.attr_index] == node.attr:
            node = node.left_child
        else:
            node = node.right_child
    if node is None:
        return total_class
    return node.label


# @log
def predict(tree, test_set):
    """Return the list of predicted labels for every sample of ``test_set``."""
    return [predict_one(tree, test) for test in test_set]


if __name__ == '__main__':
    print("CART")
    print("Start read data...")
    t1 = time.time()
    train_features, train_labels = load_mnist("data", kind='train')
    train_features = train_features[0:1000]
    train_labels = train_labels[0:1000]
    test_features, test_labels = load_mnist("data", kind='t10k')
    test_features = test_features[0:1000]
    test_labels = test_labels[0:1000]
    t2 = time.time()
    print("读取数据用时:" + str((t2 - t1)))

    print('Start training...')
    tree = build_tree(train_features, train_labels,
                      list(range(train_features.shape[1])))
    t3 = time.time()
    print("训练数据用时:" + str((t3 - t2)))

    print('Start predicting...')
    # predict() returns a plain Python list with one label per test sample.
    test_predict = predict(tree, test_features)
    t4 = time.time()
    print("预测结果用时:" + str((t4 - t3)))

    r = sum(1 for predicted, expected in zip(test_predict, test_labels)
            if predicted is not None and predicted == expected)
    score = float(r) / float(len(test_predict))
    print("The accruacy score is %f" % score)
参考链接:
1. https://github.com/Dod-o/Statistical-Learning-Method_Code