Machine Learning Algorithm Implementations (2): Comparing ID3, C4.5, and CART on Classification and Regression

[Series introduction] I am starting this series because I have spent too long studying algorithms only on paper: no matter how well you can recite an algorithm's steps, you cannot truly understand its details until you have implemented it. The planned order for the series is logistic regression, decision trees (CART), AdaBoost, and GBDT, with other algorithms added as my studies progress.

The main purpose of this post is to share the implementation code and to compare how the ID3, C4.5, and CART decision trees differ when applied to classification and regression problems.

Preface

The implementations of ID3 and C4.5 are almost identical; the only difference lies in how the gain of a node split is computed when determining the tree structure.

Enumerating all possible tree structures by brute force and choosing the one with the smallest loss is NP-hard, so a greedy algorithm is used instead: at each step we try to split a leaf node, compute the gain from the split, and keep the split with the largest gain. (In other words, each step chooses a single feature on which to split the current leaf node.)

ID3 uses information gain, C4.5 uses the information gain ratio, and CART uses the Gini index.
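For reference, the three criteria are (D is the data at a node, A a candidate feature, D_v the subset of D where A takes value v, and p_k the proportion of class k in D):

Gain(D, A) = H(D) - Σ_v (|D_v| / |D|) · H(D_v),  with H(D) = -Σ_k p_k · log2(p_k)
GainRatio(D, A) = Gain(D, A) / H_A(D),  with H_A(D) = -Σ_v (|D_v| / |D|) · log2(|D_v| / |D|)
Gini(D) = 1 - Σ_k p_k²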

CART came later and is generally the better implementation. Its classification and regression versions share almost the same code, the only real difference being the impurity measure used at each split, so one can say that the classification tree in CART is essentially the same algorithm as the regression tree.

Overall code flow

First, consider the two use cases, classification and regression (a small numeric illustration follows this list).
[ID3 and C4.5] For classification, information gain (or the gain ratio) is computed and the feature with the largest gain is recursively chosen as the split feature;
for regression, the variance of y within each value of a feature is computed, and the feature with the smallest weighted variance (i.e., the largest variance reduction) is chosen as the split feature.
[CART] Both classification and regression binary-split a single feature. The difference is that classification uses the Gini index to measure class purity and picks the split with the lowest weighted Gini (the largest impurity decrease), while regression scores a split by the mean squared error between each side's target values and that side's mean.
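As a toy illustration (separate from the implementations below), the following snippet scores the same binary split x <= 2.5 once with the weighted Gini index (classification) and once with the weighted MSE (regression); in both cases a smaller value after the split means a better split.

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
y_cls = np.array([0, 0, 1, 1])          # class labels
y_reg = np.array([1.1, 0.9, 3.2, 2.8])  # continuous targets

def gini(y):
    _, cnt = np.unique(y, return_counts=True)
    p = cnt / cnt.sum()
    return 1.0 - np.sum(p ** 2)

def mse(y):
    return np.mean((y - y.mean()) ** 2)

mask = x <= 2.5  # the candidate binary split
for name, criterion, y in [('Gini', gini, y_cls), ('MSE', mse, y_reg)]:
    left, right = y[mask], y[~mask]
    score = len(left) / len(y) * criterion(left) + len(right) / len(y) * criterion(right)
    print(name, 'after split:', score)  # small values indicate a good split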

ID3 code (classification and regression)

Classification (code presented bottom-up):

  1. Entropy and information-gain functions
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

def entropy(feature):
    # Empirical entropy H = -sum_i p_i * log2(p_i) over the values of the column.
    uni_val, cnt = np.unique(feature, return_counts=True)
    H = np.sum([-cnt[i] / np.sum(cnt) * np.log2(cnt[i] / np.sum(cnt)) for i in range(len(uni_val))])
    return H

def InfoGain(dataset, f_test_col, Y_col=-1):
    # Information gain = entropy before the split minus the conditional entropy
    # after splitting on column f_test_col.
    entropy_before = entropy(dataset.iloc[:, Y_col])
    uni_val, cnt = np.unique(dataset.iloc[:, f_test_col], return_counts=True)
    entropy_cond = np.sum([(cnt[i] / np.sum(cnt)) * entropy(dataset.where(dataset.iloc[:, f_test_col]
                                                                          == uni_val[i]).dropna().iloc[:, Y_col])
                           for i in range(len(uni_val))])
    return entropy_before - entropy_cond
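C4.5 differs from ID3 only at this step: the gain is divided by the split information, i.e. the entropy of the feature itself. A minimal sketch of that variant, reusing entropy and InfoGain above (the name GainRatio is mine and is not used by the code that follows):

def GainRatio(dataset, f_test_col, Y_col=-1):
    # Gain ratio = information gain / split information of the feature.
    split_info = entropy(dataset.iloc[:, f_test_col])
    if split_info == 0:  # the feature takes a single value; splitting on it is useless
        return 0
    return InfoGain(dataset, f_test_col, Y_col) / split_info

To build a C4.5 tree, simply have gen_tree below select best_f with GainRatio instead of InfoGain.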
  2. Tree generation
def gen_tree(dataset, org_dataset, f_cols, Y_col=-1, p_node_cls=None):
    # Stopping cases: 1) only one class left; 2) empty node; 3) no features left to split on.
    if len(np.unique(dataset.iloc[:, Y_col])) <= 1:
        return np.unique(dataset.iloc[:, Y_col])[0]
    elif len(dataset) == 0:
        uni_cls, cnt = np.unique(
            org_dataset.iloc[:, Y_col], return_counts=True)
        return uni_cls[np.argmax(cnt)]
    elif len(f_cols) == 0:
        return p_node_cls
    else:
        # majority class at the current node
        cur_uni_cls, cnt = np.unique(dataset.iloc[:, Y_col], return_counts=True)
        cur_node_cls = cur_uni_cls[np.argmax(cnt)]
        del cur_uni_cls, cnt
        # pick the feature with the largest information gain
        gains = [InfoGain(dataset, f_col) for f_col in f_cols]
        best_f = f_cols[np.argmax(gains)]
        # remove the chosen feature from the candidate list
        f_cols = [col for col in f_cols if col != best_f]
        # partition the data by each value of the best feature and recurse
        tree = {best_f: {}}
        for val in np.unique(dataset.iloc[:, best_f]):
            sub_data = dataset.where(dataset.iloc[:, best_f] == val).dropna()
            sub_tree = gen_tree(sub_data, dataset, f_cols, Y_col, cur_node_cls)
            tree[best_f][val] = sub_tree
    return tree
  3. Training and prediction
def fit(X_train, Y_train):
    dataset = np.c_[X_train, Y_train]
    dataset = pd.DataFrame(dataset, columns=list(range(dataset.shape[1])))
    tree = gen_tree(dataset, dataset, list(range(dataset.shape[1] - 1)))
    return tree
def predict_one(X_test, tree, default=-1):
    for feature in list(X_test.keys()):
        if feature in list(tree.keys()):  # this feature is the one the (sub)tree splits on
            try:
                sub_tree = tree[feature][X_test[feature]]
                if isinstance(sub_tree, dict):  # descend while there is still a subtree
                    return predict_one(X_test, tree=sub_tree)
                else:
                    return sub_tree
            except KeyError:  # unseen feature value: fall back to default
                return default

def predict(X_test, tree):
    X_test = pd.DataFrame(X_test, columns=list(range(X_test.shape[1]))).to_dict(orient='records')
    Y_pred = list()
    for item in X_test:
        Y_pred.append(predict_one(item, tree=tree))
    return Y_pred
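A quick usage sketch on the iris data (train_test_split is assumed to come from sklearn; because ID3 splits on every distinct value of the continuous iris features, unseen values at prediction time fall back to default=-1, so the accuracy is only illustrative):

from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=0)
tree = fit(X_train, Y_train)
Y_pred = predict(X_test, tree)
print('accuracy (illustrative only):', np.mean([p == t for p, t in zip(Y_pred, Y_test)]))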

Regression (code presented bottom-up):

  1. Weighted variance of the target under a given feature
import numpy as np
import pandas as pd

def Var(data, f_test_col, y_col=-1):
    # Weighted variance of y over the groups induced by the values of feature f_test_col;
    # a smaller value means the feature splits the targets into more homogeneous groups.
    f_uni_val = np.unique(data.iloc[:, f_test_col])
    f_var = 0
    for val in f_uni_val:
        cutset = data[data.iloc[:, f_test_col] == val].reset_index()
        # guard against NaN from the unbiased variance of a single-row group
        cur_var = (len(cutset) / len(data)) * (np.var(cutset.iloc[:, y_col], ddof=1) if len(cutset) > 1 else 0)
        f_var += cur_var
    return f_var
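A quick numeric check on a toy DataFrame in which feature 0 perfectly separates the targets, so the weighted variance is small:

toy = pd.DataFrame({0: [1, 1, 2, 2], 1: [5.0, 5.2, 9.0, 9.4]})
print(Var(toy, 0))  # ~0.05: splitting on feature 0 leaves very homogeneous target groups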
  2. Tree generation
def gen_tree(data, org_dataset, f_cols, min_instances=5, y_col=-1, p_node_mean=None):
    # Stopping cases: empty node, too few samples, or no features left to split on.
    if len(data) == 0:
        return np.mean(org_dataset.iloc[:, y_col])
    elif len(data) <= int(min_instances):
        return np.mean(data.iloc[:, y_col])
    elif len(f_cols) == 0:
        return p_node_mean
    else:
        p_node_mean = np.mean(data.iloc[:, y_col])
        # pick the feature with the smallest weighted target variance
        # (equivalently, the largest variance reduction)
        f_vars = [Var(data, f) for f in f_cols]
        best_f_idx = np.argmin(f_vars)
        best_f = f_cols[best_f_idx]

        tree = {best_f: {}}
        features = [f for f in f_cols if f != best_f]
        for val in np.unique(data.iloc[:, best_f]):
            subset = data.where(data.iloc[:, best_f] == val).dropna()
            tree[best_f][val] = gen_tree(subset, data, features, min_instances, y_col, p_node_mean)
        return tree
  3. Training and prediction
def fit(X_train, y_train):
    dataset = np.c_[X_train, y_train]
    dataset = pd.DataFrame(dataset, columns=list(range(dataset.shape[1])))
    return gen_tree(dataset, dataset, list(range(dataset.shape[1] - 1)))

def predict_one(x_test, tree, default=0):
    for feature in list(x_test.keys()):
        if feature in list(tree.keys()):
            try:
                sub_tree = tree[feature][x_test[feature]]
                if isinstance(sub_tree, dict):
                    return predict_one(x_test, tree=sub_tree)
                else:
                    return sub_tree
            except KeyError:  # unseen feature value: fall back to default
                return default

def predict(X_test, tree):
    X_test = pd.DataFrame(X_test, columns=list(range(X_test.shape[1]))).to_dict(orient='records')
    y_pred = []
    for item in X_test:
        y_pred.append(predict_one(item, tree))
    return np.array(y_pred)
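A usage sketch on synthetic data (the setup with discrete-valued features is my own toy example; as with the classifier, unseen feature values at prediction time fall back to default=0):

rng = np.random.RandomState(0)
X = rng.randint(0, 5, size=(100, 3)).astype(float)    # discrete-valued features
y = X[:, 0] * 2.0 + rng.normal(scale=0.1, size=100)   # target mostly driven by feature 0
tree = fit(X, y)
print(predict(X[:5], tree))  # predictions should be close to y[:5]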

CART code (classification and regression)

Classification (code presented bottom-up):

  1. Gini index and binary-split functions
import numpy as np
from scipy import stats

class DecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=5, min_samples_leaf=5, min_impurity_decrease=0.0):
        self.__max_depth = max_depth
        self.__min_samples_split = min_samples_split
        self.__min_samples_leaf = min_samples_leaf
        self.__min_impurity_decrease = min_impurity_decrease
        self.tree = None
        self.__nodes = 0

    def __Gini(self, data, y_idx=-1):
        # data layout: [features..., sample_weight, label]; column -2 holds the sample
        # weights, so this is a weighted Gini index: 1 - sum_k (w_k / w_total)**2.
        K = np.unique(data[:, y_idx])
        w_total = np.sum(data[:, -2])
        gini_idx = 1 - np.sum([np.square(np.sum(data[data[:, y_idx] == k][:, -2]) / w_total) for k in K])
        return gini_idx

    def __BinSplitData(self, data, f_idx, f_val):
        # binary split: rows with feature f_idx <= f_val go left, the rest go right
        left = data[data[:, f_idx] <= f_val]
        right = data[data[:, f_idx] > f_val]
        return left, right
  2. Finding the best (feature, split value) pair
    def __FindBestPair(self, data):
        n_sample, n_feature = data.shape
        n_feature -= 2  # the last two columns are the sample weight and the label

        if n_sample < self.__min_samples_split or len(np.unique(data[:, -1])) == 1:
            return None, stats.mode(data[:, -1])[0][0]
        Gini_before = self.__Gini(data)
        best_gain = 0
        best_f_idx = None
        best_f_val = stats.mode(data[:, -1])[0][0]  # default: the node's majority class

        for f_idx in range(n_feature):
            for f_val in np.unique(data[:, f_idx]):
                data_left, data_right = self.__BinSplitData(data, f_idx, f_val)
                if len(data_left) < self.__min_samples_leaf or len(data_right) < self.__min_samples_leaf:
                    continue
                Gini_after = np.sum(data_left[:, -2]) / np.sum(data[:, -2]) * self.__Gini(data_left) + \
                    np.sum(data_right[:, -2]) / np.sum(data[:, -2]) * self.__Gini(data_right)
                gain = Gini_before - Gini_after  # impurity decrease; larger is better
                if gain < self.__min_impurity_decrease or gain < best_gain:
                    continue
                else:
                    best_gain = gain
                    best_f_idx, best_f_val = f_idx, f_val
        return best_f_idx, best_f_val
  3. Building the classification tree
    def __CART(self, data):
        best_f_idx, best_f_val = self.__FindBestPair(data)
        self.__nodes += 1

        if best_f_idx is None:
            return best_f_val

        if self.__max_depth:
            # crude depth control: stop once the number of created nodes reaches 2**max_depth
            if self.__nodes >= 2 ** self.__max_depth:
                return stats.mode(data[:, -1])[0][0]

        tree = dict()
        tree['cut_f'] = best_f_idx
        tree['cut_val'] = best_f_val

        data_left, data_right = self.__BinSplitData(data, best_f_idx, best_f_val)
        tree['left'] = self.__CART(data_left)
        tree['right'] = self.__CART(data_right)

        return tree
  4. Training and prediction
    def fit(self, X_train, y_train, sample_weight=None):
        # default to uniform sample weights; the weights are appended as the second-to-last
        # column (used by __Gini), with the labels as the last column
        if sample_weight is None:
            sample_weight = np.array([1 / len(X_train)] * len(X_train))
        data = np.c_[X_train, sample_weight, y_train]
        self.tree = self.__CART(data)

    def __predict_one(self, x_test, tree):
        if isinstance(tree, dict):
            cut_f_idx, cut_f_val = tree['cut_f'], tree['cut_val']
            sub_tree = tree['left'] if x_test[cut_f_idx] <= cut_f_val else tree['right']
            return self.__predict_one(x_test, sub_tree)
        else:
            return tree

    def predict(self, X_test):
        return np.array([self.__predict_one(x_test, self.tree) for x_test in X_test])
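A usage sketch on the iris data (train_test_split is assumed; the hyperparameters are illustrative, and the stats.mode(...)[0][0] indexing above assumes a scipy version earlier than 1.11, where mode still returns arrays):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=0)
clf = DecisionTreeClassifier(max_depth=4, min_samples_split=5, min_samples_leaf=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('accuracy:', np.mean(y_pred == y_test))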

Regression (code presented bottom-up):

  1. MSE and binary-split functions
import numpy as np

class DecisionTreeRegressor:
    def __init__(self, max_depth=None, min_samples_split=5, min_samples_leaf=5, min_impurity_decrease=0.0):
        self.__max_depth = max_depth
        self.__min_samples_split = min_samples_split
        self.__min_samples_leaf = min_samples_leaf
        self.__min_impurity_decrease = min_impurity_decrease
        self.tree = None
        self.__nodes = 0
    def __MSE(self, data, y_idx=-1):
        # mean squared error of the targets around their mean at this node
        n_sample = len(data)
        mean = np.mean(data[:, y_idx])
        return np.sum(np.square(data[:, y_idx] - mean)) / n_sample

    def __BinSplitData(self, data, f_idx, f_val):
        # binary split: rows with feature f_idx <= f_val go left, the rest go right
        left = data[data[:, f_idx] <= f_val]
        right = data[data[:, f_idx] > f_val]
        return left, right
  2. Finding the best (feature, split value) pair
    def __FindBestPair(self, data):
        n_sample, n_feature = data.shape
        # if the targets are already pure or the node is too small, return a leaf directly
        if len(np.unique(data[:, -1])) == 1 or n_sample < self.__min_samples_split:
            return None, np.mean(data[:, -1])
        MSE_before = self.__MSE(data)
        best_gain = 0
        best_f_idx = None
        best_f_val = np.mean(data[:, -1])  # default: the node's mean target

        for f_idx in range(n_feature - 1):
            for f_val in np.unique(data[:, f_idx]):
                data_left, data_right = self.__BinSplitData(data, f_idx, f_val)
                if len(data_left) < self.__min_samples_leaf or len(data_right) < self.__min_samples_leaf:
                    continue
                MSE_after = len(data_left) / len(data) * self.__MSE(data_left) + \
                    len(data_right) / len(data) * self.__MSE(data_right)
                gain = MSE_before - MSE_after  # error reduction; larger is better
                if gain < best_gain or gain < self.__min_impurity_decrease:
                    continue
                else:
                    best_gain = gain
                    best_f_idx, best_f_val = f_idx, f_val
        return best_f_idx, best_f_val
  3. Building the regression tree
    def __CART(self, data):
        best_f_idx, best_f_val = self.__FindBestPair(data)
        self.__nodes += 1
        if best_f_idx is None:
            return best_f_val
        if self.__max_depth:
            # crude depth control: stop once the number of created nodes reaches 2**max_depth
            if self.__nodes >= 2 ** self.__max_depth:
                return np.mean(data[:, -1])

        tree = dict()
        tree['cut_f'] = best_f_idx
        tree['cut_val'] = best_f_val

        data_left, data_right = self.__BinSplitData(data, best_f_idx, best_f_val)
        tree['left'] = self.__CART(data_left)
        tree['right'] = self.__CART(data_right)

        return tree
  4. Training and prediction
    def fit(self, X_train, y_train):
        data = np.c_[X_train, y_train]
        self.tree = self.__CART(data)

    def __predict_one(self, x_test, tree):
        if isinstance(tree, dict):
            cut_f_idx, cut_val = tree['cut_f'], tree['cut_val']
            sub_tree = tree['left'] if x_test[cut_f_idx] <= cut_val else tree['right']
            return self.__predict_one(x_test, sub_tree)
        else:  # leaf node: return its value directly
            return tree

    def predict(self, X_test):
        return np.array([self.__predict_one(x_test, self.tree) for x_test in X_test])
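A usage sketch on synthetic data (the sine-curve setup is my own toy example):

rng = np.random.RandomState(0)
X = np.sort(5 * rng.rand(200, 1), axis=0)                  # one continuous feature
y = np.sin(X).ravel() + rng.normal(scale=0.1, size=200)    # noisy sine target
reg = DecisionTreeRegressor(max_depth=4, min_samples_leaf=5)
reg.fit(X, y)
y_pred = reg.predict(X)
print('train MSE:', np.mean((y_pred - y) ** 2))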
