【Series intro】I am starting this series because I have been studying these algorithms on paper for too long: however well you can recite the steps, you do not really understand the details until you have implemented them. The planned order is logistic regression, decision trees (CART), AdaBoost, and GBDT; other algorithms may be added as my study progresses.
The main purpose of this post is to present the implementation code and to compare how the ID3, C4.5, and CART decision trees differ when used for classification and for regression.
The ID3 and C4.5 implementations are almost identical; the only difference is how the gain of a split is computed when growing the tree.
Enumerating every possible tree structure and picking the one with the smallest loss is NP-hard, so a greedy strategy is used instead: at each step, try to split a leaf node, compute the gain of the split, and keep the split with the largest gain (in other words, each step chooses one feature to split the current leaf on).
ID3 uses information gain, C4.5 uses the information gain ratio, and CART uses the Gini index.
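To make the three criteria concrete, here is a minimal sketch (helper names of my own, separate from the implementations below) that computes information gain, gain ratio, and the weighted Gini index for a single categorical feature:
import numpy as np

def _entropy(y):
    # H(Y) = -sum_k p_k * log2(p_k)
    _, cnt = np.unique(y, return_counts=True)
    p = cnt / cnt.sum()
    return -np.sum(p * np.log2(p))

def _gini(y):
    # Gini(Y) = 1 - sum_k p_k^2
    _, cnt = np.unique(y, return_counts=True)
    p = cnt / cnt.sum()
    return 1 - np.sum(p ** 2)

def split_criteria(x, y):
    # evaluate one categorical feature x against labels y
    vals, cnt = np.unique(x, return_counts=True)
    w = cnt / cnt.sum()                                          # P(X = v)
    cond_entropy = sum(wi * _entropy(y[x == v]) for v, wi in zip(vals, w))
    info_gain = _entropy(y) - cond_entropy                       # ID3
    gain_ratio = info_gain / _entropy(x)                         # C4.5 (split info = H(X))
    gini = sum(wi * _gini(y[x == v]) for v, wi in zip(vals, w))  # weighted Gini after the split (CART minimizes this)
    return info_gain, gain_ratio, gini

x = np.array([0, 0, 0, 1, 1, 1])
y = np.array([0, 0, 1, 1, 1, 1])
print(split_criteria(x, y))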
CART came later and is the better-engineered of the three; its classification and regression implementations are almost identical, so in a sense the CART classification tree and the CART regression tree are the same tree with different impurity measures.
First, compare the two tasks, classification and regression.
【ID3 and C4.5】For classification, compute the information gain (or gain ratio) of each feature and recursively split on the feature with the largest gain;
for regression, compute the weighted variance of y within each value of a feature and split on the feature with the smallest weighted variance (i.e. the largest variance reduction).
【CART】For both classification and regression, a feature is split into two halves at a threshold. For classification, the Gini index measures class purity on each side, and the split with the lowest weighted Gini (the largest Gini decrease) is chosen; for regression, the criterion is the weighted mean squared error of the two halves around their respective means, and the split with the largest error reduction is chosen.
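As a quick numerical illustration of the two CART criteria (toy data of my own, independent of the implementations below), a single binary split at a threshold t is scored like this; the best split minimizes the weighted score, i.e. maximizes the decrease from the parent node:
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])          # one numeric feature
y_cls = np.array([0, 0, 0, 1, 1, 1])                  # labels for the classification case
y_reg = np.array([1.1, 0.9, 1.0, 3.2, 2.9, 3.1])      # targets for the regression case
t = 3.0                                               # candidate threshold: x <= t vs x > t
left, right = x <= t, x > t

def gini(y):
    _, cnt = np.unique(y, return_counts=True)
    p = cnt / cnt.sum()
    return 1 - np.sum(p ** 2)

w_l, w_r = left.mean(), right.mean()                  # fraction of samples on each side
gini_after = w_l * gini(y_cls[left]) + w_r * gini(y_cls[right])     # classification score
mse_after = w_l * np.var(y_reg[left]) + w_r * np.var(y_reg[right])  # regression score
print(gini_after, mse_after)   # t = 3.0 separates the data well, so both scores are close to 0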
Classification (ID3/C4.5, bottom-up code):
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd
def entropy(feature):
uni_val, cnt = np.unique(feature, return_counts=True)
    H = np.sum([-cnt[i] / np.sum(cnt) * np.log2(cnt[i] / np.sum(cnt)) for i in range(len(uni_val))])
return H
def InfoGain(dataset, f_test_col, Y_col=-1):
entropy_before = entropy(dataset.iloc[:, Y_col])
uni_val, cnt = np.unique(dataset.iloc[:, f_test_col], return_counts=True)
entropy_cond = np.sum([(cnt[i] / np.sum(cnt)) * entropy(dataset.where(dataset.iloc[:, f_test_col]
== uni_val[i]).dropna().iloc[:, Y_col])
for i in range(len(uni_val))])
return entropy_before - entropy_cond
def gen_tree(dataset, org_dataset, f_cols, Y_col=-1, p_node_cls=None):
    # Stopping rules: 1) only one class left; 2) empty node; 3) no features left to split on
    if len(np.unique(dataset.iloc[:, Y_col])) <= 1:
        return np.unique(dataset.iloc[:, Y_col])[0]
elif len(dataset) == 0:
uni_cls, cnt = np.unique(
org_dataset.iloc[:, Y_col], return_counts=True)
return uni_cls[np.argmax(cnt)]
elif len(f_cols) == 0:
return p_node_cls
else:
        # majority class of the current node (used as the fallback label for empty child nodes)
cur_uni_cls, cnt = np.unique(dataset.iloc[:, Y_col], return_counts=True)
cur_node_cls = cur_uni_cls[np.argmax(cnt)]
del cur_uni_cls, cnt
        # pick the feature with the largest information gain as the split feature
gains = [InfoGain(dataset, f_col) for f_col in f_cols]
best_f = f_cols[np.argmax(gains)]
        # remove the chosen feature from the candidate feature list
f_cols = [col for col in f_cols if col != best_f]
        # partition the data on each value of the best feature and recurse
tree = {best_f: {}}
for val in np.unique(dataset.iloc[:, best_f]):
sub_data = dataset.where(dataset.iloc[:, best_f] == val).dropna()
sub_tree = gen_tree(sub_data, dataset, f_cols, Y_col, cur_node_cls)
tree[best_f][val] = sub_tree
return tree
def fit(X_train, Y_train):
    dataset = np.c_[X_train, Y_train]
dataset = pd.DataFrame(dataset, columns=list(range(dataset.shape[1])))
tree = gen_tree(dataset, dataset, list(range(dataset.shape[1] - 1)))
return tree
def predict_one(X_test, tree, default=-1):
for feature in list(X_test.keys()):
        if feature in list(tree.keys()):  # this feature is the one the current node splits on
try:
sub_tree = tree[feature][X_test[feature]]
                if isinstance(sub_tree, dict):  # check whether a subtree remains below
return predict_one(X_test, tree=sub_tree)
else:
return sub_tree
            except KeyError:  # feature value not seen during training: fall back to default
return default
def predict(X_test, tree):
    X_test = pd.DataFrame(X_test, columns=list(range(X_test.shape[1]))).to_dict(orient='records')
Y_pred = list()
for item in X_test:
Y_pred.append(predict_one(item, tree=tree))
return Y_pred
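A minimal smoke test for the ID3 code above (my own snippet; because branches are keyed on exact feature values, it only makes sense to evaluate on data whose values were seen during training):
X, y = load_iris(return_X_y=True)     # load_iris was imported at the top of this block
tree = fit(X, y)
y_pred = predict(X, tree)             # predict on the training set as a sanity check
print('training accuracy:', np.mean(np.array(y_pred) == y))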
Regression (ID3/C4.5 style, bottom-up code):
import numpy as np
import pandas as pd
def Var(data, f_test_col, y_col=-1):
f_uni_val = np.unique(data.iloc[:, f_test_col])
f_var = 0
    for val in f_uni_val:
        cutset = data[data.iloc[:, f_test_col] == val].reset_index()
        # single-sample subsets contribute zero variance (ddof=1 would be undefined for one sample)
        cur_var = 0.0 if len(cutset) < 2 else (len(cutset) / len(data)) * np.var(cutset.iloc[:, y_col], ddof=1)
        f_var += cur_var
return f_var
def gen_tree(data, org_dataset, f_cols, min_instances=5, y_col=-1, p_node_mean=None):
    # Stopping rules: empty node -> mean of the parent's data; small node -> node mean; no features left -> parent mean
    if len(data) == 0:
        return np.mean(org_dataset.iloc[:, y_col])
    elif len(data) <= int(min_instances):
        return np.mean(data.iloc[:, y_col])
    elif len(f_cols) == 0:
        return p_node_mean
else:
p_node_mean = np.mean(data.iloc[:, y_col])
        f_vars = [Var(data, f) for f in f_cols]
        # choose the feature with the smallest weighted variance, i.e. the largest variance reduction
        best_f_idx = np.argmin(f_vars)
        best_f = f_cols[best_f_idx]
tree = {best_f: {}}
features = [f for f in f_cols if f != best_f]
for val in np.unique(data.iloc[:, best_f]):
subset = data.where(data.iloc[:, best_f] == val).dropna()
tree[best_f][val] = gen_tree(subset, data, features, min_instances, y_col, p_node_mean)
return tree
def fit(X_train, y_train):
dataset = np.c_[X_train, y_train]
dataset = pd.DataFrame(dataset, columns=list(range(dataset.shape[1])))
return gen_tree(dataset, dataset, list(range(dataset.shape[1] - 1)))
def predict_one(x_test, tree, default=0):
for feature in list(x_test.keys()):
if feature in list(tree.keys()):
try:
sub_tree = tree[feature][x_test[feature]]
if isinstance(sub_tree, dict):
return predict_one(x_test, tree=sub_tree)
else:
return sub_tree
            except KeyError:  # unseen feature value: fall back to default
return default
def predict(X_test, tree):
    X_test = pd.DataFrame(X_test, columns=list(range(X_test.shape[1]))).to_dict(orient='records')
y_pred = []
for item in X_test:
y_pred.append(predict_one(item, tree))
return np.array(y_pred)
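A small usage sketch for the regression tree (my own toy data; the features are integer-coded categories because, as above, branches are keyed on exact feature values):
rng = np.random.default_rng(0)
X = rng.integers(0, 3, size=(60, 2)).astype(float)        # two categorical features coded 0/1/2
y = 2.0 * X[:, 0] - X[:, 1] + rng.normal(0, 0.1, size=60) # noisy target
tree = fit(X, y)
print(predict(X[:5], tree))    # leaf means for the first five rows
print(y[:5])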
Classification (CART, bottom-up code):
import numpy as np
from scipy import stats
class DecisionTreeClassifier:
def __init__(self, max_depth=None, min_samples_split=5, min_samples_leaf=5, min_impurity_decrease=0.0):
self.__max_depth = max_depth
self.__min_samples_split = min_samples_split
self.__min_samples_leaf = min_samples_leaf
self.__min_impurity_decrease = min_impurity_decrease
self.tree = None
self.__nodes = 0
def __Gini(self, data, y_idx=-1):
        # weighted Gini: 1 - sum_k (class-k weight / total weight)^2; column -2 holds the sample weights
        K = np.unique(data[:, y_idx])
        gini_idx = 1 - np.sum([np.square(np.sum(data[data[:, y_idx] == k][:, -2]) / np.sum(data[:, -2])) for k in K])
return gini_idx
def __BinSplitData(self, data, f_idx, f_val):
left = data[data[:, f_idx] <= f_val]
right = data[data[:, f_idx] > f_val]
return left, right
def __FindBestPair(self, data):
n_sample, n_feature = data.shape
n_feature -= 2
        # stop splitting: node too small or already pure -> return a leaf labelled with the majority class
        if n_sample < self.__min_samples_split or len(np.unique(data[:, -1])) == 1:
            uni_cls, cnt = np.unique(data[:, -1], return_counts=True)
            return None, uni_cls[np.argmax(cnt)]
Gini_before = self.__Gini(data)
best_gain = 0
best_f_idx = None
        uni_cls, cnt = np.unique(data[:, -1], return_counts=True)
        best_f_val = uni_cls[np.argmax(cnt)]  # default leaf value (majority class) if no valid split is found
for f_idx in range(n_feature):
for f_val in np.unique(data[:, f_idx]):
data_left, data_right = self.__BinSplitData(data, f_idx, f_val)
if len(data_left) < self.__min_samples_leaf or len(data_right) < self.__min_samples_leaf:
continue
                Gini_after = np.sum(data_left[:, -2]) / np.sum(data[:, -2]) * self.__Gini(data_left) + \
                             np.sum(data_right[:, -2]) / np.sum(data[:, -2]) * self.__Gini(data_right)
                gain = Gini_before - Gini_after  # weighted impurity decrease; larger is better
if gain < self.__min_impurity_decrease or gain < best_gain:
continue
else:
best_gain = gain
best_f_idx, best_f_val = f_idx, f_val
return best_f_idx, best_f_val
def __CART(self, data):
best_f_idx, best_f_val = self.__FindBestPair(data)
self.__nodes += 1
if best_f_idx is None:
return best_f_val
if self.__max_depth:
if self.__nodes >= 2 ** self.__max_depth:
                uni_cls, cnt = np.unique(data[:, -1], return_counts=True)
                return uni_cls[np.argmax(cnt)]
tree = dict()
tree['cut_f'] = best_f_idx
tree['cut_val'] = best_f_val
data_left, data_right = self.__BinSplitData(data, best_f_idx, best_f_val)
tree['left'] = self.__CART(data_left)
tree['right'] = self.__CART(data_right)
return tree
def fit(self, X_train, y_train, sample_weight=None):
        # default to uniform sample weights; the weights are stacked as column -2 of the training data
        if sample_weight is None:
            sample_weight = np.full(len(X_train), 1 / len(X_train))
data = np.c_[X_train, sample_weight, y_train]
self.tree = self.__CART(data)
def __predict_one(self, x_test, tree):
if isinstance(tree, dict):
            cut_f_idx, cut_f_val = tree['cut_f'], tree['cut_val']
sub_tree = tree['left'] if x_test[cut_f_idx] <= cut_f_val else tree['right']
return self.__predict_one(x_test, sub_tree)
else:
return tree
def predict(self, X_test):
return np.array([self.__predict_one(x_test, self.tree) for x_test in X_test])
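A quick usage sketch for the CART classifier (my own snippet; it trains on iris and evaluates on the training set only, as a sanity check):
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=5)
clf.fit(X, y)                     # uniform sample weights by default
print('training accuracy:', np.mean(clf.predict(X) == y))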
Regression (CART, bottom-up code):
import numpy as np
class DecisionTreeRegressor:
def __init__(self, max_depth=None, min_samples_split=5, min_samples_leaf=5, min_impurity_decrease=0.0):
self.__max_depth = max_depth
self.__min_samples_split = min_samples_split
self.__min_samples_leaf = min_samples_leaf
self.__min_impurity_decrease = min_impurity_decrease
self.tree = None
self.__nodes = 0
def __MSE(self, data, y_idx=-1):
n_sample = len(data)
mean = np.mean(data[:, y_idx])
return np.sum(np.square(data[:, y_idx] - mean)) / n_sample
def __BinSplitData(self, data, f_idx, f_val):
left = data[data[:, f_idx] <= f_val]
right = data[data[:, f_idx] > f_val]
return left, right
def __FindBestPair(self, data):
n_sample, n_feature = data.shape
        # pure node or fewer samples than min_samples_split: return a leaf (the node mean) directly
if len(np.unique(data[:, -1])) == 1 or n_sample < self.__min_samples_split:
return None, np.mean(data[:, -1])
MSE_before = self.__MSE(data)
best_gain = 0
best_f_idx = None
best_f_val = np.mean(data[:, -1])
for f_idx in range(n_feature - 1):
for f_val in np.unique(data[:, f_idx]):
data_left, data_right = self.__BinSplitData(data, f_idx, f_val)
if len(data_left) < self.__min_samples_leaf or len(data_right) < self.__min_samples_leaf:
continue
MSE_after = len(data_left) / len(data) * self.__MSE(data_left) + \
len(data_right) / len(data) * self.__MSE(data_right)
                gain = MSE_before - MSE_after  # error reduction; larger is better
if gain < best_gain or gain < self.__min_impurity_decrease:
continue
else:
best_gain = gain
best_f_idx, best_f_val = f_idx, f_val
return best_f_idx, best_f_val
def __CART(self, data):
best_f_idx, best_f_val = self.__FindBestPair(data)
self.__nodes += 1
if best_f_idx is None:
return best_f_val
if self.__max_depth:
if self.__nodes >= 2 ** self.__max_depth:
return np.mean(data[:, -1])
tree = dict()
tree['cut_f'] = best_f_idx
tree['cut_val'] = best_f_val
data_left, data_right = self.__BinSplitData(data, best_f_idx, best_f_val)
tree['left'] = self.__CART(data_left)
tree['right'] = self.__CART(data_right)
return tree
def fit(self, X_train, y_train):
data = np.c_[X_train, y_train]
self.tree = self.__CART(data)
def __predict_one(self, x_test, tree):
if isinstance(tree, dict):
cut_f_idx, cut_val = tree['cut_f'], tree['cut_val']
sub_tree = tree['left'] if x_test[cut_f_idx] <= cut_val else tree['right']
return self.__predict_one(x_test, sub_tree)
        else:  # leaf node: return the stored value directly
return tree
def predict(self, X_test):
return [self.__predict_one(x_test, self.tree) for x_test in X_test]
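And a matching sketch for the CART regressor (my own toy data: a noisy sine curve; the tree's training MSE is compared against always predicting the global mean):
rng = np.random.default_rng(0)
X = np.sort(rng.uniform(0, 6, size=(200, 1)), axis=0)   # one numeric feature
y = np.sin(X[:, 0]) + rng.normal(0, 0.1, size=200)      # noisy sine target

reg = DecisionTreeRegressor(max_depth=4, min_samples_leaf=5)
reg.fit(X, y)
y_pred = np.array(reg.predict(X))
print('tree MSE:    ', np.mean((y_pred - y) ** 2))
print('baseline MSE:', np.var(y))                       # MSE of always predicting the mean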