This is a simple from-scratch reimplementation and test of the model based on its mathematical principles, intended purely for self-study. The underlying theory is not elaborated here; I only list the formulas that will actually be used.
If there are errors or shortcomings in the text or code, corrections are most welcome.
A regression tree is a kind of decision tree: a binary tree built on the CART framework. Unlike the classification decision tree, the target variable here is continuous, so the Gini index no longer works as the splitting criterion; squared error is used instead, and it determines both which feature to split on and where to split it.
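Concretely, at each node the split is chosen over candidate features $j$ and split points $s$ by minimizing the combined squared error of the two resulting children. Classical CART minimizes the total sum of squared errors; the implementation below sums the two children's mean squared errors instead, a minor variant of the same idea:

$$
\min_{j,\,s}\;\left[\frac{1}{|R_1|}\sum_{x_i \in R_1(j,s)}\bigl(y_i-\bar{y}_{R_1}\bigr)^2+\frac{1}{|R_2|}\sum_{x_i \in R_2(j,s)}\bigl(y_i-\bar{y}_{R_2}\bigr)^2\right]
$$

where $R_1(j,s)$ and $R_2(j,s)$ are the two sample subsets produced by the split and $\bar{y}_{R_m}$ is the mean label in region $R_m$. Each leaf then predicts the mean label of the samples it holds.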
Here I implement a regression tree myself, with post-pruning based on the number of samples per leaf, the height of the tree, and the mean squared error. Due to limited ability, I did not abstract the functions well while implementing: many features were bolted on as they occurred to me, which leaves the algorithm rather inefficient. Consider it a lesson learned.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import deque
class TreeNode:
    def __init__(self, labels_idx=None, left=None, right=None, split_idx=None, is_discrete=None, split_value=None, father=None) -> None:
        """
        Node structure of the regression tree.
        left: left subtree
        right: right subtree
        labels_idx: indices of the training labels that fall into this node
        is_discrete: whether the split feature is discrete
        split_idx: index of the feature used for splitting
        split_value: the split point
        father: parent node
        """
        self.labels_idx = labels_idx
        self.left = left
        self.right = right
        self.split_idx = split_idx
        self.is_discrete = is_discrete
        self.split_value = split_value
        self.father = father
class RegressionTree:
    def __init__(self, data, labels, is_discrete, validate_ratio=0.1):
        """
        Initialization.
        is_discrete: list of booleans, one per feature, marking whether that feature is discrete
        validate_ratio: fraction of the data held out as a validation set (for post-pruning)
        """
        self.data = np.array(data)
        self.labels = np.array(labels)
        self.feature_num = self.data.shape[1]
        self.is_discrete = is_discrete
        self.validate_ratio = validate_ratio
        self.leaves = []
        if validate_ratio > 0:
            all_index = np.arange(self.data.shape[0])
            self.train_idx, self.test_idx = train_test_split(all_index, test_size=validate_ratio)
            self.validate_data = self.data[self.test_idx, :]
            self.validate_label = self.labels[self.test_idx]
            self.train_data = self.data[self.train_idx, :]
            self.train_label = self.labels[self.train_idx]
    def get_mse(self, y_pred, y_true):
        """
        Compute the MSE between predictions and ground truth.
        """
        y_pred = np.array(y_pred)
        y_true = np.array(y_true)
        return np.mean(np.square(y_pred - y_true))
    def generate_tree(self, idxs, min_ratio):
        """
        Recursively build the tree.
        idxs: indices of the samples contained in this subtree
        min_ratio: minimum fraction of (train + validation) samples a leaf must hold
        """
        root = TreeNode(labels_idx=idxs)
        if len(idxs) / self.data.shape[0] <= min_ratio:
            return root
        idx, split_value = self.choose_feature(self.data[idxs, :], self.labels[idxs])
        root.split_value = split_value
        root.split_idx = idx
        left_idxs = []
        right_idxs = []
        if self.is_discrete[idx]:
            # discrete feature: samples equal to the split value go left, the rest go right
            for i in idxs:
                if self.data[i, idx] != split_value:
                    right_idxs.append(i)
                else:
                    left_idxs.append(i)
        else:
            # continuous feature: samples <= the split value go left, the rest go right
            # (this direction must agree with the one used in predict_one)
            for i in idxs:
                if self.data[i, idx] <= split_value:
                    left_idxs.append(i)
                else:
                    right_idxs.append(i)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            # degenerate split (e.g. every feature value identical): keep this node as a leaf
            root.split_idx = None
            root.split_value = None
            return root
        left_idxs = np.array(left_idxs)
        right_idxs = np.array(right_idxs)
        root.left = self.generate_tree(left_idxs, min_ratio)
        if root.left:
            root.left.father = root
        root.right = self.generate_tree(right_idxs, min_ratio)
        if root.right:
            root.right.father = root
        return root
    def train(self, max_depth=0, min_ratio=0.05):
        """
        Training: build the decision tree, then prune it.
        max_depth: maximum tree height (0 means unlimited)
        min_ratio: minimum fraction of (train + validation) samples a leaf must hold
        """
        if self.validate_ratio > 0:
            idx = self.train_idx
        else:
            idx = np.arange(len(self.labels))
        self.tree = self.generate_tree(idx, min_ratio)
        # when a validation set is held out, apply post-pruning bottom-up from the leaves
        if self.validate_ratio > 0:
            self.leaves = []
            self.find_leaves(self.tree)
            nodes = deque(self.leaves)
            while len(nodes) > 0:
                n = len(nodes)
                for _ in range(n):
                    node = nodes.popleft()
                    if not node.father:
                        # reached the root: nothing left to prune
                        nodes.clear()
                        break
                    valid_pred = self.predict(self.validate_data)
                    mse_before = self.get_mse(valid_pred, self.validate_label)
                    # tentatively prune: turn the parent into a leaf
                    backup_left = node.father.left
                    backup_right = node.father.right
                    node.father.left = None
                    node.father.right = None
                    valid_pred = self.predict(self.validate_data)
                    mse_after = self.get_mse(valid_pred, self.validate_label)
                    if mse_after > mse_before:
                        # pruning hurt validation MSE: restore the children
                        node.father.left = backup_left
                        node.father.right = backup_right
                    else:
                        nodes.append(node.father)
        # enforce the maximum height with a level-order traversal
        if max_depth > 0:
            nodes = deque([self.tree])
            d = 1
            while len(nodes) > 0 and d < max_depth:
                n = len(nodes)
                for _ in range(n):
                    node = nodes.popleft()
                    if node.left:
                        nodes.append(node.left)
                    if node.right:
                        nodes.append(node.right)
                d += 1
            # whatever remains in the queue sits at depth max_depth: cut everything below it
            for node in nodes:
                node.left = None
                node.right = None
    def find_leaves(self, node):
        """
        Collect the leaf nodes of the subtree rooted at node.
        """
        if not node.left and not node.right:
            self.leaves.append(node)
            return None
        else:
            if node.left:
                self.find_leaves(node.left)
            if node.right:
                self.find_leaves(node.right)
    def predict_one(self, x, node=None):
        """
        Predict a single sample by walking down the tree.
        """
        if node is None:
            node = self.tree
        while node.left and node.right:
            idx = node.split_idx
            if self.is_discrete[idx]:
                if x[idx] == node.split_value:
                    node = node.left
                else:
                    node = node.right
            else:
                if x[idx] > node.split_value:
                    node = node.right
                else:
                    node = node.left
        res_idx = node.labels_idx
        # a leaf predicts the mean label of the training samples it contains
        return np.mean(self.labels[res_idx])
    def predict(self, x, node=None):
        """
        Predict a batch of samples.
        """
        x = np.array(x)
        predicts = []
        for i in range(x.shape[0]):
            res = self.predict_one(x[i, :], node)
            predicts.append(res)
        return predicts
    def sum_std(self, x):
        """
        Mean squared deviation from the mean (i.e. the variance of x),
        used as the impurity of a candidate child node.
        """
        return np.sum(np.square(x - np.mean(x))) / len(x)
    def choose_feature(self, x, left_labels):
        """
        Pick the feature and split point that minimize the summed
        impurity of the two child nodes.
        """
        std_list = []
        split_value_list = []
        for i in range(x.shape[1]):
            final_split_value, final_sum_std = self.calc_std(x[:, i], self.is_discrete[i], left_labels)
            std_list.append(final_sum_std)
            split_value_list.append(final_split_value)
        idx = np.argmin(std_list)
        return idx, split_value_list[idx]
    def calc_std(self, feature, is_discrete, labels):
        """
        For a single feature, search for the split that minimizes the
        summed impurity of the two child nodes.
        """
        final_sum_std = float("inf")
        final_split_value = 0
        idx = range(len(feature))
        labels = np.array(labels)
        if is_discrete:
            values = list(set(feature))
            idx_dict = {v: [] for v in values}
            for i, fea in enumerate(feature):
                idx_dict[fea].append(i)
            for v in values:
                anti_idx = [i for i in idx if i not in idx_dict[v]]
                left = labels[idx_dict[v]]
                right = labels[anti_idx]
                if left.shape[0] == 0 or right.shape[0] == 0:
                    continue
                sum_std = self.sum_std(left) + self.sum_std(right)
                if sum_std < final_sum_std:
                    final_sum_std = sum_std
                    final_split_value = v
        else:
            feature = np.asarray(feature, dtype=float)
            # candidate thresholds: midpoints between adjacent distinct sorted values
            values = np.sort(np.unique(feature))
            for v in (values[:-1] + values[1:]) / 2:
                left = labels[feature <= v]
                right = labels[feature > v]
                if left.shape[0] == 0 or right.shape[0] == 0:
                    continue
                sum_std = self.sum_std(left) + self.sum_std(right)
                if sum_std < final_sum_std:
                    final_sum_std = sum_std
                    final_split_value = v
        return final_split_value, final_sum_std
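Before moving to real data, a minimal smoke test on synthetic data (a toy example of my own, not part of the mpg experiment below): the tree should recover a simple step function, so the training MSE should land near the injected noise variance.

rng = np.random.default_rng(0)
x_toy = rng.uniform(0, 10, size=(200, 1))
y_toy = np.where(x_toy[:, 0] < 5, 1.0, 3.0) + rng.normal(0, 0.1, 200)
toy = RegressionTree(x_toy, y_toy, is_discrete=[False], validate_ratio=0.1)
toy.train(max_depth=3)
toy.get_mse(toy.predict(x_toy), y_toy)  # should come out close to 0.01, the noise variance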
Testing with the mpg (miles-per-gallon fuel economy) dataset:
import pandas as pd
df = pd.read_excel("mpg.xlsx")
df.replace("?",pd.NA,inplace=True)
df.dropna(axis=0,inplace=True)
label = df.iloc[:,0].values
data = df.iloc[:,1:5].values
x_train,x_test,y_train,y_test = train_test_split(data,label)
rt = RegressionTree(x_train,y_train,is_discrete=[True,False,False,False],validate_ratio=0.1)
rt.train(max_depth=10)
res = rt.predict(x_test)
rt.get_mse(res,y_test)
"""
53.20787956384344
"""
from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()
dr.fit(x_train,y_train)
res2 = dr.predict(x_test)
rt.get_mse(res2,y_test)
"""
27.945918367346938
"""
The results fall short of sklearn's, and they are not stable either; at times the tree seems to collapse into "only a root node". If you spot any shortcomings or errors, please do not hesitate to point them out.