My previous post implemented a classification decision tree by hand; you can find it here: 【机器学习】手动实现分类决策树 (不用sk-learn).
This post continues in the same spirit, building a regression decision tree step by step, again without the sk-learn library.
Let's get started.
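Before the tree-building code, a few imports are assumed throughout (NumPy for the array math, os for file discovery, randrange for the random train/test split used later):

import os
from random import randrange

import numpy as np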
# Sum of squared differences between the estimated value (the mean) and the true values
def rss(data_set):
    result = data_set[:, -1]
    return np.var(result) * np.shape(data_set)[0]
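choose_best_split below also relies on a split_data_set helper that is not shown in this excerpt. A minimal sketch, assuming the convention that samples whose feature value is <= the threshold go left and the rest go right (the full repo may use a different convention):

def split_data_set(data_set, feat_index, value):
    # Rows whose feature is <= value form the left subset, the rest the right
    left = data_set[data_set[:, feat_index] <= value]
    right = data_set[data_set[:, feat_index] > value]
    return left, right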
# ops = (tols, toln):
#   tols -- minimum error reduction required to keep splitting
#   toln -- minimum number of samples allowed in each child after a split
def choose_best_split(data_set, ops=(1, 4)):
    tols = ops[0]
    toln = ops[1]
    # If all target values are identical, there is nothing to split
    if len(set(data_set[:, -1])) == 1:
        return None, None
    m, n = np.shape(data_set)
    s = rss(data_set)
    best_s = np.inf
    best_index = 0
    best_value = 0
    # Go through each feature
    for feat_index in range(n - 1):
        # Go through every distinct value of the current feature
        for value in np.unique(data_set[:, feat_index]):
            left, right = split_data_set(data_set, feat_index, value)
            # If either child would be too small, skip this candidate split
            if np.shape(left)[0] < toln or np.shape(right)[0] < toln:
                continue
            # Error of the candidate split
            new_s = rss(left) + rss(right)
            # Keep the split with the smallest error so far
            if new_s < best_s:
                best_index = feat_index
                best_value = value
                best_s = new_s
    # If the best split does not reduce the error enough, stop splitting
    if (s - best_s) < tols:
        return None, None
    # If the best split leaves too few samples in a child, stop splitting
    left, right = split_data_set(data_set, best_index, best_value)
    if left.shape[0] < toln or right.shape[0] < toln:
        return None, None
    return best_index, best_value
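As a quick sanity check, here is a made-up toy set with one feature whose targets jump at feature value 2; with the split_data_set sketch above, the chooser finds exactly that threshold (the default ops=(1, 4) would reject every split on such a tiny set, so the constraints are relaxed):

toy = np.array([[1.0, 10.0],
                [2.0, 10.0],
                [8.0, 50.0],
                [9.0, 50.0]])
print(choose_best_split(toy, ops=(1, 1)))  # -> (0, 2.0)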
# The mean of the target values in a leaf's data set is the predicted value
# Returns (value, number of samples)
def count_value(data_set):
    result = data_set[:, -1]
    value = np.mean(result)
    return value, len(result)
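create_division_tree below assembles the tree from three small classes, BinaryTree, DecisionNode and Leaf, which live elsewhere in the repo. A minimal sketch of what the calls below require (the real classes carry more, e.g. the print_tree export used at the very end):

class Leaf:
    def __init__(self, value, num):
        self.value = value  # predicted value (mean of the targets)
        self.num = num      # number of training samples in the leaf

class DecisionNode:
    def __init__(self, feature_id, feature_value, feature_name):
        self.feature_id = feature_id        # column index of the split feature
        self.feature_value = feature_value  # split threshold
        self.feature_name = feature_name    # human-readable feature name

class BinaryTree:
    def __init__(self, key):
        self.key = key  # a DecisionNode (internal node) or a Leaf
        self.leftChild = None
        self.rightChild = None

    def insertLeft(self, node):
        self.leftChild = node

    def insertRight(self, node):
        self.rightChild = node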
def create_division_tree(data_set, feature_names, parent, left=False, right=False, ops=(1, 4)):
    best_index, best_value = choose_best_split(data_set, ops=ops)
    if best_index is None:
        # No worthwhile split: make a leaf and attach it to the parent
        value, num = count_value(data_set)
        node = BinaryTree(Leaf(value, num))
        if left:
            parent.insertLeft(node)
        elif right:
            parent.insertRight(node)
        return node
    else:
        leftData, rightData = split_data_set(data_set, best_index, best_value)
        # Create a DecisionNode for the best split
        tempNode = BinaryTree(DecisionNode(best_index, best_value, feature_names[best_index]))
        if left:  # the current DecisionNode is the left child of its parent
            parent.insertLeft(tempNode)
        elif right:  # ... or the right child
            parent.insertRight(tempNode)
        create_division_tree(leftData, feature_names, tempNode, left=True, ops=ops)  # recurse
        create_division_tree(rightData, feature_names, tempNode, right=True, ops=ops)
        return tempNode
def safe_float(number):
    try:
        return float(number)
    except (TypeError, ValueError):
        return number
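This way numeric strings become floats while any stray non-numeric token passes through unchanged, e.g.:

print(safe_float('0.00632'))  # -> 0.00632
print(safe_float('n/a'))      # -> 'n/a'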
# Load the data: every file under data_root, one whitespace-separated sample per line
data_root = "./data_regress/"
data_files = []
dataSet = []
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
predict_name = 'MEDV'
for root, dirs, files in os.walk(data_root):
    for file in files:
        data_files.append(os.path.join(data_root, file))
print(data_files)
for file in data_files:
    with open(file, 'r') as f:
        for line in f:
            a = list(map(safe_float, line.split()))
            dataSet.append(a)
print(dataSet)
dataSet = np.array(dataSet)  # convert the list of rows to a NumPy array
print('dataSet: ', dataSet.shape)
# Randomly split the data set as in n-fold cross validation:
# one fold becomes the test set, the other n-1 folds the training set
def getTrainTest(dataSet, n_folds):
    train_size = int(len(dataSet) / n_folds) * (n_folds - 1)
    dataSet_copy = list(dataSet)
    train = []
    # A while (not an if) is needed here: an if would only fire once,
    # while keeps drawing samples until the training set is full
    while len(train) < train_size:
        index = randrange(len(dataSet_copy))
        # pop() removes the element at the given index and returns it
        train.append(dataSet_copy.pop(index))
    test = dataSet_copy  # whatever remains is the test set
    return train, test
def getDataSet():
    return dataSet, feature_names, predict_name

# data
dataSet, feature_names, predict_name = getDataSet()
trainData, testData = getTrainTest(dataSet, 5)
trainData = np.array(trainData)
testData = np.array(testData)
print('trainData: ', trainData.shape)
print('testData: ', testData.shape)
# train
my_tree = create_division_tree(trainData, feature_names, None, ops=(10, 5))

def isTree(node):
    # An internal (tree) node is one whose key is a DecisionNode
    return type(node.key).__name__ == 'DecisionNode'
# Collapse a whole subtree into a single leaf
def getLeaf(tree):
    # Wrap collapsed children in a BinaryTree so .key.value below still works
    if isTree(tree.rightChild):
        tree.rightChild = BinaryTree(getLeaf(tree.rightChild))
    if isTree(tree.leftChild):
        tree.leftChild = BinaryTree(getLeaf(tree.leftChild))
    value = (tree.leftChild.key.value + tree.rightChild.key.value) / 2
    num = tree.leftChild.key.num + tree.rightChild.key.num
    leaf = Leaf(value, num)
    return leaf  # return the merged leaf node
# Use testData to prune the tree
def prune(tree, testData):
    # 1. If testData is empty, collapse the subtree into a leaf
    if testData.shape[0] == 0:
        node = BinaryTree(getLeaf(tree))
        return node
    # 2. Split the test data the same way the tree splits
    if isTree(tree.leftChild) or isTree(tree.rightChild):
        leftData, rightData = split_data_set(testData, tree.key.feature_id, tree.key.feature_value)
    # 3. Recurse into any child that is still an internal node
    if isTree(tree.leftChild):
        tree.leftChild = prune(tree.leftChild, leftData)  # left
    if isTree(tree.rightChild):
        tree.rightChild = prune(tree.rightChild, rightData)  # right
    # 4. If both children are now leaves, merge them if that lowers the test error
    if not isTree(tree.leftChild) and not isTree(tree.rightChild):
        leftData, rightData = split_data_set(testData, tree.key.feature_id, tree.key.feature_value)
        # 4-1. test error without merging
        error_left = 0
        error_right = 0
        if leftData.shape[0] != 0:
            error_left = np.sum(np.power(leftData[:, -1] - tree.leftChild.key.value, 2))
        if rightData.shape[0] != 0:
            error_right = np.sum(np.power(rightData[:, -1] - tree.rightChild.key.value, 2))
        errorNoMerge = error_left + error_right
        # 4-2. test error after merging
        treemean = (tree.leftChild.key.value + tree.rightChild.key.value) / 2
        errorMerge = np.sum(np.power(testData[:, -1] - treemean, 2))
        # 4-3. compare and merge if it helps
        if errorMerge < errorNoMerge:
            print("merging")
            node = BinaryTree(getLeaf(tree))
            return node
        else:
            return tree  # keep the split
    return tree  # return the (possibly pruned) tree
# prune
my_tree = prune(my_tree, testData)
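The evaluation loop below uses a test helper that is not shown in this excerpt: it walks one sample down the tree to a leaf and returns the prediction together with the true target. A minimal sketch, assuming the same left/right convention as the split_data_set sketch above:

def test(tree, data):
    # Descend until we reach a leaf, then return (prediction, true value)
    node = tree
    while isTree(node):
        if data[node.key.feature_id] <= node.key.feature_value:
            node = node.leftChild
        else:
            node = node.rightChild
    return node.key.value, data[-1]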
# test
sse = 0
sst = 0
mean_real = np.mean(testData[:, -1])
for data in testData:
    predict_value, real = test(my_tree, data)
    sse += (real - predict_value) ** 2
    sst += (real - mean_real) ** 2
r_square = 1 - sse / sst  # R^2 = 1 - SSE / SST
# save results
SSE = "%.03f" % sse
R2 = "%.03f" % r_square
print("SSE:" + SSE)
print("R2:" + R2)
save_path = './figures/my_regressor_tree_SSE_' + SSE + '_R2_' + R2 + '.gv'
my_tree.print_tree(save_path=save_path)
Full code: https://github.com/shxy522/DecisionTree
If anything here is incorrect, corrections are very welcome!