[Machine Learning] Implementing a Regression Decision Tree by Hand (without sk-learn)

The previous post covered the classification decision tree; the link is here: [Machine Learning] Implementing a Classification Decision Tree by Hand (without sk-learn).
This post again skips the sk-learn library and implements a regression decision tree, step by step:

  1. Tree structure: same as the classification tree
  2. Metric: RSS
  3. Choosing the split feature
  4. Building the tree
  5. Loading the data: Boston housing price prediction
  6. Training
  7. Pruning
  8. Testing

Let's get started.

RSS

import numpy as np

# Residual sum of squares: the total squared deviation of the target values
# from their mean. np.var is the mean squared deviation, so multiplying by
# the number of samples recovers the sum.
def rss(data_set):
    result = data_set[:, -1]  # targets live in the last column
    return np.var(result) * np.shape(data_set)[0]
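A quick sanity check on made-up numbers (not part of the Boston data): n times the variance equals the explicit sum of squared deviations from the mean:

y = np.array([[0, 3.0], [0, 5.0], [0, 7.0]])  # dummy rows, target in the last column
assert np.isclose(rss(y), np.sum((y[:, -1] - y[:, -1].mean()) ** 2))  # both equal 8.0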

Choosing the split feature

# ops = (tols, toln):
# tols -- minimum error reduction required to accept a split
# toln -- minimum number of samples allowed in each branch
def choose_best_split(data_set, ops=(1, 4)):
    tols = ops[0]
    toln = ops[1]

    # If all target values are identical, there is nothing worth splitting
    if len(set(data_set[:, -1])) == 1:
        return None, None

    m, n = np.shape(data_set)
    s = rss(data_set)

    best_s = np.inf
    best_index = 0
    best_value = 0

    # Go through each of the features
    for feat_index in range(n - 1):
        # Go through all values for the current feature
        for value in np.unique(data_set[:, feat_index]):
            left, right = split_data_set(data_set, feat_index, value)
            # If either branch would be too small, skip this candidate split
            if np.shape(left)[0] < toln or np.shape(right)[0] < toln:
                continue
            # Calculate a new error
            new_s = rss(left) + rss(right)

            # Update minimum error
            if new_s < best_s:
                best_index = feat_index
                best_value = value
                best_s = new_s

    # If the best split does not reduce the error by at least tols, give up splitting
    if (s - best_s) < tols:
        return None, None

    # If the best split leaves either branch with too few samples, give up splitting
    left, right = split_data_set(data_set, best_index, best_value)
    if left.shape[0] < toln or right.shape[0] < toln:
        return None, None

    return best_index, best_value
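choose_best_split (and the pruning code later) calls split_data_set, which was defined in the classification-tree post. For completeness, a minimal sketch, assuming a numeric threshold split; which side receives the values above the threshold is an assumption here and should match the original helper:

# Minimal sketch of the split helper, assuming a numeric threshold:
# rows with feature value > value go left, the rest go right
def split_data_set(data_set, feat_index, value):
    left = data_set[data_set[:, feat_index] > value]
    right = data_set[data_set[:, feat_index] <= value]
    return left, right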

Building the tree

# A leaf predicts the mean of the target values in its data set
# Returns (predicted value, number of samples)
def count_value(data_set):
    result = data_set[:, -1]
    value = np.mean(result)
    return value, len(result)



# Recursively build the regression tree; the left/right flags say whether
# to attach the new node as the parent's left or right child
def create_division_tree(data_set, feature_names, parent, left=False, right=False, ops=(1, 4)):
    best_index, best_value = choose_best_split(data_set, ops=ops)
    if best_index is None:
        value, num = count_value(data_set)
        node = BinaryTree(Leaf(value, num))
        if left:
            parent.insertLeft(node)
        elif right:
            parent.insertRight(node)
        return node
    else:
        leftData, rightData = split_data_set(data_set, best_index, best_value)
        # create DecisionNode
        tempNode = BinaryTree(DecisionNode(best_index, best_value, feature_names[best_index]))
        if left:  # if the current DecisionNode is its parent's left child, attach it on the left
            parent.insertLeft(tempNode)
        elif right:  # likewise for the right child
            parent.insertRight(tempNode)
        create_division_tree(leftData, feature_names, tempNode, left=True, ops=ops)  # recurse left
        create_division_tree(rightData, feature_names, tempNode, right=True, ops=ops)  # recurse right

    return tempNode
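create_division_tree builds on the BinaryTree, DecisionNode, and Leaf classes from the classification-tree post. For readers landing here first, a minimal sketch with just the attributes the code in this post relies on (the print_tree method used at the end for Graphviz export is omitted; see the original post):

class Leaf:
    def __init__(self, value, num):
        self.value = value  # predicted value (mean of the leaf's targets)
        self.num = num      # number of training samples in the leaf

class DecisionNode:
    def __init__(self, feature_id, feature_value, feature_name):
        self.feature_id = feature_id        # column index of the split feature
        self.feature_value = feature_value  # threshold passed to split_data_set
        self.feature_name = feature_name    # human-readable feature name

class BinaryTree:
    def __init__(self, key):
        self.key = key          # a DecisionNode or a Leaf
        self.leftChild = None
        self.rightChild = None

    def insertLeft(self, node):
        self.leftChild = node

    def insertRight(self, node):
        self.rightChild = node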

Loading the data

def safe_float(number):
    try:
        return float(number)
    except ValueError:
        return number


import os

# data
data_root = "./data_regress/"
data_files = []
dataSet = []
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
predict_name = 'MEDV'

# collect every data file under data_root (join with root so nested directories also work)
for root, dirs, files in os.walk(data_root):
    for file in files:
        data_files.append(os.path.join(root, file))
print(data_files)
for file in data_files:
    with open(file, 'r') as f:
        for line in f:
            a = list(map(safe_float, line.split()))
            dataSet.append(a)
print(dataSet)
dataSet = np.array(dataSet)  # convert the list of rows to a NumPy array
print('dataSet: ', dataSet.shape)
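Each line of the raw data file is expected to hold 14 whitespace-separated numbers: the 13 features in the order of feature_names, followed by the MEDV target. That matches the classic UCI housing.data layout; adjust the parsing if your files differ.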

# Randomly hold out 1/n_folds of the samples as the test set; the remaining
# (n_folds - 1)/n_folds become the training set
from random import randrange

def getTrainTest(dataSet, n_folds):
    train_size = int(len(dataSet) / n_folds) * (n_folds - 1)
    dataSet_copy = list(dataSet)
    train = []
    # while (not if) keeps drawing until the training set is full; pop()
    # removes the chosen element and returns it, so samples are drawn
    # without replacement
    while len(train) < train_size:
        index = randrange(len(dataSet_copy))
        train.append(dataSet_copy.pop(index))
    test = dataSet_copy
    return train, test


def getDataSet():
    return dataSet, feature_names, predict_name


# data
dataSet, feature_names, predict_name = getDataSet()
trainData, testData = getTrainTest(dataSet, 5)
trainData = np.array(trainData)
testData = np.array(testData)
print('trainData: ', trainData.shape)
print('testData: ', testData.shape)

Training

# train
my_tree = create_division_tree(trainData, feature_names, None, ops=(10, 5))

Pruning
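The strategy here is reduced-error pruning on held-out data: walk the tree bottom-up and merge a pair of sibling leaves into one whenever the merged prediction gives a lower squared error, on the held-out samples that reach that node, than keeping the split does.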

def isTree(node):
    # an internal node holds a DecisionNode as its key; a leaf holds a Leaf
    return type(node.key).__name__ == 'DecisionNode'


# Collapse a subtree into a single leaf
def getLeaf(tree):
    # first collapse any child subtrees, wrapping the returned Leaf in a
    # BinaryTree so that .key below is always valid
    if isTree(tree.rightChild):
        tree.rightChild = BinaryTree(getLeaf(tree.rightChild))
    if isTree(tree.leftChild):
        tree.leftChild = BinaryTree(getLeaf(tree.leftChild))

    # the merged value is the unweighted average of the two children
    value = (tree.leftChild.key.value + tree.rightChild.key.value) / 2
    num = tree.leftChild.key.num + tree.rightChild.key.num
    leaf = Leaf(value, num)
    return leaf  # return a Leaf (the caller wraps it in a BinaryTree)


# use testData to prune
def prune(tree, testData):
    # 1. If no test data reaches this node, collapse the whole subtree into a leaf
    if testData.shape[0] == 0:
        node = BinaryTree(getLeaf(tree))
        return node

    # 2. Split the test data according to this node's split rule
    if isTree(tree.leftChild) or isTree(tree.rightChild):
        leftData, rightData = split_data_set(testData, tree.key.feature_id, tree.key.feature_value)

    # 3. Iteration
    if isTree(tree.leftChild):
        tree.leftChild = prune(tree.leftChild, leftData)  # left
    if isTree(tree.rightChild):
        tree.rightChild = prune(tree.rightChild, rightData)  # right

    # 4. If both children are leaves, check whether merging them lowers the test error
    if not isTree(tree.leftChild) and not isTree(tree.rightChild):
        leftData, rightData = split_data_set(testData, tree.key.feature_id, tree.key.feature_value)

        # compare the error
        # 4-1. get current error
        error_left = 0
        error_right = 0
        if leftData.shape[0] != 0:
            error_left = np.sum(np.power(leftData[:, -1] - tree.leftChild.key.value, 2))
        if rightData.shape[0] != 0:
            error_right = np.sum(np.power(rightData[:, -1] - tree.rightChild.key.value, 2))
        errorNoMerge = error_left + error_right
        # 4-2. get the error after merge
        treemean = (tree.leftChild.key.value + tree.rightChild.key.value) / 2
        errorMerge = np.sum(np.power(testData[:, -1] - treemean, 2))
        # 4-3. compare
        if errorMerge < errorNoMerge:  # to merge
            print("merging")
            node = BinaryTree(getLeaf(tree))
            return node
        else:
            return tree  # return the tree
    return tree  # return the tree



# prune
my_tree = prune(my_tree, testData)

Testing
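The loop below relies on a test helper that routes one sample down the tree and returns the predicted and true values. It is not listed in this post; a minimal sketch, assuming the same threshold convention as the split_data_set sketch above:

# Hypothetical helper: walk the tree with a single sample and return
# (predicted value, true value); assumes "feature > threshold goes left",
# matching the split_data_set sketch above
def test(tree, data):
    node = tree
    while isTree(node):
        if data[node.key.feature_id] > node.key.feature_value:
            node = node.leftChild
        else:
            node = node.rightChild
    return node.key.value, data[-1]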

# test
sse = 0  # residual sum of squares on the test set
sst = 0  # total sum of squares about the test-set mean
mean_real = np.mean(testData[:, -1])
for data in testData:
    predict_value, real = test(my_tree, data)
    sse += ((real - predict_value) ** 2)
    sst += ((real - mean_real) ** 2)
r_square = 1 - sse / sst
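r_square is the coefficient of determination, R² = 1 - SSE/SST: the fraction of the test-set variance in MEDV that the tree explains (1 is perfect, 0 is no better than predicting the mean).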

Saving the model

# save results
SSE = "%.3f" % sse
R2 = "%.3f" % r_square
print("SSE:" + SSE)
print("R2:" + R2)
save_path = './figures/my_regressor_tree_SSE_' + SSE + '_R2_' + R2 + '.gv'
my_tree.print_tree(save_path=save_path)
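The saved .gv file is Graphviz source; assuming Graphviz is installed, it can be rendered to an image with, for example, dot -Tpng <the .gv file> -o tree.png.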

Full code: https://github.com/shxy522/DecisionTree
If anything here is incorrect, corrections are very welcome~
