Machine Learning in Action with Python 3.6: Regression Tree (regTrees) Code Explained

I am a graduate student in mathematics and first encountered Python and machine learning at the end of 2017. As a newcomer myself, I welcome discussion and feedback.

This post focuses on explaining the code. For the theory, I recommend three books:

Machine Learning in Action (Chinese edition)

Machine Learning, Zhou Zhihua

Statistical Learning Methods, Li Hang

Of these three books, the first implements its code in Python 2; the other two mainly fill in the theory that the first book glosses over, and they cover most of it in detail.

There are plenty of blog posts explaining the theory behind Machine Learning in Action, but their quality varies widely; the good ones mostly draw on these three books anyway, and electronic copies are easy to find online.

Honestly, you will get more out of these three books than out of skimming unhelpful blog posts.

Frankly, learning takes a calm, patient mindset; avoid impatience. If something is unclear, read a little every day, understand a little more every day, and it adds up.

Operating system: Windows 8.1

Python version: Python 3.6

Environment: Spyder (Anaconda)

# -*- coding: utf-8 -*-
"""
Created on Fri Mar 16 21:38:34 2018

@author: Loulch C.C
"""

from numpy import *

def loadDataSet(fileName):
    """
    函数说明:数据导入函数
    """
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))
        #map each field to float; in Python 3 map() returns an iterator, so the original code must be wrapped in list()
        dataMat.append(fltLine)
    return dataMat
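#A quick way to check loadDataSet() (my own illustrative snippet; 'ex00.txt' ships with the
#book's source code and stores one feature value plus the target per tab-separated line):
"""
if __name__=='__main__':
    myDat=loadDataSet('ex00.txt')
    print('number of samples:',len(myDat))      #each element of myDat is a list of floats
    print('first sample:',myDat[0])             #[feature value, target value]
#"""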

def binSplitDataSet(dataSet, feature, value): 
    """
    函数说明: 数据切分函数(切分数据集为两个子集:左子集和右子集)
    dataSet: 数据集 
    feature: 待切分特征 
    value: 特征值
    """
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :] 
    #nonzero() returns a tuple of index arrays (array([x,..., x], dtype=int64), ...); [0] picks the
    #row indices of the samples satisfying the condition, and dataSet[[x,..., x], :] copies those whole rows
    """
    The original book code below raises "index 0 is out of bounds"; use the two lines above instead:
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :][0]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :][0]
    """
    return mat0, mat1
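#A minimal check of binSplitDataSet() on a 4x4 identity matrix (illustrative only):
"""
if __name__=='__main__':
    testMat=mat(eye(4))
    mat0,mat1=binSplitDataSet(testMat,1,0.5)
    print('mat0=',mat0)   #the single row whose feature 1 is greater than 0.5
    print('mat1=',mat1)   #the three rows whose feature 1 is less than or equal to 0.5
#"""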

def regLeaf(dataSet):
    """
    函数说明: 当chooseBestSplit()确定不再对数据进行切分时,调用regLeaf()生成叶结点
    返回值: 叶结点(在回归树中,其实就是目标变量的均值)
    """
    return mean(dataSet[:,-1])

def regErr(dataSet):
    """
    函数说明:平方误差函数
    返回值:平方误差
    """
    return var(dataSet[:,-1]) * shape(dataSet)[0]
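#regErr() returns the total squared deviation from the mean, sum((y - mean(y))^2), because
#var() gives the mean squared deviation and multiplying by the sample count undoes the averaging.
#A tiny numeric check (illustrative values):
"""
if __name__=='__main__':
    demoMat=mat([[1.0],[2.0],[3.0]])    #targets 1, 2, 3 with mean 2
    print(regErr(demoMat))              #prints 2.0 = (1-2)^2 + (2-2)^2 + (3-2)^2
#"""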

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """
    函数说明:回归树切分函数(找到数据的最佳二元切分方式)
    返回值:若找不到好的切分方式,本函数在3种情况下不会切分,直接创建叶结点;
            若找到好的切分方式,返回最好的切分的特征编号和切分特征值
    """
    tolS = ops[0]                          #允许的误差下降值,用户指定参数,用于控制函数的停止时机
    tolN = ops[1]                          #切分的最小样本数,用户指定参数,用于控制函数的停止时机
    #tolS,tolN进行的实际上是一种预剪枝处理
    #################################若所有值都相同,则退出##############################
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:    
    #用set()对当前所有目标变量建立一个集合,不含相同项,用len()统计不同剩余特征值数目,若为1,则无需切分
        return None,leafType(dataSet)                
    ####################################################################################
    m,n = shape(dataSet)
    S = errType(dataSet)
    #S is the error of the unsplit data; it is compared against the error after splitting
    #to check whether a new split actually lowers the error
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):                       #iterate over every feature of the data
        # for splitVal in set(dataSet[:,featIndex]): raises an error in Python 3; use the line below
        for splitVal in set((dataSet[:, featIndex].T.A.tolist())[0]):  #iterate over the distinct values of this feature
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) #binary-split on this feature and value
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:                           #keep the split with the smallest error
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    if (S - bestS) < tolS:
    #if the best split barely reduces the error, do not split; create a leaf node instead
        return None,leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
    #finally, if either subset produced by the split is smaller than tolN, stop splitting
        return None, leafType(dataSet)
    return bestIndex,bestValue                         #return the feature index and the split value
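#Illustrative call of chooseBestSplit() on its own (uses ex00.txt from the book's data files):
"""
if __name__=='__main__':
    myMat=mat(loadDataSet('ex00.txt'))
    feat,val=chooseBestSplit(myMat)
    print('best feature index:',feat)   #index of the chosen feature, or None if no split is made
    print('best split value:',val)      #split value, or the leaf value when feat is None
#"""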

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """
    函数说明: 树构建函数
    dataSet: 数据集
    leafType: 建立叶节点函数
    errType: 误差计算函数
    ops: 包含树构建所需的其他参数的元组
    返回值: 构建好的树retTree
    """
    feat,val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat == None: return val      #满足停止条件时返回叶结点值,见chooseBestSplit()
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    #split the data set and build the left and right subtrees
    lSet, rSet = binSplitDataSet(dataSet, feat, val)                    #split the data set
    retTree['left'] = createTree(lSet, leafType, errType, ops)          #build the left subtree
    retTree['right'] = createTree(rSet, leafType, errType, ops)         #build the right subtree
    return retTree
"""
if __name__=='__main__':
    myDat=loadDataSet('ex00.txt')
    myMat=mat(myDat)
    retTree=createTree(myMat)
    print(retTree)
    myDat1=loadDataSet('ex0.txt')
    myMat1=mat(myDat1)
    retTree1=createTree(myMat1)
    print(retTree1)
   
#"""

import matplotlib.pyplot as plt 

def showDataSet():
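    """
    Function: scatter-plot the three sample data sets used in this post (ex00.txt, ex0.txt,
    ex2.txt), one subplot per data set
    """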
    myDat=loadDataSet('ex00.txt') 
    #print('myDat=',myDat)
    myMat=mat(myDat)
    fig, axs = plt.subplots(nrows=3, ncols=1,figsize=(6,19))
    axs[0].scatter(myMat[:,0].flatten().A[0],myMat[:,1].flatten().A[0], c = 'blue')
    myDat1=loadDataSet('ex0.txt')
    #print('myDat1=',myDat1)
    myMat1=mat(myDat1)
    axs[1].scatter(myMat1[:,1].flatten().A[0],myMat1[:,2].flatten().A[0], c = 'blue') 
    myDat2=loadDataSet('ex2.txt') 
    #print('myDat2=',myDat2)
    myMat2=mat(myDat2)
    axs[2].scatter(myMat2[:,0].flatten().A[0],myMat2[:,1].flatten().A[0], c = 'blue')
    plt.show()
    
"""
if __name__=='__main__':
    showDataSet()    
#"""


def isTree(obj):
    """
    函数说明:测试输入变量是否为一棵树
    """
    return (type(obj).__name__=='dict') #判断为字典类型返回true
#
def getMean(tree):
    """
    函数说明:从上往下遍历树直到找到叶节点为止,若找到两个叶节点,计算他们的平均值
    返回值:树的平均值(对树进行塌陷式处理)
    """
    if isTree(tree['right']):
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):
        tree['left'] = getMean(tree['left'])
    return (tree['left']+tree['right'])/2.0
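#Illustrative example of getMean() on a hand-built two-leaf tree (hypothetical values):
"""
if __name__=='__main__':
    demoTree={'spInd': 0, 'spVal': 0.5, 'left': 3.0, 'right': 1.0}
    print(getMean(demoTree))    #prints 2.0, the average of the two leaves
#"""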


#
def prune(tree, testData):
    """
    函数说明:树的后剪枝
    tree:待剪枝的树 
    testData:剪枝所需的测试数据
    返回值:剪枝后的树tree
    """
    if shape(testData)[0] == 0: return getMean(tree)       #no test data reaches this node: collapse the tree
    if (isTree(tree['right']) or isTree(tree['left'])):                 #at least one child is still a subtree
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    if not isTree(tree['left']) and not isTree(tree['right']):    #after pruning, are both children leaves?
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
                       sum(power(rSet[:, -1] - tree['right'], 2))       #test error without merging the leaves
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))          #test error after merging the leaves
        
        if errorMerge < errorNoMerge:                                   #merging lowers the error, so merge
            print("merging")
            return treeMean
        else:
            return tree
    else:
        return tree

"""
if __name__ == '__main__':
    myDat2=loadDataSet('ex2.txt') 
    #print('myDat2=',myDat2)
    myMat2=mat(myDat2)
    myTree=createTree(myMat2,ops=(0,1))
    print('myTree=',myTree)
    myDat2Test=loadDataSet('ex2test.txt')
    myMat2Test=mat(myDat2Test)
    print('prune(myTree,myMat2Test)=',prune(myTree,myMat2Test))
    
#"""

#Model tree
def linearSolve(dataSet):
    """
    Function: format the data set into the feature matrix X and the target vector Y,
    then solve a simple linear regression on them
    """
    m,n = shape(dataSet)
    X = mat(ones((m,n))); Y = mat(ones((m,1)))
    X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]
    xTx = X.T*X
    if linalg.det(xTx) == 0.0:             #X and Y feed a simple linear regression, so the matrix must be invertible
        raise NameError('This matrix is singular, cannot do inverse,\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)                 #standard linear regression (ordinary least squares)
    return ws,X,Y
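#linearSolve() fits ordinary least squares through the normal equations, ws = (X^T X)^(-1) X^T Y,
#after prepending a column of ones to X so that ws[0] is the intercept.
#A tiny check on points that lie exactly on y = 1 + 2x (illustrative data):
"""
if __name__=='__main__':
    demoMat=mat([[0.0,1.0],[1.0,3.0],[2.0,5.0]])
    ws,X,Y=linearSolve(demoMat)
    print('ws=',ws.T)    #approximately [[1. 2.]], i.e. intercept 1 and slope 2
#"""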

def modelLeaf(dataSet):                   #generate a model-tree leaf node when no further split is needed
    ws,X,Y = linearSolve(dataSet)
    return ws #return the regression coefficients of the leaf model

def modelErr(dataSet):                    #error measure used to find the best split
    ws,X,Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y-yHat,2))
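#modelErr() is the sum of squared residuals of the straight line fitted to the subset, so using
#modelLeaf/modelErr in createTree() splits the data into regions that are each well described
#by their own linear model.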

"""
if __name__ == '__main__':
    myDat2=loadDataSet('exp2.txt') 
    #print('myDat2=',myDat2)
    myMat2=mat(myDat2)
    myTree=createTree(myMat2,modelLeaf,modelErr,ops=(1,10))
    print('myTree=',myTree)
    plt.scatter(myMat2[:,0].flatten().A[0],myMat2[:,1].flatten().A[0], c = 'blue')
    
#"""


#Prediction with tree regression

def regTreeEval(model, inDat):
    """
    函数说明:对回归树叶节点进行预测
    """
    return float(model)

def modelTreeEval(model, inDat):
    """
    函数说明:对模型树叶节点进行预测
    """
    ############输入数据格式化处理###########
    n = shape(inDat)[1]
    X = mat(ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    ########################################
    return float(X * model)                #return the predicted value
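#Illustrative check of modelTreeEval() with a hand-made coefficient vector ws=[[1],[2]],
#i.e. the model y = 1 + 2x, evaluated at x = 3:
"""
if __name__=='__main__':
    wsDemo=mat([[1.0],[2.0]])
    print(modelTreeEval(wsDemo,mat([[3.0]])))   #prints 7.0
#"""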
#
def treeForeCast(tree, inData, modelEval=regTreeEval):               #modelEval specifies the leaf type
    """
    Function: walk the tree top-down for a single data point (a row vector) until a leaf
    node is hit, then evaluate that leaf
    Returns: a single predicted value
    """
    if not isTree(tree): return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']:
    #note: this indexing works here because inData has a single column; with several
    #features, inData[0, tree['spInd']] would be needed instead
        if isTree(tree['left']):                          #the left child is a subtree: recurse into it
            return treeForeCast(tree['left'], inData, modelEval)
        else:                                             #the left child is a leaf: evaluate it
            return modelEval(tree['left'], inData)
    else:
        if isTree(tree['right']):                         #the right child is a subtree: recurse into it
            return treeForeCast(tree['right'], inData, modelEval)
        else:                                             #the right child is a leaf: evaluate it
            return modelEval(tree['right'], inData)
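#Illustrative call of treeForeCast() on a hand-built two-leaf regression tree (hypothetical values):
"""
if __name__=='__main__':
    demoTree={'spInd': 0, 'spVal': 0.5, 'left': 3.0, 'right': 1.0}
    print(treeForeCast(demoTree,mat([[0.8]])))   #0.8 > 0.5, so the left leaf 3.0 is returned
    print(treeForeCast(demoTree,mat([[0.2]])))   #0.2 <= 0.5, so the right leaf 1.0 is returned
#"""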
        
#Run the tree model over an entire data set
def createForeCast(tree, testData, modelEval=regTreeEval):
    """
    函数说明:对整个测试集进行预测
    """
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat

"""
if __name__ == '__main__':
    trainMat=mat(loadDataSet('bikeSpeedVsIq_train.txt')) 
    testMat=mat(loadDataSet('bikeSpeedVsIq_test.txt')) 
    ###############################scatter plot of the test data##################################
    fig=plt.figure()
    ax=fig.add_subplot(111)
    ax.scatter(testMat[:,0].A.reshape(1,-1),testMat[:,1].A.reshape(1,-1), c = 'blue')
    plt.xlabel('speed of riding')
    plt.ylabel('IQ')
    plt.show()
    ###############################################################################
    myTree=createTree(trainMat,ops=(1,20))
#    print('myTree=',myTree)
#    myTree= {'spInd': 0, 'spVal': 10.0, 'left': {'spInd': 0, 'spVal': 17.0, 'left': 
#        {'spInd': 0, 'spVal': 20.0, 'left': 168.34161286956524, 'right': 157.0484078846154}, 
#        'right': {'spInd': 0, 'spVal': 14.0, 'left': 141.06067981481482, 
#                  'right': 122.90893026923078}}, 'right': {'spInd': 0, 'spVal': 7.0, 
#                  'left': 94.7066578125, 'right': {'spInd': 0, 'spVal': 5.0, 
#                                                   'left': 69.02117757692308, 
#                                                   'right': 50.94683665}}}
    yHat=createForeCast(myTree,testMat[:,0])
    print('regression tree correlation R^2:',corrcoef(yHat,testMat[:,1],rowvar=0)[0,1])
#   regression tree correlation R^2: 0.9640852318222145
    #corrcoef() computes the correlation R^2 between predictions and actual values; the closer R^2 is to 1, the better
    myTree1=createTree(trainMat,modelLeaf,modelErr,ops=(1,20))
    yHat=createForeCast(myTree1,testMat[:,0],modelTreeEval)
    print('model tree correlation R^2:',corrcoef(yHat,testMat[:,1],rowvar=0)[0,1])
#   model tree correlation R^2: 0.9760412191380623
    ws,X,Y=linearSolve(trainMat)
    print('ws=',ws)
#   ws= [[37.58916794]
#        [ 6.18978355]] 
    for i in range(shape(testMat)[0]):
        yHat[i]=testMat[i,0]*ws[1,0]+ws[0,0]
    print('standard linear regression correlation R^2:',corrcoef(yHat,testMat[:,1],rowvar=0)[0,1])
#   standard linear regression correlation R^2: 0.9434684235674762
#"""




"""
#Build a GUI with Python's tkinter library
from numpy import *
from tkinter import *     #Tk, Label, Entry, Button, Checkbutton, IntVar, END
import regTrees

import matplotlib

matplotlib.use('TkAgg') #select the TkAgg backend
#this hooks TkAgg and matplotlib together
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.figure import Figure

def reDraw(tolS, tolN):
    reDraw.f.clf()  #clear the previous figure
    reDraw.a = reDraw.f.add_subplot(111)#add a fresh subplot
    if chkBtnVar.get():#check whether the Model Tree checkbox is ticked
        if tolN < 2: tolN = 2
        myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf,regTrees.modelErr, (tolS, tolN))
        yHat = regTrees.createForeCast(myTree, reDraw.testDat, regTrees.modelTreeEval)
    else:
        myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN))
        yHat = regTrees.createForeCast(myTree, reDraw.testDat)
    reDraw.a.scatter(reDraw.rawDat[:, 0], reDraw.rawDat[:, 1], s=5)  # plot the actual data points
    reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0)  # plot the predicted values
    reDraw.canvas.show()  #on newer matplotlib versions, use reDraw.canvas.draw() instead


def getInputs():#read the user inputs
    try:#tolN is expected to be an integer
        tolN = int(tolNentry.get())
    except:#on bad input, clear the entry and fall back to the default value
        tolN = 10
        print("enter Integer for tolN")
        tolNentry.delete(0, END)
        tolNentry.insert(0, '10')
    try:#tolS is expected to be a float
        tolS = float(tolSentry.get())
    except:
        tolS = 1.0
        print("enter Float for tolS")
        tolSentry.delete(0, END)
        tolSentry.insert(0, '1.0')
    return tolN, tolS


def drawNewTree():
    tolN, tolS = getInputs()  # read the parameters from the entry boxes
    reDraw(tolS, tolN)  #redraw the plot
    
root = Tk()

reDraw.f = Figure(figsize=(5, 4), dpi=100)  # create the figure that backs the canvas
reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)
reDraw.canvas.show()   #on newer matplotlib versions, use reDraw.canvas.draw() instead
reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3)

Label(root, text="tolN").grid(row=1, column=0)
tolNentry = Entry(root)
tolNentry.grid(row=1, column=1)
tolNentry.insert(0, '10')
Label(root, text="tolS").grid(row=2, column=0)
tolSentry = Entry(root)
tolSentry.grid(row=2, column=1)
tolSentry.insert(0, '1.0')
Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3)
chkBtnVar = IntVar()
chkBtn = Checkbutton(root, text="Model Tree", variable=chkBtnVar)
chkBtn.grid(row=3, column=0, columnspan=2)

reDraw.rawDat = mat(regTrees.loadDataSet('sine.txt'))
reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01)
reDraw(1.0, 10)

root.mainloop()


"""


