《机器学习实战》读后备忘

最近在看《机器学习实战》第9章,CART算法用于回归,在python2.7版本下跑,发现代码报错,所以勘误了下。下面只是回归树的部分勘误代码:

class treeNode():
    """A node of the CART regression tree.

    Stores the split feature index, the split threshold, and the two
    child subtrees.
    """
    def __init__(self, feat, val, right, left):
        # Bug fix: the original assigned to bare local names (no `self.`),
        # so the attributes were discarded and never stored on the instance.
        self.featureToSplitOn = feat  # index of the feature used for the split
        self.valueOfSplit = val       # threshold value the feature is split on
        self.rightBranch = right      # right subtree (original comment had left/right swapped)
        self.leftBranch = left        # left subtree

from numpy import *
def loadDataSet(fileName):
    """Load a tab-delimited text file into a list of rows of floats.

    Each line becomes one list; every field is converted with float(),
    so the target value in the last column is parsed like any feature.
    """
    dataMat = []
    # `with` guarantees the file handle is closed; the original leaked it.
    with open(fileName) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            # Bug fix: under Python 3, map() returns a lazy iterator, so the
            # original appended a map object instead of a list of floats.
            # list(map(...)) is byte-identical in behavior under Python 2.7.
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat
    
def binSplitDataSet(dataSet, feature, value):
    """Partition `dataSet` into two matrices on a single feature.

    Rows whose `feature` column is strictly greater than `value` go into
    the first matrix; the rest (<= value) go into the second.
    NOTE(review): if `value` equals the column maximum the first matrix
    is empty — the boundary case the original comment asked about.
    """
    greaterRows = nonzero(dataSet[:, feature] > value)[0]
    lesserRows = nonzero(dataSet[:, feature] <= value)[0]
    return dataSet[greaterRows, :], dataSet[lesserRows, :]

#testMat = mat(eye(4))
#print testMat
#mat0,mat1 = binSplitDataSet(testMat, 1,0.5)
#print 'first: ', mat0
#print 'second',mat1

def regLeaf(dataSet):
    """Leaf-value generator: mean of the target (last) column."""
    return dataSet[:, -1].mean()
    
def regErr(dataSet):
    """Total squared error of the target column.

    Population variance of the last column times the row count, i.e. the
    sum of squared deviations — a measure of how scattered the set is.
    """
    rowCount = shape(dataSet)[0]
    return dataSet[:, -1].var() * rowCount
    
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    tolS = ops[0]#数据集的足够有序不用分割程度
    tolN = ops[1]#数据集的足够小不用分割程度
    if len(set(dataSet[:,-1].T.tolist()[0]))==1:#数据集的标签值一样,没有必要分割
        return None, leafType(dataSet)#分割下标为空,返回叶子节点
    m,n =shape(dataSet)#数据集大小
    S = errType(dataSet)#数据集混乱度
    bestS = inf#最佳混乱度
    bestIndex = 0#最佳分割特征
    bestValue = 0#最佳分割值,分割值是有限个的,后面代码将看到。这些值是从数据集来的
    for featIndex in range(n-1):#将去一是为了排除标签,获取特征个数
        uniqeFeatVals = set(dataSet[:,featIndex].T.tolist()[0])
        minFeatVal = min(uniqeFeatVals)
        maxFeatVal = max(uniqeFeatVals)
        for splitVal in uniqeFeatVals:#特征确定了,遍历它可能的所有值,如何对处于边界的特征值进行分割?
            #如果是边界特征值,不让他去分割!
            if (splitVal != minFeatVal) and (splitVal != maxFeatVal):
                mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
                if (shape(mat0)[0]

你可能感兴趣的:(工作小问题)