自己造轮子-AdaBoost-DS

自己造轮子系列今天造的是AdaBoost，基分类器用的是DS（decision stump，决策树桩）。之所以写这个系列，一方面是为了锻炼coding能力，另一方面也有助于加深对算法的理解——毕竟从懂得推导、理解含义，再到亲手实现，感受是不一样的。

from numpy import *

# Decision stump: a one-feature, one-threshold weak classifier.
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Label every row +1.0 or -1.0 by thresholding a single feature column.

    Args:
        dataMatrix: 2-D sample matrix, one sample per row.
        dimen: index of the feature column to threshold.
        threshVal: threshold the column values are compared against.
        threshIneq: 'lt' marks rows with value <= threshVal as -1.0; any
            other value marks rows with value > threshVal as -1.0.

    Returns:
        An (m, 1) float array holding -1.0 for the marked rows and 1.0 for
        all remaining rows, where m is the number of rows in dataMatrix.
    """
    column = dataMatrix[:, dimen]
    negativeMask = column <= threshVal if threshIneq == 'lt' else column > threshVal
    # Start from all +1.0 and flip only the rows selected by the mask.
    predictions = ones((shape(dataMatrix)[0], 1))
    predictions[negativeMask] = -1.0
    return predictions

def buildStump(dataArr, classLabels, D, numSteps=10.0):
    """Search for the decision stump with the lowest weighted error.

    Three nested loops try every feature column, every threshold on an evenly
    spaced grid over that column's [min, max] range (starting one step below
    the minimum), and both inequality directions ('lt' and 'gt').

    Args:
        dataArr: 2-D array-like of samples, one sample per row.
        classLabels: flat sequence with one label per sample. A sample counts
            as correctly classified only when its label equals the stump's
            prediction, which is 1.0 or -1.0.
        D: column of per-sample weights; taken as a parameter so AdaBoost can
            re-weight the samples between rounds.
        numSteps: number of threshold steps across each feature's range.
            Defaults to 10.0, the value that used to be hard-coded.

    Returns:
        A tuple (bestStump, minError, bestClasEst). bestStump is a dict with
        the keys 'dim', 'thresh' and 'ineq'; minError is the weighted error
        D.T * errArr of that stump (a 1x1 matrix when D is a matrix);
        bestClasEst is the (m, 1) column of that stump's predictions.

    Raises:
        ValueError: if numSteps is not positive.
    """
    if numSteps <= 0:
        raise ValueError("numSteps must be positive")
    # asmatrix is the function the removed NumPy alias `mat` pointed to.
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T
    m, n = shape(dataMatrix)
    bestStump = {}
    bestClasEst = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality direction, so
            # compute it once per step.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):
                predictedVal = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = asmatrix(ones((m, 1)))
                errArr[predictedVal == labelMat] = 0
                weightedError = D.T * errArr
                # Strict comparison keeps the first stump found on ties.
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVal.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst

# AdaBoost training loop: fits up to numIt decision stumps (50 by default)
# and stops early once the combined classifier's training error reaches zero.
def adaBoostTrainDS(dataArr, classLabels, numIt=50):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: 2-D array-like of training samples, one sample per row.
        classLabels: flat sequence with one label per sample; labels are
            compared against the sign of the weighted vote, so they are
            expected to be 1.0 or -1.0.
        numIt: maximum number of boosting rounds.

    Returns:
        A list of stump dicts in training order. Each dict is the stump
        returned by buildStump with an added 'alpha' key holding that stump's
        vote weight as a float.
    """
    weakClassArr = []  # trained weak classifiers, in training order
    m = shape(dataArr)[0]
    # Build the label column once instead of on every use inside the loop.
    labelMat = asmatrix(classLabels).T
    D = asmatrix(ones((m, 1)) / m)  # start from uniform sample weights
    aggClassEst = asmatrix(zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands the error back as a 1x1 matrix; unwrap it to a
        # plain scalar so the arithmetic below does not rely on implicit
        # matrix-to-float conversion or on which `max` is in scope.
        error = asarray(error).item()
        # Clamp the error away from zero so a perfect stump does not cause a
        # division by zero.
        alpha = float(0.5 * log((1.0 - error) / (error if error > 1e-16 else 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Re-weight: the exponent is -alpha where label and prediction agree
        # and +alpha where they differ; then renormalise so D sums to 1.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        # Training error of the ensemble built so far.
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        if errorRate == 0.0:
            break
    return weakClassArr

# The boosted classifier: sign(sum of alpha * weak-classifier prediction).
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained list of weighted decision stumps.

    Args:
        datToClass: 2-D array-like of samples, one sample per row.
        classifierArr: sequence of stump dicts, each providing the keys
            'dim', 'thresh', 'ineq' and 'alpha'.

    Returns:
        An (m, 1) matrix holding the sign of the alpha-weighted sum of the
        stump predictions for each of the m samples.
    """
    # asmatrix is the function the removed NumPy alias `mat` pointed to.
    dataMatrix = asmatrix(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = asmatrix(zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'],
                                 stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
    return sign(aggClassEst)

你可能感兴趣的:(自己造轮子-AdaBoost-DS)