adaboost算法实现

代码算法的框架
第一步：将数据内容与标签分开
第二步：通过阈值对数据内容进行分类（对同一个阈值有两种划分方法：把小于等于阈值的样本归为-1、其余归为+1，或者把大于阈值的样本归为-1、其余归为+1；我们比较这两种方法，选择错误率较低的一种）
第三步：找到数据集上最佳的单层决策树（就是将分类结果最好的，对应那组数据返回出来）
第四步：使用AdaBoost算法提升弱分类器性能（训练函数）
第五步：AdaBoost分类函数
第六步：画roc曲线
第七步：主函数

第一步将数据内容与标签分开

def adaptive_load_data(filename):
    """Load a tab-separated data file and split features from labels.

    The number of columns is taken from the first non-blank line. For every
    non-blank line the first ``numFeat - 1`` fields are parsed as float
    features and the last field is parsed as the float label.

    :param filename: path of the tab-separated text file
    :return: ``(dataMat, labelMat)`` where ``dataMat`` is a list of feature
        rows (lists of floats) and ``labelMat`` is a list of float labels
    :raises ValueError: if a field cannot be converted to ``float``
    :raises IndexError: if a line has fewer fields than the first line
    """
    dataMat = []   # feature rows
    labelMat = []  # class labels
    numFeat = None
    # A single pass under ``with`` so the file handle is always closed.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing on float('').
                continue
            curLine = stripped.split('\t')
            if numFeat is None:
                # Column count is fixed by the first data line.
                numFeat = len(curLine)
            dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

这里返回数据内容和对应的标签

第二步通过阈值对数据内容进行分类

def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify samples with a single-feature threshold test (decision stump).

    Every sample starts as +1.0. With ``threshIneq == 'lt'`` the samples whose
    feature value is ``<= threshVal`` are set to -1.0; with any other value of
    ``threshIneq`` the samples whose feature value is ``> threshVal`` are set
    to -1.0.

    :param dataMatrix: 2-D data (rows are samples, columns are features)
    :param dimen: index of the feature column to compare
    :param threshVal: threshold value
    :param threshIneq: comparison direction, ``'lt'`` or anything else
    :return: array of shape ``(number_of_rows, 1)`` holding +1.0 / -1.0
    """
    column = dataMatrix[:, dimen]
    # Select which side of the threshold becomes the negative class.
    negativeSide = column <= threshVal if threshIneq == 'lt' else column > threshVal
    predictions = np.ones((np.shape(dataMatrix)[0], 1))
    predictions[negativeSide] = -1.0
    return predictions

返回出来的retArray是通过划分阈值后的一个分类结果

第三步找到数据集上最佳的单层决策树


def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error on the data.

    For every feature column, thresholds are swept from one step below the
    column minimum up to the column maximum in ``numSteps`` equal steps, and
    both comparison directions (``'lt'`` and ``'gt'``) are tried. The stump
    with the smallest weighted error ``D.T * errArr`` is kept.

    :param dataArr: feature data, rows are samples
    :param classLabels: sequence of class labels, one per sample
    :param D: sample weights; used as ``D.T * errArr`` with an ``(m, 1)``
        error column, so an ``(m, 1)`` matrix is expected
    :return: ``(bestStump, minError, bestClasEst)`` where ``bestStump`` is a
        dict with keys ``'dim'``, ``'thresh'`` and ``'ineq'``, ``minError`` is
        the smallest weighted error (a 1x1 matrix, or ``np.inf`` if no split
        was evaluated) and ``bestClasEst`` holds that stump's predictions
    """
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    m, n = np.shape(dataMatrix)
    numSteps = 10
    bestStump = {}
    bestClasEst = np.asmatrix(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):  # i is the feature (column) index
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, numSteps + 1):
            # The threshold depends only on j, so compute it once for both
            # comparison directions.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = np.asmatrix(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                # Extract the single element explicitly; implicit conversion
                # of a 1x1 matrix to a scalar is deprecated in NumPy.
                print("split:dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f"
                      % (i, threshVal, inequal, weightedError[0, 0]))
                if weightedError < minError:
                    minError = weightedError
                    # Copy so the stored best estimate is not shared with
                    # the loop variable.
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    print("bestStump:")
    return bestStump, minError, bestClasEst

返回的bestStump是最佳结果中对应的参数（比如第几维度、阈值为多少、是大于阈值部分为+1还是小于阈值部分为+1），minError是最小加权误差，bestClasEst是最小误差对应的分类结果。

第四步使用AdaBoost算法提升弱分类器性能（训练函数）

def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round picks the best stump for the current sample weights via
    ``buildStump``, computes its weight ``alpha``, re-weights the samples and
    adds the stump's weighted vote to the running aggregate. Training stops
    early once the aggregate classifies every training sample correctly.

    :param dataArr: feature data (without labels), rows are samples
    :param classLabels: sequence of class labels, one per sample
    :param numIt: maximum number of boosting rounds
    :return: ``(weakClassArr, aggClassEst)`` where ``weakClassArr`` is the
        list of stump dicts (each extended with an ``'alpha'`` key) and
        ``aggClassEst`` is the ``(m, 1)`` matrix of alpha-weighted vote sums
    """
    weakClassArr = []
    m = np.shape(dataArr)[0]
    # Labels as an (m, 1) float column; built once and reused every round.
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    labelMat = np.asmatrix(list(map(float, classLabels))).T
    # m rows, 1 column: every sample starts with weight 1/m.
    D = np.asmatrix(np.ones((m, 1)) / m)
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands back the error as a 1x1 matrix; take the element
        # explicitly rather than relying on implicit scalar conversion.
        error = float(np.asarray(error).reshape(-1)[0])
        # Stump weight; the max() guards against division by zero.
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Weight update: D <- D * exp(-alpha * label * prediction), normalised.
        expon = np.multiply(-1 * alpha * labelMat, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        # Fraction of samples whose aggregate sign disagrees with the label.
        errorRate = (np.sign(aggClassEst) != labelMat).sum() / m
        print('errorRate:', errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst

通过adaboost算法训练分类器：aggClassEst是各弱分类器输出按alpha加权后的累加值（取符号后即为分类结果），weakClassArr保存每一轮得到的最佳单层决策树参数及其权重alpha。

第五步adaboost分类函数

def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained list of weighted decision stumps.

    :param datToClass: feature data to classify, rows are samples
    :param classifierArr: list of stump dicts with keys ``'dim'``,
        ``'thresh'``, ``'ineq'`` and ``'alpha'``
    :return: ``(m, 1)`` matrix with the sign of the alpha-weighted vote sum
        for each sample
    """
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    dataMatrix = np.asmatrix(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'], stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
        print('aggClassEst:', aggClassEst)
    return np.sign(aggClassEst)

通过调用训练函数训练出来的最优参数进行分类

第六步画roc曲线

def plotROC(predStrengths, classLabels):
    """Plot a ROC curve and print the area under it.

    The curve is drawn from (1, 1) towards the origin, visiting samples in
    ascending order of predicted strength: a sample labelled 1.0 moves the
    cursor down by one y-step, any other sample moves it left by one x-step.
    The cursor height is accumulated on every x-step to obtain the AUC.

    :param predStrengths: prediction strengths; ``argsort().tolist()[0]`` must
        yield the sample indices (e.g. a 1 x m matrix)
    :param classLabels: sequence of class labels, positives are ``1.0``
    """
    import matplotlib.pyplot as plt

    positiveCount = sum(np.array(classLabels) == 1.0)  # number of positives
    yStep = 1 / float(positiveCount)
    xStep = 1 / float(len(classLabels) - positiveCount)
    order = predStrengths.argsort()

    figure = plt.figure()
    figure.clf()  # clear the axes of the figure while keeping the window
    axes = plt.subplot(111)

    x, y = 1.0, 1.0   # current cursor position
    heightSum = 0.0   # sum of curve heights, used for the AUC
    for idx in order.tolist()[0]:
        if classLabels[idx] == 1.0:
            dx, dy = 0, yStep
        else:
            dx, dy = xStep, 0
            heightSum += y
        axes.plot([x, x - dx], [y, y - dy], c='b')
        x, y = x - dx, y - dy

    axes.plot([0, 1], [0, 1], 'b--')  # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    axes.axis([0, 1, 0, 1])
    plt.show()
    print('the Area Under the Curve is:', heightSum * xStep)
第七步主函数

if __name__ == "__main__":
    # Train on the training set and report the training error.
    datArr, labelArr = adaptive_load_data('horseColicTraining2.txt')
    classifierArr, aggClassEst = adaBoostTrainDS(datArr, labelArr)
    predictions = adaClassify(datArr, classifierArr)
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    trainErrors = (predictions != np.asmatrix(labelArr).T).sum()
    print('训练集的错误率:%.3f%%' % float(trainErrors / len(datArr) * 100))
    plotROC(aggClassEst.T, labelArr)

    # Evaluate the classifier trained above on the held-out test set.
    # Re-training on the test data would only report a second training error.
    testArr, testLabelArr = adaptive_load_data('horseColicTest2.txt')
    testPredictions = adaClassify(testArr, classifierArr)
    testErrors = (testPredictions != np.asmatrix(testLabelArr).T).sum()
    print('测试集的错误率:%.3f%%' % float(testErrors / len(testArr) * 100))

    # adaClassify returns only the signs, so rebuild the alpha-weighted vote
    # sums that plotROC needs as prediction strengths.
    testMatrix = np.asmatrix(testArr)
    testAggClassEst = np.asmatrix(np.zeros((len(testArr), 1)))
    for stump in classifierArr:
        testAggClassEst += stump['alpha'] * stumpClassify(
            testMatrix, stump['dim'], stump['thresh'], stump['ineq'])
    plotROC(testAggClassEst.T, testLabelArr)

参考：https://blog.csdn.net/qq_22169787/article/details/81413207