机器学习实战:使用AdaBoost预测病马死亡率

import numpy as np

"""
函数说明:加载数据集
Parameters:
    filename - 文件名
Returns:
    dataMat - 数据集
    labelMat - 标签
"""
def loadDataSet(filename):
    """Load a tab-separated data file into feature rows and labels.

    Each non-blank line is split on tabs. The column count is taken from
    the first non-blank line; the first (column count - 1) fields of every
    line become the feature row and the last field becomes the label.
    Blank lines (for example a trailing newline at the end of the file)
    are skipped instead of raising.

    Parameters:
        filename - path of the tab-separated text file
    Returns:
        dataMat - list of feature rows, each a list of floats
        labelMat - list of float labels, one per feature row
    Raises:
        ValueError - if a field cannot be converted to float
        IndexError - if a line has fewer fields than the first line
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # A single context-managed handle: the file is read once and always closed.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                continue
            curline = stripped.split('\t')
            if numFeat is None:
                numFeat = len(curline)
            dataMat.append([float(curline[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curline[-1]))
    return dataMat, labelMat


"""
函数说明:单层决策树分类函数
Parameters:
    dataMatrix - 数据矩阵
    dimen - 第几个特征
    threshVal - 阈值
    threshIneq - 不等号标志('lt' 或 'gt')
Returns:
    retArray - 分类结果
"""
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
    """Classify samples with a single-feature threshold (decision stump).

    Parameters:
        dataMatrix - data matrix, one sample per row
        dimen - index of the feature column to compare
        threshVal - threshold the feature values are compared against
        threshIneq - 'lt' marks values <= threshVal as -1.0; any other
                     value marks values > threshVal as -1.0
    Returns:
        retArray - (number of rows, 1) array holding 1.0 or -1.0 per sample
    """
    column = dataMatrix[:, dimen]
    # Select the samples that fall on the "negative" side of the threshold.
    negative = (column <= threshVal) if threshIneq == 'lt' else (column > threshVal)
    retArray = np.ones((np.shape(dataMatrix)[0], 1))
    retArray[negative] = -1.0
    return retArray


"""
函数说明:找到数据集上最佳的单层决策树
Parameters:
    dataArr - 数据矩阵
    classLabels - 数据标签
    D - 样本权重
Returns:
    bestStump - 最佳单层决策树信息
    minError - 最小误差
    bestClasEst - 最佳的分类结果
"""
def buildStump(dataArr,classLabels,D,numSteps=10.0):
    """Find the decision stump with the lowest weighted error on the data set.

    For every feature column, candidate thresholds run from one step below
    the column minimum up to the column maximum, and both inequality
    directions ('lt' and 'gt') are tried for each threshold. The candidate
    with the smallest weighted error D.T * errArr is kept.

    Parameters:
        dataArr - data matrix, one sample per row
        classLabels - class labels, one per sample
        D - sample weights as an (m, 1) column vector
        numSteps - number of steps the value range of each feature is
                   divided into (default 10.0)
    Returns:
        bestStump - dict with keys 'dim', 'thresh' and 'ineq' describing
                    the best stump
        minError - weighted error of the best stump (the 1x1 result of
                   D.T * errArr; float('inf') if no candidate was accepted)
        bestClasEst - predictions of the best stump for every sample
    Raises:
        ValueError - if numSteps is not positive
    """
    if numSteps <= 0:
        raise ValueError("numSteps must be positive")
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    m, n = np.shape(dataMatrix)
    bestStump = {}
    bestClasEst = np.asmatrix(np.zeros((m, 1)))
    minError = float('inf')  # start at +infinity so the first candidate wins
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j = -1 puts the first threshold one step below the column minimum.
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 where the prediction is wrong, 0 where it is right.
                errArr = np.asmatrix(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst


"""
函数说明:Adaboost算法
Parameters:dataArr, classLabels, numIt = 40
returns:
    weakClassArr - 训练好的分类器 
    aggClassEst - 类别估计累计值
"""
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round asks buildStump for the best stump under the current sample
    weights D, computes alpha from that stump's weighted error, re-weights
    the samples, and adds the alpha-weighted predictions to the running
    estimate. Training stops early once the sign of the running estimate
    matches every label.

    Parameters:
        dataArr - data matrix, one sample per row
        classLabels - class labels, one per sample
        numIt - maximum number of boosting rounds (default 40)
    Returns:
        weakClassArr - list of stump dicts as returned by buildStump, each
                       extended with an 'alpha' entry
        aggClassEst - (m, 1) matrix of accumulated alpha-weighted estimates
    """
    weakClassArr = []
    m = np.shape(dataArr)[0]
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    labelMat = np.asmatrix(classLabels).T
    D = np.asmatrix(np.ones((m, 1)) / m)  # start with uniform sample weights
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, clasEst = buildStump(dataArr, classLabels, D)
        # buildStump may hand back a 1x1 matrix; take the scalar explicitly
        # because float() on arrays with ndim > 0 is deprecated in NumPy.
        error = np.asarray(error).item()
        # max(..., 1e-16) avoids dividing by zero when the stump is perfect.
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # exp(-alpha * label * prediction), then normalise so D sums to 1.
        expon = np.multiply(-1 * alpha * labelMat, clasEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * clasEst
        errorRate = (np.sign(aggClassEst) != labelMat).sum() / m
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst


"""
函数说明:AdaBoost分类函数
Parameters:
    daToClass - 待分类样例
    classifierArr - 训练好的分类器
Returns:
    分类结果
"""
def adaClassify(daToClass,classifierArr):
    """Classify samples with a trained list of weighted decision stumps.

    Parameters:
        daToClass - samples to classify, one per row
        classifierArr - list of stump dicts, each with the keys 'dim',
                        'thresh', 'ineq' and 'alpha'
    Returns:
        (m, 1) matrix with the sign of the alpha-weighted sum of the stump
        predictions for every sample
    """
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    datMatrix = np.asmatrix(daToClass)
    m = np.shape(datMatrix)[0]
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for classifier in classifierArr:
        classEst = stumpClassify(datMatrix, classifier['dim'],
                                 classifier['thresh'], classifier['ineq'])
        aggClassEst += classifier['alpha'] * classEst
    return np.sign(aggClassEst)

if __name__ == '__main__':
    # Train on the training file, then report the error rate on both the
    # training file and the test file.
    dataArr,labelArr = loadDataSet('horseColicTraining2.txt')
    weakClassArr,aggClassEst = adaBoostTrainDS(dataArr,labelArr)
    testArr,testLabelArr = loadDataSet('horseColicTest2.txt')

    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    predictions = adaClassify(dataArr,weakClassArr)
    # Error rate = number of mismatching predictions / number of samples.
    errRate = float((predictions != np.asmatrix(labelArr).T).sum()) / len(dataArr)
    print("训练集的错误率:%.3f%%"% (errRate*100))

    predictions = adaClassify(testArr, weakClassArr)
    errRate = float((predictions != np.asmatrix(testLabelArr).T).sum()) / len(testArr)
    print("测试集的错误率:%.3f%%" % (errRate * 100))

训练集

测试集

你可能感兴趣的:(机器学习实战,机器学习,python)