# AdaBoost classification using single-split decision-tree (decision stump) weak learners — source code

#coding:utf-8
'''结合ADABOOST和DECISION STUMP的算法实现分类'''
from numpy import *

def loadDataMat(filename):
    """Load a tab-separated data file.

    Each line holds the feature values followed by the class label in the
    last column.

    filename -- path to the tab-separated text file
    Returns (feature matrix as numpy.matrix, list of float labels).
    """
    data = []
    label = []
    # 'with' guarantees the file is closed even if parsing raises
    # (the original left the handle open).
    with open(filename) as fr:
        for line in fr:
            lineArr = line.strip().split('\t')
            data.append([float(v) for v in lineArr[:-1]])
            label.append(float(lineArr[-1]))
    return mat(data), label
# Load the horse-colic training set from a hard-coded Windows path.
# NOTE(review): this runs at import time and crashes when the file is
# missing — consider moving it under an `if __name__ == '__main__':` guard.
dataMat,classLabels=loadDataMat('E:\\horseColicTraining2.txt')

def autoNorm(dataMat):
    """Min-max scale every column of dataMat into the range [0, 1].

    dataMat -- numpy matrix of shape (m, n)
    Returns a new matrix of the same shape with each feature rescaled
    by (x - column_min) / (column_max - column_min).
    """
    colMin = dataMat.min(0)
    colMax = dataMat.max(0)
    span = colMax - colMin
    nRows = dataMat.shape[0]
    shifted = dataMat - tile(colMin, (nRows, 1))
    return shifted / tile(span, (nRows, 1))

# Rescale every feature of the training data into [0, 1] before boosting.
dataMat=autoNorm(dataMat)

def loadSimpData():
    """Return the tiny five-sample 2-D toy data set used for smoke tests.

    Returns (5x2 feature matrix, 5x1 column matrix of +/-1 labels).
    """
    points = [[1., 2.1],
              [2., 1.1],
              [1.3, 1.],
              [1., 1.],
              [2, 1]]
    labels = [1, 1, -1, -1, 1]
    return mat(points), mat(labels).T
#dataMat,classLabels=loadSimpData()

#开始构建单层决策树生成函数
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample by thresholding one feature (a decision stump).

    dataMatrix -- (m, n) feature matrix
    dimen      -- index of the feature (column) to threshold
    threshVal  -- threshold value
    threshIneq -- 'lt' labels values <= threshold as -1, anything else
                  labels values > threshold as -1
    Returns an (m, 1) array of +1/-1 predictions.
    """
    predictions = ones((shape(dataMatrix)[0], 1))
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        predictions[column <= threshVal] = -1.0
    else:
        predictions[column > threshVal] = -1.0
    return predictions
def build(dataMat, classLabels, D):
    """Find the best decision stump for the current sample weights D.

    Scans every feature, 10 threshold steps per feature, and both
    inequality directions, keeping the stump with the lowest weighted
    classification error.

    dataMat     -- (m, n) feature matrix
    classLabels -- length-m sequence of +/-1 labels
    D           -- (m, 1) matrix of sample weights (should sum to 1)
    Returns (bestStump dict with 'dim'/'thresh'/'ineq', minError,
             (m, 1) predictions of the best stump).

    NOTE(review): the original source was corrupted here — the
    `if weightedError < minError:` comparison and the minError /
    bestClassEst updates were garbled into one line; reconstructed below.
    """
    numSteps = 10
    m, n = dataMat.shape
    labelMat = mat(classLabels).T
    bestStump = {}                       # best stump found so far
    bestClassEst = mat(zeros((m, 1)))    # its predictions
    minError = inf
    for i in range(n):
        rangeMin = dataMat[:, i].min()
        rangeMax = dataMat[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j = -1 puts one threshold just below the minimum value.
        for j in range(-1, numSteps):
            for inequal in ['lt', 'gt']:
                threshVal = rangeMin + float(j) * stepSize
                predict = stumpClassify(dataMat, i, threshVal, inequal)
                errArr = mat(ones((m, 1)))
                errArr[predict == labelMat] = 0
                weightedError = sum(D.T * errArr)   # weight of misclassified samples
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predict.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst

#基于单层决策树的ADABOOST训练过程
def adaBoostTrains(dataMat, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    dataMat     -- (m, n) feature matrix
    classLabels -- length-m sequence of +/-1 labels
    numIt       -- maximum number of boosting rounds (default 40)
    Returns the list of weak classifiers, each a stump dict carrying
    its voting weight under the 'alpha' key.
    """
    weakClassArr = []                    # all accepted weak classifiers
    m = shape(dataMat)[0]
    D = mat(ones((m, 1)) / m)            # uniform initial sample weights
    aggClassEst = mat(zeros((m, 1)))     # running weighted ensemble score
    labelMat = mat(classLabels).T
    for i in range(numIt):
        bestStump, error, classEst = build(dataMat, classLabels, D)
        # Classifier vote weight; max(error, 1e-16) avoids division by zero
        # when the stump is perfect.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Reweight samples: shrink weights of correct predictions, grow
        # weights of mistakes, then renormalize so D sums to 1.
        # (BUG FIX: the original divided by the *old* D.sum(), so D was
        # never properly renormalized after the multiplicative update.)
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        aggErrors = multiply(ones((m, 1)), sign(aggClassEst) != labelMat)
        erroRate = aggErrors.sum() / m
        print("total rate:", erroRate, '\n')  # was a py2 print statement
        if erroRate == 0.0:
            break                        # training error hit zero; stop early
    return weakClassArr

# Train the final ensemble; with 1000 boosting rounds the training error
# rate settles around 0.14 on the horse-colic data.
# (BUG FIX: the original used a C-style '//' trailing comment, which Python
# parses as floor division followed by garbage — a SyntaxError.)
classify = adaBoostTrains(dataMat, classLabels, 1000)

# (Blog footer) You may also be interested in: AdaBoost classification with decision-stump weak learners (source code)