"""Classification with AdaBoost built on decision-stump weak learners."""
from numpy import *
def loadDataMat(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is stripped and split on tabs. All fields except the
    last are parsed as float features; the last field is parsed as the float
    label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(data, label)``: ``data`` is a numpy matrix with one row per
        record, ``label`` is a plain list of floats in the same order.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to float.
    """
    data = []
    label = []
    # The context manager closes the handle even when parsing raises.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) holds no record.
                continue
            fields = stripped.split('\t')
            data.append([float(value) for value in fields[:-1]])
            label.append(float(fields[-1]))
    # ``asmatrix`` is the function the ``mat`` alias pointed to; the alias
    # itself is not available in every NumPy release.
    return asmatrix(data), label
# Runs at module level: read the training features and labels from a
# hard-coded path on drive E: into the globals used by the training call below.
dataMat,classLabels=loadDataMat('E:\\horseColicTraining2.txt')
def autoNorm(dataMat):
    """Min-max scale every column of ``dataMat`` into the range [0, 1].

    Args:
        dataMat: 2-D numpy matrix (or array) with one sample per row.

    Returns:
        An object of the same shape holding
        ``(value - column_min) / (column_max - column_min)``.
        A column whose values are all equal is returned as zeros.
    """
    minVals = dataMat.min(0)
    ranges = dataMat.max(0) - minVals
    # A constant column has zero range; dividing by 1 instead maps it to 0
    # rather than producing nan/inf.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting stretches the per-column rows over all samples, so no
    # explicit tiling is needed.
    return (dataMat - minVals) / safeRanges
# Rescale the loaded feature columns with autoNorm before training.
dataMat=autoNorm(dataMat)
def loadSimpData():
    """Return a small hard-coded data set for quick experiments.

    Returns:
        A tuple ``(dataMat, labels)``: ``dataMat`` is a 5 x 2 float matrix of
        samples and ``labels`` is a 5 x 1 column matrix of +1 / -1 labels.
    """
    # ``asmatrix`` is used instead of the ``mat`` alias, which newer NumPy
    # releases no longer export.
    dataMat = asmatrix([[1., 2.1], [2., 1.1], [1.3, 1.], [1., 1.], [2, 1]])
    classLabels = [1, 1, -1, -1, 1]
    return dataMat, asmatrix(classLabels).T
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Label every sample +1 or -1 by thresholding one feature column.

    Args:
        dataMatrix: 2-D sample matrix, one sample per row.
        dimen: Index of the column to compare against the threshold.
        threshVal: Threshold value.
        threshIneq: ``'lt'`` marks values ``<= threshVal`` as -1; any other
            value marks values ``> threshVal`` as -1.

    Returns:
        An (m, 1) float array of +1.0 / -1.0 predictions.
    """
    column = dataMatrix[:, dimen]
    # Pick which side of the threshold is the negative class.
    negative = column <= threshVal if threshIneq == 'lt' else column > threshVal
    predictions = ones((shape(dataMatrix)[0], 1))
    predictions[negative] = -1.0
    return predictions
def build(dataMat, classLabels, D, step=10):
    """Find the decision stump with the lowest weighted error under ``D``.

    For each feature column the threshold is swept from one increment below
    the column minimum up to one increment below the column maximum, and both
    inequality directions (``'lt'`` and ``'gt'``) are evaluated with
    ``stumpClassify``.

    Args:
        dataMat: m x n sample matrix.
        classLabels: The m class labels, as a flat sequence or as an
            m x 1 / 1 x m matrix.
        D: m x 1 column of sample weights.
        step: Number of increments the column range is divided into
            (default 10, the value previously hard-coded).

    Returns:
        A tuple ``(bestStump, minError, bestClassEst)``: ``bestStump`` is a
        dict with keys ``'dim'``, ``'thresh'`` and ``'ineq'``, ``minError`` is
        the weighted error of that stump, and ``bestClassEst`` is its m x 1
        prediction column.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be at least 1")
    m, n = dataMat.shape
    # Force a column vector so both flat lists and column matrices compare
    # element-wise against the (m, 1) predictions.
    labels = asmatrix(classLabels).reshape(-1, 1)
    bestStump = {}
    bestClassEst = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):
        rangeMin = dataMat[:, i].min()
        rangeMax = dataMat[:, i].max()
        stepSize = (rangeMax - rangeMin) / float(step)
        # j == -1 places the threshold below every value, which covers the
        # two constant classifiers.
        for j in range(-1, step):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predict = stumpClassify(dataMat, i, threshVal, inequal)
                # 1 where the stump is wrong, 0 where it is right.
                errArr = asmatrix(ones((m, 1)))
                errArr[predict == labels] = 0
                # D.T * errArr is a 1 x 1 matrix product; pull out the scalar
                # instead of relying on which ``sum`` is in scope.
                weightedError = float((D.T * errArr)[0, 0])
                if weightedError < minError:
                    # New best stump: remember its error, predictions and
                    # parameters.
                    minError = weightedError
                    bestClassEst = predict.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst
def adaBoostTrains(dataMat, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round picks the best stump for the current sample weights via
    ``build``, computes its vote weight ``alpha`` from its weighted error,
    re-weights the samples, and stops early once the combined classifier
    makes no training errors.

    Args:
        dataMat: m x n sample matrix.
        classLabels: The m class labels (+1 / -1), as a flat sequence or as an
            m x 1 / 1 x m matrix.
        numIt: Maximum number of boosting rounds.

    Returns:
        A list of stump dicts as produced by ``build``, each with an added
        ``'alpha'`` entry, in the order they were trained.
    """
    weakClassArr = []
    m = shape(dataMat)[0]
    # Column form of the labels, computed once for the whole training run.
    labels = asmatrix(classLabels).reshape(-1, 1)
    D = asmatrix(ones((m, 1)) / m)
    aggClassEst = asmatrix(zeros((m, 1)))
    for i in range(numIt):
        # Labels are handed over as a single row, which build turns back
        # into a column.
        bestStump, error, classEst = build(dataMat, labels.T, D)
        # Floor the error to avoid dividing by zero for a perfect stump.
        # Written as a conditional so it does not depend on whether the
        # wildcard numpy import shadows the builtin ``max``.
        safeError = error if error > 1e-16 else 1e-16
        alpha = float(0.5 * log((1.0 - error) / safeError))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Misclassified samples gain weight, correct ones lose weight.
        D = multiply(D, exp(multiply(-1 * alpha * labels, classEst)))
        # Normalise by the sum of the *updated* weights so D stays a
        # distribution; dividing by the old sum would leave it unnormalised.
        D = D / D.sum()
        aggClassEst += classEst * alpha
        aggErrors = multiply(ones((m, 1)), sign(aggClassEst) != labels)
        erroRate = aggErrors.sum() / m
        # Single-argument form prints the same text on Python 2 and 3.
        print("total rate: %s \n" % erroRate)
        if erroRate == 0.0:
            break
    return weakClassArr
# Train on the loaded data with up to 1000 rounds. In the end all 1000
# rounds were run and the error rate was roughly 0.14.
classify = adaBoostTrains(dataMat, classLabels, 1000)