AdaBoost is an ensemble method and a form of supervised learning.
Combining different classifiers in this way is called an ensemble method, or meta-algorithm.
------------------------------------------------------------------------------------------------------------------------
bagging: short for bootstrap aggregating. From the original dataset (of size N), draw S new datasets (each also of size N) by sampling with replacement, so a new dataset may contain duplicates and may omit some of the original examples. Running a learning algorithm on the S datasets yields S classifiers (all of the same type), and new data is classified by a vote of those S classifiers. Every classifier carries equal weight, and since no classifier depends on another, we call this parallel training. Random forests, for example, are an advanced bagging method (a minimal sketch of the resample-and-vote idea follows the next definition).
boosting: an ensemble technique similar to bagging, but here the classifiers are trained sequentially, because each new classifier is built with reference to the performance of those already trained (it concentrates on the examples the existing classifiers got wrong), and the classifiers do not carry equal weight. (See the weight vector D below: after each stump is obtained, the algorithm increases the weights of misclassified examples and decreases the weights of correctly classified ones.)
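↓ For contrast, a minimal bagging sketch (assuming dataArr and labels are numpy arrays with +1/-1 labels; trainClassifier is a hypothetical stand-in for any base learner that returns a predict callable; only the resample-and-vote logic is the point):
import numpy as np
def bagging(dataArr, labels, trainClassifier, S):
    N = len(dataArr)
    classifiers = []
    for _ in range(S):
        idx = np.random.randint(0, N, N)            # N indices drawn with replacement: a bootstrap sample
        classifiers.append(trainClassifier(dataArr[idx], labels[idx]))
    return classifiers
def baggingPredict(classifiers, x):
    votes = sum(clf(x) for clf in classifiers)      # every classifier gets an equal vote
    return np.sign(votes)                           # majority vote for +1/-1 labels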
------------------------------------------------------------------------------------------------------------------------
↓ An AdaBoost classifier built on decision stumps (single-level decision trees), a commonly used boosting method:
import numpy as np
'''
1. A decision stump is a simple weak classifier
2. AdaBoost builds a strong classifier from many weak ones
'''
# Classify data by comparing a single feature against a threshold
def stumpClassify(dataM, dim, threshV, threshIneq):
    retArray = np.ones((np.shape(dataM)[0], 1))
    if threshIneq == 'lt':
        retArray[dataM[:, dim] <= threshV] = -1.0   # 'lt': values <= threshold get label -1
    else:
        retArray[dataM[:, dim] > threshV] = -1.0    # 'gt': values > threshold get label -1
    return retArray
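↓ A quick sanity check of stumpClassify on a made-up 3x2 matrix: with dim=0, threshV=1.5 and 'lt', rows whose first feature is <= 1.5 get -1:
demoM = np.mat([[1.0, 2.1], [2.0, 1.1], [1.3, 1.0]])
print(stumpClassify(demoM, 0, 1.5, 'lt').T)         # -> [[-1.  1. -1.]]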
# Find the best decision stump for the data, given a weight vector D
def buildStump(dataArr, classLabels, D):
    dataM = np.mat(dataArr)
    labelM = np.mat(classLabels).T
    m, n = np.shape(dataM)
    numSteps = 10.0
    bestStump = {}
    bestClassEst = np.mat(np.zeros((m, 1)))
    minErr = np.inf
    for i in range(n):                              # loop over every feature (column)
        rangeMin = dataM[:, i].min()
        rangeMax = dataM[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):      # step the threshold across the feature's range
            for inequal in ['lt', 'gt']:            # try both inequality directions
                threshV = rangeMin + float(j) * stepSize
                predV = stumpClassify(dataM, i, threshV, inequal)
                errArr = np.mat(np.ones((m, 1)))
                errArr[predV == labelM] = 0
                weightErr = float(D.T * errArr)     # weighted error rate (each error counted by its weight in D)
                print('split: dim %d, thresh %.2f, thresh inequal: %s, weightErr is %.3f' %
                      (i, threshV, inequal, weightErr))
                if weightErr < minErr:              # keep the stump with the lowest weighted error
                    minErr = weightErr
                    bestClassEst = predV.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshV
                    bestStump['ineq'] = inequal
    return bestStump, minErr, bestClassEst
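↓ A quick check of buildStump with uniform weights on a hypothetical five-point toy set (the numbers are illustrative only):
dataToy = [[1.0, 2.1], [2.0, 1.1], [1.3, 1.0], [1.0, 1.0], [2.0, 1.0]]
labelToy = [1.0, 1.0, -1.0, -1.0, 1.0]
D0 = np.mat(np.ones((5, 1)) / 5)                    # uniform initial weights, as in adaBoostTrainDS below
stump, err, est = buildStump(dataToy, labelToy, D0)
print(stump, err)                                   # the best dim/thresh/ineq found and its weighted error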
# AdaBoost training based on decision stumps (DS)
def adaBoostTrainDS(data, classLabels, iters=40):
    weakClassArr = []
    m = np.shape(data)[0]
    D = np.mat(np.ones((m, 1)) / m)                 # initial weights: uniform over the m examples
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(iters):
        bestStump, err, classEst = buildStump(data, classLabels, D)
        print('D: ', D.T)
        # alpha = 0.5 * ln((1 - err) / err) is this stump's vote weight;
        # max(err, 1e-16) guards against division by zero when err == 0
        alpha = float(0.5 * np.log((1.0 - err) / max(err, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)              # each iteration adds the best weak classifier for the current D
        # update D: correctly classified examples are scaled by e^(-alpha),
        # misclassified ones by e^(alpha), then D is renormalized
        expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst             # running weighted sum of stump outputs; also used for the error rate
        aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1)))
        errRate = aggErrors.sum() / m               # np.sign gives +1/-1 (binary classification)
        print('total error: ', errRate, '\n')
        if errRate == 0.0:
            break
    return weakClassArr
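To make the update concrete: if a stump's weighted error is err = 0.2, then alpha = 0.5*ln((1-0.2)/0.2) = 0.5*ln(4) ≈ 0.693; each correctly classified example's weight in D is multiplied by e^(-0.693) ≈ 0.5 and each misclassified one's by e^(0.693) ≈ 2, after which D is renormalized to sum to 1, so the next stump focuses on the previous mistakes.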
# --------------------------------------- Test the classifier --------------------------------------------
def adaBoostClassify(dataTest, classifyArr):
    dataM = np.mat(dataTest)
    m = np.shape(dataM)[0]
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(len(classifyArr)):               # run every weak classifier once
        classEst = stumpClassify(dataM, classifyArr[i]['dim'], classifyArr[i]['thresh'], classifyArr[i]['ineq'])
        aggClassEst += classifyArr[i]['alpha'] * classEst   # accumulate the alpha-weighted votes
        print(aggClassEst)
    return np.sign(aggClassEst)                     # sign of the weighted sum gives the binary label
# loadData and filename are placeholders: supply your own loader that returns
# training/test feature arrays and +1/-1 label lists
dataTrain, labelTrain, dataTest, labelTest = loadData(filename)
classifyArr = adaBoostTrainDS(dataTrain, labelTrain)
labelPred = adaBoostClassify(dataTest, classifyArr)
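↓ To get a test error rate from the predictions (assuming labelTest holds +1/-1 labels):
errArrTest = np.mat(np.ones((len(dataTest), 1)))
errArrTest[labelPred == np.mat(labelTest).T] = 0    # zero out the correct predictions
print('test error rate: ', errArrTest.sum() / len(dataTest))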