前言:可以将不同的分类器组合起来,这种组合结果被称为集成方法(ensemble method)或元算法(meta-algorithm)
使用:1.不同算法的集成 2.同一算法下的不同设置集成 3.不同部分分配给不同分类器的集成
算法介绍:AdaBoost
优点:泛化错误率低,易编码,可以应用在大部分的分类器上,无参数调整
缺点:对离群点敏感(离群点是指一个时间序列中,远离序列的一般水平的极端大值和极端小值)
适用数据类型:数值型或者标称型数据
7-1 :单层决策树生成函数
# -*- coding: utf-8 -*-
from numpy import *
def loadSimpData():
    """Return the five-point toy data set used by the AdaBoost examples.

    Returns:
        A ``(datMat, classLabels)`` pair. ``datMat`` is a 5x2 NumPy matrix
        with one sample per row; ``classLabels`` is a list of five labels,
        each ``1.0`` or ``-1.0``, in the same order as the rows.
    """
    datMat = matrix([
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0],
    ])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels
#通过阈值比较对数据进行分类函数,在阈值一边的会分到-1类别,另一边的分到类别+1
#先全部初始化为+1,再把满足所选不等式('lt' 时为 <= 阈值,否则为 > 阈值)的样本置为-1
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every row with a single-feature threshold test.

    Args:
        dataMatrix: 2-D sample container indexed as ``dataMatrix[:, dimen]``,
            one sample per row.
        dimen: Column index of the feature that is compared.
        threshVal: Threshold the feature value is compared against.
        threshIneq: ``'lt'`` marks rows whose feature is ``<= threshVal`` as
            -1.0. Any other value (callers in this file pass ``'gt'``) marks
            rows whose feature is ``> threshVal`` as -1.0.

    Returns:
        An ``(m, 1)`` ndarray holding +1.0 or -1.0 for each of the m rows.
    """
    # Start with every sample in the +1 class, then flip the rows that
    # satisfy the chosen inequality.
    retArray = ones((shape(dataMatrix)[0], 1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray
#遍历上述函数所有可能输入,找到最佳单层决策树
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Every feature column is scanned. For each one, thresholds are swept from
    one step below the column minimum up to the column maximum in 10 equal
    steps, and both inequality directions are tried at every threshold.

    Args:
        dataArr: Samples, one per row; converted with ``mat``.
        classLabels: Sequence of labels, one per sample; converted to a
            column matrix.
        D: Per-sample weights. Callers in this file pass an ``(m, 1)``
            matrix, so ``D.T * errArr`` is a 1x1 weighted error.

    Returns:
        ``(bestStump, minError, bestClasEst)``: the stump as a dict with keys
        ``'dim'``, ``'thresh'`` and ``'ineq'``, its weighted error (the 1x1
        product ``D.T * errArr``), and that stump's per-sample predictions.
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0  # number of threshold steps across each feature's range
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # any real weighted error is smaller
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j starts at -1 so the first threshold lies below every sample.
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality direction, so
            # it is computed once per step.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 where the prediction is wrong, 0 where it is right.
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # Sum of the weights of the misclassified samples.
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
运行
# -*- coding: utf-8 -*-
import adaboost
from numpy import *
# Give each of the five samples the same initial weight.
D = mat(ones((5, 1)) / 5)
dataMat, classLabels = adaboost.loadSimpData()
# A single parenthesised argument prints identically on Python 2 and 3.
print(adaboost.buildStump(dataMat, classLabels, D))
效果:
({'dim': 0, 'ineq': 'lt', 'thresh': 1.3}, matrix([[ 0.2]]), array([[-1.],
[ 1.],
[-1.],
[-1.],
[ 1.]]))
7.4 完整AdaBoost训练过程:
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round builds the best stump for the current sample weights, records
    it with its weight ``alpha``, re-weights the samples, and stops early
    once the combined classifier makes no error on the training data.
    Progress is printed every round.

    Args:
        dataArr: Training samples, one per row.
        classLabels: Sequence of labels, one per sample.
        numIt: Maximum number of boosting rounds.

    Returns:
        A list of stump dicts as produced by ``buildStump``, each with an
        added ``'alpha'`` entry.
    """
    weakClassArr = []
    m = shape(dataArr)[0]  # number of samples
    labelMat = mat(classLabels).T  # loop-invariant column of labels
    D = mat(ones((m, 1)) / m)  # start with uniform sample weights
    aggClassEst = mat(zeros((m, 1)))  # running alpha-weighted vote per sample
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # The "%s" form with one argument prints the same text on Python 2
        # and Python 3 as the original two-item print statement.
        print("D: %s" % (D.T,))
        # buildStump hands back the error as a 1x1 matrix; pull the scalar
        # out explicitly rather than calling float() on a 2-D object.
        err = float(asarray(error).item())
        # Floor the denominator so a perfect stump does not divide by zero.
        # Written as a conditional so it does not matter whether `max` is
        # the builtin or NumPy's under the star import.
        denom = err if err > 1e-16 else 1e-16
        alpha = float(0.5 * log((1.0 - err) / denom))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("classEst :  %s" % (classEst.T,))
        # exp(-alpha) for correctly classified samples, exp(+alpha) for
        # misclassified ones; then renormalise so the weights sum to 1.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        print("aggClassEst :  %s" % (aggClassEst.T,))
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("Total error :  %s \n" % (errorRate,))
        if errorRate == 0.0:
            break
    return weakClassArr
样例输入:
# -*- coding: utf-8 -*-
import adaboost
from numpy import *
dataMat, classLabels = adaboost.loadSimpData()
# Train for at most 9 rounds; adaBoostTrainDS builds its own sample weights.
classifierArray = adaboost.adaBoostTrainDS(dataMat, classLabels, 9)
# A single parenthesised argument prints identically on Python 2 and 3.
print(classifierArray)
样例输出:
D: [[ 0.2 0.2 0.2 0.2 0.2]]
classEst : [[-1. 1. -1. -1. 1.]]
aggClassEst : [[-0.69314718 0.69314718 -0.69314718 -0.69314718 0.69314718]]
Total error : 0.2
D: [[ 0.5 0.125 0.125 0.125 0.125]]
classEst : [[ 1. 1. -1. -1. -1.]]
aggClassEst : [[ 0.27980789 1.66610226 -1.66610226 -1.66610226 -0.27980789]]
Total error : 0.2
D: [[ 0.28571429 0.07142857 0.07142857 0.07142857 0.5 ]]
classEst : [[ 1. 1. 1. 1. 1.]]
aggClassEst : [[ 1.17568763 2.56198199 -0.77022252 -0.77022252 0.61607184]]
Total error : 0.0
[{'dim': 0, 'ineq': 'lt', 'thresh': 1.3, 'alpha': 0.6931471805599453}, {'dim': 1, 'ineq': 'lt', 'thresh': 1.0, 'alpha': 0.9729550745276565}, {'dim': 0, 'ineq': 'lt', 'thresh': 0.90000000000000002, 'alpha': 0.8958797346140273}]
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained list of weighted decision stumps.

    Args:
        datToClass: Samples to classify, one per row; converted with ``mat``.
        classifierArr: Iterable of stump dicts with keys ``'dim'``,
            ``'thresh'``, ``'ineq'`` and ``'alpha'``.

    Returns:
        An ``(m, 1)`` matrix with the sign of the alpha-weighted sum of the
        stump predictions for each sample.
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    # Apply each already-trained weak classifier and accumulate its vote.
    for classifier in classifierArr:
        classEst = stumpClassify(dataMatrix,
                                 classifier['dim'],
                                 classifier['thresh'],
                                 classifier['ineq'])
        aggClassEst += classifier['alpha'] * classEst
        # Show the running total after each classifier.
        print(aggClassEst)
    return sign(aggClassEst)
7-5 ROC曲线绘制和AUC计算函数
def plotROC(predStrengths, classLabels):
    """Plot a ROC curve and print the area under it.

    The curve is drawn from (1, 1) towards (0, 0): samples are visited in
    ascending order of prediction strength, stepping down for each positive
    sample and left for each negative one.

    Args:
        predStrengths: Prediction strengths. The code iterates
            ``predStrengths.argsort().tolist()[0]``, so this is presumably a
            1 x m matrix or 2-D array (verify against the caller).
        classLabels: Sequence of labels indexable by sample position; a
            label equal to ``1.0`` counts as positive, anything else as
            negative.

    Raises:
        ZeroDivisionError: If the labels contain no positive or no negative
            samples, because the step sizes are reciprocals of those counts.
    """
    import matplotlib.pyplot as plt

    cur = (1.0, 1.0)  # current drawing position, starting at the top right
    ySum = 0.0  # sum of curve heights, used for the AUC
    numPosClas = sum(array(classLabels) == 1.0)
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(len(classLabels) - numPosClas)
    sortedIndicies = predStrengths.argsort()  # indices in ascending order
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            # Positive sample: step down the true-positive axis.
            delX = 0
            delY = yStep
        else:
            # Negative sample: step left along the false-positive axis and
            # add the current height to the area sum.
            delX = xStep
            delY = 0
            ySum += cur[1]
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')  # diagonal reference line
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0, 1, 0, 1])
    plt.show()
    # Each left step adds a rectangle of width xStep and the height recorded
    # in ySum. The "%s" form prints the same text on Python 2 and 3 as the
    # original two-item print statement.
    print("the Area Under the Curve is:  %s" % (ySum * xStep,))