The SMO Algorithm
SMO stands for Sequential Minimal Optimization. It solves the large optimization problem by decomposing it into a series of small optimization problems.
Inspecting the SVM dual objective, we can see that the final goal is to compute an optimal set of multipliers $\alpha_1, \dots, \alpha_m$ and the bias term $b$. The basic idea of SMO is to fix all parameters other than one $\alpha_i$ and then optimize over $\alpha_i$ alone. However, because of the constraint $\sum_{i=1}^{m} \alpha_i y_i = 0$, once all variables other than $\alpha_i$ are fixed, $\alpha_i$ is fully determined by them, so a single variable cannot move on its own. SMO therefore selects two variables $\alpha_i$ and $\alpha_j$ at a time and fixes the rest. After initializing the parameters, SMO repeats the following two steps until convergence:
- select a pair of variables $\alpha_i$ and $\alpha_j$ to update
- fix all parameters other than $\alpha_i$ and $\alpha_j$, and solve for the updated $\alpha_i$ and $\alpha_j$
SMO is efficient because, once the other parameters are fixed, optimizing just two variables can be done very quickly, in closed form. The constraints of the SVM dual objective are:

$$\sum_{i=1}^{m} \alpha_i y_i = 0, \qquad 0 \le \alpha_i \le C.$$

Considering only $\alpha_i$ and $\alpha_j$, the equality constraint can be rewritten as

$$\alpha_i y_i + \alpha_j y_j = c, \qquad 0 \le \alpha_i, \alpha_j \le C,$$

where

$$c = -\sum_{k \ne i,j} \alpha_k y_k$$

is the constant that makes the equality constraint hold.
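Concretely, the two-variable subproblem has a closed-form solution, which the code below implements. Writing $E_k = f(x_k) - y_k$ for the prediction error on sample $k$, the standard update is

$$\eta = 2\langle x_i, x_j \rangle - \langle x_i, x_i \rangle - \langle x_j, x_j \rangle, \qquad \alpha_j^{new} = \alpha_j - \frac{y_j (E_i - E_j)}{\eta},$$

after which $\alpha_j^{new}$ is clipped into a box $[L, H]$ derived from the constraints, and $\alpha_i$ moves by the same amount in the opposite direction:

$$\alpha_i^{new} = \alpha_i + y_i y_j (\alpha_j^{old} - \alpha_j^{new}).$$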
Code: A Simplified SMO Implementation
import numpy as np
# Helper functions for the SMO algorithm
# Load the dataset
def loadDataSet(fileName):
dataMat = []
labelMat = []
fr = open(fileName)
for line in fr.readlines():
lineArr = line.strip().split('\t')
dataMat.append([float(lineArr[0]), float(lineArr[1])])
labelMat.append(float(lineArr[2]))
return dataMat, labelMat
def selectJrand(i, m):
"""
i:第一个alpha的下标
m:所有alpha的数目
return:不等于i的随机的第二个alpha下标
"""
    j = i
    while j == i:
j = int(np.random.uniform(0, m))
return j
# Clip alpha so that it stays within [L, H]
def clipAlpha(aj, H, L):
if aj > H:
aj = H
if L > aj:
aj = L
return aj
dataArr, labelArr = loadDataSet('testSet.txt')
np.unique(labelArr)
Here we first define three helper functions, then load the dataset with loadDataSet and print the distinct class labels with numpy.unique. As shown below, the class labels are -1 and 1.
array([-1., 1.])
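As a quick sanity check (purely illustrative), the two helpers behave as follows:

# selectJrand returns a random index different from i;
# clipAlpha clamps a value into the interval [L, H]
print(selectJrand(0, 100))       # some j in [0, 100) with j != 0
print(clipAlpha(5.0, 4.0, 0))    # 4.0 (clipped to H)
print(clipAlpha(-1.0, 4.0, 0))   # 0 (clipped to L)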
# Simplified SMO
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    '''
    dataMatIn: the dataset
    classLabels: class labels
    C: the slack constant C
    toler: tolerance for violating the KKT conditions
    maxIter: maximum number of full passes without any alpha change before exiting
    '''
dataMatrix = np.mat(dataMatIn)
labelMat = np.mat(classLabels).transpose()
    b = 0  # bias term
m, n = dataMatrix.shape
    alphas = np.mat(np.zeros((m, 1)))  # initialize all alphas to 0
iteration = 0
while iteration < maxIter:
        alphaPairsChanged = 0  # counts how many alpha pairs change in this pass
for i in range(m):
fXi = float(np.multiply(alphas, labelMat).T * (dataMatrix*dataMatrix[i,:].T)) + b
            Ei = fXi - float(labelMat[i])  # prediction error
            # if this alpha violates the KKT conditions (beyond toler), try to optimize it
if((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or \
((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                j = selectJrand(i, m)  # randomly pick the second alpha
fXj = float(np.multiply(alphas, labelMat).T * (dataMatrix*dataMatrix[j,:].T)) + b
Ej = fXj - float(labelMat[j])
alphaIold = alphas[i].copy()
alphaJold = alphas[j].copy()
                # compute L and H so the new alphas stay within [0, C]
if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
else:
L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
if L == H:
print('L == H')
continue
                # eta is the optimal amount to change alpha[j] (the negative second derivative)
eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - \
dataMatrix[i,:]*dataMatrix[i,:].T - \
dataMatrix[j,:]*dataMatrix[j,:].T
if eta >= 0:
print('eta >= 0')
continue
alphas[j] -= labelMat[j]*(Ei - Ej)/eta
alphas[j] = clipAlpha(alphas[j], H, L)
if np.abs(alphas[j] - alphaJold) < 0.00001:
print('j not moving enough')
continue
                # update alpha[i] by the same amount as alpha[j], in the opposite direction
alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])
b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) *\
dataMatrix[i,:] * dataMatrix[i,:].T - \
labelMat[j] * (alphas[j] - alphaJold) *\
dataMatrix[i,:] * dataMatrix[j,:].T
b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) *\
dataMatrix[i,:] * dataMatrix[j,:].T - \
labelMat[j] * (alphas[j] - alphaJold) *\
dataMatrix[j,:] * dataMatrix[j,:].T
                if (0 < alphas[i]) and (C > alphas[i]):
b = b1
elif (0 < alphas[j]) and (C > alphas[j]):
b = b2
else:
b = (b1 + b2)/2.0
alphaPairsChanged += 1
print('iter: %d i: %d, pairs changed %d' % (iteration, i, alphaPairsChanged))
if alphaPairsChanged == 0:
iteration += 1
else:
iteration = 0
print('iteration number: %d' % iteration)
return b, alphas
Run the function and time it.
%%timeit
b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40)
The simplified SMO runs in:
4.51 s ± 905 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
We can inspect the alpha matrix; since most of its entries are 0, we filter those out first.
alphas[alphas != 0]
output: matrix([[0.13595567, 0.21153347, 0.01845807, 0.36594721]])
We can also check which data points are the support vectors.
# Print the support vectors
for i in range(100):
if alphas[i] != 0:
        print(dataArr[i], labelArr[i])
The output:
[4.658191, 3.507396] -1.0
[3.457096, -0.082216] -1.0
[2.893743, -1.643468] -1.0
[6.080573, 0.418886] 1.0
Visualization
Given the alphas and b, we can also compute the weight vector $w = \sum_{i} \alpha_i y_i x_i$, draw the separating hyperplane, and mark which points are the support vectors.
# Compute the weight vector w from the alphas
def calcWs(alphas, dataArr, classLabels):
X = np.mat(dataArr)
labelMat = np.mat(classLabels).transpose()
m, n = np.shape(X)
w = np.zeros((n, 1))
for i in range(m):
w += np.multiply(alphas[i]*labelMat[i], X[i,:].T)
return w
ws = calcWs(alphas, dataArr, labelArr)
ws
The output:
array([[ 0.80480726],
[-0.29040192]])
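With ws and b in hand (from a plain run of smoSimple, since %%timeit does not keep the variables it creates), a point can be classified by the sign of $f(x) = w^\top x + b$. A quick check on the first training point:

# f(x) = w^T x + b; the sign gives the predicted class
datMat = np.mat(dataArr)
print(datMat[0] * np.mat(ws) + b)  # compare the sign of this value...
print(labelArr[0])                 # ...with the true label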
Now we can plot.
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
xcord0 = []
ycord0 = []
xcord1 = []
ycord1 = []
markers =[]
colors =[]
dataMat = np.mat(dataArr)
for i in range(len(dataArr)):
if (labelArr[i] == -1):
xcord0.append(dataMat[i, 0])
ycord0.append(dataMat[i, 1])
else:
xcord1.append(dataMat[i, 0])
ycord1.append(dataMat[i, 1])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xcord0,ycord0, marker='s', s=50)
ax.scatter(xcord1,ycord1, marker='o', s=50, c='red')
plt.title('Support vectors circled')
circle = Circle((4.658191, 3.507396), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5)
ax.add_patch(circle)
circle = Circle((3.457096, -0.082216), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5)
ax.add_patch(circle)
circle = Circle((2.893743, -1.643468), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5)
ax.add_patch(circle)
circle = Circle((6.080573, 0.418886), 0.5, facecolor='none', edgecolor=(0,0.8,0.8), linewidth=3, alpha=0.5)
ax.add_patch(circle)
x = np.arange(-2.0, 12.0, 0.1).reshape((140,1))
y = (-ws[0]*x - b)/ws[1]
ax.plot(x,y)
ax.axis([-2,12,-8,6])
plt.show()
The resulting figure shows the two classes, the fitted separating line, and the four support vectors circled.
Speeding Up Optimization with the Full Platt SMO Algorithm
The full Platt SMO differs from the simplified version in two ways: it caches the error values and chooses the second alpha heuristically, picking the j that maximizes the step size |Ei - Ej| instead of picking at random; and it alternates between full passes over the dataset and passes over only the non-bound alphas (those with 0 < alpha < C).
class optStruct:
def __init__(self, dataMatIn, classLabels, C, toler):
self.X = dataMatIn
self.labelMat = classLabels
self.C = C
self.toler = toler
self.m = dataMatIn.shape[0]
self.alphas = np.mat(np.zeros((self.m, 1)))
self.b = 0
        self.eCache = np.mat(np.zeros((self.m, 2)))  # error cache: column 0 is a valid flag, column 1 is the error
# Compute the error E for sample k
def calcEk(oS, k):
fXk = float(np.multiply(oS.alphas, oS.labelMat).T * (oS.X * oS.X[k,:].T)) + oS.b
Ek = fXk - float(oS.labelMat[k])
return Ek
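In symbols, calcEk evaluates

$$f(x_k) = \sum_{i=1}^{m} \alpha_i y_i \langle x_i, x_k \rangle + b, \qquad E_k = f(x_k) - y_k.$$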
# Choose the second alpha j by maximizing the step size |Ei - Ej|
def selectJ(i, oS, Ei):
maxK = -1
maxDeltaE = 0
Ej = 0
oS.eCache[i] = [1, Ei]
validEcacheList = np.nonzero(oS.eCache[:,0].A)[0]
if len(validEcacheList) > 1:
for k in validEcacheList:
            if k == i:
continue
Ek = calcEk(oS, k)
deltaE = np.abs(Ei - Ek)
            # keep the j with the largest step size
if deltaE > maxDeltaE:
maxK = k
maxDeltaE = deltaE
Ej = Ek
return maxK, Ej
else:
j = selectJrand(i, oS.m)
Ej = calcEk(oS, j)
return j, Ej
def updateEk(oS, k):
Ek = calcEk(oS, k)
oS.eCache[k] = [1, Ek]
def innerL(i, oS):
Ei = calcEk(oS, i)
if ((oS.labelMat[i] * Ei < -oS.toler) and (oS.alphas[i] < oS.C)) or\
((oS.labelMat[i] * Ei > oS.toler) and (oS.alphas[i] > 0)):
j, Ej = selectJ(i, oS, Ei)
alphaIold = oS.alphas[i].copy()
alphaJold = oS.alphas[j].copy()
if oS.labelMat[i] != oS.labelMat[j]:
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
else:
L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
if L == H:
# print('L == H')
return 0
eta = 2.0 * oS.X[i, :]*oS.X[j, :].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].T
if eta >= 0:
# print('eta >= 0')
return 0
oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta
oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
updateEk(oS, j)
if np.abs(oS.alphas[j] - alphaJold) < 0.00001:
# print('j not moving enough')
return 0
oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i] * (alphaJold - oS.alphas[j])
updateEk(oS, i)
b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) *\
oS.X[i,:]*oS.X[i,:].T - oS.labelMat[j] * \
(oS.alphas[j] - alphaJold) * oS.X[i,:]*oS.X[j,:].T
b2 = oS.b - Ej - oS.labelMat[i] * (oS.alphas[i] - alphaIold) *\
oS.X[i,:]*oS.X[j,:].T - oS.labelMat[j] *\
(oS.alphas[j] - alphaJold) * oS.X[j,:]*oS.X[j,:].T
if (0 < oS.alphas[i]) and (oS.alphas[i] < oS.C):
oS.b = b1
elif (0 < oS.alphas[j]) and (oS.alphas[j] < oS.C):
oS.b = b2
else:
oS.b = (b1 + b2)/2.0
return 1
else:
return 0
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler)
iteration = 0
entireSet = True
alphaPairsChanged = 0
while (iteration < maxIter) and ((alphaPairsChanged > 0) or entireSet):
alphaPairsChanged = 0
if entireSet:
            # full pass over all samples
for i in range(oS.m):
alphaPairsChanged += innerL(i, oS)
iteration += 1
else:
            # pass over the non-bound alphas only
nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
for i in nonBoundIs:
alphaPairsChanged += innerL(i, oS)
iteration += 1
if entireSet:
entireSet = False
elif alphaPairsChanged == 0:
entireSet = True
return oS.b, oS.alphas
Run and time it.
%%timeit
b, alphas = smoP(dataArr, labelArr, 0.1, 0.1, 40)
The full Platt SMO runs in:
73.7 ms ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Checking the support vectors (sketched below):
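Because %%timeit discards the variables it creates, the sketch below reruns smoP and prints the points with nonzero alphas:

# Rerun the full Platt SMO and print the support vectors
b, alphas = smoP(dataArr, labelArr, 0.1, 0.1, 40)
for i in range(len(dataArr)):
    if alphas[i] != 0:
        print(dataArr[i], labelArr[i])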
[4.658191, 3.507396] -1.0
[3.457096, -0.082216] -1.0
[2.893743, -1.643468] -1.0
[6.080573, 0.418886] 1.0
These are the same points found by the simplified SMO.
Kernel Functions
In real-world tasks, beyond the linearly separable examples above, there is also data that is not linearly separable. The classic XOR problem, for example, cannot be separated by any line.
For such cases, we can map the samples from the original space into a higher-dimensional feature space in which they become linearly separable; for instance, mapping a two-dimensional space into three dimensions can make a suitable separating hyperplane appear.
This mapping from one feature space to another is implemented through a kernel function. If $\phi(x)$ denotes the mapping, the dual problem only needs the inner products $\phi(x_i)^\top \phi(x_j)$, which are hard to compute directly because the feature space may be very high-dimensional. To sidestep this obstacle, we can imagine a function

$$\kappa(x_i, x_j) = \phi(x_i)^\top \phi(x_j),$$

so that the inner product in the feature space is computed directly in the original input space.
The Radial Basis Function Kernel
A radial basis function takes a vector as input and outputs a scalar based on a distance computation; the distance can be measured from the origin <0,0> or from some other vector. Below we use the Gaussian version of the RBF kernel, matching the code that follows:

$$k(x, y) = \exp\!\left(\frac{-\lVert x - y \rVert^2}{\sigma^2}\right)$$

where $\sigma$ is a user-defined parameter that determines the reach, i.e., how quickly the function value falls off toward 0.
# Kernel transformation: compute one column of the kernel matrix
def kernelTrans(X, A, kTup):
m, n = np.shape(X)
K = np.mat(np.zeros((m,1)))
if kTup[0] == 'lin':
K = X * A.T
elif kTup[0] == 'rbf':
for j in range(m):
deltaRow = X[j,:] - A
K[j] = deltaRow * deltaRow.T
K = np.exp(K / (-1*kTup[1]**2))
else:
raise NameError('That Kernel is not recognized')
return K
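A quick illustration of kernelTrans (assuming the dataArr loaded earlier): comparing rows of a matrix against one of its own rows under the RBF kernel yields similarity 1.0 for the identical row.

# RBF similarity of the first five points to the first point
Xs = np.mat(dataArr[:5])
print(kernelTrans(Xs, Xs[0, :], ('rbf', 1.3)))  # first entry is 1.0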
# optStruct, extended with kernel parameters
class optStruct:
def __init__(self, dataMatIn, classLabels, C, toler, kTup):
self.X = dataMatIn
self.labelMat = classLabels
self.C = C
self.toler = toler
self.m = dataMatIn.shape[0]
self.alphas = np.mat(np.zeros((self.m, 1)))
self.b = 0
        self.eCache = np.mat(np.zeros((self.m, 2)))  # error cache
self.K = np.mat(np.zeros((self.m, self.m)))
for i in range(self.m):
self.K[:, i] = kernelTrans(self.X, self.X[i,:], kTup)
def calcEk(oS, k):
    fXk = float(np.multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
Ek = fXk - float(oS.labelMat[k])
return Ek
def innerL(i, oS):
Ei = calcEk(oS, i)
if ((oS.labelMat[i]*Ei < -oS.toler) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.toler) and (oS.alphas[i] > 0)):
j,Ej = selectJ(i, oS, Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
else:
L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
if L==H: return 0
eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]
if eta >= 0: return 0
oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
updateEk(oS, j)
if (abs(oS.alphas[j] - alphaJold) < 0.00001): return 0
oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
updateEk(oS, i)
b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
else: oS.b = (b1 + b2)/2.0
return 1
else: return 0
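One wiring detail: the smoP defined earlier constructs optStruct without forwarding kTup, so it would fail against the new five-argument constructor. A minimal sketch of the fix (identical logic, with kTup passed through):

# Redefine smoP so the kernel tuple reaches the new optStruct
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(), C, toler, kTup)
    iteration = 0
    entireSet = True
    alphaPairsChanged = 0
    while (iteration < maxIter) and ((alphaPairsChanged > 0) or entireSet):
        alphaPairsChanged = 0
        if entireSet:
            # full pass over all samples
            for i in range(oS.m):
                alphaPairsChanged += innerL(i, oS)
            iteration += 1
        else:
            # pass over the non-bound alphas only
            nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i, oS)
            iteration += 1
        if entireSet:
            entireSet = False
        elif alphaPairsChanged == 0:
            entireSet = True
    return oS.b, oS.alphas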
def testRbf(k1=1.3):
    dataArr, labelArr = loadDataSet('testSetRBF.txt')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))  # C=200 is important
    datMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]  # matrix containing only the support vectors
    labelSV = labelMat[svInd]
    print("there are %d Support Vectors" % np.shape(sVs)[0])
    m, n = np.shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print("the training error rate is: %f" % (float(errorCount) / m))
    dataArr, labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m, n = np.shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b
        if np.sign(predict) != np.sign(labelArr[i]):
            errorCount += 1
    print("the test error rate is: %f" % (float(errorCount) / m))
testRbf()
output:
there are 25 Support Vectors
the training error rate is: 0.000000
the test error rate is: 0.020000
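The kernel width matters: to see its effect, one can rerun testRbf with different values of k1 (a quick experiment; a smaller k1 typically produces more support vectors, driving training error down but risking higher test error):

# Sweep the RBF width parameter
for k1 in (0.1, 1.3, 5.0):
    print('k1 =', k1)
    testRbf(k1)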
Summary
- The support vector machine is a classifier.
- SVMs tend to have a low generalization error rate.
- Kernel methods, or the kernel trick, map data (sometimes nonlinear data) from a low-dimensional space to a high-dimensional one, turning a nonlinear problem in the low-dimensional space into a linear problem that can be solved in the high-dimensional space.
- SVM results are sensitive to the optimization parameters and to the parameters of the chosen kernel.