There are already plenty of blog posts introducing the principles of the SVM algorithm based on the SMO method, so the full derivation is not repeated here. For details see:
ML-支持向量:SVM、SVC、SVR、SMO原理推导及实现
SVM解释:五、SMO算法
This post focuses on a few points that are easy to get confused about in the actual implementation.
SMO heuristic selection rules:
Why is an $\alpha_i$ that violates the KKT conditions the $\alpha_i$ we are looking for?
My current understanding: the goal of each step is to update $\alpha$, and an $\alpha$ that already satisfies the KKT conditions needs no update, so we search for an $\alpha$ that violates them. The reason for scanning the samples with $0<\alpha<C$ first is that these are the non-bound support vectors, which must satisfy $y_ig(x_i)=1$ exactly and are therefore the points most likely to violate the KKT conditions during training.
Therefore, the search first goes through the samples with $0<\alpha<C$, and only if all of them satisfy the KKT conditions does it fall back to the whole training set.
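For reference, these are the KKT conditions being checked (the standard form, with $g(x_i)=\sum_{j=1}^{m}\alpha_j y_j K(x_i,x_j)+b$ the decision function used in the code below):
$$
\begin{aligned}
\alpha_i = 0 &\Leftrightarrow y_i g(x_i) \ge 1 \\
0 < \alpha_i < C &\Leftrightarrow y_i g(x_i) = 1 \\
\alpha_i = C &\Leftrightarrow y_i g(x_i) \le 1
\end{aligned}
$$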
Why, given $\alpha_i$, do we pick the $\alpha_j$ that maximizes $|E_i-E_j|$?
This follows from the SMO derivation (see the first reference above for details). The final update formula for $\alpha_j$ (written here as $\alpha_2$) is:
$$\alpha_2^{new,unc}=\alpha_2^{old}+\frac{y_2(E_1-E_2)}{K_{11}+K_{22}-2K_{12}}$$
where "unc" means unclipped (before the clipping step), $E_i=g(x_i)-y_i$ is the prediction error, and $K_{ij}=K(x_i,x_j)$.
This formula shows that the step taken in the current iteration is proportional to $|E_1-E_2|$ (the denominator $\eta=K_{11}+K_{22}-2K_{12}\ge 0$ is fixed once the pair is chosen). To speed up convergence we want $\alpha_2$ to change as much as possible per iteration, so we pick the $\alpha_2$ (i.e. the index $j$) that maximizes $|E_i-E_j|$.
Putting this together, the implementation is as follows:
def _KKT(self, i):
    y_g = self._g(i) * self.Y[i]
    if self.alpha[i] == 0:  # alpha = 0: requires yi*g(xi) - 1 >= 0
        return y_g >= 1
    elif 0 < self.alpha[i] < self.C:  # 0 < alpha < C: on the margin, requires yi*g(xi) = 1
        return y_g == 1
    else:  # alpha = C: inside the margin or misclassified (slack xi_i >= 0), requires yi*g(xi) <= 1
        return y_g <= 1

def _init_alpha(self):  # find the pair (i, j) that maximizes |E1 - E2|
    # the outer loop first scans all samples with 0 < alpha < C
    index_list = [i for i in range(self.m) if 0 < self.alpha[i] < self.C]
    # then falls back to the rest of the training set
    non_satisfy_list = [i for i in range(self.m) if i not in index_list]
    index_list.extend(non_satisfy_list)  # non-bound samples first, the rest after
    for i in index_list:
        if self._KKT(i):
            continue  # skip this sample (continue skips one iteration; break would exit the whole loop)
        E1 = self.E[i]
        # if E1 is positive, choose the smallest E as E2; if E1 is negative, choose the largest
        if E1 >= 0:
            j = min(range(self.m), key=lambda x: self.E[x])
            # lambda defines an anonymous function, https://www.zhihu.com/question/20125256
        else:
            j = max(range(self.m), key=lambda x: self.E[x])
        # return as soon as a pair is found; otherwise keep looping
        return i, j
    # every sample satisfies the KKT conditions: signal convergence
    return -1, -1
The key points are in the commented lines.
The concrete implementation of the clipping bounds:
if self.Y[i1] == self.Y[i2]:  # re-derive alpha2's feasible bounds [L, H] from the box constraint [0, C]
    L = max(0, self.alpha[i1] + self.alpha[i2] - self.C)
    H = min(self.C, self.alpha[i1] + self.alpha[i2])
else:
    L = max(0, self.alpha[i2] - self.alpha[i1])
    H = min(self.C, self.C + self.alpha[i2] - self.alpha[i1])
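These bounds follow from the equality constraint: with all other multipliers fixed, $\alpha_1 y_1+\alpha_2 y_2$ is a constant, and both multipliers must stay in $[0,C]$. Solving $0\le\alpha_1\le C$ for $\alpha_2$ and intersecting with $[0,C]$ gives:
$$
y_1=y_2:\quad L=\max(0,\ \alpha_2^{old}+\alpha_1^{old}-C),\quad H=\min(C,\ \alpha_2^{old}+\alpha_1^{old})
$$
$$
y_1\ne y_2:\quad L=\max(0,\ \alpha_2^{old}-\alpha_1^{old}),\quad H=\min(C,\ C+\alpha_2^{old}-\alpha_1^{old})
$$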
Combining the derivation with the clipping step, $\alpha_2^{new}$ is updated as follows:
$$
\alpha_2^{new}=
\begin{cases}
H, & \alpha_2^{new,unc}>H \\
\alpha_2^{new,unc}, & L\le\alpha_2^{new,unc}\le H \\
L, & \alpha_2^{new,unc}<L
\end{cases}
$$
How is $\alpha_1^{new}$ then obtained?
Once $\alpha_1$ and $\alpha_2$ are chosen as the working pair, $\sum_{i=3}^m{\alpha_iy_i}$ can be treated as a constant, so
$$\alpha_1^{new}y_1+\alpha_2^{new}y_2=-\sum_{i=3}^m{\alpha_iy_i}=\gamma$$
Since this constraint holds both before and after the update of $\alpha_2$:
$$\alpha_1^{new}y_1+\alpha_2^{new}y_2=\gamma=\alpha_1^{old}y_1+\alpha_2^{old}y_2$$
Multiplying both sides by $y_1$ and using $y_1^2=1$ gives:
$$\alpha_1^{new}=\alpha_1^{old}+y_1y_2(\alpha_2^{old}-\alpha_2^{new})$$
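As a quick numeric sanity check (a standalone sketch with made-up values, not part of the class), the update formula indeed preserves the linear constraint:
import numpy as np

# hypothetical working pair: labels and old/new multiplier values
y1, y2 = 1.0, -1.0
alpha1_old, alpha2_old = 0.3, 0.5
alpha2_new = 0.7  # pretend this came out of the clipped update

# the alpha1 update derived above
alpha1_new = alpha1_old + y1 * y2 * (alpha2_old - alpha2_new)

# alpha1*y1 + alpha2*y2 must be unchanged by the update
assert np.isclose(alpha1_new * y1 + alpha2_new * y2,
                  alpha1_old * y1 + alpha2_old * y2)
print(alpha1_new)  # 0.5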
Here the kernel function from the original article has been simplified further, and a Gaussian kernel has been added.
def kernel(self, x1, x2):
    if self._kernel == 'linear':
        return np.dot(x1, x2)  # linear kernel: just the dot product
    elif self._kernel == 'poly':
        return (np.dot(x1, x2) + 1) ** 2  # polynomial kernel of degree 2
    elif self._kernel == 'gaussian':
        sigma = 1.0
        # Gaussian (RBF) kernel: note the *squared* Euclidean distance
        return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (2 * sigma ** 2))
    return 0
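One way to spot-check the Gaussian branch (a sketch assuming scikit-learn is available; sklearn's rbf_kernel uses the parametrization $\gamma = 1/(2\sigma^2)$):
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x1 = np.array([1.0, 2.0])
x2 = np.array([0.5, 1.0])
sigma = 1.0

ours = np.exp(-np.linalg.norm(x1 - x2) ** 2 / (2 * sigma ** 2))
ref = rbf_kernel(x1.reshape(1, -1), x2.reshape(1, -1),
                 gamma=1 / (2 * sigma ** 2))[0, 0]
assert np.isclose(ours, ref)
The complete implementation: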
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])  # select rows/columns by position
    # https://blog.csdn.net/w_weiying/article/details/81411257
    for i in range(len(data)):
        if data[i, -1] == 0:
            data[i, -1] = -1  # relabel class 0 as -1 (fix: assignment, not the comparison ==)
    return data[:, :2], data[:, -1]
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)  # shuffle and split the data
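Because the original relabeling used `==` (a comparison with no effect) rather than `=`, a quick check on the labels catches exactly this kind of slip (a minimal sketch):
# labels should now be exactly {-1, 1}; with the == bug they would still be {0, 1}
assert set(np.unique(y)) == {-1.0, 1.0}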
class SVM:
    def __init__(self, max_iter=1000, kernel='linear'):
        self.max_iter = max_iter
        self._kernel = kernel
    def init_args(self, features, labels):
        self.m, self.n = features.shape
        self.X = features
        self.Y = labels
        self.b = 0.0
        self.alpha = np.ones(self.m)
        # cache the errors Ei in a list
        self.E = [self._E(i) for i in range(self.m)]
        # penalty parameter C for the slack variables
        self.C = 1.0
    def _KKT(self, i):
        y_g = self._g(i) * self.Y[i]
        if self.alpha[i] == 0:  # alpha = 0: requires yi*g(xi) - 1 >= 0
            return y_g >= 1
        elif 0 < self.alpha[i] < self.C:  # 0 < alpha < C: on the margin, requires yi*g(xi) = 1
            return y_g == 1
        else:  # alpha = C: inside the margin or misclassified (slack xi_i >= 0), requires yi*g(xi) <= 1
            return y_g <= 1
    # _g(i) is the decision function value g(x) evaluated at sample X[i]
    def _g(self, i):
        r = self.b
        for j in range(self.m):
            r += self.alpha[j] * self.Y[j] * self.kernel(self.X[i], self.X[j])
        return r
    # kernel function
    def kernel(self, x1, x2):
        if self._kernel == 'linear':
            return np.dot(x1, x2)  # linear kernel: just the dot product
        elif self._kernel == 'poly':
            return (np.dot(x1, x2) + 1) ** 2  # polynomial kernel of degree 2
        elif self._kernel == 'gaussian':
            sigma = 1.0
            # Gaussian (RBF) kernel: note the *squared* Euclidean distance
            return np.exp(-np.linalg.norm(x1 - x2) ** 2 / (2 * sigma ** 2))
        return 0
    # the kernel definition from the original article (a closing parenthesis was missing in the linear branch)
    # def kernel(self, x1, x2):
    #     if self._kernel == 'linear':
    #         return sum([x1[k] * x2[k] for k in range(self.n)])
    #     elif self._kernel == 'poly':
    #         return (sum([x1[k] * x2[k] for k in range(self.n)]) + 1) ** 2
    #     return 0
    # E(i) is the residual between the prediction g(x_i) and the label y_i
    def _E(self, i):
        return self._g(i) - self.Y[i]
    def _init_alpha(self):  # find the pair (i, j) that maximizes |E1 - E2|
        # the outer loop first scans all samples with 0 < alpha < C
        index_list = [i for i in range(self.m) if 0 < self.alpha[i] < self.C]
        # then falls back to the rest of the training set
        non_satisfy_list = [i for i in range(self.m) if i not in index_list]
        index_list.extend(non_satisfy_list)  # non-bound samples first, the rest after
        for i in index_list:
            if self._KKT(i):
                continue  # skip this sample (continue skips one iteration; break would exit the whole loop)
            E1 = self.E[i]
            # if E1 is positive, choose the smallest E as E2; if E1 is negative, choose the largest
            if E1 >= 0:
                j = min(range(self.m), key=lambda x: self.E[x])
                # lambda defines an anonymous function, https://www.zhihu.com/question/20125256
            else:
                j = max(range(self.m), key=lambda x: self.E[x])
            # return as soon as a pair is found; otherwise keep looping
            return i, j
        # every sample satisfies the KKT conditions: signal convergence
        return -1, -1
    def _compare(self, _alpha, L, H):
        # clipping: project _alpha onto the interval [L, H]
        if _alpha > H:
            return H
        elif _alpha < L:
            return L
        else:
            return _alpha
    def fit(self, features, labels):
        # training loop
        self.init_args(features, labels)  # initialize parameters
        for t in range(self.max_iter):
            # pick the pair that maximizes |E1 - E2| so that alpha2 takes the largest step
            i1, i2 = self._init_alpha()
            if i1 < 0:  # every sample satisfies the KKT conditions: converged
                break
            # re-derive alpha2's feasible bounds [L, H] from the box constraint [0, C]
            if self.Y[i1] == self.Y[i2]:
                L = max(0, self.alpha[i1] + self.alpha[i2] - self.C)
                H = min(self.C, self.alpha[i1] + self.alpha[i2])
            else:
                L = max(0, self.alpha[i2] - self.alpha[i1])
                H = min(self.C, self.C + self.alpha[i2] - self.alpha[i1])
            E1 = self.E[i1]
            E2 = self.E[i2]
            # eta = K11 + K22 - 2*K12
            eta = self.kernel(self.X[i1], self.X[i1]) + self.kernel(
                self.X[i2], self.X[i2]) - 2 * self.kernel(self.X[i1], self.X[i2])
            if eta <= 0:
                continue
            alpha2_new_unc = self.alpha[i2] + self.Y[i2] * (E1 - E2) / eta
            alpha2_new = self._compare(alpha2_new_unc, L, H)  # clip alpha2 to [L, H]
            alpha1_new = self.alpha[i1] + self.Y[i1] * self.Y[i2] * (self.alpha[i2] - alpha2_new)
            b1_new = self.b
            b2_new = self.b
            if 0 < alpha1_new < self.C:
                b1_new = -E1 - self.Y[i1] * self.kernel(self.X[i1], self.X[i1]) * (
                    alpha1_new - self.alpha[i1]) - self.Y[i2] * self.kernel(self.X[i2], self.X[i1]) * (
                    alpha2_new - self.alpha[i2]) + self.b
            if 0 < alpha2_new < self.C:
                b2_new = -E2 - self.Y[i1] * self.kernel(self.X[i1], self.X[i2]) * (
                    alpha1_new - self.alpha[i1]) - self.Y[i2] * self.kernel(self.X[i2], self.X[i2]) * (
                    alpha2_new - self.alpha[i2]) + self.b
            b_new = (b1_new + b2_new) / 2
            # update alpha only after computing b: the (alpha_new - alpha_old)
            # terms above need the old multiplier values
            self.alpha[i1] = alpha1_new
            self.alpha[i2] = alpha2_new
            self.b = b_new
            # refresh the cached errors
            self.E[i1] = self._E(i1)
            self.E[i2] = self._E(i2)
        return 'train done!'
    def predict(self, data):
        r = self.b
        for i in range(self.m):
            r += self.alpha[i] * self.Y[i] * self.kernel(data, self.X[i])
        return 1 if r > 0 else -1
    def predicts(self, datas):
        # predict a batch of samples
        return np.array([self.predict(x) for x in datas])
    def score(self, X_test, y_test):
        right_count = 0
        for i in range(len(X_test)):
            result = self.predict(X_test[i])
            if result == y_test[i]:
                right_count += 1
        return right_count / len(X_test)
svm = SVM(max_iter=500, kernel='gaussian')
svm.fit(X_train, y_train)
print(svm.score(X_test, y_test))
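For comparison, the same split can be scored with scikit-learn's SVC as a rough baseline (a sketch; the accuracy varies with the random split, and SVC's rbf kernel matches the Gaussian kernel above when gamma = 1/(2*sigma^2)):
from sklearn.svm import SVC

clf = SVC(kernel='rbf', gamma=1 / (2 * 1.0 ** 2), C=1.0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))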