Platt's SMO algorithm selects the first alpha through an outer loop, and that selection alternates between two kinds of sweep:
- a single pass over the entire data set
- a single pass over the non-bound alphas (the non-bound alphas are the ones whose values are not at either bound, 0 or C)

Once the first alpha has been chosen, an inner loop selects the second alpha. During optimization the second alpha is picked so as to maximize the step size, measured as the gap between the cached errors of the two samples.
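For reference, the inner loop in the listing below applies the standard closed-form update to the second alpha; the summary here is only a restatement of what that code computes, with $K(i, j)$ denoting the inner product $x_i \cdot x_j$:

$$\eta = 2K(i, j) - K(i, i) - K(j, j), \qquad \alpha_j \leftarrow \alpha_j - \frac{y_j (E_i - E_j)}{\eta}$$

The result is then clipped into $[L, H]$, where $L = \max(0, \alpha_j - \alpha_i)$ and $H = \min(C,\, C + \alpha_j - \alpha_i)$ when $y_i \ne y_j$, and $L = \max(0, \alpha_i + \alpha_j - C)$, $H = \min(C, \alpha_i + \alpha_j)$ otherwise.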
import numpy as np


class Optimization:
    def __init__(self, data_mat_in, class_labels, c, toler):
        self.x = data_mat_in
        self.label_mat = class_labels
        self.c = c
        self.toler = toler
        self.m = np.shape(data_mat_in)[0]
        self.alphas = np.mat(np.zeros((self.m, 1)))
        self.b = 0
        # error cache: first column is a validity flag, second column is the cached error value
        self.e_cache = np.mat(np.zeros((self.m, 2)))
def load_data_set(file_name):
    data_mat = []
    label_mat = []
    with open(file_name) as file:
        for line in file.readlines():
            line_array = line.strip().split('\t')
            data_mat.append([float(line_array[0]), float(line_array[1])])
            label_mat.append(float(line_array[2]))
    return data_mat, label_mat
def select_rand_j(i, m):
    # pick a random index j different from i
    j = i
    while j == i:
        j = int(np.random.uniform(0, m))
    return j
def calc_ek(opt, k):
    # prediction error for sample k: f(x_k) - y_k
    fxk = float(np.multiply(opt.alphas, opt.label_mat).T * (opt.x * opt.x[k, :].T)) + opt.b
    ek = fxk - float(opt.label_mat[k])
    return ek
def clip_alpha(aj, high, low):
    # clip aj into the feasible interval [low, high]
    if aj > high:
        aj = high
    if aj < low:
        aj = low
    return aj
def select_j(i, opt, e_i):
    # choose the second alpha whose cached error differs most from e_i (maximum step size)
    max_k = -1
    max_delta_e = 0
    e_j = 0
    opt.e_cache[i] = [1, e_i]
    valid_e_cache_list = np.nonzero(opt.e_cache[:, 0].A)[0]
    if len(valid_e_cache_list) > 1:
        for k in valid_e_cache_list:
            if k == i:
                continue
            e_k = calc_ek(opt, k)
            delta_e = abs(e_i - e_k)
            if delta_e > max_delta_e:
                max_k = k
                max_delta_e = delta_e
                e_j = e_k
        return max_k, e_j
    else:
        # no valid cached errors yet, so fall back to a random choice
        j = select_rand_j(i, opt.m)
        e_j = calc_ek(opt, j)
        return j, e_j
def update_e_k(opt, k):
    # recompute the error for sample k and mark it valid in the cache
    e_k = calc_ek(opt, k)
    opt.e_cache[k] = [1, e_k]
def inner_loop(i, opt):
    e_i = calc_ek(opt, i)
    # proceed only if alpha_i violates the KKT conditions by more than the tolerance
    if (opt.label_mat[i] * e_i < -opt.toler and opt.alphas[i] < opt.c) \
            or (opt.label_mat[i] * e_i > opt.toler and opt.alphas[i] > 0):
        j, e_j = select_j(i, opt, e_i)
        alpha_i_old = opt.alphas[i].copy()
        alpha_j_old = opt.alphas[j].copy()
        # bounds that keep both alphas inside [0, c] while preserving the equality constraint
        if opt.label_mat[i] != opt.label_mat[j]:
            low = max(0, opt.alphas[j] - opt.alphas[i])
            high = min(opt.c, opt.c + opt.alphas[j] - opt.alphas[i])
        else:
            low = max(0, opt.alphas[j] + opt.alphas[i] - opt.c)
            high = min(opt.c, opt.alphas[j] + opt.alphas[i])
        if low == high:
            print("low == high")
            return 0
        eta = 2.0 * opt.x[i, :] * opt.x[j, :].T - opt.x[i, :] * opt.x[i, :].T \
            - opt.x[j, :] * opt.x[j, :].T
        if eta >= 0:
            print("eta >= 0")
            return 0
        # update alpha_j against the gradient, then clip it into [low, high]
        opt.alphas[j] -= opt.label_mat[j] * (e_i - e_j) / eta
        opt.alphas[j] = clip_alpha(opt.alphas[j], high, low)
        update_e_k(opt, j)
        if abs(opt.alphas[j] - alpha_j_old) < 0.00001:
            print("j not moving enough")
            return 0
        # move alpha_i by the same amount in the opposite direction
        opt.alphas[i] += opt.label_mat[j] * opt.label_mat[i] * (alpha_j_old - opt.alphas[j])
        update_e_k(opt, i)
        b1 = opt.b - e_i - opt.label_mat[i] * (opt.alphas[i] - alpha_i_old) \
            * opt.x[i, :] * opt.x[i, :].T - opt.label_mat[j] \
            * (opt.alphas[j] - alpha_j_old) * opt.x[i, :] * opt.x[j, :].T
        b2 = opt.b - e_j - opt.label_mat[i] * (opt.alphas[i] - alpha_i_old) \
            * opt.x[i, :] * opt.x[j, :].T - opt.label_mat[j] \
            * (opt.alphas[j] - alpha_j_old) * opt.x[j, :] * opt.x[j, :].T
        if 0 < opt.alphas[i] < opt.c:
            opt.b = b1
        elif 0 < opt.alphas[j] < opt.c:
            opt.b = b2
        else:
            opt.b = (b1 + b2) / 2.0
        return 1
    else:
        return 0
def smo_algorithm(data_mat_in, class_labels, c, toler, max_iter, ktup=('lin', 0)):
    # ktup is not used in this linear version
    opt = Optimization(np.mat(data_mat_in), np.mat(class_labels).transpose(), c, toler)
    iter = 0
    entire_set = True
    alpha_pairs_changed = 0
    # exit the outer loop when the iteration count exceeds max_iter, or when a full pass
    # over the entire set changes no alpha pair
    while iter < max_iter and (alpha_pairs_changed > 0 or entire_set):
        alpha_pairs_changed = 0
        if entire_set:
            # sweep over every possible alpha in the data set
            for i in range(opt.m):
                alpha_pairs_changed += inner_loop(i, opt)
                print("fullSet, iter: %d, i: %d, pairs changed %d" % (iter, i, alpha_pairs_changed))
            iter += 1
        else:
            # sweep over the non-bound alphas, i.e. those not sitting at the bounds 0 or c
            non_bound_is = np.nonzero((opt.alphas.A > 0) * (opt.alphas.A < c))[0]
            for i in non_bound_is:
                alpha_pairs_changed += inner_loop(i, opt)
                print("non-bound, iter: %d, i: %d, pairs changed: %d" % (iter, i, alpha_pairs_changed))
            iter += 1
        # alternate between the full-set sweep and the non-bound sweep
        if entire_set:
            entire_set = False
        elif alpha_pairs_changed == 0:
            entire_set = True
        print("iteration number: %d" % iter)
    return opt.b, opt.alphas
if __name__ == '__main__':
    data_arr, label_arr = load_data_set('testSet.txt')
    b, alphas = smo_algorithm(data_arr, label_arr, 0.6, 0.001, 40)
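To make use of the returned b and alphas, here is a minimal follow-up sketch: it recovers the weight vector of the linear classifier as w = Σ alpha_i · y_i · x_i and scores the training set. The helper get_w and the accuracy check are illustrative additions rather than part of the listing above, and they assume the same two-feature, tab-separated testSet.txt with -1/+1 labels.

def get_w(alphas, data_arr, label_arr):
    # w = sum_i alpha_i * y_i * x_i; only the support vectors (alpha_i > 0) contribute
    x = np.mat(data_arr)
    y = np.mat(label_arr).transpose()
    return x.T * np.multiply(alphas, y)


if __name__ == '__main__':
    # continuing from the call to smo_algorithm above
    w = get_w(alphas, data_arr, label_arr)
    predictions = np.sign(np.array(np.mat(data_arr) * w + b).flatten())
    print("support vectors: %d" % np.count_nonzero(alphas.A > 0))
    print("training accuracy: %.3f" % np.mean(predictions == np.array(label_arr)))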