Python实现SMO算法

import numpy as np
import random

data = np.loadtxt("heart-c.txt")
# 缓存样本点积
dp = np.dot(data[:,0:data.shape[1]-2], np.transpose(data[:,0:data.shape[1]-2]))
Lab = data[:,data.shape[1] - 1]

# 初始化α向量
alphas = [0] * data.shape[0]

# 初始化偏置项b和惩罚参数C
b = 0
C = 0.8

# 初始化每个样本的估计错误率
E = -Lab

# 设置精度
eps = 0.05



# --计算目标函数值-- #
def objValue(x,id1,id2):
  global data,dp,Lab,alphas,b,C,E,eps
  
  # 计算常量
  con = 0
  i = 0
  j = 0
  for i in range(data.shape[0]):
    if i != id1 and i != id2:
      con = con + alphas[i]
    else:
      continue
    for j in range(data.shape[0]):
      if j != id1 and j != id2:
        con = con - 0.5 * Lab[i] * Lab[j] * dp[i][j] * alphas[i] * alphas[j]
      else:
        continue
  
  # compute t1 and t2
  t0 = list(map(np.dot, Lab, alphas))
  t1 = np.dot(t0,dp[id1,:]) + b - Lab[id1]*alphas[id1]*dp[id1][id1] - Lab[id2]*x*dp[id2][id1]
  t2 = np.dot(t0,dp[id2,:]) + b - Lab[id1]*alphas[id1]*dp[id1][id2] - Lab[id2]*x*dp[id2][id2]
  v = alphas[id1] + x - 0.5*dp[id1][id1]*alphas[id1]**2 - 0.5*dp[id2][id2]*x**2 - \
                    Lab[id1]*Lab[id2]*dp[id1][id2]*alphas[id1]*x - Lab[id1]*alphas[id1]*t1 - \
                    Lab[id2]*x*t2 + con
  return v



# --求解两变量的优化问题-- #
def varPairOpt(id1,id2):
  global data,dp,Lab,alphas,b,C,E,eps
  
  if id1 == id2:
    return 0
  
  alpha1 = alphas[id1]
  alpha2 = alphas[id2]
  alpha1_new = alpha1
  alpha2_new = alpha2
  
  y1 = Lab[id1]
  y2 = Lab[id2]
  e1 = E[id1]
  e2 = E[id2]
  s = y1 * y2
  
  # 计算下限L和上限H
  L = 0
  H = 0
  if y1 != y2:
    L = max(0, alpha2 - alpha1)
    H = min(C, C + alpha2 - alpha1)
  else:
    L = max(0, alpha2 + alpha1 - C)
    H = min(C, alpha2 + alpha1)

  if L == H:
    return 0
  
  eta = 2 * dp[id1][id2] - dp[id1][id1] - dp[id2][id2]
  # 确定alpha2_new的值
  if eta < 0:
    alpha2_new = alpha2 - y2 * (e1 - e2) / eta
    if alpha2_new < L:
      alpha2_new = L
    elif alpha2_new > H:
      alpha2_new = H
  else: # eta equals to zero
    Lobj = objValue(L,id1,id2)
    Hobj = objValue(H,id1,id2)
    if Lobj > Hobj + eps:
      alpha2_new = L
    elif Lobj < Hobj + eps:
      alpha2_new = H
    else:
      alpha2_new = alpha2
  
  # alpha2没有足够大的变化则返回
  if abs(alpha2-alpha2_new) < eps:
    return 0
  alpha1_new = alpha1 + s * (alpha2 - alpha2_new)
  # 更新b
  b1_new = E[id1] + Lab[id1] * (alpha1_new - alphas[id1]) * dp[id1][id2] + \
                    Lab[id2] * (alpha2_new - alphas[id2]) * dp[id1][id2] + b
  b2_new = E[id2] + Lab[id1] * (alpha1_new - alphas[id1]) * dp[id1][id2] + \
                    Lab[id2] * (alpha2_new - alphas[id2]) * dp[id2][id2] + b
  b_old = b
  if alpha1_new > 0 and alpha1_new < C:
    b = b1_new
  elif alpha2_new > 0 and alpha2_new < C:
    b = b2_new
  else:
    b = (b1_new + b2_new) / 2
  
  # 更新E
  E = np.add(np.add(E,b - b_old), np.add(Lab[id1]*(alpha1_new-alphas[id1])*dp[id1,:], \
                    Lab[id2]*(alpha2_new-alphas[id2])*dp[id2,:]))
  # 更新α
  alphas[id1] = alpha1_new
  alphas[id2] = alpha2_new
  return 1


# --选择变量-- #
def select(i,Ei,entireSet):
  global data,dp,Lab,alphas,b,C,E,eps
  
  max_deltaE = 0
  maxj = -1
  if entireSet == 1:  # 从整个训练集中选择
    index = random.randint(0, data.shape[0] - 1)
    while index == i:
      index = random.randint(0, data.shape[0] - 1)
    return index
  else:  # 启发式选择
    indexs = [k for k in range(len(alphas)) if (alphas[k] > 0) and (alphas[k] < C)]
    Len = len(indexs)
    for j in range(Len):
      if indexs[j] == i:
        continue
      Ej = np.dot(list(map(np.dot, Lab, alphas)), dp[indexs[j],:]) + b - Lab[indexs[j]]
      deltaE = abs(Ei - Ej)
      if deltaE > max_deltaE:
        maxj = indexs[j]
        max_deltaE = deltaE
    return maxj



iter_ = 0
max_iter = 50
entireSet = 1  # 作为一个标记看是选择全遍历还是部分遍历
alpha_change = 0

while (iter_ < max_iter) and ((alpha_change > 0) or entireSet):
  
  alpha_change = 0
  # 遍历整个训练集
  if entireSet:
    i = 0
    for i in range(data.shape[0]):
      Ei = np.dot(list(map(np.dot, Lab, alphas)), dp[i,:]) + b - Lab[i]
      if (Lab[i]*Ei < -eps and alphas[i] < C) or (Lab[i]*Ei > eps and alphas[i] > 0):
        # 选择第2个变量
        j = select(i,Ei,entireSet)
        # 优化
        if varPairOpt(i,j):
          alpha_change += 1
    iter_ += 1
  # 遍历满足0 < α < C的训练样本集合
  else:
    indexs = [k for k in range(data.shape[0] - 1) if (alphas[k] > 0) and (alphas[k] < C)]
    Len = len(indexs)
    i = 0
    for i in range(Len):
      Ei = np.dot(list(map(np.dot, Lab, alphas)), dp[indexs[i],:]) + b - Lab[indexs[i]]
      if (Lab[indexs[i]]*Ei < -eps and alphas[indexs[i]] < C) or (Lab[indexs[i]]*Ei > eps and alphas[indexs[i]] > 0):
        # 选择第2个变量
        j = select(indexs[i],Ei,entireSet)
        # 优化
        if varPairOpt(indexs[i],j):
          alpha_change += 1
    iter_ += 1
  
  if entireSet:  # 此次迭代为全遍历,下次变成部分遍历
    entireSet = 0
  elif alpha_change == 0:  # 若部分遍历没有找到需要交换的α,改为全遍历
    entireSet = 1
  print("第 " + str(iter_) + " 次迭代")
  
print(np.around(alphas,decimals=2))

你可能感兴趣的:(机器学习,数据结构与算法)