# Handwritten implementation (SVM trained with SMO):
import time
import numpy as np
import math
import random
def loadDataSet(filename):
    """Load a whitespace-separated data file.

    Each non-empty line holds the feature values followed by the label in
    the last column.  The original version hard-coded exactly two feature
    columns; this one accepts any number of features (backward compatible
    with the 3-column files it was written for).

    :param filename: path of the data file
    :return: (dataArr, labelArr) — list of feature lists, list of float labels
    """
    dataArr = []
    labelArr = []
    with open(filename, 'r') as f:
        # Iterate the file object directly instead of f.readlines():
        # same lines, but without materializing the whole file in memory.
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines (the original raised IndexError on them).
                continue
            dataArr.append([float(v) for v in fields[:-1]])
            labelArr.append(float(fields[-1]))
    return dataArr, labelArr
class SVM:
    """Support vector machine trained with the SMO algorithm.

    Uses a Gaussian (RBF) kernel.  Section/equation references below follow
    Li Hang, "Statistical Learning Methods", chapter 7.
    """

    def __init__(self, trainDataList, trainLabelList, sigma=10, C=200, toler=0.01):
        """
        :param trainDataList: training samples, m rows of n features
        :param trainLabelList: training labels (+1 / -1), length m
        :param sigma: Gaussian kernel bandwidth
        :param C: soft-margin penalty coefficient
        :param toler: tolerance used in the KKT checks
        """
        # Training data as an m x n matrix; labels as an m x 1 column vector.
        self.trainDataMat = np.mat(trainDataList)
        self.trainLabelMat = np.mat(trainLabelList).T
        self.m, self.n = np.shape(self.trainDataMat)
        self.sigma = sigma
        self.C = C
        self.toler = toler
        self.b = 0.0                      # bias term
        self.alpha = [0.0] * self.m       # Lagrange multipliers, one per sample
        self.E = [0.0] * self.m           # cached prediction errors E_i
        self.supportVecIndex = []         # indices of support vectors (filled by train)
        # Precomputed m x m Gram matrix; computed last since it uses the fields above.
        self.k = self.calcKernel()

    def calcKernel(self):
        """Precompute the m x m Gaussian kernel (Gram) matrix.

        k[i][j] = exp(-||X_i - X_j||^2 / (2 * sigma^2))
        Only the upper triangle is computed; the matrix is symmetric, so the
        mirrored entry is filled at the same time (the original recomputed
        every pair twice).
        """
        k = [[0.0] * self.m for _ in range(self.m)]
        denom = 2 * self.sigma ** 2  # loop-invariant denominator
        for i in range(self.m):
            X_i = self.trainDataMat[i, :]
            for j in range(i, self.m):
                X_j = self.trainDataMat[j, :]
                # Squared Euclidean distance between the two samples.
                sqDist = float((X_i - X_j) * (X_i - X_j).T)
                val = float(np.exp(-sqDist / denom))
                k[i][j] = val
                k[j][i] = val
        return k

    def calcgxi(self, i):
        """Compute g(x_i) = sum_j alpha_j * y_j * K(x_j, x_i) + b (eq. 7.104).

        Only nonzero alphas contribute, so the sum skips the rest.
        BUG FIX: the original summed alpha[i] * y[i] * k[i][j] — it used the
        *first* sample's alpha and label in every term of the sum.
        """
        g_xi = 0.0
        for j, alpha_j in enumerate(self.alpha):
            if alpha_j != 0:
                g_xi += alpha_j * self.trainLabelMat[j, 0] * self.k[j][i]
        return g_xi + self.b

    def isSatisfyKKT(self, i):
        """Check whether alpha_i satisfies the KKT conditions (eqs. 7.111-7.113).

        alpha == 0     ->  y_i * g(x_i) >= 1
        0 < alpha < C  ->  y_i * g(x_i) == 1
        alpha == C     ->  y_i * g(x_i) <= 1
        All comparisons are made within self.toler.
        """
        g_xi = self.calcgxi(i)
        y_i = self.trainLabelMat[i, 0]
        if math.fabs(self.alpha[i]) < self.toler and y_i * g_xi >= 1:
            return True
        elif -self.toler < self.alpha[i] < self.C + self.toler \
                and math.fabs(y_i * g_xi - 1) < self.toler:
            return True
        # BUG FIX: the original returned True for alpha == C unconditionally,
        # omitting the y_i * g(x_i) <= 1 requirement of eq. 7.113.
        elif math.fabs(self.alpha[i] - self.C) < self.toler and y_i * g_xi <= 1:
            return True
        return False

    def calcEi(self, i):
        """Return E_i = g(x_i) - y_i, the prediction error for sample i (eq. 7.105)."""
        return self.calcgxi(i) - self.trainLabelMat[i, 0]

    def getAlphaJ(self, E1, i):
        """Choose the second SMO variable (section 7.4.2, step 2).

        Picks the index j that maximizes |E1 - E_j| over samples whose cached
        error is nonzero; when no such sample exists yet, falls back to a
        random index different from i.

        :param E1: error of the first variable
        :param i: index of the first variable
        :return: (E2, j)
        """
        E2 = 0.0
        maxDelta = -1.0   # best |E1 - E2| seen so far
        maxIndex = -1
        for j, E in enumerate(self.E):
            if E == 0:
                continue
            E2Tmp = self.calcEi(j)
            if math.fabs(E1 - E2Tmp) > maxDelta:
                maxDelta = math.fabs(E1 - E2Tmp)
                E2 = E2Tmp
                maxIndex = j
        if maxIndex == -1:
            # No cached errors yet: pick a random second index != i.
            maxIndex = i
            while maxIndex == i:
                maxIndex = int(random.uniform(0, self.m))
            E2 = self.calcEi(maxIndex)
        return E2, maxIndex

    def train(self, iter=100):
        """Run SMO until convergence or `iter` full passes over the data.

        Convergence: a full pass in which no alpha changed
        (parameterChanged == 0).
        """
        iterStep = 0
        parameterChanged = 1
        while iterStep < iter and parameterChanged > 0:
            print('iter:%d:%d' % (iterStep, iter))
            iterStep += 1
            parameterChanged = 0
            # Outer loop: scan all samples for a first variable violating KKT.
            for i in range(self.m):
                if not self.isSatisfyKKT(i):
                    # Second-variable selection needs |E1 - E2|, so compute E1 first.
                    E1 = self.calcEi(i)
                    E2, j = self.getAlphaJ(E1, i)
                    y1 = self.trainLabelMat[i, 0]
                    y2 = self.trainLabelMat[j, 0]
                    alphaOld_1 = self.alpha[i]
                    alphaOld_2 = self.alpha[j]
                    # Feasible box bounds L, H for the new alpha2 (p. 126).
                    if y1 != y2:
                        L = max(0, alphaOld_2 - alphaOld_1)
                        H = min(self.C, self.C + alphaOld_2 - alphaOld_1)
                    else:
                        L = max(0, alphaOld_2 + alphaOld_1 - self.C)
                        H = min(self.C, alphaOld_2 + alphaOld_1)
                    if L == H:
                        # Degenerate box: this pair cannot be optimized.
                        continue
                    k11 = self.k[i][i]
                    k22 = self.k[j][j]
                    k21 = self.k[j][i]
                    k12 = self.k[i][j]
                    # eta is the denominator of eq. 7.106.
                    eta = k11 + k22 - 2 * k12
                    # BUG FIX: guard against a zero/negative denominator
                    # (duplicate samples) — the original divided unconditionally.
                    if eta <= 0:
                        continue
                    # Update alpha2 (eq. 7.106) and clip to [L, H] (eq. 7.108).
                    alphaNew_2 = alphaOld_2 + y2 * (E1 - E2) / eta
                    alphaNew_2 = min(max(alphaNew_2, L), H)
                    # Update alpha1 (eq. 7.109).
                    alphaNew_1 = alphaOld_1 + y1 * y2 * (alphaOld_2 - alphaNew_2)
                    # Bias candidates b1, b2 (eqs. 7.115 and 7.116).
                    b1New = (-E1 - y1 * k11 * (alphaNew_1 - alphaOld_1)
                             - y2 * k21 * (alphaNew_2 - alphaOld_2) + self.b)
                    b2New = (-E2 - y1 * k12 * (alphaNew_1 - alphaOld_1)
                             - y2 * k22 * (alphaNew_2 - alphaOld_2) + self.b)
                    # Pick the new b depending on which alpha is interior.
                    if 0 < alphaNew_1 < self.C:
                        bNew = b1New
                    elif 0 < alphaNew_2 < self.C:
                        bNew = b2New
                    else:
                        bNew = (b1New + b2New) / 2
                    # Commit the update and refresh the cached errors.
                    self.alpha[i] = alphaNew_1
                    self.alpha[j] = alphaNew_2
                    self.b = bNew
                    self.E[i] = self.calcEi(i)
                    self.E[j] = self.calcEi(j)
                    # Count the pair as changed only if alpha2 moved noticeably.
                    if math.fabs(alphaNew_2 - alphaOld_2) >= 0.00001:
                        parameterChanged += 1
                print("iter: %d i:%d, pairs changed %d" % (iterStep, i, parameterChanged))
        # Rebuild (not append to) the support-vector index list, so calling
        # train() twice does not duplicate entries.
        self.supportVecIndex = [i for i in range(self.m) if self.alpha[i] > 0]

    def calcSinglKernel(self, x1, x2):
        """Gaussian kernel between two row vectors (eq. 7.90).

        BUG FIX: the original returned np.exp(result) on a value that had
        already been exponentiated, i.e. exp(exp(-d/(2*sigma^2))).

        :param x1: row vector 1
        :param x2: row vector 2
        :return: kernel value as a float
        """
        sqDist = float((x1 - x2) * (x1 - x2).T)
        return float(np.exp(-sqDist / (2 * self.sigma ** 2)))

    def predict(self, x):
        """Predict the label (+1/-1) of sample x via eq. 7.94.

        Only support vectors contribute: for any other sample the summand is
        alpha_i * ... = 0, which is why only their indices are stored.

        :param x: sample to classify (sequence of n features)
        :return: np.sign of the decision function
        """
        result = 0.0
        for i in self.supportVecIndex:
            tmp = self.calcSinglKernel(self.trainDataMat[i, :], np.mat(x))
            result += self.alpha[i] * self.trainLabelMat[i, 0] * tmp
        result += self.b
        return np.sign(result)

    def test(self, testDataList, testLabelList):
        """Return the classification accuracy on the given test set.

        :param testDataList: test samples
        :param testLabelList: test labels
        :return: accuracy in [0, 1]
        """
        errorCnt = 0
        for i in range(len(testDataList)):
            print('test:%d:%d' % (i, len(testDataList)))
            if self.predict(testDataList[i]) != testLabelList[i]:
                errorCnt += 1
        return 1 - errorCnt / len(testDataList)
if __name__ == "__main__":
    # Record start time so the total runtime can be reported at the end.
    startTime = time.time()

    # Load the training set.
    print("Loading Train Data...")
    trainDataList, trainLabelList = loadDataSet("traindata.txt")

    # Load the test set.
    print("Loading Test Data...")
    testDataList, testLabelList = loadDataSet("testdata.txt")

    # Initialize the SVM (sigma=1.3, C=200, toler=0.01).
    print("start init svm...")
    svm = SVM(trainDataList, trainLabelList, 1.3, 200, 0.01)

    # Train with SMO.
    print('start to train')
    svm.train()

    # Evaluate on the test set.
    print('start to test')
    accuracy = svm.test(testDataList, testLabelList)
    # BUG FIX: %d truncated the percentage (e.g. 99.9 printed as 99);
    # use a float format instead.
    print('the accuracy is:%.2f' % (accuracy * 100), '%')

    # Report elapsed wall-clock time.
    print('time span:', time.time() - startTime)