有用请点赞,没用请差评。
欢迎分享本文,转载请保留出处。
本次是学习了李航博士《统计学习方法》后实现的算法,实现了线性支持向量机和非线性支持向量机,采用SMO算法求解。算法中实现了两种核函数:高斯核函数和多项式核函数。下面代码中采用的数据集为鸢尾花数据集中的两个类别的数据,若要尝试非线性数据集可以从笔者github:https://github.com/Tomator01/-Machine-Learning
获取,但是readfile函数需要修改一下。
关于SVM的原理可以参考《统计学习方法》以及某位大佬的博客https://blog.csdn.net/c406495762/article/details/78072313
https://blog.csdn.net/c406495762/article/details/78158354
笔者程序还存在的问题:1、KKT条件还是不太清楚。2、当使用核函数时还存在部分问题,输出的拉格朗日乘子看着不对劲,等过段时间再来看看吧~~哭~~也希望各位大佬读者能帮忙指出错误,谢谢。
程序中的注释已经写的非常清楚了,包括公式的引用,这里不再赘述算法步骤。
# -*- coding:utf-8 -*-
# SVM支持向量机:实现了线性支持向量机和非线性支持向量机(多项式核函数、高斯核函数),非线性还有问题有待解决。
#author:Tomator
# 测试数据集为鸢尾花数据集中提取的两种类型数据
import numpy as np
import matplotlib.pyplot as plt
import random
def readfile(filename, save_path="D:\\python3_anaconda3\\学习\\机器学习\\机器学习数据集\\"):
    """
    Read a comma-separated data file of iris samples.

    Each non-empty line holds four float features followed by a
    string class label.

    :param filename: name of the data file
    :param save_path: directory containing the file; the original
        hard-coded path is kept as the default for backward
        compatibility, pass your own directory to override it
    :return: (W, label) where W is an (n, 4) feature array and
        label is a list of class-name strings
    """
    with open(save_path + filename, 'r') as f:
        # read once; strip newlines and drop blank lines (a trailing
        # empty line would otherwise crash the float() conversion)
        lines = [ln.strip() for ln in f.readlines()]
    lines = [ln for ln in lines if ln]
    print(filename, "length: %d" % len(lines))
    W = np.zeros((len(lines), 4))
    label = []
    for i, linestr in enumerate(lines):
        fields = linestr.split(',')
        # the iris dataset has four numeric features; label is last
        W[i, :] = np.array([float(v) for v in fields[:-1]])
        label.append(fields[-1])
    return W, label
def createDataset(filename):
    """
    Build the working dataset: feature vectors plus numeric labels.

    The string class names of the raw file are mapped to numbers
    (Iris-setosa -> -1, Iris-versicolor -> +1) so they can be used
    directly for training and for later plotting.
    """
    data_vector, label_str = readfile(filename)
    # any unrecognized class name keeps the default label 0
    name_to_num = {"Iris-setosa": -1, "Iris-versicolor": 1}
    data_label = np.zeros(len(label_str))
    for idx, name in enumerate(label_str):
        if name in name_to_num:
            data_label[idx] = name_to_num[name]
    return data_vector, data_label
# # 将原始数据集划分为训练集和测试集,splitRatio为划分比例。
# def splitDataset(dataset, splitRatio):
# trainSize = int(len(dataset) * splitRatio)
# trainSet = []
# copy = list(dataset)
# while len(trainSet) < trainSize:
# index = random.randrange(len(copy))
# # 原始数据集剔除训练集之后剩下的就是测试集
# trainSet.append(copy.pop(index))
# return [trainSet, copy]
class SVM(object):
    """
    Support vector machine trained with Platt's SMO algorithm
    (Li Hang, "Statistical Learning Methods", P125-P130).

    kernel       : 'linear', 'gaussian' or 'poly' -- linear classifier,
                   Gaussian (RBF) kernel or polynomial kernel
    kernel_para  : kernel parameter (sigma for the Gaussian kernel,
                   the degree p for the polynomial kernel)
    epsilon      : tolerance used when checking the KKT conditions
    maxepoch     : maximum number of training epochs
    C            : penalty factor
    train_vector : feature vectors of the training set
    train_label  : class labels (+1 / -1) of the training set
    train_nums   : number of training samples
    train_err    : cached prediction error E_i of every sample
    alpha        : Lagrange multipliers
    """

    def __init__(self, kernel='linear', kernel_para=0.0, epsilon=1e-6, maxepoch=2000, C=1.0):
        self.kernel = kernel
        self.kernel_para = kernel_para
        self.epsilon = epsilon
        self.maxepoch = maxepoch
        self.train_vector = None
        self.train_label = None
        self.train_nums = None
        self.train_err = None
        self.alpha = None
        self.C = C

    def init_parameters(self, train_vector, train_label):
        """Initialize all per-training-run state."""
        self.train_vector = train_vector
        self.train_label = np.asarray(train_label, dtype=float)
        self.train_nums = len(train_label)
        # with alpha == 0 and b == 0, g(x_i) == 0, hence E_i = -y_i
        self.train_err = -self.train_label
        self.alpha = np.zeros(self.train_nums)
        self.b = 0

    def select_second_alpha(self, ind1):
        """
        Choose the second variable of the SMO pair: the sample whose
        cached error differs most from E1 ("Statistical Learning
        Methods", P129). Returns None when no candidate exists.
        """
        E1 = self.train_err[ind1]
        max_diff = 0
        ind2 = None
        candidates = np.nonzero(self.train_err)[0]
        if len(candidates) > 1:
            for i in candidates:
                if i == ind1:
                    continue
                diff = abs(self.train_err[i] - E1)
                if diff > max_diff:
                    max_diff = diff
                    ind2 = i
        return ind2

    def cal_k(self, x, y):
        """
        Kernel inner product K(x, y).

        `x` may be a single vector or the whole (n, d) training matrix;
        in the latter case a vector of n kernel values is returned,
        which is what `update`/`predict` rely on.
        """
        if self.kernel == "linear":
            # plain dot product, no kernel trick
            return np.dot(x, y)
        elif self.kernel == "gaussian":
            # K(x, y) = exp(-||x - y||^2 / (2 * sigma^2)).
            # The squared distance must be summed over the feature axis
            # BEFORE the exponential; the original code summed exp() of
            # each component, which is not a valid RBF kernel.
            return np.exp(-np.sum(np.square(x - y), axis=-1) / (2 * self.kernel_para ** 2))
        elif self.kernel == "poly":
            # K(x, y) = (x . y + 1)^p, applied element-wise when x is a
            # matrix (the original np.sum wrongly collapsed the batch)
            return (np.dot(x, y) + 1) ** self.kernel_para
        else:
            raise ValueError("the kernel should be 'linear', 'gaussian' or 'poly'")

    def update(self, ind1, ind2):
        """
        Optimize one pair (alpha1, alpha2): the SMO inner step,
        "Statistical Learning Methods" P125-P130.

        Returns 1 when the pair was actually updated, 0 otherwise.
        """
        old_alpha1 = self.alpha[ind1]
        old_alpha2 = self.alpha[ind2]
        y1 = self.train_label[ind1]
        y2 = self.train_label[ind2]
        # clipping bounds for alpha2, Eq. 7.103 / 7.104
        if y1 == y2:
            L = max(0.0, old_alpha2 + old_alpha1 - self.C)
            H = min(self.C, old_alpha2 + old_alpha1)
        else:
            L = max(0.0, old_alpha2 - old_alpha1)
            H = min(self.C, self.C + old_alpha2 - old_alpha1)
        if L == H:
            return 0
        E1 = self.train_err[ind1]
        E2 = self.train_err[ind2]
        K11 = self.cal_k(self.train_vector[ind1], self.train_vector[ind1])
        K12 = self.cal_k(self.train_vector[ind1], self.train_vector[ind2])
        K22 = self.cal_k(self.train_vector[ind2], self.train_vector[ind2])
        # Eq. 7.107; eta <= 0 means no positive curvature along the pair
        eta = K11 + K22 - 2 * K12
        if eta <= 0:
            return 0
        # unclipped solution, Eq. 7.106
        new_unc_alpha = old_alpha2 + y2 * (E1 - E2) / eta
        # clip to [L, H], Eq. 7.108
        new_alpha2 = min(H, max(L, new_unc_alpha))
        # Eq. 7.109
        new_alpha1 = old_alpha1 + y1 * y2 * (old_alpha2 - new_alpha2)
        self.alpha[ind1] = new_alpha1
        self.alpha[ind2] = new_alpha2
        # threshold update, Eq. 7.115 / 7.116
        new_b1 = -E1 - y1 * K11 * (new_alpha1 - old_alpha1) - y2 * K12 * (new_alpha2 - old_alpha2) + self.b
        new_b2 = -E2 - y1 * K12 * (new_alpha1 - old_alpha1) - y2 * K22 * (new_alpha2 - old_alpha2) + self.b
        # choose b as described in the text on P130
        if 0 < new_alpha1 < self.C:
            self.b = new_b1
        elif 0 < new_alpha2 < self.C:
            self.b = new_b2
        else:
            self.b = (new_b1 + new_b2) / 2
        # refresh the cached errors of the two changed samples.
        # NOTE(review): the other cached errors go stale until their own
        # update -- same as the original; the KKT check below recomputes
        # g(x) fresh, so training is not misled by this.
        self.train_err[ind1] = np.sum(self.train_label * self.alpha * self.cal_k(self.train_vector, self.train_vector[ind1])) + self.b - self.train_label[ind1]
        self.train_err[ind2] = np.sum(self.train_label * self.alpha * self.cal_k(self.train_vector, self.train_vector[ind2])) + self.b - self.train_label[ind2]
        return 1

    def train(self, train_vector, train_label):
        """
        Fit the model with SMO and return the Lagrange multipliers.

        Each epoch sweeps every sample; samples violating the KKT
        conditions are paired via `select_second_alpha` and optimized.
        Training stops after a sweep with no updates or at `maxepoch`.
        """
        self.init_parameters(train_vector, train_label)
        epochs = 0
        while epochs < self.maxepoch:
            changed = 0
            for ind1 in range(self.train_nums):
                if self._satisfy_kkt(ind1):
                    continue
                ind2 = self.select_second_alpha(ind1)
                if ind2 is None:
                    continue
                changed += self.update(ind1, ind2)
            epochs += 1
            if changed == 0:
                break
        return self.alpha

    def _satisfy_kkt(self, ind):
        """
        Check the KKT conditions for sample `ind` within `epsilon`
        ("Statistical Learning Methods" P128-P129):
          alpha == 0      requires  y * g(x) >= 1
          0 < alpha < C   requires  y * g(x) == 1
          alpha == C      requires  y * g(x) <= 1
        """
        g = np.sum(self.alpha * self.train_label * self.cal_k(self.train_vector, self.train_vector[ind])) + self.b
        r = self.train_label[ind] * g - 1
        alpha = self.alpha[ind]
        # r < 0 means y*g < 1, only allowed when alpha == C;
        # r > 0 means y*g > 1, only allowed when alpha == 0
        if (r < -self.epsilon and alpha < self.C) or (r > self.epsilon and alpha > 0):
            return False
        return True

    def predict(self, test_vector):
        """
        Predict the class (+1 / -1) of a single feature vector
        (decision function of P131).
        """
        g = np.sum(self.alpha * self.train_label * self.cal_k(self.train_vector, test_vector)) + self.b
        return 1 if g >= 0 else -1
# entry point
def main():
    """Train on the two-class iris data and report test-set accuracy."""
    # training set
    train_file = "iris_all_2class.data"
    data_vector, label_num = createDataset(train_file)
    clf = SVM(kernel='linear', epsilon=0.001, maxepoch=100, C=0.6)
    # clf = SVM(kernel='gaussian', kernel_para=1.3, epsilon=0.001, maxepoch=500, C=0.6)
    # print the learned Lagrange multipliers
    print(clf.train(data_vector, label_num))
    # test set: count correct predictions and print the accuracy
    test_filename = "testiris.data"
    test_vector, test_label = createDataset(test_filename)
    score = sum(1 for x, y in zip(test_vector, test_label) if clf.predict(x) == y)
    print(score / len(test_label))
if __name__ == "__main__":
    main()