python实现支持向量机SVM源码(SMO算法)

有用请点赞,没用请差评。

欢迎分享本文,转载请保留出处。

 本次是学习了李航博士《统计学习方法》后实现了算法,算法实现了线性支持向量机和非线性支持向量机,采用SMO算法求解。算法中实现了两种核函数:高斯核函数和多项式核函数。下面代码中采用的数据集为鸢尾花数据集中的两个类别的数据,若要尝试非线性数据集可以从笔者github:https://github.com/Tomator01/-Machine-Learning

获取,但是readfile函数需要修改一下。

 

关于SVM的原理可以参考《统计学习方法》以及某位大佬的博客https://blog.csdn.net/c406495762/article/details/78072313 

https://blog.csdn.net/c406495762/article/details/78158354

笔者程序还存在的问题:1、KKT条件还是不太清楚。2、当使用核函数时还存在部分问题,输出的拉格朗日乘子看着不对劲,等过段时间再来看看吧~~哭~~也希望各位大佬读者能帮忙指出错误,谢谢。

程序中的注释已经写的非常清楚了,包括公式的引用,这里不再赘述算法步骤。

# -*- coding:utf-8 -*-
# SVM支持向量机:实现了线性支持向量机和非线性支持向量机(多项式核函数、高斯核函数),非线性还有问题有待解决。
#author:Tomator
# 测试数据集为鸢尾花数据集中提取的两种类型数据

import numpy as np
import matplotlib.pyplot as plt
import random

def readfile(filename, save_path="D:\\python3_anaconda3\\学习\\机器学习\\机器学习数据集\\"):
    """
    Read a comma-separated dataset file.

    Each non-empty line holds 4 numeric features followed by a class-label
    string (the iris format).

    :param filename: name of the data file, appended to save_path
    :param save_path: directory containing the file; defaults to the
        original hard-coded location so existing callers keep working
    :return: (W, label) where W is an (n, 4) float array of feature
        vectors and label is a list of n label strings
    """
    # Read the file once instead of readlines() + seek() + readlines();
    # blank lines (e.g. a trailing newline) are skipped — the original
    # crashed on them when assigning an empty row into W.
    with open(save_path + filename, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]
    length = len(lines)
    print(filename, "length: %d" % length)
    W = np.zeros((length, 4))
    label = []
    for i, line in enumerate(lines):
        fields = line.split(',')
        # the iris dataset has four numeric features before the label
        W[i, :] = [float(v) for v in fields[:-1]]
        label.append(fields[-1])
    return W, label

def createDataset(filename):
    """
    Build the classification dataset for the given file.

    Converts the iris string labels to numeric classes so they can be
    used for training and later plotting: -1 for Iris-setosa, +1 for
    Iris-versicolor (anything else stays 0).

    :return: (feature array, numeric label array)
    """
    vectors, raw_labels = readfile(filename)

    numeric_labels = np.zeros(len(raw_labels))
    for idx, name in enumerate(raw_labels):
        if name == "Iris-setosa":
            numeric_labels[idx] = -1
        elif name == "Iris-versicolor":
            numeric_labels[idx] = 1
    return vectors, numeric_labels

# # 将原始数据集划分为训练集和测试集,splitRatio为划分比例。
# def splitDataset(dataset, splitRatio):
#     trainSize = int(len(dataset) * splitRatio)
#     trainSet = []
#     copy = list(dataset)
#     while len(trainSet) < trainSize:
#         index = random.randrange(len(copy))
#         # 原始数据集剔除训练集之后剩下的就是测试集
#         trainSet.append(copy.pop(index))
#     return [trainSet, copy]

class SVM(object):
    """
    Support vector machine trained with the SMO algorithm
    (李航《统计学习方法》 P124-P130, Platt's sequential minimal optimization).

    Supports a linear classifier and two kernels.

    kernel: 'linear', 'gaussian' or 'poly'
    kernel_para: kernel parameter (sigma for the Gaussian kernel, degree p
        for the polynomial kernel)
    epsilon: tolerance used in the KKT check
    maxepoch: maximum number of outer SMO passes
    C: penalty factor
    train_vector: training feature matrix
    train_label: training labels (+1 / -1)
    train_nums: number of training samples
    train_err: cached prediction error E_i for every sample
    alpha: Lagrange multipliers
    """
    def __init__(self, kernel='linear', kernel_para=0.0, epsilon=1e-6, maxepoch=2000, C=1.0,):
        self.kernel = kernel
        self.kernel_para = kernel_para
        self.epsilon = epsilon
        self.maxepoch = maxepoch
        self.train_vector = None
        self.train_label = None
        self.train_nums = None
        self.train_err = None
        self.alpha = None
        self.C = C

    # Initialise training state.
    def init_parameters(self, train_vector, train_label):
        """Store the training set and reset alpha, b and the error cache."""
        self.train_vector = np.asarray(train_vector, dtype=float)
        self.train_label = np.asarray(train_label, dtype=float)
        self.train_nums = len(self.train_label)
        # With alpha = 0 and b = 0 the decision function g(x) is 0
        # everywhere, so the initial error is E_i = g(x_i) - y_i = -y_i.
        self.train_err = -self.train_label
        self.alpha = np.zeros(self.train_nums)
        self.b = 0.0

    # Choose the second variable, 《统计学习方法》 P129.
    def select_second_alpha(self, ind1):
        """Return the index maximising |E1 - E2|, or None if no candidate."""
        E1 = self.train_err[ind1]
        max_diff = 0
        ind2 = None
        # only samples whose cached error is non-zero are considered
        candidates = np.nonzero(self.train_err)[0]
        if len(candidates) > 1:
            for i in candidates:
                if i == ind1:
                    continue
                diff = abs(self.train_err[i] - E1)
                if diff > max_diff:
                    max_diff = diff
                    ind2 = i
        return ind2

    # Kernel inner product.
    def cal_k(self, x, y):
        """
        Compute k(x, y).

        x may be a single feature vector or a 2-D matrix of row vectors;
        in the matrix case a vector of per-row kernel values is returned
        (update() and predict() rely on this).
        """
        # linear: plain dot product, no kernel
        if self.kernel == "linear":
            return np.dot(x, y)
        # Gaussian (RBF) kernel
        elif self.kernel == "gaussian":
            # BUGFIX: the original computed sum(exp(-(x-y)^2 / 2sigma^2)),
            # i.e. summed the exponentials per feature.  The RBF kernel is
            # exp(-||x - y||^2 / (2*sigma^2)): the squared distance must be
            # summed over the feature axis *inside* the exponential.
            sq_dist = np.sum(np.square(x - y), axis=-1)
            return np.exp(-sq_dist / (2 * (self.kernel_para ** 2)))
        # polynomial kernel
        elif self.kernel == "poly":
            # BUGFIX: the original wrapped this in np.sum(), which collapsed
            # the per-sample kernel values whenever x is the whole matrix.
            return (np.dot(x, y) + 1) ** self.kernel_para
        else:
            # raise instead of exit() so callers can handle the error
            raise ValueError("kernel must be 'linear', 'gaussian' or 'poly'")

    # Optimise one pair of multipliers, 《统计学习方法》 P125-P130.
    def update(self, ind1, ind2):
        """Jointly optimise alpha[ind1], alpha[ind2]; return 1 if changed, else 0."""
        old_alpha1 = self.alpha[ind1]
        old_alpha2 = self.alpha[ind2]
        y1 = self.train_label[ind1]
        y2 = self.train_label[ind2]
        # clipping bounds for alpha2, formula 7.104 (and the y1 != y2 case)
        if y1 == y2:
            L = max(0.0, old_alpha2 + old_alpha1 - self.C)
            H = min(self.C, old_alpha2 + old_alpha1)
        else:
            L = max(0.0, old_alpha2 - old_alpha1)
            H = min(self.C, self.C + old_alpha2 - old_alpha1)
        if L == H:
            return 0
        E1 = self.train_err[ind1]
        E2 = self.train_err[ind2]

        K11 = self.cal_k(self.train_vector[ind1], self.train_vector[ind1])
        K12 = self.cal_k(self.train_vector[ind1], self.train_vector[ind2])
        K22 = self.cal_k(self.train_vector[ind2], self.train_vector[ind2])
        # formula 7.107: eta = K11 + K22 - 2*K12 (second derivative)
        eta = K11 + K22 - 2 * K12
        if eta <= 0:
            return 0
        # unclipped update, formula 7.106
        new_unc_alpha = old_alpha2 + y2 * (E1 - E2) / eta
        # clip to [L, H], formula 7.108
        if new_unc_alpha > H:
            new_alpha2 = H
        elif new_unc_alpha < L:
            new_alpha2 = L
        else:
            new_alpha2 = new_unc_alpha

        # formula 7.109
        new_alpha1 = old_alpha1 + y1 * y2 * (old_alpha2 - new_alpha2)

        self.alpha[ind1] = new_alpha1
        self.alpha[ind2] = new_alpha2

        # threshold updates, formulas 7.115 / 7.116
        new_b1 = -E1 - y1 * K11 * (new_alpha1 - old_alpha1) - y2 * K12 * (new_alpha2 - old_alpha2) + self.b
        new_b2 = -E2 - y1 * K12 * (new_alpha1 - old_alpha1) - y2 * K22 * (new_alpha2 - old_alpha2) + self.b
        # choose b as described in the text on P130
        if 0 < new_alpha1 < self.C:
            self.b = new_b1
        elif 0 < new_alpha2 < self.C:
            self.b = new_b2
        else:
            self.b = (new_b1 + new_b2) / 2
        # refresh the cached errors of the two updated samples
        self.train_err[ind1] = np.sum(self.train_label * self.alpha * self.cal_k(self.train_vector, self.train_vector[ind1])) + self.b - self.train_label[ind1]
        self.train_err[ind2] = np.sum(self.train_label * self.alpha * self.cal_k(self.train_vector, self.train_vector[ind2])) + self.b - self.train_label[ind2]
        return 1

    # KKT check within tolerance epsilon, 《统计学习方法》 P128-P129.
    def _satisfy_kkt(self, ind):
        """Return True if sample ind satisfies the KKT conditions."""
        g = np.sum(self.alpha * self.train_label *
                   self.cal_k(self.train_vector, self.train_vector[ind])) + self.b
        r = self.train_label[ind] * g - 1
        # alpha == 0      requires y*g >= 1  (r >= 0)
        # 0 < alpha < C   requires y*g == 1  (r == 0)
        # alpha == C      requires y*g <= 1  (r <= 0)
        if (r < -self.epsilon and self.alpha[ind] < self.C) or \
           (r > self.epsilon and self.alpha[ind] > 0):
            return False
        return True

    # Train the model.
    def train(self, train_vector, train_label):
        """
        Fit the model with SMO and return the Lagrange multipliers.

        NOTE(review): the published source was corrupted here — train()'s
        while-loop and the KKT-check helper were fused into a single
        garbled line.  This is a reconstruction of the standard SMO outer
        loop (Algorithm 7.5) implied by the surviving fragment: sweep all
        samples, optimise every KKT violator against the second variable
        chosen by select_second_alpha(), stop when an epoch changes
        nothing or maxepoch is reached.
        """
        self.init_parameters(train_vector, train_label)
        epochs = 0
        while epochs < self.maxepoch:
            changed = 0
            for i in range(self.train_nums):
                if not self._satisfy_kkt(i):
                    j = self.select_second_alpha(i)
                    if j is not None:
                        changed += self.update(i, j)
            epochs += 1
            if changed == 0:
                # a full pass without any update: converged
                break
        return self.alpha

    # Predict a single test sample.
    # test_vector is the feature vector of one sample.
    def predict(self, test_vector):
        """Return sign(sum_i alpha_i * y_i * k(x_i, x) + b), i.e. +1 or -1 (P131)."""
        g = np.sum(self.alpha * self.train_label * self.cal_k(self.train_vector, test_vector))
        if (g + self.b) >= 0:
            return 1
        else:
            return -1

# Entry point
def main():
    """Train an SVM on the two-class iris data and report test accuracy."""
    # training data
    train_file = "iris_all_2class.data"
    train_x, train_y = createDataset(train_file)
    model = SVM(kernel='linear', epsilon=0.001, maxepoch=100, C=0.6)
    # model = SVM(kernel='gaussian', kernel_para=1.3, epsilon=0.001, maxepoch=500, C=0.6)
    print(model.train(train_x, train_y))
    # evaluate prediction accuracy on the held-out test file
    test_x, test_y = createDataset("testiris.data")
    score = 0
    for vec, lab in zip(test_x, test_y):
        if model.predict(vec) == lab:
            score += 1
    print(score / len(test_y))

if __name__ == "__main__":
    main()

 

 

你可能感兴趣的:(机器学习,彭湃的专栏)