机器学习实战之SVM练习---python

# -*- coding:UTF-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import random

'''
数据读取
'''
def loadDataSet(fileName):
    dataMat = []; labelMat = []
    fr = open(fileName,'rb')
    for line in fr.readlines():
        lineArr = line.decode('utf-8-sig').strip().split('\t')
        dataMat.append([float(lineArr[0]),float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    # print(dataMat)
    # print(labelMat)
    return dataMat,labelMat



'''
随机选择alpha
        i:alpha下标
        m:alpha参数全部个数
'''
def selectJrand(i,m):
    j = i
    while(j==i):
        j = int(random.uniform(0,m))                #选择一个不等于i的j
    return j

'''
根据取值范围修剪aj
    H:取值上限
    L:取值下限
'''
def clipAlaph(aj,H,L):
    if aj > H:
        aj = H
    if L > aj:
        aj = L
    return aj

'''
简化版SMO
  C:松弛变量   toler:容错率    maxIter:最大迭代次数
    1、计算误差
    2.上下限计算
    3、计算学习速率
    4、更新alpha_j
    5、修剪alpha_j
    6、更新alpha_i
    7、更新b1、b2
'''
def smoSimple(dataMatIn,classLabels,C,toler,maxIter):
    dataMat = np.mat(dataMatIn);labelMat = np.mat(classLabels).transpose()      #转换为numpy的mat存储
    b=0                                                                         #初始化b
    m,n = np.shape(dataMat)                                                     #统计dataMat的维度
    alphas = np.mat(np.zeros((m,1)))                                            #初始化alpha参数为0
    iter_num = 0                                                                #初始化迭代次数
    while(iter_num < maxIter):
        alphaPairsChanged = 0                                       #用于记录alpha是否已经进行优化
        for i in range(m):
            #multiply对应元素相乘
            fxi = float(np.multiply(alphas,labelMat).T*(dataMat*dataMat[i,:].T)) + b
            Ei = fxi - float(labelMat[i])                           #计算误差
            #优化alpha
            if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > C)):
                #随机选择一个alpha_j
                j = selectJrand(i,m)
                #1、计算Ej误差
                fxj = float(np.multiply(alphas, labelMat).T * (dataMat * dataMat[j, :].T)) + b
                Ej = fxj - float(labelMat[j])
                #保存更新前的alpha,使用深拷贝
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                #2、计算上下界L和H
                if(labelMat[i] != labelMat[j]):
                    L = max(0,alphas[j]-alphas[i])
                    H = min(C,C+alphas[j]-alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, C + alphas[j] + alphas[i])
                if L==H:print('L==H');continue
                #3、计算eta
                eta = 2.0 * dataMat[i,:]*dataMat[j,:].T - dataMat[i,:]*dataMat[i,:].T - dataMat[j,:]*dataMat[j,:].T
                if eta >= 0:
                    print('eta>=0');continue
                #4、更新alphas[j]
                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
                #修剪alpha[j]
                alphas[j] = clipAlaph(alphas[j],H,L)
                if (abs(alphas[j]-alphaJold) < 0.00001):
                    print('alpha[j]变化太小')
                    continue
                #5、更新alpha[i]
                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold-alphas[j])
                #6、更新b1,b2
                b1 = b - Ei - labelMat[i]*(alphas[i]-alphaIold)*dataMat[i,:]*dataMat[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMat[i,:]*dataMat[j,:].T
                b2 = b - Ej - labelMat[i]*(alphas[i]-alphaIold)*dataMat[i,:]*dataMat[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMat[j,:]*dataMat[j,:].T
                #7、更新b
                if (0and (C >alphas[i]):
                    b=b1
                elif(0and (C >alphas[j]):
                    b=b2
                else:
                    b=(b1+b2)/2
                #统计优化次数
                alphaPairsChanged += 1
                print('第%d次迭代样本:%d,alpha优化次数:%d' %(iter_num,i,alphaPairsChanged))
        #更新迭代次数
        if(alphaPairsChanged == 0):iter_num+=1
        else:iter_num = 0
        print('迭代次数:%d' %iter_num)
    return b,alphas


'''
函数说明:计算w
'''
def get_w(dataMat,labelMat,alphas):
    alphas,dataMat,labelMat = np.array(alphas),np.array(dataMat),np.array(labelMat)
    w = np.dot((np.tile(labelMat.reshape(1,-1).T,(1,2))*dataMat).T,alphas)
    return w.tolist()

"""
函数说明:数据可视化

Parameters:
    dataMat - 数据矩阵
    labelMat - 数据标签

"""
def showDataSet(dataMat, labelMat,alphas,w,b):
    data_plus = []                                  #正样本
    data_minus = []                                 #负样本
    for i in range(len(dataMat)):
        if labelMat[i] > 0:
            data_plus.append(dataMat[i])
        else:
            data_minus.append(dataMat[i])
    data_plus_np = np.array(data_plus)              #转换为numpy队列
    data_minus_np = np.array(data_minus)            #转换为numpy队列

    plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1])   #正样本散点图
    plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) #负样本散点图
    x1 = max(dataMat)[0]
    x2 = min(dataMat)[0]
    a1,a2 = w
    b = float(b)
    a1 = float(a1[0])
    a2 = float(a2[0])
    y1, y2 = (b + a1 * x1), (b + a1 * x2)
    plt.plot([x1,x2],[y1,y2])
    #找到支持向量
    for i,alpha in enumerate(alphas):
        if alpha > 0:
            x,y = dataMat[i]
            plt.scatter([x], [y], s=150, c='none', alpha=0.7, linewidth=1.5, edgecolor='red')
    plt.show()


if __name__ == '__main__':
    dataMat,labelMat = loadDataSet('G:\dataSet.txt')
    b,alphas = smoSimple(dataMat,labelMat,0.6,0.001,80)
    w = get_w(dataMat,labelMat,alphas)
    showDataSet(dataMat,labelMat,alphas,w,b)


运行结果:
机器学习实战之SVM练习---python_第1张图片

你可能感兴趣的:(机器学习实战之SVM练习---python)