手撕SVM

支撑向量机 SVM

1. 支撑向量机 SVM

1. 什么是SVM

  • Support Vector Machine

  • 解决的是线性可分问题
  • Hard Margin SVM
  • Soft Margin SVM

2. SVM背后的最优化问题

  • 点到直线的距离

  • 支撑向量机

  • 我们也可以将两个式子变成一个

  • 最优化目标

  • 这是一个有条件的最优化问题

3. Soft Margin SVM

  • L1,L2正则

4. scikit-learn中的SVM

  • 使用SVM和kNN一样,要做数据标准化处理
  • 涉及距离
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets

iris = datasets.load_iris()

X = iris.data
y = iris.target

X = X[y<2,:2]
y = y[y<2]

plt.scatter(X[y==0,0], X[y==0,1], color='red')
plt.scatter(X[y==1,0], X[y==1,1], color='blue')
plt.show()

  • 进行标准化操作
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaler.fit(X)
X_standard = standardScaler.transform(X)
  • 引入LinearSVC,C取值越大就越偏向hard margin svm
from sklearn.svm import LinearSVC

svc = LinearSVC(C=1e9)
svc.fit(X_standard, y)

LinearSVC(C=1000000000.0, class_weight=None, dual=True, fit_intercept=True,
intercept_scaling=1, loss=’squared_hinge’, max_iter=1000,
multi_class=’ovr’, penalty=’l2’, random_state=None, tol=0.0001,
verbose=0)

  • 使用之前使用过的绘图方法
def plot_decision_boundary(model, axis):

    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A','#FFF59D','#90CAF9'])

    plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)
  • 进行具体的绘制
plot_decision_boundary(svc, axis=[-3, 3, -3, 3])
plt.scatter(X_standard[y==0,0], X_standard[y==0,1])
plt.scatter(X_standard[y==1,0], X_standard[y==1,1])
plt.show()

  • C传入0.01
svc2 = LinearSVC(C=0.01)
svc2.fit(X_standard, y)

plot_decision_boundary(svc2, axis=[-3, 3, -3, 3])
plt.scatter(X_standard[y==0,0], X_standard[y==0,1])
plt.scatter(X_standard[y==1,0], X_standard[y==1,1])
plt.show()

  • 对于第二个决策边界,有一个点被错误分类了
svc.coef_

array([[ 4.03239757, -2.49296482]])

svc.intercept_

array([ 0.95365598])

  • 改造绘图方法
def plot_svc_decision_boundary(model, axis):

    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A','#FFF59D','#90CAF9'])

    plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)

    w = model.coef_[0]
    b = model.intercept_[0]

    # w0*x0 + w1*x1 + b = 0
    # => x1 = -w0/w1 * x0 - b/w1
    plot_x = np.linspace(axis[0], axis[1], 200)
    up_y = -w[0]/w[1] * plot_x - b/w[1] + 1/w[1]
    down_y = -w[0]/w[1] * plot_x - b/w[1] - 1/w[1]

    up_index = (up_y >= axis[2]) & (up_y <= axis[3])
    down_index = (down_y >= axis[2]) & (down_y <= axis[3])
    plt.plot(plot_x[up_index], up_y[up_index], color='black')
    plt.plot(plot_x[down_index], down_y[down_index], color='black')
  • 进行绘图
plot_svc_decision_boundary(svc, axis=[-3, 3, -3, 3])
plt.scatter(X_standard[y==0,0], X_standard[y==0,1])
plt.scatter(X_standard[y==1,0], X_standard[y==1,1])
plt.show()

  • 对svc2绘图
plot_svc_decision_boundary(svc2, axis=[-3, 3, -3, 3])
plt.scatter(X_standard[y==0,0], X_standard[y==0,1])
plt.scatter(X_standard[y==1,0], X_standard[y==1,1])
plt.show()

5. SVM中使用多项式特征和核函数

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets

X, y = datasets.make_moons()

X.shape

(100, 2)

y.shape

(100,)

plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • 添加噪音
X, y = datasets.make_moons(noise=0.15, random_state=666)

plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • 使用多项式特征的SVM
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

def PolynomialSVC(degree, C=1.0):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("linearSVC", LinearSVC(C=C))
    ])
  • 传入参数degree=3
poly_svc = PolynomialSVC(degree=3)
poly_svc.fit(X, y)
  • 绘图函数
def plot_decision_boundary(model, axis):

    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A','#FFF59D','#90CAF9'])

    plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)
  • 具体绘制
plot_decision_boundary(poly_svc, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • 我们使用多项式让决策边界是一条曲线
  • 使用多项式核函数的SVM
from sklearn.svm import SVC

def PolynomialKernelSVC(degree, C=1.0):
    return Pipeline([
        ("std_scaler", StandardScaler()),
        ("kernelSVC", SVC(kernel="poly", degree=degree, C=C))
    ])

poly_kernel_svc = PolynomialKernelSVC(degree=3)
poly_kernel_svc.fit(X, y)

plot_decision_boundary(poly_kernel_svc, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

6. 到底什么是核函数

  • 支持向量机通过某非线性变换 φ( x) ,将输入空间映射到高维特征空间。特征空间的维数可能非常高。如果支持向量机的求解只用到内积运算,而在低维输入空间又存在某个函数 K(x, x′) ,它恰好等于在高维空间中这个内积,即K( x, x′) =<φ( x) ⋅φ( x′) > 。那么支持向量机就不用计算复杂的非线性变换,而由这个函数 K(x, x′) 直接得到非线性变换的内积,使大大简化了计算。这样的函数 K(x, x′) 称为核函数

多项式核函数

7. RBF核函数

  • Radial Basis Function Kernel

  • 将每一个样本点映射到一个无穷维的特征空间

多项式特征

  • 依靠升维使得原本线性不可分的数据线性可分

import numpy as np
import matplotlib.pyplot as plt

x = np.arange(-4, 5, 1)

x

array([-4, -3, -2, -1, 0, 1, 2, 3, 4])

y = np.array((x >= -2) & (x <= 2), dtype='int')

y

array([0, 0, 1, 1, 1, 1, 1, 0, 0])

plt.scatter(x[y==0], [0]*len(x[y==0]))
plt.scatter(x[y==1], [0]*len(x[y==1]))
plt.show()

def gaussian(x, l):
    gamma = 1.0
    return np.exp(-gamma * (x-l)**2)

l1, l2 = -1, 1

X_new = np.empty((len(x), 2))
for i, data in enumerate(x):
    X_new[i, 0] = gaussian(data, l1)
    X_new[i, 1] = gaussian(data, l2)

plt.scatter(X_new[y==0,0], X_new[y==0,1])
plt.scatter(X_new[y==1,0], X_new[y==1,1])
plt.show()

  • 显然现在是线性可分的
  • 高斯核:对于每一个数据点都是landmark
    • n的数据映射成了m m的数据

8. RBF核函数中的gamma

  • gamma越大,高斯分布越窄
  • gamma越小,高斯分布越宽
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets

X, y = datasets.make_moons(noise=0.15, random_state=666)

plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • 使用半月形数据并加入噪音
  • 对数据进行预处理
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

def RBFKernelSVC(gamma):
    return Pipeline([
        ("std_scaler", StandardScaler()),
        ("svc", SVC(kernel="rbf", gamma=gamma))
    ])
  • gamma传入1
svc = RBFKernelSVC(gamma=1)
svc.fit(X, y)

Pipeline(steps=[(‘std_scaler’, StandardScaler(copy=True, with_mean=True, with_std=True)), (‘svc’, SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma=1, kernel=’rbf’,
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False))])

  • 添加绘制图像函数
def plot_decision_boundary(model, axis):

    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A','#FFF59D','#90CAF9'])

    plt.contourf(x0, x1, zz, linewidth=5, cmap=custom_cmap)
  • 具体绘制决策边界
plot_decision_boundary(svc, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • gamma传入100,再观察决策边界
svc_gamma100 = RBFKernelSVC(gamma=100)
svc_gamma100.fit(X, y)

plot_decision_boundary(svc_gamma100, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • gamma越大,高斯分布越窄,每一个蓝色点周围有一个钟型图形,不过我们明显感觉过拟合了
  • gamma传入10
svc_gamma10 = RBFKernelSVC(gamma=10)
svc_gamma10.fit(X, y)

plot_decision_boundary(svc_gamma10, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • gamma传入0.5,比1还小
svc_gamma05 = RBFKernelSVC(gamma=0.5)
svc_gamma05.fit(X, y)

plot_decision_boundary(svc_gamma05, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • gamma传入0.1
svc_gamma01 = RBFKernelSVC(gamma=0.1)
svc_gamma01.fit(X, y)

plot_decision_boundary(svc_gamma01, axis=[-1.5, 2.5, -1.0, 1.5])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()

  • 现在的决策边界已经跟一个线性的决策边界差不多了,明显欠拟合了
  • gumma其实就是在调整模型的复杂度

9. SVM思想解决回归问题

  • 加载波士顿房价数据集
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets

boston = datasets.load_boston()
X = boston.data
y = boston.target
  • 进行训练测试数据集分割
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
  • LinearSVR进行回归预测
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def StandardLinearSVR(epsilon=0.1):
    return Pipeline([
        ('std_scaler', StandardScaler()),
        ('linearSVR', LinearSVR(epsilon=epsilon))
    ])

svr = StandardLinearSVR()
svr.fit(X_train, y_train)

Pipeline(steps=[(‘std_scaler’, StandardScaler(copy=True, with_mean=True, with_std=True)), (‘linearSVR’, LinearSVR(C=1.0, dual=True, epsilon=0.1, fit_intercept=True,
intercept_scaling=1.0, loss=’epsilon_insensitive’, max_iter=1000,
random_state=None, tol=0.0001, verbose=0))])

svr.score(X_test, y_test)

0.63595732513984249

  • 显然这个R^2的值不太好,这里有很多超参数可以调节,我们这里不细研究SVM解决回归问题

你可能感兴趣的:(机器学习)