import numpy as np

import src.util as util


def calc_grad(X, Y, theta):
    """Compute the gradient of the loss with respect to theta."""
    m, n = X.shape

    margins = Y * X.dot(theta)
    probs = 1. / (1 + np.exp(margins))
    grad = -(1./m) * (X.T.dot(probs * Y))

    return grad


def logistic_regression(X, Y):
    """Train a logistic regression model."""
    m, n = X.shape
    theta = np.zeros(n)
    learning_rate = 1

    i = 0
    while True:
        i += 1
        prev_theta = theta
        grad = calc_grad(X, Y, theta)
        theta = theta - learning_rate * grad
        if i % 10000 == 0:
            print('Finished %d iterations' % i)
        if np.linalg.norm(prev_theta - theta) < 1e-15:
            print('Converged in %d iterations' % i)
            break
    return


def main1():
    print('==== Training model on data set A ====')
    Xa, Ya = util.load_csv('./data/ds1_a.csv', add_intercept=True)
    logistic_regression(Xa, Ya)

    print('\n==== Training model on data set B ====')
    Xb, Yb = util.load_csv('./data/ds1_b.csv', add_intercept=True)
    logistic_regression(Xb, Yb)


main1()
==== Training model on data set A ====
Finished 10000 iterations
Finished 20000 iterations
...
Finished 270000 iterations
Converged in 278192 iterations
==== Training model on data set B ====
Finished 10000 iterations
Finished 20000 iterations
...
Finished 450000 iterations
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input> in <module>
----> 1 main1()

<ipython-input> in main1()
     37     print('\n==== Training model on data set B ====')
     38     Xb, Yb = util.load_csv('./data/ds1_b.csv', add_intercept=True)
---> 39     logistic_regression(Xb, Yb)

<ipython-input> in logistic_regression(X, Y)
     20         i += 1
     21         prev_theta = theta
---> 22         grad = calc_grad(X, Y, theta)
     23         theta = theta - learning_rate * grad
     24         if i % 10000 == 0:

<ipython-input> in calc_grad(X, Y, theta)
      4
      5     margins = Y * X.dot(theta)
----> 6     probs = 1. / (1 + np.exp(margins))
      7     grad = -(1./m) * (X.T.dot(probs * Y))
      8

KeyboardInterrupt:
Training converges on data set A but does not converge on data set B.

First, note that the algorithm above is not quite standard logistic regression: the obvious difference is how the gradient-descent step is computed. In standard logistic regression, each gradient step is

$$\theta_{j} := \theta_{j}+\alpha\left(y^{(i)}-h_{\theta}\left(x^{(i)}\right)\right) x_{j}^{(i)}$$

whereas from the function calc_grad we can read off that the gradient used here is

$$\nabla_{\theta} J(\theta)=-\frac{1}{m} \sum_{i=1}^{m} \frac{y^{(i)} x^{(i)}}{1+\exp\left(y^{(i)} \theta^{T} x^{(i)}\right)}$$

So the function this gradient descent is trying to minimize (the underlying objective) must essentially be of the form

$$\ell(\theta)=-\frac{1}{m} \sum_{i=1}^{m} \log \frac{1}{1+\exp\left(-y^{(i)} \theta^{T} x^{(i)}\right)}$$
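As a quick sanity check (a minimal sketch of my own, using the calc_grad defined above and toy data), the finite-difference gradient of this $\ell(\theta)$ does match calc_grad:

import numpy as np

# ell(theta) = -(1/m) * sum_i log(1 / (1 + exp(-y_i * theta^T x_i)))
def ell(X, Y, theta):
    margins = Y * X.dot(theta)
    return np.mean(np.log(1 + np.exp(-margins)))

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
Y = rng.choice([-1.0, 1.0], size=20)
theta = rng.normal(size=3)

eps = 1e-6
numeric_grad = np.array([(ell(X, Y, theta + eps * e) - ell(X, Y, theta - eps * e)) / (2 * eps)
                         for e in np.eye(3)])
print(np.allclose(numeric_grad, calc_grad(X, Y, theta), atol=1e-6))   # True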
Clearly, compared with the quantity that ordinary logistic regression minimizes, the sigmoid here has an extra $y^{(i)}$ in the exponent. This product of $y$ with $\theta^{T} x$ is reminiscent of the functional and geometric margins in SVMs. In any case, whether we call this a logistic-regression-like or an SVM-like classifier, it is fundamentally a linear classifier, so the first thing to check is whether the failure to converge comes from the data itself (which would be the first instinct anyway: look at the data first).

So let's visualize the data first.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcdefaults()
data_load_a = pd.read_csv('./data/ds1_a.csv')
x_train_a = data_load_a.loc[:, data_load_a.columns.str.contains('x')].values
x_train_a = np.column_stack((np.ones(x_train_a.shape[0]), x_train_a))
y_train_a = data_load_a.loc[:, 'y'].values.reshape((-1, 1))
data_load_b = pd.read_csv('./data/ds1_b.csv')
x_train_b = data_load_b.loc[:, data_load_b.columns.str.contains('x')].values
x_train_b = np.column_stack((np.ones(x_train_b.shape[0]), x_train_b))
y_train_b = data_load_b.loc[:, 'y'].values.reshape((-1, 1))
fig1 = plt.figure()
ax1 = fig1.add_subplot(121)
ax1.scatter(x_train_a[np.where(y_train_a == 1)[0], 1], x_train_a[np.where(y_train_a == 1)[0], 2], marker='o', color='blue')
ax1.scatter(x_train_a[np.where(y_train_a == -1)[0], 1], x_train_a[np.where(y_train_a == -1)[0], 2], marker='x', color='red')
ax1.set_title('Set_A', **dict(fontsize=15, weight='black'))
ax2 = fig1.add_subplot(122)
ax2.scatter(x_train_b[np.where(y_train_b == 1)[0], 1], x_train_b[np.where(y_train_b == 1)[0], 2], marker='o', color='blue')
ax2.scatter(x_train_b[np.where(y_train_b == -1)[0], 1], x_train_b[np.where(y_train_b == -1)[0], 2], marker='x', color='red')
ax2.set_title('Set_B', **dict(fontsize=15, weight='black'))
plt.show()
The striking difference is that B is linearly separable while A is not. By analogy with hard versus soft margins in SVMs, B admits a hard margin, so its functional margin can be made arbitrarily large; in other words, $\theta$ is effectively unconstrained, which is why the iteration does not converge. Looking at the $\ell(\theta)$ above, because the data are separable, the exponent $y^{(i)} \theta^{T} x^{(i)}$ in every denominator can be driven to $+\infty$, which keeps decreasing the overall loss, so $\|\theta\|$ can grow without bound and the update never converges. A does converge because the best one can do there is find the hyperplane that minimizes the misclassifications; since there are always misclassified points, they effectively constrain $\|\theta\|$, and the iteration converges.
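This can be checked empirically. Below is a minimal sketch of my own (assuming Xa, Ya, Xb, Yb have been loaded as in main1 above): running the same update for a fixed number of steps and recording $\|\theta\|$ shows the norm levelling off on A but growing without bound on B.

# Track ||theta|| during the same gradient-descent loop (assumes calc_grad and
# the loaded data sets Xa, Ya, Xb, Yb from above).
def theta_norms(X, Y, n_steps=100000, learning_rate=1):
    theta = np.zeros(X.shape[1])
    norms = []
    for t in range(1, n_steps + 1):
        theta = theta - learning_rate * calc_grad(X, Y, theta)
        if t % 10000 == 0:
            norms.append(np.linalg.norm(theta))
    return norms

print(theta_norms(Xa, Ya))   # norms flatten out
print(theta_norms(Xb, Yb))   # norms keep increasing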
No. From the analysis in the previous part, the learning rate $\alpha$ only scales the size of each iteration step; it does not address the underlying problem, so changing it is ineffective.
def calc_grad(X, Y, theta):
    """Compute the gradient of the loss with respect to theta."""
    m, n = X.shape

    margins = Y * X.dot(theta)
    probs = 1. / (1 + np.exp(margins))
    grad = -(1./m) * (X.T.dot(probs * Y))

    return grad


def logistic_regression(X, Y):
    """Train a logistic regression model."""
    m, n = X.shape
    theta = np.zeros(n)
    learning_rate = 10

    i = 0
    while True:
        i += 1
        prev_theta = theta
        grad = calc_grad(X, Y, theta)
        theta = theta - learning_rate / i ** 2 * grad
        if i % 1000000 == 0:
            print('Finished %d iterations' % i)
        if np.linalg.norm(prev_theta - theta) < 1e-15:
            print('Converged in %d iterations' % i)
            break
    return


def main2():
    print('==== Training model on data set A with learning rate decay with time====')
    Xa, Ya = util.load_csv('./data/ds1_a.csv', add_intercept=True)
    logistic_regression(Xa, Ya)

    print('\n==== Training model on data set B with learning rate decay with time====')
    Xb, Yb = util.load_csv('./data/ds1_b.csv', add_intercept=True)
    logistic_regression(Xb, Yb)


main2()
==== Training model on data set A with learning rate decay with time====
Finished 1000000 iterations
Finished 2000000 iterations
...
Finished 27000000 iterations
Converged in 27083822 iterations
==== Training model on data set B with learning rate decay with time====
Finished 1000000 iterations
Finished 2000000 iterations
...
Finished 27000000 iterations
Converged in 27850565 iterations
The results above show that making the learning rate decay with the iteration count (the step-$t$ learning rate is $1/t^{2}$ times the initial one) does make the run "converge", but only in the sense of meeting our preset convergence threshold, not in the sense of truly reaching a global optimum of the objective: once the learning rate has decayed far enough, the next update of $\theta$ falls below the convergence threshold purely because of the tiny step-size factor, so this kind of convergence is not very meaningful.
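To see this on the back of an envelope, here is a minimal sketch (hypothetical numbers, not taken from the run above): with a step size of learning_rate / t**2, the per-iteration movement of $\theta$ is bounded by that factor times the gradient norm, and the total movement is finite no matter what the gradient does.

# Per-step movement <= (learning_rate / t**2) * ||grad||, which drops below the
# 1e-15 stopping threshold once t is large enough, regardless of whether the
# gradient has actually gone to zero.
learning_rate = 10.0
grad_norm = 0.1        # assumed, roughly constant gradient norm on data set B
threshold = 1e-15

t_stop = np.sqrt(learning_rate * grad_norm / threshold)
print(f'step falls below {threshold:g} around t ~ {t_stop:.2g}')   # ~3e7, same order as above

# The total distance theta can ever travel is also bounded:
total_budget = learning_rate * grad_norm * np.pi ** 2 / 6           # sum of lr/t**2 over all t
print(f'total movement budget ~ {total_budget:.2f}')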
I am not sure yet what "linear scaling" refers to here, so I will leave this part unanswered.
def calc_grad(X, Y, theta):
    """Compute the gradient of the loss with respect to theta."""
    m, n = X.shape

    margins = Y * X.dot(theta)
    probs = 1. / (1 + np.exp(margins))
    grad = -(1./m) * (X.T.dot(probs * Y)) + 0.00136 * theta

    return grad


def logistic_regression(X, Y):
    """Train a logistic regression model."""
    m, n = X.shape
    theta = np.zeros(n)
    learning_rate = 10

    i = 0
    while True:
        i += 1
        prev_theta = theta
        grad = calc_grad(X, Y, theta)
        theta = theta - learning_rate * grad
        if i % 1000000 == 0:
            print('Finished %d iterations' % i)
        if np.linalg.norm(prev_theta - theta) < 1e-15:
            print('Converged in %d iterations' % i)
            break
    return


def main3():
    print('==== Training model on data set A ====')
    Xa, Ya = util.load_csv('./data/ds1_a.csv', add_intercept=True)
    logistic_regression(Xa, Ya)

    print('\n==== Training model on data set B ====')
    Xb, Yb = util.load_csv('./data/ds1_b.csv', add_intercept=True)
    logistic_regression(Xb, Yb)


main3()
==== Training model on data set A ====
Converged in 832 iterations
==== Training model on data set B ====
Converged in 909 iterations
We can see that adding a regularization (penalty) term, with a coefficient chosen to balance the original loss against the penalty, makes training converge very quickly. In the code above, the gradient picks up the extra term $0.00136\,\theta$, which corresponds to a penalty of the form $\frac{1}{2}\lambda\|\theta\|_{2}^{2}$ with $\lambda = 0.00136$.
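For reference, a minimal sketch (my own helper, not part of the assignment code) of the objective that the modified calc_grad above is descending, using the $\frac{1}{2}\lambda\|\theta\|_{2}^{2}$ convention so that its gradient contributes $\lambda\theta$:

def regularized_loss(X, Y, theta, lam=0.00136):
    # -(1/m) * sum_i log(1 / (1 + exp(-y_i * theta^T x_i))) + (lam/2) * ||theta||^2
    margins = Y * X.dot(theta)
    return np.mean(np.log(1 + np.exp(-margins))) + 0.5 * lam * theta.dot(theta)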
def calc_grad(X, Y, theta):
    """Compute the gradient of the loss with respect to theta."""
    m, n = X.shape

    margins = Y * X.dot(theta)
    probs = 1. / (1 + np.exp(margins))
    grad = -(1./m) * (X.T.dot(probs * Y))

    return grad


def logistic_regression(X, Y):
    """Train a logistic regression model."""
    m, n = X.shape
    theta = np.zeros(n)
    learning_rate = 1

    i = 0
    while True:
        i += 1
        prev_theta = theta
        grad = calc_grad(X, Y, theta)
        theta = theta - learning_rate * grad
        if i % 1000000 == 0:
            print('Finished %d iterations' % i)
        if np.linalg.norm(prev_theta - theta) < 1e-15:
            print('Converged in %d iterations' % i)
            break
    return


def main4():
    print('==== Training model on data set A ====')
    Xa, Ya = util.load_csv('./data/ds1_a.csv', add_intercept=True)
    rand_mat = np.random.randn(Xa.shape[0], Xa.shape[1])
    Xa = Xa + rand_mat
    logistic_regression(Xa, Ya)

    print('\n==== Training model on data set B ====')
    Xb, Yb = util.load_csv('./data/ds1_b.csv', add_intercept=True)
    rand_mat = np.random.randn(Xb.shape[0], Xb.shape[1])
    Xb = Xb + rand_mat
    logistic_regression(Xb, Yb)


main4()
==== Training model on data set A ====
Converged in 148 iterations
==== Training model on data set B ====
Converged in 127 iterations
In principle, after adding noise the originally linearly separable data set becomes non-separable, so training converges.

An SVM can itself be understood through the graph of the hinge loss: compared with the perceptron loss, the graph is shifted to the right by 1 along the margin axis, which is a stricter loss criterion that demands higher confidence. On a linearly separable data set, however, the SVM's hinge loss, like the perceptron loss, can be driven all the way to 0.
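A small illustration of that comparison (my own plot, not part of the assignment), writing both losses as functions of the margin $z = y\,\theta^{T}x$:

# Hinge loss vs. perceptron loss as functions of the margin z = y * theta^T x:
# the hinge loss is the perceptron loss shifted right by 1, so it still
# penalizes correctly classified points whose margin is below 1.
z = np.linspace(-2, 3, 200)
plt.plot(z, np.maximum(0, 1 - z), label='hinge: max(0, 1 - z)')
plt.plot(z, np.maximum(0, -z), label='perceptron: max(0, -z)')
plt.xlabel('margin z = y * theta^T x')
plt.legend()
plt.show()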
Under the assumptions of the problem, i.e. standard logistic regression, the likelihood is

$$\begin{aligned} L(\theta) &=p(\vec{y} \mid X ; \theta) \\ &=\prod_{i=1}^{m} p\left(y^{(i)} \mid x^{(i)} ; \theta\right) \\ &=\prod_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)\right)^{y^{(i)}}\left(1-h_{\theta}\left(x^{(i)}\right)\right)^{1-y^{(i)}} \end{aligned}$$

and the corresponding log-likelihood is

$$\begin{aligned} \ell(\theta) &=\log L(\theta) \\ &=\sum_{i=1}^{m} y^{(i)} \log h\left(x^{(i)}\right)+\left(1-y^{(i)}\right) \log \left(1-h\left(x^{(i)}\right)\right) \end{aligned}$$

When optimization is complete, the derivative of the log-likelihood must be 0 at the global optimum (derivation omitted; see any standard derivation of logistic regression):

$$\frac{\partial \ell(\theta)}{\partial \theta_{j}}=\sum_{i=1}^{m}\left(y^{(i)}-h\left(x^{(i)}\right)\right) x_{j}^{(i)}=0$$

In particular, for $j=0$, since $x_{0}^{(i)}=1$,

$$\sum_{i=1}^{m}\left(y^{(i)}-h\left(x^{(i)}\right)\right)=0 \quad\Longrightarrow\quad \sum_{i=1}^{m} h\left(x^{(i)}\right)=\sum_{i=1}^{m} y^{(i)}$$

and therefore, since

$$h\left(x^{(i)}\right)=P\left(y^{(i)}=1 \mid x^{(i)} ; \theta\right), \qquad y^{(i)}=\mathbb{I}\left\{y^{(i)}=1\right\},$$

$$\sum_{i=1}^{m} P\left(y^{(i)}=1 \mid x^{(i)} ; \theta\right)=\sum_{i=1}^{m} \mathbb{I}\left\{y^{(i)}=1\right\}$$

So for $(a, b)=(0,1)$ the set $I_{a, b}$ contains all $m$ training examples, $\left|\left\{i \in I_{a, b}\right\}\right|=m$, and dividing both sides by $m$ gives

$$\frac{\sum_{i \in I_{a, b}} P\left(y^{(i)}=1 \mid x^{(i)} ; \theta\right)}{\left|\left\{i \in I_{a, b}\right\}\right|}=\frac{\sum_{i \in I_{a, b}} \mathbb{I}\left\{y^{(i)}=1\right\}}{\left|\left\{i \in I_{a, b}\right\}\right|}$$

i.e. the model is perfectly calibrated on $(0,1)$.
Neither property is a necessary condition for the other. Consider $(a, b)=(0.5,1)$ and suppose the model is perfectly calibrated, so that $0.5 < P\left(y^{(i)}=1 \mid x^{(i)} ; \theta\right) < 1$ for every $i \in I_{a, b}$ and

$$\frac{\sum_{i \in I_{a, b}} P\left(y^{(i)}=1 \mid x^{(i)} ; \theta\right)}{\left|\left\{i \in I_{a, b}\right\}\right|}=\frac{\sum_{i \in I_{a, b}} \mathbb{I}\left\{y^{(i)}=1\right\}}{\left|\left\{i \in I_{a, b}\right\}\right|}$$

If the model also attained the highest possible accuracy (every example the hypothesis scores above 0.5 really is a positive example), then on the right-hand side both numerator and denominator would count exactly the positive examples, so

$$\frac{\sum_{i \in I_{a, b}} \mathbb{I}\left\{y^{(i)}=1\right\}}{\left|\left\{i \in I_{a, b}\right\}\right|}=1$$

But each term in the numerator on the left-hand side satisfies, for $i \in I_{a, b}$,

$$0.5 < P\left(y^{(i)}=1 \mid x^{(i)} ; \theta\right) < 1,$$

so

$$\frac{\sum_{i \in I_{a, b}} P\left(y^{(i)}=1 \mid x^{(i)} ; \theta\right)}{\left|\left\{i \in I_{a, b}\right\}\right|} < 1$$

Hence being perfectly calibrated is not a necessary condition for the model attaining the highest accuracy (which, in the case considered here, is 1). Conversely, if the model has attained the highest accuracy, the right-hand side equals 1 (every example the hypothesis scores above 0.5 is a positive example), while the probabilities on the left-hand side all lie in $(0.5, 1)$, so the left-hand side is strictly less than 1; the two sides cannot be equal, and the model cannot be perfectly calibrated.
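As a quick numerical check of the first part (a toy sketch of my own, not assignment code): fitting an unregularized logistic regression by gradient ascent and comparing $\sum_i h(x^{(i)})$ with $\sum_i \mathbb{I}\{y^{(i)}=1\}$ over the whole training set shows the two sums agree.

# Toy data with an intercept column, labels in {0, 1}.
rng = np.random.default_rng(0)
m = 200
X = np.column_stack([np.ones(m), rng.normal(size=(m, 2))])
true_theta = np.array([0.3, 1.0, -2.0])
y = (rng.random(m) < 1 / (1 + np.exp(-X @ true_theta))).astype(float)

# Maximum-likelihood fit by gradient ascent on the log-likelihood.
theta = np.zeros(3)
for _ in range(20000):
    h = 1 / (1 + np.exp(-X @ theta))
    theta += 0.1 / m * X.T @ (y - h)

h = 1 / (1 + np.exp(-X @ theta))
print(h.sum(), y.sum())   # equal up to optimization tolerance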
Following the same procedure as in 02a, adding the regularization (penalty) term changes the objective that is minimized to

$$J(\theta)=-\sum_{i=1}^{m}\left[y^{(i)} \log h\left(x^{(i)}\right)+\left(1-y^{(i)}\right) \log \left(1-h\left(x^{(i)}\right)\right)\right]+\frac{1}{2} \lambda\|\theta\|_{2}^{2}$$

Once this is optimized, the gradient must vanish at the optimum:

$$\frac{\partial J(\theta)}{\partial \theta_{j}}=\sum_{i=1}^{m}\left(h\left(x^{(i)}\right)-y^{(i)}\right) x_{j}^{(i)}+\lambda \theta_{j}=0$$

In particular, for $j=0$, since $x_{0}^{(i)}=1$,

$$\sum_{i=1}^{m}\left(h\left(x^{(i)}\right)-y^{(i)}\right)+\lambda \theta_{0}=0 \quad\Longrightarrow\quad \sum_{i=1}^{m} h\left(x^{(i)}\right)+\lambda \theta_{0}=\sum_{i=1}^{m} y^{(i)}$$

and therefore

$$\sum_{i=1}^{m} P\left(y^{(i)}=1 \mid x^{(i)} ; \theta\right)+\lambda \theta_{0}=\sum_{i=1}^{m} \mathbb{I}\left\{y^{(i)}=1\right\}$$

So whenever $\theta_{0} \neq 0$, the calibration identity from before no longer holds: adding a regularization term makes a logistic regression model that would otherwise be perfectly calibrated no longer perfectly calibrated.
Starting from the definition:

$$p(\theta \mid x, y)=\frac{p(x, y, \theta)}{p(x, y)}=\frac{p(y \mid x, \theta)\, p(x, \theta)}{p(x, y)}=\frac{p(y \mid x, \theta)\, p(\theta \mid x)\, p(x)}{p(x, y)}$$

Substituting $p(\theta)=p(\theta \mid x)$:

$$p(\theta \mid x, y)=\frac{p(y \mid x, \theta)\, p(\theta)\, p(x)}{p(x, y)}=p(y \mid x, \theta)\, p(\theta) \cdot \frac{p(x)}{p(x, y)}$$

Hence

$$\theta_{\mathrm{MAP}}=\arg \max _{\theta} p(\theta \mid x, y)=\arg \max _{\theta} p(y \mid x, \theta)\, p(\theta) \cdot \frac{p(x)}{p(x, y)}=\arg \max _{\theta} p(y \mid x, \theta)\, p(\theta)$$

The last equality holds because the optimization variable is $\theta$, so the factor involving only $x$ and $y$ can be dropped.
Direct computation, with the prior $\theta \sim \mathcal{N}(0, \eta^{2} I)$ (so $\mu=0$, $\Sigma=\eta^{2} I$) and dropping terms constant in $\theta$:

$$\begin{aligned} \theta_{\mathrm{MAP}} &=\arg \max _{\theta} p(y \mid x, \theta)\, p(\theta) \\ &=\arg \min _{\theta}-\log p(y \mid x, \theta)-\log p(\theta) \\ &=\arg \min _{\theta}-\log p(y \mid x, \theta)-\log \frac{1}{(2 \pi)^{d / 2}|\Sigma|^{1 / 2}} \exp \left(-\frac{1}{2}(\theta-\mu)^{T} \Sigma^{-1}(\theta-\mu)\right) \\ &=\arg \min _{\theta}-\log p(y \mid x, \theta)+\frac{1}{2} \theta^{T} \Sigma^{-1} \theta \\ &=\arg \min _{\theta}-\log p(y \mid x, \theta)+\frac{1}{2 \eta^{2}}\|\theta\|_{2}^{2} \\ &=\arg \min _{\theta}-\log p(y \mid x, \theta)+\lambda\|\theta\|_{2}^{2} \end{aligned}$$

so $\lambda=\frac{1}{2 \eta^{2}}$.
First, obtain the conditional distribution from the given assumptions:

$$\epsilon^{(i)} \sim \mathcal{N}\left(0, \sigma^{2}\right), \qquad y^{(i)}=\theta^{T} x^{(i)}+\epsilon^{(i)} \quad\Longrightarrow\quad y^{(i)} \mid x^{(i)}, \theta \sim \mathcal{N}\left(\theta^{T} x^{(i)}, \sigma^{2}\right)$$

Hence

$$\begin{aligned} p\left(y^{(i)} \mid x^{(i)}, \theta\right) &=\frac{1}{\sqrt{2 \pi}\, \sigma} \exp \left\{-\frac{1}{2 \sigma^{2}}\left(y^{(i)}-\theta^{T} x^{(i)}\right)^{2}\right\} \\ p(\vec{y} \mid X, \theta) &=\prod_{i=1}^{m} p\left(y^{(i)} \mid x^{(i)}, \theta\right) \\ &=\prod_{i=1}^{m} \frac{1}{\sqrt{2 \pi}\, \sigma} \exp \left\{-\frac{1}{2 \sigma^{2}}\left(y^{(i)}-\theta^{T} x^{(i)}\right)^{2}\right\} \\ &=\frac{1}{(2 \pi)^{m / 2} \sigma^{m}} \exp \left\{-\frac{1}{2 \sigma^{2}} \sum_{i=1}^{m}\left(y^{(i)}-\theta^{T} x^{(i)}\right)^{2}\right\} \\ &=\frac{1}{(2 \pi)^{m / 2} \sigma^{m}} \exp \left\{-\frac{1}{2 \sigma^{2}}\|X \theta-\vec{y}\|_{2}^{2}\right\} \end{aligned}$$

Taking the log:

$$\log p(\vec{y} \mid X, \theta)=-\frac{m}{2} \log (2 \pi)-m \log \sigma-\frac{1}{2 \sigma^{2}}\|X \theta-\vec{y}\|_{2}^{2}$$

and so

$$\begin{aligned} \theta_{\mathrm{MAP}} &=\arg \min _{\theta}-\log p(\vec{y} \mid X, \theta)+\frac{1}{2 \eta^{2}}\|\theta\|_{2}^{2} \\ &=\arg \min _{\theta} \frac{1}{2 \sigma^{2}}\|X \theta-\vec{y}\|_{2}^{2}+\frac{1}{2 \eta^{2}}\|\theta\|_{2}^{2} \end{aligned}$$

Let

$$J(\theta)=\frac{1}{2 \sigma^{2}}(\vec{y}-X \theta)^{T}(\vec{y}-X \theta)+\frac{1}{2 \eta^{2}}\|\theta\|_{2}^{2}$$

Minimizing requires the gradient to vanish at the optimum:

$$\begin{aligned} \nabla_{\theta} J(\theta) &=\nabla_{\theta}\left(\frac{1}{2 \sigma^{2}}(\vec{y}-X \theta)^{T}(\vec{y}-X \theta)+\frac{1}{2 \eta^{2}}\|\theta\|_{2}^{2}\right) \\ &=\frac{1}{2 \sigma^{2}} \nabla_{\theta}\left(\theta^{T} X^{T} X \theta-2 \vec{y}^{T} X \theta+\frac{\sigma^{2}}{\eta^{2}} \theta^{T} \theta\right) \\ &=\frac{1}{\sigma^{2}}\left(X^{T} X \theta-X^{T} \vec{y}+\frac{\sigma^{2}}{\eta^{2}} \theta\right) \\ &=0 \end{aligned}$$

(The second-to-last equality uses standard matrix-vector differentiation; the details can be worked out directly or looked up in a matrix-calculus reference.)

Therefore:

$$\theta_{\mathrm{MAP}}=\left(X^{T} X+\frac{\sigma^{2}}{\eta^{2}} I\right)^{-1} X^{T} \vec{y}$$
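As a quick check, a minimal sketch with synthetic data (my own variable names) of this closed form:

# theta_MAP = (X^T X + (sigma^2 / eta^2) I)^{-1} X^T y on toy data drawn from
# the assumed model: theta ~ N(0, eta^2 I), y = X theta + eps, eps ~ N(0, sigma^2 I).
rng = np.random.default_rng(0)
m, n = 100, 5
sigma, eta = 0.5, 1.0

X = rng.normal(size=(m, n))
theta_true = rng.normal(scale=eta, size=n)
y = X @ theta_true + rng.normal(scale=sigma, size=m)

theta_map = np.linalg.solve(X.T @ X + (sigma ** 2 / eta ** 2) * np.eye(n), X.T @ y)
print(theta_map)   # close to theta_true for this well-conditioned toy problem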
We can follow the same line of thought from 03b. First obtain the likelihood, exactly as at the start of 03c:

$$\begin{aligned} p(\vec{y} \mid X, \theta) &=\prod_{i=1}^{m} p\left(y^{(i)} \mid x^{(i)}, \theta\right) \\ &=\prod_{i=1}^{m} \frac{1}{\sqrt{2 \pi}\, \sigma} \exp \left\{-\frac{1}{2 \sigma^{2}}\left(y^{(i)}-\theta^{T} x^{(i)}\right)^{2}\right\} \\ &=\frac{1}{(2 \pi)^{m / 2} \sigma^{m}} \exp \left\{-\frac{1}{2 \sigma^{2}} \sum_{i=1}^{m}\left(y^{(i)}-\theta^{T} x^{(i)}\right)^{2}\right\} \\ &=\frac{1}{(2 \pi)^{m / 2} \sigma^{m}} \exp \left\{-\frac{1}{2 \sigma^{2}}\|X \theta-\vec{y}\|_{2}^{2}\right\} \end{aligned}$$

Now the prior:

$$\theta \sim \mathcal{L}(0, b I), \qquad p(\theta)=\frac{1}{(2 b)^{n}} \exp \left\{-\frac{1}{b}\|\theta\|_{1}\right\}, \qquad \log p(\theta)=-n \log (2 b)-\frac{1}{b}\|\theta\|_{1}$$

Then

$$\begin{aligned} \theta_{\mathrm{MAP}} &=\arg \min _{\theta} \frac{1}{2 \sigma^{2}}\|X \theta-\vec{y}\|_{2}^{2}-\log p(\theta) \\ &=\arg \min _{\theta} \frac{1}{2 \sigma^{2}}\|X \theta-\vec{y}\|_{2}^{2}+\frac{1}{b}\|\theta\|_{1} \end{aligned}$$

Therefore

$$J(\theta)=\|X \theta-\vec{y}\|_{2}^{2}+\gamma\|\theta\|_{1}, \qquad \theta_{\mathrm{MAP}}=\arg \min _{\theta} J(\theta), \qquad \gamma=\frac{2 \sigma^{2}}{b}$$

The objective involves absolute values and is not differentiable everywhere, so there is no closed-form solution and it has to be solved numerically.
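One such numerical method is proximal gradient descent (ISTA): alternate a gradient step on the smooth squared-error term with soft-thresholding for the $\ell_1$ penalty. A minimal sketch (my own, not part of the assignment), with $\gamma = 2\sigma^{2}/b$ as derived above:

# Minimize ||X theta - y||_2^2 + gamma * ||theta||_1 by ISTA.
def lasso_map(X, y, gamma, n_iters=5000):
    step = 1.0 / (2 * np.linalg.norm(X, 2) ** 2)      # 1 / Lipschitz constant of the gradient
    theta = np.zeros(X.shape[1])
    for _ in range(n_iters):
        grad = 2 * X.T @ (X @ theta - y)              # gradient of the squared-error term
        z = theta - step * grad
        theta = np.sign(z) * np.maximum(np.abs(z) - step * gamma, 0.0)  # soft-threshold
    return theta

# Usage on toy data (hypothetical gamma).
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 10))
y = X[:, 0] - 2 * X[:, 3] + 0.1 * rng.normal(size=50)
print(lasso_map(X, y, gamma=1.0))   # most coordinates are driven exactly to zero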
(04a) This is a valid kernel. Symmetry is immediate; for positive semidefiniteness:

$$\begin{aligned} z^{T} K z &=\sum_{i} \sum_{j} z_{i} K_{i j} z_{j} \\ &=\sum_{i} \sum_{j} z_{i} K\left(x^{(i)}, x^{(j)}\right) z_{j} \\ &=\sum_{i} \sum_{j} z_{i}\left(K_{1}\left(x^{(i)}, x^{(j)}\right)+K_{2}\left(x^{(i)}, x^{(j)}\right)\right) z_{j} \\ &=\sum_{i} \sum_{j} z_{i} K_{1}\left(x^{(i)}, x^{(j)}\right) z_{j}+\sum_{i} \sum_{j} z_{i} K_{2}\left(x^{(i)}, x^{(j)}\right) z_{j} \\ &=z^{T} K_{1} z+z^{T} K_{2} z \\ & \geq 0 \end{aligned}$$

The final inequality holds because $K_{1}$ and $K_{2}$ are both valid kernels, so each of the two quadratic forms is nonnegative.

(04b) Not necessarily a valid kernel, because the resulting matrix need not be positive semidefinite. For example, take $K_{2}=2 K_{1}$; then

$$z^{T} K z=z^{T}\left(K_{1}-K_{2}\right) z=z^{T}\left(K_{1}-2 K_{1}\right) z=-z^{T} K_{1} z \leq 0$$
(04c) In view of 04d, the $a$ here should be a positive real number. Then this is a valid kernel: the scalar factor clearly preserves symmetry, and positive semidefiniteness is easy to show:

$$\begin{aligned} z^{T} K z &=\sum_{i} \sum_{j} z_{i} K_{i j} z_{j} \\ &=\sum_{i} \sum_{j} z_{i} K\left(x^{(i)}, x^{(j)}\right) z_{j} \\ &=\sum_{i} \sum_{j} z_{i}\, a K_{1}\left(x^{(i)}, x^{(j)}\right) z_{j} \\ &=a\, z^{T} K_{1} z \\ & \geq 0 \end{aligned}$$

(04d) Here, in contrast to 04c, the factor $a$ is negative, so the matrix is in general not positive semidefinite and this is not a valid kernel. For example, with $a=-1$:

$$z^{T} K z=-z^{T} K_{1} z \leq 0$$
(04e) This is a valid kernel. Symmetry is immediate; for positive semidefiniteness, writing $K_{1}(x, z)=\phi_{1}(x)^{T} \phi_{1}(z)$ and $K_{2}(x, z)=\phi_{2}(x)^{T} \phi_{2}(z)$:

$$\begin{aligned} z^{T} K z &=\sum_{i} \sum_{j} z_{i} K_{i j} z_{j} \\ &=\sum_{i} \sum_{j} z_{i} K\left(x^{(i)}, x^{(j)}\right) z_{j} \\ &=\sum_{i} \sum_{j} z_{i} K_{1}\left(x^{(i)}, x^{(j)}\right) K_{2}\left(x^{(i)}, x^{(j)}\right) z_{j} \\ &=\sum_{i} \sum_{j} z_{i}\, \phi_{1}\left(x^{(i)}\right)^{T} \phi_{1}\left(x^{(j)}\right)\, \phi_{2}\left(x^{(i)}\right)^{T} \phi_{2}\left(x^{(j)}\right) z_{j} \\ &=\sum_{i} \sum_{j} z_{i} \sum_{k} \phi_{1 k}\left(x^{(i)}\right) \phi_{1 k}\left(x^{(j)}\right) \sum_{l} \phi_{2 l}\left(x^{(i)}\right) \phi_{2 l}\left(x^{(j)}\right) z_{j} \\ &=\sum_{k} \sum_{l} \sum_{i} \sum_{j} z_{i} \phi_{1 k}\left(x^{(i)}\right) \phi_{2 l}\left(x^{(i)}\right)\, z_{j} \phi_{1 k}\left(x^{(j)}\right) \phi_{2 l}\left(x^{(j)}\right) \\ &=\sum_{k} \sum_{l}\left(\sum_{i} z_{i} \phi_{1 k}\left(x^{(i)}\right) \phi_{2 l}\left(x^{(i)}\right)\right)^{2} \\ & \geq 0 \end{aligned}$$

(04f) This is a valid kernel. Symmetry is clear, and positive semidefiniteness follows from

$$z^{T} K z=\sum_{i} \sum_{j} z_{i} K\left(x^{(i)}, x^{(j)}\right) z_{j}=\sum_{i} \sum_{j} z_{i} f\left(x^{(i)}\right) f\left(x^{(j)}\right) z_{j}=\left(\sum_{i} f\left(x^{(i)}\right) z_{i}\right)^{2} \geq 0$$
(04g) This is a valid kernel: as long as $K_{3}$ is a valid kernel, it does not matter whether its inputs are the original $n$-dimensional vectors or the $d$-dimensional vectors $\phi(x)$, $\phi(z)$.

(04h) This is a valid kernel. Symmetry is immediate; for positive semidefiniteness, argue as follows. By assumption, $p$ has the general form

$$p(x)=\sum_{k=0}^{n} c_{k} x^{k}, \qquad c_{k}>0,\ k=0,1, \ldots, n$$

so that

$$K(x, z)=p\left(K_{1}(x, z)\right)=\sum_{k=0}^{n} c_{k}\left(K_{1}(x, z)\right)^{k}$$

By 04e, the product of two valid kernels is again a valid kernel, so each power $\left(K_{1}(x, z)\right)^{k}$ is valid; by 04c, multiplying a valid kernel by a positive constant keeps it valid, so each $c_{k}\left(K_{1}(x, z)\right)^{k}$ is valid; and by 04a, the sum of valid kernels is valid, so $K(x, z)=\sum_{k=0}^{n} c_{k}\left(K_{1}(x, z)\right)^{k}$ is a valid kernel.
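A quick numerical sanity check of these closure properties (toy data and my own helper, not part of the assignment): build Gram matrices for a sum, a positive multiple, an elementwise product, and a positive-coefficient polynomial of valid kernels, and confirm that their smallest eigenvalues are (numerically) nonnegative.

rng = np.random.default_rng(0)
X = rng.normal(size=(30, 4))                                   # 30 sample points

K1 = X @ X.T                                                   # linear kernel
K2 = np.exp(-((X[:, None] - X[None, :]) ** 2).sum(-1))         # RBF kernel

def min_eig(K):
    return np.linalg.eigvalsh(K).min()

for name, K in [('K1 + K2', K1 + K2),
                ('3 * K1', 3 * K1),
                ('K1 * K2 (elementwise)', K1 * K2),
                ('1 + 2*K1 + K1**2', 1 + 2 * K1 + K1 ** 2)]:
    print(name, min_eig(K) >= -1e-8)                           # all True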
In lecture, both for standard logistic regression and (to avoid a long, rigorous derivation) when introducing SVMs, an intuitive representation of $\theta$ (or $w$) was used, based on the update rule

$$\theta^{(i+1)} := \theta^{(i)}+\alpha\left(y^{(i+1)}-h_{\theta^{(i)}}\left(x^{(i+1)}\right)\right) x^{(i+1)}$$

From this it is clear that if the initial $\theta^{(0)}$ is the zero vector, then after these updates $\theta$ is simply a linear combination of the $x$'s.

The same reasoning applies here, except that each update uses not $x$ itself but its image $\phi(x)$ in the high-dimensional feature space:

$$\theta^{(i+1)} := \theta^{(i)}+\alpha\left(y^{(i+1)}-h_{\theta^{(i)}}\left(\phi\left(x^{(i+1)}\right)\right)\right) \phi\left(x^{(i+1)}\right)$$

So $\theta$ can be written as a linear combination of the $\phi\left(x^{(j)}\right)$:

$$\theta^{(i)}=\sum_{j=1}^{i} \beta_{j}\, \phi\left(x^{(j)}\right), \qquad \theta^{(0)}=\vec{0}$$

Substituting this representation directly:

$$\begin{aligned} h_{\theta^{(i)}}\left(\phi\left(x^{(i+1)}\right)\right) &=g\left(\theta^{(i)^{T}} \phi\left(x^{(i+1)}\right)\right) \\ &=\operatorname{sign}\left(\theta^{(i)^{T}} \phi\left(x^{(i+1)}\right)\right) \\ &=\operatorname{sign}\left(\sum_{j=1}^{i} \beta_{j}\, \phi\left(x^{(j)}\right)^{T} \phi\left(x^{(i+1)}\right)\right) \\ &=\operatorname{sign}\left(\sum_{j=1}^{i} \beta_{j}\left\langle\phi\left(x^{(j)}\right), \phi\left(x^{(i+1)}\right)\right\rangle\right) \\ &=\operatorname{sign}\left(\sum_{j=1}^{i} \beta_{j} K\left(x^{(j)}, x^{(i+1)}\right)\right) \end{aligned}$$

Direct computation then gives

$$\begin{aligned} \theta^{(i+1)} &:=\theta^{(i)}+\alpha\left(y^{(i+1)}-h_{\theta^{(i)}}\left(\phi\left(x^{(i+1)}\right)\right)\right) \phi\left(x^{(i+1)}\right) \\ &=\sum_{j=1}^{i} \beta_{j}\, \phi\left(x^{(j)}\right)+\alpha\left(y^{(i+1)}-\operatorname{sign}\left(\sum_{j=1}^{i} \beta_{j} K\left(x^{(j)}, x^{(i+1)}\right)\right)\right) \phi\left(x^{(i+1)}\right) \\ &=\sum_{j=1}^{i+1} \beta_{j}\, \phi\left(x^{(j)}\right) \end{aligned}$$

so

$$\beta_{i+1} := \alpha\left(y^{(i+1)}-\operatorname{sign}\left(\sum_{j=1}^{i} \beta_{j} K\left(x^{(j)}, x^{(i+1)}\right)\right)\right)$$
Modifying the provided code directly:
import math

import matplotlib.pyplot as plt
import numpy as np

import src.util as util


def initial_state():
    """Return the initial state for the perceptron.

    This function computes and then returns the initial state of the perceptron.
    Feel free to use any data type (dicts, lists, tuples, or custom classes) to
    contain the state of the perceptron.
    """
    # *** START CODE HERE ***
    # The state is a list of (beta_j, x^(j)) pairs.
    return []
    # *** END CODE HERE ***
def predict(state, kernel, x_i):
    """Perform a prediction on a given instance x_i given the current state
    and the kernel.

    Args:
        state: The state returned from initial_state()
        kernel: A binary function that takes two vectors as input and returns
            the result of a kernel
        x_i: A vector containing the features for a single instance

    Returns:
        Returns the prediction (i.e 0 or 1)
    """
    # *** START CODE HERE ***
    return sign(sum(beta * kernel(x, x_i) for beta, x in state))
    # *** END CODE HERE ***


def update_state(state, kernel, learning_rate, x_i, y_i):
    """Updates the state of the perceptron.

    Args:
        state: The state returned from initial_state()
        kernel: A binary function that takes two vectors as input and returns the result of a kernel
        learning_rate: The learning rate for the update
        x_i: A vector containing the features for a single instance
        y_i: A 0 or 1 indicating the label for a single instance
    """
    # *** START CODE HERE ***
    # beta_{i+1} = alpha * (y^(i+1) - prediction with the current state)
    state.append((learning_rate * (y_i - predict(state, kernel, x_i)), x_i))
    # *** END CODE HERE ***


def sign(a):
    """Gets the sign of a scalar input."""
    if a >= 0:
        return 1
    else:
        return 0


def dot_kernel(a, b):
    """An implementation of a dot product kernel.

    Args:
        a: A vector
        b: A vector
    """
    return np.dot(a, b)


def rbf_kernel(a, b, sigma=1):
    """An implementation of the radial basis function kernel.

    Args:
        a: A vector
        b: A vector
        sigma: The radius of the kernel
    """
    distance = (a - b).dot(a - b)
    scaled_distance = -distance / (2 * (sigma) ** 2)
    return math.exp(scaled_distance)


def train_perceptron(kernel_name, kernel, learning_rate):
    """Train a perceptron with the given kernel.

    This function trains a perceptron with a given kernel and then
    uses that perceptron to make predictions.
    The output predictions are saved to src/output/p05_{kernel_name}_predictions.txt.
    The output plots are saved to src/output_{kernel_name}_output.pdf.

    Args:
        kernel_name: The name of the kernel.
        kernel: The kernel function.
        learning_rate: The learning rate for training.
    """
    train_x, train_y = util.load_csv('./data/ds5_train.csv')

    state = initial_state()

    for x_i, y_i in zip(train_x, train_y):
        update_state(state, kernel, learning_rate, x_i, y_i)

    test_x, test_y = util.load_csv('./data/ds5_train.csv')

    plt.figure(figsize=(12, 8))
    util.plot_contour(lambda a: predict(state, kernel, a))
    util.plot_points(test_x, test_y)
    plt.savefig('./output/p05_{}_output.pdf'.format(kernel_name))

    predict_y = [predict(state, kernel, test_x[i, :]) for i in range(test_y.shape[0])]
    np.savetxt('./output/p05_{}_predictions'.format(kernel_name), predict_y)


def main5():
    train_perceptron('dot', dot_kernel, 0.5)
    train_perceptron('rbf', rbf_kernel, 0.5)


main5()
Clearly the dot-product kernel performs much worse. The data set is not linearly separable, and the dot-product kernel is effectively a linear classifier in the original space: it does not map the sample vectors into a higher-dimensional space, and in two dimensions no linear boundary can separate the two classes, so poor performance is expected. The RBF kernel, by contrast, implicitly maps the samples into an infinite-dimensional space, where a separating hyperplane can be found; its projection back into two dimensions, as the plot shows, does separate the two classes that were not linearly separable.

The added code:
import collections

import numpy as np

import src.util as util
import src.svm


def get_words(message):
    """Get the normalized list of words from a message string.

    This function should split a message into words, normalize them, and return
    the resulting list. For splitting, you should split on spaces. For normalization,
    you should convert everything to lowercase.

    Args:
        message: A string containing an SMS message

    Returns:
        The list of normalized words from the message.
    """
    # *** START CODE HERE ***
    word_list = []
    for word in message.strip().split(' '):
        if not word:
            continue
        word_list.append(word.lower())
    return word_list
    # *** END CODE HERE ***
def create_dictionary(messages):
    """Create a dictionary mapping words to integer indices.

    This function should create a dictionary of word to indices using the provided
    training messages. Use get_words to process each message.

    Rare words are often not useful for modeling. Please only add words to the dictionary
    if they occur in at least five messages.

    Args:
        messages: A list of strings containing SMS messages

    Returns:
        A python dict mapping words to integers.
    """
    # *** START CODE HERE ***
    # Flatten all messages into one list of words.
    words = [word for message in messages for word in get_words(message)]
    # Count how many times each word occurs.
    word_count = collections.Counter(words)
    # Keep words that appear at least 5 times in the flattened word list.
    freq_words = [word for word, freq in word_count.items() if freq >= 5]
    # Map each frequent word to an integer index.
    return {word: index for index, word in enumerate(freq_words)}
    # *** END CODE HERE ***
def transform_text(messages, word_dictionary):
    """Transform a list of text messages into a numpy array for further processing.

    This function should create a numpy array that contains the number of times each word
    appears in each message. Each row in the resulting array should correspond to each
    message and each column should correspond to a word.

    Use the provided word dictionary to map words to column indices. Ignore words that
    are not present in the dictionary. Use get_words to get the words for a message.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array marking the words present in each message.
    """
    # *** START CODE HERE ***
    m, n = len(messages), len(word_dictionary)
    trans_mat = np.zeros((m, n), dtype=int)
    for i, message in enumerate(messages):
        for word in get_words(message):
            if word in word_dictionary:
                trans_mat[i, word_dictionary[word]] += 1
    return trans_mat
    # *** END CODE HERE ***
For this part, it helps to first review Naive Bayes, Laplace smoothing, and the multinomial event model in the lecture notes; that background shapes the algorithm. Assuming it, here is an outline of the approach.

To make a prediction while avoiding the common denominator in the final computation, we can compare $p(y=1 \mid x)$ and $p(y=0 \mid x)$ through their ratio:

$$\frac{p(y=1 \mid x)}{p(y=0 \mid x)}=\frac{p(x \mid y=1)\, p(y=1)}{p(x \mid y=0)\, p(y=0)}$$

To avoid underflow from multiplying many small factors, take the logarithm; the message is then classified as positive (spam) when the result is greater than 0:

$$\log \frac{p(y=1 \mid x)}{p(y=0 \mid x)}=\log \frac{\left(\prod_{k=1}^{n} p\left(x_{k} \mid y=1\right)\right) \phi_{y}}{\left(\prod_{k=1}^{n} p\left(x_{k} \mid y=0\right)\right)\left(1-\phi_{y}\right)}=\sum_{k=1}^{n} \log p\left(x_{k} \mid y=1\right)-\sum_{k=1}^{n} \log p\left(x_{k} \mid y=0\right)+\log \phi_{y}-\log \left(1-\phi_{y}\right)$$

where $p\left(x_{k} \mid y=1\right)$ and $p\left(x_{k} \mid y=0\right)$ are given (with Laplace smoothing) by

$$p\left(x_{k} \mid y=1\right)=\phi_{k \mid y=1} := \frac{1+\sum_{i=1}^{m} 1\left\{y^{(i)}=1\right\} x_{k}^{(i)}}{n+\sum_{i=1}^{m} 1\left\{y^{(i)}=1\right\} \sum_{j=1}^{n} x_{j}^{(i)}}$$

$$p\left(x_{k} \mid y=0\right)=\phi_{k \mid y=0} := \frac{1+\sum_{i=1}^{m} 1\left\{y^{(i)}=0\right\} x_{k}^{(i)}}{n+\sum_{i=1}^{m} 1\left\{y^{(i)}=0\right\} \sum_{j=1}^{n} x_{j}^{(i)}}$$

Putting these together, with $x_{k}$ the count of word $k$ in the message:

$$\log \frac{p(y=1 \mid x)}{p(y=0 \mid x)}=\log \frac{p(x \mid y=1)\, p(y=1)}{p(x \mid y=0)\, p(y=0)}=\sum_{k=1}^{n} x_{k}\left(\log \phi_{k \mid y=1}-\log \phi_{k \mid y=0}\right)+\log \frac{\phi_{y}}{1-\phi_{y}}$$
def fit_naive_bayes_model(matrix, labels):
    """Fit a naive bayes model.

    This function should fit a Naive Bayes model given a training matrix and labels.
    The function should return the state of that model.
    Feel free to use whatever datatype you wish for the state of the model.

    Args:
        matrix: A numpy array containing word counts for the training data
        labels: The binary (0 or 1) labels for that training data

    Returns: The trained model
    """
    # *** START CODE HERE ***
    m, n = matrix.shape
    phi_y = np.sum(labels) / m
    phi_k_y1 = (1.0 + matrix[labels == 1].sum(axis=0)) / (n + matrix[labels == 1].sum())
    phi_k_y0 = (1.0 + matrix[labels == 0].sum(axis=0)) / (n + matrix[labels == 0].sum())
    return phi_y, phi_k_y1, phi_k_y0
    # *** END CODE HERE ***


def predict_from_naive_bayes_model(model, matrix):
    """Use a Naive Bayes model to compute predictions for a target matrix.

    This function should be able to predict on the models that fit_naive_bayes_model
    outputs.

    Args:
        model: A trained model from fit_naive_bayes_model
        matrix: A numpy array containing word counts

    Returns: A numpy array containing the predictions from the model
    """
    # *** START CODE HERE ***
    phi_y, phi_k_y1, phi_k_y0 = model
    pre_result = matrix @ (np.log(phi_k_y1) - np.log(phi_k_y0)) + np.log(phi_y / (1 - phi_y))
    return (np.sign(pre_result) + 1) // 2
    # *** END CODE HERE ***
def get_top_five_naive_bayes_words(model, dictionary):
    """Compute the top five words that are most indicative of the spam (i.e positive) class.

    Use the metric given in 6c as a measure of how indicative a word is.
    Return the words in sorted form, with the most indicative word first.

    Args:
        model: The Naive Bayes model returned from fit_naive_bayes_model
        dictionary: A mapping of word to integer ids

    Returns: The top five most indicative words in sorted order with the most indicative first
    """
    # *** START CODE HERE ***
    phi_y, phi_k_y1, phi_k_y0 = model
    inv_dictionary = {v: k for k, v in dictionary.items()}
    top_five_indicative_word_idx = np.argsort(np.log(phi_k_y1) - np.log(phi_k_y0))[-5:]
    return [inv_dictionary[idx] for idx in top_five_indicative_word_idx]
    # *** END CODE HERE ***


def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy.
    """
    # *** START CODE HERE ***
    best_radius = 0
    best_error = 1
    for radius in radius_to_consider:
        state = src.svm.svm_train(train_matrix, train_labels, radius)
        pred_result = src.svm.svm_predict(state, val_matrix, radius)
        error_rate = np.abs(pred_result - val_labels).sum() / len(val_labels)
        print(f'radius: {radius}, error_rate: {error_rate}')
        if error_rate < best_error:
            best_error = error_rate
            best_radius = radius
    return best_radius
    # *** END CODE HERE ***
To be honest, the provided SVM code looks heavily simplified; I did not scrutinize it and will take it as correct.

Check that the functions above actually run:
def main6():
    train_messages, train_labels = util.load_spam_dataset('./data/ds6_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('./data/ds6_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('./data/ds6_test.tsv')

    dictionary = create_dictionary(train_messages)
    util.write_json('./output/p06_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)
    np.savetxt('./output/p06_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(naive_bayes_model, test_matrix)
    np.savetxt('./output/p06_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)
    util.write_json('./output/p06_top_indicative_words', top_5_words)

    optimal_radius = compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])
    util.write_json('./output/p06_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = src.svm.train_and_predict_svm(train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy, optimal_radius))


main6()
Naive Bayes had an accuracy of 0.978494623655914 on the testing set
The top 5 indicative words for Naive Bayes are: ['urgent!', 'tone', 'prize', 'won', 'claim']
radius: 0.01, error_rate: 0.08258527827648116
radius: 0.1, error_rate: 0.05385996409335727
radius: 1, error_rate: 0.07001795332136446
radius: 10, error_rate: 0.12387791741472172
The optimal SVM radius was 0.1
The SVM model had an accuracy of 0.9695340501792115 on the testing set