This post records the effect of adding noisy features on Lasso and SVM, using sklearn's make_moons dataset.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_moons
n = 300
n_tr = 200
X, y = make_moons(n, shuffle=True, noise=0.2, random_state=112)
y[y==0] = -1 # relabel to {-1, +1} so Lasso's regression output can be thresholded with np.sign
# standardise the data using statistics from the training rows only, to avoid test-set leakage
trmean = np.mean(X[:n_tr, :], axis=0)
trvar = np.var(X[:n_tr, :], axis=0)
X = (X - trmean[np.newaxis, :]) / np.sqrt(trvar)[np.newaxis, :]
# take first n_tr as training, others as test.
Xtr = X[:n_tr, :]
ytr = y[:n_tr]
Xt = X[n_tr:, :]
yt = y[n_tr:]
# inspect the dataset visually
plt.figure()
plt.scatter(Xtr[:, 0], Xtr[:, 1], c=ytr, marker='x')
plt.scatter(Xt[:, 0], Xt[:, 1], c=yt, marker='o')
plt.show()
The dataset looks like this:
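Note that the standardisation above uses statistics from the training rows only. The same thing can be written with sklearn's StandardScaler; a minimal sketch of the equivalent:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()   # zero mean, unit variance per column
scaler.fit(X[:n_tr, :])     # fit on the training rows only to avoid test leakage
X = scaler.transform(X)     # apply the training statistics to all rows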
The best parameters are found via grid search with k-fold cross-validation (for the SVM alone, an equivalent GridSearchCV sketch is given after the loop). The preparation:
k = 5 # k fold cross validation
# the parameters to consider in cross-validation
# Lasso alphas are ordered from largest to smallest so that, on ties, np.argmax (which returns the first maximum) selects the sparser model
params_lasso = [1, 1e-1, 1e-2, 1e-3, 1e-4]
params_rbf = [1e-3, 1e-2, 1e-1, 1, 10, 100]
params_svm = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000] # smaller C: more regularisation
# the noisy features to be considered; for n noisy features, add the first n columns
n_tot_noisy_feats = 50
np.random.seed(92)
X_noise = 2*np.random.randn(X.shape[0], n_tot_noisy_feats)
Xtr_noise = X_noise[:n_tr, :]
Xt_noise = X_noise[n_tr:, :]
svm_accs = []
lasso_accs = []
lasso_n_feats = []
# do CV over the parameters
amount_of_noisy_features = np.arange(n_tot_noisy_feats+1) # include both 0 and 50
for n_noisy_features in amount_of_noisy_features:
    print('number of noisy features: {}'.format(n_noisy_features))
    Xtr_noisy = np.copy(Xtr)
    Xt_noisy = np.copy(Xt)
    if n_noisy_features > 0:
        Xtr_noisy = np.hstack((Xtr_noisy, Xtr_noise[:, :n_noisy_features]))
        Xt_noisy = np.hstack((Xt_noisy, Xt_noise[:, :n_noisy_features]))
    lasso_validation_accs = np.zeros((k, len(params_lasso)))
    svm_validation_accs = np.zeros((k, len(params_rbf), len(params_svm)))
    # make the training-validation splits on the training data
    kf = KFold(n_splits=k)
    fold_index = 0
    for train_index, validation_index in kf.split(Xtr_noisy):
        # evaluate all parameter combinations on this fold
        # SVM with its two parameters (gamma, C)
        for g_index, gamma in enumerate(params_rbf):
            for s_index, svm_param in enumerate(params_svm):
                svm = SVC(C=svm_param, gamma=gamma)
                svm.fit(Xtr_noisy[train_index, :], ytr[train_index])
                preds = svm.predict(Xtr_noisy[validation_index, :])
                svm_validation_accs[fold_index, g_index, s_index] = accuracy_score(ytr[validation_index], preds)
        # Lasso with its single parameter (alpha)
        for l_index, lasso_param in enumerate(params_lasso):
            lasso = Lasso(alpha=lasso_param)
            lasso.fit(Xtr_noisy[train_index, :], ytr[train_index])
            preds = np.sign(lasso.predict(Xtr_noisy[validation_index, :]))
            lasso_validation_accs[fold_index, l_index] = accuracy_score(ytr[validation_index], preds)
        fold_index += 1
Still inside the loop over n_noisy_features, select the best parameters, refit on the full noisy training set, and evaluate on the test set:
    # average results over folds
    lasso_validation_avg = np.mean(lasso_validation_accs, axis=0)
    svm_validation_avg = np.mean(svm_validation_accs, axis=0)
    # get the indices of the best parameters and look them up
    best_lasso_param = params_lasso[np.argmax(lasso_validation_avg)]
    best_svm_gamma_index, best_svm_C_index = np.unravel_index(np.argmax(svm_validation_avg, axis=None), svm_validation_avg.shape)
    best_svm_gamma = params_rbf[best_svm_gamma_index]
    best_svm_C = params_svm[best_svm_C_index]
    print("best lasso parameter is: {}".format(best_lasso_param))
    print("best svm gamma parameter is: {}".format(best_svm_gamma))
    print("best svm C parameter is: {}".format(best_svm_C))
    # train on the whole training data with the chosen parameters
    svm = SVC(C=best_svm_C, gamma=best_svm_gamma)
    svm.fit(Xtr_noisy, ytr)
    svm_preds = svm.predict(Xt_noisy)
    svm_accs.append(accuracy_score(yt, svm_preds))
    lasso = Lasso(alpha=best_lasso_param)
    lasso.fit(Xtr_noisy, ytr)
    lasso_preds = np.sign(lasso.predict(Xt_noisy))
    lasso_accs.append(accuracy_score(yt, lasso_preds))
    # how many features did lasso select?
    lasso_n_feats.append(np.count_nonzero(lasso.coef_))
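As an aside, the per-fold search above for the SVM could also be done with sklearn's GridSearchCV. A minimal sketch, using the Xtr_noisy left over from the loop (the manual loop is kept in this post because it shares one KFold split between the SVM and Lasso and records Lasso's per-fold accuracies):
from sklearn.model_selection import GridSearchCV
# sketch only: grid search over the same (gamma, C) grid with the same k folds
svm_grid = GridSearchCV(SVC(), param_grid={'gamma': params_rbf, 'C': params_svm},
                        cv=KFold(n_splits=k), scoring='accuracy')
svm_grid.fit(Xtr_noisy, ytr)
print(svm_grid.best_params_)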
Finally, plot the results:
plt.figure()
plt.plot(amount_of_noisy_features, svm_accs, label='svm')
plt.plot(amount_of_noisy_features, lasso_accs, label='lasso')
plt.xlabel('number of noisy features')
plt.ylabel('test accuracy')
plt.title('test accuracy vs. number of noisy features')
plt.legend()
svm_accs = np.array(svm_accs)
lasso_accs = np.array(lasso_accs)
first_win = np.where(lasso_accs > svm_accs)[0][0]
print('lasso was better than svm for the first time with {} noisy features'.format(amount_of_noisy_features[first_win]))
plt.figure()
plt.plot(amount_of_noisy_features, lasso_n_feats)
plt.xlabel('number of noisy features')
plt.title('number of features selected by lasso')
plt.show()
As the plots show, SVM's accuracy is higher than Lasso's while the number of noisy features is small, but once enough noisy features are added, Lasso comes out ahead: its L1 penalty can zero out the irrelevant columns, whereas the RBF SVM has to absorb them into its kernel distances.
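A quick way to see this feature selection at work is to inspect the coefficients of the last fitted Lasso model. A minimal sketch, assuming the lasso object from the final loop iteration is still in scope (columns 0 and 1 are the original features, the rest are noise):
selected = np.flatnonzero(lasso.coef_)  # indices of columns with a non-zero weight
print('columns kept by lasso:', selected)
print('original features kept:', [i for i in selected if i < 2])
print('noisy features kept:', [i for i in selected if i >= 2])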