[Machine Learning Notes] The effect of noisy features in feature engineering

This post records how adding noisy features affects Lasso and SVM classifiers, using sklearn's make_moons dataset.

Data preparation

import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_moons


n = 300
n_tr = 200
X, y = make_moons(n, shuffle=True, noise=0.2, random_state=112)
y[y == 0] = -1  # relabel classes to {-1, +1} so the sign of a regression output can serve as a class label

# standardise the data
trmean = np.mean(X[:n_tr, :], axis=0)
trvar = np.var(X[:n_tr, :], axis=0)
X = (X - trmean[np.newaxis, :]) / np.sqrt(trvar)[np.newaxis, :]

# take first n_tr as training, others as test.
Xtr = X[:n_tr, :]
ytr = y[:n_tr]
Xt = X[n_tr:, :]
yt = y[n_tr:]

# inspect the dataset visually
plt.figure()
plt.scatter(Xtr[:, 0], Xtr[:, 1], c=ytr, marker='x')
plt.scatter(Xt[:, 0], Xt[:, 1], c=yt, marker='o')
plt.show()
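As an aside, the standardisation above deliberately uses training-set statistics only, which avoids leaking test information. The same transform can be reproduced with sklearn's StandardScaler; a minimal sketch, where X_raw is a hypothetical name for the unscaled data before the manual transform above:

from sklearn.preprocessing import StandardScaler

# X_raw: the data before the manual standardisation above (hypothetical name)
# fit mean/variance on the training rows only, then apply to the full matrix
scaler = StandardScaler().fit(X_raw[:n_tr, :])
X_std = scaler.transform(X_raw)  # matches (X_raw - trmean) / sqrt(trvar)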

The dataset looks like this:

[Figure 1: scatter plot of the two-moons data; training points marked x, test points marked o]

The best parameters are found via grid search with k-fold cross validation. Setup:

k = 5 # k fold cross validation

# the parameters to consider in cross-validation
# order lasso params from largest to smallest so that if ties occur CV will select the one with more sparsity
params_lasso = [1, 1e-1, 1e-2, 1e-3, 1e-4]
params_rbf = [1e-3, 1e-2, 1e-1, 1, 10, 100]
params_svm = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]  # smaller C: more regularisation

# the noisy features to be considered; for n noisy features, add the first n columns
n_tot_noisy_feats = 50
np.random.seed(92)
X_noise = 2 * np.random.randn(X.shape[0], n_tot_noisy_feats)  # i.i.d. Gaussian noise, std 2, unrelated to the labels
Xtr_noise = X_noise[:n_tr, :]
Xt_noise = X_noise[n_tr:, :]

svm_accs = []
lasso_accs = []
lasso_n_feats = []
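A note on the parameter grids above: in the soft-margin SVM objective min_w (1/2)||w||^2 + C * sum_i xi_i, the constant C multiplies the slack (misclassification) terms, so a smaller C gives the margin term relatively more weight, i.e. stronger regularisation. Conversely, for Lasso a larger alpha means a heavier L1 penalty and hence sparser coefficients.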


# do CV over the parameters
amount_of_noisy_features = np.arange(n_tot_noisy_feats+1) # include both 0 and 50
for n_noisy_features in amount_of_noisy_features:
    print('number of noisy features: {}'.format(n_noisy_features))

    Xtr_noisy = np.copy(Xtr)
    Xt_noisy = np.copy(Xt)
    if n_noisy_features > 0:
        Xtr_noisy = np.hstack((Xtr_noisy, Xtr_noise[:, :n_noisy_features]))
        Xt_noisy = np.hstack((Xt_noisy, Xt_noise[:, :n_noisy_features]))

    lasso_validation_accs = np.zeros((k, len(params_lasso)))
    svm_validation_accs = np.zeros((k, len(params_rbf), len(params_svm)))

    # make the training-validation splits on the training data
    kf = KFold(n_splits=k)
    fold_index = 0

    for train_index, validation_index in kf.split(Xtr_noisy):
        # now then get the results on all the parameters considered

        # SVM with the two parameters
        for g_index, gamma in enumerate(params_rbf):
            for s_index, svm_param in enumerate(params_svm):
                svm = SVC(C=svm_param, gamma=gamma)
                svm.fit(Xtr_noisy[train_index, :], ytr[train_index])
                preds = svm.predict(Xtr_noisy[validation_index])
                svm_validation_accs[fold_index, g_index, s_index] = accuracy_score(ytr[validation_index], preds)

        # Lasso with its single parameter; Lasso is a regression model, so take the sign of its output to get {-1, +1} labels
        for l_index, lasso_param in enumerate(params_lasso):
            lasso = Lasso(alpha=lasso_param)
            lasso.fit(Xtr_noisy[train_index, :], ytr[train_index])
            preds = np.sign(lasso.predict(Xtr_noisy[validation_index]))
            lasso_validation_accs[fold_index, l_index] = accuracy_score(ytr[validation_index], preds)

        fold_index += 1

Still inside the loop, select the best parameters by average validation accuracy, retrain on the full training set, and evaluate on the test set:

    # average results over folds
    lasso_validation_avg = np.mean(lasso_validation_accs, axis=0)
    svm_validation_avg = np.mean(svm_validation_accs, axis=0)

    # get the indexes of the best parameters and get the parameters at these indices
    best_lasso_param = params_lasso[np.argmax(lasso_validation_avg)]
    [best_svm_gamma_index, best_svm_C_index] = np.unravel_index(np.argmax(svm_validation_avg, axis=None), svm_validation_avg.shape)
    best_svm_gamma = params_rbf[best_svm_gamma_index]
    best_svm_C = params_svm[best_svm_C_index]

    print("best lasso parameter is: {}".format(best_lasso_param))
    print("best svm gamma parameter is: {}".format(best_svm_gamma))
    print("best svm C parameter is: {}".format(best_svm_C))

    # train with the whole training data with the chosen parameters
    svm = SVC(C=best_svm_C, gamma=best_svm_gamma)
    svm.fit(Xtr_noisy, ytr)
    svm_preds = svm.predict(Xt_noisy)
    svm_accs.append(accuracy_score(yt, svm_preds))

    lasso = Lasso(alpha=best_lasso_param)
    lasso.fit(Xtr_noisy, ytr)
    lasso_preds = np.sign(lasso.predict(Xt_noisy))
    lasso_accs.append(accuracy_score(yt, lasso_preds))

    # how many features did lasso select?
    lasso_n_feats.append(np.count_nonzero(lasso.coef_))
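Aside: the manual fold loop keeps full control over tie-breaking, but the same search could also be written with sklearn's GridSearchCV, which handles splitting and averaging internally. A minimal sketch for the SVM part (note that GridSearchCV uses stratified folds for classifiers, so the chosen parameters may differ slightly from the manual KFold loop):

from sklearn.model_selection import GridSearchCV

# the same (gamma, C) grid as the manual loop, with k-fold CV handled internally
param_grid = {'gamma': params_rbf, 'C': params_svm}
search = GridSearchCV(SVC(), param_grid, cv=k, scoring='accuracy')
search.fit(Xtr_noisy, ytr)
print(search.best_params_)  # the pair with the highest mean validation accuracy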

Plotting the results:

plt.figure()
plt.plot(amount_of_noisy_features, svm_accs, label='svm')
plt.plot(amount_of_noisy_features, lasso_accs, label='lasso')
plt.xlabel('number of noisy features')
plt.ylabel('test accuracy')
plt.title('test accuracy vs. number of noisy features')
plt.legend()

svm_accs = np.array(svm_accs)
lasso_accs = np.array(lasso_accs)
better = np.where(lasso_accs > svm_accs)[0]
if better.size > 0:
    print('lasso was better than svm for the first time with {} noisy features'.format(better[0]))

plt.figure()
plt.plot(amount_of_noisy_features, lasso_n_feats)
plt.xlabel('number of noisy features')
plt.ylabel('number of nonzero Lasso coefficients')
plt.title('features selected by lasso')
plt.show()
[Figure 2: test accuracy of SVM and Lasso as the number of noisy features grows]
[Figure 3: number of features kept by Lasso as the number of noisy features grows]

As the plots show, with only a few noisy features the SVM is more accurate than Lasso, but once many noisy features are added, Lasso pulls ahead. This is expected: the L1 penalty can shrink the coefficients of uninformative columns to exactly zero, while the RBF kernel weights every dimension equally in its distance computation, so each added noise column further dilutes the signal the SVM relies on.
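One can verify this on the last Lasso fitted in the loop above; a minimal sketch, reusing the lasso object and the column layout from np.hstack:

# columns 0 and 1 are the original features; everything after is pure noise
print('coefficients on the two real features:', lasso.coef_[:2])
print('nonzero coefficients among the noise columns:', np.count_nonzero(lasso.coef_[2:]))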
