吴恩达机器学习作业5——偏差与方差

在练习的前半部分,你将实现正则化线性回归,利用水库水位的变化来预测大坝的出水量。在后半部分,你将对学习算法进行一些调试诊断,并考察偏差与方差(bias vs. variance)的影响。

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.optimize as opt


def plotData():
    """Scatter-plot the training data on a new 8x5 figure.

    Reads the module-level arrays ``X`` and ``y``: every column of ``X``
    after the first is plotted against ``y`` with red 'x' markers.
    ``plt.show()`` is not called here, so callers can draw on top of the
    scatter before displaying it.
    """
    plt.figure(figsize=(8, 5))
    features = X[:, 1:]  # skip the first column of X
    plt.scatter(features, y, c='r', marker='x')
    plt.grid(True)
    plt.xlabel('Change in water level (x)')
    plt.ylabel('Water flowing out of the dam (y)')


def costReg(theta, X, y, l):
    """Return the regularized squared-error cost of a linear model.

    Computes ``(sum((X @ theta - y)**2) + l * sum(theta[1:]**2)) / (2 * m)``
    where ``m = len(X)``. The first coefficient ``theta[0]`` is excluded
    from the penalty.

    Args:
        theta: 1-D coefficient vector.
        X: design matrix with one row per example.
        y: targets; flattened to 1-D before use.
        l: regularization strength (lambda).
    """
    m = len(X)
    residuals = X @ theta - y.flatten()
    squared_error = (residuals ** 2).sum()
    penalized = theta[1:]  # theta[0] is left unregularized
    penalty = l * (penalized @ penalized)
    return (squared_error + penalty) / (2 * m)

def gradientReg(theta, X, y, l):
    """Return the gradient of the regularized linear-regression cost.

    Computes ``((X @ theta - y) @ X + l * theta) / m`` with ``m = len(X)``,
    except that the penalty contribution for ``theta[0]`` is zeroed.

    Args:
        theta: 1-D coefficient vector (not modified).
        X: design matrix with one row per example.
        y: targets; flattened to 1-D before use.
        l: regularization strength (lambda).
    """
    residuals = X @ theta - y.flatten()
    data_grad = residuals @ X
    # `l * theta` allocates a new array, so zeroing its first entry below
    # does not touch the caller's theta.
    penalty = l * theta
    penalty[0] = 0
    return (data_grad + penalty) / len(X)

def trainLinearReg(X, y, l):
    """Fit regularized linear regression and return the optimized theta.

    Minimizes ``costReg`` with ``scipy.optimize.minimize`` using the 'TNC'
    method and ``gradientReg`` as the analytic gradient, starting from an
    all-zeros vector with one entry per column of ``X``.

    Args:
        X: design matrix with one row per example.
        y: targets.
        l: regularization strength (lambda) passed through to the cost
            and gradient functions.

    Returns:
        The ``x`` attribute of the optimizer result.
    """
    initial_theta = np.zeros(X.shape[1])
    result = opt.minimize(
        costReg,
        initial_theta,
        args=(X, y, l),
        method='TNC',
        jac=gradientReg,
    )
    return result.x


def plot_learning_curve(X, y, Xval, yval, l):
    """Plot training and cross-validation error against training-set size.

    For each m from 1 to ``len(X)``, a model is trained on the first m
    rows of ``X``/``y`` with regularization strength ``l``. Both errors are
    then evaluated with lambda set to 0: the training error on those same
    m rows, the cross-validation error on all of ``Xval``/``yval``. The
    figure is displayed with ``plt.show()``.
    """
    sizes = range(1, len(X) + 1)  # always train on at least one example
    thetas = [trainLinearReg(X[:m], y[:m], l) for m in sizes]
    training_cost = [
        costReg(theta_m, X[:m], y[:m], 0) for m, theta_m in zip(sizes, thetas)
    ]
    cv_cost = [costReg(theta_m, Xval, yval, 0) for theta_m in thetas]

    plt.figure(figsize=(8, 5))
    plt.plot(sizes, training_cost, label='training cost')
    plt.plot(sizes, cv_cost, label='cv cost')
    plt.title('Learning curve for linear regression')
    plt.xlabel('Number of training examples')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)
    plt.show()

def genPolyFeatures(X, power):
    """Return a copy of X with polynomial feature columns appended.

    For each exponent k from 2 through ``power`` (inclusive), the k-th
    power of column 1 of ``X`` is appended as a new last column. Column 1
    itself already serves as the first-power term, so nothing is appended
    when ``power`` is below 2. The input array is not modified.
    """
    base = X[:, 1]
    higher_powers = [np.power(base, k) for k in range(2, power + 1)]
    return np.column_stack([X, *higher_powers])

def get_means_std(X):
    """Return the per-column mean and sample standard deviation of X.

    Both statistics are taken along axis 0. The standard deviation uses
    ``ddof=1``, i.e. the sample (n - 1) estimator.

    Returns:
        A ``(means, stds)`` tuple.
    """
    return np.mean(X, axis=0), np.std(X, axis=0, ddof=1)

def featureNormalize(myX, means, stds):
    """Return a standardized copy of myX, leaving column 0 untouched.

    Every column after the first has the matching entry of ``means``
    subtracted and is then divided by the matching entry of ``stds``.
    ``means[0]`` and ``stds[0]`` are ignored. The input is not modified.

    Integer (or other non-floating) input is promoted to float64 first.
    Without this, the in-place assignments below would cast the
    normalized values back to the integer dtype and silently truncate
    them. Floating-point input keeps its dtype.

    Args:
        myX: 2-D array of features.
        means: per-column means, indexed like the columns of ``myX``.
        stds: per-column standard deviations, indexed the same way.

    Returns:
        A new array with the same shape as ``myX``.
    """
    X_norm = np.asarray(myX).copy()
    if not np.issubdtype(X_norm.dtype, np.inexact):
        # Avoid truncating fractional results into an integer array.
        X_norm = X_norm.astype(np.float64)
    X_norm[:, 1:] = X_norm[:, 1:] - means[1:]
    X_norm[:, 1:] = X_norm[:, 1:] / stds[1:]
    return X_norm


def plot_fit(means, stds, l):
    """Plot the fitted polynomial curve over the training scatter plot.

    Trains on the module-level ``X_norm`` and ``y`` with regularization
    strength ``l``. The model is then evaluated on 50 evenly spaced x
    values in [-75, 55], which are expanded with ``genPolyFeatures`` (using
    the module-level ``power``) and standardized with the given ``means``
    and ``stds``. The curve is drawn as a blue dashed line and the figure
    is displayed with ``plt.show()``.
    """
    theta = trainLinearReg(X_norm, y, l)

    grid = np.linspace(-75, 55, 50)
    # Bias column of ones followed by the raw x values.
    design = np.column_stack((np.ones_like(grid), grid))
    design_norm = featureNormalize(genPolyFeatures(design, power), means, stds)

    plotData()
    plt.plot(grid, design_norm @ theta, 'b--')
    plt.show()


# ---------------1. Load and visualize the data----------------
path = 'ex5data1.mat'
data = loadmat(path)
X, y = data['X'], data['y']  # training set
Xval, yval = data['Xval'], data['yval']  # cross-validation set
Xtest, ytest = data['Xtest'], data['ytest']   # test set
X = np.insert(X,0,1,axis=1)   # prepend a column of ones; author's note: X=(12, 2), y=(12, 1)
Xval = np.insert(Xval ,0,1,axis=1)  # author's note: Xval=(21, 2), yval=(21, 1)
Xtest = np.insert(Xtest,0,1,axis=1)   # author's note: Xtest=(21, 2), ytest=(21, 1)
plotData()
plt.show()
# ---------------2. Regularized linear regression----------------
theta = np.ones(X.shape[1])
print(costReg(theta, X, y, 1))  # author's recorded output: 303.9931922202643
print(gradientReg(theta, X, y, 1))
fit_theta = trainLinearReg(X, y, 0)  # fit with no regularization (l=0)
plotData()
plt.plot(X[:,1], X @ fit_theta)  # overlay the fitted line on the scatter plot
plt.show()
# ---------------3. Learning curve----------------
plot_learning_curve(X, y, Xval, yval, 0)
# ---------------4. Polynomial regression----------------
power = 6  # expand the feature up to the 6th power of x

# Build the polynomial training matrix once; its column statistics are
# reused to standardize the training, validation and test matrices.
X_poly = genPolyFeatures(X, power)
train_means, train_stds = get_means_std(X_poly)
X_norm = featureNormalize(X_poly, train_means, train_stds)
Xval_norm, Xtest_norm = (
    featureNormalize(genPolyFeatures(split, power), train_means, train_stds)
    for split in (Xval, Xtest)
)

# Fitted curve and learning curve for each regularization strength.
for reg_strength in (0, 1, 100):
    plot_fit(train_means, train_stds, reg_strength)
    plot_learning_curve(X_norm, y, Xval_norm, yval, reg_strength)

# Sweep candidate regularization strengths: train with each one, then
# record the training and cross-validation errors.
lambdas = [0., 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1., 3., 10.]
errors_train, errors_val = [], []
for l in lambdas:
    theta = trainLinearReg(X_norm, y, l)
    errors_train.append(costReg(theta, X_norm, y, 0))  # evaluate the error with lambda = 0
    errors_val.append(costReg(theta, Xval_norm, yval, 0))

# Plot both error curves against lambda.
plt.figure(figsize=(8, 5))
plt.plot(lambdas, errors_train, label='Train')
plt.plot(lambdas, errors_val, label='Cross Validation')
plt.legend()
plt.xlabel('lambda')
plt.ylabel('Error')
plt.grid(True)
plt.show()
# Print the lambda with the smallest cross-validation error.
print(lambdas[np.argmin(errors_val)] )

# Retrain with lambda = 3 and report the unregularized cost on the test set.
# NOTE(review): 3 is hard-coded here -- presumably the value printed above; confirm.
theta = trainLinearReg(X_norm, y, 3)
print('test cost(l={}) = {}'.format(3, costReg(theta, Xtest_norm, ytest, 0)))


你可能感兴趣的:(机器学习)