Andrew Ng Machine Learning Homework: Bias and Variance

1. Preface

The homework assignments and data for Andrew Ng's Machine Learning course can be downloaded from Coursera; you only need to register and add the course. For that reason the problem statements and data are not reproduced here; anyone who needs them can download them directly.
Homework and data download: Andrew Ng's Machine Learning course

2. Bias and Variance

The bias-and-variance assignment covers quite a lot of ground; the main tasks are:

  1. Visualize the training set
  2. Fit a one-variable linear regression, visualize the fit, and plot its learning curve
  3. Fit a polynomial regression, visualize the fit, and plot its learning curve
  4. Plot how the cost on each dataset changes as the regularization parameter lambda varies

The full code, with detailed comments, is below, so I won't walk through every line here.
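
For reference, the regularized cost the code minimizes and its gradient are as follows (this is just a restatement of what compute_costs and compute_gradient below implement):

$$J(\theta) = \frac{1}{2m}\left[\sum_{i=1}^{m}\left(\theta^{T}x^{(i)} - y^{(i)}\right)^{2} + \lambda\sum_{j=1}^{n}\theta_{j}^{2}\right]$$

$$\frac{\partial J}{\partial \theta_{j}} = \frac{1}{m}\sum_{i=1}^{m}\left(\theta^{T}x^{(i)} - y^{(i)}\right)x_{j}^{(i)} + \frac{\lambda}{m}\theta_{j}\qquad (\theta_{0}\ \text{is not regularized})$$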

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io as scio  # library for loading .mat files
from scipy.optimize import minimize


# Load the dataset
def input_data():
    dataFile = 'machine-learning-ex5\\machine-learning-ex5\\ex5\\ex5data1.mat'
    # load the .mat file
    data = scio.loadmat(dataFile)
    # training set
    train_X = data['X']
    train_y = data['y']
    # test set
    test_X = data['Xtest']
    test_y = data['ytest']
    # cross-validation set
    val_X = data['Xval']
    val_y = data['yval']
    return train_X, train_y, test_X, test_y, val_X, val_y


# Visualize the data
def visualize_data(X, y):
    fig, ax = plt.subplots(1, 1)
    ax.scatter(X, y)        # scatter plot of the raw data
    ax.set_xticks([k for k in range(-50, 50, 10)])  # x-axis ticks
    ax.set_yticks([k for k in range(0, 45, 5)])     # y-axis ticks
    ax.set_xlabel('Change in water level(x)')       # x-axis label
    ax.set_ylabel('Water flowing out of the dam (y)')   # y-axis label
    plt.show()


# Compute the regularized cost
def compute_costs(theta, X, y, lamda):
    theta = theta.reshape(theta.shape[0], 1)     # restore theta from 1-D to a column vector
    m = X.shape[0]      # number of examples
    costJ1 = np.sum(np.power(X @ theta - y, 2))     # squared-error part of the cost
    costJ2 = np.sum(np.power(theta[1:, 0], 2)) * lamda  # regularization part (theta_0 excluded)
    return (costJ1 + costJ2) / (2 * m)      # combine and return


# Compute the regularized gradient
def compute_gradient(theta, X, y, lamda):
    theta = theta.reshape(theta.shape[0], 1)        # restore theta from 1-D to a column vector
    m = X.shape[0]      # number of examples
    gradient = np.sum((X @ theta - y) * X, axis=0)      # unregularized part of the gradient
    gradient = gradient.reshape(gradient.shape[0], 1)   # restore gradient to a column vector
    reg = theta * lamda     # regularization part of the gradient
    reg[0, 0] = 0     # theta_0 is not regularized, so zero out its term
    return ((gradient + reg) / m).flatten()     # flattened, since minimize expects jac to return a 1-D array
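

# Optional sanity check (my sketch, not part of the assignment): compare the
# analytic gradient against a centered finite-difference approximation of the
# cost. The returned maximum difference should be very small.
def check_gradient(theta, X, y, lamda, eps=1e-4):
    theta = np.asarray(theta, dtype=float).flatten()
    numeric = np.zeros_like(theta)
    for j in range(theta.size):
        step = np.zeros_like(theta)
        step[j] = eps
        numeric[j] = (compute_costs(theta + step, X, y, lamda)
                      - compute_costs(theta - step, X, y, lamda)) / (2 * eps)
    analytic = np.asarray(compute_gradient(theta, X, y, lamda)).flatten()
    return np.max(np.abs(numeric - analytic))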


# Fit the linear model
def fit_linear_regression(theta, X, y, lamda):
    # call minimize to search for the optimal theta
    res = minimize(fun=compute_costs, x0=theta, args=(X, y, lamda), method='TNC',
                   jac=compute_gradient, options={'maxiter': 100})
    final_theta = res.x     # optimal theta (as a 1-D array)
    return final_theta

# Plot the linear regression fit
def plot_linear_regression(final_theta, train_X, train_y):
    px = np.linspace(np.min(train_X[:, 1]), np.max(train_X[:, 1]), 100)     # generate x values
    px = px.reshape(px.shape[0], 1)     # restore the data from 1-D to 2-D
    px = np.insert(px, 0, 1, axis=1)    # insert a column of ones
    py = px @ final_theta       # predicted values

    # plot the scatter of the data and the fitted line
    fig, ax = plt.subplots(1, 1)
    ax.scatter(train_X[:, 1], train_y)
    ax.plot(px[:, 1], py)
    ax.set_xticks([k for k in range(-50, 50, 10)])
    ax.set_yticks([k for k in range(-5, 45, 5)])
    ax.set_xlabel('Change in water level(x)')
    ax.set_ylabel('Water flowing out of the dam (y)')
    plt.show()


# Plot the learning curves of the linear model
def plot_linear_learning_curves(train_X, train_y, val_X, val_y, lamda):
    error_train = []    # training-set costs
    error_val = []      # cross-validation costs
    for i in range(0, train_X.shape[0]):        # grow the training set one example at a time
        theta = np.ones((train_X.shape[1], 1))      # initialize theta
        # fit the linear regression and get the optimal theta
        theta = fit_linear_regression(theta, train_X[0:i + 1, :], train_y[0:i + 1, :], lamda)
        # training cost under the optimal theta; note it uses only the first (i+1) examples, with lamda = 0
        train_error = compute_costs(theta, train_X[0:i + 1, :], train_y[0:i + 1, :], 0)
        # cross-validation cost under the optimal theta; note it uses the whole validation set, with lamda = 0
        val_error = compute_costs(theta, val_X, val_y, 0)
        error_train.append(train_error)     # record the training cost for this size
        error_val.append(val_error)         # record the cross-validation cost

    # plot the two learning curves
    fig, ax = plt.subplots(1, 1)
    ax.plot([i for i in range(1, train_X.shape[0] + 1)], error_train, c='blue', label='Train')
    ax.plot([i for i in range(1, train_X.shape[0] + 1)], error_val, c='green', label='Cross Validation')
    ax.set_xticks(np.arange(0, 13, 2))
    ax.set_yticks(np.arange(0, 151, 50))
    plt.legend()
    plt.show()
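
# Reading the plot above: both curves converge to a comparatively high error
# as more training examples are added, which is the high-bias (underfitting)
# signature one expects from fitting a straight line to this data.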


# Map the features to higher polynomial degrees
def map_polynomial_features(X, p):
    for i in range(2, p + 1):   # powers 2 through p
        X = np.insert(X, X.shape[1], values=np.power(X[:, 1], i), axis=1)
    return X
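
# A quick illustration (assumes X already carries the bias column):
#   [[1, 2],                      [[1, 2, 4, 8],
#    [1, 3]]   with p = 3  -->     [1, 3, 9, 27]]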


# Feature scaling (subtract the column mean, divide by the standard deviation)
def feature_normalize(data, d, dataMean, dataStd):
    for i in range(1, d + 1):
        for j in range(0, data.shape[0]):  # visit every value in column i
            data[j, i] = (data[j, i] - dataMean[i]) / dataStd[i]  # normalize using the formula from the lectures
    return data


# Compute the column means and standard deviations of X
def get_means_stds(X):
    means = np.mean(X, axis=0)   # mean of each column
    stds = np.std(X, axis=0)     # standard deviation of each column
    return means, stds
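

# A vectorized equivalent of feature_normalize (my sketch; unlike the loop
# above, it returns a copy instead of modifying the array in place):
def feature_normalize_vec(data, d, dataMean, dataStd):
    out = data.copy()
    out[:, 1:d + 1] = (out[:, 1:d + 1] - dataMean[1:d + 1]) / dataStd[1:d + 1]
    return out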


# Fit the polynomial model
def fit_polynomical_regression(theta, train_X, train_y, lamda, d, train_mean, train_std):
    poly_features = map_polynomial_features(train_X, d)     # map the training set to higher degrees
    nor_poly_features = feature_normalize(poly_features, d, train_mean, train_std)     # normalize
    theta = np.ones((nor_poly_features.shape[1], 1))     # re-initialize theta (the theta argument is overwritten here)
    final_theta = fit_linear_regression(theta, nor_poly_features, train_y, lamda)   # reuse the linear-regression fit
    final_theta = final_theta.reshape(final_theta.shape[0], 1)   # restore the optimal theta to a column vector
    return final_theta


# Plot the polynomial regression fit
def plot_polynomical_regression(final_theta, train_X, train_y, d, train_mean, train_std):
    x = np.linspace(-70, 60, 100)  # generate 100 points between -70 and 60
    xx = x.reshape(x.shape[0], 1)  # restore the data from 1-D to 2-D
    xx = np.insert(xx, 0, 1, axis=1)  # insert a column of ones
    xx = map_polynomial_features(xx, d)  # map the generated points to higher degrees
    # note: normalize with the training set's mean and standard deviation, not statistics of the generated points
    xx = feature_normalize(xx, d, train_mean, train_std)  # normalize
    yy = xx @ final_theta  # predicted values

    # plot the scatter of the data and the fitted curve
    fig, ax = plt.subplots(1, 1)
    ax.scatter(train_X[:, 1], train_y, c='red')
    ax.plot(x, yy.flatten(), c='blue', linestyle='--')
    ax.set_xticks([k for k in range(-80, 81, 20)])
    ax.set_yticks([k for k in range(-60, 41, 10)])
    ax.set_xlabel('Change in water level(x)')
    ax.set_ylabel('Water flowing out of the dam (y)')
    plt.show()


# Plot the learning curves of the polynomial model
def plot_poly_learning_curves(train_X, train_y, val_X, val_y, lamda, d, train_mean, train_std, val_mean, val_std):
    error_train = []    # training-set costs
    error_val = []      # cross-validation costs
    for i in range(0, train_X.shape[0]):    # grow the training set one example at a time
        theta = np.ones((d + 1, 1))       # initialize theta
        # fit the polynomial regression on the first (i+1) examples and get the optimal theta
        theta = fit_polynomical_regression(theta, train_X[0:i + 1, :], train_y[0:i + 1, :], lamda, d, train_mean, train_std)
        # map the training set to higher degrees
        train_poly_features = map_polynomial_features(train_X, d)
        # normalize; note this uses the mean and standard deviation of the whole training set
        train_nor_poly_features = feature_normalize(train_poly_features, d, train_mean, train_std)
        # cost on the first (i+1) training examples
        train_error = compute_costs(theta, train_nor_poly_features[0:i + 1, :], train_y[0:i + 1, :], 0)
        # map the whole cross-validation set to higher degrees
        val_poly_features = map_polynomial_features(val_X, d)
        # normalize the whole cross-validation set; note this also uses the training set's mean and standard deviation
        val_nor_poly_features = feature_normalize(val_poly_features, d, train_mean, train_std)
        # cost on the whole cross-validation set
        val_error = compute_costs(theta, val_nor_poly_features, val_y, 0)
        error_train.append(train_error)     # record the training cost
        error_val.append(val_error)         # record the cross-validation cost

    # plot the two learning curves
    fig, ax = plt.subplots(1, 1)
    ax.plot([i for i in range(1, train_X.shape[0] + 1)], error_train, c='blue', label='Train')
    ax.plot([i for i in range(1, train_X.shape[0] + 1)], error_val, c='green', label='Cross Validation')
    ax.set_xticks(np.arange(0, 13, 2))
    ax.set_yticks(np.arange(0, 101, 10))
    plt.legend()
    plt.show()
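
# Reading the plot above: with lamda = 0 the training error stays close to
# zero while the cross-validation error remains much higher; that gap is the
# high-variance (overfitting) signature one expects from an unregularized
# high-degree polynomial.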


# Plot the training, cross-validation, and test costs as lamda varies
def plot_lamda_curve(theta, train_X, train_y, val_X, val_y, test_X, test_y, d, train_mean, train_std):
    lamda = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]    # candidate lamda values
    error_train = []       # training-set costs
    error_val = []         # cross-validation costs
    error_test = []        # test-set costs
    for k in lamda:         # try each lamda in turn
        theta = np.ones((d + 1, 1))     # initialize theta
        # fit the polynomial regression with this lamda
        theta = fit_polynomical_regression(theta, train_X, train_y, k, d, train_mean, train_std)

        train_poly_features = map_polynomial_features(train_X, d)   # map the training set to higher degrees
        # note: normalized with the training set's mean and standard deviation
        train_nor_poly_features = feature_normalize(train_poly_features, d, train_mean, train_std)   # normalize the training set
        train_error = compute_costs(theta, train_nor_poly_features, train_y, 0)     # training cost

        val_poly_features = map_polynomial_features(val_X, d)       # map the cross-validation set to higher degrees
        # note: also normalized with the training set's mean and standard deviation
        val_nor_poly_features = feature_normalize(val_poly_features, d, train_mean, train_std)    # normalize the cross-validation set
        val_error = compute_costs(theta, val_nor_poly_features, val_y, 0)   # cross-validation cost

        test_poly_features = map_polynomial_features(test_X, d)     # map the test set to higher degrees
        # note: also normalized with the training set's mean and standard deviation
        test_nor_poly_features = feature_normalize(test_poly_features, d, train_mean, train_std)    # normalize the test set
        test_error = compute_costs(theta, test_nor_poly_features, test_y, 0)    # test cost

        error_train.append(train_error)     # record the training cost
        error_val.append(val_error)         # record the cross-validation cost
        error_test.append(test_error)       # record the test cost

    # plot the three cost curves against lamda
    fig, ax = plt.subplots(1, 1)
    ax.plot(lamda, error_train, label='Train', c='b')
    ax.plot(lamda, error_val, label='Cross Validation', c='g')
    ax.plot(lamda, error_test, label='Test', c='r')
    ax.set_xticks(np.arange(0, 11, 1))
    ax.set_yticks(np.arange(0, 21, 2))
    plt.legend()
    plt.show()
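

# Follow-up sketch (my addition, not part of the original assignment): given
# the lamda list and the cross-validation costs collected above, the best
# regularization strength is simply the one with the lowest validation cost.
def select_best_lamda(lamda_list, val_errors):
    return lamda_list[int(np.argmin(val_errors))]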


d = 6     # degree of the polynomial
lamda = 0   # regularization strength
train_X, train_y, test_X, test_y, val_X, val_y = input_data()   # load the three datasets
train_X = np.insert(train_X, 0, 1, axis=1)  # add a column of ones to the training set
test_X = np.insert(test_X, 0, 1, axis=1)    # add a column of ones to the test set
val_X = np.insert(val_X, 0, 1, axis=1)      # add a column of ones to the cross-validation set

train_poly_X = map_polynomial_features(train_X, d)  # map the training set to higher degrees
test_poly_X = map_polynomial_features(test_X, d)    # map the test set to higher degrees
val_poly_X = map_polynomial_features(val_X, d)      # map the cross-validation set to higher degrees

train_mean, train_std = get_means_stds(train_poly_X)   # mean and std of the polynomial training features
test_mean, test_std = get_means_stds(test_poly_X)      # mean and std of the polynomial test features
val_mean, val_std = get_means_stds(val_poly_X)         # mean and std of the polynomial validation features

theta = np.ones((2, 1))     # initialize theta for the one-variable linear regression
# fit the one-variable linear regression
final_theta = fit_linear_regression(theta, train_X, train_y, lamda)
# plot the linear regression fit
plot_linear_regression(final_theta, train_X, train_y)
# plot the linear regression learning curves
plot_linear_learning_curves(train_X, train_y, val_X, val_y, lamda)
# fit the polynomial regression
final_theta = fit_polynomical_regression(theta, train_X, train_y, lamda, d, train_mean, train_std)
# plot the polynomial regression fit
plot_polynomical_regression(final_theta, train_X, train_y, d, train_mean, train_std)
# plot the polynomial regression learning curves
plot_poly_learning_curves(train_X, train_y, val_X, val_y, lamda, d, train_mean, train_std, val_mean, val_std)
# plot how the cost on each dataset changes with lamda
plot_lamda_curve(theta, train_X, train_y, val_X, val_y, test_X, test_y, d, train_mean, train_std)

Results:
One-variable linear regression fit:
[Figure 1]
One-variable linear regression learning curve:
[Figure 2]
Polynomial regression fit:
[Figure 3]
Polynomial regression learning curve:
[Figure 4]
Effect of lamda on the cost of each dataset:
[Figure 5]
