Hands-On PyTorch: Perceptrons, a Basic Neural Network Implementation

Perceptrons

  • Role of the activation function: it prevents the hidden layers from collapsing, after composition, back into an equivalent single-layer linear model.

  • ReLU, the rectified linear unit, is the most popular activation function: ReLU(x) = max(x, 0). The squashing function sigmoid is also widely used: sigmoid(x) = 1 / (1 + exp(-x)); note that its derivative is sigmoid(x)(1 − sigmoid(x)), as verified in the sketch after this list.

  • torch.nn.Parameter():

    self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))  # inside an nn.Module's __init__
    W1 = nn.Parameter(torch.randn(
        num_inputs, num_hiddens, requires_grad=True) * 0.01)  # a trainable weight matrix
    params = [W1, b1, W2, b2]  # collected so they can be handed to the optimizer
    

    What it does: converts a fixed, non-trainable tensor into a trainable Parameter and registers it on the target module (the Parameter then appears in net.parameters(), so it can be updated during optimization).

    After this conversion, self.v becomes part of the model: a parameter that training can modify. That is exactly the point of the function, to let certain variables have their values adjusted throughout learning until they reach their optimum.

    The weight and bias inside **torch.nn.Linear()** are of type Parameter (typically multi-dimensional trainable tensors) and cannot be replaced with plain tensors.
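
    A minimal sketch (my addition, not from the original notes; the module name ToyNet is made up) illustrating both points above: only a tensor wrapped in nn.Parameter is registered on the module and exposed through net.named_parameters() for the optimizer, and autograd confirms the sigmoid derivative formula.

    import torch
    from torch import nn

    class ToyNet(nn.Module):  # hypothetical module, for illustration only
        def __init__(self, hidden_size):
            super().__init__()
            self.v = nn.Parameter(torch.zeros(hidden_size))  # registered as a parameter
            self.u = torch.zeros(hidden_size)  # plain tensor attribute: NOT registered

    net = ToyNet(4)
    print([name for name, _ in net.named_parameters()])  # ['v']

    # Autograd check: d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
    x = torch.randn(3, requires_grad=True)
    y = torch.sigmoid(x)
    y.sum().backward()
    print(torch.allclose(x.grad, y * (1 - y)))  # True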

Single Hidden Layer Implementation

From-scratch implementation:

# From-scratch implementation
import torch
import torchvision
from torch import nn
from torch.utils import data
from torchvision import transforms
from d2l import torch as d2l


# The built-in data iterator shuffles all the samples and reads
# minibatches (of size batch_size) without bias
def get_dataloader_workers():  # use 4 worker processes to read the data
    return 4


# Put all the components together
def load_data_fashion_mnist(batch_size, resize=None):
    # ToTensor converts the image data from PIL type to 32-bit floats and
    # divides by 255, so every pixel value lies between 0 and 1
    trans = [transforms.ToTensor()]
    if resize:  # optional argument to resize the images to another shape
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    # Fetch the dataset; note the path must start with ./, not a bare /
    mnist_train = torchvision.datasets.FashionMNIST(
        root='./mnist/', train=True, transform=trans, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(
        root='./mnist/', train=False, transform=trans, download=True)
    return (data.DataLoader(mnist_train, batch_size, shuffle=True,
                            num_workers=get_dataloader_workers()),
            data.DataLoader(mnist_test, batch_size, shuffle=False,
                            num_workers=get_dataloader_workers()))


batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size)

# Initialize model parameters
# Goal: an MLP with one hidden layer of 256 units, 28*28 inputs, and 10 outputs
# Each layer keeps a weight matrix and a bias vector, so there are two sets of parameters
num_inputs, num_outputs, num_hiddens = 784, 10, 256
W1 = nn.Parameter(torch.randn(
    num_inputs, num_hiddens, requires_grad=True) * 0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))
W2 = nn.Parameter(torch.randn(
    num_hiddens, num_outputs, requires_grad=True) * 0.01)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))
params = [W1, b1, W2, b2]


# Activation function
def relu(X):
    a = torch.zeros_like(X)  # a zero tensor with the same shape
    return torch.max(X, a)


# The model: the network's forward computation written in algebraic form
def net(X):
    X = X.reshape((-1, num_inputs))  # flatten each image into a row of length num_inputs
    H = relu(X@W1 + b1)  # @ denotes matrix multiplication
    return H @ W2 + b2


# Training
loss = nn.CrossEntropyLoss(reduction='none')
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)
d2l.predict_ch3(net, test_iter)

Concise implementation:

# Concise implementation: Sequential chains the layers and activation functions
net = nn.Sequential(
    nn.Flatten(),  # flatten the input
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10))


def init_weights(m):
    if type(m) == nn.Linear:  # the standard pattern
        nn.init.normal_(m.weight, std=0.01)  # note the trailing underscore (in-place)


net.apply(init_weights)  # applies the initializer recursively
# The training loop is identical to the softmax case: the payoff of modularity!
batch_size, lr, num_epochs = 256, 0.1, 10
train_iter, test_iter = load_data_fashion_mnist(batch_size)
loss = nn.CrossEntropyLoss(reduction='none')
trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

Regularization

  • Weight decay is one of the most widely used regularization techniques; it is also known as L2 regularization (ridge regression).

    The technique measures a function's complexity by its distance from zero: for a linear function f(x) = wᵀx, by the squared norm of its weight vector, ‖w‖². The most common way to keep the weight vector small is to add that norm to the loss being minimized as a penalty term.

    The penalized objective is L(w, b) + (λ/2)‖w‖², and the corresponding SGD step shrinks the weights on every update: w ← (1 − ηλ)w − η ∂L/∂w (equivalence checked in the sketch below).
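
    A minimal sketch (my addition, not from the original notes) checking that the weight_decay argument of torch.optim.SGD implements exactly this penalty: its update coincides with adding an explicit (λ/2)‖w‖² term to the loss by hand.

    import torch

    torch.manual_seed(0)
    w1 = torch.randn(3, requires_grad=True)
    w2 = w1.detach().clone().requires_grad_(True)
    x, wd, lr = torch.randn(3), 0.1, 0.5

    # By hand: the gradient of (wd / 2) * ||w||^2 is wd * w
    loss1 = (w1 * x).sum() + (wd / 2) * (w1 ** 2).sum()
    loss1.backward()
    with torch.no_grad():
        w1 -= lr * w1.grad

    # Built-in: weight_decay adds wd * w to the gradient before the step
    opt = torch.optim.SGD([w2], lr=lr, weight_decay=wd)
    loss2 = (w2 * x).sum()
    loss2.backward()
    opt.step()

    print(torch.allclose(w1, w2))  # True: the two updates coincide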

Full implementation:

import torch
from torch import nn
from d2l import torch as d2l

# The labels are corrupted by Gaussian noise with mean 0 and standard deviation 0.01.
# To make overfitting pronounced, we raise the problem's dimensionality to d = 200
# and use a small training set of only 20 examples
n_train, n_test, num_inputs, batch_size = 20, 100, 200, 5
true_w, true_b = torch.ones((num_inputs, 1)) * 0.01, 0.05
train_data = d2l.synthetic_data(true_w, true_b, n_train)
train_iter = d2l.load_array(train_data, batch_size)
test_data = d2l.synthetic_data(true_w, true_b, n_test)
test_iter = d2l.load_array(test_data, batch_size, is_train=False)


# A function that randomly initializes the model parameters
def init_params():
    w = torch.normal(0, 1, size=(num_inputs, 1), requires_grad=True)
    b = torch.zeros(1, requires_grad=True)
    return [w, b]


# L2 norm penalty
def l2_penalty(w):
    return torch.sum(w.pow(2)) / 2


def train(lambd):
    w, b = init_params()
    net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss
    num_epochs, lr = 100, 0.003
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',
                            xlim=[5, num_epochs], legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            # Add the L2 penalty term;
            # broadcasting makes l2_penalty(w) a vector of length batch_size
            l = loss(net(X), y) + lambd * l2_penalty(w)
            l.sum().backward()
            d2l.sgd([w, b], lr, batch_size)
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1, (d2l.evaluate_loss(net, train_iter, loss),
                                     d2l.evaluate_loss(net, test_iter, loss)))
    print('L2 norm of w:', torch.norm(w).item())


train(lambd=0)  # overfits: training error falls but test error does not. L2 norm of w: 14.305890083312988
train(lambd=3)  # training error rises but test error falls. L2 norm of w: 0.3574880957603454

Concise implementation:

def train_concise(wd):
    net = nn.Sequential(nn.Linear(num_inputs, 1))
    for param in net.parameters():
        param.data.normal_()
    loss = nn.MSELoss(reduction='none')
    num_epochs, lr = 100, 0.003
    # The bias parameter is not decayed
    trainer = torch.optim.SGD([
        {"params": net[0].weight, 'weight_decay': wd},
        {"params": net[0].bias}], lr=lr)
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',
                            xlim=[5, num_epochs], legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            trainer.zero_grad()
            l = loss(net(X), y)
            l.mean().backward()
            trainer.step()
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1,
                         (d2l.evaluate_loss(net, train_iter, loss),
                          d2l.evaluate_loss(net, test_iter, loss)))
    print('L2 norm of w:', net[0].weight.norm().item())


train_concise(0)
train_concise(3)

Kaggle-house_price

  • The dataset has 80 features in total; the first is the ID, which carries no predictive information and must be dropped before feeding the model. Preprocess the discrete and the continuous variables separately (a fixed recipe worth memorizing; see the sketch after this list), then feed the result to the model.

  • A linear model is unlikely to win the competition, but it provides a sanity check on whether the data contains meaningful signal. If we cannot do better than random guessing here, there is probably a data-processing bug. If all goes well, the linear model serves as a baseline, giving an intuitive sense of how much the best models improve on a simple one.

  • For house prices, as for stock prices, we care about the relative error (y − ŷ)/y rather than the absolute error y − ŷ. One way to capture this is to measure the discrepancy between the logarithms of the predicted and true prices (in fact, this is the metric the competition officially uses to score submissions). The root-mean-squared error on log prices:

    sqrt( (1/n) · Σᵢ (log(yᵢ) − log(ŷᵢ))² )
  • Make good use of K-fold cross-validation.
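
A toy sketch (my own illustration; the tiny DataFrame is made up) of the fixed preprocessing recipe mentioned above: standardize the numeric columns, fill their missing values with 0 (the mean after standardization), and one-hot encode everything else.

import pandas as pd

df = pd.DataFrame({'area': [50.0, 80.0, None], 'type': ['A', 'B', None]})
num_cols = df.dtypes[df.dtypes != 'object'].index  # names of the numeric columns
df[num_cols] = df[num_cols].apply(lambda x: (x - x.mean()) / x.std())  # standardize
df[num_cols] = df[num_cols].fillna(0)  # 0 is the mean after standardization
df = pd.get_dummies(df, dummy_na=True)  # NaN becomes its own indicator column
print(df)  # 'type' expands to type_A, type_B, type_nan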

import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

train_data = pd.read_csv('./Kaggle/house_price/train.csv')
test_data = pd.read_csv('./Kaggle/house_price/test.csv')
print(train_data.shape)  # (1460, 81)
print(test_data.shape)  # (1459, 80)

# Preprocessing: the data contains many NAs; combine the train-set and test-set
# features so they are processed together, then split them apart at the end
# train_data: drop Id and SalePrice; test_data: drop Id
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

'''
Process the columns whose dtype is not string: standardize ((x - mean) / std)
all_features.dtypes: the dtype of each column
all_features.dtypes != 'object': per-column booleans, True where the dtype is not object
all_features.dtypes[all_features.dtypes != 'object']: drops the object-typed columns
all_features.dtypes[all_features.dtypes != 'object'].index: the names of the non-object columns
Note: in pandas, string columns have dtype object, not string
'''
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index  # column names
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))  # standardize
all_features[numeric_features] = all_features[numeric_features].fillna(0)  # the means are now 0, so this fills with the mean

# One-hot encode the discrete values; by default only object columns are processed
# dummy_na=True treats NA (missing) as a valid feature value and creates an indicator for it
all_features = pd.get_dummies(all_features, dummy_na=True)
print(all_features.shape)  # (2919,331)

# Split back into training and test sets; mind the variable types, shapes, and function usage
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(
    train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)


def get_net():
    net = nn.Sequential(nn.Linear(train_features.shape[1], 1))
    return net


# Root-mean-squared error between log prices
loss = nn.MSELoss()


def log_rmse(net, features, labels):
    # torch.clamp(input, min, max) clips input to the interval [min, max]
    clipped = torch.clamp(net(features), 1, float('inf'))
    r = torch.sqrt(loss(torch.log(clipped),  # clamping to >= 1 keeps log >= 0
                        torch.log(labels)))
    return r.item()  # x.item() extracts the scalar value at full precision


def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []  # lists recording the losses
    train_iter = d2l.load_array((train_features, train_labels), batch_size)  # shuffled minibatch iterator
    # Adam is relatively insensitive to the initial learning rate
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)  # weight decay
    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)  # raw loss
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))  # log-rmse metric
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls


'''
K-fold cross-validation:
1. Split all the samples into k subsets of equal size;
2. Visit the k subsets in turn, each time taking the current subset as the
   validation set and all remaining samples as the training set, then train
   and evaluate the model;
3. Average the k evaluation metrics to get the final score.
k: number of folds; i: use the i-th fold for validation; X: features; y: labels
'''
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k  # size of each fold; // is integer division
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]  # take the j-th part
        if j == i:  # the held-out validation fold
            X_valid, y_valid = X_part, y_part
        elif X_train is None:  # the first training part
            X_train, y_train = X_part, y_part
        else:  # append to the training set; the second argument is the dim to concatenate along (it must exist; 0 = rows)
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid


# Train k separate models here to check for overfitting (rather than using the folds to train one model)
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)  # X_train, y_train, X_valid, y_valid
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k


k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-fold validation: avg train log rmse: {float(train_l):f}, '
      f'avg valid log rmse: {float(valid_l):f}')


# Kaggle prediction: no held-out set here, since the hyperparameters were already chosen via K-fold; train on all the training data
def train_and_pred(train_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'train log rmse: {float(train_ls[-1]):f}')
    # Apply the network to the test set
    preds = net(test_features).detach().numpy()
    # Reformat the predictions for export to Kaggle
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission1.csv', index=False)


train_and_pred(train_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)
