前文回顾:权重衰退、正则化
通常将丢弃法作用在隐藏全连接层的输出上,例如:
$$\begin{aligned} \vec h &= \sigma(W_1 \vec x + \vec b_1) \\ \vec h' &= \mathrm{dropout}(\vec h) \\ \vec o &= W_2 \vec h' + \vec b_2 \\ \vec y &= \mathrm{softmax}(\vec o) \end{aligned}$$
假设我们对一个单隐藏层的网络使用丢弃法,隐藏层的部分元素可能会被变为0。
我们实现dropout_layer
函数,该函数以dropout
的概率丢弃张量输入X
中的元素。
import torch
from torch import nn
from d2l import torch as d2l
def dropout_layer(X, dropout):
    """Zero each element of X independently with probability `dropout`,
    scaling the survivors by 1/(1-dropout) so the expected value is unchanged.

    Args:
        X: input tensor of any shape.
        dropout: drop probability, must lie in [0, 1].

    Returns:
        A tensor with X's shape, with elements dropped and rescaled.
    """
    assert 0 <= dropout <= 1
    if dropout == 1:
        # Drop everything.
        return torch.zeros_like(X)
    if dropout == 0:
        # Keep everything.
        return X
    # Bug fix: torch.rand draws Uniform(0, 1) samples, which the keep/drop
    # comparison requires; the original torch.randn draws from the standard
    # normal, so it would keep ~P(z > p) ≈ 50% of elements regardless of
    # the requested `dropout` rate.
    mask = (torch.rand(X.shape) > dropout).float()
    return mask * X / (1.0 - dropout)
上述代码中,torch.rand(X.shape) 生成一个形状等同于X、元素在0到1之间均匀分布的随机张量(注意:torch.randn 生成的是标准正态分布样本,不适用于这里的保留/丢弃判断)。
我们可以来测试一下dropout
函数:
# Sanity-check dropout_layer at the two degenerate rates and one in between:
# p=0 keeps everything, p=1 zeroes everything, p=0.5 drops ~half and doubles
# the survivors.
X = torch.arange(16, dtype=torch.float32).reshape((2, 8))
print(X)
for p in (0., 0.5, 1.):
    print(dropout_layer(X, p))
我们定义一个具有两个隐藏层的多层感知机,每个隐藏层包含256个单元。
# MLP with two hidden layers: Fashion-MNIST images flatten to 28*28=784
# inputs, there are 10 classes, and both hidden layers use 256 units.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256
# Common practice: lower drop rate near the input, higher rate deeper in.
dropout1, dropout2 = 0.2, 0.5
class Net(nn.Module):
    """MLP with two hidden ReLU layers that applies dropout_layer after each
    hidden activation while `is_training` is True.

    Args:
        num_inputs: flattened input dimension (784 for Fashion-MNIST).
        num_outputs: number of output classes.
        num_hiddens1: width of the first hidden layer.
        num_hiddens2: width of the second hidden layer.
        is_training: when True, forward() applies dropout with the
            module-level rates dropout1 and dropout2.
    """

    def __init__(self, num_inputs, num_outputs,
                 num_hiddens1, num_hiddens2, is_training=True):
        # Python 3 zero-argument super() instead of super(Net, self).
        super().__init__()
        self.num_inputs = num_inputs
        # Deliberately shadows nn.Module's built-in `training` flag,
        # mirroring the book's from-scratch implementation.
        self.training = is_training
        self.lin1 = nn.Linear(num_inputs, num_hiddens1)
        self.lin2 = nn.Linear(num_hiddens1, num_hiddens2)
        self.lin3 = nn.Linear(num_hiddens2, num_outputs)
        self.relu = nn.ReLU()

    def forward(self, X):
        # Flatten to (batch, num_inputs) before the first linear layer.
        H1 = self.relu(self.lin1(X.reshape((-1, self.num_inputs))))
        # Idiomatic truth test instead of `== True`; dropout only in training.
        if self.training:
            H1 = dropout_layer(H1, dropout1)
        H2 = self.relu(self.lin2(H1))
        if self.training:
            H2 = dropout_layer(H2, dropout2)
        # Raw logits: CrossEntropyLoss applies softmax internally.
        out = self.lin3(H2)
        return out
# Train and evaluate the from-scratch model.
num_epochs, lr, batch_size = 10, 0.5, 256
# reduction='none' keeps per-sample losses, as d2l.train_ch3 expects.
loss = nn.CrossEntropyLoss(reduction='none')
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
# Bug fix: `net` was never instantiated before net.parameters() was called,
# which raised a NameError; construct the model first.
net = Net(num_inputs, num_outputs, num_hiddens1, num_hiddens2)
trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
# Concise implementation: the same architecture expressed with the built-in
# nn.Dropout layers (active only in train() mode) instead of dropout_layer.
_concise_layers = [
    nn.Flatten(),
    nn.Linear(784, 256), nn.ReLU(), nn.Dropout(dropout1),
    nn.Linear(256, 256), nn.ReLU(), nn.Dropout(dropout2),
    nn.Linear(256, 10),
]
net_concise = nn.Sequential(*_concise_layers)
# 简洁实现 初始化权重
def init_weights(m):
    """Initialize every nn.Linear weight from N(0, 0.01^2).

    Bug fix: the original used std=0, which sets every weight to exactly
    zero — all hidden units then compute identical (zero) outputs and
    gradients, so the network cannot train. std=0.01 is the conventional
    small-Gaussian initialization used throughout the book.
    """
    # isinstance is the idiomatic type check (type(m) == nn.Linear would
    # also reject subclasses only by accident of d2l's models).
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std=0.01)
# Bug fix: the initializer must be applied to the concise model defined
# above; `net` refers to the from-scratch model (or is undefined here).
net_concise.apply(init_weights)
# Train the concise model with the same optimizer settings and data
# iterators as the from-scratch version.
trainer = torch.optim.SGD(net_concise.parameters(), lr=lr)
d2l.train_ch3(net_concise, train_iter, test_iter, loss, num_epochs, trainer)