代码:
# weight_decay.py
import torch
import torch.nn as nn
import numpy as np
from matplotlib import pyplot as plt
def init_params():
w = torch.randn((num_inputs, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)
return [w, b]
def l2_penalty(w):
return (w**2).sum() / 2
def squared_loss(y_hat, y):
# 注意这里返回的是向量, 另外, pytorch里的MSELoss并没有除以 2
return ((y_hat - y.view(y_hat.size())) ** 2) / 2
def linreg(X, w, b):
return torch.mm(X, w) + b
def sgd(params, lr, batch_size):
# 为了和原书保持一致,这里除以了batch_size,但是应该是不用除的,因为一般用PyTorch计算loss时就默认已经
# 沿batch维求了平均了。
for param in params:
param.data -= lr * param.grad / batch_size
def semilogy_draw(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
legend=None, figsize=(3.5, 2.5)):
# plt.figure(1, figsize)
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.semilogy(x_vals, y_vals)
if x2_vals and y2_vals:
plt.semilogy(x2_vals, y2_vals, linestyle=':')
plt.legend(legend)
plt.savefig('weight_decay.png')
plt.show()
def fit_and_plot(lambd):
w, b = init_params()
train_ls, test_ls = [], []
for _ in range(num_epochs):
for X, y in train_iter:
# 添加了L2范数惩罚项
l = loss(net(X, w, b), y) + lambd * l2_penalty(w) # lambd为0时,权重衰减项不起作用
l = l.sum()
if w.grad is not None:
w.grad.data.zero_()
b.grad.data.zero_()
l.backward()
sgd([w, b], lr, batch_size)
train_ls.append(loss(net(train_features, w, b), train_labels).mean().item())
test_ls.append(loss(net(test_features, w, b), test_labels).mean().item())
semilogy_draw(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
range(1, num_epochs + 1), test_ls, ['train', 'test'])
print('L2 norm of w:', w.norm().item())
def fit_and_plot_pytorch(wd):
# 对权重参数衰减。权重名称一般是以weight结尾
net = nn.Linear(num_inputs, 1)
nn.init.normal_(net.weight, mean=0, std=1)
nn.init.normal_(net.bias, mean=0, std=1)
optimizer_w = torch.optim.SGD(params=[net.weight], lr=lr, weight_decay=wd) # 对权重参数衰减
optimizer_b = torch.optim.SGD(params=[net.bias], lr=lr) # 不对偏差参数衰减
train_ls, test_ls = [], []
for _ in range(num_epochs):
for X, y in train_iter:
l = loss(net(X), y).mean()
optimizer_w.zero_grad()
optimizer_b.zero_grad()
l.backward()
# 对两个optimizer实例分别调用step函数,从而分别更新权重和偏差
optimizer_w.step()
optimizer_b.step()
train_ls.append(loss(net(train_features), train_labels).mean().item())
test_ls.append(loss(net(test_features), test_labels).mean().item())
semilogy_draw(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
range(1, num_epochs + 1), test_ls, ['train', 'test'])
print('L2 norm of w:', net.weight.data.norm().item())
if __name__ == '__main__':
n_train, n_test, num_inputs = 20, 100, 200
true_w, true_b = torch.ones(num_inputs, 1) * 0.01, 0.05 # torch.Size([200, 1])
features = torch.randn((n_train + n_test, num_inputs)) # torch.Size([120, 200])
labels = torch.matmul(features, true_w) + true_b # torch.Size([120, 1])
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
train_features, test_features = features[:n_train, :], features[n_train:, :] # torch.Size([20, 200]) torch.Size([100, 200])
train_labels, test_labels = labels[:n_train], labels[n_train:] # torch.Size([20, 1]) torch.Size([100, 1])
batch_size, num_epochs, lr = 2, 100, 0.003
net, loss = linreg, squared_loss
dataset = torch.utils.data.TensorDataset(train_features, train_labels)
train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
fit_and_plot(lambd=3) # lambd为0时,权重衰减项不起作用
结果:
a,当 fit_and_plot(lambd=0)参数lambd=0时,在训练集表现良好,损失逐渐下降,而在测试集loss降不下来,过拟合
b,当 fit_and_plot(lambd=3)参数lambd=3时,在训练集与测试集表现良好,一定程度上防止了过拟合。
代码:
import torch
import torch.nn as nn
import numpy as np
from matplotlib import pyplot as plt
def init_params():
w = torch.randn((num_inputs, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)
return [w, b]
def l2_penalty(w):
return (w**2).sum() / 2
def squared_loss(y_hat, y):
# 注意这里返回的是向量, 另外, pytorch里的MSELoss并没有除以 2
return ((y_hat - y.view(y_hat.size())) ** 2) / 2
def sgd(params, lr, batch_size):
# 为了和原书保持一致,这里除以了batch_size,但是应该是不用除的,因为一般用PyTorch计算loss时就默认已经
# 沿batch维求了平均了。
for param in params:
param.data -= lr * param.grad / batch_size
def semilogy_draw(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
legend=None, figsize=(3.5, 2.5)):
# plt.figure(1, figsize)
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.semilogy(x_vals, y_vals)
if x2_vals and y2_vals:
plt.semilogy(x2_vals, y2_vals, linestyle=':')
plt.legend(legend)
plt.savefig('weight_decay_simple.png')
plt.show()
def fit_and_plot_pytorch(wd=0):
# 定义网络并初始化网络参数w b
net = nn.Linear(num_inputs, 1)
nn.init.normal_(net.weight, mean=0, std=1)
nn.init.normal_(net.bias, mean=0, std=1)
optimizer_w = torch.optim.SGD(params=[net.weight], lr=lr, weight_decay=wd) # 在SGD中对权重参数衰减
optimizer_b = torch.optim.SGD(params=[net.bias], lr=lr) # 不对偏差参数衰减
train_ls, test_ls = [], []
for _ in range(num_epochs):
for X, y in train_iter:
l = loss(net(X), y).mean()
optimizer_w.zero_grad()
optimizer_b.zero_grad()
l.backward()
# 对两个optimizer实例分别调用step函数,从而分别更新权重和偏差
optimizer_w.step()
optimizer_b.step()
train_ls.append(loss(net(train_features), train_labels).mean().item())
test_ls.append(loss(net(test_features), test_labels).mean().item())
semilogy_draw(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
range(1, num_epochs + 1), test_ls, ['train', 'test'])
print('L2 norm of w:', net.weight.data.norm().item())
if __name__ == '__main__':
n_train, n_test, num_inputs = 20, 100, 200
true_w, true_b = torch.ones(num_inputs, 1) * 0.01, 0.05 # torch.Size([200, 1])
features = torch.randn((n_train + n_test, num_inputs)) # torch.Size([120, 200])
labels = torch.matmul(features, true_w) + true_b # torch.Size([120, 1])
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
train_features, test_features = features[:n_train, :], features[n_train:, :] # torch.Size([20, 200]) torch.Size([100, 200])
train_labels, test_labels = labels[:n_train], labels[n_train:] # torch.Size([20, 1]) torch.Size([100, 1])
batch_size, num_epochs, lr = 2, 100, 0.003
loss = squared_loss
dataset = torch.utils.data.TensorDataset(train_features, train_labels)
train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
fit_and_plot_pytorch(wd=0) # lambd为0时,权重衰减项不起作用
结果:
a,当 fit_and_plot_pytorch参数wd=0时,在训练集表现良好,损失逐渐下降,而在测试集loss降不下来,过拟合。
b,当 fit_and_plot_pytorch参数wd=3时,在训练集与测试集表现良好,一定程度上防止了过拟合。
代码并非自己实现,参考并摘自:
https://tangshusen.me/Dive-into-DL-PyTorch/#/chapter03_DL-basics/3.12_weight-decay