Why is the reshape function needed in the squared_loss function? If the number of examples is not divisible by the batch size, how does the behavior of the data_iter function change?
1 . Initializing the weights to zero still works here. But once the network gets deeper, in the fully connected case the symmetry of the zero-initialized weights makes the hidden units symmetric during backpropagation, so several hidden units act like a single one and the algorithm's performance suffers.
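The symmetry can be checked numerically. Below is a minimal sketch (not from the original notes; the two-layer network, its sizes and the data are made up for illustration) showing that with an all-zero initialization the hidden units of a fully connected network are indistinguishable and, in this case, never receive a gradient at all:

import torch
from torch import nn

torch.manual_seed(0)
# Hypothetical two-layer fully connected net with every parameter set to zero
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
for m in net:
    if isinstance(m, nn.Linear):
        nn.init.zeros_(m.weight)
        nn.init.zeros_(m.bias)

X = torch.randn(16, 4)  # made-up inputs
y = torch.randn(16, 1)  # made-up targets
((net(X) - y) ** 2).mean().backward()

print(net[0].weight.grad)  # all zeros: every hidden unit gets the same (zero) update
print(net[2].weight.grad)  # all zeros as well, because the hidden activations are zero
print(net[2].bias.grad)    # only the output bias receives a nonzero gradient

So the hidden units stay identical after every SGD step and the extra capacity is wasted; for the single-layer linear regression of this section, by contrast, a zero initialization is harmless.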
2 . Fitting a voltage-current model $u = Ir + b$ with the from-scratch implementation:
import torch
import random
from d2l import torch as d2l

# Generate the dataset: u = I * r + b plus Gaussian noise
def synthetic_data(r, b, num_examples):
    I = torch.normal(0, 1, (num_examples, len(r)))
    u = torch.matmul(I, r) + b
    u += torch.normal(0, 0.01, u.shape)  # noise
    return I, u.reshape((-1, 1))  # reshape the labels into a column vector

true_r = torch.tensor([20.0])
true_b = 0.01
features, labels = synthetic_data(true_r, true_b, 1000)

# Read the dataset in minibatches
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i: min(i + batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]

batch_size = 10

# Initialize the parameters
r = torch.normal(0, 0.01, size=(1, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# Define the model
def linreg(I, r, b):
    return torch.matmul(I, r) + b

# Loss function
def square_loss(u_hat, u):
    return (u_hat - u.reshape(u_hat.shape)) ** 2 / 2

# Optimization algorithm: minibatch SGD
def sgd(params, lr, batch_size):
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()

lr = 0.03
num_epochs = 10
net = linreg
loss = square_loss

for epoch in range(num_epochs):
    for I, u in data_iter(batch_size, features, labels):
        l = loss(net(I, r, b), u)
        l.sum().backward()
        sgd([r, b], lr, batch_size)
    with torch.no_grad():
        train_l = loss(net(features, r, b), labels)
        print(f'epoch {epoch + 1}, loss {float(train_l.mean()):f}')

print(r)
print(b)
print(f'estimation error of r: {true_r - r.reshape(true_r.shape)}')
print(f'estimation error of b: {true_b - b}')
epoch 1, loss 0.329473
epoch 2, loss 0.000541
epoch 3, loss 0.000050
epoch 4, loss 0.000050
epoch 5, loss 0.000050
epoch 6, loss 0.000050
epoch 7, loss 0.000050
epoch 8, loss 0.000050
epoch 9, loss 0.000050
epoch 10, loss 0.000050
tensor([[19.9997]], requires_grad=True)
tensor([0.0093], requires_grad=True)
estimation error of r: tensor([0.0003], grad_fn=)
estimation error of b: tensor([0.0007], grad_fn=)
3 . I tried writing this, but something seems to go wrong: the loss comes out as nan.
$$M_{\lambda bb}=\frac{2\pi h c^{2}}{\lambda^{5}}\cdot\frac{1}{e^{\frac{hc}{\lambda kT}}-1}=\frac{c_{1}}{\lambda^{5}}\cdot\frac{1}{e^{\frac{c_{2}}{\lambda T}}-1}$$
# 3. Planck's law
# x: wavelength
# T: temperature
import torch
import random
from d2l import torch as d2l

# Generate the dataset
def synthetic_data(x, num_examples):
    T = torch.normal(0, 1, (num_examples, len(x)))
    u = c1 / ((x ** 5) * (torch.exp(c2 / (x * T)) - 1))
    u += torch.normal(0, 0.01, u.shape)  # noise
    return T, u.reshape((-1, 1))  # reshape the labels into a column vector

c1 = 3.7414 * 10 ** 8   # constant c1
c2 = 1.43879 * 10 ** 4  # constant c2
true_x = torch.tensor([500.0])
features, labels = synthetic_data(true_x, 1000)

# Read the dataset in minibatches
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i: min(i + batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]

batch_size = 10

# Initialize the parameter to be estimated (the wavelength)
x = torch.normal(0, 0.01, size=(1, 1), requires_grad=True)

# Define the model
def planck_formula(T, x):
    return c1 / ((x ** 5) * (torch.exp(c2 / (x * T)) - 1))

# Loss function
def square_loss(u_hat, u):
    return (u_hat - u.reshape(u_hat.shape)) ** 2 / 2

# Optimization algorithm: minibatch SGD
def sgd(params, lr, batch_size):
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()

lr = 0.001
num_epochs = 10
net = planck_formula
loss = square_loss

for epoch in range(num_epochs):
    for T, u in data_iter(batch_size, features, labels):
        l = loss(net(T, x), u)
        l.sum().backward()
        sgd([x], lr, batch_size)
    with torch.no_grad():
        train_l = loss(net(features, x), labels)
        print(f'epoch {epoch + 1}, loss {float(train_l.mean()):f}')

print(f'estimation error of x: {true_x - x.reshape(true_x.shape)}')
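A likely cause (my guess; not verified in the notes above) is the combination of the tiny initialization x ~ N(0, 0.01) and temperatures drawn from N(0, 1): the exponent c2 / (x * T) then has magnitude on the order of 1e6, so torch.exp overflows to inf for positive exponents (and underflows to 0 for negative ones), the gradients through exp become inf/nan on the first backward pass, and x and the loss are nan from then on. A quick check using the constants c1 and c2 defined above:

x0 = torch.normal(0, 0.01, size=(1, 1))  # the original initialization, |x0| on the order of 0.01
T0 = torch.normal(0, 1, (4, 1))          # temperatures centred on 0, possibly negative
print(c2 / (x0 * T0))                    # exponents with magnitude around 1e6 for typical draws
print(torch.exp(c2 / (x0 * T0)))         # inf for positive exponents, 0 for negative ones

Possible remedies (untested here): initialize x at a physically plausible positive value instead of N(0, 0.01), draw T from a positive range, or optimize log(x) so the estimate stays positive.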
Training results for different learning rates (here the larger learning rates make the loss drop faster):
learning rate = 0.03
epoch 1, loss 0.039505
epoch 2, loss 0.000141
epoch 3, loss 0.000048
learning rate = 0.05
epoch 1, loss 0.000576
epoch 2, loss 0.000052
epoch 3, loss 0.000052
learning rate = 0.5
epoch 1, loss 0.000057
epoch 2, loss 0.000053
epoch 3, loss 0.000051
Because of the slice indices[i: min(i + batch_size, num_examples)], a sample count that is not divisible by the batch size does not change data_iter's behavior: the last minibatch is simply smaller, as the toy example below shows. (Python list slicing already clamps an out-of-range stop index, so the min() is not strictly required here, but it makes the bound explicit.)
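A quick toy check (made-up data, reusing the data_iter defined in the from-scratch code above) with 7 examples and batch_size = 3:

import torch  # data_iter itself is the generator defined above

toy_features = torch.arange(14, dtype=torch.float32).reshape(7, 2)
toy_labels = torch.arange(7, dtype=torch.float32).reshape(7, 1)
for Xb, yb in data_iter(3, toy_features, toy_labels):
    print(Xb.shape, yb.shape)  # two batches of shape (3, 2)/(3, 1), then a last batch of shape (1, 2)/(1, 1)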
The data module provides data-processing utilities, and the nn module defines a large number of neural-network layers and common loss functions. Methods whose names end with _ overwrite their argument in place, which is how the parameters are initialized.
1 . Scale the learning rate down to 1/n of its previous value (n being the batch size): if the minibatch loss is summed instead of averaged, the gradient becomes n times larger, so dividing the learning rate by n keeps the parameter updates unchanged (a numeric check is sketched after answer 3 below).
2 . Only the loss-function part of the code changes: replace the loss with HuberLoss (available only in PyTorch 1.9.0 and later).
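For reference, PyTorch's HuberLoss with threshold $\delta$ (default 1.0) is quadratic for small errors and linear for large ones, which makes it less sensitive to outliers than the squared loss:

$$\ell_\delta(\hat y, y) = \begin{cases} \frac{1}{2}(\hat y - y)^2, & |\hat y - y| \le \delta, \\ \delta\left(|\hat y - y| - \frac{1}{2}\delta\right), & \text{otherwise.} \end{cases}$$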
#%%
import numpy as np
import torch
from torch.utils import data
from d2l import torch as d2l
#%%
true_w = torch.tensor([2, -3.4])
true_b = 4.2
features, labels = d2l.synthetic_data(true_w, true_b, 1000)
def load_array(data_arrays, batch_size, is_train=True):  #@save
    '''PyTorch data iterator'''
    dataset = data.TensorDataset(*data_arrays)  # pair the features with the labels; * unpacks the tuple of arrays
    return data.DataLoader(dataset, batch_size, shuffle=is_train)  # shuffle the data when training

batch_size = 10
data_iter = load_array((features, labels), batch_size)  # used the same way as the hand-written data_iter
#%%
# Build the iterator and check that data_iter works
next(iter(data_iter))  # fetch the first minibatch
#%% Define the model
from torch import nn
net = nn.Sequential(nn.Linear(2, 1))  # Linear takes the input size and the output size
# Sequential acts as a list holding the layers; with a single layer, Linear alone would also do
#%% Initialize the model parameters
# use net[0] to select the first layer of the network
net[0].weight.data.normal_(0, 0.01)  # normal distribution
net[0].bias.data.fill_(0)
#%% Define the loss function
loss = torch.nn.HuberLoss()
#%% Define the optimization algorithm
trainer = torch.optim.SGD(net.parameters(), lr=0.03)  # SGD from the optim module
#%% Training
num_epochs = 3
for epoch in range(num_epochs):
    for X, y in data_iter:
        l = loss(net(X), y)
        trainer.zero_grad()
        l.backward()
        trainer.step()
    l = loss(net(features), labels)
    print(f'epoch {epoch + 1}, loss {l:f}')
#%% Check the estimation errors
w = net[0].weight.data
print('estimation error of w:', true_w - w.reshape(true_w.shape))
b = net[0].bias.data
print('estimation error of b:', true_b - b)
3 . The gradients of the linear regression can be read directly from the layer's parameters:
print(net[0].weight.grad)
print(net[0].bias.grad)
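Numeric check for answer 1 above, a minimal sketch with made-up data: summing the minibatch loss and dividing the learning rate by the batch size produces the same SGD step (up to floating-point error) as averaging the loss with the original learning rate.

import torch
from torch import nn

torch.manual_seed(0)
X = torch.randn(10, 2)  # one made-up minibatch, n = 10
y = torch.randn(10, 1)

def one_sgd_step(reduction, lr):
    torch.manual_seed(1)  # identical initial weights in both runs
    net = nn.Linear(2, 1)
    loss = nn.MSELoss(reduction=reduction)
    trainer = torch.optim.SGD(net.parameters(), lr=lr)
    trainer.zero_grad()
    loss(net(X), y).backward()
    trainer.step()
    return net.weight.data.clone()

n = len(X)
w_mean = one_sgd_step('mean', 0.03)    # averaged loss, original learning rate
w_sum = one_sgd_step('sum', 0.03 / n)  # summed loss, learning rate scaled down to 1/n
print(torch.allclose(w_mean, w_sum))   # True: the two updates coincide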
2 . One-hot encoding can be used; the number of elements required equals the number of labels.
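For example (a minimal sketch using torch.nn.functional.one_hot; the three-class labels are made up):

import torch
import torch.nn.functional as F

labels = torch.tensor([0, 2, 1])  # made-up class indices for 3 classes
print(F.one_hot(labels, num_classes=3))
# tensor([[1, 0, 0],
#         [0, 0, 1],
#         [0, 1, 0]])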
Does reducing batch_size (for instance to 1) affect reading performance?
The softmax function here was implemented directly from its mathematical definition. What problems can this cause? Hint: try to compute the size of $\exp(50)$. cross_entropy is likewise implemented according to the definition of the cross-entropy loss. What could go wrong with it? Hint: consider the domain of the logarithm.
Numerical stability problems, which can lead to overflow: if the values are large, $\exp(o)$ can exceed the largest number the data type allows, turning the numerator or denominator into inf (infinity), so that the resulting $\hat y_j$ comes out as 0, inf or nan (not a number).
If the largest $\hat y$ is extremely small, close to 0, $-\log \hat y$ can become too large for the data type. Or underflow occurs: because of limited precision, $\hat y$ is rounded to 0, which makes $\log(\hat y_j)$ equal to -inf.
After a few steps of backpropagation, the result can turn into nan.
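A quick check of both failure modes (a minimal sketch; the logit values are made up):

import torch

o = torch.tensor([10., 50., 100.])
print(torch.exp(o))       # tensor([2.2026e+04, 5.1847e+21, inf]): exp(50) is already ~5e21 and exp(100) overflows float32
y_hat = torch.exp(o) / torch.exp(o).sum()
print(y_hat)              # dividing by inf gives 0, and inf/inf gives nan
print(torch.log(y_hat))   # log(0) is -inf and log(nan) is nan, which poisons the cross-entropy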
Use the LogSumExp trick: before computing the softmax, subtract $\max(o_k)$ from every $o_k$. Shifting each $o_k$ by a constant does not change the value of the softmax:

$$\begin{aligned} \hat y_j & = \frac{\exp(o_j - \max(o_k))\exp(\max(o_k))}{\sum_k \exp(o_k - \max(o_k))\exp(\max(o_k))} \\ & = \frac{\exp(o_j - \max(o_k))}{\sum_k \exp(o_k - \max(o_k))}. \end{aligned}$$

Then, combining the softmax with the cross-entropy as in the following equation, we avoid computing $\exp(o_j - \max(o_k))$ and can work with $o_j - \max(o_k)$ directly, because $\log(\exp(\cdot))$ cancels:

$$\begin{aligned} \log{(\hat y_j)} & = \log\left( \frac{\exp(o_j - \max(o_k))}{\sum_k \exp(o_k - \max(o_k))}\right) \\ & = \log{(\exp(o_j - \max(o_k)))} - \log{\left( \sum_k \exp(o_k - \max(o_k)) \right)} \\ & = o_j - \max(o_k) - \log{\left( \sum_k \exp(o_k - \max(o_k)) \right)}. \end{aligned}$$
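The same trick in code (a minimal sketch with made-up logits, comparing the naive computation with the shifted one; PyTorch's built-in log_softmax applies an equivalent stabilization internally):

import torch

o = torch.tensor([10., 50., 100.])  # made-up logits
naive = torch.log(torch.exp(o) / torch.exp(o).sum())
print(naive)                        # contains -inf and nan because exp(100) overflows

shifted = o - o.max()               # subtract max(o_k) from every logit
stable = shifted - torch.log(torch.exp(shifted).sum())
print(stable)                       # finite log-probabilities, here tensor([-90., -50., 0.])
print(torch.log_softmax(o, dim=0))  # matches the stable version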
Returning the class label with the highest probability is not always the best choice; in some cases we want the model to output the actual probabilities to support the decision.
With a very large vocabulary, the predicted probabilities of different words may end up very close to each other, or all very close to 0, and the loss stays high during training.