将 input 经过 softmax 激活函数之后,再计算其与 target 的交叉熵损失。阅读源码可以发现,该方法将 log_softmax 和 nll_loss 进行了结合
class CrossEntropyLoss(_WeightedLoss):
    """Module wrapper that evaluates ``F.cross_entropy`` on its inputs.

    Args:
        weight: Passed positionally to the ``_WeightedLoss`` constructor;
            ``forward`` later reads it back as ``self.weight``
            (presumably stored by the base class -- confirm there).
        size_average: Passed through to the ``_WeightedLoss`` constructor.
        ignore_index: Stored on the instance and handed to
            ``F.cross_entropy`` on every ``forward`` call.
        reduce: Passed through to the ``_WeightedLoss`` constructor.
        reduction: Passed through to the ``_WeightedLoss`` constructor;
            ``forward`` later reads it back as ``self.reduction``
            (presumably stored by the base class -- confirm there).
    """

    def __init__(self, weight=None, size_average=None, ignore_index=-100,
                 reduce=None, reduction='mean'):
        super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
        # Kept here because it is not among the arguments given to the base class.
        self.ignore_index = ignore_index

    @weak_script_method
    def forward(self, input, target):
        """Return ``F.cross_entropy(input, target)`` using the stored settings."""
        return F.cross_entropy(input, target, weight=self.weight,
                               ignore_index=self.ignore_index, reduction=self.reduction)
def cross_entropy(input, target, weight=None, size_average=None, ignore_index=-100,
reduce=None, reduction='mean'):
if size_average is not None or reduce is not None:
reduction = _Reduction.legacy_get_string(size_average, reduce)
return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
为什么要采用 softmax 激活函数?
因为交叉熵描述的是两个概率分布之间的差异,而神经网络输出的是一个向量,并不是概率分布的形式,所以需要先用 softmax 激活函数将该向量“归一化”为概率分布,再使用交叉熵损失函数计算 loss。
代码理解:
import torch
import torch.nn.functional as F
import torch.nn as nn
# Seed the generator so the random scores and labels are reproducible.
torch.manual_seed(2019)

# Two samples with three class scores each, plus one class index per sample.
output = torch.randn((2, 3))
target = torch.ones(2, dtype=torch.long)
target.random_(3)
print('output:\n{}'.format(output))
print('target:\n{}\n'.format(target))

# Route 1: explicit log-softmax followed by the negative log-likelihood loss.
log_probs = F.log_softmax(output, dim=1)
nll = F.nll_loss(log_probs, target)

# Route 2: the functional one-shot call.
cross_entropy_loss = F.cross_entropy(output, target)

# Route 3: the module wrapper.
criterion = nn.CrossEntropyLoss()
loss = criterion(output, target)

# Report the intermediate result and the three losses.
print('log_softmax:\n{}\n'.format(log_probs))
print('nll_loss:{}'.format(nll))
print('cross_entropy_loss:{}'.format(cross_entropy_loss))
print('loss:{}'.format(loss))
"""
output:
tensor([[-0.1187, 0.2110, 0.7463],
[-0.6136, -0.1186, 1.5565]])
target:
tensor([2, 0])
log_softmax:
tensor([[-1.5614, -1.2317, -0.6964],
[-2.4335, -1.9386, -0.2635]])
nll_loss:1.564985990524292
cross_entropy_loss:1.564985990524292
loss:1.564985990524292
"""
log_softmax 函数:先对 input 进行一次 softmax 运算,然后对结果取 log(对数)。
代码理解
import torch
import torch.nn.functional as F
import numpy as np
# Seed the generator so the random scores are reproducible.
torch.manual_seed(2019)
output = torch.randn((2, 3))
print(output, '\n')

# Variant 1: softmax along dim 1, then an element-wise logarithm.
torch_softmax = F.softmax(output, dim=1)
print(torch_softmax)
torch_log_softmax = torch_softmax.log()
print(torch_log_softmax)

# Variant 2: the functional log_softmax.
torch_log_softmax = F.log_softmax(output, dim=1)
print(torch_log_softmax)

# Variant 3: the tensor-method form.
torch_log_softmax = output.log_softmax(dim=1)
print(torch_log_softmax, '\n')

# NumPy cross-check: exponentiate, divide each row by its sum, take the log.
exp_scores = np.exp(output.numpy())
row_totals = exp_scores.sum(axis=1, keepdims=True)
np_softmax = exp_scores / row_totals
print(np_softmax)
np_log_softmax = np.log(np_softmax)
print(np_log_softmax)
"""
tensor([[-0.1187, 0.2110, 0.7463],
[-0.6136, -0.1186, 1.5565]])
tensor([[0.2098, 0.2918, 0.4984],
[0.0877, 0.1439, 0.7684]])
tensor([[-1.5614, -1.2317, -0.6964],
[-2.4335, -1.9386, -0.2635]])
tensor([[-1.5614, -1.2317, -0.6964],
[-2.4335, -1.9386, -0.2635]])
tensor([[-1.5614, -1.2317, -0.6964],
[-2.4335, -1.9386, -0.2635]])
[[0.20983616 0.29179907 0.49836478]
[0.08772492 0.14391233 0.76836276]]
[[-1.5614283 -1.2316898 -0.696423 ]
[-2.4335492 -1.938551 -0.2634933]]
"""