This article covers the following: importing the required packages, the softmax operator, the cross-entropy loss, computing classification accuracy, training a softmax regression model from scratch, and a concise implementation with PyTorch's nn module.
Import packages
# import needed package
%matplotlib inline
from IPython import display
import matplotlib.pyplot as plt
import torch
import torchvision
import torchvision.transforms as transforms
import time
import sys
sys.path.append("/home/kesci/input")
import d2lzh1981 as d2l
print(torch.__version__)
print(torchvision.__version__)
For classification problems, the true labels are discrete values, while the raw outputs computed from the features and weights are often called confidence scores. These scores depend on the feature values and can differ widely across classes for the same example, which makes them hard to interpret directly, and the error between a discrete label and such an unbounded output is hard to measure.
The softmax operator solves both problems. It transforms the outputs into a probability distribution whose entries are positive and sum to 1:
$$\hat{y}_j = \frac{\exp(o_j)}{\sum_{i=1}^{3} \exp(o_i)}$$
The code is as follows:
def softmax(X):
    """
    Softmax operation.
    X: a matrix of scores, one row per example
    """
    X_exp = X.exp()
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition  # broadcasting divides each row by its own sum
print(softmax(torch.tensor([[10.0, 10.1, 10.2]])))
The output values are positive and sum to 1:
tensor([[0.3006, 0.3322, 0.3672]])
To classify correctly, we do not need the predicted probabilities to equal the label distribution exactly. In the image-classification example, if $y^{(i)}=3$, we only need $\hat{y}^{(i)}_3$ to be larger than the other two predictions $\hat{y}^{(i)}_1$ and $\hat{y}^{(i)}_2$. Even if $\hat{y}^{(i)}_3$ is only 0.6, the predicted class is correct no matter what the other two values are. Squared loss, in contrast, is too strict: $\hat y^{(i)}_1=\hat y^{(i)}_2=0.2$ gives a much smaller loss than $\hat y^{(i)}_1=0,\ \hat y^{(i)}_2=0.4$, even though both lead to the same, correct classification.
A better approach is to use a measure designed for comparing two probability distributions. Cross entropy is a commonly used choice:
$$H\left(\boldsymbol y^{(i)}, \boldsymbol{\hat y}^{(i)}\right) = -\sum_{j=1}^q y_j^{(i)} \log \hat y_j^{(i)},$$
Averaging the cross entropy over the $n$ training examples gives the loss function
$$\ell(\boldsymbol{\Theta}) = \frac{1}{n} \sum_{i=1}^n H\left(\boldsymbol y^{(i)}, \boldsymbol{\hat y}^{(i)}\right).$$
Since each label vector $\boldsymbol y^{(i)}$ is one-hot with a single entry equal to 1, this simplifies to
$$\ell(\boldsymbol{\Theta}) = -\frac{1}{n} \sum_{i=1}^n \log \hat y_{y^{(i)}}^{(i)}.$$
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.LongTensor([0, 2])
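The implementation below uses torch.gather to pick out, for each row of y_hat, the predicted probability of the true class listed in y. With the tensors just defined:
print(y_hat.gather(1, y.view(-1, 1)))  # tensor([[0.1000], [0.5000]])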
def cross_entropy(y_hat, y):
    """
    Cross-entropy loss: take the negative log of the predicted
    probability of the true class for each example.
    """
    return - torch.log(y_hat.gather(1, y.view(-1, 1)))
print(cross_entropy(y_hat, y))
tensor([[2.3026], [0.6931]])
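The model below references num_inputs, W and b, which are not initialized in this excerpt. A minimal sketch consistent with the rest of the section (784 input pixels, 10 output classes, small Gaussian weights, zero biases):
import numpy as np

num_inputs = 784
num_outputs = 10
# weights: small Gaussian noise; biases: zeros; both track gradients
W = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_outputs)), dtype=torch.float, requires_grad=True)
b = torch.zeros(num_outputs, dtype=torch.float, requires_grad=True)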
def net(X):
    return softmax(torch.mm(X.view((-1, num_inputs)), W) + b)  # linear model followed by softmax
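A quick shape check with a dummy batch (X_demo is only for illustration): each flattened 28x28 image is mapped to 10 class probabilities.
X_demo = torch.rand(2, 1, 28, 28)
print(net(X_demo).shape)       # torch.Size([2, 10])
print(net(X_demo).sum(dim=1))  # each row sums to 1 after softmax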
Compute accuracy
def accuracy(y_hat, y):
    """
    y_hat.argmax(dim=1) takes the index of the largest value in each row,
    i.e. the predicted class; compare it with y and average.
    """
    return (y_hat.argmax(dim=1) == y).float().mean().item()
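With the y_hat and y defined earlier, the first example is predicted as class 2 but labelled 0 while the second is predicted correctly, so the accuracy is 0.5:
print(accuracy(y_hat, y))  # 0.5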
Compute predictions with the current parameters and report the accuracy; this is typically used on the test set.
# This function is saved in the d2lzh_pytorch package for later use. It will be improved step by step; its full implementation is described in the "Image Augmentation" section.
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n
Load the data and set the hyperparameters before training.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, root='/home/kesci/input/FashionMNIST2065')
num_epochs, lr = 5, 0.1
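As a sanity check, the randomly initialized model should score close to random guessing (about 0.1 with 10 classes) on the test set:
print(evaluate_accuracy(test_iter, net))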
# This function is saved in the d2lzh_pytorch package for later use
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    """
    net         the model
    train_iter  training data iterator, one mini-batch per step
    test_iter   test data iterator, used the same way
    loss        loss function
    num_epochs  number of training epochs
    batch_size  number of examples per mini-batch
    params      model parameters (for the from-scratch version)
    lr          learning rate
    optimizer   optimization method (for the concise version)
    """
    for epoch in range(num_epochs):
        # train_l_sum: accumulated training loss
        # train_acc_sum: accumulated number of correct predictions
        # n: accumulated number of training examples
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:  # iterate over mini-batches
            y_hat = net(X)  # forward pass
            l = loss(y_hat, y).sum()  # compute the loss
            # reset gradients to zero
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()  # compute gradients
            # update the parameters by gradient descent
            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                optimizer.step()
            train_l_sum += l.item()  # accumulate the loss
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()  # accumulate correct predictions
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)  # accuracy on the test set
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)
epoch 1, loss 0.7854, train acc 0.750, test acc 0.788
epoch 2, loss 0.5693, train acc 0.815, test acc 0.811
epoch 3, loss 0.5259, train acc 0.825, test acc 0.815
epoch 4, loss 0.5015, train acc 0.833, test acc 0.823
epoch 5, loss 0.4859, train acc 0.836, test acc 0.824
Compare the model's predictions with the true labels
X, y = next(iter(test_iter))  # take one batch of test data
# true labels
true_labels = d2l.get_fashion_mnist_labels(y.numpy())
# predicted labels
pred_labels = d2l.get_fashion_mnist_labels(net(X).argmax(dim=1).numpy())
titles = [true + '\n' + pred for true, pred in zip(true_labels, pred_labels)]
d2l.show_fashion_mnist(X[0:9], titles[0:9])
Concise implementation: import packages and load the dataset
# load the required packages and modules
import torch
from torch import nn
from torch.nn import init
import numpy as np
import sys
sys.path.append("/home/kesci/input")
import d2lzh1981 as d2l
print(torch.__version__)
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, root='/home/kesci/input/FashionMNIST2065')
# define the network model
num_inputs = 784
num_outputs = 10
# define the linear model
class LinearNet(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(LinearNet, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)

    def forward(self, x):  # shape of x: (batch, 1, 28, 28)
        y = self.linear(x.view(x.shape[0], -1))
        return y
# net = LinearNet(num_inputs, num_outputs)
# flatten each example in a batch into a 1-D vector, so a batch becomes a 2-D matrix
class FlattenLayer(nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()

    def forward(self, x):  # shape of x: (batch, *, *, ...)
        return x.view(x.shape[0], -1)
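A quick check of the flattening behaviour (the dummy input is only for illustration):
print(FlattenLayer()(torch.rand(2, 1, 28, 28)).shape)  # torch.Size([2, 784])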
from collections import OrderedDict
net = nn.Sequential(
    # FlattenLayer(),
    # LinearNet(num_inputs, num_outputs)
    OrderedDict([
        ('flatten', FlattenLayer()),
        ('linear', nn.Linear(num_inputs, num_outputs))])  # our own LinearNet(num_inputs, num_outputs) would also work
)
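Printing the container shows the flatten and linear layers in order:
print(net)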
# initialize the model parameters
init.normal_(net.linear.weight, mean=0, std=0.01)
init.constant_(net.linear.bias, val=0)
# Define the loss function. nn.CrossEntropyLoss combines log-softmax and the
# negative log-likelihood loss, so the network itself outputs raw scores (logits)
# rather than probabilities.
loss = nn.CrossEntropyLoss()
# define the optimizer
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
# training
num_epochs = 5
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)
epoch 1, loss 0.0031, train acc 0.752, test acc 0.784
epoch 2, loss 0.0022, train acc 0.812, test acc 0.807
epoch 3, loss 0.0021, train acc 0.825, test acc 0.816
epoch 4, loss 0.0020, train acc 0.833, test acc 0.822
epoch 5, loss 0.0019, train acc 0.837, test acc 0.826
With the network above we built a softmax regression model, i.e. a linear model followed by a softmax, and used it to train on and predict the Fashion-MNIST image dataset.