文件目录结构
├── data
│ └── cifar-10-batches-py
│ ├── batches.meta
│ ├── data_batch_1
│ ├── data_batch_2
│ ├── data_batch_3
│ ├── data_batch_4
│ ├── data_batch_5
│ ├── readme.html
│ └── test_batch
├── data.py
├── model.py
├── train.py
└── validate.py
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
# transforms.Compose chains several image transforms and applies them in order.
# Here: ToTensor first, then Normalize, which for every channel subtracts the
# mean 0.5 and divides by the standard deviation 0.5.
_CHANNEL_MEAN = (0.5, 0.5, 0.5)
_CHANNEL_STD = (0.5, 0.5, 0.5)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
])
def getData(root='data/', batch_size=4, train_workers=4, test_workers=2,
            download=False):
    """Build the CIFAR-10 training and test data loaders.

    Every parameter defaults to the value that used to be hard-coded, so a
    plain ``getData()`` call returns the same loaders as before.

    Args:
        root: Dataset root directory passed to ``torchvision.datasets.CIFAR10``.
        batch_size: Number of images per batch for both loaders. The default
            is small because the tutorial trains on the CPU.
        train_workers: ``num_workers`` for the training loader, i.e. how many
            worker subprocesses load data in the background.
        test_workers: ``num_workers`` for the test loader.
        download: Forwarded to ``torchvision.datasets.CIFAR10``. The default
            ``False`` expects the data to already be present under ``root``.

    Returns:
        tuple: ``(trainloader, testloader)``. The training loader shuffles
        its samples; the test loader does not.
    """
    trainset = torchvision.datasets.CIFAR10(root=root,
                                            train=True,
                                            download=download,
                                            transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=train_workers)

    testset = torchvision.datasets.CIFAR10(root=root,
                                           train=False,
                                           download=download,
                                           transform=transform)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=test_workers)
    return trainloader, testloader
# Data preview starts here.
# Human-readable class names, indexed by the integer label of each sample.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)
trainloader, testloader = getData()
def imshow(img):
    """Show an image tensor that was normalized by ``transform``.

    Args:
        img: Tensor laid out as (channels, height, width) whose values were
            normalized with
            ``transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))``.
    """
    # Normalize computed (x - 0.5) / 0.5 when the data was loaded;
    # x / 2 + 0.5 is the inverse and restores the original value range.
    restored = img / 2 + 0.5
    # plt.imshow expects (height, width, channels) while the tensor is
    # (channels, height, width), so move the channel axis to the end.
    channels_last = np.transpose(restored.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()
# Run the preview only when this file is executed directly. train.py and
# validate.py import from this module and must not trigger a plot window, and
# iterating a DataLoader that uses worker processes has to happen behind a
# __main__ guard on platforms that start workers with "spawn".
if __name__ == '__main__':
    # Take the first batch of the test loader (it is not shuffled, so this is
    # always the same batch, not a random one).
    dataiter = iter(testloader)
    # The iterator has no .next() method on current PyTorch releases; the
    # built-in next() works on every version.
    images, labels = next(dataiter)
    # With the default batch size, images is a [4, 3, 32, 32] tensor and
    # labels is a [4] tensor.

    # Print the class name of every image in the batch.
    print(' '.join('%5s' % classes[labels[j]] for j in range(len(labels))))
    # Show the whole batch as a single tiled image.
    imshow(torchvision.utils.make_grid(images))
关于 transforms.Compose() 类的用法详解,参见《transforms.Compose() 类详解》。
为什么要把图片归一化到 [-1, 1]:如果数据分布在 (0, 1) 之间,网络实际需要学到的偏置 b(bias)可能会比较大,而模型初始化时 b = 0,这样会导致神经网络收敛比较慢;经过 Normalize 后,可以加快模型的收敛速度。对 RGB 图片而言,像素值的范围是 [0, 255],需要先经过 ToTensor 除以 255 缩放到 [0, 1],再通过 Normalize 按 (x - 0.5) / 0.5 计算,将数据映射到 [-1, 1]。
transforms.Normalize() 的用法参见《transforms.Normalize()》。
torchvision.utils.make_grid()可将若干幅图像拼成一幅图像。
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """Small CNN classifier: two conv + max-pool stages, then three linear layers.

    The first linear layer takes 16 * 5 * 5 features, which is what the two
    conv/pool stages produce for 3 x 32 x 32 input images.
    """

    def __init__(self):
        super().__init__()
        # Convolutional stages: 3 -> 6 -> 16 channels with 5x5 kernels; the
        # same 2x2 max-pool (stride 2) is reused after each convolution.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Fully connected head: 400 -> 120 -> 84 -> 10 outputs.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the 10 raw (un-normalized) output scores for each row of the batch."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Flatten each sample's feature maps into one row for the linear layers.
        x = x.view(-1, self.fc1.in_features)
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)


# Module-level instance that train.py imports and trains.
net = Net()
nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
in_channels: 输入数据的通道数,例如 RGB 图片的通道数为 3;
out_channels: 输出数据的通道数,根据模型进行调整;
kernel_size: 卷积核大小,可以是 int 或 tuple;kernel_size=2 意味着卷积核在所有维度上的大小都为 2,kernel_size=(2, 3) 意味着卷积核在第一维度大小为 2、在第二维度大小为 3;
stride: 步长,默认为 1;与 kernel_size 类似,stride=2 意味着在所有维度步长为 2,stride=(2, 3) 意味着在第一维度步长为 2、在第二维度步长为 3;
padding: 输入四周填充的层数,默认为 0(不填充);填充时默认补零。
nn.MaxPool2d(kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False)
kernel_size(int or tuple)
: max pooling的窗口大小,
stride(int or tuple, optional)
: max pooling的窗口移动的步长。默认值是kernel_size
padding(int or tuple, optional):
输入的每一条边补充0的层数
dilation(int or tuple, optional)
: 一个控制窗口中元素步幅的参数
return_indices
: 如果等于True,会返回输出最大值的序号,对于上采样操作会有帮助
ceil_mode
: 如果等于True,计算输出信号大小的时候,会使用向上取整,代替默认的向下取整的操作
nn.Linear(in_features, out_features, bias=True)
in_features
指的是输入的二维张量的大小,即输入的 [batch_size, size] 中的size。
out_features
指的是输出的二维张量的大小,即输出的二维张量的形状为 [batch_size,output_size],当然,它也代表了该全连接层的神经元个数。
import torch.optim as optim
import torch.nn as nn
import torch
from data import getData
from model import Net, net
def train_model(num_epochs=2, log_every=2000, model_path='cifarModel'):
    """Train the shared ``net`` on the CIFAR-10 training set and save it.

    Args:
        num_epochs: Number of full passes over the training loader.
        log_every: Print the mean loss once per this many batches.
        model_path: File the trained model is written to with ``torch.save``.
            validate.py loads ``"cifarModel"``, which is the default here.
    """
    # GPU hint: to train on a GPU, create
    #     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # then call net.to(device) once and move every batch with
    #     inputs, labels = inputs.to(device), labels.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    trainloader, _ = getData()

    for epoch in range(num_epochs):  # loop over the dataset several times
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            # Gradients accumulate across backward() calls, so clear them
            # before every step.
            optimizer.zero_grad()

            # Forward pass, backward pass, parameter update.
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Report the mean loss of the last `log_every` batches.
            running_loss += loss.item()
            if i % log_every == log_every - 1:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / log_every))
                running_loss = 0.0

    print('Finished Training')
    # Persist the trained model; without this validate.py has nothing to load.
    torch.save(net, model_path)


# The guard is required because the training loader uses worker processes:
# on platforms that start workers with "spawn" the main module is re-imported
# in each worker and must not start training again.
if __name__ == '__main__':
    train_model()
nn.CrossEntropyLoss():交叉熵损失。
optimizer.zero_grad():将梯度置0。在每一次反向传播之前都需要将梯度置0,这样才能正确地更新参数(不置0的话,新的梯度会叠加在上次计算的梯度上,导致计算不准)。
optimizer.step():根据当前梯度更新参数。
import torch
import torchvision
from data import getData, imshow
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
           'ship', 'truck')


def main():
    """Load the saved model, preview one test batch and report test accuracy."""
    # NOTE(security): the file is a fully pickled model object, so loading it
    # executes pickle code -- only load files you created yourself.
    # Recent PyTorch releases default to weights_only=True, which rejects
    # pickled modules, hence the explicit weights_only=False. Drop the
    # argument on old releases that do not accept it.
    net = torch.load("cifarModel", weights_only=False)
    # Evaluation mode is the convention for inference; this network has no
    # dropout or batch-norm layers, so its outputs are unaffected.
    net.eval()

    _, testloader = getData()

    # Preview the first test batch. The iterator has no .next() method on
    # current PyTorch releases, so use the built-in next().
    images, labels = next(iter(testloader))
    imshow(torchvision.utils.make_grid(images))
    print('GroundTruth: ',
          ' '.join('%5s' % classes[labels[j]] for j in range(len(labels))))

    correct = 0
    total = 0
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = net(images)
        # The index of the largest score in each row is the predicted class.
        _, predicted = torch.max(outputs, 1)
        print('Predicted: ',
              ' '.join('%5s' % classes[predicted[j]]
                       for j in range(len(predicted))))

        # GPU hint: if the model lives on a GPU, move each batch first with
        # images, labels = images.to(device), labels.to(device).
        for images, labels in testloader:
            outputs = net(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('acc on the 10000 images: %d %%' % (100 * correct / total))


# The test loader uses worker processes, so iteration must sit behind a
# __main__ guard on platforms that start workers with "spawn".
if __name__ == '__main__':
    main()
这里验证模型时,加载的是训练好并保存下来的模型。torch.max(outputs, 1) 沿第 1 维(类别维)返回每一行的最大值及其索引;网络输出的是未经 softmax 的原始得分(logits),但 softmax 不会改变各类别得分的大小顺序,所以最大值的索引就是预测的类别。