Let's start by laying out the plan. To train a network yourself, you need the following steps:
First, prepare the dataset and apply some preprocessing to it.
Second, decide how the sample data is split and loaded, including how many samples to load.
Third, define the network structure; in PyTorch this means writing a class that subclasses torch.nn.Module.
Fourth, define the loss function.
Fifth, write a data-loading function used to load the dataset when training starts.
Sixth, define the training routine that runs the network's iterative training.
The comments in the code are very detailed, so it is well suited for beginners training a model for the first time. It is based on the article at https://cloud.tencent.com/developer/article/1143453, with some of that article's mistakes corrected; I hope it helps you.
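In outline, those six steps map onto the script below as follows (a schematic sketch of the structure, not standalone code):

# step 1: torchvision.datasets.CIFAR10 + transforms.Compose       (dataset and preprocessing)
# step 2: SubsetRandomSampler                                     (which samples to load, and how many)
# step 3: class SimpleCNN(torch.nn.Module)                        (network structure)
# step 4: torch.nn.CrossEntropyLoss in create_loss_and_optimizer  (loss function)
# step 5: get_train_loader(batch_size)                            (data-loading function)
# step 6: trainNet(...)                                           (iterative training)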
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import time
from matplotlib import pyplot as plt
# Fix the random seeds so every run produces the same random numbers
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
# Compose chains several transforms together: ToTensor scales pixel values to [0, 1],
# and Normalize (mean 0.5, std 0.5 per channel) then shifts them to [-1, 1]
tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
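# A quick numeric check of the pipeline above (an illustrative sketch, not part of
# the original article): ToTensor maps a uint8 pixel of 255 to 1.0 and 0 to 0.0;
# Normalize then computes (x - 0.5) / 0.5, so 1.0 -> 1.0 and 0.0 -> -1.0.
# norm = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
# print(norm(torch.zeros(3, 2, 2)))  # every value becomes (0 - 0.5) / 0.5 = -1.0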
# Download the dataset: the training split is used for training, the test split is not
train_set = torchvision.datasets.CIFAR10(root='./cifar_data', train=True, download=True, transform=tf)
test_set = torchvision.datasets.CIFAR10(root='./cifar_data', train=False, download=True, transform=tf)
# The ten class labels, in CIFAR-10's index order; very large datasets usually ship
# a separate ground-truth label file instead
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
# ===========================================================================================
from torch.utils.data.sampler import SubsetRandomSampler
# Training
n_training_sample = 20000
train_sample = SubsetRandomSampler(np.arange(n_training_sample, dtype=np.int64))
# Validation
n_validation_sample = 5000
validation_sample = SubsetRandomSampler(np.arange(n_training_sample, n_training_sample + n_validation_sample,
dtype=np.int64))
# Testing
n_test_sample = 5000
test_sample = SubsetRandomSampler(np.arange(n_test_sample, dtype=np.int64))
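# The three samplers above carve out disjoint index ranges: 0-19999 of train_set for
# training, 20000-24999 of train_set for validation, and 0-4999 of test_set for testing.
# A quick sanity-check sketch (not in the original article):
# print(len(train_sample), len(validation_sample), len(test_sample))  # -> 20000 5000 5000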
# ================================================================================================
import torch.nn.functional as F
class SimpleCNN(torch.nn.Module):
def __init__(self):
super(SimpleCNN, self).__init__()
        # conv output is 18 x 32 x 32: think of it as 18 feature maps (channels) of size 32 x 32
        self.conv1 = torch.nn.Conv2d(3, 18, kernel_size=3, padding=1, stride=1)
        # after max-pooling the output is 18 x 16 x 16: 18 channels of size 16 x 16
        self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        # fully connected layer fc1: 18 * 16 * 16 = 4608 input neurons, 64 output neurons
        self.fc1 = torch.nn.Linear(18 * 16 * 16, 64)
        # fully connected layer fc2: 64 input neurons, 10 output neurons, one per class label
        self.fc2 = torch.nn.Linear(64, 10)
    # Forward pass
    def forward(self, x):
        # convolution, then a ReLU activation to add non-linear fitting capacity
        x = F.relu(self.conv1(x))
        # max-pooling layer
        x = self.pool(x)
        # view flattens the pooled tensor; -1 means "infer this dimension"
        # from the others (here 18 * 16 * 16)
        x = x.view(-1, 18 * 16 * 16)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
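# A shape sanity check (an added sketch, not in the original article): push one dummy
# batch through the network to confirm the 3*32*32 -> 18*16*16 -> 64 -> 10 flow above.
# net = SimpleCNN()
# dummy = torch.randn(4, 3, 32, 32)  # a fake batch of four CIFAR-10-sized images
# print(net(dummy).shape)            # expected: torch.Size([4, 10])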
def create_loss_and_optimizer(net, learning_rate=0.001):
    loss = torch.nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)  # Adam is an extension of stochastic gradient descent
    print(optimizer)
    return loss, optimizer
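# Note (an added sketch): CrossEntropyLoss applies log-softmax internally, which is
# why SimpleCNN.forward returns raw logits without a softmax. For example:
# criterion = torch.nn.CrossEntropyLoss()
# logits = torch.randn(4, 10)           # fake (N, C) network outputs
# targets = torch.randint(0, 10, (4,))  # fake (N,) class indices
# print(criterion(logits, targets))     # a single scalar loss tensor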
def get_train_loader(batch_size):
    # train_loader wraps all of the samples selected by the sampler and iterates
    # over them in groups of batch_size
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, sampler=train_sample, num_workers=2)
    # cnt = 0
    # # The block below displays images and labels from the dataset
    # for image, labels in train_loader:
    #     if cnt >= 3:
    #         break
    #     for i in range(batch_size):
    #         print("label:", labels[i])
    #         print(image)
    #         img = image[i]  # e.g. image[0] is a 3*32*32 array, i.e. the first of the
    #                         # batch_size images in the batch (3 in that demo run)
    #         img = img.numpy()  # convert the FloatTensor to an ndarray
    #         # print("the shape of the img:", img.shape)  # 3*32*32
    #         img = np.transpose(img, (1, 2, 0))  # move the channel dim last (32*32*3), because imshow
    #         # only accepts scalar data (M, N), RGB data (M, N, 3), or (M, N, 4) RGBA images
    #         # (float or uint8), i.e. colours with an alpha/transparency channel
    #         # print("the shape of the img after converted:", img.shape)
    #         plt.imshow(img)  # with the conversion done the image can be rendered
    #         plt.show()
    #     cnt += 1
return train_loader
test_loader = torch.utils.data.DataLoader(test_set, batch_size=4, sampler=test_sample, num_workers=2)
val_loader = torch.utils.data.DataLoader(train_set, batch_size=64, sampler=validation_sample, num_workers=2)
# ==========================================================================================
def trainNet(net, batchsize, n_epochs, learning_rate):
print("================HYPERPARAMETERS===================") # 超参数定义
print("batch-size=", batchsize)
print("n_epochs=", n_epochs)
print("learning_rate=", learning_rate)
print("=============================================")
# 加载数据集上边的方法解释了获取训练数据
print("batchsize:", batchsize)
train_loader = get_train_loader(batchsize)
n_batches = len(train_loader) # n_batches * batchsize = 20000(样本数目)
print("n_batches", n_batches)
loss, optimizer = createlossandoptimizer(net, learning_rate)
training_start_time = time.time() # 开始时间,为了后边统计一个训练花费时间
print("训练即将开始")
# 循环训练 n_epochs是5,也就是重复扫 五遍样本数据,CIFAR10数据集将50000条训练数据分为了五个batch,所以这个地方不要有疑惑
for epoch in range(n_epochs):
running_loss = 0.0
        print_every = n_batches // 10  # report progress ten times per epoch
        print("print_every:", print_every)
start_time = time.time()
total_train_loss = 0
        # A reminder of how enumerate works:
        # >>> seq = ['one', 'two', 'three']
        # >>> for i, element in enumerate(seq, 1):  # the second argument is the starting index
        # ...     print(i, element)
        # 1 one
        # 2 two
        # 3 three
        for i, data in enumerate(train_loader, 0):
            # i runs over 0, 1, 2, ..., n_batches - 1; data holds batch_size images together
            # with their labels, each label being one of the digits 0-9 (one per class)
inputs, labels = data
# print(inputs, labels)
            # (no Variable() wrapping needed: since PyTorch 0.4, tensors track gradients directly)
            optimizer.zero_grad()  # zero all gradients; otherwise every backward() call would accumulate into them
            outputs = net(inputs)
            # print(labels)
            # outputs: (N, C), where C is the number of classes
            # labels:  (N), where N is the batch size and 0 <= labels[i] <= C - 1
            loss_size = loss(outputs, labels)
            loss_size.backward()  # backpropagate to compute the gradients of all parameters
            optimizer.step()      # update the parameters from those gradients
            print(loss_size)
            running_loss += loss_size.item()
            print("running_loss:", running_loss)
            total_train_loss += loss_size.item()
            print("total_train_loss:", total_train_loss)
            # Within an epoch, report once every print_every batches, i.e. at roughly
            # 10%, 20%, 30%, ... of the way through the epoch
            if (i + 1) % print_every == 0:
                print("epoch {}, {:d}% \t training_loss: {:.2f} took: {:.2f}s".format(
                    epoch + 1, int(100 * (i + 1) / n_batches),
                    running_loss / print_every, time.time() - start_time))
                running_loss = 0.0
                start_time = time.time()
        # Validation is usually run once every few epochs; here we run it once per epoch.
        total_val_loss = 0
        # This loop runs 5000 / 64 (the validation batch size) times, i.e. len(val_loader) iterations
for inputs, labels in val_loader:
            # Forward pass (again, no Variable() wrapping is needed in modern PyTorch)
            val_outputs = net(inputs)
val_loss_size = loss(val_outputs, labels)
# print("-------------------------------------------------------", val_loss_size)
            total_val_loss += val_loss_size.item()
# print("val_loader的长度:", len(val_loader))
print("Validation loss = {:.2f}".format(total_val_loss / len(val_loader))) # 求验证集的平均损失是多少
print("Training finished, took {:.2f}s".format(time.time() - training_start_time)) # 所有的Epoch结束,也就是训练结束,计算花费的时间
# Entry point of the script: runs the whole training procedure
if __name__ == "__main__":
CNN = SimpleCNN()
trainNet(CNN, batchsize=32, n_epochs=5, learning_rate=0.001)
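    # Optional follow-up (not in the original article): save the trained weights for reuse.
    # torch.save(CNN.state_dict(), './simple_cnn.pt')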