《深度学习之pytorch实战计算机视觉》
Pytorch官方教程
Pytorch官方文档
本文是采用pytorch进行迁移学习的实战演练,实战目的是为了进一步学习和熟悉pytorch编程。
本文涉及以下内容
两种常见的迁移学习的做法是
1、finetuning:微调。采用在其他数据集上训练好的权重作为初始权重训练模型。
2、冻结前面部分层,只训练后面的一些层。
迁移学习的好处:节省训练的时间,解决小样本问题。
个人感觉迁移学习在自然图像上应用得比较多,尤其是采用ImageNet数据集上训练的权重作为初始权重是一种很普遍的做法。但是在医学影像上,我很少看到有用迁移学习的。
采用的数据集是ImageNet的一个子集——蚂蚁和蜜蜂。每一类约有120张训练图片和75张验证图片。数据很少,从头训练肯定是不够的,但本文采用的是迁移学习。下载地址。
我们来看数据读取和预览的代码。
#--coding:utf-8--
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
# Data augmentation and normalization for training
# Just normalization for validation
# NOTE(review): unlike the later training scripts, this preview pipeline
# deliberately omits transforms.Normalize so images can be displayed directly.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor()
    ]),
}
# Build the datasets and dataloaders, stored as dicts keyed by split.
# Expects a data/train/<class>/ and data/val/<class>/ folder layout.
data_dir = 'data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
# NOTE(review): num_workers>0 at module level without an
# `if __name__ == "__main__"` guard can fail on Windows — confirm platform.
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'val']}
#选择1batch的训练数据进行可视化
def imshow(inp, title=None):
    """Display a CHW image tensor with matplotlib, optionally titled."""
    # Convert torch CHW layout to the HWC layout matplotlib expects.
    img = inp.numpy().transpose((1, 2, 0))
    img = np.clip(img, 0, 1)
    plt.imshow(img)
    if title is not None:
        plt.title(title)
# Number of images per split.
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
# Class names taken from the training folder names.
class_names = image_datasets['train'].classes
# Get a batch of training data
inputs, classes = next(iter(dataloaders['train']))
# Make a grid from batch
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[class_names[x] for x in classes])
plt.show()
图片预览结果:
这部分如果有问题可以看前一篇博文 【Pytorch实战5】数据读取和处理(以脸部关键点检测的数据为例)
查看数据集大小和标签:
# Size of each split (number of images).
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
print(dataset_sizes)
# Mapping from class name to integer label index.
index_classes = image_datasets['train'].class_to_idx
print(index_classes)
本节将搭建三个卷积神经网络模型用于分类:ResNet18(微调)、ResNet18(冻结卷积层)和VGG16。并比较这三个模型的准确率和泛化能力。
本节还涉及以下内容:
先看完整的训练代码。
# --coding:utf-8--
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from tensorboardX import SummaryWriter
# Build the data pipeline: transforms, datasets and loaders, keyed by split.
data_transforms = {
    # Training: random crop + flip for augmentation, then ImageNet normalization.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    # Validation: deterministic resize + center crop, same normalization.
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}
# Expects a data/train/<class>/ and data/val/<class>/ folder layout.
data_dir = 'data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'val']}
# Number of images per split.
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
# Class names taken from the training folder names (ants/bees here).
class_names = image_datasets['train'].classes
# Train on GPU when available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Model training and parameter optimization.
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with the best validation weights.

    Args:
        model: network already moved to ``device``.
        criterion: loss function (e.g. CrossEntropyLoss).
        optimizer: optimizer over the model's trainable parameters.
        scheduler: LR scheduler, stepped once per epoch after training.
        num_epochs: number of epochs to run.

    Returns:
        The model with the highest-validation-accuracy weights loaded.

    NOTE(review): relies on module-level globals ``dataloaders``,
    ``dataset_sizes``, ``device`` and ``writer``.
    """
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # enable dropout / BatchNorm running-stat updates
            else:
                model.eval()   # fix BatchNorm stats, disable dropout

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients.
                optimizer.zero_grad()

                # Build the autograd graph only during training.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only in the training phase.
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Weight the batch loss by batch size (last batch may be smaller).
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            # Fix: since PyTorch 1.1 the scheduler must step AFTER the
            # optimizer steps of the epoch; the original stepped it before
            # training, shifting the LR schedule one epoch early.
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            # .item() converts the 0-dim tensor to a plain float for
            # formatting/comparison/logging.
            epoch_acc = (running_corrects.double() / dataset_sizes[phase]).item()

            writer.add_scalar('loss_%s' % phase, epoch_loss, epoch)
            writer.add_scalar('acc_%s' % phase, epoch_acc, epoch)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Deep-copy the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # Fix: original '{:4f}' (field width 4) was meant to be '{:.4f}'.
    print('Best val Acc: {:.4f}'.format(best_acc))

    # Load the best weights before returning.
    model.load_state_dict(best_model_wts)
    return model
# Fine-tuning: start from ImageNet weights and retrain ALL parameters.
model_ft = models.resnet18(pretrained=True)
writer = SummaryWriter()
num_ftrs = model_ft.fc.in_features
# Replace the 1000-class ImageNet head with a 2-class (ants/bees) head;
# the new layer gets default (random) initialization.
model_ft.fc = nn.Linear(num_ftrs, 2)
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized (full fine-tuning).
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=25)
writer.close()
# Fix: ensure the target directory exists, otherwise torch.save raises
# FileNotFoundError on a fresh checkout.
os.makedirs('models', exist_ok=True)
torch.save(model_ft.state_dict(), 'models/res18.pt')
运行该代码时可能报错:
UnboundLocalError: local variable 'photoshop' referenced before assignment
把Pillow降到5.4.1即可解决问题。官方说7月1号会发布新版Pillow,会解决这个问题。所以7月1号之后更新Pillow到最新版也可。
下面是代码的详解:
可参考:https://github.com/lanpa/tensorboardX
tensorboardX的安装:
pip install tensorboardX(前提是已经装好了tensorflow 和 tensorboard)
可视化相关代码:
# Import the summary writer (TensorBoard logging)
from tensorboardX import SummaryWriter
# Line 121: instantiate the writer (logs to ./runs by default)
writer = SummaryWriter()
# Lines 95-96: log loss and accuracy once per epoch, per phase
writer.add_scalar('loss_%s'%phase, epoch_loss, epoch)
writer.add_scalar('acc_%s'%phase, epoch_acc, epoch)
# Line 136: flush and close the writer when training ends
writer.close()
可视化效果(省略验证集):
运行 tensorboard --logdir runs,打开弹出的网址
和keras.applications类似,该包封装好了很多常见的网络模型,如下图。
我们只需要直接调用修改部分参数就好,不需要再重新搭建网络模型。具体的模型调用方法,相关参数设置,模型性能比较可见
https://pytorch.org/docs/stable/torchvision/models.html
本文会调用ResNet的两个模型和VGG16。上面的代码调用的是ResNet18,将该模型打印出来输出如下:
ResNet(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): Sequential(
(0): BasicBlock(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): BasicBlock(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(layer2): Sequential(
(0): BasicBlock(
(conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(downsample): Sequential(
(0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): BasicBlock(
(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(layer3): Sequential(
(0): BasicBlock(
(conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(downsample): Sequential(
(0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): BasicBlock(
(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(layer4): Sequential(
(0): BasicBlock(
(conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): BasicBlock(
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
(fc): Linear(in_features=512, out_features=1000, bias=True)
)
可以看到模型的最后一层名字叫'fc',上面的代码采用finetuning的方式做迁移学习。前面所有层采用预训练权重,最后一层重写了,采用默认的初始权重。见代码第120,122,123行。
这个是设置模型的状态(mode),验证状态时BN,dropout是固定的,比如BN采用的训练时学到的参数。官方解释是这样的:
看网上的讨论似乎对是否使用model.eval()存在争议。具体请百度“model.eval()”。
GPU训练主要依赖于to()函数,将模型和输入变量送入device("cuda")中(也有其它的写法,如tensor.cuda代替to(device)。具体的相关代码如下:
# Line 44: define the device; where it is defined doesn't matter.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Lines 71-72: move the input tensors onto the device.
inputs = inputs.to(device)
labels = labels.to(device)
# Line 125: move the whole network onto the device.
model_ft = model_ft.to(device)
总结:要想用GPU计算,把涉及计算的所有张量或变量送入GPU即可。
变量(Variable)是pytorch专门为自动求导机制设计的,如今它已经与tensor合并了。可见下图:
这里先复制官方文档看看这个函数的含义。
下面说下我个人的理解。这个函数通常作为上下文管理器使用。它用来规定它作用域内的张量是否要计算梯度,也就是张量的属性requires_grad是否为真。从而避免在inference的时候浪费大量的内存。一个问题是,为什么张量属性requires_grad为真时会浪费大量内存。我们知道,这时候仅仅是做前向传播,而梯度计算是反向传播才做的事情,而且我们也是在phase=='train'的条件下执行的反向传播的代码,也就是说phase=='val'时只涉及前向传播。
当我看到下面这段话时,我才想明白内存节省在哪——不用再去构建用于计算梯度的图了。
When computing the forwards pass, autograd simultaneously performs the requested computations and builds up a graph representing the function that computes the gradient (the .grad_fn
attribute of each torch.Tensor
is an entry point into this graph). When the forwards pass is completed, we evaluate this graph in the backwards pass to compute the gradients.
很简单,就一行。其中state_dict是python字典,保存着模型中所可学习参数。路径的后缀一般为.pt或.pth
torch.save(model.state_dict(), PATH)  # state_dict holds only the learnable parameters; PATH conventionally ends in .pt or .pth
学习率计划 lr_scheduler
官方已经封装好了学习率调整的包lr_scheduler,提供了很多函数可供调用。详见:
https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler
本文采用的函数说明如下:调用的相关代码在6(导入模块),61(step+1),133行(实例化)
训练代码讲解完毕。我们来看测试代码。主要是加载模型。
# Evaluation on the validation set.
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)
model = model.to(device)
# Fix: map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model.load_state_dict(torch.load('models/res18.pt', map_location=device))
model.eval()  # essential: fixes BatchNorm/Dropout to inference behavior

running_corrects = 0
# Inference only — disable autograd graph construction to save memory.
with torch.no_grad():
    for inputs, labels in dataloaders['val']:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        # Predicted class index = argmax over the logits.
        _, preds = torch.max(outputs, 1)
        running_corrects += torch.sum(preds == labels.data)

epoch_acc = running_corrects.double() / dataset_sizes['val']
print(' Acc: {:.4f}'.format(epoch_acc))
准确率为95.42%
我尝试把model.eval注释掉。准确率为85.67%
下面来看用冻结层来做迁移学习。并测试单张数据可视化测试结果。
# Feature extraction: freeze the pretrained backbone, train only the new head.
model_conv = torchvision.models.resnet18(pretrained=True)
for param in model_conv.parameters():
    param.requires_grad = False
# Parameters of newly constructed modules have requires_grad=True by default,
# so only the replacement fc layer will accumulate gradients.
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, 2)
model_conv = model_conv.to(device)

criterion = nn.CrossEntropyLoss()
# Observe that only parameters of the final layer are being optimized,
# as opposed to full fine-tuning above.
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

model_conv = train_model(model_conv, criterion, optimizer_conv,
                         exp_lr_scheduler, num_epochs=25)
# Fix: ensure the target directory exists before saving.
os.makedirs('models', exist_ok=True)
torch.save(model_conv.state_dict(), 'models/res18_0.pt')
测试代码:
# --coding:utf-8--
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from tensorboardX import SummaryWriter
# Build the data pipeline: transforms, datasets and loaders, keyed by split.
data_transforms = {
    # Training: random crop + flip for augmentation, then ImageNet normalization.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    # Validation: deterministic resize + center crop, same normalization.
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}
# Expects a data/train/<class>/ and data/val/<class>/ folder layout.
data_dir = 'data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
# batch_size=1 because this script visualizes predictions one image at a time.
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=1,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'val']}
# Number of images per split.
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
# Class names taken from the training folder names (ants/bees here).
class_names = image_datasets['train'].classes
# Use GPU when available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Single-image prediction visualization: rebuild the architecture, then load
# the frozen-backbone checkpoint.
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)
model = model.to(device)
# Fix: map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model.load_state_dict(torch.load('models/res18_0.pt', map_location=device))
model.eval()  # inference mode for BatchNorm/Dropout
def imshow(inp, title=None):
    """Undo ImageNet normalization and display a CHW tensor as an image."""
    # torch CHW -> matplotlib HWC.
    img = inp.numpy().transpose((1, 2, 0))
    # Invert transforms.Normalize: x = x_norm * std + mean.
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img = std * img + mean
    img = np.clip(img, 0, 1)
    plt.imshow(img)
    if title is not None:
        plt.title(title)
# Inference only: disable gradient tracking to save memory.
with torch.no_grad():
    for i, (inputs, labels) in enumerate(dataloaders['val']):
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        # Predicted class index = argmax over the logits.
        _, preds = torch.max(outputs, 1)
        # batch_size is 1 here, so index 0 is the single image of the batch.
        imshow(inputs.cpu().data[0],'predicted: {}'.format(class_names[preds[0]]))
        plt.show()
# --coding:utf-8--
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from tensorboardX import SummaryWriter
# Build the data pipeline: transforms, datasets and loaders, keyed by split.
data_transforms = {
    # Training: random crop + flip for augmentation, then ImageNet normalization.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    # Validation: deterministic resize + center crop, same normalization.
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}
# Expects a data/train/<class>/ and data/val/<class>/ folder layout.
data_dir = 'data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'val']}
# Number of images per split.
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
# Class names taken from the training folder names (ants/bees here).
class_names = image_datasets['train'].classes
# Train on GPU when available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Model training and parameter optimization.
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with the best validation weights.

    Args:
        model: network already moved to ``device``.
        criterion: loss function (e.g. CrossEntropyLoss).
        optimizer: optimizer over the model's trainable parameters.
        scheduler: LR scheduler, stepped once per epoch after training.
        num_epochs: number of epochs to run.

    Returns:
        The model with the highest-validation-accuracy weights loaded.

    NOTE(review): relies on module-level globals ``dataloaders``,
    ``dataset_sizes``, ``device`` and ``writer``.
    """
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # enable dropout / BatchNorm running-stat updates
            else:
                model.eval()   # fix BatchNorm stats, disable dropout

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the parameter gradients.
                optimizer.zero_grad()

                # Build the autograd graph only during training.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only in the training phase.
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Weight the batch loss by batch size (last batch may be smaller).
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            # Fix: since PyTorch 1.1 the scheduler must step AFTER the
            # optimizer steps of the epoch; the original stepped it before
            # training, shifting the LR schedule one epoch early.
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            # .item() converts the 0-dim tensor to a plain float for
            # formatting/comparison/logging.
            epoch_acc = (running_corrects.double() / dataset_sizes[phase]).item()

            writer.add_scalar('loss_%s' % phase, epoch_loss, epoch)
            writer.add_scalar('acc_%s' % phase, epoch_acc, epoch)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Deep-copy the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # Fix: original '{:4f}' (field width 4) was meant to be '{:.4f}'.
    print('Best val Acc: {:.4f}'.format(best_acc))

    # Load the best weights before returning.
    model.load_state_dict(best_model_wts)
    return model
# Fine-tune VGG16: ImageNet backbone, brand-new classifier ending in 2 outputs.
model_ft = models.vgg16(pretrained=True)
writer = SummaryWriter()
# Replace VGG16's 1000-class classifier with a 2-class (ants/bees) head;
# 25088 = 512 * 7 * 7, the flattened size of VGG16's last conv feature map.
model_ft.classifier = torch.nn.Sequential(
    torch.nn.Linear(25088, 4096),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.5),
    torch.nn.Linear(4096, 4096),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.5),
    torch.nn.Linear(4096, 2))
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized (full fine-tuning).
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=25)
writer.close()
# Fix: ensure the target directory exists before saving.
os.makedirs('models', exist_ok=True)
torch.save(model_ft.state_dict(), 'models/vgg16.pt')
验证集准确率94%