Github库地址:pytorch-ssd/train_ssd.py
创建网络
# `create_net` is a small factory: a lambda that takes the number of classes
# and forwards it to create_mobilenetv2_ssd_lite together with the width
# multiplier read from the command-line args.
create_net = lambda num: create_mobilenetv2_ssd_lite(num, width_mult = args.mb2_width_mult)
# NOTE(review): the MobileNetV2-lite network is paired with the MobileNetV1
# SSD config here — presumably intentional (shared settings); confirm.
config = mobilenetv1_ssd_config
net = create_net(num_classes)
设定transform
# Transform applied to the *training* images, built from the config's image size, mean and std.
train_transform = TrainAugmentation(config.image_size, config.image_mean, config.image_std)
# Transform applied to the ground-truth targets, built from the config's prior
# (anchor) boxes and variances. The trailing 0.5 is presumably the IoU matching
# threshold — TODO confirm against MatchPrior's signature.
target_transform = MatchPrior(config.priors, config.center_variance,
config.size_variance, 0.5)
# Transform applied to the test/validation images.
test_transform = TestTransform(config.image_size, config.image_mean, config.image_std)
获得数据集
# Build one VOC dataset with the training transforms and add it to the list
# of datasets (`datasets` is created outside this excerpt).
dataset = VOCDataset(dataset_path, transform=train_transform,
target_transform=target_transform)
datasets.append(dataset)
# Several datasets may be configured, so they are concatenated into one.
train_dataset = ConcatDataset(datasets)
# Shuffled, batched loader over the combined training set.
train_loader = DataLoader(train_dataset, args.batch_size,
num_workers=args.num_workers,
shuffle=True)
设置freeze
作者的逻辑很棒。它将mobilenet+SSD各个部分拆开来,使得后续的处理也方便了很多。比如这里想要freeze,直接设置net.base_net即可
# Freeze only the backbone, which the network exposes as `net.base_net`.
freeze_net_layers(net.base_net)
def freeze_net_layers(net):
    """Mark every parameter of ``net`` as not requiring gradients.

    Sets ``requires_grad = False`` on each object yielded by
    ``net.parameters()``.

    Args:
        net: object exposing ``parameters()`` (for example a
            ``torch.nn.Module`` or one of its sub-modules).
    """
    for param in net.parameters():
        param.requires_grad = False
网络初始化
# Initialise the network from at most one source; the first option that is
# set on the command line wins.
if args.resume:
    # Continue from a previously saved checkpoint.
    net.load(args.resume)
elif args.base_net:
    # Initialise through the model's base-net loader.
    net.init_from_base_net(args.base_net)
elif args.pretrained_ssd:
    # Initialise through the model's pretrained-SSD loader.
    net.init_from_pretrained_ssd(args.pretrained_ssd)
可选择从上一次保存的模型继续训练(resume),或只用base_net的权重初始化主干网络,或加载整个预训练好的SSD模型。
损失和优化函数
# Detection loss built on the config's prior boxes.
# NOTE(review): center_variance/size_variance are hard-coded here, while
# MatchPrior reads them from `config` — confirm the two agree.
criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3,
center_variance=0.1, size_variance=0.2, device=DEVICE)
# SGD with momentum and weight decay over `params` (built outside this excerpt).
optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
weight_decay=args.weight_decay)
MultiboxLoss是已经写好的、用于多框(multibox)目标检测的损失函数。它包括使用cross entropy计算出的分类loss和用smooth L1计算出来的定位(location)loss
-
优化函数:这是一个使用momentum的mini-batch gradient descent。也就是说,请注意,虽然它名义上叫SGD,但其实就是个MBGD(小批量梯度下降)
这个优化函数还有一个细节,即如何调整学习率
不需要调整学习率的通用流程如下:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-4)
for epoch in range(1000):
    for step, (inputs, targets) in enumerate(loader):
        # Forward pass
        out = model(inputs)
        loss = criterion(out, targets)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        # Parameter update
        optimizer.step()
如果想要调整学习率,通常会定义一种学习率调整的策略scheduler,其通用流程如下:
scheduler = XXXLR(optimizer, ...)  # pick a learning-rate schedule
for epoch in range(100):
    train(...)  # optimizer.zero_grad() and optimizer.step() are still needed inside
    validate(...)
    # Step the scheduler once per epoch, after the optimizer updates; since
    # PyTorch 1.1 stepping it first skips the first value of the schedule.
    scheduler.step()
这块的代码具体如下。
# Build the learning-rate scheduler selected on the command line.
if args.scheduler == 'multi-step':
    # Milestones arrive as a comma-separated string of epoch numbers.
    milestones = [int(v.strip()) for v in args.milestones.split(",")]
    scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1, last_epoch=last_epoch)
elif args.scheduler == 'cosine':
    scheduler = CosineAnnealingLR(optimizer, args.t_max, last_epoch=last_epoch)
else:
    # Fail here with a clear message instead of leaving `scheduler` undefined.
    raise ValueError(f"Unsupported scheduler: {args.scheduler!r}")
对学习率改变的策略做一个总结, 参考。
-
torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1)
将每个参数组的学习率设置为初始lr乘以给定函数(lr_lambda)的返回值,当last_epoch=-1时,设置初始的lr作为lr;
optimizer:封装好的优化器
lr_lambda(function or list):一个根据epoch计算学习率乘法因子的函数,或者由这类函数组成的list(每个参数组对应一个);
last_epoch:最后一个epoch的索引
-
torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.1, last_epoch=-1)
每经过step_size个epoch,学习率都变为当前学习率的gamma倍
-
torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones, gamma=0.1, last_epoch=-1)
每当训练epoch达到milestones中的一个值时,当前学习率乘以gamma得到新的学习率;
milestones为一个数组,如 [50,70]. gamma为倍数。如果learning rate开始为0.01 ,则当epoch为50时变为0.001,epoch 为70 时变为0.0001。
-
torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma, last_epoch=-1)
每个epoch学习率都变为上一个epoch学习率的gamma倍(即第n个epoch的学习率为初始学习率乘以gamma的n次方)
-
torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max, eta_min=0, last_epoch=-1)
利用cos曲线降低学习率,该方法来源SGDR
-
torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)
当参考的评价指标停止改进时,降低学习率,factor为每次下降的比例,训练过程中,当指标连续patience次数还没有改进时,降低学习率
训练网络做迭代
for epoch in range(last_epoch + 1, args.num_epochs):
    # Train for one epoch.
    train(train_loader, net, criterion, optimizer,
          device=DEVICE, debug_steps=args.debug_steps, epoch=epoch)
    # Advance the schedule after the epoch's optimizer updates; since
    # PyTorch 1.1 stepping it first skips the first value of the schedule.
    scheduler.step()
    if epoch % args.validation_epochs == 0 or epoch == args.num_epochs - 1:
        # Validate, then save a checkpoint whose file name records the loss.
        val_loss, val_regression_loss, val_classification_loss = test(val_loader, net, criterion, DEVICE)
        model_path = os.path.join(args.checkpoint_folder, f"{args.net}-Epoch-{epoch}-Loss-{val_loss}.pth")
        net.save(model_path)
        logging.info(f"Saved model {model_path}")
训练过程
def train(loader, net, criterion, optimizer, device, debug_steps=100, epoch=-1):
    """Run one training pass over ``loader``.

    For every batch the tensors are moved to ``device``, the network is run,
    the two losses returned by ``criterion`` are summed, and one
    backward/optimizer step is taken. Whenever the batch index is a non-zero
    multiple of ``debug_steps`` the average losses since the previous report
    are logged and the running totals are reset.

    Args:
        loader: iterable yielding ``(images, boxes, labels)`` batches.
        net: model, called as ``net(images)`` and unpacked into
            ``(confidence, locations)``.
        criterion: called as ``criterion(confidence, locations, labels,
            boxes)`` and unpacked into ``(regression_loss,
            classification_loss)``.
        optimizer: object exposing ``zero_grad()`` and ``step()``.
        device: value passed to each batch tensor's ``.to()``.
        debug_steps: reporting interval, in batches.
        epoch: epoch index, used only in the log message.
    """
    net.train(True)  # equivalent to net.train()
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    # Batches accumulated since the last report. The first window spans
    # indices 0..debug_steps (debug_steps + 1 batches), so the averages must
    # divide by this count rather than by debug_steps.
    window_steps = 0
    for i, (images, boxes, labels) in enumerate(loader):
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)

        # Gradients must be cleared every step, with or without a scheduler.
        optimizer.zero_grad()
        confidence, locations = net(images)
        regression_loss, classification_loss = criterion(confidence, locations, labels, boxes)  # TODO CHANGE BOXES
        loss = regression_loss + classification_loss
        loss.backward()
        optimizer.step()

        # Running totals used only for the periodic progress report below.
        running_loss += loss.item()
        running_regression_loss += regression_loss.item()
        running_classification_loss += classification_loss.item()
        window_steps += 1
        if i and i % debug_steps == 0:
            avg_loss = running_loss / window_steps
            avg_reg_loss = running_regression_loss / window_steps
            avg_clf_loss = running_classification_loss / window_steps
            logging.info(
                "Epoch: %s, Step: %s, Average Loss: %.4f, "
                "Average Regression Loss %.4f, "
                "Average Classification Loss: %.4f",
                epoch, i, avg_loss, avg_reg_loss, avg_clf_loss,
            )
            running_loss = 0.0
            running_regression_loss = 0.0
            running_classification_loss = 0.0
            window_steps = 0
测试过程
def test(loader, net, criterion, device):
    """Evaluate ``net`` over ``loader`` and return the per-batch mean losses.

    The network is switched to eval mode, and the forward pass and loss
    computation run under ``torch.no_grad()``.

    Args:
        loader: iterable yielding ``(images, boxes, labels)`` batches.
        net: model, called as ``net(images)`` and unpacked into
            ``(confidence, locations)``.
        criterion: called as ``criterion(confidence, locations, labels,
            boxes)`` and unpacked into ``(regression_loss,
            classification_loss)``.
        device: value passed to each batch tensor's ``.to()``.

    Returns:
        Tuple ``(mean total loss, mean regression loss, mean classification
        loss)``, each averaged over the number of batches.

    Raises:
        ZeroDivisionError: if ``loader`` yields no batches.
    """
    net.eval()
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    num = 0
    for images, boxes, labels in loader:
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)
        num += 1

        with torch.no_grad():
            confidence, locations = net(images)
            regression_loss, classification_loss = criterion(confidence, locations, labels, boxes)
            loss = regression_loss + classification_loss

        running_loss += loss.item()
        running_regression_loss += regression_loss.item()
        running_classification_loss += classification_loss.item()
    return running_loss / num, running_regression_loss / num, running_classification_loss / num