# Optimizer
nbs = 64  # nominal batch size (the simulated batch size the hyperparameters are tuned for)
accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay
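As a quick sanity check of the two lines above, the sketch below plugs in an assumed total_batch_size of 24 and an assumed starting weight_decay of 0.0005 (both example values, not from the source):
nbs = 64  # nominal batch size the hyperparameters are tuned for
total_batch_size = 24  # assumed example: actual batch size fed to the model
weight_decay = 0.0005  # assumed example starting value of hyp['weight_decay']
accumulate = max(round(nbs / total_batch_size), 1)  # round(64 / 24) = 3 -> step every 3 batches
weight_decay *= total_batch_size * accumulate / nbs  # 24 * 3 / 64 = 1.125 -> 0.0005625
print(accumulate, weight_decay)  # 3 0.0005625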
pg0, pg1, pg2 = [], [], []  # optimizer parameter groups (the model parameters are split into three groups)
pg0_k, pg1_k, pg2_k = [], [], []  # parameter names (keys) for the three groups
for k, v in model.named_parameters():
    v.requires_grad = True
    if '.bias' in k:
        pg2.append(v)  # biases
        pg2_k.append(k)
    elif '.weight' in k and '.bn' not in k:
        pg1.append(v)  # conv/linear weights -> weight decay applied
        pg1_k.append(k)
    else:
        pg0.append(v)  # all else (e.g. BatchNorm weights)
        pg0_k.append(k)
# Choose the optimizer and set the update rule for pg0
if opt.adam:
optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum
else:
optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
# Set the update rule for the conv/linear weights (pg1)
optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay
# Set the update rule for the biases (pg2)
optimizer.add_param_group({'params': pg2}) # add pg2 (biases)
logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
del pg0, pg1, pg2
Here hyp['lr0'] is the initial learning rate, and each group gets its own hyperparameter settings: pg0 (passed to the constructor) uses the momentum from the yaml file with the remaining hyperparameters left at their defaults, while pg1 uses the weight_decay from the yaml file, again with the other hyperparameters at their defaults.
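A minimal sketch for verifying the per-group settings after the three groups are registered, assuming the SGD branch was taken (illustrative only, not part of the original script):
# Each param group is a plain dict; keys not set explicitly fall back to the optimizer defaults
for i, g in enumerate(optimizer.param_groups):  # order: pg0, pg1, pg2
    print(i, 'n_params:', len(g['params']),
          'lr:', g['lr'], 'momentum:', g.get('momentum'),
          'weight_decay:', g['weight_decay'])
# Expected: only group 1 (the weights) shows the yaml weight_decay; groups 0 and 2 show 0.0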
# Set up learning-rate decay; cosine annealing is used here
# The decay follows the formula lf below, driven by the current epoch and the hyperparameter hyp['lrf']
# Scheduler https://arxiv.org/pdf/1812.01187.pdf
# https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf'] # cosine
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
# plot_lr_scheduler(optimizer, scheduler, epochs)
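To see what the cosine factor lf actually does, here is a small standalone sketch with assumed example values for epochs, hyp['lr0'] and hyp['lrf'] (0.01 and 0.2 are illustrative, not quoted from the source):
import math
epochs = 300  # assumed example
lr0, lrf = 0.01, 0.2  # assumed example values of hyp['lr0'] and hyp['lrf']
lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - lrf) + lrf  # same formula as above
for e in (0, epochs // 2, epochs - 1):
    print(e, round(lr0 * lf(e), 6))  # 0 -> 0.01, 150 -> 0.006, 299 -> ~0.002
# The multiplier starts at 1.0 and decays along a half cosine toward lrf, so the final lr is roughly lr0 * lrf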
# Optimize
if ni % accumulate == 0:
    scaler.step(optimizer)  # optimizer.step()
    scaler.update()
    optimizer.zero_grad()
    if ema:
        ema.update(model)
The optimizer holds the hyperparameter values for each parameter group, the parameter tensors themselves, and the optimization state associated with them: defaults stores the default hyperparameter values, param_groups stores each group's hyperparameters together with its parameters, and state stores the per-parameter optimization state (e.g. momentum buffers) built up during training.
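The sketch below simply prints those three pieces (defaults, param_groups, state) so the structure described above can be inspected directly; it is illustrative and not part of the training script:
print(optimizer.defaults)  # default hyperparameter values shared by groups that do not override them
for i, g in enumerate(optimizer.param_groups):  # per-group hyperparameters plus the parameter tensors themselves
    print(i, {k: v for k, v in g.items() if k != 'params'}, 'n_params:', len(g['params']))
print(len(optimizer.state))  # per-parameter state (e.g. momentum buffers); empty until the first optimizer step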
# Scheduler
lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard
scheduler.step()
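Since lr holds one current value per param group (in the order pg0, pg1, pg2), a typical use is logging each group separately; tb_writer below is an assumed TensorBoard SummaryWriter and epoch the current epoch index, both hypothetical names used only for this sketch:
for i, x in enumerate(lr):  # one learning rate per param group
    tb_writer.add_scalar('lr/group_%d' % i, x, epoch)  # assumed SummaryWriter instance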