def set_parameter_requires_grad(model, feature_extracting):
    """Freeze every parameter of ``model`` when feature extracting.

    Args:
        model: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        feature_extracting: When truthy, ``requires_grad`` is set to
            ``False`` on each parameter returned by ``model.parameters()``.
            When falsy the model is left untouched.
    """
    if not feature_extracting:
        return
    for param in model.parameters():
        param.requires_grad = False
# Feature-extraction mode: with the flag set to True, the call below sets
# requires_grad = False on every parameter of `model`.
feature_extract = True
set_parameter_requires_grad(model, feature_extract)
# Cosine lr_lambda: the LR multiplier decays from 1 at epoch 0 down to
# `lrf` at epoch `args.epochs`.
def lambda_cosine(x):
    """Return the cosine-annealed LR multiplier for epoch ``x``."""
    return ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - lrf) + lrf


scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_cosine)


# Warm-up lr_lambda: linear warm-up followed by cosine decay.
# The number of warm-up epochs is usually set to 5 or fewer.
def warm_up_with_cosine_lr(epoch):
    """Return the LR multiplier for ``epoch``.

    The multiplier grows linearly from 0 to 1 over the first
    ``args.warm_up_epochs`` epochs, then follows a half cosine from 1
    down to 0 at ``args.epochs``.
    """
    if epoch <= args.warm_up_epochs:
        return epoch / args.warm_up_epochs
    # Fraction of the post-warm-up phase that has elapsed.
    progress = (epoch - args.warm_up_epochs) / (args.epochs - args.warm_up_epochs)
    return 0.5 * (math.cos(progress * math.pi) + 1)


# Global warm-up: a single lr_lambda is applied to every parameter group.
# NOTE: this rebinds `scheduler`, replacing the cosine-only scheduler
# created above; keep only the one you actually want to use.
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=warm_up_with_cosine_lr)
[ pytorch ] 基本使用丨8. 优化器optimizer的使用
# Different learning rates for different parts of the model:
#   - pretrained (base) layers: small learning rate
#   - fine-tuned head layers:   larger learning rate with warm-up
head_params = list(model.output_heads.parameters())
# A set gives O(1) membership tests when splitting the parameters.
ignored_params = set(map(id, head_params))
base_params = [p for p in model.parameters() if id(p) not in ignored_params]
optimizer = optim.Adam(
    [
        {'params': base_params},  # falls back to the default lr below
        {'params': head_params, 'lr': 0.001},
    ],
    lr=5e-5,
    weight_decay=1e-4,
)
# One lr_lambda per parameter group, in the same order as the groups above:
# cosine decay for the base layers, warm-up + cosine for the heads.
scheduler = lr_scheduler.LambdaLR(
    optimizer, lr_lambda=[lambda_cosine, warm_up_with_cosine_lr]
)
# Mixed-precision training loop with gradient accumulation.
for step, (input_ids, attention_mask, token_type_ids, y) in enumerate(tk):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    y = y.to(device).long()

    # NOTE(review): the original snippet had lost its indentation. The
    # nesting below (accumulate gradients in the inner loop, then take one
    # optimizer step per outer batch) is the assumed intent -- confirm.
    # NOTE(review): every inner iteration reuses the same batch, and the
    # loss is scaled by CFG['accum_iter'] rather than batch_n_iter --
    # verify that these two values are meant to agree.
    for _ in range(batch_n_iter):
        # Only the forward pass and loss computation run under autocast;
        # backward() is called outside the autocast region.
        with autocast():
            output = model(input_ids, attention_mask, token_type_ids).logits
            loss = criterion(output, y) / CFG['accum_iter']
        SCALER.scale(loss).backward()

    SCALER.step(optimizer)
    SCALER.update()
    optimizer.zero_grad()
class multilabel_dropout(torch.nn.Module):
    """Multi-sample dropout classification head.

    Multi-Sample Dropout paper: https://arxiv.org/abs/1905.09788

    The input features are passed through the same dropout layer several
    times; each dropped-out sample goes through a shared linear classifier
    and the resulting logits are averaged.

    Args:
        HIGH_DROPOUT: Dropout probability applied to the input features.
        HIDDEN_SIZE: Base hidden size; the classifier takes
            ``HIDDEN_SIZE * 2`` input features and emits 2 logits.
    """

    def __init__(self, HIGH_DROPOUT, HIDDEN_SIZE):
        # Registering as an nn.Module makes the classifier weights visible
        # to .parameters()/.to() and lets train()/eval() toggle the dropout.
        super().__init__()
        self.high_dropout = torch.nn.Dropout(HIGH_DROPOUT)
        self.classifier = torch.nn.Linear(HIDDEN_SIZE * 2, 2)

    def forward(self, out):
        """Return the mean of the logits over 5 dropout samples of ``out``."""
        return torch.mean(torch.stack([
            self.classifier(self.high_dropout(out))
            for _ in range(5)
        ], dim=0), dim=0)
[thorough-pytorch/6.4 半精度训练.md at main · datawhalechina/thorough-pytorch (github.com)](https://github.com/datawhalechina/thorough-pytorch/blob/main/第六章%20PyTorch进阶训练技巧/6.4%20半精度训练.md#64-半精度训练)
在PyTorch中使用`autocast`配置半精度训练,需要同时在下面三处加以设置:
from torch.cuda.amp import autocast
在模型定义中,使用Python的装饰器方法,用`autocast`装饰模型中的`forward`函数(关于装饰器的使用,可以参考Python相关文档):
# Decorating forward with autocast makes the whole forward pass run under
# automatic mixed precision.
@autocast()
def forward(self, x):
    ...
    return x
在训练过程中,只需将"把数据输入模型"及其之后的部分放入`with autocast():`即可:
for x in train_loader:
    x = x.cuda()
    # Feeding the data to the model, and the steps that follow, run inside
    # the autocast context.
    with autocast():
        output = model(x)
        ...
注意:
半精度训练主要适用于数据本身的size比较大(比如说3D图像、视频等)。当数据本身的size并不大时(比如手写数字MNIST数据集的图片尺寸只有28*28),使用半精度训练则可能不会带来显著的提升。