Implementing Weight Decay and Learning Rate Warmup

1. Implement an L2 regularization (weight decay) interface over the model's parameters to reduce overfitting; in practice the weights are decayed while biases and LayerNorm parameters are excluded, as grouped below
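Before the snippet below will run, a few imports are needed. The original post does not show them, so the block that follows is a minimal sketch assuming PyTorch and the Hugging Face transformers library.

# Assumed imports (not shown in the original snippets):
# AdamW is available as torch.optim.AdamW (and, in older releases, as transformers.AdamW);
# get_linear_schedule_with_warmup comes from transformers.
import logging
import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup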

optimizer_param = list(model.named_parameters())  # named_parameters() yields (name, parameter) pairs for every parameter in the model
"""Group the parameters for weight decay: decay the weights, but exclude biases and LayerNorm parameters."""
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']  # substrings of parameter names that should NOT receive weight decay
# any() returns True if at least one element of the iterable is truthy, and False otherwise
# For every parameter: if its name contains none of the no_decay substrings, apply weight decay; otherwise skip weight decay
optimizer_grouped_parameters = [
    {'params': [param for name, param in optimizer_param if not any(nd in name for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [param for name, param in optimizer_param if any(nd in name for nd in no_decay)], 'weight_decay': 0.0}
]
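# Optional sanity check (not in the original post): list which parameter names land in each
# group, so you can verify that biases and LayerNorm parameters really get weight_decay=0.0.
decay_names = [name for name, _ in optimizer_param if not any(nd in name for nd in no_decay)]
no_decay_names = [name for name, _ in optimizer_param if any(nd in name for nd in no_decay)]
print(f'{len(decay_names)} decayed tensors, {len(no_decay_names)} non-decayed tensors')
# A name such as 'encoder.layer.0.attention.output.LayerNorm.weight' should appear in no_decay_names.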

# Use AdamW, the Adam optimizer with decoupled weight decay
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
# Linear learning-rate warmup: optimizer is the optimizer to schedule, num_warmup_steps is the number of warmup steps,
# and num_training_steps is the total number of training steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=50, num_training_steps=len(train_data) * epochs)


"""梯度置零,反向传播,参数更新"""
model.zero_grad()
loss.backward()
optimizer.step()
scheduler.step()  # advance the learning-rate schedule
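For context, here is a minimal sketch of how these four calls usually sit inside the per-batch training loop. The DataLoader name train_dataloader, the batch keys, and the assumption that the model returns a loss when labels are passed are illustrative, not taken from the original post.

for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:                      # assumed DataLoader over the training set
        model.zero_grad()                               # clear gradients from the previous step
        outputs = model(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'])
        loss = outputs.loss                             # Hugging Face models return the loss when labels are given
        loss.backward()                                 # back-propagate
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # optional: clip gradients for stability
        optimizer.step()                                # update the parameters (with weight decay)
        scheduler.step()                                # advance the warmup/decay schedule once per optimizer step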

2. Freeze the bottom 3 layers of the BERT model

def get_optimizer_grouped_parameters(model, weight_decay, no_grad=None):
    no_decay = ['bias', 'LayerNorm.weight']  # parameter-name substrings that should not be decayed
    if no_grad is not None:
        logging.info('Freezing parameters')
        for name, parameter in model.named_parameters():
            # freeze every parameter whose name matches one of the no_grad prefixes
            parameter.requires_grad = not any(nd in name for nd in no_grad)
            if not parameter.requires_grad:
                logging.info('Frozen: %s', name)
    else:
        for name, parameter in model.named_parameters():
            if not parameter.requires_grad:
                assert False, "parameters to update have requires_grad=False"
    output_params = [
        {'params': [parameter for name, parameter in model.named_parameters() if not any(nd in name for nd in no_decay) and parameter.requires_grad],
         'weight_decay': weight_decay},
        {'params': [parameter for name, parameter in model.named_parameters() if any(nd in name for nd in no_decay) and parameter.requires_grad],
         'weight_decay': 0.0}
    ]
    return output_params

# Learning-rate warmup plus AdamW with weight decay, with the bottom 3 BERT layers frozen
no_grad = ['embeddings'] + ['layer.' + str(i) + '.' for i in range(12) if i < freeze_bottom_layer]            # freeze_bottom_layer = number of bottom encoder layers to freeze (of 12)
output_params = get_optimizer_grouped_parameters(model=model, weight_decay=weight_decay, no_grad=no_grad)    # weight_decay = weight-decay coefficient
optimizer = AdamW(params=output_params, lr=args.learning_rate, eps=1e-8, weight_decay=0.01)   # the per-group 'weight_decay' values set above override this default
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=int(len(feature) * warmup_ratio), num_training_steps=int(len(feature)))   # linear warmup, then linear decay over the training steps
model.to(args.device)
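As a quick check (not part of the original post), you can list the parameters that ended up frozen and count the ones that remain trainable; with freeze_bottom_layer = 3 the frozen names should include the 'embeddings' prefix and the 'layer.0.', 'layer.1.', and 'layer.2.' prefixes.

# Sanity check: which tensors are frozen, and how many parameters still train?
frozen_names = [name for name, p in model.named_parameters() if not p.requires_grad]
trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{len(frozen_names)} frozen tensors, {trainable_count} trainable parameters')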
