This series records some deep learning tricks I find genuinely useful in day-to-day work, covering topics such as performance improvements and engineering optimizations.
The code for this series is kept up to date on GitHub.
Alchemy Series 1: Layer-wise Learning Rates & Gradient Accumulation
Alchemy Series 2: Stochastic Weight Averaging (SWA) & Exponential Moving Average (EMA)
Reading well-written open-source model code always teaches us something, for example this very common but easily overlooked detail: variable initialization.
We should get into the habit of controlling variable initialization ourselves instead of leaving it to arbitrary defaults; training becomes more stable and usually ends up with better final performance.
import tensorflow as tf

# Initialize a variable explicitly
w = tf.get_variable('w', initializer=tf.truncated_normal(shape=[128, 256],
                                                          mean=0.0,
                                                          stddev=1.0))
"""
Truncated normal distribution: tf.truncated_normal
Ordinary normal distribution:  tf.random_normal
Uniform distribution:          tf.random_uniform
"""

# Initializing the weights of a fully connected layer
output = tf.layers.dense(w, 128, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
"""
tf.random_normal_initializer()
tf.glorot_normal_initializer()
tf.initializers.he_normal()
tf.initializers.lecun_normal()
Plus the uniform counterparts of the above, e.g. tf.random_uniform_initializer
"""
Initialization comes in two forms: either you specify the distribution's parameters directly (mean and stddev, as in the code above), or you let the initializer derive them from the layer's dimensions.
For the second form, TensorFlow's fully connected layers offer a fairly rich set of initializers that combine the input and/or output dimensions, and each of them comes in both a normal and a uniform variant.
The normal-distribution variants are the ones listed in the code comment above: glorot_normal, he_normal and lecun_normal.
They all share a common general form with three key parameters: scale, mode and distribution. scale is the variance scaling factor, and mode decides which dimension n the scale is divided by: 'fan_in' (number of input units), 'fan_out' (number of output units) or 'fan_avg' (their average); the resulting standard deviation is roughly sqrt(scale / n).
The distribution parameter then selects between the normal and the uniform variant.
Mapping the named initializers onto these parameters: glorot uses scale=1.0 with mode='fan_avg', he uses scale=2.0 with mode='fan_in', and lecun uses scale=1.0 with mode='fan_in' (the uniform variants follow the same pattern); see the sketch below.
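As a concrete illustration, here is a minimal sketch of that correspondence using tf.variance_scaling_initializer (assuming the TF 1.x API; the exact distribution strings may differ slightly between versions):

import tensorflow as tf

# The common form: stddev is roughly sqrt(scale / n), where n is chosen by `mode`
# ('fan_in' = input units, 'fan_out' = output units, 'fan_avg' = their average).
glorot_like = tf.variance_scaling_initializer(scale=1.0, mode='fan_avg',
                                              distribution='truncated_normal')
he_like = tf.variance_scaling_initializer(scale=2.0, mode='fan_in',
                                          distribution='truncated_normal')
lecun_like = tf.variance_scaling_initializer(scale=1.0, mode='fan_in',
                                             distribution='truncated_normal')
# For the uniform variants, simply switch distribution to 'uniform'.
output = tf.layers.dense(tf.zeros([32, 256]), 128, kernel_initializer=he_like)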
PyTorch does not ship a ready-made truncated-normal initializer to call directly, so you can follow the pattern below.
import math
import torch
import torch.nn as nn


def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        print("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
              "The distribution of values may be incorrect.")

    with torch.no_grad():
        # Values are generated by using a truncated uniform distribution and
        # then using the inverse CDF for the normal distribution.
        # Get upper and lower cdf values
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)

        # Uniformly fill tensor with values from [l, u], then translate to
        # [2l-1, 2u-1].
        tensor.uniform_(2 * l - 1, 2 * u - 1)

        # Use inverse cdf transform for normal distribution to get truncated
        # standard normal
        tensor.erfinv_()

        # Transform to proper mean, std
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)

        # Clamp to ensure it's in the proper range
        tensor.clamp_(min=a, max=b)
        return tensor


def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    # type: (Tensor, float, float, float, float) -> Tensor
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
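Side note: as far as I know, recent PyTorch releases ship this same routine as nn.init.trunc_normal_ with an identical signature, so on a new enough version you can call it directly:

import torch
import torch.nn as nn

w = torch.empty(128, 256)
# Equivalent to trunc_normal_(w, std=.02) above on recent PyTorch versions
nn.init.trunc_normal_(w, mean=0., std=.02, a=-2., b=2.)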
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.weight = nn.Parameter(torch.zeros(128, 256))
        self.linear = nn.Linear(256, 128)
        self.norm = nn.LayerNorm(256)
        # Initialize the raw nn.Parameter
        trunc_normal_(self.weight, std=.02)
        # Initialize the fully connected and Layer Normalization layers
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x):
        pass
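A quick sanity check of this skeleton: after construction, the Linear weights should have a standard deviation of roughly 0.02 and the LayerNorm parameters should be all ones (weight) and all zeros (bias).

model = MyModel()
print(model.linear.weight.std())   # roughly 0.02
print(model.norm.weight.unique())  # tensor([1.])
print(model.norm.bias.unique())    # tensor([0.])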
Next up: layer-wise learning rates. A typical scenario is fine-tuning a pretrained model such as BERT together with newly initialized task layers, where the pretrained weights usually need a much smaller learning rate than the fresh layers. For cases like this (and others), layer-wise learning rates are a good solution.
TensorFlow implementation
import tensorflow as tf

lr_dict = {'bert': 1e-5,
           'default': 1e-3}


def get_train_op(loss: tf.Tensor, global_step: tf.Tensor):
    optimizer_dict = {}
    for key in lr_dict:
        # Any other optimizer can be used here
        optimizer_dict[key] = tf.train.AdamOptimizer(learning_rate=lr_dict[key])
    # Computing the gradients does not depend on the learning rate,
    # so any one of the optimizers will do
    gradients = optimizer_dict['default'].compute_gradients(loss)
    vars_dict = {k: [] for k in lr_dict}
    for grad, var in gradients:
        layer = 'default'  # group the variable falls into by default
        for key in lr_dict:
            if key in var.name:
                layer = key
                break
        vars_dict[layer].append((grad, var))
    train_op_list = []
    for key in vars_dict:
        # Apply each group's gradients with its own learning rate
        # global_step=None means apply_gradients leaves global_step untouched
        train_op_list.append(optimizer_dict[key].apply_gradients(vars_dict[key], global_step=None))
    # Increment global_step exactly once here
    new_global_step = global_step + 1
    train_op_list.append(global_step.assign(new_global_step))
    train_op = tf.group(*train_op_list)
    return train_op
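A hypothetical usage sketch (build_model is a placeholder for whatever graph you build; the only assumption is that the BERT variables live under a name scope containing 'bert'):

loss = build_model()  # placeholder: creates variables named 'bert/...' plus task-specific ones
global_step = tf.train.get_or_create_global_step()
train_op = get_train_op(loss, global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(1000):
        sess.run(train_op)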
PyTorch implementation
Here we use Adam with weight decay (AdamW), and in addition to layer-wise learning rates we also configure weight decay and epsilon per group.
import torch.nn as nn
from transformers import AdamW

lr_dict = {
    'bert': {'lr': 1e-5, 'weight_decay': 0.02, 'eps': 1e-6},
    'default': {'lr': 1e-3, 'weight_decay': 0.01, 'eps': 1e-6},
}


def create_optimizer(model: nn.Module):
    # Set a learning rate (and weight decay / eps) for each parameter group
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters_decay = []
    optimizer_grouped_parameters_no_decay = []
    group_id = {}
    for i, key in enumerate(lr_dict):
        optimizer_grouped_parameters_decay.append({'params': [],
                                                   'weight_decay': lr_dict[key]['weight_decay'],
                                                   'lr': lr_dict[key]['lr'],
                                                   'eps': lr_dict[key]['eps']})
        optimizer_grouped_parameters_no_decay.append({'params': [],
                                                      'weight_decay': 0.0,
                                                      'lr': lr_dict[key]['lr'],
                                                      'eps': lr_dict[key]['eps']})
        group_id[key] = i
    for n, p in model.named_parameters():
        index = group_id['default']
        for key in lr_dict:
            if key in n:
                index = group_id[key]
                break
        if any(nd in n for nd in no_decay):
            optimizer_grouped_parameters_no_decay[index]['params'].append(p)
        else:
            optimizer_grouped_parameters_decay[index]['params'].append(p)
    optimizer = AdamW(
        optimizer_grouped_parameters_decay + optimizer_grouped_parameters_no_decay,
        lr=lr_dict['default']['lr'],
        eps=lr_dict['default']['eps'],
    )
    return optimizer
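A hypothetical usage example: a classification head on top of a pretrained BERT encoder (BertClassifier and num_labels are made up for illustration). Parameter names under self.bert contain "bert", so those weights fall into the 1e-5 group, while the freshly initialized classifier falls into the "default" 1e-3 group.

import torch.nn as nn
from transformers import BertModel


class BertClassifier(nn.Module):
    def __init__(self, num_labels: int = 2):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)


model = BertClassifier()
optimizer = create_optimizer(model)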
Sometimes we want to use a larger model for better performance, e.g. BERT-large outperforms BERT-base, but with limited hardware the GPU memory cannot hold the larger model.
In that case we can usually force the larger model to fit by reducing the batch size, but a batch size that is too small makes the model hard to converge.
Gradient accumulation addresses exactly this dilemma: run the forward and backward pass on several small batches in a row, accumulate their gradients, and only then perform one parameter update, which is roughly equivalent to training with a batch that many times larger.
Note, however, that operations whose behaviour depends on the batch size, such as Batch Normalization, may still incur some performance loss.
TensorFlow implementation
In TensorFlow this is more cumbersome and you have to implement it yourself. The main points to watch (all visible in the code below): keep a non-trainable copy of every trainable variable to accumulate gradients into, divide the loss by `steps_accumulate` so the accumulated gradient matches the large-batch average, and only apply the accumulated gradients and reset the copies once every `steps_accumulate` steps.
import tensorflow as tf

"""
`steps_accumulate` is the number of accumulation steps, i.e. gradients are
accumulated over `steps_accumulate` batches before one parameter update,
which emulates training with a batch size of `steps_accumulate * bs`.
"""


def create_train_op(loss: tf.Tensor,
                    global_step: tf.Tensor,
                    steps_accumulate: int):
    opt = tf.train.AdamOptimizer(0.01)
    tvs = tf.trainable_variables()
    # Create a non-trainable copy of each variable to accumulate gradients into
    accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in tvs]
    # Ops that reset the accumulators to zero
    zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]
    # Compute the current batch's gradients
    gvs = opt.compute_gradients(loss / steps_accumulate, tvs)
    # Add the current batch's gradients to `accum_vars`
    accum_ops = [accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(gvs)]
    # Apply the accumulated gradients to update the parameters
    train_op = opt.apply_gradients([(accum_vars[i], gv[1]) for i, gv in enumerate(gvs)],
                                   global_step=global_step)
    return train_op, accum_ops, zero_ops


def train(loss: tf.Tensor, steps_accumulate: int):
    global_step = tf.train.get_or_create_global_step()
    train_op, accum_ops, zero_ops = create_train_op(loss, global_step, steps_accumulate)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(10000):
            # This assumes the input pipeline is defined with tf.data.Dataset;
            # with placeholders you would also pass feed_dict here
            sess.run(accum_ops)
            if (i + 1) % steps_accumulate == 0:
                sess.run(train_op)
                sess.run(zero_ops)
PyTorch implementation
In PyTorch this is easy: as long as we call backward() without calling optimizer.step(), PyTorch automatically keeps accumulating the gradients for us.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# `steps_accumulate` is the number of accumulation steps, i.e. gradients are
# accumulated over `steps_accumulate` batches before one parameter update,
# which emulates training with a batch size of `steps_accumulate * bs`.


def train(model: nn.Module,
          dataloader: DataLoader,
          optimizer: torch.optim.Optimizer,
          steps_accumulate: int):
    model.zero_grad()
    model.train()
    for i, data in enumerate(dataloader):
        loss = model(data) / steps_accumulate
        loss.backward()
        if (i + 1) % steps_accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()
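A minimal hypothetical usage sketch, assuming a toy model (ToyModel, made up here) whose forward() returns the loss directly, matching the train() signature above:

import torch.nn.functional as F


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(10, 1)

    def forward(self, data):
        x, y = data
        return F.mse_loss(self.linear(x), y)


# Each dataset item is already a mini-batch of 8, so batch_size=None disables re-batching
dataset = [(torch.randn(8, 10), torch.randn(8, 1)) for _ in range(32)]
dataloader = DataLoader(dataset, batch_size=None)
model = ToyModel()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Effective batch size is 8 * 4 = 32
train(model, dataloader, optimizer, steps_accumulate=4)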