Symptom:
Gradient explosion is an extremely common problem in deep learning. It can keep the optimization process from converging, or make the computed values overflow outright; a typical sign is that large numbers of nan or -nan values appear during training, the forward pass stops producing anything useful, and the iterations can no longer proceed normally.
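A cheap way to catch this early is to check the loss and the gradients for non-finite values right after the backward pass. A minimal sketch, assuming an ordinary PyTorch training loop (model, loss, and optimizer here are placeholders, not names from the text):

import torch

def grads_are_finite(model, loss):
    # return False as soon as the loss or any parameter gradient contains nan/inf
    if not torch.isfinite(loss).all():
        return False
    for p in model.parameters():
        if p.grad is not None and not torch.isfinite(p.grad).all():
            return False
    return True

# inside the training loop, right after loss.backward():
# if not grads_are_finite(model, loss):
#     optimizer.zero_grad()   # drop this update instead of stepping on nan gradients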
Causes:
1. Unreasonable weight initialization:
If the initial weights are too large, the earlier layers change faster than the later ones, the weights keep growing, and the gradients explode. Once the gradient magnitudes between layers exceed 1.0, multiplying them repeatedly through the network gives exponential growth, which is exactly a gradient explosion (a toy demonstration follows this list).
2. A hard training task or low-quality samples:
Early in training the loss is then very large, which in turn blows up the gradients.
3. Unreasonable network architecture or training hyperparameters:
A network that is too deep leads to severe gradient accumulation, and an overly large batch size makes learning harder.
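The exponential growth described in cause 1 can be reproduced in a few lines: a deep stack of linear layers with a deliberately large initialization already yields a huge gradient norm on the first backward pass. This is only an illustrative toy (the layer sizes, depth, and std values are arbitrary), not code from any project referenced below:

import torch
import torch.nn as nn

torch.manual_seed(0)
layers = []
for _ in range(30):                               # 30 stacked linear layers
    linear = nn.Linear(64, 64, bias=False)
    nn.init.normal_(linear.weight, std=0.2)       # deliberately too large: per-layer gain > 1
    layers.append(linear)
net = nn.Sequential(*layers)

x = torch.randn(8, 64)
loss = net(x).pow(2).mean()
loss.backward()

total_norm = torch.stack([p.grad.norm() for p in net.parameters()]).norm()
print(f"total gradient norm after one backward pass: {total_norm.item():.3e}")
# with std=0.2 the norm is enormous; with a small std such as 0.05 the same
# network instead shows the opposite problem, a vanishing gradient.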
Given the causes above, there are three main ways to tackle the problem:
1. Change the model initialization strategy and the training hyperparameters
Reference: "YOLOv3 model initialization", https://blog.csdn.net/qq_33270279/article/details/103029130 (a minimal sketch of this idea and the next one is given right after this list)
2. Load a pretrained model
Gradient explosion mostly happens early in training, while the model weights are still in a chaotic state, so fine-tuning from a pretrained model usually avoids the problem.
3. Apply a gradient limiting (clipping) strategy
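For the first two ideas, here is a minimal PyTorch sketch; the Kaiming initialization, the learning rate, the build_model() helper, and the checkpoint path are placeholders chosen for illustration, not values taken from the reference above:

import torch
import torch.nn as nn

def init_weights(m):
    # idea 1: re-initialize conv/linear layers with a scheme that keeps the signal scale stable
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight, nonlinearity='leaky_relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)

model = build_model()                          # hypothetical helper that constructs the network
model.apply(init_weights)

# idea 1 (continued): a smaller learning rate also tames the gradients early in training
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

# idea 2: start from pretrained weights instead of a random initialization
state = torch.load("pretrained.pth", map_location="cpu")   # hypothetical checkpoint path
model.load_state_dict(state, strict=False)                 # strict=False tolerates a different head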
The rest of this note describes the third idea, the gradient limiting strategy, in detail. PyTorch implements it as two utilities in torch.nn.utils, whose source is shown below:
import torch
from math import inf   # 'inf' is used by the infinity-norm branch below; imported so the snippet runs stand-alone

def clip_grad_norm_(parameters, max_norm, norm_type=2):
    r"""Clips gradient norm of an iterable of parameters.

    The norm is computed over all gradients together, as if they were
    concatenated into a single vector. Gradients are modified in-place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.  # norm type; defaults to the L2 norm

    Returns:
        Total norm of the parameters (viewed as a single vector).
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    max_norm = float(max_norm)
    norm_type = float(norm_type)
    if norm_type == inf:
        total_norm = max(p.grad.data.abs().max() for p in parameters)   # largest absolute gradient value
    else:
        total_norm = 0
        for p in parameters:
            param_norm = p.grad.data.norm(norm_type)
            total_norm += param_norm.item() ** norm_type
        total_norm = total_norm ** (1. / norm_type)   # (sum of grad_norm**norm_type) ** (1/norm_type)
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for p in parameters:
            p.grad.data.mul_(clip_coef)   # scale every gradient by the same factor
    return total_norm
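The scaling rule is easy to verify by hand: for a gradient (3, 4) the L2 total norm is 5, so with max_norm = 1 the coefficient is 1/(5 + 1e-6) and the clipped gradient becomes roughly (0.6, 0.8). A quick check (illustrative only):

import torch
import torch.nn as nn

w = nn.Parameter(torch.zeros(2))
w.grad = torch.tensor([3.0, 4.0])                    # L2 norm = 5
total = nn.utils.clip_grad_norm_([w], max_norm=1.0)
print(total, w.grad)                                 # about 5.0 and tensor([0.6000, 0.8000])

PyTorch also provides a second, simpler utility, clip_grad_value_ (next), which clamps each gradient element to [-clip_value, clip_value] independently instead of rescaling the whole gradient vector: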
def clip_grad_value_(parameters, clip_value):
    r"""Clips gradient of an iterable of parameters at specified value.

    Gradients are modified in-place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        clip_value (float or int): maximum allowed value of the gradients
            The gradients are clipped in the range [-clip_value, clip_value]
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    clip_value = float(clip_value)
    for p in filter(lambda p: p.grad is not None, parameters):
        p.grad.data.clamp_(min=-clip_value, max=clip_value)   # clamp each gradient element independently
In a training loop, the clipping call goes between the backward pass and the optimizer step:
...
loss, outputs = model(imgs, targets)                  # forward pass
loss.backward()                                       # backward pass
nn.utils.clip_grad_norm_(model.parameters(), 0.5)     # clip the gradients before updating
optimizer.step()                                      # parameter update
optimizer.zero_grad()                                 # reset the gradients of all model parameters to zero
...
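clip_grad_value_ goes in exactly the same spot, and since clip_grad_norm_ returns the pre-clipping total norm, it is worth logging to see how often the limit is hit. A hedged sketch (the threshold values are illustrative):

# elementwise alternative: clamp every gradient entry to [-1, 1] instead of rescaling the whole vector
nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)

# or keep the norm-based version and log the returned total norm
total_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
if total_norm > 0.5:
    print(f"gradient norm {float(total_norm):.2f} exceeded the limit and was rescaled")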
The same idea on the GPU training side, taking src/blas_kernels.cu as the example:
// These kernels can be used to limit gradients, but they are not restricted to gradients; they can also constrain other quantities such as the loss. If none of the kernels below meets your needs, write your own along the same lines.
// axpy: Y += ALPHA * X (with per-array offsets and strides)
__global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[OFFY+i*INCY] += ALPHA*X[OFFX+i*INCX];
}
// pow: Y = X ** ALPHA, elementwise
__global__ void pow_kernel(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[i*INCY] = pow(X[i*INCX], ALPHA);
}
// const: set every element of X to ALPHA
__global__ void const_kernel(int N, float ALPHA, float *X, int INCX)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) X[i*INCX] = ALPHA;
}
// constrain: clamp every element of X into [-ALPHA, ALPHA]; this is the elementwise gradient-limiting kernel
__global__ void constrain_kernel(int N, float ALPHA, float *X, int INCX)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) X[i*INCX] = fminf(ALPHA, fmaxf(-ALPHA, X[i*INCX]));
}
// supp: zero out elements whose magnitude is below ALPHA
__global__ void supp_kernel(int N, float ALPHA, float *X, int INCX)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) {
        if((X[i*INCX] * X[i*INCX]) < (ALPHA * ALPHA)) X[i*INCX] = 0;
    }
}
// add: X += ALPHA, elementwise
__global__ void add_kernel(int N, float ALPHA, float *X, int INCX)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) X[i*INCX] += ALPHA;
}
// scal: X *= ALPHA, elementwise (useful for rescaling a whole gradient buffer)
__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) X[i*INCX] *= ALPHA;
}
// fill: set every element of X to ALPHA
__global__ void fill_kernel(int N, float ALPHA, float *X, int INCX)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) X[i*INCX] = ALPHA;
}
// copy: Y = X (with per-array offsets and strides)
__global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[i*INCY + OFFY] = X[i*INCX + OFFX];
}
// mul: Y *= X, elementwise
__global__ void mul_kernel(int N, float *X, int INCX, float *Y, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[i*INCY] *= X[i*INCX];
}
Taking the convolutional layer as the example, in src/convolutional_kernels.cu:
void backward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    if(l.smooth){
        smooth_layer(l, 5, l.smooth);
    }
    constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);    // ALPHA = 1, stride = 1: clamp the layer's gradient (delta) to [-1, 1]
    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);    // multiply delta by the derivative of the activation
    ...
}
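Here the gradient limit is hard-wired into the backward pass: constrain_gpu(..., 1, l.delta_gpu, 1) clamps every element of the layer's delta to [-1, 1], which is the same elementwise behaviour as clip_grad_value_ above. A rough PyTorch equivalent of that single call, with delta standing in for l.delta_gpu (illustrative only):

delta.clamp_(min=-1.0, max=1.0)    # same effect as constrain_gpu with ALPHA = 1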