本系列为darknet源码解析,本次解析 src/batchnorm_layer.h 与 src/batchnorm_layer.c 两个文件。batchnorm 层主要完成批归一化(Batch Normalization)操作。
论文名字:Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
论文地址:https://arxiv.org/pdf/1502.03167.pdf
BatchNorm,BN的基本动机与原理是什么?在CNN中如何使用?
神经网络的训练过程的本质是学习数据分布,如果训练数据与测试数据的分布不同将大大降低网络的泛化能力,所以我们需要在训练开始前对所有输入数据进行归一化操作。然后随着网络的训练,每个隐藏层的参数变化使得后一层的输入发生变化,从而每一个batchsize的训练数据的分布也随之变化,使得网络在每次迭代中都需要去拟合不同的数据分布,增大训练的复杂度以及过拟合的风险。BN是在网络的每一层输入之前增加归一化处理(均值为0,标准差为1)将所有批数据强制在统一的数据分布下。
BN层实现:
BN为什么需要进行缩放和平移操作?
BN可以认为是在每一层的输入和上一层的输出之间加入一个计算层,对数据的分布进行额外的约束,从而增强模型的泛化能力。但是BN同时也降低了模型的拟合能力:BN之后的输入分布被强制为均值为0、标准差为1。以Sigmoid激活函数为例,BN之后的输入整体落在函数的非饱和区域,在该区域内激活函数近似于线性变换,这会破坏之前学习到的特征分布。为了能够恢复原始数据分布,引入了可学习的缩放参数γ和平移参数β。仅用这两个参数就可以恢复最优的输入数据分布,并与之前的网络层解耦,从而更加有利于优化的过程,提高模型的泛化能力。
batchnorm_layer.h 的解析如下:
// Public interface of the batch-normalization layer.
#ifndef BATCHNORM_LAYER_H
#define BATCHNORM_LAYER_H
#include "image.h"
#include "layer.h"
#include "network.h"
// Build a batch-normalization layer for `batch` inputs of size w x h x c.
layer make_batchnorm_layer(int batch, int w, int h, int c);
// CPU forward pass of the batch-normalization layer.
void forward_batchnorm_layer(layer l, network net);
// CPU backward pass of the batch-normalization layer.
void backward_batchnorm_layer(layer l, network net);
// The declarations below are only compiled when GPU is defined.
#ifdef GPU
// GPU forward pass of the batch-normalization layer.
void forward_batchnorm_layer_gpu(layer l, network net);
// GPU backward pass of the batch-normalization layer.
void backward_batchnorm_layer_gpu(layer l, network net);
// Transfers scales and rolling statistics via cuda_pull_array
// (presumably device -> host; confirm against the CUDA helper).
void pull_batchnorm_layer(layer l);
// Transfers scales and rolling statistics via cuda_push_array
// (presumably host -> device; confirm against the CUDA helper).
void push_batchnorm_layer(layer l);
#endif
#endif
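求导(记 m 为每个通道参与统计的元素个数,即 batch × 空间尺寸;ε 为防止除零的小常数,代码中取 1e-5):
【公式1-5】 $$\frac{\partial \ell}{\partial \sigma_B^2} = \sum_{i=1}^{m} \frac{\partial \ell}{\partial \hat{x}_i}\,(x_i-\mu_B)\cdot\left(-\frac{1}{2}\right)\left(\sigma_B^2+\epsilon\right)^{-3/2}$$
【公式1-6】 $$\frac{\partial \ell}{\partial x_i} = \frac{\partial \ell}{\partial \hat{x}_i}\cdot\frac{1}{\sqrt{\sigma_B^2+\epsilon}} + \frac{\partial \ell}{\partial \sigma_B^2}\cdot\frac{2\,(x_i-\mu_B)}{m} + \frac{\partial \ell}{\partial \mu_B}\cdot\frac{1}{m}$$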
batchnorm_layer.c 的解析如下:
#include "convolutional_layer.h"
#include "batchnorm_layer.h"
#include "blas.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// 构造归一化层
/**
* 构造归一化层
* @param batch 一个batch包含图片的张数
* @param w 输入图片的高度
* @param h 输入图片的宽度
* @param c 输入图片的通道数
* @return
*/
layer make_batchnorm_layer(int batch, int w, int h, int c)
{
fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
layer l = {0};
l.type = BATCHNORM;
l.batch = batch; // 一个batch中图片的张数
l.h = l.out_h = h; // 输入图片的高度
l.w = l.out_w = w; // 输入图片的宽度
l.c = l.out_c = c; // 输入图片的通道数
// calloc 传入两个参数,分别为元素的数目和每个元素的大小
// calloc 会将所有分配的内存空间中的每一位都初始化为零
l.output = calloc(h * w * c * batch, sizeof(float)); // BN层的所有输出(包含整个batch的)
l.delta = calloc(h * w * c * batch, sizeof(float)); // BN层的误差损失项(包含整个batch的)
l.inputs = w*h*c; // BN层一张输入图片中所有元素的个数
l.outputs = l.inputs; // BN层对应一张输入图片的输出元素个数, BN层不会改变输入输出的个数,通道数也不发生变化
//
l.scales = calloc(c, sizeof(float)); // BN层的gamma参数项
l.scale_updates = calloc(c, sizeof(float)); // gamma更新值
l.biases = calloc(c, sizeof(float)); // BN层的beta参数项
l.bias_updates = calloc(c, sizeof(float)); // beta更新值
int i;
for(i = 0; i < c; ++i){ //gamma初始化为1
l.scales[i] = 1;
}
l.mean = calloc(c, sizeof(float)); // 用于保存每个通道元素的平均值
l.variance = calloc(c, sizeof(float)); // 用于保存每个通道的方差
l.rolling_mean = calloc(c, sizeof(float)); // 保存每个通道均值的滚动平均
l.rolling_variance = calloc(c, sizeof(float)); // 保存每个通道的方差的滚动平均
// BN层的前向, 反向传播函数
l.forward = forward_batchnorm_layer;
l.backward = backward_batchnorm_layer;
#ifdef GPU
l.forward_gpu = forward_batchnorm_layer_gpu;
l.backward_gpu = backward_batchnorm_layer_gpu;
l.output_gpu = cuda_make_array(l.output, h * w * c * batch);
l.delta_gpu = cuda_make_array(l.delta, h * w * c * batch);
l.biases_gpu = cuda_make_array(l.biases, c);
l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);
l.scales_gpu = cuda_make_array(l.scales, c);
l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);
l.mean_gpu = cuda_make_array(l.mean, c);
l.variance_gpu = cuda_make_array(l.variance, c);
l.rolling_mean_gpu = cuda_make_array(l.mean, c);
l.rolling_variance_gpu = cuda_make_array(l.variance, c);
l.mean_delta_gpu = cuda_make_array(l.mean, c);
l.variance_delta_gpu = cuda_make_array(l.variance, c);
l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
#ifdef CUDNN
cudnnCreateTensorDescriptor(&l.normTensorDesc);
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
#endif
#endif
return l;
}
/**
 * Accumulate the gradient of the per-channel scale (gamma).
 *
 * For every channel f, adds sum(delta * x_norm) over all images and spatial
 * positions of that channel to scale_updates[f]. Data is laid out as
 * [batch][channel][spatial].
 *
 * Typical call:
 *   backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
 *
 * @param x_norm        normalized activations
 * @param delta         gradient w.r.t. the layer output
 * @param batch         number of images
 * @param n             number of channels
 * @param size          spatial elements per channel
 * @param scale_updates per-channel accumulator (added to, not overwritten)
 */
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
{
    for (int ch = 0; ch < n; ++ch) {
        float acc = 0;
        for (int img = 0; img < batch; ++img) {
            const int base = size * (ch + n * img);
            for (int k = 0; k < size; ++k) {
                acc += delta[base + k] * x_norm[base + k];
            }
        }
        scale_updates[ch] += acc;
    }
}
// 求y对均值的导数,对应公式 BN 2-2
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
{
int i,j,k;
for(i = 0; i < filters; ++i){
mean_delta[i] = 0;
for (j = 0; j < batch; ++j) {
for (k = 0; k < spatial; ++k) {
int index = j*filters*spatial + i*spatial + k;
mean_delta[i] += delta[index];
}
}
mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
}
}
// 求y对方差的导数,对应公式 BN 2-1
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
{
int i,j,k;
for(i = 0; i < filters; ++i){
variance_delta[i] = 0;
for(j = 0; j < batch; ++j){
for(k = 0; k < spatial; ++k){
int index = j*filters*spatial + i*spatial + k;
variance_delta[i] += delta[index]*(x[index] - mean[i]);
}
}
variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
}
}
// 归一化,对应公式 BN 2-3
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
{
int f, j, k;
for(j = 0; j < batch; ++j){
for(f = 0; f < filters; ++f){
for(k = 0; k < spatial; ++k){
int index = j*filters*spatial + f*spatial + k;
delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
}
}
}
}
/**
 * Placeholder: resizing is not supported for this layer.
 *
 * Only writes "Not implemented" to stderr; none of the arguments are used
 * and the layer is left untouched.
 */
void resize_batchnorm_layer(layer *layer, int w, int h)
{
    (void)layer;
    (void)w;
    (void)h;
    fputs("Not implemented\n", stderr);
}
/**
 * Per-channel mean over the whole mini-batch.
 *
 * mean[f] = sum(x over all images and spatial positions of channel f)
 *           / (batch * spatial).
 * Any previous content of mean is overwritten.
 *
 * Typical call:
 *   mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
 *
 * @param x       input, laid out [batch][filters][spatial]
 * @param batch   number of images
 * @param filters number of channels
 * @param spatial spatial elements per channel
 * @param mean    output, one value per channel
 */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    const float inv_count = 1./(batch * spatial);
    for (int ch = 0; ch < filters; ++ch) {
        float total = 0;
        for (int img = 0; img < batch; ++img) {
            const float *plane = x + (img * filters + ch) * spatial;
            for (int k = 0; k < spatial; ++k) {
                total += plane[k];
            }
        }
        mean[ch] = total * inv_count;
    }
}
/**
 * Per-channel variance over the whole mini-batch.
 *
 * Uses the N-1 divisor: variance[f] = sum((x - mean[f])^2) / (batch*spatial - 1).
 * When a channel has at most one sample (batch*spatial <= 1) the N-1 divisor
 * would be zero and the result NaN, which then poisons every normalized
 * value. In that case the variance is reported as 0 instead.
 * Any previous content of variance is overwritten.
 *
 * @param x        input, laid out [batch][filters][spatial]
 * @param mean     per-channel mean of x
 * @param batch    number of images
 * @param filters  number of channels
 * @param spatial  spatial elements per channel
 * @param variance output, one value per channel
 */
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
{
    const int count = batch * spatial;
    /* Guard the N-1 divisor: a single sample has no spread to estimate. */
    const float scale = (count > 1) ? (float)(1./(count - 1)) : 0.f;
    for (int ch = 0; ch < filters; ++ch) {
        float acc = 0;
        for (int img = 0; img < batch; ++img) {
            const int base = (img * filters + ch) * spatial;
            for (int k = 0; k < spatial; ++k) {
                const float diff = x[base + k] - mean[ch];
                /* Square in double, then narrow back into the float sum. */
                acc = (float)(acc + (double)diff * diff);
            }
        }
        variance[ch] = acc * scale;
    }
}
/**
 * Scale a strided vector in place: X[i*INCX] *= ALPHA for i in [0, N).
 *
 * Typical call:
 *   scal_cpu(l.out_c, .99, l.rolling_mean, 1);
 *
 * @param N     number of elements to scale
 * @param ALPHA scale factor
 * @param X     vector, modified in place
 * @param INCX  stride between consecutive elements of X
 */
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    for (int k = 0, at = 0; k < N; ++k, at += INCX) {
        X[at] *= ALPHA;
    }
}
/**
 * Strided "a*x plus y": Y[i*INCY] += ALPHA * X[i*INCX] for i in [0, N).
 *
 * Typical call:
 *   axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
 *
 * @param N     number of elements
 * @param ALPHA multiplier applied to X
 * @param X     source vector
 * @param INCX  stride between consecutive elements of X
 * @param Y     destination vector, accumulated into
 * @param INCY  stride between consecutive elements of Y
 */
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    for (int k = 0, src = 0, dst = 0; k < N; ++k, src += INCX, dst += INCY) {
        Y[dst] += ALPHA*X[src];
    }
}
/**
 * CPU forward pass of batch normalization.
 *
 * Works in place on l.output. In training mode it normalizes with the
 * statistics of the current mini-batch and updates the rolling averages;
 * otherwise it normalizes with the rolling averages. Both paths finish with
 * the per-channel scale (l.scales) and shift (l.biases).
 *
 * @param l   the layer being evaluated
 * @param net the network; net.input supplies the data for a BATCHNORM layer
 *            and net.train selects the mode
 */
void forward_batchnorm_layer(layer l, network net)
{
    const int total = l.outputs*l.batch;
    const int spatial = l.out_h*l.out_w;
    const int channels = l.out_c;

    // A standalone BATCHNORM layer starts from the network input. For any
    // other layer type l.output is used as it already is (presumably filled
    // by the caller beforehand; confirm at the call sites).
    if (l.type == BATCHNORM) {
        copy_cpu(total, net.input, 1, l.output, 1);
    }
    // Keep the un-normalized values; the backward pass reads l.x.
    copy_cpu(total, l.output, 1, l.x, 1);

    if (!net.train) {
        // Inference: normalize with the rolling statistics.
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, channels, spatial);
    } else {
        // Statistics of the current mini-batch.
        mean_cpu(l.output, l.batch, channels, spatial, l.mean);
        variance_cpu(l.output, l.mean, l.batch, channels, spatial, l.variance);

        // rolling_mean = 0.99 * rolling_mean + 0.01 * mean
        scal_cpu(channels, .99, l.rolling_mean, 1);
        axpy_cpu(channels, .01, l.mean, 1, l.rolling_mean, 1);
        // rolling_variance = 0.99 * rolling_variance + 0.01 * variance
        scal_cpu(channels, .99, l.rolling_variance, 1);
        axpy_cpu(channels, .01, l.variance, 1, l.rolling_variance, 1);

        // Normalize with the batch statistics, then keep a copy in l.x_norm
        // for the scale gradient in the backward pass.
        normalize_cpu(l.output, l.mean, l.variance, l.batch, channels, spatial);
        copy_cpu(total, l.output, 1, l.x_norm, 1);
    }

    // y = gamma * x_hat + beta
    scale_bias(l.output, l.scales, l.batch, channels, spatial);
    add_bias(l.output, l.biases, l.batch, channels, spatial);
}
/**
 * Add a per-channel bias (beta) to every element of a batch, in place.
 *
 * @param output data laid out [batch][n][size], modified in place
 * @param biases one bias per channel
 * @param batch  number of images
 * @param n      number of channels
 * @param size   spatial elements per channel
 */
void add_bias(float *output, float *biases, int batch, int n, int size)
{
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            float *plane = output + (img * n + ch) * size;
            for (int k = 0; k < size; ++k) {
                plane[k] += biases[ch];
            }
        }
    }
}
/**
 * Multiply every element of a batch by its per-channel scale (gamma), in place.
 *
 * Typical call:
 *   scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
 *
 * @param output data laid out [batch][n][size], modified in place
 * @param scales one scale factor per channel
 * @param batch  number of images
 * @param n      number of channels
 * @param size   spatial elements per channel
 */
void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            float *plane = output + (img * n + ch) * size;
            for (int k = 0; k < size; ++k) {
                plane[k] *= scales[ch];
            }
        }
    }
}
/**
 * CPU backward pass of batch normalization.
 *
 * Accumulates the gradients of beta and gamma, then rewrites l.delta in place
 * into the gradient with respect to the layer input. When the network is not
 * training, the rolling statistics are used in place of the batch statistics.
 *
 * @param l   the layer being back-propagated
 * @param net the network; net.delta receives the result for a BATCHNORM layer
 */
void backward_batchnorm_layer(layer l, network net)
{
    const int spatial = l.out_w*l.out_h;
    const int channels = l.out_c;
    // Pick the statistics matching the mode the forward pass ran in.
    float *mean = net.train ? l.mean : l.rolling_mean;
    float *variance = net.train ? l.variance : l.rolling_variance;

    // d(beta): per-channel sum of delta.
    backward_bias(l.bias_updates, l.delta, l.batch, channels, spatial);
    // d(gamma): per-channel sum of delta * x_norm.
    backward_scale_cpu(l.x_norm, l.delta, l.batch, channels, spatial, l.scale_updates);

    // Undo the scale step: delta now holds the gradient w.r.t. x_hat.
    scale_bias(l.delta, l.scales, l.batch, channels, spatial);

    // Gradients w.r.t. the mean and the variance.
    mean_delta_cpu(l.delta, variance, l.batch, channels, spatial, l.mean_delta);
    variance_delta_cpu(l.x, l.delta, mean, variance, l.batch, channels, spatial, l.variance_delta);
    // Combine the three paths into the gradient w.r.t. the input, in place.
    normalize_delta_cpu(l.x, mean, variance, l.mean_delta, l.variance_delta, l.batch, channels, spatial, l.delta);

    // A standalone BATCHNORM layer hands its input gradient to net.delta.
    if (l.type == BATCHNORM) {
        copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
    }
}
/**
 * Accumulate the gradient of the per-channel bias (beta).
 *
 * For every image and channel, adds sum_array() over that channel's `size`
 * contiguous delta values to bias_updates[channel].
 *
 * Typical call:
 *   backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
 *
 * @param bias_updates per-channel accumulator (added to, not overwritten)
 * @param delta        gradient laid out [batch][n][size]
 * @param batch        number of images
 * @param n            number of channels
 * @param size         spatial elements per channel
 */
void backward_bias(float *bias_updates, float *delta, int batch, int n , int size) {
    int offset = 0;
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            // offset == size * (ch + img * n): start of this channel plane
            bias_updates[ch] += sum_array(delta + offset, size);
            offset += size;
        }
    }
}
#ifdef GPU
/**
 * Transfer the scales and both rolling statistics (l.c floats each) with
 * cuda_pull_array, pairing each *_gpu buffer with its host counterpart.
 * Presumably this copies device -> host; confirm against cuda_pull_array.
 * Biases are not transferred here.
 */
void pull_batchnorm_layer(layer l)
{
    const int channels = l.c;

    cuda_pull_array(l.scales_gpu, l.scales, channels);
    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, channels);
    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, channels);
}
/**
 * Transfer the scales and both rolling statistics (l.c floats each) with
 * cuda_push_array, pairing each *_gpu buffer with its host counterpart.
 * Presumably this copies host -> device; confirm against cuda_push_array.
 * Biases are not transferred here.
 */
void push_batchnorm_layer(layer l)
{
    const int channels = l.c;

    cuda_push_array(l.scales_gpu, l.scales, channels);
    cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, channels);
    cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, channels);
}
// GPU forward pass of batch normalization.
// Training mode uses cuDNN when CUDNN is defined, otherwise the hand-written
// kernels; inference mode always uses the hand-written kernels with the
// rolling statistics.
void forward_batchnorm_layer_gpu(layer l, network net)
{
// A standalone BATCHNORM layer starts from the network input.
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
// Keep the un-normalized values in l.x_gpu; the backward pass reads them.
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
if (net.train) {
#ifdef CUDNN
float one = 1;
float zero = 0;
// NOTE(review): argument roles follow the cuDNN API as read here: input
// l.x_gpu, output l.output_gpu, .01 looks like the running-average factor
// and .00001 the epsilon. Confirm against the cuDNN reference.
cudnnBatchNormalizationForwardTraining(cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
&one,
&zero,
l.dstTensorDesc,
l.x_gpu,
l.dstTensorDesc,
l.output_gpu,
l.normTensorDesc,
l.scales_gpu,
l.biases_gpu,
.01,
l.rolling_mean_gpu,
l.rolling_variance_gpu,
.00001,
l.mean_gpu,
l.variance_gpu);
#else
// Statistics of the current mini-batch.
fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
// rolling = 0.99 * rolling + 0.01 * batch statistic, for mean and variance.
scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
// Same copy as the one made before the train/inference branch.
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
// Normalize in place and keep the normalized values for the backward pass.
normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
// y = gamma * x_hat + beta
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif
} else {
// Inference: normalize with the rolling statistics, then scale and shift.
normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
}
}
// GPU backward pass of batch normalization.
// Uses cuDNN when CUDNN is defined, otherwise the hand-written kernels.
// l.delta_gpu ends up holding the gradient with respect to the layer input.
void backward_batchnorm_layer_gpu(layer l, network net)
{
// Outside training, use the rolling statistics instead of the batch ones.
// l is passed by value, so this only redirects the local copy's pointers.
if(!net.train){
l.mean_gpu = l.rolling_mean_gpu;
l.variance_gpu = l.rolling_variance_gpu;
}
#ifdef CUDNN
float one = 1;
float zero = 0;
// NOTE(review): argument roles follow the cuDNN API as read here: x is
// l.x_gpu, dy is l.delta_gpu, and dx is written into l.x_norm_gpu, which
// serves as a scratch buffer. Confirm against the cuDNN reference.
cudnnBatchNormalizationBackward(cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
&one,
&zero,
&one,
&one,
l.dstTensorDesc,
l.x_gpu,
l.dstTensorDesc,
l.delta_gpu,
l.dstTensorDesc,
l.x_norm_gpu,
l.normTensorDesc,
l.scales_gpu,
l.scale_updates_gpu,
l.bias_updates_gpu,
.00001,
l.mean_gpu,
l.variance_gpu);
// Move the result from l.x_norm_gpu into l.delta_gpu.
copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
// Accumulate the gradients of beta and gamma.
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
// Undo the scale step: delta becomes the gradient w.r.t. x_hat.
scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
// Gradients w.r.t. the mean and variance, then w.r.t. the input (in place).
fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
// A standalone BATCHNORM layer hands its input gradient to net.delta_gpu.
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif