Caffe's LRNLayer supports two modes: cross-channel normalization (ACROSS_CHANNELS) and within-channel normalization (WITHIN_CHANNEL). This post covers the cross-channel mode.
ReLU has the desirable property of not requiring input normalization to prevent saturation, but the AlexNet paper found that LRN still helps generalization. The cross-channel mode of LRN is defined as:

$$b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\left(k+\frac{\alpha}{n} \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\left(a_{x,y}^{j}\right)^{2}\right)^{\beta}}$$

where $a_{x,y}^{i}$ denotes the activity of a neuron computed by applying kernel $i$ at position $(x, y)$ and then the ReLU nonlinearity. The sum runs over $n$ "adjacent" channels at the same spatial position, and $N$ is the total number of channels in the layer. The constants $k$, $n$, $\alpha$, and $\beta$ are hyperparameters, and $b_{x,y}^{i}$ is the response-normalized activity.
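To make the formula concrete, here is a minimal, unoptimized sketch of cross-channel LRN on a single CHW tensor. This is my own illustration rather than Caffe code; `lrn_forward_naive` and its parameters are hypothetical names.

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Naive cross-channel LRN: b[i] = a[i] / (k + alpha/n * sum_j a[j]^2)^beta,
// where j runs over the n channels centered on i, clipped to [0, channels-1].
void lrn_forward_naive(const std::vector<float>& a, std::vector<float>& b,
                       int channels, int height, int width,
                       int n, float k, float alpha, float beta) {
  const int spatial = height * width;
  for (int i = 0; i < channels; ++i) {
    const int j_lo = std::max(0, i - n / 2);
    const int j_hi = std::min(channels - 1, i + n / 2);
    for (int s = 0; s < spatial; ++s) {
      float sum = 0.f;
      for (int j = j_lo; j <= j_hi; ++j) {
        const float v = a[j * spatial + s];
        sum += v * v;  // (a_{x,y}^j)^2
      }
      b[i * spatial + s] =
          a[i * spatial + s] / std::pow(k + alpha / n * sum, beta);
    }
  }
}
```

The inner sum makes this O(n) per output element; the Caffe code below removes that factor with a sliding window.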
Caffe's LRNLayer uses a sliding-window implementation on both CPU and GPU: spatial positions are processed in parallel while the window slides along the channel dimension. During the forward pass the layer caches the denominator term $k+\frac{\alpha}{n} \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}(a_{x,y}^{j})^{2}$ for reuse when computing gradients in the backward pass.
scale_ is the buffer shared between the forward and backward passes (why is it initialized with an element-wise assignment loop rather than a copy?). padded_square adds size_ - 1 zero-padding channels in total, split between the front and the back, and alpha_over_size is $\frac{\alpha}{n}$.
```cpp
template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelForward_cpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  Dtype* scale_data = scale_.mutable_cpu_data();
  // start with the constant value
  for (int i = 0; i < scale_.count(); ++i) {
    scale_data[i] = k_;
  }
  Blob<Dtype> padded_square(1, channels_ + size_ - 1, height_, width_);
  Dtype* padded_square_data = padded_square.mutable_cpu_data();
  caffe_set(padded_square.count(), Dtype(0), padded_square_data);
  Dtype alpha_over_size = alpha_ / size_;
```
Each image is processed independently. caffe_sqr computes the element-wise square; writing it at an offset of pre_pad_ channels into padded_square_data skips the front padding, so the buffer holds $(a_{x,y}^{j})^{2}$ flanked by zeros.
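For intuition, assume size_ = 5, so pre_pad_ = (size_ - 1) / 2 = 2, and channels_ = 6 (the concrete numbers here are my own example). The padded buffer then has 6 + 5 - 1 = 10 channels, and sliding a 5-channel window across it clips the sum at the borders automatically:

```text
padded_square channels: [ 0  0  a0² a1² a2² a3² a4² a5²  0  0 ]
window for output c=0:  [ 0  0  a0² a1² a2² ]              → sum over j ∈ [0,2]
window for output c=3:            [ a1² a2² a3² a4² a5² ]  → sum over j ∈ [1,5]
```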
```cpp
  // go through the images
  for (int n = 0; n < num_; ++n) {
    // compute the padded square
    caffe_sqr(channels_ * height_ * width_,
        bottom_data + bottom[0]->offset(n),
        padded_square_data + padded_square.offset(0, pre_pad_));
```
caffe_axpy computes $Y=\alpha X+Y$. Keeping the output address fixed while accumulating $\frac{\alpha}{n}(a_{x,y}^{j})^{2}$ over size_ consecutive channels produces the denominator $N$ for the first channel.
```cpp
    // Create the first channel scale
    for (int c = 0; c < size_; ++c) {
      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
          padded_square_data + padded_square.offset(0, c),
          scale_data + scale_.offset(n, 0));
    }
```
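As an aside, the math helpers used throughout this walkthrough are element-wise wrappers; the loops below are my paraphrase of their semantics (the real implementations in caffe/util/math_functions dispatch to MKL/BLAS).

```cpp
#include <cmath>

// Element-wise reference semantics (paraphrased, signatures simplified).
template <typename T> void sqr_ref(int n, const T* a, T* y) {
  for (int i = 0; i < n; ++i) y[i] = a[i] * a[i];        // caffe_sqr
}
template <typename T> void axpy_ref(int n, T alpha, const T* x, T* y) {
  for (int i = 0; i < n; ++i) y[i] += alpha * x[i];      // caffe_axpy
}
template <typename T> void powx_ref(int n, const T* a, T b, T* y) {
  for (int i = 0; i < n; ++i) y[i] = std::pow(a[i], b);  // caffe_powx
}
template <typename T> void mul_ref(int n, const T* a, const T* b, T* y) {
  for (int i = 0; i < n; ++i) y[i] = a[i] * b[i];        // caffe_mul
}
template <typename T> void div_ref(int n, const T* a, const T* b, T* y) {
  for (int i = 0; i < n; ++i) y[i] = a[i] / b[i];        // caffe_div
}
```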
Each subsequent channel reuses the previous channel's $N$: copy it, add the channel entering the window at the head, and subtract the channel leaving at the tail, so the update costs O(1) per channel instead of re-summing size_ terms. Afterwards scale_data holds $N = k+ \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\frac{\alpha}{n}(a_{x,y}^{j})^{2}$ for every channel.
```cpp
    for (int c = 1; c < channels_; ++c) {
      // copy previous scale
      caffe_copy<Dtype>(height_ * width_,
          scale_data + scale_.offset(n, c - 1),
          scale_data + scale_.offset(n, c));
      // add head
      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
          padded_square_data + padded_square.offset(0, c + size_ - 1),
          scale_data + scale_.offset(n, c));
      // subtract tail
      caffe_axpy<Dtype>(height_ * width_, -alpha_over_size,
          padded_square_data + padded_square.offset(0, c - 1),
          scale_data + scale_.offset(n, c));
    }
  }
```
Finally, compute $y = \frac{x}{N^{\beta}}$:
```cpp
  // In the end, compute output
  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, top_data);
  caffe_mul<Dtype>(scale_.count(), top_data, bottom_data, top_data);
}
```
On the GPU, the LRNFillScale kernel computes $N = k+ \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\frac{\alpha}{n}(a_{x,y}^{j})^{2}$ with one thread per spatial position $(n, h, w)$; each thread loops over all channels at its position. CAFFE_GET_BLOCKS derives the block count from the number of threads.
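CAFFE_GET_BLOCKS is essentially a ceiling division of the thread count by the block size; a sketch of the idea (the exact constant varies across Caffe versions):

```cpp
// Enough blocks of CAFFE_CUDA_NUM_THREADS threads each to cover N elements.
const int CAFFE_CUDA_NUM_THREADS = 512;  // version-dependent constant
inline int CAFFE_GET_BLOCKS(const int N) {
  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}
```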
```cpp
template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelForward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  // First, compute scale
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  Dtype* scale_data = scale_.mutable_gpu_data();
  // We will launch one kernel for each pixel location, and have the kernel
  // go through all the channels.
  int n_threads = num_ * height_ * width_;
  // NOLINT_NEXT_LINE(whitespace/operators)
  LRNFillScale<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
      n_threads, bottom_data, num_, channels_, height_, width_, size_,
      alpha_ / size_, k_, scale_data);
  CUDA_POST_KERNEL_CHECK;
```
The LRNComputeOutput kernel then computes $y = \frac{x}{N^{\beta}}$, one thread per output element:
```cpp
  n_threads = bottom[0]->count();
  // NOLINT_NEXT_LINE(whitespace/operators)
  LRNComputeOutput<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
      n_threads, bottom_data, scale_data, -beta_, top_data);
  CUDA_POST_KERNEL_CHECK;
}
```
Inside LRNFillScale, which fills $N = k+ \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\frac{\alpha}{n}(a_{x,y}^{j})^{2}$, each thread first recovers its spatial position from the one-dimensional index:
```cpp
  CUDA_KERNEL_LOOP(index, nthreads) {
    // find out the local offset
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;
    const int offset = (n * channels * height + h) * width + w;
    const int step = height * width;
    const Dtype* const in_off = in + offset;
    Dtype* const scale_off = scale + offset;
```
The kernel again works as a sliding window. It first accumulates the sum of squares of the leading post_pad elements:
```cpp
    int head = 0;
    const int pre_pad = (size - 1) / 2;
    const int post_pad = size - pre_pad - 1;
    Dtype accum_scale = 0;
    // fill the scale at [n, :, h, w]
    // accumulate values
    while (head < post_pad && head < channels) {
      accum_scale += in_off[head * step] * in_off[head * step];
      ++head;
    }
```
It then maintains a sliding window of width size, keeping the running sum of squares inside the window and emitting the scale value of channel head - post_pad:
```cpp
    // both add and subtract
    while (head < channels) {
      accum_scale += in_off[head * step] * in_off[head * step];
      if (head - size >= 0) {
        accum_scale -= in_off[(head - size) * step]
            * in_off[(head - size) * step];
      }
      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
      ++head;
    }
```
Finally, the scale values of the last post_pad channels are emitted:
```cpp
    // subtract only
    while (head < channels + post_pad) {
      if (head - size >= 0) {
        accum_scale -= in_off[(head - size) * step]
            * in_off[(head - size) * step];
      }
      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
      ++head;
    }
  }
```
LRNComputeOutput computes $y = \frac{x}{N^{\beta}}$:
```cpp
  CUDA_KERNEL_LOOP(index, nthreads) {
    out[index] = in[index] * pow(scale[index], negative_beta);
  }
```
For the backward pass, write $y_i = x_i N^{-\beta}$ with $N = k + \frac{\alpha}{n}\sum_j x_j^2$. Differentiating gives:

$$\begin{aligned} \frac{dy_i}{dx_i} &= \frac{N^\beta - x_i\beta N^{\beta-1}\cdot \frac{2\alpha}{n} x_i}{N^{2\beta}} = \frac{1 - \frac{2\alpha\beta}{n} N^{-1} x_i x_i}{N^{\beta}}\\ \frac{dy_i}{dx_j} &= \frac{-x_i\beta N^{\beta-1}\cdot \frac{2\alpha}{n} x_j}{N^{2\beta}} = \frac{-\frac{2\alpha\beta}{n} N^{-1} x_i x_j}{N^{\beta}} \end{aligned}$$
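A quick way to sanity-check these derivatives is a finite-difference probe on a tiny vector. The standalone sketch below is my own (not part of Caffe); it takes the window to cover all channels so the clipping bounds drop out, and compares the analytic $\frac{dy_i}{dx_j}$ against a central difference.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// y_i = x_i / N^beta with N = k + alpha/n * sum_j x_j^2 (window = all channels).
// Analytic Jacobian: dy_i/dx_j = delta_ij * N^-beta
//                                - (2*alpha*beta/n) * x_i * x_j * N^-(beta+1).
int main() {
  const double k = 2.0, alpha = 1e-4, beta = 0.75, eps = 1e-6;
  std::vector<double> x = {0.5, -1.2, 2.0, 0.3};
  const int n = static_cast<int>(x.size());

  auto y = [&](const std::vector<double>& v, int i) {
    double N = k;
    for (double vj : v) N += alpha / n * vj * vj;
    return v[i] / std::pow(N, beta);
  };

  double N = k;
  for (double xj : x) N += alpha / n * xj * xj;

  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      const double analytic = (i == j ? std::pow(N, -beta) : 0.0)
          - 2.0 * alpha * beta / n * x[i] * x[j] * std::pow(N, -(beta + 1));
      std::vector<double> xp = x, xm = x;
      xp[j] += eps;
      xm[j] -= eps;
      const double numeric = (y(xp, i) - y(xm, i)) / (2 * eps);
      std::printf("dy%d/dx%d analytic=%.9f numeric=%.9f\n",
                  i, j, analytic, numeric);
    }
  }
  return 0;
}
```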
scale_ holds the $N = k+ \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\frac{\alpha}{n}(a_{x,y}^{j})^{2}$ computed in the forward pass. padded_ratio and accum_ratio are temporary buffers: padded_ratio is a single feature map with size_ - 1 extra padding channels, and accum_ratio is a single channel that accumulates sums across channels. cache_ratio_value is $\frac{2\alpha\beta}{n}$.
```cpp
template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelBackward_cpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  const Dtype* top_data = top[0]->cpu_data();
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* scale_data = scale_.cpu_data();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  Blob<Dtype> padded_ratio(1, channels_ + size_ - 1, height_, width_);
  Blob<Dtype> accum_ratio(1, 1, height_, width_);
  Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data();
  Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data();
  // We hack a little bit by using the diff() to store an additional result
  Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();
  caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);
  Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;
```
bottom_diff is first set to $N^{-\beta} \frac{\partial E}{\partial y}$:
```cpp
  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, bottom_diff);
  caffe_mul<Dtype>(scale_.count(), top_diff, bottom_diff, bottom_diff);
```
inverse_pre_pad is the number of front-padding channels used in the backward pass. Each image in the batch is handled separately, and padded_ratio is filled with $N^{-1} y\frac{\partial E}{\partial y}$:
```cpp
  // go through individual data
  int inverse_pre_pad = size_ - (size_ + 1) / 2;
  for (int n = 0; n < num_; ++n) {
    int block_offset = scale_.offset(n);
    // first, compute diff_i * y_i / s_i
    caffe_mul<Dtype>(channels_ * height_ * width_,
        top_diff + block_offset, top_data + block_offset,
        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
    caffe_div<Dtype>(channels_ * height_ * width_,
        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),
        scale_data + block_offset,
        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
```
The first size_ - 1 local channels of padded_ratio are accumulated into accum_ratio as a prefix sum:
```cpp
    // Now, compute the accumulated ratios and the bottom diff
    caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);
    for (int c = 0; c < size_ - 1; ++c) {
      caffe_axpy<Dtype>(height_ * width_, 1.,
          padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
    }
```
Because padded_ratio is padded at both ends, buffer channel c + size_ - 1 is the last local input channel contributing to output channel c. accum_ratio is therefore $\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y}$; for each channel the window advances by adding the entering channel and subtracting the leaving one. accum_ratio_times_bottom is $x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y}$. Since $y_j = N^{-\beta} x_j$, it follows that:
$$\begin{aligned} \frac{\partial E}{\partial x} &= \frac{\partial E}{\partial y} \frac{\partial y}{\partial x} \\ &=\frac{\partial E}{\partial y} \cdot \frac{1-\frac{2\alpha\beta}{n}\, x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} x_j}{N^{\beta}} \\ &=\frac{\partial E}{\partial y} \left(N^{-\beta} - \frac{2\alpha\beta}{n}\, x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\right) \\ &=\frac{\partial E}{\partial y} N^{-\beta} - \frac{2\alpha\beta}{n}\, x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y} \end{aligned}$$
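Reading the last line directly gives a naive reference for the bottom gradient. The sketch below is my own illustration under the same layout as the forward sketch above (names hypothetical); it costs O(size) per element where Caffe's windowed version costs O(1).

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// dE/dx_i = dE/dy_i * N_i^-beta
//           - (2*alpha*beta/n) * x_i * sum_j (N_j^-1 * y_j * dE/dy_j),
// where j runs over the n channels centered on i and scale holds the N_j
// cached by the forward pass.
void lrn_backward_naive(const std::vector<float>& x, const std::vector<float>& y,
                        const std::vector<float>& scale,
                        const std::vector<float>& top_diff,
                        std::vector<float>& bottom_diff,
                        int channels, int spatial, int n,
                        float alpha, float beta) {
  for (int i = 0; i < channels; ++i) {
    const int j_lo = std::max(0, i - n / 2);
    const int j_hi = std::min(channels - 1, i + n / 2);
    for (int s = 0; s < spatial; ++s) {
      float accum = 0.f;
      for (int j = j_lo; j <= j_hi; ++j) {
        const int jdx = j * spatial + s;
        accum += top_diff[jdx] * y[jdx] / scale[jdx];
      }
      const int idx = i * spatial + s;
      bottom_diff[idx] = top_diff[idx] * std::pow(scale[idx], -beta)
          - 2.f * alpha * beta / n * x[idx] * accum;
    }
  }
}
```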
```cpp
    for (int c = 0; c < channels_; ++c) {
      caffe_axpy<Dtype>(height_ * width_, 1.,
          padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),
          accum_ratio_data);
      // compute bottom diff
      caffe_mul<Dtype>(height_ * width_,
          bottom_data + top[0]->offset(n, c),
          accum_ratio_data, accum_ratio_times_bottom);
      caffe_axpy<Dtype>(height_ * width_, -cache_ratio_value,
          accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));
      caffe_axpy<Dtype>(height_ * width_, -1.,
          padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
    }
  }
}
```
The GPU backward flattens the N, H, W dimensions and processes all spatial positions in parallel. For layers deeper in the network the feature maps shrink while the channel count grows, so the available parallelism drops and efficiency falls.
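For a sense of scale (my own back-of-the-envelope numbers): AlexNet applies LRN after conv1 (96 channels on 55×55 maps) and after conv2 (256 channels on 27×27 maps). With a batch of 32, the second launch runs only $32 \times 27 \times 27 = 23{,}328$ threads, each traversing all 256 channels serially.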
```cpp
template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelBackward_gpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  int n_threads = num_ * height_ * width_;
  // NOLINT_NEXT_LINE(whitespace/operators)
  LRNComputeDiff<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
      n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
      bottom[0]->mutable_gpu_diff());
}
```
CUDA_KERNEL_LOOP is a grid-stride loop over all indices. The position offset is again recovered from the one-dimensional index:
```cpp
template <typename Dtype>
__global__ void LRNComputeDiff(const int nthreads,
    const Dtype* const bottom_data, const Dtype* const top_data,
    const Dtype* const scale, const Dtype* const top_diff,
    const int num, const int channels, const int height,
    const int width, const int size, const Dtype negative_beta,
    const Dtype cache_ratio, Dtype* const bottom_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // find out the local offset
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;
    const int offset = (n * channels * height + h) * width + w;
    const int step = height * width;
    const Dtype* const bottom_off = bottom_data + offset;
    const Dtype* const top_off = top_data + offset;
    const Dtype* const scale_off = scale + offset;
    const Dtype* const top_diff_off = top_diff + offset;
    Dtype* const bottom_diff_off = bottom_diff + offset;
```
head indexes the leading channel of the sliding window; the output channel (the window center) trails it at head - post_pad. pre_pad counts the window positions in front of the center and is never shorter than post_pad (it is longer when size is even). The kernel first accumulates the post_pad leading values of $N^{-1} y\frac{\partial E}{\partial y}$ belonging to the first output channel:
```cpp
    int head = 0;
    const int pre_pad = size - (size + 1) / 2;
    const int post_pad = size - pre_pad - 1;
    Dtype accum_ratio = 0;
    // accumulate values
    while (head < post_pad && head < channels) {
      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
          scale_off[head * step];
      ++head;
    }
```
The main loop advances head to channels, producing $\frac{\partial E}{\partial x}$ for the first channels - post_pad output channels. accum_ratio is $\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y}$; since there is no buffer holding $N^{-1} y\frac{\partial E}{\partial y}$ per position, each term is computed twice, once when it enters the window and once when it leaves. cache_ratio is $\frac{2\alpha\beta}{n}$.
As derived above, using $y_j = N^{-\beta} x_j$:

$$\frac{\partial E}{\partial x} = \frac{\partial E}{\partial y} N^{-\beta} - \frac{2\alpha\beta}{n}\, x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y}$$
```cpp
    // both add and subtract
    while (head < channels) {
      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
          scale_off[head * step];
      if (head - size >= 0) {
        accum_ratio -= top_diff_off[(head - size) * step] *
            top_off[(head - size) * step] / scale_off[(head - size) * step];
      }
      bottom_diff_off[(head - post_pad) * step] =
          top_diff_off[(head - post_pad) * step]
              * pow(scale_off[(head - post_pad) * step], negative_beta)
          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
      ++head;
    }
```
The tail loop handles the last post_pad output channels:
```cpp
    // subtract only
    while (head < channels + post_pad) {
      if (head - size >= 0) {
        accum_ratio -= top_diff_off[(head - size) * step] *
            top_off[(head - size) * step] / scale_off[(head - size) * step];
      }
      bottom_diff_off[(head - post_pad) * step] =
          top_diff_off[(head - post_pad) * step]
              * pow(scale_off[(head - post_pad) * step], negative_beta)
          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
      ++head;
    }
  }
}
```