LRNLayer in Caffe

Caffe's LRNLayer supports two modes:

  • CrossChannel: the well-known Local Response Normalization, introduced in AlexNet and used in several early networks;
  • WithinChannel: an implementation unique to Caffe, not seen in deployed networks; this article skips it.

ReLU has the desirable property of not requiring input normalization to keep it from saturating. The AlexNet paper nevertheless found that LRN helps generalization. The CrossChannel formula, as Caffe implements it (note the $\frac{\alpha}{n}$ scaling), is:

$$b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\left(k+\frac{\alpha}{n} \sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)}\bigl(a_{x,y}^{j}\bigr)^{2}\right)^{\beta}}$$

Here $a_{x,y}^{i}$ denotes the activity of the neuron computed by kernel $i$ at position $(x, y)$, after the ReLU nonlinearity. The sum runs over $n$ "adjacent" channels at the same spatial position, and $N$ is the total number of channels in the layer. The constants $k$, $n$, $\alpha$, $\beta$ are hyperparameters, and $b_{x,y}^{i}$ is the response-normalized activation. Below we write $s$ for the denominator base $k+\frac{\alpha}{n}\sum_j (a_{x,y}^{j})^{2}$, matching the scale_ buffer in the code.
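To make the formula concrete, here is a minimal, self-contained C++ sketch of cross-channel LRN at a single spatial position, assuming an odd window size; lrn_cross_channel is a hypothetical helper, not a Caffe API:

#include <algorithm>
#include <cmath>
#include <vector>

// Naive cross-channel LRN at one (x, y) position. `a` holds the
// activations across all N channels at that position; `size` is the
// window width n (assumed odd here).
std::vector<float> lrn_cross_channel(const std::vector<float>& a,
                                     float k, float alpha, int size,
                                     float beta) {
  const int N = static_cast<int>(a.size());
  std::vector<float> b(N);
  for (int i = 0; i < N; ++i) {
    float sum = 0.f;
    for (int j = std::max(0, i - size / 2);
         j <= std::min(N - 1, i + size / 2); ++j) {
      sum += a[j] * a[j];  // (a^j)^2 over the clipped window
    }
    b[i] = a[i] / std::pow(k + alpha / size * sum, beta);
  }
  return b;
}

The sliding-window implementation analyzed below produces the same values without the inner loop over the window.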

Caffe's LRNLayer uses a sliding-window implementation on both CPU and GPU: spatial positions are processed in parallel, with a loop along the channel dimension. The forward pass caches $s = k+\frac{\alpha}{n} \sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)}(a_{x,y}^{j})^{2}$ in scale_ for the backward pass, which needs this quantity both as $s^{-\beta}$ and as $s^{-1}$.

LRNLayer::CrossChannelForward_cpu

scale_ is a buffer shared by the forward and backward passes; it is initialized by assigning the scalar constant k_ to every element (there is no source blob to copy from).
padded_square pads the channel dimension with size_ - 1 extra zero channels, split before and after.
alpha_over_size is $\frac{\alpha}{n}$.

template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelForward_cpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  Dtype* scale_data = scale_.mutable_cpu_data();
  // start with the constant value
  for (int i = 0; i < scale_.count(); ++i) {
    scale_data[i] = k_;
  }
  Blob<Dtype> padded_square(1, channels_ + size_ - 1, height_, width_);
  Dtype* padded_square_data = padded_square.mutable_cpu_data();
  caffe_set(padded_square.count(), Dtype(0), padded_square_data);
  Dtype alpha_over_size = alpha_ / size_;

Each image is processed independently.
caffe_sqr squares its input elementwise.
Writing past the first pre_pad_ channels of padded_square_data places the squares $(a_{x,y}^{j})^{2}$ in the middle of the padded buffer.
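The non-MKL build of caffe_sqr reduces to an elementwise loop; a sketch from my reading of math_functions (the MKL build calls vsSqr instead):

// Elementwise square, y[i] = a[i] * a[i] (sketch of the non-MKL path).
template <typename Dtype>
void caffe_sqr(const int n, const Dtype* a, Dtype* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a[i] * a[i];
  }
}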

  // go through the images
  for (int n = 0; n < num_; ++n) {
    // compute the padded square
    caffe_sqr(channels_ * height_ * width_,
        bottom_data + bottom[0]->offset(n),
        padded_square_data + padded_square.offset(0, pre_pad_));

caffe_axpy computes $Y=\alpha X+Y$.
Keeping the output address fixed while accumulating $\frac{\alpha}{n}(a_{x,y}^{j})^{2}$ over size_ channels yields the scale $s$ of the first channel.
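For reference, the float specialization of caffe_axpy in math_functions.cpp is a thin wrapper over BLAS saxpy (sketched here):

// Y = alpha * X + Y over N contiguous elements (stride 1).
template <>
void caffe_axpy<float>(const int N, const float alpha, const float* X,
    float* Y) {
  cblas_saxpy(N, alpha, X, 1, Y, 1);
}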

    // Create the first channel scale
    for (int c = 0; c < size_; ++c) {
      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
          padded_square_data + padded_square.offset(0, c),
          scale_data + scale_.offset(n, 0));
    }

Each subsequent channel is computed incrementally from the previous one: add the channel entering the window at the head, subtract the channel leaving at the tail.
scale_data holds $s = k+ \frac{\alpha}{n}\sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)}\bigl(a_{x,y}^{j}\bigr)^{2}$.
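In recurrence form, with $c$ indexing channels and out-of-range channels contributing zero (which is exactly what the zero padding in padded_square provides):

$$s^{c} = s^{c-1} + \frac{\alpha}{n}\bigl(a^{\,c + \text{size\_} - 1 - \text{pre\_pad\_}}\bigr)^{2} - \frac{\alpha}{n}\bigl(a^{\,c - 1 - \text{pre\_pad\_}}\bigr)^{2}$$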

    for (int c = 1; c < channels_; ++c) {
      // copy previous scale
      caffe_copy<Dtype>(height_ * width_,
          scale_data + scale_.offset(n, c - 1),
          scale_data + scale_.offset(n, c));
      // add head
      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
          padded_square_data + padded_square.offset(0, c + size_ - 1),
          scale_data + scale_.offset(n, c));
      // subtract tail
      caffe_axpy<Dtype>(height_ * width_, -alpha_over_size,
          padded_square_data + padded_square.offset(0, c - 1),
          scale_data + scale_.offset(n, c));
    }
  }

Finally, compute $y = \frac{x}{s^{\beta}}$:

  // In the end, compute output
  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, top_data);
  caffe_mul<Dtype>(scale_.count(), top_data, bottom_data, top_data);
}

LRNLayer::CrossChannelForward_gpu

The GPU forward is split into two kernels:

  • LRNFillScale: fills scale_
  • LRNComputeOutput: computes the output from it

LRNFillScale computes $s = k+ \frac{\alpha}{n}\sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)}\bigl(a_{x,y}^{j}\bigr)^{2}$; each thread handles one $(n, h, w)$ position and walks the channels.
CAFFE_GET_BLOCKS computes the block count from the thread count.
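CAFFE_GET_BLOCKS is a ceiling division by the block size, roughly as defined in Caffe's device_alternate.hpp (CAFFE_CUDA_NUM_THREADS is 512 or 1024 depending on the CUDA architecture):

// Number of blocks needed so that N threads cover all elements.
inline int CAFFE_GET_BLOCKS(const int N) {
  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}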

template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelForward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  // First, compute scale
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  Dtype* scale_data = scale_.mutable_gpu_data();
  // We will launch one kernel for each pixel location, and have the kernel
  // go through all the channels.
  int n_threads = num_ * height_ * width_;
  // NOLINT_NEXT_LINE(whitespace/operators)
  LRNFillScale<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
      n_threads, bottom_data, num_, channels_, height_, width_, size_,
      alpha_ / size_, k_, scale_data);
  CUDA_POST_KERNEL_CHECK;

LRNComputeOutput computes $y = \frac{x}{s^{\beta}}$, one output element per thread.

  n_threads = bottom[0]->count();
  // NOLINT_NEXT_LINE(whitespace/operators)
  LRNComputeOutput<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
      n_threads, bottom_data, scale_data, -beta_, top_data);
  CUDA_POST_KERNEL_CHECK;
}  

LRNFillScale

$s = k+ \frac{\alpha}{n}\sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)}\bigl(a_{x,y}^{j}\bigr)^{2}$
The spatial position is recovered from the 1-D thread index.
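With one thread per $(n, h, w)$ triple, the decomposition below inverts

$$\text{index} = (n H + h) W + w, \qquad \text{offset} = (n C H + h) W + w$$

so that in_off[c * step] addresses element $(n, c, h, w)$.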

  CUDA_KERNEL_LOOP(index, nthreads) {
    // find out the local offset
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;
    const int offset = (n * channels * height + h) * width + w;
    const int step = height * width;
    const Dtype* const in_off = in + offset;
    Dtype* const scale_off = scale + offset;

This is again a sliding window.
First, accumulate the squared values of the first post_pad channels.

    int head = 0;
    const int pre_pad = (size - 1) / 2;
    const int post_pad = size - pre_pad - 1;
    Dtype accum_scale = 0;
    // fill the scale at [n, :, h, w]
    // accumulate values
    while (head < post_pad && head < channels) {
      accum_scale += in_off[head * step] * in_off[head * step];
      ++head;
    }

Maintain a sliding window of width size over the channels, tracking the sum of squares inside it.
Each iteration writes the scale value of channel head - post_pad.
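Both this loop and the tail loop below maintain the invariant that, when channel $c = \text{head} - \text{post\_pad}$ is written,

$$\text{accum\_scale} = \sum_{t=\max(0,\,c-\text{pre\_pad})}^{\min(N-1,\,c+\text{post\_pad})} a_t^2, \qquad s_c = k + \frac{\alpha}{n}\,\text{accum\_scale}$$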

    // both add and subtract
    while (head < channels) {
      accum_scale += in_off[head * step] * in_off[head * step];
      if (head - size >= 0) {
        accum_scale -= in_off[(head - size) * step]
                       * in_off[(head - size) * step];
      }
      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
      ++head;
    }

Compute the scale values of the final post_pad channels; here the window only shrinks.

    // subtract only
    while (head < channels + post_pad) {
      if (head - size >= 0) {
        accum_scale -= in_off[(head - size) * step]
                       * in_off[(head - size) * step];
      }
      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
      ++head;
    }
  }

LRNComputeOutput

$y = \frac{x}{s^{\beta}}$

  CUDA_KERNEL_LOOP(index, nthreads) {
    out[index] = in[index] * pow(scale[index], negative_beta);
  }

LRNLayer::CrossChannelBackward_cpu

Differentiating $y_i = x_i\, s^{-\beta}$, with $\frac{\partial s}{\partial x_j} = \frac{2\alpha}{n} x_j$ for every $j$ in the window (writing $s$ for the scale of the channel in question):

$$\begin{aligned} \frac{dy_i}{dx_i} &= \frac{s^{\beta} - x_i \cdot \beta s^{\beta-1} \cdot \frac{2\alpha}{n} x_i}{s^{2\beta}} = \frac{1 - \frac{2\alpha\beta}{n}\, s^{-1} x_i x_i}{s^{\beta}} \\ \frac{dy_i}{dx_j} &= \frac{-\, x_i \cdot \beta s^{\beta-1} \cdot \frac{2\alpha}{n} x_j}{s^{2\beta}} = \frac{-\,\frac{2\alpha\beta}{n}\, s^{-1} x_i x_j}{s^{\beta}} \end{aligned}$$
scale_ holds $s = k+ \frac{\alpha}{n}\sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)}\bigl(a_{x,y}^{j}\bigr)^{2}$ from the forward pass.
padded_ratio and accum_ratio are temporaries: padded_ratio is a single feature map with padded channels, and accum_ratio accumulates a sum over channels (one $H \times W$ plane).
cache_ratio_value is $\frac{2\alpha\beta}{n}$.

template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelBackward_cpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  const Dtype* top_data = top[0]->cpu_data();
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* scale_data = scale_.cpu_data();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  Blob<Dtype> padded_ratio(1, channels_ + size_ - 1, height_, width_);
  Blob<Dtype> accum_ratio(1, 1, height_, width_);
  Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data();
  Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data();
  // We hack a little bit by using the diff() to store an additional result
  Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();
  caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);
  Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;

bottom_diff is first set to $s^{-\beta}\, \frac{\partial E}{\partial y}$:

  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, bottom_diff);
  caffe_mul<Dtype>(scale_.count(), top_diff, bottom_diff, bottom_diff);

inverse_pre_pad is the number of leading padding channels used in the backward pass.
Each sample in the batch is processed separately.
padded_ratio holds $s^{-1}\, y\, \frac{\partial E}{\partial y}$.

  // go through individual data
  int inverse_pre_pad = size_ - (size_ + 1) / 2;
  for (int n = 0; n < num_; ++n) {
    int block_offset = scale_.offset(n);
    // first, compute diff_i * y_i / s_i
    caffe_mul<Dtype>(channels_ * height_ * width_,
        top_diff + block_offset, top_data + block_offset,
        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
    caffe_div<Dtype>(channels_ * height_ * width_,
        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),
        scale_data + block_offset,
        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));

Accumulating size_ - 1 local channels of padded_ratio gives the initial prefix sum in accum_ratio.

    // Now, compute the accumulated ratios and the bottom diff
    caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);
    for (int c = 0; c < size_ - 1; ++c) {
      caffe_axpy<Dtype>(height_ * width_, 1.,
          padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
    }

Because padded_ratio is padded at both ends, channel c + size_ - 1 of padded_ratio corresponds to the last input channel in channel c's window.
accum_ratio is $\sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)} s_j^{-1}\, y_j\, \frac{\partial E}{\partial y_j}$.
For each channel c, the head of the window is added before use and the tail is subtracted afterwards.
accum_ratio_times_bottom is $x \sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)} s_j^{-1}\, y_j\, \frac{\partial E}{\partial y_j}$.
Using $y_j = s_j^{-\beta}\, x_j$, the chain rule gives

$$\begin{aligned} \frac{\partial E}{\partial x_i} &= \sum_j \frac{\partial E}{\partial y_j}\,\frac{\partial y_j}{\partial x_i} \\ &= \frac{\partial E}{\partial y_i}\, s_i^{-\beta} - \frac{2\alpha\beta}{n}\, x_i \sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)} s_j^{-1}\, x_j\, s_j^{-\beta}\, \frac{\partial E}{\partial y_j} \\ &= \frac{\partial E}{\partial y_i}\, s_i^{-\beta} - \frac{2\alpha\beta}{n}\, x_i \sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)} s_j^{-1}\, y_j\, \frac{\partial E}{\partial y_j} \end{aligned}$$

which is exactly what the loop below computes.

    for (int c = 0; c < channels_; ++c) {
      caffe_axpy<Dtype>(height_ * width_, 1.,
          padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),
          accum_ratio_data);
      // compute bottom diff
      caffe_mul<Dtype>(height_ * width_,
          bottom_data + top[0]->offset(n, c),
          accum_ratio_data, accum_ratio_times_bottom);
      caffe_axpy<Dtype>(height_ * width_, -cache_ratio_value,
          accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));
      caffe_axpy<Dtype>(height_ * width_, -1.,
          padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
    }
  }
}
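As a sanity check on the derivation, the following self-contained sketch compares the analytic gradient against central finite differences for one spatial position; lrn_forward, the tensor values, and the window convention (odd n) are illustrative, not Caffe code:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Naive forward at one position: fills scale s_i and output y_i.
static void lrn_forward(const std::vector<float>& x, float k, float alpha,
                        int n, float beta, std::vector<float>* scale,
                        std::vector<float>* y) {
  const int C = static_cast<int>(x.size());
  scale->resize(C);
  y->resize(C);
  for (int i = 0; i < C; ++i) {
    float sum = 0.f;
    for (int j = std::max(0, i - n / 2);
         j <= std::min(C - 1, i + n / 2); ++j)
      sum += x[j] * x[j];
    (*scale)[i] = k + alpha / n * sum;  // Caffe's scale_
    (*y)[i] = x[i] * std::pow((*scale)[i], -beta);
  }
}

int main() {
  const float k = 1.f, alpha = 1e-2f, beta = 0.75f;
  const int n = 5;  // odd window size
  std::vector<float> x = {0.5f, -1.0f, 2.0f, 0.3f, -0.7f, 1.2f};
  std::vector<float> dy = {0.1f, -0.2f, 0.3f, 0.4f, -0.5f, 0.6f};  // dE/dy
  const int C = static_cast<int>(x.size());
  std::vector<float> scale, y;
  lrn_forward(x, k, alpha, n, beta, &scale, &y);
  for (int i = 0; i < C; ++i) {
    // Analytic gradient from the derivation above.
    float acc = 0.f;
    for (int j = std::max(0, i - n / 2);
         j <= std::min(C - 1, i + n / 2); ++j)
      acc += dy[j] * y[j] / scale[j];
    const float analytic = dy[i] * std::pow(scale[i], -beta)
        - 2.f * alpha * beta / n * x[i] * acc;
    // Numeric gradient of E = sum_j dy_j * y_j by central differences.
    const float eps = 1e-3f;
    std::vector<float> xp = x, xm = x, sp, yp, sm, ym;
    xp[i] += eps;
    xm[i] -= eps;
    lrn_forward(xp, k, alpha, n, beta, &sp, &yp);
    lrn_forward(xm, k, alpha, n, beta, &sm, &ym);
    float numeric = 0.f;
    for (int j = 0; j < C; ++j)
      numeric += dy[j] * (yp[j] - ym[j]) / (2 * eps);
    std::printf("i=%d  analytic=%.6f  numeric=%.6f\n", i, analytic, numeric);
  }
  return 0;
}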

LRNLayer::CrossChannelBackward_gpu

The GPU backward is a single kernel, LRNComputeDiff.

The N, H, W dimensions are flattened and the feature-map positions are processed in parallel.
Layers deeper in the network have small feature maps with many channels, so the degree of parallelism drops and efficiency suffers.

template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelBackward_gpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  int n_threads = num_ * height_ * width_;
  // NOLINT_NEXT_LINE(whitespace/operators)
  LRNComputeDiff<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
      n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
      bottom[0]->mutable_gpu_diff());
}

LRNComputeDiff

CUDA_KERNEL_LOOP is a grid-stride loop over all elements.
The position offset is recovered from the 1-D index.

template <typename Dtype>
__global__ void LRNComputeDiff(const int nthreads,
    const Dtype* const bottom_data, const Dtype* const top_data,
    const Dtype* const scale, const Dtype* const top_diff,
    const int num, const int channels, const int height,
    const int width, const int size, const Dtype negative_beta,
    const Dtype cache_ratio, Dtype* const bottom_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // find out the local offset
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;
    const int offset = (n * channels * height + h) * width + w;
    const int step = height * width;
    const Dtype* const bottom_off = bottom_data + offset;
    const Dtype* const top_off = top_data + offset;
    const Dtype* const scale_off = scale + offset;
    const Dtype* const top_diff_off = top_diff + offset;
    Dtype* const bottom_diff_off = bottom_diff + offset;

head scans the leading edge of the window; the channel being written is head - post_pad.
pre_pad here is size - (size + 1) / 2, so for even size it is one longer than post_pad; for odd size the two are equal.
First, accumulate the post_pad terms $s^{-1}\, y\, \frac{\partial E}{\partial y}$ belonging to the first output channel.

    int head = 0;
    const int pre_pad = size - (size + 1) / 2;
    const int post_pad = size - pre_pad - 1;
    Dtype accum_ratio = 0;
    // accumulate values
    while (head < post_pad && head < channels) {
      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
          scale_off[head * step];
      ++head;
    }

This loop advances until head reaches channels, producing $\frac{\partial E}{\partial x}$ for the first channels - post_pad channels.
accum_ratio is $\sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)} s_j^{-1}\, y_j\, \frac{\partial E}{\partial y_j}$.
There is no buffer holding the per-channel terms $s^{-1}\, y\, \frac{\partial E}{\partial y}$, so each term is computed twice: once when it enters the window and once when it leaves.
cache_ratio is $\frac{2\alpha\beta}{n}$.
The update uses the same expression derived for the CPU backward, with $y_j = s_j^{-\beta}\, x_j$:

$$\frac{\partial E}{\partial x_i} = \frac{\partial E}{\partial y_i}\, s_i^{-\beta} - \frac{2\alpha\beta}{n}\, x_i \sum_{j=\max(0,\,i-n/2)}^{\min(N-1,\,i+n/2)} s_j^{-1}\, y_j\, \frac{\partial E}{\partial y_j}$$

    // both add and subtract
    while (head < channels) {
      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
          scale_off[head * step];
      if (head - size >= 0) {
        accum_ratio -= top_diff_off[(head - size) * step] *
            top_off[(head - size) * step] / scale_off[(head - size) * step];
      }
      bottom_diff_off[(head - post_pad) * step] =
          top_diff_off[(head - post_pad) * step]
            * pow(scale_off[(head - post_pad) * step], negative_beta)
          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
      ++head;
    }

Compute the final post_pad channels, where the window only shrinks.

    // subtract only
    while (head < channels + post_pad) {
      if (head - size >= 0) {
        accum_ratio -= top_diff_off[(head - size) * step] *
            top_off[(head - size) * step] / scale_off[(head - size) * step];
      }
      bottom_diff_off[(head - post_pad) * step] =
          top_diff_off[(head - post_pad) * step]
            * pow(scale_off[(head - post_pad) * step], negative_beta)
          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
      ++head;
    }
  }
}

