Caffe Source Code: The Pooling Layer


The pooling layer (PoolingLayer) plays an important role in deep network architectures. Its job is to shrink the spatial size of the input feature map and enlarge the receptive field of the output feature map; some networks even use a global pooling layer to collapse a 2-D feature map into a 1-D feature vector. Pooling layers have two notable properties: (1) the output feature map is generally smaller than the input feature map; (2) a pooling layer has no learnable parameters, so during backpropagation it only needs to compute the gradient with respect to the input feature map (bottom_diff). Now let's see how PoolingLayer is implemented in Caffe.

caffe.proto

First, let's look at the configuration file to see which user-defined parameters PoolingLayer takes:

message PoolingParameter {
  // Pooling method: MAX, AVE, or STOCHASTIC; defaults to MAX
  enum PoolMethod {
    MAX = 0;
    AVE = 1;
    STOCHASTIC = 2;
  }
  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
  // Pad, kernel size, and stride are all given as a single value for equal
  // dimensions in height and width or as Y, X pairs.
  // Amount of padding applied to the input feature map, default 0;
  // it can be specified separately (pad_h, pad_w) or as one value (pad)
  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
  optional uint32 pad_h = 9 [default = 0]; // The padding height
  optional uint32 pad_w = 10 [default = 0]; // The padding width
  // Pooling kernel size; specified the same two ways as padding
  optional uint32 kernel_size = 2; // The kernel size (square)
  optional uint32 kernel_h = 5; // The kernel height
  optional uint32 kernel_w = 6; // The kernel width
  // Stride; specified the same two ways as padding
  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
  optional uint32 stride_h = 7; // The stride height
  optional uint32 stride_w = 8; // The stride width
  // Computation engine
  enum Engine {
    DEFAULT = 0;
    CAFFE = 1;
    CUDNN = 2;
  }
  optional Engine engine = 11 [default = DEFAULT];
  // If global_pooling then it will pool over the size of the bottom by doing
  // kernel_h = bottom->height and kernel_w = bottom->width
  // Global pooling: if enabled, then
  // kernel_h = bottom_h and kernel_w = bottom_w
  optional bool global_pooling = 12 [default = false];
}
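
As a quick illustration, here is what a max-pooling layer might look like in a .prototxt (a hypothetical snippet, not taken from any particular model):

layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}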

PoolingLayer header file

#ifndef CAFFE_POOLING_LAYER_HPP_
#define CAFFE_POOLING_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief Pools the input image by taking the max, average, etc. within regions.
 *
 * TODO(dox): thorough documentation for Forward, Backward, and proto params.
 */
template <typename Dtype>
class PoolingLayer : public Layer<Dtype> {
 public:
  // Explicit constructor
  explicit PoolingLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "Pooling"; }
  // Exactly one input Blob is allowed
  virtual inline int ExactNumBottomBlobs() const { return 1; }
  virtual inline int MinTopBlobs() const { return 1; }
  // MAX POOL layers can output an extra top blob for the mask;
  // others can only output the pooled inputs.
  virtual inline int MaxTopBlobs() const {
    return (this->layer_param_.pooling_param().pool() ==
            PoolingParameter_PoolMethod_MAX) ? 2 : 1;
  }

 protected:
  // Forward pass
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Backward pass
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  // The following members all correspond to fields in caffe.proto
  int kernel_h_, kernel_w_;
  int stride_h_, stride_w_;
  int pad_h_, pad_w_;
  int channels_;
  // Spatial size of the input feature map
  int height_, width_;
  // Spatial size of the feature map after pooling
  int pooled_height_, pooled_width_;
  bool global_pooling_;
  Blob<Dtype> rand_idx_;
  // max_idx_ deserves a separate explanation.
  // Pooling necessarily loses (spatial) information.
  // Some algorithms (e.g. SegNet) want to keep, for max pooling only, the
  // position (index) in the original feature map of each pooled value; for
  // why this is useful, see the SegNet paper.
  // So max_idx_ stores the position of each max value before pooling.
  Blob<int> max_idx_;
};

}  // namespace caffe

#endif  // CAFFE_POOLING_LAYER_HPP_
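
The indices kept in max_idx_ are flattened per-channel offsets (h * width_ + w). As a minimal sketch (the helper name is illustrative, not part of Caffe), the (h, w) coordinates can be recovered like this:

#include <utility>

// Decode a flattened per-channel index, as stored in max_idx_,
// back into (row, column) coordinates of the bottom feature map.
std::pair<int, int> decode_pool_index(int index, int width) {
  const int h = index / width;  // row in the input feature map
  const int w = index % width;  // column in the input feature map
  return {h, w};
}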

PoolingLayer source file

#include <algorithm>
#include <cfloat>
#include <vector>

#include "caffe/layers/pooling_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

using std::min;
using std::max;

template <typename Dtype>
void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  PoolingParameter pool_param = this->layer_param_.pooling_param();
  // If this is a global pooling layer, the kernel size is forced below;
  // otherwise, the user-specified values are used.
  // Padding and stride follow the same pattern.
  // In short, LayerSetUp sets up kernel, padding, and stride.
  if (pool_param.global_pooling()) {
    CHECK(!(pool_param.has_kernel_size() ||
      pool_param.has_kernel_h() || pool_param.has_kernel_w()))
      << "With Global_pooling: true Filter size cannot specified";
  } else {
    CHECK(!pool_param.has_kernel_size() !=
      !(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
    CHECK(pool_param.has_kernel_size() ||
      (pool_param.has_kernel_h() && pool_param.has_kernel_w()))
      << "For non-square filters both kernel_h and kernel_w are required.";
  }
  CHECK((!pool_param.has_pad() && pool_param.has_pad_h()
      && pool_param.has_pad_w())
      || (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
      << "pad is pad OR pad_h and pad_w are required.";
  CHECK((!pool_param.has_stride() && pool_param.has_stride_h()
      && pool_param.has_stride_w())
      || (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
      << "Stride is stride OR stride_h and stride_w are required.";
  global_pooling_ = pool_param.global_pooling();
  if (global_pooling_) {
    kernel_h_ = bottom[0]->height();
    kernel_w_ = bottom[0]->width();
  } else {
    if (pool_param.has_kernel_size()) {
      kernel_h_ = kernel_w_ = pool_param.kernel_size();
    } else {
      kernel_h_ = pool_param.kernel_h();
      kernel_w_ = pool_param.kernel_w();
    }
  }
  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
  if (!pool_param.has_pad_h()) {
    pad_h_ = pad_w_ = pool_param.pad();
  } else {
    pad_h_ = pool_param.pad_h();
    pad_w_ = pool_param.pad_w();
  }
  if (!pool_param.has_stride_h()) {
    stride_h_ = stride_w_ = pool_param.stride();
  } else {
    stride_h_ = pool_param.stride_h();
    stride_w_ = pool_param.stride_w();
  }
  if (global_pooling_) {
    CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
      << "With Global_pooling: true; only pad = 0 and stride = 1";
  }
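  // Illustrative example: with global_pooling: true and a 7x7 bottom,
  // kernel_h_ = kernel_w_ = 7, so each channel is pooled to a single value.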
  if (pad_h_ != 0 || pad_w_ != 0) {
    CHECK(this->layer_param_.pooling_param().pool()
        == PoolingParameter_PoolMethod_AVE
        || this->layer_param_.pooling_param().pool()
        == PoolingParameter_PoolMethod_MAX)
        << "Padding implemented only for average and max pooling.";
    CHECK_LT(pad_h_, kernel_h_);
    CHECK_LT(pad_w_, kernel_w_);
  }
}

template <typename Dtype>
void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
      << "corresponding to (num, channels, height, width)";
  channels_ = bottom[0]->channels();  // channel count of the input (and output) feature maps
  // Spatial size of the input feature map
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  if (global_pooling_) {
    kernel_h_ = bottom[0]->height();
    kernel_w_ = bottom[0]->width();
  }
  // Output spatial size. The formula below is very important and will come
  // up again for the convolution layer:
  //   output = ceil((input + 2 * pad - kernel) / stride) + 1
  pooled_height_ = static_cast<int>(ceil(static_cast<float>(
      height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
  pooled_width_ = static_cast<int>(ceil(static_cast<float>(
      width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
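  // Worked example (illustrative numbers): height_ = 224, pad_h_ = 0,
  // kernel_h_ = 3, stride_h_ = 2 gives
  //   pooled_height_ = ceil((224 + 0 - 3) / 2.0) + 1 = ceil(110.5) + 1 = 112.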
  if (pad_h_ || pad_w_) {
    // If we have padding, ensure that the last pooling starts strictly
    // inside the image (instead of at the padding); otherwise clip the last.
    // This ensures every pooling window starts strictly inside the input
    // feature map (not in the padding); only the last window in each row or
    // column can spill past the image, and it is clipped here.
    if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) {
      --pooled_height_;
    }
    if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) {
      --pooled_width_;
    }
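    // Illustrative example: height_ = 3, pad_h_ = 1, kernel_h_ = 2,
    // stride_h_ = 2 gives pooled_height_ = ceil((3 + 2 - 2) / 2.0) + 1 = 3,
    // but (3 - 1) * 2 = 4 >= height_ + pad_h_ = 4, so it is clipped to 2.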
    CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_);
    CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_);
  }
  // Allocate memory for the top blob(s)
  top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_,
      pooled_width_);
  if (top.size() > 1) {
    top[1]->ReshapeLike(*top[0]);
  }
  // If max pooling, we will initialize the vector index part.
  if (this->layer_param_.pooling_param().pool() ==
      PoolingParameter_PoolMethod_MAX && top.size() == 1) {
      // Positions of the max values
    max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
        pooled_width_);
  }
  // If stochastic pooling, we will initialize the random index part.
  if (this->layer_param_.pooling_param().pool() ==
      PoolingParameter_PoolMethod_STOCHASTIC) {
    rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
      pooled_width_);
  }
}

// TODO(Yangqing): Is there a faster way to do pooling in the channel-first
// case?
template <typename Dtype>
void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  const int top_count = top[0]->count();
  // We'll output the mask to top[1] if it's of size >1.
  // use_top_mask only matters for max pooling: when it is true, the indices
  // of the max values are output as top[1] and passed on to later layers
  // (SegNet makes use of this).
  const bool use_top_mask = top.size() > 1;
  int* mask = NULL;  // suppress warnings about uninitalized variables
  Dtype* top_mask = NULL;
  // Different pooling methods. We explicitly do the switch outside the for
  // loop to save time, although this results in more code.
  switch (this->layer_param_.pooling_param().pool()) {
  case PoolingParameter_PoolMethod_MAX:
    // Initialize
    if (use_top_mask) {
      top_mask = top[1]->mutable_cpu_data();
      caffe_set(top_count, Dtype(-1), top_mask);
    } else {
      mask = max_idx_.mutable_cpu_data();
      caffe_set(top_count, -1, mask);
    }
    caffe_set(top_count, Dtype(-FLT_MAX), top_data);
    // The main loop
    // The core code of the forward pass
    for (int n = 0; n < bottom[0]->num(); ++n) {  // outer loop over the batch (num = batch_size)
      for (int c = 0; c < channels_; ++c) {  // loop over channels
        for (int ph = 0; ph < pooled_height_; ++ph) {  // two loops over the output spatial size
          for (int pw = 0; pw < pooled_width_; ++pw) {
            // Each output position maps to a window of the input feature map
            int hstart = ph * stride_h_ - pad_h_;
            int wstart = pw * stride_w_ - pad_w_;
            // Make sure the window stays inside the input feature map
            int hend = min(hstart + kernel_h_, height_);
            int wend = min(wstart + kernel_w_, width_);
            hstart = max(hstart, 0);
            wstart = max(wstart, 0);
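            // Illustrative example: ph = 0, pad_h_ = 1, kernel_h_ = 3 gives
            // hstart = -1, which is clamped to 0, and hend = min(2, height_).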
            // pool_index is the linear memory location in the output feature map.
            // In Caffe (as in C++ generally), a data blob is stored linearly in
            // memory; dimensions are just an interpretation. For example, a
            // contiguous array of 12 ints can be viewed as (2,6), (3,4), or
            // (2,2,3) shapes, while occupying exactly the same memory.
            const int pool_index = ph * pooled_width_ + pw;
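            // E.g. with pooled_width_ = 4, output position (ph, pw) = (2, 1)
            // maps to pool_index = 2 * 4 + 1 = 9.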
            // Iterate over the input region corresponding to (ph, pw)
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
                // Likewise, the linear index into the input feature map
                const int index = h * width_ + w;
                // Find the max, and record the index where it occurs
                if (bottom_data[index] > top_data[pool_index]) {
                  top_data[pool_index] = bottom_data[index];
                  if (use_top_mask) {
                    top_mask[pool_index] = static_cast<Dtype>(index);
                  } else {
                    mask[pool_index] = index;
                  }
                }
              }
            }
          }
        }
        // One channel's feature map is done; move on to the next channel.
        // compute offset
        bottom_data += bottom[0]->offset(0, 1);  // advance input pointer by one channel (height_ * width_)
        top_data += top[0]->offset(0, 1);  // advance output pointer by one channel
        // advance the index pointer as well
        if (use_top_mask) {
          top_mask += top[0]->offset(0, 1);
        } else {
          mask += top[0]->offset(0, 1);
        }
      }
    }
    break;
    // The average pooling forward pass is much like the max pooling one
  case PoolingParameter_PoolMethod_AVE:
    for (int i = 0; i < top_count; ++i) {
      top_data[i] = 0;
    }
    // The main loop
    for (int n = 0; n < bottom[0]->num(); ++n) {
      for (int c = 0; c < channels_; ++c) {
        for (int ph = 0; ph < pooled_height_; ++ph) {
          for (int pw = 0; pw < pooled_width_; ++pw) {
            int hstart = ph * stride_h_ - pad_h_;
            int wstart = pw * stride_w_ - pad_w_;
            int hend = min(hstart + kernel_h_, height_ + pad_h_);
            int wend = min(wstart + kernel_w_, width_ + pad_w_);
            int pool_size = (hend - hstart) * (wend - wstart);
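            // Note: pool_size is computed before hend/wend are clipped to the
            // image, so the divisor also counts padded positions in the window.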
            hstart = max(hstart, 0);
            wstart = max(wstart, 0);
            hend = min(hend, height_);
            wend = min(wend, width_);
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
                top_data[ph * pooled_width_ + pw] +=
                    bottom_data[h * width_ + w];
              }
            }
            top_data[ph * pooled_width_ + pw] /= pool_size;
          }
        }
        // compute offset
        bottom_data += bottom[0]->offset(0, 1);
        top_data += top[0]->offset(0, 1);
      }
    }
    break;
  case PoolingParameter_PoolMethod_STOCHASTIC:
    NOT_IMPLEMENTED;
    break;
  default:
    LOG(FATAL) << "Unknown pooling method.";
  }
}
// Backward pass
template <typename Dtype>
void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  // Grab the relevant pointers
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  // Different pooling methods. We explicitly do the switch outside the for
  // loop to save time, although this results in more codes.
  // First, fill the gradient buffer with zeros
  caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
  // We'll output the mask to top[1] if it's of size >1.
  const bool use_top_mask = top.size() > 1;
  const int* mask = NULL;  // suppress warnings about uninitialized variables
  // Max pooling and average pooling differ from here on
  const Dtype* top_mask = NULL;
  switch (this->layer_param_.pooling_param().pool()) {
  // In the backward pass of max pooling, the gradient at the input position
  // that held the max equals the corresponding top_diff value; the gradient
  // everywhere else is 0.
  // So the backward pass of max pooling is sparse!
  case PoolingParameter_PoolMethod_MAX:
    // The main loop
    if (use_top_mask) {
      top_mask = top[1]->cpu_data();
    } else {
      mask = max_idx_.cpu_data();
    }
    for (int n = 0; n < top[0]->num(); ++n) {
      for (int c = 0; c < channels_; ++c) {
        for (int ph = 0; ph < pooled_height_; ++ph) {
          for (int pw = 0; pw < pooled_width_; ++pw) {
            const int index = ph * pooled_width_ + pw;
            // Fetch the index of the max value
            const int bottom_index =
                use_top_mask ? top_mask[index] : mask[index];
            // Route the gradient to the position of the max
            bottom_diff[bottom_index] += top_diff[index];
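            // E.g. if mask[index] == 7, all of top_diff[index] flows into
            // bottom_diff[7]; overlapping windows accumulate via +=.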
          }
        }
        bottom_diff += bottom[0]->offset(0, 1);
        top_diff += top[0]->offset(0, 1);
        if (use_top_mask) {
          top_mask += top[0]->offset(0, 1);
        } else {
          mask += top[0]->offset(0, 1);
        }
      }
    }
    break;
    // Average pooling: distribute each top_diff value evenly over the
    // corresponding input region,
    // i.e. (ph, pw) -> {(hstart:hend), (wstart:wend)}
  case PoolingParameter_PoolMethod_AVE:
    // The main loop
    for (int n = 0; n < top[0]->num(); ++n) {
      for (int c = 0; c < channels_; ++c) {
        for (int ph = 0; ph < pooled_height_; ++ph) {
          for (int pw = 0; pw < pooled_width_; ++pw) {
            int hstart = ph * stride_h_ - pad_h_;
            int wstart = pw * stride_w_ - pad_w_;
            int hend = min(hstart + kernel_h_, height_ + pad_h_);
            int wend = min(wstart + kernel_w_, width_ + pad_w_);
            int pool_size = (hend - hstart) * (wend - wstart);
            hstart = max(hstart, 0);
            wstart = max(wstart, 0);
            hend = min(hend, height_);
            wend = min(wend, width_);
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
                bottom_diff[h * width_ + w] +=
                  top_diff[ph * pooled_width_ + pw] / pool_size;
              }
            }
          }
        }
        // offset
        bottom_diff += bottom[0]->offset(0, 1);
        top_diff += top[0]->offset(0, 1);
      }
    }
    break;
  case PoolingParameter_PoolMethod_STOCHASTIC:
    NOT_IMPLEMENTED;
    break;
  default:
    LOG(FATAL) << "Unknown pooling method.";
  }
}


#ifdef CPU_ONLY
STUB_GPU(PoolingLayer);
#endif

INSTANTIATE_CLASS(PoolingLayer);

}  // namespace caffe
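
To close, here is a minimal sketch of how the layer can be exercised, modeled on the style of Caffe's unit tests (the blob shape and parameter values are illustrative assumptions, not from this file):

#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layers/pooling_layer.hpp"

using namespace caffe;

void pooling_example() {
  // A 1x1x4x4 bottom blob: (num, channels, height, width).
  Blob<float> bottom(1, 1, 4, 4);
  Blob<float> top;
  std::vector<Blob<float>*> bottom_vec(1, &bottom);
  std::vector<Blob<float>*> top_vec(1, &top);

  // 2x2 max pooling with stride 2, as in the earlier .prototxt snippet.
  LayerParameter layer_param;
  PoolingParameter* pool_param = layer_param.mutable_pooling_param();
  pool_param->set_pool(PoolingParameter_PoolMethod_MAX);
  pool_param->set_kernel_size(2);
  pool_param->set_stride(2);

  PoolingLayer<float> layer(layer_param);
  layer.SetUp(bottom_vec, top_vec);    // runs LayerSetUp + Reshape
  layer.Forward(bottom_vec, top_vec);  // top is now 1x1x2x2
}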
