// Threshold::forward_inplace
// Binarizes bottom_top_blob in place: every element strictly greater than
// `threshold` becomes 1.f, everything else becomes 0.f.
//
// bottom_top_blob  blob that is read and overwritten
// opt              supplies num_threads for the OpenMP loop over channels
// Returns 0.
//
// NOTE(review): in the previous listing the text between `q` and `threshold`
// had been lost (the `<channels ...` loop header and the element loop). The
// loops below are reconstructed from the surviving `> threshold ? 1.f : 0.f`
// expression and from the `channels` / `size` locals; confirm against the
// upstream layer implementation.
int Threshold::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h; // elements per channel plane

    // Channels are processed independently, one OpenMP iteration per channel.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            ptr[i] = ptr[i] > threshold ? 1.f : 0.f;
        }
    }

    return 0;
}
// TanH::forward_inplace — in-place entry point of the TanH layer; it reads the
// blob dimensions and starts an OpenMP loop whose index is `q`.
// NOTE(review): this listing is truncated inside the loop header (everything
// after `q` is missing), so the element-wise computation is not visible here.
int TanH::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
// Number of elements in one w x h plane.
int size = w * h;
// Thread count for the loop comes from opt.num_threads.
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q
将大小为1的维度压缩掉,例如:a = [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]],使用squeeze压缩后,b = squeeze(a) = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]。
int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int dims = bottom_blob.dims;
top_blob = bottom_blob;
// 对channels维度进行压缩
if (squeeze_c && dims == 3 && channels == 1)
// 对height维度进行压缩
if (squeeze_h && h == 1)
top_blob = bottom_blob.reshape(w, opt.blob_allocator);
top_blob = bottom_blob.reshape(w, h, opt.blob_allocator);
// 对height维度进行压缩
else if (squeeze_h && dims >= 2 && h == 1)
// 对width维度进行压缩
if (squeeze_w && w == 1)
top_blob = bottom_blob.reshape(channels, opt.blob_allocator);
top_blob = bottom_blob.reshape(w, channels, opt.blob_allocator);
// 对width维度进行压缩
else if (squeeze_w && dims >= 1 && w == 1)
// 对height维度进行压缩
if (squeeze_h && h == 1)
top_blob = bottom_blob.reshape(channels, opt.blob_allocator);
top_blob = bottom_blob.reshape(h, channels, opt.blob_allocator);
if (top_blob.empty())
return -100;
return 0;
// Tile::forward — repeats bottom_blob `tiles` times into top_blob.
// Returns -100 when the output allocation leaves top_blob empty.
// NOTE(review): this listing is truncated inside the copy loop header, and the
// original braces are missing, so only the dim == 0 setup is visible.
int Tile::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
// Tile along the channel direction: the output has channels * tiles channels.
if (dim == 0)
top_blob.create(w, h, channels * tiles, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
const float* ptr = bottom_blob;
// Input element count, taken as the channel step times the channel count.
int size = bottom_blob.cstep * channels;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p=0; p
图1 单通道输入SPP示意图(摘自参考资料2)
图2 channel为256的输入,对应spp原理(摘自参考资料2)
// SPP::forward — spatial pyramid pooling: for each pyramid level p the input
// is divided into (1 << p) x (1 << p) bins and pooled per bin.
// Returns -100 when an allocation leaves a blob empty.
// NOTE(review): this listing has lost its braces and is truncated inside the
// pooling loop header, so the pooling itself is not visible here.
int SPP::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
size_t elemsize = bottom_blob.elemsize;
// Total bins over all levels:
// 1 + 4 + 16 + ... + 4^(pyramid_height - 1) = (4^pyramid_height - 1) / 3
int pyramid_num_bins = ((1 << (pyramid_height * 2)) - 1) / 3;
top_blob.create(pyramid_num_bins, 1, 2, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
float* pyramid_ptr = top_blob;
// all spatial pyramids
for (int p = 0; p < pyramid_height; p++)
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
// Level p has 1 << (2 * p) bins in total;
// per axis (used for the kernel size and stride) that is 1 << p bins.
int num_bins = 1 << p;
// Pooling kernel size and stride along the height; stride equals kernel size.
int kernel_h = ceil(h / (float)num_bins);
int stride_h = kernel_h;
// Rows missing to make num_bins full windows, split across both sides (rounded up).
int remainder_h = stride_h * num_bins - h;
int pad_h = (remainder_h + 1) / 2;
// Pooling kernel size and stride along the width; stride equals kernel size.
int kernel_w = ceil(w / (float)num_bins);
int stride_w = kernel_w;
// Columns missing to make num_bins full windows, split across both sides (rounded up).
int remainder_w = stride_w * num_bins - w;
int pad_w = (remainder_w + 1) / 2;
// max value in NxN window
// avg value in NxN window
// Output width and height for this level.
int outw = num_bins;
int outh = num_bins;
Mat bottom_blob_bordered = bottom_blob;
// Pad with a constant 0.f border when either axis needs padding.
if (pad_h > 0 || pad_w > 0)
copy_make_border(bottom_blob, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
if (bottom_blob_bordered.empty())
return -100;
w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;
// Number of elements in one pooling window.
const int maxk = kernel_h * kernel_w;
// kernel offsets
// Table holding the offset of each kernel element; gap = w - kernel_w is
// added to the running offset p2.
// NOTE(review): no increment of p1 / p2 is visible in this listing — confirm
// against the upstream source.
std::vector _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
int p1 = 0;
int p2 = 0;
int gap = w - kernel_w;
for (int i = 0; i < kernel_h; i++)
for (int j = 0; j < kernel_w; j++)
space_ofs[p1] = p2;
p2 += gap;
if (pooling_type == PoolMethod_MAX)
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q
// Split::forward — per the original author's note, the input blob is
// duplicated n times, one copy being assigned to each output blob.
// NOTE(review): this listing is truncated inside the loop header, so the
// assignment itself is not visible here.
int Split::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& /*opt*/) const
// Only the first input blob is read.
const Mat& bottom_blob = bottom_blobs[0];
for (size_t i=0; i
// Softmax::forward_inplace — in-place softmax over bottom_top_blob.
// NOTE(review): this listing has lost its braces and is truncated inside the
// first loop header; only the start of the 1-D case is visible.
int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
// Steps listed by the original author:
// value = exp( value - global max value )
// sum all value
// value = value / sum
int dims = bottom_top_blob.dims;
size_t elemsize = bottom_top_blob.elemsize;
// 1-D input
if (dims == 1) // axis == 0
int w = bottom_top_blob.w;
float* ptr = bottom_top_blob;
float max = -FLT_MAX;
// Find the maximum value; the running maximum starts at -FLT_MAX.
for (int i=0; i
// Slice::forward — reads the first input blob and the layer's `slices` table
// to fill top_blobs.
// NOTE(review): this listing has lost its braces and is truncated inside the
// loop header; only the start of the 1-D case is visible.
int Slice::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const
const Mat& bottom_blob = bottom_blobs[0];
int dims = bottom_blob.dims;
size_t elemsize = bottom_blob.elemsize;
const int* slices_ptr = slices;
// 1-D input
if (dims == 1) // axis == 0
int w = bottom_blob.w;
int q = 0;
for (size_t i=0; i
// Sigmoid::forward_inplace — in-place entry point of the sigmoid layer; it
// reads the blob dimensions and starts an OpenMP loop whose index is `q`.
// NOTE(review): this listing is truncated inside the loop header (everything
// after `q` is missing), so the element-wise computation is not visible here.
int Sigmoid::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
// Number of elements in one w x h plane.
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q
精简的CNN网络都会采用Group Convolution技术,进而降低运算量,但是Group之间的特征图不会发生共享,导致每一个filter只能对部分特征可见,降低了模型的表征能力。为了降低group convolution对于模型表征能力的影响,会在Group convolution的特征图后面加入shuffle处理,使得接下来的group convolution filters可以在每个group输出的部分channel上进行计算,参考资料[4]。
图3 Channel Shuffle示意图(摘自参考资料[5])
// 将channel进行分组
int ShuffleChannel::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
int w = bottom_blob.w;
int h = bottom_blob.h;
int c = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
// 每个group对应channel数目
int chs_per_group = c / group;
if (c != chs_per_group * group)
// reject invalid group
return -100;
top_blob.create(w, h, c, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
const size_t feature_sz = w * h * elemsize;
// 遍历group
for (int i = 0; i != group; i++)
// 遍历每个group对应channel
for (int j = 0; j != chs_per_group; j++)
// src上第i个group的第j个channel
int src_q = chs_per_group * i + j;
// dst上第j个group第i个channel
int dst_q = group * j + i;
memcpy(top_blob.channel(dst_q), bottom_blob.channel(src_q), feature_sz);
return 0;
// SELU::forward_inplace — in-place entry point of the SELU activation layer.
// NOTE(review): this listing is truncated inside the loop header (everything
// after `q` is missing), so the element-wise computation is not visible here.
int SELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
// Number of elements in one w x h plane.
int size = w * h;
// Product of the layer's alpha and lambda parameters, computed once up front.
float alphaxlambda = alpha * lambda;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q
// Scale::forward_inplace — variant taking a blob list: element 0 is the blob
// modified in place, element 1 is read as the scale blob.
// NOTE(review): this listing has lost its braces and is truncated inside the
// loop header; only the start of the 1-D, bias-enabled path is visible.
int Scale::forward_inplace(std::vector& bottom_top_blobs, const Option& opt) const
Mat& bottom_top_blob = bottom_top_blobs[0];
const Mat& scale_blob = bottom_top_blobs[1];
int dims = bottom_top_blob.dims;
if (dims == 1)
int w = bottom_top_blob.w;
float* ptr = bottom_top_blob;
// Path taken when the layer has a bias (bias_term is set).
if (bias_term)
#pragma omp parallel for num_threads(opt.num_threads)
for (int i=0; i
roipooling源自于faster rcnn网络,rcnn网络检测目标物体的思路为:先推断出有限的最可能存在物体的位置,再进行分类和回归,得到最终精确的目标位置,采用这个流程的目标检测方法也称为二阶段检测方法。这里不是解读faster rcnn算法,就不做深究了,但是要理解roipooling,还是需要理解一下faster rcnn的思路。最原始的rcnn采用的方法是:直接在原始图像上使用region proposal技术生成2K个可能的目标位置,然后将对应图像块抠出来resize到一个固定大小放到神经网络中进行训练,生成的2K个区域每一个都需要放到神经网络中进行一次前向传播,训练耗时不说,还十分耗费资源。于是,作者就对其进行了改进:由于cnn网络每个神经元与输入其实存在着空间位置的对应关系,我们只需要将原始图像放到cnn网络中进行一次前向传播,就可以基于roi与特征图之间的对应关系找到每个roi在feature map上的对应位置,然后采用roi pooling技术对不等尺寸的输入执行最大池化操作,以获得固定尺寸的特征映射。但是仍然存在一个问题:前面候选区域生成还是采用的图像处理方法,无法直接放到gpu上进行训练,于是就有了faster rcnn。faster rcnn内嵌了一个rpn网络,取代了前面的region proposal功能,实现了一个端到端的训练过程,大大提升了模型训练效率(有点扯远了),具体可以参考资料[6]:
(1)找roi从输入到feature map的对应关系:
如图所示,输入尺寸为800x800,feature map尺寸为25x25
那么对应feat_stride = 800 / 25 = 32,也就是说,将输入上roi除以32,就可以得到对应于feature map上roi位置,如:
roi大小为665x665,对应于feature map上roi大小为:665/32 x 665/32 = 20.78 x 20.78,做一个向下取整操作有:20 x 20,从20.78到20的过程称为第一次量化。
(2)feature map 到roi pooling 输出的区域划分:
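由于我们需要得到pool_w x pool_h的输出,所以需要将feature map上的roi划分为pool_w x pool_h个区域,那么每个区域对应的width和height为:bin_size_w = roi_w / pool_w,bin_size_h = roi_h / pool_h(pool_w、pool_h对应代码中的pooled_width、pooled_height)。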
// ROIPooling::forward — max-pools one region of interest of the feature map
// (bottom_blobs[0]) into a pooled_width x pooled_height output (top_blobs[0]).
// The ROI is read from bottom_blobs[1].
// Returns -100 when the output allocation leaves top_blob empty.
// NOTE(review): this listing has lost its braces and is truncated inside the
// per-channel loop header, so the pooling itself is not visible here.
int ROIPooling::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const
const Mat& bottom_blob = bottom_blobs[0];
int w = bottom_blob.w;
int h = bottom_blob.h;
size_t elemsize = bottom_blob.elemsize;
int channels = bottom_blob.c;
// ROI parameters come from the second input blob.
const Mat& roi_blob = bottom_blobs[1];
// Allocate the output blob.
Mat& top_blob = top_blobs[0];
top_blob.create(pooled_width, pooled_height, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
// For each ROI R = [x y w h]: max pool over R
// Read the four ROI values.
// NOTE(review): although the line above says [x y w h], the code below treats
// them as two corners (x1, y1, x2, y2) — confirm the expected input layout.
const float* roi_ptr = roi_blob;
// Map the ROI onto the feature map: scale by spatial_scale and round to integers.
int roi_x1 = round(roi_ptr[0] * spatial_scale);
int roi_y1 = round(roi_ptr[1] * spatial_scale);
int roi_x2 = round(roi_ptr[2] * spatial_scale);
int roi_y2 = round(roi_ptr[3] * spatial_scale);
// ROI width and height on the feature map (inclusive corners), at least 1.
int roi_w = std::max(roi_x2 - roi_x1 + 1, 1);
int roi_h = std::max(roi_y2 - roi_y1 + 1, 1);
// The ROI is divided into pooled_width x pooled_height regions;
// these are the (fractional) sizes of one region.
float bin_size_w = (float)roi_w / (float)pooled_width;
float bin_size_h = (float)roi_h / (float)pooled_height;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q
(1)找roi从输入到feature map的对应关系:
如上图所示,输入尺寸为800x800,feature map尺寸为25x25
那么对应feat_stride = 800 / 25 = 32,也就是说,将输入上roi除以32,就可以得到对应于feature map上roi位置,如:
roi大小为665x665,对应于feature map上roi大小为:665/32 x 665/32 = 20.78 x 20.78。
(2)feature map 到roi pooling 输出的区域划分:
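由于我们需要得到pool_w x pool_h的输出,所以需要将feature map上的roi划分为pool_w x pool_h个区域,那么每个区域对应的width和height为:bin_size_w = roi_w / pool_w,bin_size_h = roi_h / pool_h(pool_w、pool_h对应代码中的pooled_width、pooled_height)。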
// ROIAlign::forward — pools one region of interest of the feature map
// (bottom_blobs[0]) into a pooled_width x pooled_height output (top_blobs[0]).
// Unlike the integer-rounded ROI pooling, all ROI coordinates stay in float.
// Returns -100 when the output allocation leaves top_blob empty.
// NOTE(review): this listing has lost its braces and is truncated inside the
// per-channel loop header, so the sampling/averaging is not visible here.
int ROIAlign::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const
const Mat& bottom_blob = bottom_blobs[0];
int w = bottom_blob.w;
int h = bottom_blob.h;
size_t elemsize = bottom_blob.elemsize;
int channels = bottom_blob.c;
const Mat& roi_blob = bottom_blobs[1];
Mat& top_blob = top_blobs[0];
top_blob.create(pooled_width, pooled_height, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
// For each ROI R = [x y w h]: avg pool over R
// NOTE(review): the code below treats the four values as two corners
// (x1, y1, x2, y2) — confirm the expected input layout.
const float* roi_ptr = roi_blob;
// ROI position on the feature map: scaled by spatial_scale, not rounded.
float roi_x1 = roi_ptr[0] * spatial_scale;
float roi_y1 = roi_ptr[1] * spatial_scale;
float roi_x2 = roi_ptr[2] * spatial_scale;
float roi_y2 = roi_ptr[3] * spatial_scale;
// ROI size, at least 1.
float roi_w = std::max(roi_x2 - roi_x1, 1.f);
float roi_h = std::max(roi_y2 - roi_y1, 1.f);
// Size of one output region.
float bin_size_w = roi_w / (float)pooled_width;
float bin_size_h = roi_h / (float)pooled_height;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q
// Reshape::forward — reshapes bottom_blob into top_blob according to the
// layer's ndim / w / permute parameters.
// Returns -100 when the output allocation leaves top_blob empty.
// NOTE(review): this listing has lost its braces and is truncated inside the
// loop header; only the start of the ndim == 1 path is visible.
int Reshape::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
size_t elemsize = bottom_blob.elemsize;
// Total number of elements in the input (w * h * c).
int total = bottom_blob.w * bottom_blob.h * bottom_blob.c;
// ndim == 1 branch (the original author's note calls this the one-dimensional case).
if (ndim == 1)
int _w = w;
// 0 means: keep the input width.
if (_w == 0)
_w = bottom_blob.w;
// Resolve the shape after reshape: -1 means all elements.
if (_w == -1)
_w = total;
// Permuted path (permute == 1).
if (permute == 1)
top_blob.create(_w, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;
// c-h-w to h-w-c
float* ptr = top_blob;
for (int i=0; i
