The biggest difference between Faster R-CNN and R-FCN is that R-FCN uses PSROI Pooling (position-sensitive ROI pooling), which preserves the position sensitivity of local regions.
Suppose the RPN produces M rois; each roi is a 1×5 vector, **where the first value is the id of the image the roi belongs to**, so pooling a roi must read from the matching image in the feature-map batch.
For example, rois = [[0, 1, 4, 6, 8], [0, 2, 3, 7, 9], [1, 3, 5, 7, 9]] contains 3 rois: two belong to image 0 and one to image 1. When pooling each roi we read from the feature map of its own batch image.
For C classes (including the background class), keep the feature-map size unchanged and use a convolution to expand the channel count to K × K × C, giving a new feature map of shape N × (K*K*C) × H × W.
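As a concrete sketch (PyTorch; the 1024-channel backbone feature and the name ps_score_conv are assumptions for illustration, not from the original code), the score-map layer is just a 1×1 convolution:

import torch
import torch.nn as nn

C, K = 21, 7                                   # 21 classes (incl. background), 7x7 grid
ps_score_conv = nn.Conv2d(1024, K * K * C, 1)  # 1x1 conv: H, W unchanged
feat = torch.randn(2, 1024, 28, 28)            # backbone feature map, N = 2
score_maps = ps_score_conv(feat)               # (2, 1029, 28, 28) = N x (K*K*C) x H x W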
Inputs:
(1) The position-sensitive score map of shape N × (K*K*C) × H × W (channels fused by the convolution above before pooling).
(2) The rois of all N images, of shape M × 5; of the 5 values, the first is the index i of the image the roi belongs to, and the remaining 4 give the roi's position in that image.
(3) The pooling size K × K; for any roi, the pooled output has shape C × (K × K): C channels, each of spatial size K × K.
(4) spatial_scale: the scale of the feature map relative to the original image.
Procedure:
(1)
Take one roi; the rois output by the RPN are in original-image coordinates.
batch_id = roi[0],
top_left_x = roi[1], top_left_y = roi[2], bottom_right_x = roi[3], bottom_right_y = roi[4]
(2)
Map the roi onto the feature map: multiply top_left_x, top_left_y, bottom_right_x, bottom_right_y by spatial_scale and round to integers (this loses some precision). On the feature map the roi becomes roi_start_x, roi_start_y, roi_end_x, roi_end_y.
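For example, with spatial_scale = 1/16, a roi (64, 32, 320, 224) in image coordinates maps to (4, 2, 20, 14) on the feature map; a roi with top_left_x = 70 would also map to 4 (70 / 16 = 4.375, rounded down), which is exactly the precision loss mentioned above.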
(3)
Compute roi_w, roi_h and bin_w, bin_h on the feature map:
roi_w = roi_end_x - roi_start_x
roi_h = roi_end_y - roi_start_y
Dividing this roi on the feature map into K × K bins (the pooling size), each bin has size:
bin_w = roi_w / K
bin_h = roi_h / K
(4)
For pooling position (ph, pw), compute the start and end of each bin:
h_start = ph * bin_h + roi_start_y
w_start = pw * bin_w + roi_start_x
h_end = (ph + 1) * bin_h + roi_start_y
w_end = (pw + 1) * bin_w + roi_start_x
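Continuing the example: roi_w = 20 - 4 = 16, roi_h = 14 - 2 = 12, and with K = 7, bin_w = 16 / 7 ≈ 2.29, bin_h = 12 / 7 ≈ 1.71. The bin at (ph, pw) = (1, 1) then spans h ∈ [3.71, 5.43) and w ∈ [6.29, 8.57), which the kernel widens to integer pixels h ∈ [3, 6), w ∈ [6, 9) via floor/ceil.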
(5)
For the roi on image id, pooling into target position (ci, ph, pw), select the score-map channel.
Within a single image, the channel index is:
offset_cls = (ci × K × K) + gh × K + gw
where gh, gw are the group indices of the bin; when the pooled output size equals the group size K (as here), gh = ph and gw = pw.
Rois with different image ids read from their own image's feature map:
offset_batch = roi_batch_ind × C × (K × K)
The final channel offset is:
offset_batch + offset_cls
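For instance, with C = 21 and K = 7 (21 × 49 = 1029 channels per image), a roi on image id = 1 pooling into (ci, ph, pw) = (3, 2, 4) reads offset_cls = 3 × 49 + 2 × 7 + 4 = 165 and offset_batch = 1 × 1029 = 1029, i.e. channel 1194 overall.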
(6)
On that offset channel, average-pool the bin at pooling position (ci, ph, pw) to get the pooled result for class ci at position (ph, pw):
for (int h = hstart; h < hend; ++h){
    for (int w = wstart; w < wend; ++w){
        int bottom_index = h * width + w;
        out_sum += offset_bottom_data[bottom_index];
    }
}
T bin_area = (hend - hstart) * (wend - wstart);
top_data[index] = is_empty ? 0. : out_sum / bin_area;
mapping_channel[index] = c;
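Putting steps (1)-(6) together, here is a naive pure-PyTorch sketch of the forward pass for a single roi (illustrative only; the function name is ours, and it omits the kernel's +1 on end coordinates and its boundary clipping):

import torch

def psroi_pool_one_roi(score_maps, roi, K, C, spatial_scale):
    # score_maps: (N, K*K*C, H, W); roi: (batch_id, x1, y1, x2, y2) in image coords
    batch_id = int(roi[0])
    x1, y1, x2, y2 = [int(v * spatial_scale) for v in roi[1:]]  # step (2)
    roi_w, roi_h = x2 - x1, y2 - y1                             # step (3)
    bin_w, bin_h = roi_w / K, roi_h / K
    out = torch.zeros(C, K, K)
    for ci in range(C):
        for ph in range(K):
            for pw in range(K):
                hs = int(ph * bin_h + y1)                       # step (4)
                he = int((ph + 1) * bin_h + y1)
                ws = int(pw * bin_w + x1)
                we = int((pw + 1) * bin_w + x1)
                c = ci * K * K + ph * K + pw                    # step (5): gh=ph, gw=pw
                patch = score_maps[batch_id, c, hs:he, ws:we]   # step (6): avg pool
                out[ci, ph, pw] = patch.mean() if patch.numel() else 0.
    return out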
1. CUDA kernel functions
kernel_forward_t = '''
#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
         i += blockDim.x * gridDim.x)

typedef float T;

extern "C"
__global__ void PSROIPoolForward(
    const int nthreads,
    const T* bottom_data,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const T* bottom_rois,
    const int output_dim,
    const int group_size,
    T* top_data,
    int* mapping_channel)
{
    // index of current thread
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= nthreads)
    {
        return;
    }
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int ctop = (index / pooled_width / pooled_height) % output_dim;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(roundf(offset_bottom_rois[1])) * spatial_scale;
    T roi_start_h = static_cast<T>(roundf(offset_bottom_rois[2])) * spatial_scale;
    T roi_end_w = static_cast<T>(roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
    // Force too small ROIs to be 1x1
    T roi_width = max(roi_end_w - roi_start_w, static_cast<T>(0.1)); // avoid 0
    T roi_height = max(roi_end_h - roi_start_h, static_cast<T>(0.1));
    // Compute w and h at bottom
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);
    // Add roi offsets and clip to input boundaries
    int hstart = floor(static_cast<T>(ph) * bin_size_h + roi_start_h);
    int wstart = floor(static_cast<T>(pw) * bin_size_w + roi_start_w);
    int hend = ceil(static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    /******************** Add sample step based on group_size ******************/
    /* the group of psROI pooling,
       e.g. group_size=7, pooled_width=21: each pooling block then reads a
       3x3 region of bottom data from its group channel */
    // the horizontal index of the group of the current pooling block
    int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
    // the vertical index of the group of the current pooling block
    int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
    // clip gw and gh to [0, group_size - 1]
    gw = min(max(gw, 0), group_size - 1);
    gh = min(max(gh, 0), group_size - 1);
    /******************** end ******************/
    int c = (ctop * group_size + gh) * group_size + gw;
    const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
    T out_sum = 0;
    for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
            int bottom_index = h * width + w;
            out_sum += offset_bottom_data[bottom_index];
        }
    }
    T bin_area = (hend - hstart) * (wend - wstart);
    top_data[index] = is_empty ? 0. : out_sum / bin_area;
    mapping_channel[index] = c;
}
'''
kernel_forward = '''
#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
         i += blockDim.x * gridDim.x)

typedef float DType;

extern "C"
__global__ void PSROIPoolForwardKernel(
    const int count,
    const DType* bottom_data,
    const DType spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const DType* bottom_rois,
    const int output_dim,
    const int group_size,
    DType* top_data)
{
    // grid-stride loop over output elements
    CUDA_1D_KERNEL_LOOP(index, count) {
        // The output is in order (n, ctop, ph, pw)
        int pw = index % pooled_width;
        int ph = (index / pooled_width) % pooled_height;
        int ctop = (index / pooled_width / pooled_height) % output_dim;
        int n = index / pooled_width / pooled_height / output_dim;
        // [start, end) interval for spatial sampling
        const DType* offset_bottom_rois = bottom_rois + n * 5;
        int roi_batch_ind = offset_bottom_rois[0]; // which image in the batch this roi belongs to
        DType roi_start_w = static_cast<DType>(round(offset_bottom_rois[1])) * spatial_scale;
        DType roi_start_h = static_cast<DType>(round(offset_bottom_rois[2])) * spatial_scale;
        DType roi_end_w = static_cast<DType>(round(offset_bottom_rois[3]) + 1.) * spatial_scale;
        DType roi_end_h = static_cast<DType>(round(offset_bottom_rois[4]) + 1.) * spatial_scale;
        DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0
        DType roi_height = max(roi_end_h - roi_start_h, 0.1);
        DType bin_size_h = roi_height / static_cast<DType>(pooled_height);
        DType bin_size_w = roi_width / static_cast<DType>(pooled_width);
        int hstart = floor(static_cast<DType>(ph) * bin_size_h + roi_start_h);
        int wstart = floor(static_cast<DType>(pw) * bin_size_w + roi_start_w);
        int hend = ceil(static_cast<DType>(ph + 1) * bin_size_h + roi_start_h);
        int wend = ceil(static_cast<DType>(pw + 1) * bin_size_w + roi_start_w);
        hstart = min(max(hstart, 0), height);
        hend = min(max(hend, 0), height);
        wstart = min(max(wstart, 0), width);
        wend = min(max(wend, 0), width);
        bool is_empty = (hend <= hstart) || (wend <= wstart);
        /* the group of psROI pooling,
           e.g. group_size=7, pooled_width=21: each pooling block then reads a
           3x3 region of bottom data from its group channel */
        // the horizontal index of the group of the current pooling block
        int gw = floor(static_cast<DType>(pw) * group_size / pooled_width);
        // the vertical index of the group of the current pooling block
        int gh = floor(static_cast<DType>(ph) * group_size / pooled_height);
        // clip gw and gh to [0, group_size - 1]
        gw = min(max(gw, 0), group_size - 1);
        gh = min(max(gh, 0), group_size - 1);
        // sample bottom data with the position-sensitive method
        int c = (ctop * group_size + gh) * group_size + gw;
        const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
        DType out_sum = 0;
        for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
                int bottom_index = h * width + w;
                out_sum += offset_bottom_data[bottom_index];
            }
        }
        DType bin_area = (hend - hstart) * (wend - wstart);
        top_data[index] = is_empty ? (DType)0. : out_sum / bin_area; // avg pool
    }
}
'''
kernel_backward_t = '''
inline __device__
float gpu_atomic_add(const float val, float* address) {
    return atomicAdd(address, val);
}

typedef float T;

extern "C"
__global__ void PSROIPoolBackward(
    const int nthreads,
    const T* top_diff,
    const int* mapping_channel,
    const int num_rois,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int output_dim,
    T* bottom_diff,
    const T* bottom_rois)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= nthreads)
    {
        return;
    }
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling
    const T* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    T roi_start_w = static_cast<T>(roundf(offset_bottom_rois[1])) * spatial_scale;
    T roi_start_h = static_cast<T>(roundf(offset_bottom_rois[2])) * spatial_scale;
    T roi_end_w = static_cast<T>(roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
    // Force too small ROIs to be 1x1
    T roi_width = max(roi_end_w - roi_start_w, static_cast<T>(0.1)); // avoid 0
    T roi_height = max(roi_end_h - roi_start_h, static_cast<T>(0.1));
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);
    int hstart = floor(static_cast<T>(ph) * bin_size_h + roi_start_h);
    int wstart = floor(static_cast<T>(pw) * bin_size_w + roi_start_w);
    int hend = ceil(static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    // the bottom channel this output element was pooled from (saved in forward)
    int c = mapping_channel[index];
    T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
    T bin_area = (hend - hstart) * (wend - wstart);
    T diff_val = is_empty ? 0. : top_diff[index] / bin_area;
    for (int h = hstart; h < hend; ++h)
    {
        for (int w = wstart; w < wend; ++w)
        {
            int bottom_index = h * width + w;
            gpu_atomic_add(diff_val, offset_bottom_diff + bottom_index);
        }
    }
}
'''
kernel_backward = '''
typedef float DType;

extern "C"
__global__ void PSROIPoolBackwardAccKernel(
    const int count,
    const DType* top_diff,
    const int num_rois,
    const DType spatial_scale,
    const int channels,
    const int height, const int width,
    const int pooled_height, const int pooled_width,
    const int group_size,
    const int output_dim,
    DType* bottom_diff,
    const DType* bottom_rois)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= count)
    {
        return;
    }
    // The output is in order (n, ctop, ph, pw)
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int ctop = (index / pooled_width / pooled_height) % output_dim;
    int n = index / pooled_width / pooled_height / output_dim;
    // [start, end) interval for spatial sampling
    const DType* offset_bottom_rois = bottom_rois + n * 5;
    int roi_batch_ind = offset_bottom_rois[0];
    DType roi_start_w = static_cast<DType>(round(offset_bottom_rois[1])) * spatial_scale;
    DType roi_start_h = static_cast<DType>(round(offset_bottom_rois[2])) * spatial_scale;
    DType roi_end_w = static_cast<DType>(round(offset_bottom_rois[3]) + 1.) * spatial_scale;
    DType roi_end_h = static_cast<DType>(round(offset_bottom_rois[4]) + 1.) * spatial_scale;
    DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0
    DType roi_height = max(roi_end_h - roi_start_h, 0.1);
    DType bin_size_h = roi_height / static_cast<DType>(pooled_height);
    DType bin_size_w = roi_width / static_cast<DType>(pooled_width);
    int hstart = floor(static_cast<DType>(ph) * bin_size_h + roi_start_h);
    int wstart = floor(static_cast<DType>(pw) * bin_size_w + roi_start_w);
    int hend = ceil(static_cast<DType>(ph + 1) * bin_size_h + roi_start_h);
    int wend = ceil(static_cast<DType>(pw + 1) * bin_size_w + roi_start_w);
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    /* the group of psROI pooling,
       e.g. group_size=7, pooled_width=21: each pooling block then reads a
       3x3 region of bottom data from its group channel */
    int gw = floor(static_cast<DType>(pw) * group_size / pooled_width);
    int gh = floor(static_cast<DType>(ph) * group_size / pooled_height);
    gw = min(max(gw, 0), group_size - 1);
    gh = min(max(gh, 0), group_size - 1);
    // recompute the source channel (no mapping_channel buffer in this version)
    int c = (ctop * group_size + gh) * group_size + gw;
    DType* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
    DType bin_area = (hend - hstart) * (wend - wstart);
    DType diff_val = is_empty ? (DType)0. : top_diff[index] / bin_area;
    // gradient backward: scatter-add the averaged gradient to every pixel of the bin
    for (int h = hstart; h < hend; ++h)
    {
        for (int w = wstart; w < wend; ++w)
        {
            int bottom_index = h * width + w;
            atomicAdd(offset_bottom_diff + bottom_index, diff_val);
        }
    }
}
'''
2. Python wrapper and usage example
import cupy, torch
import cupy as cp
import torch as t
import collections.abc as container_abcs
from itertools import repeat
from string import Template
from torch.autograd import Function
from collections import namedtuple
from psroi_cuda import kernel_forward_t, kernel_backward_t

Stream = namedtuple('Stream', ['ptr'])
CUDA_NUM_THREADS = 1024  # threads per block


def t_ntuple(n):
    def parse(x):
        if isinstance(x, container_abcs.Iterable):
            return x
        return tuple(repeat(x, n))
    return parse

t_pair = t_ntuple(2)


@cp.memoize(True)
def load_kernel(kernel_name, code, **kwargs):
    cp.cuda.runtime.free(0)
    # substitute input params into the kernel source
    code = Template(code).substitute(**kwargs)
    kernel_code = cupy.cuda.compile_with_cache(code)
    return kernel_code.get_function(kernel_name)


def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
    return (N + K - 1) // K

psROI_backward_fn = load_kernel('PSROIPoolBackward', kernel_backward_t)


class psRoI_Info:
    def __init__(self):
        self.forward_fn = load_kernel('PSROIPoolForward', kernel_forward_t)
        self.outh, self.outw, self.spatial_scale = None, None, None
        self.group_size = None

    def set_para(self, pool_size, spatial_scale, group_size=None):
        self.outh, self.outw, self.spatial_scale = pool_size[0], pool_size[1], spatial_scale
        if group_size is None:
            if pool_size[0] != pool_size[1]:
                raise ValueError("pool_h_size must equal pool_w_size when group_size is None")
            self.group_size = pool_size[0]
        else:
            self.group_size = group_size

class psRoI(Function):
    @staticmethod
    def forward(ctx, x, rois, Info: psRoI_Info):
        """
        :param ctx: context variable (similar to 'self')
        :param x: input feature map
        :param rois: rois generated by the rpn;
            note: 'rois' here is indices_and_rois, i.e. batch indexes combined with rois
            ==> [batch_ind, x_min, y_min, x_max, y_max]
        :return: pooled output of shape (N, out_dim, outh, outw)
        """
        # ensure contiguous memory
        x = x.contiguous()
        rois = rois.contiguous()
        in_size = B, C, H, W = x.size()  # e.g. (b, 21 * 7 * 7, h, w)
        N = rois.size(0)  # the number of rois
        if C % (Info.group_size * Info.group_size) != 0:
            raise ValueError("The input channel count must be an integral multiple of group_size^2!")
        out_dim = C // (Info.group_size * Info.group_size)
        output = t.zeros(N, out_dim, Info.outh, Info.outw).cuda()  # used to save the output
        count = output.numel()  # the number of sub-regions (bins) for psROI
        mapping_channel = torch.zeros(count, dtype=torch.int32).cuda()  # which bottom channel each bin was pooled from
        # pack the parameters
        args = [count,
                x.data_ptr(),
                cp.float32(Info.spatial_scale),  # float params must be converted to cp.float32
                C, H, W,
                Info.outh,
                Info.outw,
                rois.data_ptr(),
                out_dim,
                Info.group_size,
                output.data_ptr(),
                mapping_channel.data_ptr(),
                ]
        # create a cuda stream so that kernel computation and data transfer can run asynchronously
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        # one-dimensional indexing for both blocks and threads
        Info.forward_fn(args=args,
                        block=(CUDA_NUM_THREADS, 1, 1),
                        grid=(GET_BLOCKS(count), 1, 1),
                        stream=stream)
        # save info for backward
        saveBackwardInfo_int = [count, N, out_dim, Info.outh, Info.outw]
        saveBackwardInfo_int = torch.tensor(saveBackwardInfo_int)
        ctx.save_for_backward(saveBackwardInfo_int, torch.tensor(in_size),
                              torch.tensor(Info.spatial_scale), rois, mapping_channel)
        return output
    @staticmethod
    def backward(ctx, grad_output):
        """
        the backward pass of psRoI pooling
        :param ctx: context variable
        :param grad_output: gradient flowing into the psRoI module
        :return:
        """
        # Handle a None grad_output tensor: skip unnecessary
        # computation and just return None.
        if grad_output is None:
            return None, None, None
        grad_output = grad_output.contiguous()
        int_info, in_size, spatial_scale, rois, mapping_channel = ctx.saved_tensors
        count, N, out_dim, outh, outw = int_info.tolist()
        in_size = tuple(in_size.tolist())
        B, C, H, W = in_size  # e.g. (b, 21 * 7 * 7, h, w)
        grad_input = t.zeros(in_size).cuda()  # allocate cuda memory for the input gradient
        # create a cuda stream
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        args = [count,
                grad_output.data_ptr(),
                mapping_channel.data_ptr(),
                N,
                cp.float32(spatial_scale.item()),
                C, H, W,
                outh, outw,
                out_dim,
                grad_input.data_ptr(),
                rois.data_ptr(),
                ]
        psROI_backward_fn(args=args,
                          block=(CUDA_NUM_THREADS, 1, 1),
                          grid=(GET_BLOCKS(grad_output.numel()), 1, 1),
                          stream=stream)
        return grad_input, None, None  # 'None' means no gradient flows back to the rois or Info inputs

class PSRoIPooling2D(t.nn.Module):
    def __init__(self, pool_size, spatial_scale, group_size=None):
        super(PSRoIPooling2D, self).__init__()
        pool_size = t_pair(pool_size)
        # i.e. pool_size, spatial_scale = (7, 7), 1./16
        self.RoI_Info = psRoI_Info()
        self.RoI_Info.set_para(pool_size, spatial_scale, group_size=group_size)
        self.psROI_md = psRoI()

    def forward(self, x, rois):
        """
        PS ROI pooling forward
        :param x: input feature map
        :param rois: rois generated by the rpn;
            note: 'rois' here is indices_and_rois, i.e. batch indexes combined with rois
            ==> [batch_ind, x_min, y_min, x_max, y_max]
        :return: output
        """
        return self.psROI_md.apply(x, rois, self.RoI_Info)

def activate_PsROI_for_eval(model: PSRoIPooling2D):
    """
    run one forward/backward pass first to speed up eval
    (works around a hidden conflict with the SkImage lib?)
    :return:
    """
    # fake data
    class_num = 21
    group_size = 7
    B, C, H, W, PH, PW = 2, class_num * group_size * group_size, 28, 28, 21, 21
    bottom_data = t.randn((B, C, H, W)).cuda()
    # rois
    rois = [torch.tensor([[0, 0, 112, 112], [7, 75, 503, 442]], dtype=torch.float),
            torch.tensor([[0, 0, 224, 224]], dtype=torch.float)]
    indices = torch.tensor([0, 0, 1])
    rois2 = torch.cat(rois, dim=0)
    indices = torch.reshape(indices, (-1, 1)).float()
    print(indices.shape)
    print(rois2.shape)
    rois2_with_indices = torch.cat([indices, rois2], dim=1)
    print(rois2_with_indices.shape)
    bottom_rois = rois2_with_indices.cuda()
    x = bottom_data.detach().requires_grad_()
    rois = bottom_rois.detach()
    print("x.shape ", x.shape)
    print("rois.shape", rois.shape)
    output = model(x, rois)
    print("output.shape", output.shape)
    output.sum().backward()

if __name__ == "__main__":
    model = PSRoIPooling2D((7, 7), 1. / 16)
    activate_PsROI_for_eval(model)
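As an optional sanity check, the sketch below compares the wrapper against torchvision's built-in op (this assumes a torchvision version that ships torchvision.ops.ps_roi_pool; the rounding conventions differ slightly, e.g. the +1 on the end coordinates in the kernels above, so expect close but not necessarily bit-identical results):

import torch
from torchvision.ops import ps_roi_pool

x = torch.randn(2, 21 * 7 * 7, 28, 28).cuda()
rois = torch.tensor([[0., 0., 0., 112., 112.]]).cuda()  # (batch_idx, x1, y1, x2, y2)
out = PSRoIPooling2D((7, 7), 1. / 16)(x, rois)
ref = ps_roi_pool(x, rois, output_size=7, spatial_scale=1. / 16)
print((out - ref).abs().max())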