注:些许心得,或比较乱。
首先来看RPN的损失函数:总损失=分类损失+回归损失。
由于RPN的作用是生成候选区域,因此分类部分为2分类(是/否目标)问题。回归则是为了对目标区域更准确地定位。参考网络配置文件:
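回归损失采用 smooth L1 方程(Faster R-CNN 论文中的定义):$$L_{reg}(t_i, t_i^*) = \mathrm{smooth}_{L1}(t_i - t_i^*),\qquad \mathrm{smooth}_{L1}(x) = \begin{cases} 0.5\,x^2, & |x| < 1 \\ |x| - 0.5, & \text{其他} \end{cases}$$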
训练回归分支时,也需要标签信息。$t_i$ 和 $t_i^*$ 分别为网络的预测值和回归的目标。
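在训练RPN时需要准备好目标t*。它是通过ground-truth box(目标真实box)和anchor box(按一定规则生成的anchor box)计算得出的,代表的是ground-truth box与anchor box之间的转化关系(见附录 fast_rcnn_bbox_transform):$t_x^* = (x^* - x_a)/w_a$,$t_y^* = (y^* - y_a)/h_a$,$t_w^* = \log(w^*/w_a)$,$t_h^* = \log(h^*/h_a)$,其中 $(x_a, y_a, w_a, h_a)$ 为anchor box的中心坐标和宽高,$(x^*, y^*, w^*, h^*)$ 为ground-truth box的中心坐标和宽高。用这个来训练RPN,那么RPN最终学会输出一个良好的转化关系t。而这个t,是predicted box与anchor box之间的转化关系。通过这个t和anchor box,可以计算出预测框的真实坐标(见附录 fast_rcnn_bbox_transform_inv):$x = t_x w_a + x_a$,$y = t_y h_a + y_a$,$w = w_a e^{t_w}$,$h = h_a e^{t_h}$。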
附录:
function [anchors, im_scales] = proposal_locate_anchors(conf, im_size, target_scale, feature_map_size)
% [anchors, im_scales] = proposal_locate_anchors(conf, im_size, target_scale, feature_map_size)
% --------------------------------------------------------
% Faster R-CNN
% Copyright (c) 2015, Shaoqing Ren
% Licensed under The MIT License [see LICENSE for details]
% --------------------------------------------------------
% Generate anchors for a single scale or for every scale in conf.scales
% (only for fcn).
%
% Inputs:
%   conf             - configuration struct; conf.scales is read here, the
%                      remaining fields are consumed by
%                      proposal_locate_anchors_single_scale.
%   im_size          - image size, forwarded unchanged to the helper.
%   target_scale     - (optional) scale to generate anchors for. When it is
%                      omitted, anchors are generated for each element of
%                      conf.scales.
%   feature_map_size - (optional) forwarded to the helper; defaults to [].
%
% Outputs:
%   anchors, im_scales - the helper's two outputs. When target_scale is
%                        omitted, both are cell arrays with one entry per
%                        element of conf.scales.

    % A missing fourth argument is passed on to the helper as empty.
    if nargin < 4
        feature_map_size = [];
    end

    % Explicit scale supplied: a single call, outputs returned as-is.
    if nargin >= 3
        [anchors, im_scales] = proposal_locate_anchors_single_scale( ...
            im_size, conf, target_scale, feature_map_size);
        return;
    end

    % No scale supplied: evaluate every configured scale and collect the
    % per-scale results in cell arrays.
    per_scale = @(s) proposal_locate_anchors_single_scale(im_size, conf, s, feature_map_size);
    [anchors, im_scales] = arrayfun(per_scale, conf.scales, 'UniformOutput', false);
end
function [anchors, im_scale] = proposal_locate_anchors_single_scale(im_size, conf, target_scale, feature_map_size)
% Build the full anchor list for one target scale.
%
% Inputs:
%   im_size          - image size; elements 1 and 2 are used as the keys
%                      (after scaling and rounding) for the height and width
%                      lookups respectively.
%   conf             - configuration struct. Fields read here: max_size,
%                      output_height_map, output_width_map, feat_stride,
%                      anchors.
%   target_scale     - scale forwarded to prep_im_for_blob_size.
%   feature_map_size - when non-empty it is used directly as the output
%                      size ([rows, cols] judging by how it is indexed
%                      below); when empty the size is looked up from conf.
%
% Outputs:
%   anchors  - 4-column matrix: every row of conf.anchors shifted to every
%              feature-map position.
%   im_scale - value returned by prep_im_for_blob_size.

    % The scale factor is computed the same way whether or not the
    % feature-map size was supplied.
    im_scale = prep_im_for_blob_size(im_size, target_scale, conf.max_size);

    if isempty(feature_map_size)
        % Look up the output size from the scaled, rounded image size.
        % NOTE(review): the .values({key}) call suggests these are
        % containers.Map objects keyed by input size - confirm.
        scaled_size = round(im_size * im_scale);
        map_rows = conf.output_height_map.values({scaled_size(1)});
        map_cols = conf.output_width_map.values({scaled_size(2)});
        output_size = cell2mat([map_rows, map_cols]);
    else
        output_size = feature_map_size;
    end

    % One shift per feature-map cell, spaced conf.feat_stride apart.
    col_offsets = (0:(output_size(2)-1)) * conf.feat_stride;
    row_offsets = (0:(output_size(1)-1)) * conf.feat_stride;
    [grid_x, grid_y] = meshgrid(col_offsets, row_offsets);

    % concat anchors as [channel, height, width], where channel is the fastest dimension.
    % Base anchors go along dim 1 and shifts along dim 2, so the sum holds
    % every (anchor, shift) pair before being flattened to 4 columns.
    shifts = [grid_x(:), grid_y(:), grid_x(:), grid_y(:)];
    base_anchors = permute(conf.anchors, [1, 3, 2]);
    shift_stack = permute(shifts, [3, 1, 2]);
    anchors = reshape(bsxfun(@plus, base_anchors, shift_stack), [], 4);

    % equals to
    % anchors = arrayfun(@(x, y) single(bsxfun(@plus, conf.anchors, [x, y, x, y])), shift_x, shift_y, 'UniformOutput', false);
    % anchors = reshape(anchors, [], 1);
    % anchors = cat(1, anchors{:});
end
function [regression_label] = fast_rcnn_bbox_transform(ex_boxes, gt_boxes)
% [regression_label] = fast_rcnn_bbox_transform(ex_boxes, gt_boxes)
% --------------------------------------------------------
% Fast R-CNN
% Reimplementation based on Python Fast R-CNN (https://github.com/rbgirshick/fast-rcnn)
% Copyright (c) 2015, Shaoqing Ren
% Licensed under The MIT License [see LICENSE for details]
% --------------------------------------------------------
% Compute regression targets that map each example box onto its ground-truth box.
%
% Inputs:
%   ex_boxes - boxes whose first four columns are read as [x1, y1, x2, y2].
%   gt_boxes - target boxes in the same layout, matched row by row.
%
% Output:
%   regression_label - [dx, dy, dw, dh] per row: the centre offset divided
%                      by the example box size, followed by the log of the
%                      size ratio.

    % Width/height use the inclusive-pixel convention (+1); columns are
    % handled in x/y pairs.
    ex_size = ex_boxes(:, 3:4) - ex_boxes(:, 1:2) + 1;
    ex_center = ex_boxes(:, 1:2) + 0.5 * (ex_size - 1);

    gt_size = gt_boxes(:, 3:4) - gt_boxes(:, 1:2) + 1;
    gt_center = gt_boxes(:, 1:2) + 0.5 * (gt_size - 1);

    % eps is added to the denominator of the centre offsets only.
    center_offset = (gt_center - ex_center) ./ (ex_size + eps);
    log_size_ratio = log(gt_size ./ ex_size);

    regression_label = [center_offset, log_size_ratio];
end
function [pred_boxes] = fast_rcnn_bbox_transform_inv(boxes, box_deltas)
% [pred_boxes] = fast_rcnn_bbox_transform_inv(boxes, box_deltas)
% --------------------------------------------------------
% Fast R-CNN
% Reimplementation based on Python Fast R-CNN (https://github.com/rbgirshick/fast-rcnn)
% Copyright (c) 2015, Shaoqing Ren
% Licensed under The MIT License [see LICENSE for details]
% --------------------------------------------------------
% Apply regression deltas to source boxes to obtain predicted boxes.
%
% Inputs:
%   boxes      - source boxes; the first four columns are read as
%                [x1, y1, x2, y2].
%   box_deltas - deltas laid out in groups of four columns
%                [dx, dy, dw, dh, dx, dy, dw, dh, ...]; every group is
%                applied to the same source box of that row.
%
% Output:
%   pred_boxes - single-precision array of size(box_deltas) holding
%                [x1, y1, x2, y2] for each group of four columns.

    % Source box geometry. Each cast to double wraps the whole arithmetic
    % expression, exactly as the expression is written here.
    box_w = double(boxes(:, 3) - boxes(:, 1) + 1);
    box_h = double(boxes(:, 4) - boxes(:, 2) + 1);
    box_cx = double(boxes(:, 1) + 0.5*(box_w-1));
    box_cy = double(boxes(:, 2) + 0.5*(box_h-1));

    % Split the deltas into their four interleaved components.
    delta_cx = double(box_deltas(:, 1:4:end));
    delta_cy = double(box_deltas(:, 2:4:end));
    delta_w = double(box_deltas(:, 3:4:end));
    delta_h = double(box_deltas(:, 4:4:end));

    % Centres move by a fraction of the source size; sizes scale by the
    % exponential of the delta. bsxfun expands the per-row source values
    % across all delta groups.
    new_cx = bsxfun(@plus, bsxfun(@times, delta_cx, box_w), box_cx);
    new_cy = bsxfun(@plus, bsxfun(@times, delta_cy, box_h), box_cy);
    new_w = bsxfun(@times, exp(delta_w), box_w);
    new_h = bsxfun(@times, exp(delta_h), box_h);

    % Convert centre/size back to inclusive corner coordinates.
    half_w = 0.5*(new_w-1);
    half_h = 0.5*(new_h-1);

    pred_boxes = zeros(size(box_deltas), 'single');
    pred_boxes(:, 1:4:end) = new_cx - half_w;
    pred_boxes(:, 2:4:end) = new_cy - half_h;
    pred_boxes(:, 3:4:end) = new_cx + half_w;
    pred_boxes(:, 4:4:end) = new_cy + half_h;
end