This article walks through the overall architecture of Faster R-CNN and the details of each of its building blocks.

Thanks to the author of this excellent Faster R-CNN reimplementation: https://github.com/chenyuntc/simple-faster-rcnn-pytorch. The walkthrough below follows that code base, with `simple-faster-rcnn-pytorch` as the root path.

I find it helpful to skim the training script `./train.py` first, and then follow the threads from there into:
- data loading
- model construction
- setting up the training loop
- model evaluation
- saving model weights

Here is `train()` from `./train.py`, with annotations:
```python
def train(**kwargs):
    # ——————————————————————————————————————————————————
    # Parse the config overrides; see utils/config.py in the
    # repo for all available options
    # ——————————————————————————————————————————————————
    opt._parse(kwargs)
    # ——————————————————————————————————————————————————
    # Load the data (train set and test set)
    # ——————————————————————————————————————————————————
    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
    # ——————————————————————————————————————————————————
    # Build the model (note trainer = FasterRCNNTrainer(faster_rcnn).cuda(),
    # which wraps the backward pass of the training loop)
    # ——————————————————————————————————————————————————
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')
    # track the best mAP seen so far
    best_map = 0
    # initialize the learning rate lr
    lr_ = opt.lr
    # ——————————————————————————————————————————————————
    # Train for opt.epoch epochs
    # ——————————————————————————————————————————————————
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            # train on one batch of images; the loss backward() happens
            # inside trainer.train_step()
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)
            # ——————————————————————————————————————————————————
            # Log visualizations every opt.plot_every iterations
            # ——————————————————————————————————————————————————
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
        # ——————————————————————————————————————————————————
        # Evaluate the model after this epoch and log the results
        # (eval() here is the helper defined in train.py, not the builtin)
        # ——————————————————————————————————————————————————
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        # ——————————————————————————————————————————————————
        # Save the weights whenever the mAP improves, so the best
        # checkpoint can be reloaded later
        # ——————————————————————————————————————————————————
        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            # reload the best checkpoint and decay the learning rate
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        if epoch == 13:
            # stop after 14 epochs in total
            break
```
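One detail worth pausing on: `opt._parse(kwargs)` on the first line is what lets command-line flags override the defaults in `utils/config.py` (the repo exposes `train()` and the other entry points through the `fire` CLI at the bottom of `./train.py`). A minimal sketch of that mechanism; `env` and `plot_every` are real config fields, but the values here are just examples:

```python
# Sketch: opt._parse(kwargs) sets each key as an attribute on the global
# config object, rejecting unknown keys; defaults live in utils/config.py.
from utils.config import opt

# e.g. what `python train.py train --env=fasterrcnn --plot-every=100` produces
opt._parse({'env': 'fasterrcnn', 'plot_every': 100})
print(opt.env, opt.plot_every)  # the overridden values
print(opt.lr, opt.lr_decay)     # untouched fields keep their defaults
```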
In the code above, the FasterRCNNVGG16 model (`./model/faster_rcnn_vgg16.py`) is built by the line `faster_rcnn = FasterRCNNVGG16()`, and that model inherits from the FasterRCNN base class (`./model/faster_rcnn.py`). Broadly speaking, FasterRCNN is the skeleton, and FasterRCNNVGG16 fills in the concrete parts, turning it into a complete, flesh-and-blood model.

Let's start with the FasterRCNN base class (`./model/faster_rcnn.py`):
```python
from __future__ import absolute_import
from __future__ import division
import torch as t
import numpy as np
from utils import array_tool as at
from model.utils.bbox_tools import loc2bbox
from torchvision.ops import nms
# from model.utils.nms import non_maximum_suppression
from torch import nn
from data.dataset import preprocess
from torch.nn import functional as F
from utils.config import opt


def nograd(f):
    def new_f(*args, **kwargs):
        with t.no_grad():
            return f(*args, **kwargs)
    return new_f

# ——————————————————————————————————————————————————
# FasterRCNN consists of three main blocks:
# 1. Extractor (torch.nn.Module): the backbone; takes a BCHW image batch
#    and outputs a feature map;
# 2. RPN (torch.nn.Module): generates a set of RoIs around potential objects;
# 3. Head (torch.nn.Module): classifies the object in each RoI and refines
#    its localization.
# ——————————————————————————————————————————————————
class FasterRCNN(nn.Module):
    # ——————————————————————————————————————————————————
    # Constructor: wires up the sub-modules and related parameters
    # ——————————————————————————————————————————————————
    def __init__(self, extractor, rpn, head,
                 loc_normalize_mean=(0., 0., 0., 0.),
                 loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
        super(FasterRCNN, self).__init__()
        # feature extractor
        self.extractor = extractor
        # RPN
        self.rpn = rpn
        # head
        self.head = head
        # mean and std used to (de)normalize the location predictions
        self.loc_normalize_mean = loc_normalize_mean
        self.loc_normalize_std = loc_normalize_std
        # set nms_thresh and score_thresh
        self.use_preset('evaluate')

    # ——————————————————————————————————————————————————
    # Total number of classes (including the background class)
    # ——————————————————————————————————————————————————
    @property
    def n_class(self):
        return self.head.n_class

    # ——————————————————————————————————————————————————
    # Forward pass (every block in here matters!)
    # ——————————————————————————————————————————————————
    def forward(self, x, scale=1.):
        # record the input size (the image was already rescaled during
        # preprocessing), then extract the feature map with the backbone
        img_size = x.shape[2:]
        h = self.extractor(x)
        # run the RPN on the feature map to generate a set of RoIs
        # around potential objects
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.rpn(h, img_size, scale)
        # run the head on (rois, roi_indices) over the feature map h,
        # producing per-class scores plus refined localization
        roi_cls_locs, roi_scores = self.head(
            h, rois, roi_indices)
        # R: total number of RoIs across the batch; N: number of foreground classes
        # roi_cls_locs: per-class location offsets/scales for each RoI, shape = (R, (N + 1) * 4)
        # roi_scores: class scores for each RoI, shape = (R, N + 1); the extra 1 is background
        # rois: RoIs proposed by the RPN, shape = (R, 4), as [y_min, x_min, y_max, x_max]
        # roi_indices: image index of each RoI within the batch, shape = (R,);
        #              value i means the RoI came from the i-th image
        return roi_cls_locs, roi_scores, rois, roi_indices

    # ——————————————————————————————————————————————————
    # Sets nms_thresh and score_thresh
    # ——————————————————————————————————————————————————
    def use_preset(self, preset):
        if preset == 'visualize':
            self.nms_thresh = 0.3
            self.score_thresh = 0.7
        elif preset == 'evaluate':
            self.nms_thresh = 0.3
            self.score_thresh = 0.05
        else:
            raise ValueError('preset must be visualize or evaluate')

    # ——————————————————————————————————————————————————
    # Per-class NMS post-processing on the head predictions,
    # producing the final detections
    # ——————————————————————————————————————————————————
    def _suppress(self, raw_cls_bbox, raw_prob):
        bbox = list()
        label = list()
        score = list()
        # skip cls_id = 0 because it is the background class
        for l in range(1, self.n_class):
            cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
            prob_l = raw_prob[:, l]
            mask = prob_l > self.score_thresh
            cls_bbox_l = cls_bbox_l[mask]
            prob_l = prob_l[mask]
            keep = nms(cls_bbox_l, prob_l, self.nms_thresh)
            bbox.append(cls_bbox_l[keep].cpu().numpy())
            # The labels are in [0, self.n_class - 2].
            label.append((l - 1) * np.ones((len(keep),)))
            score.append(prob_l[keep].cpu().numpy())
        bbox = np.concatenate(bbox, axis=0).astype(np.float32)
        label = np.concatenate(label, axis=0).astype(np.int32)
        score = np.concatenate(score, axis=0).astype(np.float32)
        return bbox, label, score

    # ——————————————————————————————————————————————————
    # Runs detection inference end to end, returning the final
    # bboxes, labels and scores
    # ——————————————————————————————————————————————————
    @nograd
    def predict(self, imgs, sizes=None, visualize=False):
        self.eval()
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            # preprocess (rescale and normalize) each raw image
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        # run inference on each image and collect the results
        bboxes = list()
        labels = list()
        scores = list()
        for img, size in zip(prepared_imgs, sizes):
            img = at.totensor(img[None]).float()
            scale = img.shape[3] / size[1]
            # run the forward pass (self(img) invokes self.forward) to get
            # the head predictions roi_cls_loc, roi_scores and the rois
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale
            # map the predicted offsets back to box coordinates and sizes
            # on the original image
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]
            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip boxes that extend outside the image
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])
            prob = (F.softmax(at.totensor(roi_score), dim=1))
            # post-process (per-class NMS) to get the final
            # bboxes, labels and scores
            bbox, label, score = self._suppress(cls_bbox, prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)
        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores

    # ——————————————————————————————————————————————————
    # Builds and returns the model's optimizer
    # ——————————————————————————————————————————————————
    def get_optimizer(self):
        lr = opt.lr
        params = []
        for key, value in dict(self.named_parameters()).items():
            if value.requires_grad:
                if 'bias' in key:
                    # biases get double the learning rate and no weight decay
                    params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}]
                else:
                    params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}]
        if opt.use_adam:
            self.optimizer = t.optim.Adam(params)
        else:
            self.optimizer = t.optim.SGD(params, momentum=0.9)
        return self.optimizer

    # ——————————————————————————————————————————————————
    # Decays the learning rate
    # ——————————————————————————————————————————————————
    def scale_lr(self, decay=0.1):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] *= decay
        return self.optimizer
```
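The heart of `predict()` is the step where the denormalized `roi_cls_loc` offsets are decoded back into boxes. `loc2bbox` (in `./model/utils/bbox_tools.py`) inverts the standard R-CNN parameterization t_y = (y - y_a) / h_a, t_x = (x - x_a) / w_a, t_h = log(h / h_a), t_w = log(w / w_a). Here is a self-contained numpy sketch of that decoding (my own re-derivation of the same math, not the repo's code):

```python
import numpy as np

# Sketch of the decoding loc2bbox performs. Boxes are [y_min, x_min, y_max, x_max],
# locs are [dy, dx, dh, dw] relative to the source box.
def decode(src_bbox, loc):
    h = src_bbox[:, 2] - src_bbox[:, 0]
    w = src_bbox[:, 3] - src_bbox[:, 1]
    ctr_y = src_bbox[:, 0] + 0.5 * h
    ctr_x = src_bbox[:, 1] + 0.5 * w
    # invert t_y = (y - y_a) / h_a, t_h = log(h / h_a), etc.
    new_ctr_y = loc[:, 0] * h + ctr_y
    new_ctr_x = loc[:, 1] * w + ctr_x
    new_h = np.exp(loc[:, 2]) * h
    new_w = np.exp(loc[:, 3]) * w
    dst = np.empty_like(src_bbox)
    dst[:, 0] = new_ctr_y - 0.5 * new_h
    dst[:, 1] = new_ctr_x - 0.5 * new_w
    dst[:, 2] = new_ctr_y + 0.5 * new_h
    dst[:, 3] = new_ctr_x + 0.5 * new_w
    return dst

roi = np.array([[0., 0., 100., 100.]], dtype=np.float32)
loc = np.array([[0.1, 0., np.log(1.2), 0.]], dtype=np.float32)
print(decode(roi, loc))  # [[0. 0. 120. 100.]] -> shifted down 10px, 1.2x taller
```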
Next, the FasterRCNNVGG16 model (`./model/faster_rcnn_vgg16.py`):
```python
from __future__ import absolute_import
import torch as t
from torch import nn
from torchvision.models import vgg16
from torchvision.ops import RoIPool
from model.region_proposal_network import RegionProposalNetwork
from model.faster_rcnn import FasterRCNN
from utils import array_tool as at
from utils.config import opt

# ——————————————————————————————————————————————————
# Build the VGG16 model
# ——————————————————————————————————————————————————
def decom_vgg16():
    # the 30th layer of features is relu of conv5_3
    if opt.caffe_pretrain:
        model = vgg16(pretrained=False)
        if not opt.load_path:
            model.load_state_dict(t.load(opt.caffe_pretrain_path))
    else:
        model = vgg16(not opt.load_path)
    features = list(model.features)[:30]
    # classifier is the fully connected block at the end of VGG16
    classifier = model.classifier
    classifier = list(classifier)
    del classifier[6]
    if not opt.use_drop:
        del classifier[5]
        del classifier[2]
    classifier = nn.Sequential(*classifier)
    # freeze top4 conv
    for layer in features[:10]:
        for p in layer.parameters():
            p.requires_grad = False
    return nn.Sequential(*features), classifier

# ——————————————————————————————————————————————————
# FasterRCNNVGG16
# ——————————————————————————————————————————————————
class FasterRCNNVGG16(FasterRCNN):
    feat_stride = 16  # VGG16 downsamples by 16x

    def __init__(self,
                 n_fg_class=20,              # number of foreground classes
                 ratios=[0.5, 1, 2],         # anchor aspect ratios
                 anchor_scales=[8, 16, 32]   # anchor scales
                 ):
        # feature extractor, i.e. the backbone
        extractor, classifier = decom_vgg16()
        # Region Proposal Network (RPN)
        rpn = RegionProposalNetwork(
            512, 512,
            ratios=ratios,
            anchor_scales=anchor_scales,
            feat_stride=self.feat_stride,
        )
        # head
        head = VGG16RoIHead(
            n_class=n_fg_class + 1,
            roi_size=7,
            spatial_scale=(1. / self.feat_stride),
            classifier=classifier
        )
        super(FasterRCNNVGG16, self).__init__(
            extractor,
            rpn,
            head,
        )
```
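It is easy to sanity-check what `decom_vgg16` keeps: the first 30 entries of torchvision's `vgg16().features` end at the ReLU after conv5_3, i.e. the map has passed four 2x max-pools, giving 512 channels at stride 16 (which is exactly why `feat_stride = 16` above). A quick standalone check, a sketch rather than repo code (the `pretrained` argument matches the torchvision versions this repo targets):

```python
import torch
from torchvision.models import vgg16

extractor = torch.nn.Sequential(*list(vgg16(pretrained=False).features)[:30])
x = torch.randn(1, 3, 600, 800)   # a typical rescaled VOC image
print(extractor(x).shape)         # torch.Size([1, 512, 37, 50])
# 600 / 16 = 37.5 -> 37 and 800 / 16 = 50: a stride-16 feature map
```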
As the constructor shows, instantiating FasterRCNNVGG16 builds two sub-modules:
- RegionProposalNetwork (`./model/region_proposal_network.py`)
- VGG16RoIHead (`./model/faster_rcnn_vgg16.py`)

Let's look at RegionProposalNetwork (`./model/region_proposal_network.py`) first:
```python
import numpy as np
from torch.nn import functional as F
import torch as t
from torch import nn
from model.utils.bbox_tools import generate_anchor_base
from model.utils.creator_tool import ProposalCreator


class RegionProposalNetwork(nn.Module):
    # ——————————————————————————————————————————————————
    # RPN init parameters:
    # in_channels, mid_channels: channels of the input and intermediate layers;
    # ratios, anchor_scales: aspect ratios and scales of the anchors;
    # feat_stride: downsampling factor of the backbone;
    # proposal_creator_params: kwargs for model.utils.creator_tool.ProposalCreator
    # ——————————————————————————————————————————————————
    def __init__(
            self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2],
            anchor_scales=[8, 16, 32], feat_stride=16,
            proposal_creator_params=dict(),
    ):
        super(RegionProposalNetwork, self).__init__()
        # generate the set of prior base anchors [y_min, x_min, y_max, x_max]
        self.anchor_base = generate_anchor_base(anchor_scales=anchor_scales, ratios=ratios)
        # feature stride
        self.feat_stride = feat_stride
        # produces the RoIs that are fed into the head
        self.proposal_layer = ProposalCreator(self, **proposal_creator_params)

        # output heads of the RPN, sized by the number of base anchors
        n_anchor = self.anchor_base.shape[0]
        self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)   # intermediate layer
        self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)  # fg/bg cls head
        self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)    # location head
        # weight initialization
        normal_init(self.conv1, 0, 0.01)
        normal_init(self.score, 0, 0.01)
        normal_init(self.loc, 0, 0.01)

    # ——————————————————————————————————————————————————
    # RPN forward pass. Inputs:
    ## x: input feature map, shape = (N, C, H, W);
    ## img_size: size (height, width) of the network input image after scaling;
    ## scale: scaling factor applied to the raw image during preprocessing.
    # Returns:
    ## rpn_locs: offsets [t_y, t_x, t_h, t_w] relative to the anchor boxes,
    ##     shape = (N, H*W*A, 4), where A is the number of anchors per location
    ## rpn_scores: foreground/background class scores, shape = (N, H*W*A, 2)
    ## rois: candidate boxes for all images in the batch, shape = (R', 4),
    ##     where R' is the total number of proposals across the batch
    ## roi_indices: image index of each RoI, shape = (R',)
    ## anchor: prior anchor boxes laid out over the image, shape = (H*W*A, 4)
    # ——————————————————————————————————————————————————
    def forward(self, x, img_size, scale=1.):
        # height/width of the *feature map*
        n, _, hh, ww = x.shape
        # (1) generate the anchor boxes for this feature map
        anchor = _enumerate_shifted_anchor(
            np.array(self.anchor_base), self.feat_stride, hh, ww)
        # (2) anchors per location, used to reshape the two head outputs
        n_anchor = anchor.shape[0] // (hh * ww)

        # 1. predicted box offsets [t_y, t_x, t_h, t_w]
        h = F.relu(self.conv1(x))
        rpn_locs = self.loc(h)
        rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)
        # 2. predicted foreground/background class scores
        rpn_scores = self.score(h)
        rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous()
        rpn_softmax_scores = F.softmax(rpn_scores.view(n, hh, ww, n_anchor, 2), dim=4)
        rpn_fg_scores = rpn_softmax_scores[:, :, :, :, 1].contiguous()
        rpn_fg_scores = rpn_fg_scores.view(n, -1)
        rpn_scores = rpn_scores.view(n, -1, 2)

        # collect the RoIs of every image, plus roi_indices recording
        # which image each RoI belongs to
        rois = list()
        roi_indices = list()
        for i in range(n):
            # (3) RoIs for the i-th image, already filtered by NMS etc.
            #     (see class ProposalCreator in ./model/utils/creator_tool.py)
            roi = self.proposal_layer(
                rpn_locs[i].cpu().data.numpy(),       # RPN location predictions
                rpn_fg_scores[i].cpu().data.numpy(),  # RPN foreground scores
                anchor, img_size,                     # anchor boxes, rescaled image size
                scale=scale)                          # scaling factor
            # (4) append the RoIs of image i, tagging each with batch_index = i
            batch_index = i * np.ones((len(roi),), dtype=np.int32)
            rois.append(roi)
            roi_indices.append(batch_index)
        rois = np.concatenate(rois, axis=0)
        roi_indices = np.concatenate(roi_indices, axis=0)
        return rpn_locs, rpn_scores, rois, roi_indices, anchor


# ——————————————————————————————————————————————————
# Enumerate all shifted anchor boxes (all scales, all aspect ratios):
# _enumerate_shifted_anchor(anchor_base, feat_stride=16, feature-map height, feature-map width)
# returns anchors of shape (K*A, 4), where K = height*width grid positions
# and A = number of base anchors per position
# ——————————————————————————————————————————————————
def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
    import numpy as xp
    # (1) lay a grid over the original image, one cell per feature-map position
    shift_y = xp.arange(0, height * feat_stride, feat_stride)
    shift_x = xp.arange(0, width * feat_stride, feat_stride)
    shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
    shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
                      shift_y.ravel(), shift_x.ravel()), axis=1)
    # (2) translate the base anchors to every grid cell, producing the
    #     anchor boxes for a feature map of this stride
    A = anchor_base.shape[0]
    K = shift.shape[0]
    anchor = anchor_base.reshape((1, A, 4)) + \
        shift.reshape((1, K, 4)).transpose((1, 0, 2))
    anchor = anchor.reshape((K * A, 4)).astype(np.float32)
    return anchor


# ——————————————————————————————————————————————————
# Same as _enumerate_shifted_anchor, but with torch ops instead of numpy
# (the repo's version references an undefined `xp`; rewritten here with torch)
# ——————————————————————————————————————————————————
def _enumerate_shifted_anchor_torch(anchor_base, feat_stride, height, width):
    shift_y = t.arange(0, height * feat_stride, feat_stride)
    shift_x = t.arange(0, width * feat_stride, feat_stride)
    shift_y, shift_x = t.meshgrid(shift_y, shift_x)
    shift = t.stack((shift_y.reshape(-1), shift_x.reshape(-1),
                     shift_y.reshape(-1), shift_x.reshape(-1)), dim=1)
    A = anchor_base.shape[0]
    K = shift.shape[0]
    anchor = t.from_numpy(anchor_base).reshape(1, A, 4) + \
        shift.reshape(1, K, 4).permute(1, 0, 2)
    anchor = anchor.reshape(K * A, 4).float()
    return anchor


# ——————————————————————————————————————————————————
# Weight initialization
# ——————————————————————————————————————————————————
def normal_init(m, mean, stddev, truncated=False):
    # m is a layer whose weight/bias get initialized in place
    if truncated:
        m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
    else:
        m.weight.data.normal_(mean, stddev)
        m.bias.data.zero_()
```
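The shift arithmetic in `_enumerate_shifted_anchor` is easiest to see on a toy case: a 2x2 feature map with `feat_stride=16` and a single 16x16 base anchor (A = 1) yields K*A = 4 anchors, each being the base anchor translated to its grid cell. A self-contained sketch of the same broadcasting trick:

```python
import numpy as np

base = np.array([[-8., -8., 8., 8.]])  # one toy base anchor, A = 1
shift_y, shift_x = np.meshgrid(np.arange(0, 32, 16),
                               np.arange(0, 32, 16), indexing='ij')
shift = np.stack([shift_y.ravel(), shift_x.ravel(),
                  shift_y.ravel(), shift_x.ravel()], axis=1)  # (K, 4), K = 4
anchors = (base[None] + shift[:, None]).reshape(-1, 4)        # (K*A, 4)
print(anchors)
# [[-8. -8.  8.  8.]    cell (0, 0)
#  [-8.  8.  8. 24.]    cell (0, 1)
#  [ 8. -8. 24.  8.]    cell (1, 0)
#  [ 8.  8. 24. 24.]]   cell (1, 1)
```

With the real settings (A = 9 base anchors over an approximately 37x50 feature map), the same broadcast produces the (H*W*A, 4) anchor array the RPN forward pass works with.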
Next, VGG16RoIHead (defined alongside FasterRCNNVGG16 in `./model/faster_rcnn_vgg16.py`):
```python
# ——————————————————————————————————————————————————
# The final prediction head of Faster R-CNN, producing roi_cls_locs
# (per-class box locations) and roi_scores (per-class scores)
# ——————————————————————————————————————————————————
class VGG16RoIHead(nn.Module):
    def __init__(self, n_class, roi_size, spatial_scale,
                 classifier):
        # n_class includes the background
        super(VGG16RoIHead, self).__init__()
        # build the localization and classification heads
        self.classifier = classifier
        # localization head
        self.cls_loc = nn.Linear(4096, n_class * 4)
        # classification head
        self.score = nn.Linear(4096, n_class)
        # weight initialization
        normal_init(self.cls_loc, 0, 0.001)
        normal_init(self.score, 0, 0.01)
        # number of classes
        self.n_class = n_class

        # build the RoI pooling layer
        self.roi_size = roi_size          # fixed output size after RoI pooling
        self.spatial_scale = spatial_scale
        self.roi = RoIPool((self.roi_size, self.roi_size), self.spatial_scale)

    # ——————————————————————————————————————————————————
    # Forward pass
    # x: feature map produced by the VGG backbone;
    # rois: coordinates of the RoI boxes;
    # roi_indices: index of the image each RoI belongs to.
    # ——————————————————————————————————————————————————
    def forward(self, x, rois, roi_indices):
        # in case roi_indices is ndarray
        roi_indices = at.totensor(roi_indices).float()
        rois = at.totensor(rois).float()
        indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
        # NOTE: important: yx->xy
        xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
        indices_and_rois = xy_indices_and_rois.contiguous()

        # RoI-pool the feature map into a fixed-size feature per RoI
        pool = self.roi(x, indices_and_rois)
        pool = pool.view(pool.size(0), -1)

        # run the localization branch and the classification branch
        # on top of the shared fc layers
        fc7 = self.classifier(pool)
        roi_cls_locs = self.cls_loc(fc7)
        roi_scores = self.score(fc7)
        return roi_cls_locs, roi_scores


# ——————————————————————————————————————————————————
# Weight initialization
# ——————————————————————————————————————————————————
def normal_init(m, mean, stddev, truncated=False):
    """
    weight initializer: truncated normal and random normal.
    """
    # m is a layer whose weight/bias get initialized in place
    if truncated:
        m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
    else:
        m.weight.data.normal_(mean, stddev)
        m.bias.data.zero_()
```
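The yx -> xy column swap in `forward` exists because torchvision's `RoIPool` expects each row of its `rois` argument as [batch_index, x_min, y_min, x_max, y_max], while this code base keeps boxes in (y, x) order everywhere else. A minimal sketch of the pooling call on its own, with shapes matching a 600x800 input at stride 16:

```python
import torch
from torchvision.ops import RoIPool

pool = RoIPool(output_size=(7, 7), spatial_scale=1. / 16)
feat = torch.randn(1, 512, 37, 50)                  # stride-16 feature map
rois = torch.tensor([[0., 100., 50., 300., 250.]])  # [idx, x1, y1, x2, y2] on image 0
print(pool(feat, rois).shape)                       # torch.Size([1, 512, 7, 7])
```

Each RoI thus becomes a fixed 512x7x7 feature, which after flattening is exactly what the 25088-to-4096 `classifier` fc layers expect.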
Now let's circle back and see how the loss is backpropagated with backward(). The FasterRCNNTrainer class in `./trainer.py` wraps the training procedure and runs the backward pass over the losses, which consist of:
- RPN losses
  - rpn_loc_loss
  - rpn_cls_loss
- Head losses
  - roi_loc_loss
  - roi_cls_loss
- total_loss: the sum of the four losses above

Here is a quick overview of how FasterRCNNTrainer is used during training, i.e., its main methods:
- `__init__(...)`: wires up the target creators, optimizer, loss meters and the visdom wrapper;
- `forward(...)`: takes an image together with its ground-truth box locations and labels, runs the Faster R-CNN forward pass, and computes each loss term;
- `train_step(...)`: calls backward() on the loss and steps the optimizer;
- `save(...)`: saves the training state, including the weights, optimizer and model config;
- `load(...)`: loads a weight file and, optionally, the training config and optimizer state.

The full listing follows; it is fairly self-explanatory, so the comments are light.
```python
from __future__ import absolute_import
import os
from collections import namedtuple
import time
from torch.nn import functional as F
from model.utils.creator_tool import AnchorTargetCreator, ProposalTargetCreator
from torch import nn
import torch as t
from utils import array_tool as at
from utils.vis_tool import Visualizer
from utils.config import opt
from torchnet.meter import ConfusionMeter, AverageValueMeter

LossTuple = namedtuple('LossTuple',
                       ['rpn_loc_loss',
                        'rpn_cls_loss',
                        'roi_loc_loss',
                        'roi_cls_loss',
                        'total_loss'
                        ])

# ——————————————————————————————————————————————————
# Wraps the training procedure and backpropagates the losses.
# The losses include:
# * rpn_loc_loss: the localization loss for the Region Proposal Network (RPN).
# * rpn_cls_loss: the classification loss for the RPN.
# * roi_loc_loss: the localization loss for the head module.
# * roi_cls_loss: the classification loss for the head module.
# * total_loss: the sum of the 4 losses above.
# ——————————————————————————————————————————————————
class FasterRCNNTrainer(nn.Module):
    def __init__(self, faster_rcnn):
        super(FasterRCNNTrainer, self).__init__()

        self.faster_rcnn = faster_rcnn
        self.rpn_sigma = opt.rpn_sigma
        self.roi_sigma = opt.roi_sigma

        # target creators produce gt_bbox, gt_label etc. as training targets
        self.anchor_target_creator = AnchorTargetCreator()
        self.proposal_target_creator = ProposalTargetCreator()

        self.loc_normalize_mean = faster_rcnn.loc_normalize_mean
        self.loc_normalize_std = faster_rcnn.loc_normalize_std

        self.optimizer = self.faster_rcnn.get_optimizer()
        # visdom wrapper
        self.vis = Visualizer(env=opt.env)

        # indicators for training status
        self.rpn_cm = ConfusionMeter(2)
        self.roi_cm = ConfusionMeter(21)
        self.meters = {k: AverageValueMeter() for k in LossTuple._fields}  # average loss

    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)

        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois,
        # consider them as constant input
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE it's all zero because only batch=1 is supported for now
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        gt_rpn_label = at.totensor(gt_rpn_label).long()
        gt_rpn_loc = at.totensor(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.totensor(gt_roi_label).long()
        gt_roi_loc = at.totensor(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)

    def train_step(self, imgs, bboxes, labels, scale):
        self.optimizer.zero_grad()
        losses = self.forward(imgs, bboxes, labels, scale)
        losses.total_loss.backward()
        self.optimizer.step()
        self.update_meters(losses)
        return losses

    def save(self, save_optimizer=False, save_path=None, **kwargs):
        """serialize models include optimizer and other info
        return path where the model-file is stored.

        Args:
            save_optimizer (bool): whether save optimizer.state_dict().
            save_path (string): where to save model, if it's None, save_path
                is generated using time str and info from kwargs.

        Returns:
            save_path(str): the path to save models.
        """
        save_dict = dict()

        save_dict['model'] = self.faster_rcnn.state_dict()
        save_dict['config'] = opt._state_dict()
        save_dict['other_info'] = kwargs
        save_dict['vis_info'] = self.vis.state_dict()

        if save_optimizer:
            save_dict['optimizer'] = self.optimizer.state_dict()

        if save_path is None:
            timestr = time.strftime('%m%d%H%M')
            save_path = 'checkpoints/fasterrcnn_%s' % timestr
            for k_, v_ in kwargs.items():
                save_path += '_%s' % v_

        save_dir = os.path.dirname(save_path)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        t.save(save_dict, save_path)
        self.vis.save([self.vis.env])
        return save_path

    def load(self, path, load_optimizer=True, parse_opt=False):
        state_dict = t.load(path)
        if 'model' in state_dict:
            self.faster_rcnn.load_state_dict(state_dict['model'])
        else:  # legacy way, for backward compatibility
            self.faster_rcnn.load_state_dict(state_dict)
            return self
        if parse_opt:
            opt._parse(state_dict['config'])
        if 'optimizer' in state_dict and load_optimizer:
            self.optimizer.load_state_dict(state_dict['optimizer'])
        return self

    def update_meters(self, losses):
        loss_d = {k: at.scalar(v) for k, v in losses._asdict().items()}
        for key, meter in self.meters.items():
            meter.add(loss_d[key])

    def reset_meters(self):
        for key, meter in self.meters.items():
            meter.reset()
        self.roi_cm.reset()
        self.rpn_cm.reset()

    def get_meter_data(self):
        return {k: v.value()[0] for k, v in self.meters.items()}


def _smooth_l1_loss(x, t, in_weight, sigma):
    sigma2 = sigma ** 2
    diff = in_weight * (x - t)
    abs_diff = diff.abs()
    flag = (abs_diff.data < (1. / sigma2)).float()
    y = (flag * (sigma2 / 2.) * (diff ** 2) +
         (1 - flag) * (abs_diff - 0.5 / sigma2))
    return y.sum()


def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    in_weight = t.zeros(gt_loc.shape).cuda()
    # Localization loss is calculated only for positive rois.
    # NOTE: unlike the original implementation, we don't need inside_weight
    # and outside_weight; they can be computed from gt_label.
    in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)
    # Normalize by the total number of negative and positive rois.
    loc_loss /= ((gt_label >= 0).sum().float())  # ignore gt_label == -1 for rpn_loss
    return loc_loss
```
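`_smooth_l1_loss` is the smooth L1 (Huber-style) loss from the Faster R-CNN paper: quadratic while |x - t| < 1/sigma^2 and linear beyond that point, with `in_weight` zeroing out every entry that does not belong to a positive sample. A CPU-only sketch for checking it numerically (the `.cuda()` calls from `_fast_rcnn_loc_loss` are dropped here):

```python
import torch

def smooth_l1(x, t, in_weight, sigma):
    sigma2 = sigma ** 2
    diff = in_weight * (x - t)
    abs_diff = diff.abs()
    flag = (abs_diff < (1. / sigma2)).float()  # 1 inside the quadratic zone
    return (flag * (sigma2 / 2.) * diff ** 2 +
            (1 - flag) * (abs_diff - 0.5 / sigma2)).sum()

x = torch.tensor([0.2, 3.0])
t_ = torch.tensor([0.0, 0.0])
w = torch.tensor([1.0, 1.0])
print(smooth_l1(x, t_, w, sigma=1.))  # 0.5*0.2**2 + (3.0 - 0.5) = tensor(2.5200)
```

With sigma > 1 (the config defaults are rpn_sigma = 3 and roi_sigma = 1), the quadratic zone shrinks to |d| < 1/9, so the RPN localization loss behaves almost like a plain L1 away from zero.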
Related links:
- 【github】simple-faster-rcnn-pytorch
- 【Object Detection】Faster RCNN Code Implementation (2)