一起来看看Retinanet的Pytorch实现吧,顺便训练一下自己的数据。
Retinanet是在何凯明大神提出Focal loss同时提出的一种新的目标检测方案,来验证Focal Loss的有效性。
One-Stage目标检测方法常常使用先验框提高预测性能,一张图像可能生成成千上万的候选框,但是其中只有很少一部分是包含目标的的,有目标的就是正样本,没有目标的就是负样本。这种情况造成了One-Stage目标检测方法的正负样本不平衡,也使得One-Stage目标检测方法的检测效果比不上Two-Stage目标检测方法。
Focal Loss是一种新的用于平衡One-Stage目标检测方法正负样本的Loss方案。
Retinane的结构非常简单,但是其存在非常多的先验框,以输入600x600x3的图片为例,就存在着67995个先验框,这些先验框里面大多包含的是背景,存在非常多的负样本。以Focal Loss训练的Retinanet可以有效的平衡正负样本,实现有效的训练。
https://github.com/bubbliiiing/retinanet-pytorch
喜欢的可以点个star噢。
Retinanet采用的主干网络是Resnet网络,关于Resnet的介绍大家可以看我的另外一篇博客https://blog.csdn.net/weixin_44791964/article/details/102790260。
本例子假设输入的图片大小为600x600x3。
ResNet50有两个基本的块,分别名为Conv Block和Identity Block,其中Conv Block输入和输出的维度是不一样的,所以不能连续串联,它的作用是改变网络的维度;Identity Block输入维度和输出维度相同,可以串联,用于加深网络的。
Conv Block的结构如下:
Identity Block的结构如下:
这两个都是残差网络结构。
当输入的图片为600x600x3的时候,shape变化与总的网络结构如下:
我们取出长宽压缩了三次、四次、五次的结果来进行网络金字塔结构的构造。
实现代码:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
import torch.utils.model_zoo as model_zoo
import pdb
model_urls = {
'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth',
'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth',
'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth',
'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18(pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18'], model_dir='model_data'), strict=False)
return model
def resnet34(pretrained=False, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34'], model_dir='model_data'), strict=False)
return model
def resnet50(pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50'], model_dir='model_data'), strict=False)
return model
def resnet101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101'], model_dir='model_data'), strict=False)
return model
def resnet152(pretrained=False, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152'], model_dir='model_data'), strict=False)
return model
由抽象的结构图可知,获得到的特征还需要经过图像金字塔的处理,这样的结构可以融合多尺度的特征,实现更有效的预测。
图像金字塔的具体结构如下:
通过图像金字塔我们可以获得五个有效的特征层,分别是P3、P4、P5、P6、P7,
为了和普通特征层区分,我们称之为有效特征层,将这五个有效的特征层传输过class+box subnets就可以获得预测结果了。
class subnet采用4次256通道的卷积和1次num_priors x num_classes的卷积,num_priors指的是该特征层所拥有的先验框数量,num_classes指的是网络一共对多少类的目标进行检测。
box subnet采用4次256通道的卷积和1次num_priors x 4的卷积,num_priors指的是该特征层所拥有的先验框数量,4指的是先验框的调整情况。
需要注意的是,每个特征层所用的class subnet是同一个class subnet;每个特征层所用的box subnet是同一个box subnet。
其中:
num_priors x 4的卷积 用于预测 该特征层上 每一个网格点上 每一个先验框的变化情况。(为什么说是变化情况呢,这是因为ssd的预测结果需要结合先验框获得预测框,预测结果就是先验框的变化情况。)
num_priors x num_classes的卷积 用于预测 该特征层上 每一个网格点上 每一个预测框对应的种类。
实现代码为:
import torch.nn as nn
import torch.nn.functional as F
import torch
import math
from nets.resnet import resnet18,resnet34,resnet50,resnet101,resnet152
from utils.anchors import Anchors
class PyramidFeatures(nn.Module):
def __init__(self, C3_size, C4_size, C5_size, feature_size=256):
super(PyramidFeatures, self).__init__()
self.P5_1 = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0)
self.P5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
self.P4_1 = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0)
self.P4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0)
self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1)
self.P7_1 = nn.ReLU()
self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1)
def forward(self, inputs):
C3, C4, C5 = inputs
_, _, h4, w4 = C4.size()
_, _, h3, w3 = C3.size()
P5_x = self.P5_1(C5)
P5_upsampled_x = F.interpolate(P5_x, size=(h4, w4))
P5_x = self.P5_2(P5_x)
P4_x = self.P4_1(C4)
P4_x = P5_upsampled_x + P4_x
P4_upsampled_x = F.interpolate(P4_x, size=(h3, w3))
P4_x = self.P4_2(P4_x)
P3_x = self.P3_1(C3)
P3_x = P3_x + P4_upsampled_x
P3_x = self.P3_2(P3_x)
P6_x = self.P6(C5)
P7_x = self.P7_1(P6_x)
P7_x = self.P7_2(P7_x)
return [P3_x, P4_x, P5_x, P6_x, P7_x]
class RegressionModel(nn.Module):
def __init__(self, num_features_in, num_anchors=9, feature_size=256):
super(RegressionModel, self).__init__()
self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
self.act1 = nn.ReLU()
self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act2 = nn.ReLU()
self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act3 = nn.ReLU()
self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act4 = nn.ReLU()
self.output = nn.Conv2d(feature_size, num_anchors * 4, kernel_size=3, padding=1)
def forward(self, x):
out = self.conv1(x)
out = self.act1(out)
out = self.conv2(out)
out = self.act2(out)
out = self.conv3(out)
out = self.act3(out)
out = self.conv4(out)
out = self.act4(out)
out = self.output(out)
# out is B x C x W x H, with C = 4*num_anchors
out = out.permute(0, 2, 3, 1)
return out.contiguous().view(out.shape[0], -1, 4)
class ClassificationModel(nn.Module):
def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256):
super(ClassificationModel, self).__init__()
self.num_classes = num_classes
self.num_anchors = num_anchors
self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
self.act1 = nn.ReLU()
self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act2 = nn.ReLU()
self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act3 = nn.ReLU()
self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act4 = nn.ReLU()
self.output = nn.Conv2d(feature_size, num_anchors * num_classes, kernel_size=3, padding=1)
self.output_act = nn.Sigmoid()
def forward(self, x):
out = self.conv1(x)
out = self.act1(out)
out = self.conv2(out)
out = self.act2(out)
out = self.conv3(out)
out = self.act3(out)
out = self.conv4(out)
out = self.act4(out)
out = self.output(out)
out = self.output_act(out)
# out is B x C x W x H, with C = n_classes + n_anchors
out1 = out.permute(0, 2, 3, 1)
batch_size, width, height, channels = out1.shape
out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes)
return out2.contiguous().view(x.shape[0], -1, self.num_classes)
class Resnet(nn.Module):
def __init__(self, phi, load_weights=False):
super(Resnet, self).__init__()
self.edition = [resnet18,resnet34,resnet50,resnet101,resnet152]
model = self.edition[phi](load_weights)
del model.avgpool
del model.fc
self.model = model
def forward(self, x):
x = self.model.conv1(x)
x = self.model.bn1(x)
x = self.model.relu(x)
x = self.model.maxpool(x)
x = self.model.layer1(x)
feat1 = self.model.layer2(x)
feat2 = self.model.layer3(feat1)
feat3 = self.model.layer4(feat2)
return [feat1,feat2,feat3]
class Retinanet(nn.Module):
def __init__(self, num_classes, phi, pretrain_weights=False):
super(Retinanet, self).__init__()
self.pretrain_weights = pretrain_weights
self.backbone_net = Resnet(phi,pretrain_weights)
fpn_sizes = {
0: [128, 256, 512],
1: [128, 256, 512],
2: [512, 1024, 2048],
3: [512, 1024, 2048],
4: [512, 1024, 2048],
}[phi]
self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2])
self.regressionModel = RegressionModel(256)
self.classificationModel = ClassificationModel(256, num_classes=num_classes)
self.anchors = Anchors()
self._init_weights()
def _init_weights(self):
if not self.pretrain_weights:
print("_init_weights")
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
print("_init_classificationModel")
prior = 0.01
self.classificationModel.output.weight.data.fill_(0)
self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior))
print("_init_regressionModel")
self.regressionModel.output.weight.data.fill_(0)
self.regressionModel.output.bias.data.fill_(0)
def forward(self, inputs):
p3, p4, p5 = self.backbone_net(inputs)
features = self.fpn([p3, p4, p5])
regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1)
classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1)
anchors = self.anchors(features)
return features, regression, classification, anchors
我们通过对每一个特征层的处理,可以获得三个内容,分别是:
num_priors x 4的卷积 用于预测 该特征层上 每一个网格点上 每一个先验框的变化情况。**
num_priors x num_classes的卷积 用于预测 该特征层上 每一个网格点上 每一个预测框对应的种类。
每一个有效特征层对应的先验框对应着该特征层上 每一个网格点上 预先设定好的9个框。
我们利用 num_priors x 4的卷积 与 每一个有效特征层对应的先验框 获得框的真实位置。
每一个有效特征层对应的先验框就是,如图所示的作用:
每一个有效特征层将整个图片分成与其长宽对应的网格,如P3的特征层就是将整个图像分成75x75个网格;然后从每个网格中心建立9个先验框,一共75x75x9个,50625个先验框。
先验框虽然可以代表一定的框的位置信息与框的大小信息,但是其是有限的,无法表示任意情况,因此还需要调整,Retinanet利用4次256通道的卷积+num_priors x 4的卷积的结果对先验框进行调整。
num_priors x 4中的num_priors表示了这个网格点所包含的先验框数量,其中的4表示了框的左上角xy轴,右下角xy的调整情况。
Retinanet解码过程就是将对应的先验框的左上角和右下角进行位置的调整,调整完的结果就是预测框的位置了。
当然得到最终的预测结构后还要进行得分排序与非极大抑制筛选这一部分基本上是所有目标检测通用的部分。
1、取出每一类得分大于confidence_threshold的框和得分。
2、利用框的位置和得分进行非极大抑制。
实现代码如下:
def decodebox(regression, anchors, img):
dtype = regression.dtype
anchors = anchors.to(dtype)
y_centers_a = (anchors[..., 0] + anchors[..., 2]) / 2
x_centers_a = (anchors[..., 1] + anchors[..., 3]) / 2
ha = anchors[..., 2] - anchors[..., 0]
wa = anchors[..., 3] - anchors[..., 1]
w = regression[..., 3].exp() * wa
h = regression[..., 2].exp() * ha
y_centers = regression[..., 0] * ha + y_centers_a
x_centers = regression[..., 1] * wa + x_centers_a
ymin = y_centers - h / 2.
xmin = x_centers - w / 2.
ymax = y_centers + h / 2.
xmax = x_centers + w / 2.
boxes = torch.stack([xmin, ymin, xmax, ymax], dim=2)
_, _, height, width = np.shape(img)
boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0)
boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width - 1)
boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height - 1)
# fig = plt.figure()
# ax = fig.add_subplot(121)
# grid_x = x_centers_a[0,-4*4*9:]
# grid_y = y_centers_a[0,-4*4*9:]
# plt.ylim(-600,1200)
# plt.xlim(-600,1200)
# plt.gca().invert_yaxis()
# plt.scatter(grid_x.cpu(),grid_y.cpu())
# anchor_left = anchors[0,-4*4*9:,1]
# anchor_top = anchors[0,-4*4*9:,0]
# anchor_w = wa[0,-4*4*9:]
# anchor_h = ha[0,-4*4*9:]
# for i in range(9,18):
# rect1 = plt.Rectangle([anchor_left[i],anchor_top[i]],anchor_w[i],anchor_h[i],color="r",fill=False)
# ax.add_patch(rect1)
# ax = fig.add_subplot(122)
# grid_x = x_centers_a[0,-4*4*9:]
# grid_y = y_centers_a[0,-4*4*9:]
# plt.scatter(grid_x.cpu(),grid_y.cpu())
# plt.ylim(-600,1200)
# plt.xlim(-600,1200)
# plt.gca().invert_yaxis()
# y_centers = y_centers[0,-4*4*9:]
# x_centers = x_centers[0,-4*4*9:]
# pre_left = xmin[0,-4*4*9:]
# pre_top = ymin[0,-4*4*9:]
# pre_w = xmax[0,-4*4*9:]-xmin[0,-4*4*9:]
# pre_h = ymax[0,-4*4*9:]-ymin[0,-4*4*9:]
# for i in range(9,18):
# plt.scatter(x_centers[i].cpu(),y_centers[i].cpu(),c='r')
# rect1 = plt.Rectangle([pre_left[i],pre_top[i]],pre_w[i],pre_h[i],color="r",fill=False)
# ax.add_patch(rect1)
# plt.show()
return boxes
通过第三步,我们可以获得预测框在原图上的位置,而且这些预测框都是经过筛选的。这些筛选后的框可以直接绘制在图片上,就可以获得结果了。
从预测部分我们知道,每个特征层的预测结果,num_priors x 4的卷积 用于预测 该特征层上 每一个网格点上 每一个先验框的变化情况。
也就是说,我们直接利用retinanet网络预测到的结果,并不是预测框在图片上的真实位置,需要解码才能得到真实位置。
而在训练的时候,我们需要计算loss函数,这个loss函数是相对于Retinanet网络的预测结果的。我们需要把图片输入到当前的Retinanet网络中,得到预测结果;同时还需要把真实框的信息,进行编码,这个编码是把真实框的位置信息格式转化为Retinanet预测结果的格式信息。
也就是,我们需要找到 每一张用于训练的图片的每一个真实框对应的先验框,并求出如果想要得到这样一个真实框,我们的预测结果应该是怎么样的。
从预测结果获得真实框的过程被称作解码,而从真实框获得预测结果的过程就是编码的过程。
因此我们只需要将解码过程逆过来就是编码过程了。
在进行编码的时候,我们需要找到每一个真实框对应的先眼眶,我们把和真实框重合程度在0.5以上的作为正样本,在0.4以下的作为负样本,在0.4和0.5之间的作为忽略样本。
实现代码如下:
def get_target(anchor, bbox_annotation, classification, cuda):
IoU = calc_iou(anchor[:, :], bbox_annotation[:, :4])
IoU_max, IoU_argmax = torch.max(IoU, dim=1)
# compute the loss for classification
targets = torch.ones_like(classification) * -1
if cuda:
targets = targets.cuda()
targets[torch.lt(IoU_max, 0.4), :] = 0
positive_indices = torch.ge(IoU_max, 0.5)
num_positive_anchors = positive_indices.sum()
assigned_annotations = bbox_annotation[IoU_argmax, :]
targets[positive_indices, :] = 0
targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1
return targets, num_positive_anchors, positive_indices, assigned_annotations
def encode_bbox(assigned_annotations, positive_indices, anchor_widths, anchor_heights, anchor_ctr_x, anchor_ctr_y):
assigned_annotations = assigned_annotations[positive_indices, :]
anchor_widths_pi = anchor_widths[positive_indices]
anchor_heights_pi = anchor_heights[positive_indices]
anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
anchor_ctr_y_pi = anchor_ctr_y[positive_indices]
gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0]
gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1]
gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths
gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights
# efficientdet style
gt_widths = torch.clamp(gt_widths, min=1)
gt_heights = torch.clamp(gt_heights, min=1)
targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
targets_dw = torch.log(gt_widths / anchor_widths_pi)
targets_dh = torch.log(gt_heights / anchor_heights_pi)
targets = torch.stack((targets_dy, targets_dx, targets_dh, targets_dw))
targets = targets.t()
return targets
利用上述代码我们可以获得,真实框编码后的结果。
loss的计算分为两个部分:
1、Smooth Loss:获取所有正标签的框的预测结果的回归loss。
2、Focal Loss:获取所有未被忽略的种类的预测结果的交叉熵loss。
由于在Retinanet的训练过程中,正负样本极其不平衡,即 存在对应真实框的先验框可能只有若干个,但是不存在对应真实框的负样本却有上万个,这就会导致负样本的loss值极大,因此引入了Focal Loss进行正负样本的平衡,关于Focal Loss的介绍可以看这个博客。
https://blog.csdn.net/weixin_44791964/article/details/102853782
实现代码如下:
class FocalLoss(nn.Module):
def __init__(self):
super(FocalLoss, self).__init__()
def forward(self, classifications, regressions, anchors, annotations, alpha = 0.25, gamma = 2.0, cuda = True):
# 设置
dtype = regressions.dtype
batch_size = classifications.shape[0]
classification_losses = []
regression_losses = []
# 获得先验框,将先验框转换成中心宽高的形势
anchor = anchors[0, :, :].to(dtype)
# 转换成中心,宽高的形式
anchor_widths = anchor[:, 3] - anchor[:, 1]
anchor_heights = anchor[:, 2] - anchor[:, 0]
anchor_ctr_x = anchor[:, 1] + 0.5 * anchor_widths
anchor_ctr_y = anchor[:, 0] + 0.5 * anchor_heights
for j in range(batch_size):
# 取出真实框
bbox_annotation = annotations[j]
# 获得每张图片的分类结果和回归预测结果
classification = classifications[j, :, :]
regression = regressions[j, :, :]
# 平滑标签
classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)
if len(bbox_annotation) == 0:
alpha_factor = torch.ones_like(classification) * alpha
if cuda:
alpha_factor = alpha_factor.cuda()
alpha_factor = 1. - alpha_factor
focal_weight = classification
focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
bce = -(torch.log(1.0 - classification))
cls_loss = focal_weight * bce
if cuda:
regression_losses.append(torch.tensor(0).to(dtype).cuda())
else:
regression_losses.append(torch.tensor(0).to(dtype))
classification_losses.append(cls_loss.sum())
continue
# 获得目标预测结果
targets, num_positive_anchors, positive_indices, assigned_annotations = get_target(anchor, bbox_annotation, classification, cuda)
alpha_factor = torch.ones_like(targets) * alpha
if cuda:
alpha_factor = alpha_factor.cuda()
alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor)
focal_weight = torch.where(torch.eq(targets, 1.), 1. - classification, classification)
focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification))
cls_loss = focal_weight * bce
zeros = torch.zeros_like(cls_loss)
if cuda:
zeros = zeros.cuda()
cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, zeros)
classification_losses.append(cls_loss.sum() / torch.clamp(num_positive_anchors.to(dtype), min=1.0))
# smoooth_l1
if positive_indices.sum() > 0:
targets = encode_bbox(assigned_annotations, positive_indices, anchor_widths, anchor_heights, anchor_ctr_x, anchor_ctr_y)
regression_diff = torch.abs(targets - regression[positive_indices, :])
regression_loss = torch.where(
torch.le(regression_diff, 1.0 / 9.0),
0.5 * 9.0 * torch.pow(regression_diff, 2),
regression_diff - 0.5 / 9.0
)
regression_losses.append(regression_loss.mean())
else:
if cuda:
regression_losses.append(torch.tensor(0).to(dtype).cuda())
else:
regression_losses.append(torch.tensor(0).to(dtype))
c_loss = torch.stack(classification_losses).mean()
r_loss = torch.stack(regression_losses).mean()
loss = c_loss + r_loss
return loss, c_loss, r_loss
Retinanet整体的文件夹构架如下:
本文使用VOC格式进行训练。
训练前将标签文件放在VOCdevkit文件夹下的VOC2007文件夹下的Annotation中。
训练前将图片文件放在VOCdevkit文件夹下的VOC2007文件夹下的JPEGImages中。
在训练前利用voc2retinanet.py文件生成对应的txt。
再运行根目录下的voc_annotation.py,运行前需要将classes改成你自己的classes。
classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
就会生成对应的2007_train.txt,每一行对应其图片位置及其真实框的位置。
在训练前需要修改model_data里面的voc_classes.txt文件,需要将classes改成你自己的classes。
运行train.py即可开始训练。