目录
1、序言
2、yolov2的贡献点
3、yolov2框的回归方式
4、总结
Yolov1的框回归较为困难,依然存在很多缺点,在此基础上提出了Yolov2,以下对yolov2进行简单介绍。
(1)yolov1实际上可以认为是一个anchor free模型,但由于其框的回归质量差,yolov2提出使用anchor box作为先验知识,在此基础上回归目标框。
(2)yolov2在卷积层后面加了BN层,去掉了fc层。
(3)增加了reorg层,用来将底层特征加到高层特征,增加上下文的语义信息,提升对小目标的检出。(例如:16倍下采样的feature在不同维度信息交互后,下采样到32倍,通道维度增加4倍,然后和原来的32倍下采样的feature进行concat输出)
(4)anchor box的尺寸是聚类得到的。
(5)采用了多尺度训练。
(6)分类和检测进行了联合训练。先用448尺寸的imagenet图像训练分类模型,再用它作为预训练模型训练检测网络。
网络最后输出的feature map大小为w×h,即把输入图像划分为w×h个网格,每个网格对应feature map上的一个1x1位置。每个anchor预测五个值Tx,Ty,Tw,Th和T0(T0为置信度)。Tx和Ty是目标中心点相对于所在网格左上角坐标的偏移量,用sigmoid将其限制在0到1范围,再加上网格左上角坐标就是中心点在feature map上的坐标,再乘下采样倍率就是绝对坐标下的中心点坐标。yolov1也是预测的偏移量,是相对于网格大小的偏移量。yolov1因为没有anchor box,所以直接预测目标w和h,而yolov2预测一个系数(即$e^{T_w}$和$e^{T_h}$),乘以anchor box的w和h得到目标的w和h。
代码写了一半就懒得写了,有兴趣的可以对齐target和predict后加loss训练就行了。主体的网络模型如下,我是在vovnet的基础上进行修改的。v3的论文相对v2就一句话,加了fpn做多级检测。
#encoding:utf-8
#author: [email protected]
import os
import torch
import torch.nn as nn
from collections import OrderedDict
import sys,os
sys.path.append(os.path.dirname(__file__) + os.sep + '../')
from configs.config import Configs as cfg
# Public names exported by `from <this module> import *`.
# NOTE(review): no `vovnet27_slim` is defined in the visible part of this file
# (the model classes are `Vovnet27_slim` and `Vovnet27_slim_reorg`). A name
# listed here that does not exist makes a star-import raise AttributeError --
# confirm whether this entry should name one of those classes instead.
__all__ = [ 'vovnet27_slim']
def conv3x3(in_channels, out_channels, module_name, postfix,
            stride=1, groups=1, kernel_size=3, padding=1):
    """Build the named layers of a conv -> batch-norm -> ReLU unit.

    With the default ``kernel_size=3`` and ``padding=1`` the convolution is a
    padded 3x3 convolution. The convolution has no bias term.

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels (also the batch-norm width).
        module_name: First part of every layer name.
        postfix: Second part of every layer name.
        stride: Convolution stride.
        groups: Convolution groups.
        kernel_size: Convolution kernel size.
        padding: Convolution zero padding.

    Returns:
        A list of three ``(name, module)`` pairs, in execution order, named
        ``"<module_name>_<postfix>/conv"``, ``".../norm"`` and ``".../relu"``.
    """
    prefix = '{}_{}'.format(module_name, postfix)

    conv = nn.Conv2d(in_channels, out_channels,
                     kernel_size=kernel_size,
                     stride=stride,
                     padding=padding,
                     groups=groups,
                     bias=False)
    norm = nn.BatchNorm2d(out_channels)
    relu = nn.ReLU(inplace=True)

    return [
        (prefix + '/conv', conv),
        (prefix + '/norm', norm),
        (prefix + '/relu', relu),
    ]
def conv1x1(in_channels, out_channels, module_name, postfix,
            stride=1, groups=1, kernel_size=1, padding=0):
    """Build the named layers of a pointwise conv -> batch-norm -> ReLU unit.

    With the default ``kernel_size=1`` and ``padding=0`` the convolution is a
    1x1 convolution. The convolution has no bias term.

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels (also the batch-norm width).
        module_name: First part of every layer name.
        postfix: Second part of every layer name.
        stride: Convolution stride.
        groups: Convolution groups.
        kernel_size: Convolution kernel size.
        padding: Convolution zero padding.

    Returns:
        A list of three ``(name, module)`` pairs, in execution order, named
        ``"<module_name>_<postfix>/conv"``, ``".../norm"`` and ``".../relu"``.
    """
    conv_kwargs = dict(kernel_size=kernel_size,
                       stride=stride,
                       padding=padding,
                       groups=groups,
                       bias=False)
    # dicts keep insertion order, so the pairs come out as conv, norm, relu.
    stages = {
        'conv': nn.Conv2d(in_channels, out_channels, **conv_kwargs),
        'norm': nn.BatchNorm2d(out_channels),
        'relu': nn.ReLU(inplace=True),
    }
    return [
        ('{}_{}/{}'.format(module_name, postfix, kind), layer)
        for kind, layer in stages.items()
    ]
class _OSA_module(nn.Module):
    """One-shot-aggregation block.

    A chain of ``layer_per_block`` conv3x3 units is applied to the input. The
    input and the output of every unit are concatenated along the channel
    axis and fused by a single conv1x1 unit.

    Args:
        in_ch: Channels of the block input.
        stage_ch: Output channels of every conv3x3 unit in the chain.
        concat_ch: Output channels of the fusing conv1x1 unit.
        layer_per_block: Number of conv3x3 units in the chain.
        module_name: Prefix used for the names of the inner layers.
        identity: When true, the block input is added to the fused output
            (this needs ``in_ch == concat_ch`` for the shapes to match).
    """

    def __init__(self,
                 in_ch,
                 stage_ch,
                 concat_ch,
                 layer_per_block,
                 module_name,
                 identity=False):
        super().__init__()
        self.identity = identity

        self.layers = nn.ModuleList()
        for idx in range(layer_per_block):
            # Only the first unit sees the block input width.
            src_ch = in_ch if idx == 0 else stage_ch
            unit = OrderedDict(conv3x3(src_ch, stage_ch, module_name, idx))
            self.layers.append(nn.Sequential(unit))

        # Feature aggregation: block input plus every unit output.
        fused_ch = in_ch + layer_per_block * stage_ch
        self.concat = nn.Sequential(
            OrderedDict(conv1x1(fused_ch, concat_ch, module_name, 'concat')))

    def forward(self, x):
        """Return the fused feature map, plus ``x`` when ``identity`` is set."""
        features = [x]
        for layer in self.layers:
            features.append(layer(features[-1]))

        fused = self.concat(torch.cat(features, dim=1))
        if not self.identity:
            return fused
        return fused + x
class _OSA_stage(nn.Sequential):
    """A sequence of OSA blocks, optionally preceded by a stride-2 max-pool.

    Args:
        in_ch: Channels entering the stage.
        stage_ch: Width of the conv3x3 units inside every block.
        concat_ch: Output channels of every block.
        block_per_stage: Number of OSA blocks; the first one is always
            created, the remaining ``block_per_stage - 1`` use the identity
            shortcut.
        layer_per_block: Number of conv3x3 units per block.
        stage_num: Stage index used in the block names; every stage except
            stage 2 starts with a ``'Pooling'`` max-pool layer.
    """

    def __init__(self,
                 in_ch,
                 stage_ch,
                 concat_ch,
                 block_per_stage,
                 layer_per_block,
                 stage_num):
        super().__init__()

        if stage_num != 2:
            pool = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
            self.add_module('Pooling', pool)

        # First block maps in_ch -> concat_ch, so it has no shortcut.
        first_name = f'OSA{stage_num}_1'
        self.add_module(first_name,
                        _OSA_module(in_ch,
                                    stage_ch,
                                    concat_ch,
                                    layer_per_block,
                                    first_name))

        # Following blocks keep concat_ch channels and add their input back.
        for block_no in range(2, block_per_stage + 1):
            block_name = f'OSA{stage_num}_{block_no}'
            self.add_module(block_name,
                            _OSA_module(concat_ch,
                                        stage_ch,
                                        concat_ch,
                                        layer_per_block,
                                        block_name,
                                        identity=True))
class VoVNet(nn.Module):
    """VoVNet backbone returning the feature maps of its last three stages.

    The stem is three conv3x3 units (strides 2, 1, 2) ending with 128
    channels. Four OSA stages named ``stage2`` .. ``stage5`` follow; every
    stage except ``stage2`` starts with a stride-2 max-pool.

    Args:
        cfg: Configuration object. Its ``Pretrain_model`` attribute is read;
            when truthy it is used as the checkpoint path for `load_param`.
        config_stage_ch: Per-stage width of the conv3x3 units (4 entries used).
        config_concat_ch: Per-stage output channels (4 entries used).
        block_per_stage: Per-stage number of OSA blocks (4 entries used).
        layer_per_block: Number of conv3x3 units in every OSA block.
        num_classes: Accepted for signature compatibility; not used here.
    """

    def __init__(self,
                 cfg,
                 config_stage_ch,
                 config_concat_ch,
                 block_per_stage,
                 layer_per_block,
                 num_classes=1000):
        super(VoVNet, self).__init__()
        self.cfg = cfg

        # Stem module
        stem = conv3x3(3, 64, 'stem', '1', 2)
        stem += conv3x3(64, 64, 'stem', '2', 1)
        stem += conv3x3(64, 128, 'stem', '3', 2)
        self.add_module('stem', nn.Sequential(OrderedDict(stem)))

        # Each stage consumes the previous stage's output width.
        stem_out_ch = [128]
        in_ch_list = stem_out_ch + config_concat_ch[:-1]

        self.stage_names = []
        for i in range(4):  # num_stages
            name = 'stage%d' % (i + 2)
            self.stage_names.append(name)
            self.add_module(name,
                            _OSA_stage(in_ch_list[i],
                                       config_stage_ch[i],
                                       config_concat_ch[i],
                                       block_per_stage[i],
                                       layer_per_block,
                                       i + 2))

        # Not used by forward(); kept so the attribute stays available.
        self.MaxPool = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.constant_(m.bias, 0)

        # Pretrained weights, when configured, overwrite the init above.
        if self.cfg.Pretrain_model:
            self.load_param(model_path=self.cfg.Pretrain_model)

    def forward(self, x):
        """Run the stem and all stages; return the outputs of stages 3-5.

        The output of the first stage (``stage2``) is dropped.
        """
        x = self.stem(x)
        result = []
        for name in self.stage_names:
            x = getattr(self, name)(x)
            result.append(x)
        return result[1:]

    def load_param(self, model_path):
        """Copy matching weights from a checkpoint file into this model.

        The file must hold a mapping with a ``'state_dict'`` entry. Each
        entry of this model's state dict is looked up there under
        ``'module.'`` plus its own name, with ``'stages.'`` rewritten to
        ``'stage.stage_'``. Entries whose name contains ``head``, ``fc`` or
        ``classifier`` are skipped, as are 0-dimensional entries.

        Args:
            model_path: Path of the checkpoint file.

        Raises:
            KeyError: If the checkpoint lacks ``'state_dict'`` or an entry
                this model needs.
        """
        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
        param_dict = torch.load(model_path, map_location='cpu')['state_dict']

        # One snapshot is enough: its tensors share storage with the live
        # parameters and buffers, so copy_ below updates the model in place.
        model_dict = self.state_dict()
        skipped_parts = ('head', 'fc', 'classifier')

        with torch.no_grad():
            for name, target in model_dict.items():
                if any(part in name for part in skipped_parts):
                    continue
                if target.dim() == 0:
                    # Scalar entries (e.g. BatchNorm's num_batches_tracked)
                    # are left untouched.
                    continue
                src_key = 'module.' + name.replace('stages.', 'stage.stage_')
                if src_key not in param_dict:
                    raise KeyError(
                        f"checkpoint {model_path!r} has no entry "
                        f"{src_key!r} (needed for {name!r})")
                target.copy_(param_dict[src_key])
class Vovnet27_slim(VoVNet):
    """VoVNet preset: one 5-layer OSA block in each of the four stages.

    Args:
        cfg: Configuration object forwarded unchanged to ``VoVNet``.
    """

    def __init__(self, cfg):
        stage_widths = [64, 80, 96, 112]
        concat_widths = [128, 256, 384, 512]
        blocks = [1, 1, 1, 1]
        super().__init__(cfg=cfg,
                         config_stage_ch=stage_widths,
                         config_concat_ch=concat_widths,
                         block_per_stage=blocks,
                         layer_per_block=5,
                         num_classes=1000)
class reorg_layer(nn.Module):
    """Space-to-depth rearrangement of a 4-D tensor.

    Every ``stride x stride`` spatial block is moved into the channel axis,
    so ``(B, C, H, W)`` becomes ``(B, stride*stride*C, H//stride, W//stride)``.
    The value at offset ``(i, j)`` inside a block, from input channel ``c``,
    lands in output channel ``(i*stride + j)*C + c``.

    Args:
        stride: Side length of the spatial block folded into channels.
    """

    def __init__(self, stride):
        super().__init__()
        self.stride = stride

    def forward(self, x):
        """Return the rearranged tensor; ``H`` and ``W`` must divide by stride."""
        s = self.stride
        n, c, h, w = x.size()
        out_h, out_w = h // s, w // s

        # Split both spatial axes into (cell index, offset inside the cell).
        cells = x.view(n, c, out_h, s, out_w, s)
        # Bring the two offsets in front of the channel axis, then merge
        # (row offset, col offset, channel) into one channel axis.
        cells = cells.permute(0, 3, 5, 1, 2, 4)
        return cells.reshape(n, -1, out_h, out_w)
class Vovnet27_slim_reorg(VoVNet):
    """VoVNet-27-slim backbone with a reorg passthrough and a prediction head.

    The second of the three backbone outputs is reduced to 128 channels,
    folded to half resolution by ``reorg_layer`` (4x the channels),
    concatenated with the last backbone output (512 + 512 = 1024 channels),
    fused by a 3x3 conv unit and mapped to the prediction by a 1x1 conv.

    Args:
        cfg: Configuration object. Besides what ``VoVNet`` reads, its
            ``Inputs`` mapping must provide ``"anchor_size"`` (a sized
            collection) and ``"num_class"`` (an integer).
    """

    def __init__(self, cfg):
        super().__init__(cfg=cfg,
                         config_stage_ch=[64, 80, 96, 112],
                         config_concat_ch=[128, 256, 384, 512],
                         block_per_stage=[1, 1, 1, 1],
                         layer_per_block=5,
                         num_classes=1000)
        self.cfg = cfg
        inputs = self.cfg.Inputs
        self.anchor_number = len(inputs["anchor_size"])

        # The layers below are created after the base-class init (and after
        # any pretrained load), so they keep PyTorch's default initialisation.
        self.route_layer = nn.Sequential(
            nn.Conv2d(384, 128, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
        )
        self.reorg = reorg_layer(stride=2)
        self.convsets = nn.Sequential(
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
        )
        # Per anchor: 1 + 4 values plus one score per class.
        per_anchor = 1 + 4 + inputs["num_class"]
        self.pred = nn.Conv2d(512, self.anchor_number * per_anchor, 1)

    def forward(self, x):
        """Return the raw prediction map at the last backbone resolution."""
        _, feat16, feat32 = super().forward(x)

        passthrough = self.reorg(self.route_layer(feat16))
        fused = torch.cat([feat32, passthrough], dim=1)
        return self.pred(self.convsets(fused))
if __name__ == "__main__":
    # Smoke test: build the detector and push one random batch through it.
    # Falls back to CPU so the check also runs on machines without CUDA, and
    # prints the output shape instead of stopping in an interactive debugger.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Vovnet27_slim_reorg(cfg).to(device)
    test_input = torch.randn((2, 3, 416, 416), device=device)
    with torch.no_grad():  # no gradients needed for a shape check
        out = model(test_input)
    print(out.shape)