Paper link: YOLOv3: An Incremental Improvement
1. Darknet53 is used as the backbone;
2. Multi-scale feature prediction (similar to an FPN structure);
3. Other tricks.
The backbone is the feature-extraction part of Darknet53, where Convolutional denotes Conv + BN + LeakyReLU and Residual denotes a residual connection;
The input image is passed through the backbone to extract three feature maps, labeled feature0, feature1 and feature2 from shallow to deep;
First, feature2 passes through the Convolutional Layers block; one branch of its output goes through Convolutional + Conv to produce out2, while the other goes through Convolutional and Upsampling and is then concatenated with feature1;
The concatenated features again pass through Convolutional Layers; one branch goes through Convolutional + Conv to produce out1, while the other goes through Convolutional and Upsampling and is concatenated with feature0;
Finally, the last concatenation passes through Convolutional Layers and then Convolutional + Conv to produce out0;
The Convolutional Layers block is a stack of five Convolutional layers with kernel sizes [1, 3, 1, 3, 1];
Except for the first Convolutional after each Concat and the first Convolutional applied to the input image, every Convolutional with kernel size 3 doubles the number of channels and every Convolutional with kernel size 1 halves it.
The network has three output layers. For COCO object detection with an input image of size (3, 416, 416), the outputs are:
Out 0: (255, 52, 52), used to predict small objects;
Out 1: (255, 26, 26), used to predict medium objects;
Out 2: (255, 13, 13), used to predict large objects.
As in YOLOv2, the 52×52, 26×26 and 13×13 grids are the preset anchor positions; 255 = (4 + 1 + 80) × 3, where 4 is the number of box-regression parameters, 1 is the objectness confidence, 80 is the conditional probabilities for the 80 classes, and the final 3 is the number of anchor sizes, i.e. each position in each output layer carries 3 anchor sizes.
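To make the 255-channel layout concrete, here is a minimal sketch (my addition, not from the original post) of how one output head could be reshaped into per-anchor predictions. The (num_anchors, 4 + 1 + 80) channel ordering and the split_head name are assumptions consistent with the decomposition above; the actual layout depends on the training/decoding code.

import torch

def split_head(out, num_anchors=3, num_classes=80):
    # out: one YOLOv3 head, e.g. (B, 255, 13, 13)
    b, c, h, w = out.shape
    out = out.view(b, num_anchors, 5 + num_classes, h, w)
    box_xywh = out[:, :, 0:4]     # 4 box-regression parameters per anchor
    objectness = out[:, :, 4:5]   # 1 objectness confidence per anchor
    class_scores = out[:, :, 5:]  # 80 class scores per anchor
    return box_xywh, objectness, class_scores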
(The code below implements only the model structure.)
import torch
import torch.nn as nn
def convolutional(in_channels, out_channels, kernel_size, stride):  # Conv+BN+LeakyReLU
    padding = 1 if kernel_size == 3 else 0
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False),
        nn.BatchNorm2d(out_channels),
        nn.LeakyReLU(0.1)
    )
class Residual(nn.Module):  # residual block
    def __init__(self, in_channels, hidden_channels):
        super(Residual, self).__init__()
        self.residual_block = nn.Sequential(
            convolutional(in_channels, hidden_channels, 1, 1),
            convolutional(hidden_channels, in_channels, 3, 1)
        )
    def forward(self, x):
        return x + self.residual_block(x)  # x + F(x)
class Darknet53(nn.Module):  # feature-extraction part of Darknet53
    def __init__(self):
        super(Darknet53, self).__init__()
        self.feature0 = nn.Sequential(
            convolutional(3, 32, 3, 1),
            convolutional(32, 64, 3, 2),
            Residual(64, 32),
            convolutional(64, 128, 3, 2),
            *[Residual(128, 64) for i in range(2)],
            convolutional(128, 256, 3, 2),
            *[Residual(256, 128) for i in range(8)],
        )
        self.feature1 = nn.Sequential(
            convolutional(256, 512, 3, 2),
            *[Residual(512, 256) for i in range(8)],
        )
        self.feature2 = nn.Sequential(
            convolutional(512, 1024, 3, 2),
            *[Residual(1024, 512) for i in range(4)],
        )
    def forward(self, x):
        feature0 = self.feature0(x)         # shallow feature map
        feature1 = self.feature1(feature0)  # middle feature map
        feature2 = self.feature2(feature1)  # deep feature map
        return feature0, feature1, feature2
class Convlayers(nn.Module):  # stack of five Convolutional layers
    def __init__(self, in_channels, hidden_channels):
        super(Convlayers, self).__init__()
        self.convlayers = nn.Sequential(
            convolutional(in_channels, hidden_channels, 1, 1),
            convolutional(hidden_channels, hidden_channels * 2, 3, 1),
            convolutional(hidden_channels * 2, hidden_channels, 1, 1),
            convolutional(hidden_channels, hidden_channels * 2, 3, 1),
            convolutional(hidden_channels * 2, hidden_channels, 1, 1),
        )
    def forward(self, x):
        return self.convlayers(x)
class Yolov3(nn.Module):  # YOLOv3 model
    def __init__(self):
        super(Yolov3, self).__init__()
        self.backbone = Darknet53()
        self.convlayers2 = Convlayers(1024, 512)
        self.convlayers1 = Convlayers(512 + 256, 256)
        self.convlayers0 = Convlayers(256 + 128, 128)
        self.final_conv2 = nn.Sequential(
            convolutional(512, 1024, 3, 1),
            nn.Conv2d(1024, 255, 1, 1, 0),
        )
        self.final_conv1 = nn.Sequential(
            convolutional(256, 512, 3, 1),
            nn.Conv2d(512, 255, 1, 1, 0),
        )
        self.final_conv0 = nn.Sequential(
            convolutional(128, 256, 3, 1),
            nn.Conv2d(256, 255, 1, 1, 0),
        )
        self.upsample2 = nn.Sequential(
            convolutional(512, 256, 1, 1),
            nn.Upsample(scale_factor=2)
        )
        self.upsample1 = nn.Sequential(
            convolutional(256, 128, 1, 1),
            nn.Upsample(scale_factor=2)
        )
    def forward(self, x):
        # (B,256,52,52), (B,512,26,26), (B,1024,13,13)
        feature0, feature1, feature2 = self.backbone(x)  # extract the three feature maps with the backbone
        f2 = self.convlayers2(feature2)  # deep features through Convolutional Layers: (B,1024,13,13) --> (B,512,13,13)
        out2 = self.final_conv2(f2)      # f2 through Convolutional+Conv gives out2: (B,512,13,13) --> (B,255,13,13)
        f1 = self.convlayers1(           # f2 through Convolutional+Upsampling, concat with middle features, then Convolutional Layers gives f1
            torch.cat([self.upsample2(f2), feature1], dim=1))  # (B,256,26,26) cat (B,512,26,26) --> (B,256,26,26)
        out1 = self.final_conv1(f1)      # f1 through Convolutional+Conv gives out1: (B,256,26,26) --> (B,255,26,26)
        f0 = self.convlayers0(           # f1 through Convolutional+Upsampling, concat with shallow features, then Convolutional Layers gives f0
            torch.cat([self.upsample1(f1), feature0], dim=1))  # (B,128,52,52) cat (B,256,52,52) --> (B,128,52,52)
        out0 = self.final_conv0(f0)      # f0 through Convolutional+Conv gives out0: (B,128,52,52) --> (B,255,52,52)
        return out0, out1, out2
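As a quick sanity check (a usage sketch added here, not part of the original listing), feeding a dummy 416×416 input through the model should reproduce the three output shapes listed above:

if __name__ == "__main__":
    model = Yolov3()
    x = torch.randn(1, 3, 416, 416)  # dummy COCO-sized input
    out0, out1, out2 = model(x)
    print(out0.shape)  # torch.Size([1, 255, 52, 52])
    print(out1.shape)  # torch.Size([1, 255, 26, 26])
    print(out2.shape)  # torch.Size([1, 255, 13, 13])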