在这里分享一下我对 YOLOv5 主干特征提取网络以及整体网络的代码解析,希望对大家的工作有所帮助。
代码来源:https://github.com/bubbliiiing/yolov5-pytorch
1.主干特征提取网络
import torch
import torch.nn as nn
#----------------------------------#
#   Definition of the SiLU activation
#----------------------------------#
class SiLU(nn.Module):
    """SiLU activation: ``x * sigmoid(x)``, applied element-wise."""

    @staticmethod
    def forward(x):
        """Return ``x * torch.sigmoid(x)``.

        Declared as a static method because the activation keeps no
        parameters or per-instance state.
        """
        return x * torch.sigmoid(x)
#----------------------------------#
#   Compute the padding a convolution
#   should use for a given kernel size
#----------------------------------#
def autopad(k, p=None):
    """Return the padding to use for a convolution with kernel size ``k``.

    Args:
        k: Kernel size, either an int or an iterable of ints.
        p: Explicit padding. When it is not ``None`` it is returned unchanged.

    Returns:
        ``p`` if it was given; otherwise ``k // 2`` for an int ``k``, or a
        list holding ``x // 2`` for every element ``x`` of ``k``.
    """
    if p is None:
        # Half the kernel size (rounded down) per dimension.
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p
#----------------------------------#
#   Definition of the Focus structure
#   used at the start of the backbone
#----------------------------------#
class Focus(nn.Module):
    """Slice the input into four interleaved sub-grids, stack them on the
    channel dimension and apply a ``Conv``.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        k: Kernel size of the convolution.
        s: Stride of the convolution.
        p: Padding; ``None`` lets ``Conv`` derive it from ``k``.
        g: Number of groups of the convolution.
        act: Activation selector, forwarded unchanged to ``Conv``.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Focus, self).__init__()
        # Four slices are concatenated on dim 1, so the conv sees 4 * c1 channels.
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)

    def forward(self, x):
        """Return ``conv(cat(slices))``.

        Every slice takes each second element along the last two dimensions
        at one of the four (row, column) offsets, so for an (N, C, H, W)
        input the concatenation is (N, 4C, H/2, W/2).
        """
        return self.conv(
            torch.cat(
                [
                    x[..., ::2, ::2],
                    x[..., 1::2, ::2],
                    x[..., ::2, 1::2],
                    x[..., 1::2, 1::2],
                ],
                1,
            )
        )
#----------------------------------#
#   Basic convolution block used in the backbone.
#   The default activation of this Conv is SiLU.
#----------------------------------#
class Conv(nn.Module):
    """``Conv2d`` (no bias) -> ``BatchNorm2d`` -> activation.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        k: Kernel size.
        s: Stride.
        p: Padding; when ``None`` it is computed by ``autopad(k, p)``.
        g: Number of convolution groups.
        act: ``True`` selects ``SiLU``; an ``nn.Module`` instance is used
            as-is; any other value selects ``nn.Identity``.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03)
        self.act = SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        """Apply convolution -> BatchNorm2d -> activation."""
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        """Apply convolution -> activation, skipping the BatchNorm layer.

        NOTE(review): presumably meant for use once BN has been folded into
        the convolution weights; the fusing code is not part of this file.
        """
        return self.act(self.conv(x))
#----------------------------------#
#   Residual unit used inside the CSPLayer (C3)
#----------------------------------#
class Bottleneck(nn.Module):
    """Standard bottleneck: 1x1 ``Conv`` then 3x3 ``Conv``, with an optional
    residual addition.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        shortcut: Whether the residual addition is requested.
        g: Groups of the 3x3 convolution.
        e: Expansion ratio; the hidden width is ``int(c2 * e)``.
    """

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super(Bottleneck, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        # The addition is only performed when it was requested AND the input
        # and output channel counts match.
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Return ``x + cv2(cv1(x))`` when ``self.add`` is truthy, otherwise
        just ``cv2(cv1(x))``."""
        y = self.cv2(self.cv1(x))
        return x + y if self.add else y
#-----------------------------------------------#
#   C3 is the CSPLayer used by the YOLOv5 backbone.
#   `shortcut` is forwarded to every Bottleneck:
#   with shortcut=True each Bottleneck adds its input
#   to its output; with shortcut=False no addition is
#   performed and only the convolutions and the final
#   concatenation remain.
#-----------------------------------------------#
class C3(nn.Module):
    """CSP bottleneck with 3 convolutions.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        n: Number of ``Bottleneck`` units chained in the main branch.
        shortcut: Forwarded to every ``Bottleneck``.
        g: Groups, forwarded to every ``Bottleneck``.
        e: Expansion ratio; the hidden width is ``int(c2 * e)``.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)
        # The inner Bottlenecks keep the hidden width (e=1.0).
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        # -----------------------------------------------------------#
        #   The input goes through two 1x1 convolutions. One result is
        #   passed through the Bottleneck chain; the two branches are
        #   then concatenated on dim 1 and fused by a final convolution.
        # -----------------------------------------------------------#
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
#-------------------------------#
#   SPP structure used in dark5 of the backbone
#-------------------------------#
class SPP(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP.

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        k: Kernel sizes of the max-pooling branches.
    """

    def __init__(self, c1, c2, k=(5, 9, 13)):
        super(SPP, self).__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        # The un-pooled tensor plus one tensor per pooling branch are stacked.
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        # stride=1 with padding=x // 2, one pooling layer per kernel size.
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        """Reduce channels, max-pool at every kernel size, concatenate the
        reduced tensor with all pooled tensors on dim 1 and fuse them."""
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
#-------------------------------------------#
#   Definition of the backbone.
#   Bottleneck repeat counts per stage:
#       dark2: base_depth        dark3: base_depth * 3
#       dark4: base_depth * 3    dark5: base_depth
#   The C3 layers of dark2-dark4 use the default
#   shortcut=True; the C3 of dark5 uses shortcut=False.
#-------------------------------------------#
class CSPDarknet(nn.Module):
    """CSPDarknet backbone returning three feature maps.

    Args:
        base_channels: Channel count produced by the stem; later stages use
            2x, 4x, 8x and 16x this value.
        base_depth: Base number of ``Bottleneck`` repeats inside the ``C3``
            layers (see the table above).

    The shapes in the comments below are written as H, W, C and assume a
    640 x 640 x 3 input with ``base_channels == 64``.
    """

    def __init__(self, base_channels, base_depth):
        super().__init__()
        #---------------------------------------------------#
        #   Stem: Focus structure. It changes both the resolution
        #   and the channel count of the input.
        #   640, 640, 3 -> 320, 320, 12 -> 320, 320, 64
        #---------------------------------------------------#
        self.stem = Focus(3, base_channels, k=3)

        #-----------------------------------------------#
        #   dark2 (first stage)
        #   after the conv:     320, 320, 64 -> 160, 160, 128
        #   after the CSPLayer: 160, 160, 128 -> 160, 160, 128
        #-----------------------------------------------#
        self.dark2 = nn.Sequential(
            Conv(base_channels, base_channels * 2, 3, 2),
            C3(base_channels * 2, base_channels * 2, base_depth),
        )

        #-----------------------------------------------#
        #   dark3 (second stage)
        #   after the conv:     160, 160, 128 -> 80, 80, 256
        #   after the CSPLayer: 80, 80, 256 -> 80, 80, 256
        #-----------------------------------------------#
        self.dark3 = nn.Sequential(
            Conv(base_channels * 2, base_channels * 4, 3, 2),
            C3(base_channels * 4, base_channels * 4, base_depth * 3),
        )

        #-----------------------------------------------#
        #   dark4 (third stage)
        #   after the conv:     80, 80, 256 -> 40, 40, 512
        #   after the CSPLayer: 40, 40, 512 -> 40, 40, 512
        #-----------------------------------------------#
        self.dark4 = nn.Sequential(
            Conv(base_channels * 4, base_channels * 8, 3, 2),
            C3(base_channels * 8, base_channels * 8, base_depth * 3),
        )

        #-----------------------------------------------#
        #   dark5 (fourth stage)
        #   after the conv:     40, 40, 512 -> 20, 20, 1024
        #   after the SPP:      20, 20, 1024 -> 20, 20, 1024
        #   after the CSPLayer: 20, 20, 1024 -> 20, 20, 1024
        #-----------------------------------------------#
        self.dark5 = nn.Sequential(
            Conv(base_channels * 8, base_channels * 16, 3, 2),
            SPP(base_channels * 16, base_channels * 16),
            C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False),
        )

    def forward(self, x):
        """Run the backbone and return ``(feat1, feat2, feat3)``, the outputs
        of dark3, dark4 and dark5 respectively."""
        x = self.stem(x)
        x = self.dark2(x)
        #-----------------------------------------------#
        #   The dark3 output (80, 80, 256) is an effective feature layer
        #-----------------------------------------------#
        x = self.dark3(x)
        feat1 = x
        #-----------------------------------------------#
        #   The dark4 output (40, 40, 512) is an effective feature layer
        #-----------------------------------------------#
        x = self.dark4(x)
        feat2 = x
        #-----------------------------------------------#
        #   The dark5 output (20, 20, 1024) is an effective feature layer
        #-----------------------------------------------#
        x = self.dark5(x)
        feat3 = x
        #-----------------------------------------------#
        #   feat3: deepest output,  C, H, W = 1024, 20, 20
        #   feat2: middle output,   C, H, W = 512, 40, 40
        #   feat1: shallowest one,  C, H, W = 256, 80, 80
        #-----------------------------------------------#
        return feat1, feat2, feat3
2.YOLOv5整体网络
import torch
import torch.nn as nn

#---------------------------------------------------#
#   Imported, in order: the backbone, the CSPLayer (C3)
#   structure and the basic Conv block
#---------------------------------------------------#
from nets.CSPdarknet import CSPDarknet, C3, Conv


#---------------------------------------------------#
#   yolo_body
#---------------------------------------------------#
class YoloBody(nn.Module):
    """Complete YOLOv5 network: CSPDarknet backbone, a top-down then
    bottom-up feature-fusion neck, and three 1x1 convolution heads.

    Args:
        anchors_mask: Indexable with 0, 1 and 2; ``len(anchors_mask[i])`` is
            the number of anchors predicted by head ``i`` (0 -> P5 head,
            1 -> P4 head, 2 -> P3 head).
        num_classes: Number of classes; each anchor predicts
            ``5 + num_classes`` values.
        phi: Model size, one of ``'s'``, ``'m'``, ``'l'``, ``'x'``. Any other
            value raises ``KeyError`` from the lookup tables below.

    Shapes in the comments are written for the ``'l'`` model
    (``base_channels == 64``) and a 640 x 640 x 3 input.
    """

    def __init__(self, anchors_mask, num_classes, phi):
        super(YoloBody, self).__init__()
        #-----------------------------------#
        #   Depth and width multipliers of the
        #   different model sizes
        #-----------------------------------#
        depth_dict = {'s' : 0.33, 'm' : 0.67, 'l' : 1.00, 'x' : 1.33,}
        width_dict = {'s' : 0.50, 'm' : 0.75, 'l' : 1.00, 'x' : 1.25,}
        dep_mul, wid_mul = depth_dict[phi], width_dict[phi]

        #-----------------------------------------#
        #   For the 'l' model:
        #   base_channels = 64
        #   base_depth    = 3 (Bottleneck repeats per CSPLayer)
        #-----------------------------------------#
        base_channels = int(wid_mul * 64)
        base_depth = max(round(dep_mul * 3), 1)

        #---------------------------------------------------#
        #   Build the CSPDarknet backbone.
        #   It yields three effective feature layers of shape:
        #   80, 80, 256
        #   40, 40, 512
        #   20, 20, 1024
        #---------------------------------------------------#
        self.backbone = CSPDarknet(base_channels, base_depth)

        # scale_factor is the ratio of output size to input size.
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

        #------------------------------------------------------------#
        #   conv_for_feat3: 1x1 conv applied to the deepest output feat3
        #   before it enters the neck.
        #   conv3_for_upsample1: CSPLayer applied after the upsampled
        #   feat3 branch has been concatenated with feat2.
        #   This CSPLayer uses shortcut=False, i.e. no add operation.
        #------------------------------------------------------------#
        self.conv_for_feat3 = Conv(base_channels * 16, base_channels * 8, 1, 1)
        self.conv3_for_upsample1 = C3(base_channels * 16, base_channels * 8, base_depth, shortcut=False)

        #------------------------------------------------------------#
        #   conv_for_feat2: 1x1 conv applied to the middle branch before
        #   it is upsampled.
        #   conv3_for_upsample2: CSPLayer applied after that branch has
        #   been concatenated with feat1.
        #------------------------------------------------------------#
        self.conv_for_feat2 = Conv(base_channels * 8, base_channels * 4, 1, 1)
        self.conv3_for_upsample2 = C3(base_channels * 8, base_channels * 4, base_depth, shortcut=False)

        #-----------------------------------#
        #   Downsample P3 with a stride-2 conv,
        #   concatenate with P4, then a CSPLayer
        #-----------------------------------#
        self.down_sample1 = Conv(base_channels * 4, base_channels * 4, 3, 2)
        self.conv3_for_downsample1 = C3(base_channels * 8, base_channels * 8, base_depth, shortcut=False)

        #-----------------------------------#
        #   Downsample P4 with a stride-2 conv,
        #   concatenate with P5, then a CSPLayer
        #-----------------------------------#
        self.down_sample2 = Conv(base_channels * 8, base_channels * 8, 3, 2)
        self.conv3_for_downsample2 = C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False)

        #-----------------------------------#
        #   The three heads, each implemented
        #   as a single 1x1 convolution
        #-----------------------------------#
        self.yolo_head_P3 = nn.Conv2d(base_channels * 4, len(anchors_mask[2]) * (5 + num_classes), 1)
        self.yolo_head_P4 = nn.Conv2d(base_channels * 8, len(anchors_mask[1]) * (5 + num_classes), 1)
        self.yolo_head_P5 = nn.Conv2d(base_channels * 16, len(anchors_mask[0]) * (5 + num_classes), 1)

    def forward(self, x):
        """Return ``(out0, out1, out2)``: the predictions of the P5, P4 and
        P3 heads, i.e. ordered from the coarsest to the finest grid."""
        # backbone
        feat1, feat2, feat3 = self.backbone(x)

        #---------------------------------------#
        #   feat3 -> conv -> upsample -> concat with feat2
        #   -> CSPLayer, giving the intermediate P4
        #---------------------------------------#
        P5 = self.conv_for_feat3(feat3)
        P5_upsample = self.upsample(P5)
        P4 = torch.cat([P5_upsample, feat2], 1)
        P4 = self.conv3_for_upsample1(P4)

        #---------------------------------------#
        #   P4 -> conv -> upsample -> concat with feat1
        #   -> CSPLayer, giving P3 (finest level, an output)
        #---------------------------------------#
        P4 = self.conv_for_feat2(P4)
        P4_upsample = self.upsample(P4)
        P3 = torch.cat([P4_upsample, feat1], 1)
        P3 = self.conv3_for_upsample2(P3)

        #---------------------------------------#
        #   P3 -> downsample -> concat with P4
        #   -> CSPLayer, giving P4 (middle level, an output)
        #---------------------------------------#
        P3_downsample = self.down_sample1(P3)
        P4 = torch.cat([P3_downsample, P4], 1)
        P4 = self.conv3_for_downsample1(P4)

        #---------------------------------------#
        #   P4 -> downsample -> concat with P5
        #   -> CSPLayer, giving P5 (coarsest level, an output)
        #---------------------------------------#
        P4_downsample = self.down_sample2(P4)
        P5 = torch.cat([P4_downsample, P5], 1)
        P5 = self.conv3_for_downsample2(P5)

        #---------------------------------------------------#
        #   Third feature layer
        #   y3 = (batch_size, 255, 80, 80)
        #   (255 = 3 * (5 + 80), i.e. 3 anchors and 80 classes)
        #---------------------------------------------------#
        out2 = self.yolo_head_P3(P3)
        #---------------------------------------------------#
        #   Second feature layer
        #   y2 = (batch_size, 255, 40, 40)
        #---------------------------------------------------#
        out1 = self.yolo_head_P4(P4)
        #---------------------------------------------------#
        #   First feature layer
        #   y1 = (batch_size, 255, 20, 20)
        #---------------------------------------------------#
        out0 = self.yolo_head_P5(P5)
        return out0, out1, out2