A Code Walkthrough of the YOLOv5 Network

Here I would like to share my walkthrough of the code for YOLOv5's backbone feature extraction network and the overall network; I hope it is helpful for your own work.

Code source: https://github.com/bubbliiiing/yolov5-pytorch

1. Backbone feature extraction network

import torch
import torch.nn as nn

#----------------------------------#
# Definition of the SiLU activation function
#----------------------------------#
class SiLU(nn.Module):
    @staticmethod
    def forward(x):
        return x * torch.sigmoid(x)
#----------------------------------#
# Compute the padding needed for a convolution
#----------------------------------#
def autopad(k, p=None):
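    # worked example: autopad(3) -> 1, autopad(5) -> 2, autopad((1, 3)) -> [0, 1];
    # with this padding a stride-1 convolution keeps the spatial size of its input unchanged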
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k] 
    return p
#----------------------------------#
# Definition of the Focus structure used in the backbone
#----------------------------------#
class Focus(nn.Module):
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Focus, self).__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)

    def forward(self, x):
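        # The four slices pick the pixels at even/odd row and column positions, so an input of
        # shape (B, C, H, W) becomes four tensors of shape (B, C, H/2, W/2); concatenating them
        # along dim=1 gives (B, 4C, H/2, W/2), which self.conv then maps to c2 channels.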
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
#----------------------------------#
# Definition of the convolution block used in the backbone
# The activation function used in this Conv is SiLU
#----------------------------------#
class Conv(nn.Module):
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        super(Conv, self).__init__()

        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03)
        self.act = SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))   # conv -> BatchNorm2d -> activation -> output

    def fuseforward(self, x):
        return self.act(self.conv(x))    # this forward has no BN operation (used after the conv and BN have been fused)
#----------------------------------#
# Definition of the residual block used inside the CSPLayer
#----------------------------------#
class Bottleneck(nn.Module):
    # Standard Bottleneck
    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super(Bottleneck, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2  # the residual add is applied only when shortcut=True and the input and output channels are equal

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
#-----------------------------------------------#
# C3 is the CSPLayer used in the v5 backbone
# The shortcut defined here is passed down to the Bottleneck
# When shortcut=True, there is an add operation as well as a concat operation
# When shortcut=False, the Bottleneck performs no add;
# there is only the concat operation plus the two convolutions
#-----------------------------------------------#
class C3(nn.Module):
    # CSP Bottleneck with 3 convolutions
    # n is the number of times the internal Bottleneck is repeated, i.e. base_depth
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)
        self.m   = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        # -----------------------------------------------------------#
        # The input is split into two branches, each passed through a 1x1 convolution; one branch goes
        # through the stack of Bottlenecks. The two branches are then concatenated along the channel
        # dimension (dim=1), and one more convolution produces the output.
        # -----------------------------------------------------------#
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
#-------------------------------#
# Definition of the SPP structure used in the last stage (dark5)
#-------------------------------#
class SPP(nn.Module):
    # Spatial pyramid pooling layer used in YOLOv3-SPP
    def __init__(self, c1, c2, k=(5, 9, 13)):
        super(SPP, self).__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
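        # Each max-pool uses stride=1 and padding=k // 2, so the three pooled maps keep the spatial
        # size of x; the concat below therefore has c_ * 4 channels, which cv2 maps back to c2.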
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
#-------------------------------------------#
# Definition of the backbone
# The number of Bottleneck repetitions in each CSPLayer is base_depth (tripled in dark3 and dark4)
# shortcut is True in every backbone CSPLayer except the one after SPP in dark5, where it is False
#-------------------------------------------#
class CSPDarknet(nn.Module):
    def __init__(self, base_channels, base_depth):
        super().__init__()
        #-----------------------------------------------#
        #   The input image is 640, 640, 3
        #   The initial base channel count is 64
        #-----------------------------------------------#

        #---------------------------------------------------#
        #   Feature extraction with the Focus structure
        #   After the Focus structure, both the resolution and the channel count of the feature map change
        #   640, 640, 3 -> 320, 320, 12 -> 320, 320, 64
        #---------------------------------------------------#
        self.stem = Focus(3, base_channels, k=3)
        #-----------------------------------------------#
        #   dark2 is the first stage
        #   After the convolution, 320, 320, 64 -> 160, 160, 128
        #   After the CSPLayer, 160, 160, 128 -> 160, 160, 128
        #-----------------------------------------------#
        self.dark2 = nn.Sequential(
            Conv(base_channels, base_channels * 2, 3, 2),
            C3(base_channels * 2, base_channels * 2, base_depth),
        )
        #-----------------------------------------------#
        #   dark3 is the second stage
        #   After the convolution, 160, 160, 128 -> 80, 80, 256
        #   After the CSPLayer, 80, 80, 256 -> 80, 80, 256
        #-----------------------------------------------#
        self.dark3 = nn.Sequential(
            Conv(base_channels * 2, base_channels * 4, 3, 2),
            C3(base_channels * 4, base_channels * 4, base_depth * 3),
        )
        #-----------------------------------------------#
        #   dark4 is the third stage
        #   After the convolution, 80, 80, 256 -> 40, 40, 512
        #   After the CSPLayer, 40, 40, 512 -> 40, 40, 512
        #-----------------------------------------------#
        self.dark4 = nn.Sequential(
            Conv(base_channels * 4, base_channels * 8, 3, 2),
            C3(base_channels * 8, base_channels * 8, base_depth * 3),
        )
        #-----------------------------------------------#
        #   dark5 is the fourth stage
        #   After the convolution, 40, 40, 512 -> 20, 20, 1024
        #   After the SPP, 20, 20, 1024 -> 20, 20, 1024
        #   After the CSPLayer, 20, 20, 1024 -> 20, 20, 1024
        #-----------------------------------------------#
        self.dark5 = nn.Sequential(
            Conv(base_channels * 8, base_channels * 16, 3, 2),
            SPP(base_channels * 16, base_channels * 16),
            C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False),
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.dark2(x)
        #-----------------------------------------------#
        #   The output of dark3 is 80, 80, 256 and is one of the effective feature layers
        #-----------------------------------------------#
        x = self.dark3(x)
        feat1 = x
        #-----------------------------------------------#
        #   The output of dark4 is 40, 40, 512 and is one of the effective feature layers
        #-----------------------------------------------#
        x = self.dark4(x)
        feat2 = x
        #-----------------------------------------------#
        #   The output of dark5 is 20, 20, 1024 and is one of the effective feature layers
        #-----------------------------------------------#
        x = self.dark5(x)
        feat3 = x
        #-----------------------------------------------#
        #   feat3 is the output of the deepest stage, shape = 20, 20, 1024
        #   feat2 is the output of the middle stage, shape = 40, 40, 512
        #   feat1 is the output of the shallowest stage, shape = 80, 80, 256
        #-----------------------------------------------#
        return feat1, feat2, feat3
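
As a quick sanity check of the shape comments above, the following minimal sketch (my own test snippet, not part of the repository) instantiates the backbone with the 'l' configuration (base_channels = 64, base_depth = 3) and prints the shapes of the three effective feature layers:

import torch
from nets.CSPdarknet import CSPDarknet   # assumes the code above is saved as nets/CSPdarknet.py, as the import in section 2 suggests

backbone = CSPDarknet(base_channels=64, base_depth=3)
dummy    = torch.randn(1, 3, 640, 640)            # a fake 640x640 RGB batch of size 1
feat1, feat2, feat3 = backbone(dummy)
print(feat1.shape)   # torch.Size([1, 256, 80, 80])
print(feat2.shape)   # torch.Size([1, 512, 40, 40])
print(feat3.shape)   # torch.Size([1, 1024, 20, 20])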

2. Overall YOLOv5 network

import torch
import torch.nn as nn
#---------------------------------------------------#
#  Imported here are, in order: the backbone, the CSPLayer (C3), and the basic Conv block
#---------------------------------------------------#
from nets.CSPdarknet import CSPDarknet, C3, Conv

#---------------------------------------------------#
#   yolo_body
#---------------------------------------------------#
class YoloBody(nn.Module):
    def __init__(self, anchors_mask, num_classes, phi):
        super(YoloBody, self).__init__()
        #-----------------------------------#
        # Width and depth multipliers for the models of different sizes
        #-----------------------------------#
        depth_dict          = {'s' : 0.33, 'm' : 0.67, 'l' : 1.00, 'x' : 1.33,}
        width_dict          = {'s' : 0.50, 'm' : 0.75, 'l' : 1.00, 'x' : 1.25,}
        dep_mul, wid_mul    = depth_dict[phi], width_dict[phi]
        #-----------------------------------------#
        # Taking the model size to be 'l', the base channel count is 64
        # base_depth    = 3, the number of Bottleneck repetitions in each CSPLayer
        # base_channels = 64
        # input_size is (640, 640, 3)
        # the higher the resolution of the input image, the higher the final AP generally is
        #-----------------------------------------#
        base_channels       = int(wid_mul * 64)
        base_depth          = max(round(dep_mul * 3), 1)
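        # worked example: phi = 's' gives base_channels = int(0.50 * 64) = 32 and
        # base_depth = max(round(0.33 * 3), 1) = 1; phi = 'x' gives 80 and 4 respectively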
        #---------------------------------------------------#
        #   Build the CSPDarknet (modified CSPDarknet53) backbone model
        #   and obtain three effective feature layers whose shapes are:
        #   80,80,256
        #   40,40,512
        #   20,20,1024
        #---------------------------------------------------#
        self.backbone   = CSPDarknet(base_channels, base_depth)

        self.upsample   = nn.Upsample(scale_factor=2, mode="nearest")  # scale_factor specifies how many times larger the output is than the input
        #------------------------------------------------------------#
        # conv_for_feat3: the deepest output feat3 is passed through a 1x1 convolution before entering the neck
        # conv3_for_upsample1: after the upsampled feat3 is concatenated with feat2, the result goes through a CSPLayer
        # The spatial size is unchanged, while the channel count drops from base_channels * 16 back to base_channels * 8
        #------------------------------------------------------------#
        self.conv_for_feat3         = Conv(base_channels * 16, base_channels * 8, 1, 1)
        #-----------------------------------#
        # the shortcut of this CSPLayer is False,
        # so there is no add operation here
        #-----------------------------------#
        self.conv3_for_upsample1    = C3(base_channels * 16, base_channels * 8, base_depth, shortcut=False)
        #------------------------------------------------------------#
        # conv_for_feat2: a 1x1 convolution applied (in forward) to the fused P4 before the second upsampling
        # conv3_for_upsample2: after the upsampled P4 is concatenated with feat1, the result goes through a CSPLayer
        # The spatial size is unchanged, while the channel count drops from base_channels * 8 back to base_channels * 4
        #------------------------------------------------------------#
        self.conv_for_feat2         = Conv(base_channels * 8, base_channels * 4, 1, 1)
        self.conv3_for_upsample2    = C3(base_channels * 8, base_channels * 4, base_depth, shortcut=False)
        #-----------------------------------#
        # P3 is downsampled, i.e. a stride-2 convolution
        # the result is concatenated with P4
        # and then passed through a CSPLayer
        #-----------------------------------#
        self.down_sample1           = Conv(base_channels * 4, base_channels * 4, 3, 2)
        self.conv3_for_downsample1  = C3(base_channels * 8, base_channels * 8, base_depth, shortcut=False)
        #-----------------------------------#
        # P4 is downsampled, i.e. a stride-2 convolution
        # the result is concatenated with P5
        # and then passed through a CSPLayer
        #-----------------------------------#
        self.down_sample2           = Conv(base_channels * 8, base_channels * 8, 3, 2)
        self.conv3_for_downsample2  = C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False)
        #-----------------------------------#
        # Definition of the three detection heads
        # each head is implemented as a single 1x1 convolution
        #-----------------------------------#
        self.yolo_head_P3 = nn.Conv2d(base_channels * 4, len(anchors_mask[2]) * (5 + num_classes), 1)
        self.yolo_head_P4 = nn.Conv2d(base_channels * 8, len(anchors_mask[1]) * (5 + num_classes), 1)
        self.yolo_head_P5 = nn.Conv2d(base_channels * 16,len(anchors_mask[0]) * (5 + num_classes), 1)

    def forward(self, x):
        # backbone-Modified CSPDarknet53
        feat1,feat2,feat3 = self.backbone(x)
        #---------------------------------------#
        # feat3 is first convolved, then upsampled and concatenated with feat2
        # the result goes through a CSPLayer to give P4 (middle)
        #---------------------------------------#
        P5          = self.conv_for_feat3(feat3)
        P5_upsample = self.upsample(P5)
        P4          = torch.cat([P5_upsample, feat2], 1)
        P4          = self.conv3_for_upsample1(P4)
        #---------------------------------------#
        # P4 is then convolved, upsampled and concatenated with feat1
        # the result goes through a CSPLayer to give P3 (top), which feeds an output head
        #---------------------------------------#
        P4          = self.conv_for_feat2(P4)
        P4_upsample = self.upsample(P4)
        P3          = torch.cat([P4_upsample, feat1], 1)
        P3          = self.conv3_for_upsample2(P3)
        #---------------------------------------#
        # P3 is downsampled and concatenated with P4
        # the result goes through a CSPLayer to give the final P4 (middle), which feeds an output head
        #---------------------------------------#
        P3_downsample = self.down_sample1(P3)
        P4            = torch.cat([P3_downsample, P4], 1)
        P4            = self.conv3_for_downsample1(P4)
        #---------------------------------------#
        # P4 is downsampled and concatenated with P5
        # the result goes through a CSPLayer to give the final P5 (bottom), which feeds an output head
        #---------------------------------------#
        P4_downsample = self.down_sample2(P4)
        P5            = torch.cat([P4_downsample, P5], 1)
        P5            = self.conv3_for_downsample2(P5)

        #---------------------------------------------------#
        #   The third feature layer
        #   y3 = (batch_size, 255, 80, 80)
        #---------------------------------------------------#
        out2 = self.yolo_head_P3(P3)
        #---------------------------------------------------#
        #   The second feature layer
        #   y2 = (batch_size, 255, 40, 40)
        #---------------------------------------------------#
        out1 = self.yolo_head_P4(P4)
        #---------------------------------------------------#
        #   The first feature layer
        #   y1 = (batch_size, 255, 20, 20)
        #---------------------------------------------------#
        out0 = self.yolo_head_P5(P5)

        return out0, out1, out2
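
To tie the two parts together, here is a minimal usage sketch (my own snippet; the anchors_mask value is the usual grouping of the nine default anchors and is assumed here, not defined in the code above) that builds the 'l' model for 80 classes and checks the output shapes against the comments:

import torch

anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]   # assumed anchor grouping: 3 anchors per head
model = YoloBody(anchors_mask, num_classes=80, phi='l')
dummy = torch.randn(1, 3, 640, 640)
out0, out1, out2 = model(dummy)
print(out0.shape)   # torch.Size([1, 255, 20, 20])  -> 255 = 3 * (5 + 80)
print(out1.shape)   # torch.Size([1, 255, 40, 40])
print(out2.shape)   # torch.Size([1, 255, 80, 80])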
