yolov5模型部分源码解读

model部分

对于model部分,我们首先看model的配置文件

模型配置

# YOLOv5  by Ultralytics, GPL-3.0 license

# Parameters
nc: 80  # number of classes
depth_multiple: 0.33  # model depth multiple: scales each layer's `number` (repeat count); counts above 1 become max(round(n * 0.33), 1), a count of 1 stays 1
width_multiple: 0.50  # layer channel multiple: scales the output-channel argument of the scalable modules to make_divisible(c * 0.50, 8)
anchors:
  # anchor (prior box) sizes as flat w,h pairs in pixels, one row per detection layer
  - [10,13, 16,30, 33,23]  # P3/8  largest feature map
  - [30,61, 62,45, 59,119]  # P4/16 medium feature map
  - [116,90, 156,198, 373,326]  # P5/32 smallest feature map

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]  (worth memorising: every layer row below uses this layout)
  [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]

# YOLOv5 v6.0 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]

模型解析

为了突出重点,我把日志输出相关的部分删去了。

def parse_model(d, ch):  # model_dict, input_channels(3)
    """Build the network described by a model-config dict.

    Args:
        d: Parsed model config. The keys read here are 'anchors', 'nc',
            'depth_multiple', 'width_multiple', 'backbone' and 'head'. Every
            backbone/head row has the layout ``[from, number, module, args]``.
            String entries inside each row's ``args`` are evaluated and
            written back in place, so ``d`` is modified; pass a copy if the
            original must stay untouched.
        ch: List of channel counts; ``ch[-1]`` is the channel count of the
            input image. The caller's list is not mutated (the local name is
            rebound to a fresh list of per-layer output channels).

    Returns:
        A tuple ``(model, save)``. ``model`` is an ``nn.Sequential`` holding
        every layer; ``save`` is the sorted list of absolute layer indices
        named by a 'from' entry other than -1, i.e. the layers whose outputs
        must be kept during the forward pass.
    """
    # anchors, number of classes, depth multiple, width multiple
    anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
    # Anchors per detection layer: half the length of the first anchor row
    # (rows are flat w,h lists), or the value itself when a count was given.
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors
    no = na * (nc + 5)  # outputs per grid cell: anchors * (classes + 5)

    # layers: built modules; save: indices of outputs to keep; c2: output channels
    layers, save, c2 = [], [], ch[-1]

    # `backbone + head` concatenates the two row lists into one sequence.
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        # NOTE(review): eval() executes config text as Python code. Only load
        # model configs from trusted sources.
        m = eval(m) if isinstance(m, str) else m  # module name -> class
        for j, a in enumerate(args):
            try:
                args[j] = eval(a) if isinstance(a, str) else a
            except NameError:
                # Strings that do not name anything in scope (e.g. 'nearest')
                # are deliberately left as plain strings.
                pass

        # Depth scaling: repeat counts above 1 are multiplied by gd and rounded
        # to the nearest integer, but never drop below 1.
        n = max(round(n * gd), 1) if n > 1 else n

        # Per-module argument handling. To add a new block type, extend the
        # branches below and reference the block from the yaml config.
        if m in (Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
                 BottleneckCSP, C3, C3TR, C3SPP, C3Ghost):
            # For these modules the first yaml argument is the output channel
            # count; the input channel count is the output of the 'from' layer.
            c1, c2 = ch[f], args[0]  # ch holds the outputs of all earlier layers; -1 is the previous one
            if c2 != no:  # a width equal to `no` is treated as an output layer and left unscaled
                c2 = make_divisible(c2 * gw, 8)  # width scaling, rounded to a multiple of 8

            # args becomes [in_channels, out_channels, *remaining yaml args]
            args = [c1, c2, *args[1:]]

            if m in [BottleneckCSP, C3, C3TR, C3Ghost]:
                # These blocks stack n inner units themselves, so n is passed
                # as an argument instead of repeating the whole module.
                args.insert(2, n)
                n = 1
        # For the remaining module types the yaml arguments are module-specific
        # and the output channel count is derived per type.
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        # Concat and Detect take a list of 'from' indices (several input layers).
        elif m is Concat:
            c2 = sum(ch[x] for x in f)
        elif m is Detect:
            # Detect additionally receives the channel count of every input layer.
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # only an anchor count was given, not anchor sizes
                args[1] = [list(range(args[1] * 2))] * len(f)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        # str(cls) looks like "<class 'pkg.Name'>"; the slice strips the wrapper.
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        n_params = sum(x.numel() for x in m_.parameters())  # total number of parameter elements
        m_.i, m_.f, m_.type, m_.np = i, f, t, n_params  # attach index, 'from' index, type, number params

        # Record every 'from' index other than -1 as an absolute layer index.
        # `x % i` maps negative (relative) indices to absolute ones, e.g. x=-2
        # at layer i=12 gives 10; non-negative indices below i are unchanged.
        # The generator passed to extend() is equivalent to:
        #     for x in ([f] if isinstance(f, int) else f):
        #         if x != -1:
        #             save.append(x % i)
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)

        layers.append(m_)
        if i == 0:
            # The incoming ch only carried the image channel count; from here
            # on ch lists the output channels of each built layer.
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)

Detect类

class Detect(nn.Module):
    """Detection head: one 1x1 convolution per input feature map.

    In training mode ``forward`` returns the reshaped raw predictions. In
    inference mode it also decodes box centre and size and concatenates the
    predictions of all detection layers.
    """

    stride = None  # strides computed during build
    onnx_dynamic = False  # ONNX export parameter
    export = False  # export mode

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        """Create the head.

        Args:
            nc: Number of classes.
            anchors: One flat ``w, h, w, h, ...`` sequence per detection layer.
            ch: Input channel count of each detection layer.
            inplace: Decode boxes with slice assignment when True.
        """
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors per detection layer
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        self.anchor_grid = [torch.zeros(1)] * self.nl  # init anchor grid
        anchor_tensor = torch.tensor(anchors).float().view(self.nl, -1, 2)  # shape(nl,na,2)
        self.register_buffer('anchors', anchor_tensor)
        out_channels = self.no * self.na
        self.m = nn.ModuleList(nn.Conv2d(c_in, out_channels, 1) for c_in in ch)  # output conv
        self.inplace = inplace  # use in-place ops (e.g. slice assignment)

    def forward(self, x):
        """Run the head on a list of feature maps (the list is updated in place).

        Returns:
            Training: ``x``, each entry reshaped to ``(bs, na, ny, nx, no)``.
            Inference: ``(pred,)`` when ``export`` is set, else ``(pred, x)``,
            where ``pred`` has shape ``(bs, total_predictions, no)``.
        """
        decoded = []  # inference output, one entry per detection layer
        for i in range(self.nl):
            raw = self.m[i](x[i])  # conv
            bs, _, ny, nx = raw.shape
            # e.g. for a 20x20 map with nc=80: (bs,255,20,20) -> (bs,3,20,20,85)
            x[i] = raw.view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if self.training:
                # Decoding below is inference-only; training consumes the raw
                # reshaped maps (read together with the loss function).
                continue

            # Rebuild the cached grids when the feature-map size changed.
            if self.onnx_dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

            # Squash every output into the 0-1 range.
            y = x[i].sigmoid()
            if self.inplace:
                # The last axis starts with x, y, w, h, conf; 0:2 is the box
                # centre and 2:4 is the box size.
                y[..., 0:2] = (y[..., 0:2] * 2 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4)  # torch 1.8.0
                xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                y = torch.cat((xy, wh, conf), 4)
            decoded.append(y.view(bs, -1, self.no))  # y -> (bs, -1, no)

        # `decoded` stays empty in training mode; only inference fills it.
        if self.training:
            return x
        merged = torch.cat(decoded, 1)
        return (merged,) if self.export else (merged, x)

    def _make_grid(self, nx=20, ny=20, i=0):
        """Return the cell-offset grid and the anchor grid for detection layer ``i``."""
        layer_anchors = self.anchors[i]
        dev, dtype = layer_anchors.device, layer_anchors.dtype
        shape = 1, self.na, ny, nx, 2  # grid shape
        ys = torch.arange(ny, device=dev, dtype=dtype)
        xs = torch.arange(nx, device=dev, dtype=dtype)
        # torch>=1.10.0 meshgrid workaround for torch>=0.7 compatibility
        mesh_kwargs = {'indexing': 'ij'} if check_version(torch.__version__, '1.10.0') else {}
        yv, xv = torch.meshgrid(ys, xs, **mesh_kwargs)
        grid = torch.stack((xv, yv), 2).expand(shape) - 0.5  # add grid offset, i.e. y = 2.0 * x - 0.5
        anchor_grid = (layer_anchors * self.stride[i]).view((1, self.na, 1, 1, 2)).expand(shape)
        return grid, anchor_grid

model类

了解完模型配置和解析之后,接下来就可以看看YOLO网络本身了(下面的代码做了部分删减)。

class Model(nn.Module):
    """YOLOv5 model assembled from a yaml config file or an already parsed dict.

    This is a trimmed excerpt: ``_initialize_biases``, ``info`` and
    ``_profile_one_layer`` are called below but their definitions are not
    part of it.
    """

    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        """Load the config, build the layers and initialise strides and weights.

        Args:
            cfg: Path to a yaml model config, or the parsed config dict.
            ch: Input channel count, used when the config has no 'ch' key.
            nc: Optional class count that overrides the config value.
            anchors: Optional anchors override. It is passed through
                ``round()``, so a numeric value is expected here.
        """
        super().__init__()
        if isinstance(cfg, dict):  # an already parsed config is used as-is
            self.yaml = cfg  # model dict
        else:  # otherwise treat cfg as the path of a yaml file and parse it
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg, encoding='ascii', errors='ignore') as f:
                self.yaml = yaml.safe_load(f)  # model dict

        # Build the model.
        # A 'ch' entry in the config wins; otherwise the argument is used and
        # also stored back into the config.
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        # Explicit nc / anchors arguments override the values from the config.
        if nc and nc != self.yaml['nc']:
            LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            LOGGER.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value

        # self.model is the built network; self.save lists the layer indices
        # whose outputs later layers read. A deep copy is passed because
        # parse_model rewrites the config rows in place.
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        self.inplace = self.yaml.get('inplace', True)

        # Build strides, anchors
        m = self.model[-1]  # last layer, expected to be Detect()
        if isinstance(m, Detect):
            # Initialise the Detect layer: measure the stride of every
            # detection output and express the anchors in grid units.
            s = 256  # 2x min stride
            m.inplace = self.inplace
            # Stride = input size / feature-map height from a dummy forward
            # pass, e.g. [8, 16, 32] for yolov5s.
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])
            # Check that the anchor order matches the stride order.
            check_anchor_order(m)  # must be in pixel-space (not grid-space)
            # Convert anchors from pixels to grid units, e.g. at stride 16 an
            # anchor of [16, 32] becomes [1, 2].
            m.anchors /= m.stride.view(-1, 1, 1)
            self.stride = m.stride
            self._initialize_biases()  # only run once

        # Init weights, biases
        initialize_weights(self)
        self.info()  # model summary
        LOGGER.info('')

    def forward(self, x, augment=False, profile=False, visualize=False):
        """Run the model; ``augment`` selects test-time-augmented inference."""
        if augment:
            # Augmented inference: several scaled / flipped copies of the
            # input are evaluated and merged. This is not training-time data
            # augmentation.
            return self._forward_augment(x)  # augmented inference, None
        return self._forward_once(x, profile, visualize)  # single-scale inference, train

    def _forward_augment(self, x):
        """Evaluate scaled / flipped copies of ``x`` and concatenate the predictions.

        Returns:
            ``(predictions, None)`` with the predictions of all variants
            concatenated along dimension 1.
        """
        # x is laid out as (bs, c, h, w)
        img_size = x.shape[-2:]  # height, width
        s = [1, 0.83, 0.67]  # scales
        # The flip value is the tensor axis to flip along; None means no flip.
        f = [None, 3, None]  # flips (2-ud, 3-lr)
        y = []  # outputs
        # zip pairs the lists element-wise: zip([1, 2], [3, 4]) -> (1, 3), (2, 4)
        for si, fi in zip(s, f):
            # scale_img rescales the (optionally flipped) image for this variant.
            xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
            yi = self._forward_once(xi)[0]  # forward
            # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
            # Map the predictions back to the original image frame.
            yi = self._descale_pred(yi, fi, si, img_size)
            y.append(yi)

        y = self._clip_augmented(y)  # clip augmented tails
        return torch.cat(y, 1), None  # augmented inference, train

    def _forward_once(self, x, profile=False, visualize=False):
        """Run every layer once, routing saved outputs to the layers that need them."""
        y, dt = [], []  # outputs
        for m in self.model:
            # m.f is the layer's input source; -1 means the previous layer.
            if m.f != -1:  # if not from previous layer
                # An int selects one saved output; a list gathers several,
                # where -1 inside the list stands for the current x.
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
            if profile:
                self._profile_one_layer(m, x, dt)
            x = m(x)  # run
            # Keep the output only when a later layer reads it; otherwise
            # store None as a placeholder so list positions match layer indices.
            y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
        # x is the output of the last layer (the Detect layer in the config above).
        return x

    # The two methods below post-process the predictions of augmented inference.

    def _descale_pred(self, p, flips, scale, img_size):
        """Undo the scale and flip applied before augmented inference.

        Args:
            p: Predictions whose last axis starts with x, y, w, h.
            flips: Flip axis used on the input (2 up-down, 3 left-right) or None.
            scale: Scale factor that was applied to the input image.
            img_size: ``(height, width)`` of the original image.

        Returns:
            The corrected predictions. ``p`` itself is modified and returned
            when ``self.inplace`` is set; otherwise a new tensor is built.
        """
        # de-scale predictions following augmented inference (inverse operation)
        if self.inplace:
            # Restore x, y, w, h to the original image scale.
            p[..., :4] /= scale  # de-scale
            # flips == 2 flipped the height axis, so mirror y back.
            if flips == 2:
                p[..., 1] = img_size[0] - p[..., 1]  # de-flip ud
            # flips == 3 flipped the width axis, so mirror x back.
            elif flips == 3:
                p[..., 0] = img_size[1] - p[..., 0]  # de-flip lr
        else:
            x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale  # de-scale
            if flips == 2:
                y = img_size[0] - y  # de-flip ud
            elif flips == 3:
                x = img_size[1] - x  # de-flip lr
            p = torch.cat((x, y, wh, p[..., 4:]), -1)
        return p

    def _clip_augmented(self, y):
        """Trim the tail of the first and the head of the last augmented output.

        Rows are dropped along dimension 1: the last ``i`` rows of ``y[0]``
        and the first ``i`` rows of ``y[-1]``, with ``i`` computed from the
        number of detection layers as shown below.

        NOTE(review): the 4**x weights appear to assume every detection layer
        yields 4x fewer predictions than the one before it, so that the
        dropped rows are exactly those of the last layer for ``y[0]`` and of
        the first layer for ``y[-1]`` - confirm.
        """
        # Clip YOLOv5 augmented inference tails
        nl = self.model[-1].nl  # number of detection layers (P3-P5)
        g = sum(4 ** x for x in range(nl))  # grid points
        e = 1  # exclude layer count
        i = (y[0].shape[1] // g) * sum(4 ** x for x in range(e))  # indices
        y[0] = y[0][:, :-i]  # large
        i = (y[-1].shape[1] // g) * sum(4 ** (nl - 1 - x) for x in range(e))  # indices
        y[-1] = y[-1][:, i:]  # small
        return y

你可能感兴趣的:(python,人工智能,机器学习,深度学习)