对于model部分,我们首先看model的配置文件
# YOLOv5 by Ultralytics, GPL-3.0 license
# Parameters
nc: 80 # number of classes
depth_multiple: 0.33 # 模型深度系数:作用于每一层的 number 参数。number>1 的层,重复次数变为 max(round(number*0.33), 1);number=1 的层保持不变
width_multiple: 0.50 # 模型宽度系数:作用于每一层的输出通道数(args 的第一个参数),变为原来的二分之一,再经 make_divisible(…, 8) 处理;检测输出层不缩放
anchors:
# 先验框(anchor)的配置:每一行对应一个检测层,每两个数为一组(宽, 高),单位为像素
- [10,13, 16,30, 33,23] # P3/8 大尺度特征图(下采样8倍),用于检测小目标
- [30,61, 62,45, 59,119] # P4/16 中尺度特征图(下采样16倍),用于检测中等目标
- [116,90, 156,198, 373,326] # P5/32 小尺度特征图(下采样32倍),用于检测大目标
# YOLOv5 v6.0 backbone
backbone:
# [from, number, module, args] 这里需要熟记
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]],
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]],
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]],
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]],
[-1, 1, SPPF, [1024, 5]], # 9
]
# YOLOv5 v6.0 head
head:
[[-1, 1, Conv, [512, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 6], 1, Concat, [1]], # cat backbone P4
[-1, 3, C3, [512, False]], # 13
[-1, 1, Conv, [256, 1, 1]],
[-1, 1, nn.Upsample, [None, 2, 'nearest']],
[[-1, 4], 1, Concat, [1]], # cat backbone P3
[-1, 3, C3, [256, False]], # 17 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]],
[[-1, 14], 1, Concat, [1]], # cat head P4
[-1, 3, C3, [512, False]], # 20 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]],
[[-1, 10], 1, Concat, [1]], # cat head P5
[-1, 3, C3, [1024, False]], # 23 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
]
为了突出重点,我把与日志输出相关的部分删去了。
def parse_model(d, ch):  # model_dict, input_channels(3)
    """Build the network described by a YOLOv5 model config.

    Args:
        d: Model config mapping. It must provide 'anchors', 'nc',
            'depth_multiple', 'width_multiple', 'backbone' and 'head'. Every
            backbone/head row has the form ``[from, number, module, args]``.
            The mapping and its rows are only read, never modified.
        ch: List of channel counts whose last entry is the channel count of
            the network input, e.g. ``[3]``.

    Returns:
        A tuple ``(model, save)``. ``model`` is an ``nn.Sequential`` with one
        module per config row. ``save`` is the sorted list (duplicates
        possible) of absolute layer indices named by any 'from' entry other
        than -1, i.e. the layers whose outputs must be kept during forward.

    Note:
        Module names and string arguments are resolved with ``eval``; only
        pass configs from a trusted source.
    """
    # `anchors` and `nc` must keep exactly these names: the config refers to
    # them as strings (e.g. Detect args [nc, anchors]) and eval() below
    # resolves those strings against this function's local variables.
    anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # anchors per detection layer
    no = na * (nc + 5)  # outputs per grid cell = anchors * (classes + 5)

    # layers: built modules; save: indices of outputs needed later in forward;
    # c2: output channels of the layer currently being built
    layers, save, c2 = [], [], ch[-1]

    # Concatenating the two lists yields one flat sequence of rows.
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        m = eval(m) if isinstance(m, str) else m  # resolve the module name to a class

        # Work on a copy so the rows of the caller's config are not rewritten
        # (otherwise a second parse of the same dict would see a Detect row
        # that already has the channel list appended).
        args = list(args)
        # This must stay an explicit loop: inside a comprehension eval() would
        # not see the locals `nc` / `anchors`.
        for j, a in enumerate(args):
            if not isinstance(a, str):
                continue
            try:
                args[j] = eval(a)
            except NameError:
                pass  # an ordinary string argument such as 'nearest'; keep it as is

        # Depth scaling: only rows repeated more than once are scaled, and
        # never below one repeat. round() rounds to the nearest integer.
        if n > 1:
            n = max(round(n * gd), 1)

        # To add a new block type, extend the dispatch below and the YAML.
        if m in (Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
                 BottleneckCSP, C3, C3TR, C3SPP, C3Ghost):
            # For these modules the first YAML argument is the output channel
            # count; the input channel count is the output of layer `f`.
            c1, c2 = ch[f], args[0]
            if c2 != no:  # leave a layer whose width equals the detection output size unscaled
                c2 = make_divisible(c2 * gw, 8)  # width scaling

            args = [c1, c2, *args[1:]]  # [in_channels, out_channels, remaining args...]
            if m in [BottleneckCSP, C3, C3TR, C3Ghost]:
                # These modules take the repeat count as their third argument
                # and stack internally, so build only one instance.
                args.insert(2, n)
                n = 1
        # For the remaining module types the arguments and the output channel
        # count depend on the module itself.
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)  # concatenation adds up the source channels
        elif m is Detect:
            args.append([ch[x] for x in f])  # input channels of every detection layer
            if isinstance(args[1], int):  # anchors given as a count rather than explicit boxes
                args[1] = [list(range(args[1] * 2))] * len(f)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        n_params = sum(x.numel() for x in m_.parameters())  # total number of parameters
        m_.i, m_.f, m_.type, m_.np = i, f, t, n_params  # attach index, 'from' index, type, number params

        # Record every source layer other than the immediate predecessor.
        # `x % i` turns a negative (relative) index into an absolute one.
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)

        layers.append(m_)
        if i == 0:
            # The incoming list only carried the input image channels; from
            # here on `ch` holds one output channel count per built layer.
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)
class Detect(nn.Module):
    """Detection head: one 1x1 convolution per detection layer plus box decoding.

    In training mode ``forward`` returns the raw, reshaped predictions. In
    inference mode it additionally decodes them to boxes using the grid
    offsets, the strides and the anchors.
    """

    stride = None  # strides computed during build
    onnx_dynamic = False  # ONNX export parameter
    export = False  # export mode

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        """Create the detection head.

        Args:
            nc: Number of classes.
            anchors: One row per detection layer; each row is a flat sequence
                of anchor values taken in pairs.
            ch: Input channel count of each detection layer.
            inplace: Whether decoding may use in-place slice assignment.
        """
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors per detection layer
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        self.anchor_grid = [torch.zeros(1)] * self.nl  # init anchor grid
        self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2))  # shape(nl,na,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.inplace = inplace  # use in-place ops (e.g. slice assignment)

    def forward(self, x):
        """Run the head on the feature maps of all detection layers.

        Args:
            x: Sequence with one feature map per detection layer, each of
                shape (bs, channels, ny, nx). The sequence is not modified.

        Returns:
            Training mode: a list with one tensor of shape
            (bs, na, ny, nx, no) per detection layer.
            Inference mode: ``(pred, raw)`` where ``pred`` is the decoded
            predictions of all layers concatenated to (bs, N, no) and ``raw``
            is the list described above; when ``self.export`` is set only
            ``(pred,)`` is returned.
        """
        z = []  # inference output; stays empty in training mode
        x = list(x)  # shallow copy so the caller's list keeps its original entries
        for i in range(self.nl):  # one iteration per detection layer
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape
            # e.g. with na=3 and no=85: x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                # Decoding happens only at inference time; in training the raw
                # output is returned and handled by the loss function.
                if self.onnx_dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                # Squash all outputs into the range (0, 1).
                y = x[i].sigmoid()
                if self.inplace:
                    # The last dimension starts with x, y, w, h, confidence.
                    y[..., 0:2] = (y[..., 0:2] * 2 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4)  # torch 1.8.0
                    xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, -1, self.no))  # y -> (bs, na*ny*nx, no)

        # Training returns the raw per-layer outputs; otherwise the decoded
        # predictions are returned (together with the raw ones unless exporting).
        return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)

    def _make_grid(self, nx=20, ny=20, i=0):
        """Build the cell-offset grid and the anchor grid of detection layer ``i``.

        Args:
            nx: Grid width.
            ny: Grid height.
            i: Index of the detection layer.

        Returns:
            ``(grid, anchor_grid)``, both of shape (1, na, ny, nx, 2). ``grid``
            holds the (x, y) cell indices minus 0.5; ``anchor_grid`` holds the
            layer's anchors multiplied by the layer's stride.
        """
        d = self.anchors[i].device
        t = self.anchors[i].dtype
        shape = 1, self.na, ny, nx, 2  # grid shape
        y, x = torch.arange(ny, device=d, dtype=t), torch.arange(nx, device=d, dtype=t)
        # check_version is a project helper; presumably True when the installed
        # torch is at least 1.10.0, where meshgrid accepts `indexing`.
        if check_version(torch.__version__, '1.10.0'):  # torch>=1.10.0 meshgrid workaround for torch>=0.7 compatibility
            yv, xv = torch.meshgrid(y, x, indexing='ij')
        else:
            yv, xv = torch.meshgrid(y, x)
        grid = torch.stack((xv, yv), 2).expand(shape) - 0.5  # add grid offset, i.e. y = 2.0 * x - 0.5
        anchor_grid = (self.anchors[i] * self.stride[i]).view((1, self.na, 1, 1, 2)).expand(shape)
        return grid, anchor_grid
了解完模型的配置和解析之后,接下来就可以看看 YOLO 网络本身了(代码有部分删减)。
class Model(nn.Module):
    """YOLOv5 model assembled from a config dict or a YAML config file.

    NOTE(review): ``_initialize_biases``, ``info`` and ``_profile_one_layer``
    are called below but are not defined in this excerpt; they are expected
    to be provided elsewhere in the full class.
    """

    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        """Parse the config, build the layers and initialise the Detect head.

        Args:
            cfg: Model config, either a dict or a path to a YAML file.
            ch: Input channel count, used when the config has no 'ch' entry.
            nc: If given and different from the config, overrides its 'nc'.
            anchors: If given, overrides the config's 'anchors' (after
                rounding).
        """
        super().__init__()
        if isinstance(cfg, dict):
            # Shallow copy: the 'ch' / 'nc' / 'anchors' overrides below must
            # not be written back into the caller's dict.
            self.yaml = dict(cfg)  # model dict
        else:  # a YAML file that has to be loaded first
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg, encoding='ascii', errors='ignore') as f:
                self.yaml = yaml.safe_load(f)  # model dict

        # Define model
        # Use the config's 'ch' when present, otherwise the argument.
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        # Explicit nc / anchors arguments take precedence over the config.
        if nc and nc != self.yaml['nc']:
            LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            LOGGER.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        # self.model holds the built layers; self.save lists the layers whose
        # outputs are needed again later in the forward pass.
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        self.inplace = self.yaml.get('inplace', True)

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, Detect):
            # Initialise the Detect head: measure the stride of every
            # detection layer and rescale the anchors accordingly.
            s = 256  # 2x min stride
            m.inplace = self.inplace
            # Stride = input size / feature-map size, measured with a dummy
            # forward pass (e.g. [8, 16, 32] for yolov5s).
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])
            # Make sure anchor order and stride order agree.
            check_anchor_order(m)  # must be in pixel-space (not grid-space)
            # Express anchors in grid units, e.g. [16, 32] -> [1, 2] at stride 16.
            m.anchors /= m.stride.view(-1, 1, 1)
            self.stride = m.stride
            self._initialize_biases()  # only run once

        # Init weights, biases
        initialize_weights(self)
        self.info()  # print model information
        LOGGER.info('')

    def forward(self, x, augment=False, profile=False, visualize=False):
        """Run the model, optionally with test-time augmentation.

        Args:
            x: Input batch.
            augment: If true, run augmented (multi-scale / flipped) inference.
            profile: Passed on to the single-scale forward pass.
            visualize: Passed on to the single-scale forward pass.
        """
        if augment:
            # Test-time augmentation: infer on rescaled / flipped copies.
            return self._forward_augment(x)  # augmented inference, None
        return self._forward_once(x, profile, visualize)  # single-scale inference, train

    def _forward_augment(self, x):
        """Infer on scaled / flipped copies of ``x`` and merge the predictions.

        Returns:
            ``(pred, None)`` where ``pred`` concatenates the de-augmented
            predictions of all variants along dimension 1.
        """
        img_size = x.shape[-2:]  # height, width
        s = [1, 0.83, 0.67]  # scales
        f = [None, 3, None]  # flips (2-ud, 3-lr): the dimension to flip along
        y = []  # outputs
        for si, fi in zip(s, f):  # pairs (1, None), (0.83, 3), (0.67, None)
            # scale_img is a project helper; presumably it rescales the image
            # by `si` on a grid of size `gs`.
            xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
            yi = self._forward_once(xi)[0]  # forward
            # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
            # Map the predictions back to the original image frame.
            yi = self._descale_pred(yi, fi, si, img_size)
            y.append(yi)
        y = self._clip_augmented(y)  # clip augmented tails
        return torch.cat(y, 1), None  # augmented inference, train

    def _forward_once(self, x, profile=False, visualize=False):
        """Run a single forward pass through all layers.

        Returns:
            The output of the last layer.
        """
        y, dt = [], []  # outputs
        for m in self.model:
            # m.f names the input source(s); -1 means the previous layer.
            if m.f != -1:  # if not from previous layer
                # A single int selects one earlier output; otherwise collect
                # every listed source, where -1 stands for the current x.
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
            if profile:
                self._profile_one_layer(m, x, dt)
            x = m(x)  # run
            # Keep the output only if a later layer needs it; otherwise store
            # None as a placeholder so indices stay aligned.
            y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
        return x

    # The two methods below post-process the predictions of augmented
    # inference.

    def _descale_pred(self, p, flips, scale, img_size):
        """Undo the scaling and flipping applied for augmented inference.

        Args:
            p: Predictions whose last dimension starts with x, y, w, h.
            flips: 2 for an up-down flip, 3 for a left-right flip; any other
                value means no flip.
            scale: Scale factor that was applied to the image.
            img_size: (height, width) of the original image.

        Returns:
            The de-scaled, de-flipped predictions. When ``self.inplace`` is
            set, ``p`` itself is modified and returned.
        """
        # de-scale predictions following augmented inference (inverse operation)
        if self.inplace:
            p[..., :4] /= scale  # de-scale x, y, w, h
            if flips == 2:
                p[..., 1] = img_size[0] - p[..., 1]  # de-flip ud
            elif flips == 3:
                p[..., 0] = img_size[1] - p[..., 0]  # de-flip lr
        else:
            x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale  # de-scale
            if flips == 2:
                y = img_size[0] - y  # de-flip ud
            elif flips == 3:
                x = img_size[1] - x  # de-flip lr
            p = torch.cat((x, y, wh, p[..., 4:]), -1)
        return p

    def _clip_augmented(self, y):
        """Trim the tails of the first and last augmented predictions.

        The row counts are derived assuming each detection layer contributes
        4x fewer rows than the one before it. Rows belonging to the last
        ``e`` layers are dropped from the end of ``y[0]``, and rows belonging
        to the first ``e`` layers are dropped from the start of ``y[-1]``.

        Args:
            y: List of prediction tensors, one per augmentation variant.

        Returns:
            The same list with its first and last entries sliced.
        """
        # Clip YOLOv5 augmented inference tails
        nl = self.model[-1].nl  # number of detection layers (P3-P5)
        g = sum(4 ** x for x in range(nl))  # grid points
        e = 1  # exclude layer count
        i = (y[0].shape[1] // g) * sum(4 ** x for x in range(e))  # indices
        y[0] = y[0][:, :-i]  # large
        i = (y[-1].shape[1] // g) * sum(4 ** (nl - 1 - x) for x in range(e))  # indices
        y[-1] = y[-1][:, i:]  # small
        return y