想着入门pytorch,用pytorch复现一下yolo算法,其实yolo的原理一天左右就完全搞懂了,但是真正写起代码来,就是会有各种细节不太清楚,我是先从吴恩达的视频开始,然后参考着两位大佬的复现代码(eriklindernoren的代码、bubbliiiing的代码),可能是我对pytorch还不太熟悉,陆陆续续搞了一个星期才写完了自己的yolo复现,并在coco2014
与voc2007
的数据集进行了训练,代码同步github
其实yolov3的模型十分简单,看起来很吓唬人,其实就一个主干网络darknet53
加卷积块convolutional set
与简单卷积核和上采样时的特征合并,下面这张图很全面地反映了整个网络的细节,因为网上的很多图片并没有把具体的细节(如通道数与padding)展示出来,而且在非主干网络上也没有标明输出,因此我在图上用蓝色字添加上了通道数 输出大小
的信息,padding的话除了核为1*1的padding为0,其余的padding都为1,也就是说,只有步长为2的时候,才不是“same”的padding
从图上可以看出,其实整个网络由几个部分组成:darknet53、convolutional set、最后的输出层、上采样层,但是它们中很多部分都由**普通的卷积层(带激活与正则化)**组成。
1.普通的卷积层(带激活与正则化)
#==================================#
# Basic convolution layer
#==================================#
class Conv(nn.Module):
    """Conv2d -> BatchNorm2d -> LeakyReLU(0.1) building block.

    Args:
        inputC: number of input channels.
        outputC: number of output channels.
        keralSize: convolution kernel size (int or tuple).
        stride: convolution stride (int or tuple).
        padding: padding forwarded to ``nn.Conv2d``. PyTorch rejects
            ``"same"`` for strided convolutions, so when the stride is not 1
            the default ``"same"`` is replaced by ``kernel // 2``.
    """

    def __init__(self, inputC, outputC, keralSize, stride = 1, padding = "same") -> None:
        super(Conv, self).__init__()
        strides = stride if isinstance(stride, (tuple, list)) else (stride,)
        if padding == "same" and any(s != 1 for s in strides):
            # nn.Conv2d raises ValueError for padding="same" with stride > 1;
            # half-kernel padding is the usual equivalent for odd kernels.
            kernels = keralSize if isinstance(keralSize, (tuple, list)) else (keralSize, keralSize)
            padding = tuple(k // 2 for k in kernels)
        self.m = nn.Sequential(
            # The conv bias is disabled because BatchNorm follows immediately.
            nn.Conv2d(inputC, outputC, keralSize, stride, padding, bias=False),
            nn.BatchNorm2d(outputC),
            nn.LeakyReLU(0.1)
        )

    def forward(self, x):
        """Apply convolution, batch normalisation and activation to ``x``."""
        return self.m(x)
卷积 —> 批归一化(BatchNorm)—> 激活
2.残差块
其实残差块也是由普通卷积层构成,只是有跳连接的部分。
#==================================#
# Residual block
#==================================#
class Residual(nn.Module):
    """Two Conv layers (1x1 then 3x3) wrapped in a skip connection.

    The 1x1 layer halves the channel count and the 3x3 layer restores it, so
    the output has the same number of channels as the input.

    Args:
        inputC: number of input (and output) channels.
    """

    def __init__(self, inputC) -> None:
        super().__init__()
        bottleneckC = inputC // 2
        squeeze = Conv(inputC, bottleneckC, 1, 1, 0)
        expand = Conv(bottleneckC, inputC, 3, 1, 1)
        self.m = nn.Sequential(squeeze, expand)

    def forward(self, x):
        """Return ``x`` plus the output of the two-layer branch."""
        branch = self.m(x)
        return x + branch
3.convolutional set
按照上图中的结构,可以很容易写出convolutional set,也是由普通卷积层组成而已
#==================================#
# convSet
#==================================#
class convSet(nn.Module):
    """Five stacked Conv layers alternating 1x1 and 3x3 kernels.

    Args:
        inputC: channels of the incoming feature map.
        outputC: channels produced by every 1x1 layer (and by the block).
        midC: channels produced by every 3x3 layer.
    """

    def __init__(self, inputC, outputC, midC) -> None:
        super().__init__()
        # (in channels, out channels, kernel size) of each layer, in order.
        plan = [
            (inputC, outputC, 1),
            (outputC, midC, 3),
            (midC, outputC, 1),
            (outputC, midC, 3),
            (midC, outputC, 1),
        ]
        self.m = nn.Sequential(*[Conv(cin, cout, k) for cin, cout, k in plan])

    def forward(self, x):
        """Run ``x`` through the five layers."""
        return self.m(x)
4.输出层
输出层有三个,分别输出13*13、26*26、52*52的特征图,这里我添加了创建对应格子的网格grid的过程,后面会用到,对于模型来说没有影响
#==================================#
# lastLayer
#==================================#
class LastLayer(nn.Module):
    """Prediction head: a 3x3 Conv block followed by a plain 1x1 Conv2d.

    Besides producing the raw prediction map, the layer caches a grid tensor
    describing every cell/anchor pair. The grid does not influence the
    forward output.

    Args:
        inputC: channels of the incoming feature map.
        outputC: channels of the prediction map.
        anchor: anchors assigned to this head, as (w, h) pairs in the same
            units as ``CONST.inputShape``.

    Attributes set by ``_createGrid``:
        shape: ``(h, w)`` of the prediction map.
        stride: ``CONST.inputShape[0] / h``.
        anchorScaled: anchors divided by ``stride``.
        grid: tensor of shape ``(b, len(anchor), h, w, 4)`` holding
            ``[cell x, cell y, anchor w, anchor h]``.
    """

    def __init__(self, inputC, outputC, anchor=None) -> None:
        super(LastLayer, self).__init__()
        self.grid = None
        self.anchor = np.array(anchor)
        self.anchorScaled = []
        self.stride = 1
        self.shape = None
        self.m = nn.Sequential(
            Conv(inputC, inputC * 2, 3),
            nn.Conv2d(inputC * 2, outputC, 1)
        )

    def forward(self, x):
        """Return the raw prediction map and keep the cached grid in sync."""
        o = self.m(x)
        b, _, h, w = o.shape
        # Rebuild the grid whenever the batch size or feature-map size differs
        # from the cached one; caching it only once left a stale grid for a
        # smaller last batch or a different input resolution.
        if self.grid is None or self.grid.shape[0] != b or self.shape != (h, w):
            self._createGrid(o.shape)
        return o

    def _createGrid(self, shape):
        """Build ``grid``, ``stride``, ``shape`` and ``anchorScaled`` for ``shape``.

        Args:
            shape: ``(batch, channels, height, width)`` of the prediction map.
        """
        b, c, h, w = shape
        self.shape = (h, w)
        self.stride = CONST.inputShape[0] / h
        self.anchorScaled = torch.tensor(self.anchor / self.stride, device=CONST.device)
        anchorCount = len(self.anchor)
        grid = torch.ones((b, anchorCount, h, w, 4), device=CONST.device)
        gridY, gridX = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
        # Channels 0/1: column and row index of each cell.
        grid[..., 0] *= gridX.to(CONST.device).unsqueeze(0)
        grid[..., 1] *= gridY.to(CONST.device).unsqueeze(0)
        # Channels 2/3: anchor width and height in grid-cell units.
        grid[..., 2] *= self.anchorScaled[:, 0].view(1, anchorCount, 1, 1)
        grid[..., 3] *= self.anchorScaled[:, 1].view(1, anchorCount, 1, 1)
        self.grid = grid
5.主干的darknet53
#==================================#
# darknet53
#==================================#
class Darknet53(nn.Module):
    """Darknet-53 backbone returning feature maps from its last three stages."""

    def __init__(self) -> None:
        super().__init__()
        # Number of residual blocks in each of the five down-sampling stages.
        self.layoutNumber = [1, 2, 8, 8, 4]
        n1, n2, n3, n4, n5 = self.layoutNumber

        stem = Conv(3, 32, 3, 1, 1)
        stage1 = self.MultiResidual(32, 64, n1)
        stage2 = self.MultiResidual(64, 128, n2)
        stage3 = self.MultiResidual(128, 256, n3)
        self.layerA = nn.Sequential(stem, stage1, stage2, stage3)
        self.layerB = self.MultiResidual(256, 512, n4)
        self.layerC = self.MultiResidual(512, 1024, n5)

        self._initWeights()

    def _initWeights(self):
        """Initialise every Conv2d and BatchNorm2d contained in the backbone."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Normal(0, sqrt(2 / (kh * kw * out_channels))).
                fan = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                nn.init.normal_(m.weight, 0, math.sqrt(2. / fan))
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return the outputs of ``layerA``, ``layerB`` and ``layerC``."""
        out1 = self.layerA(x)
        out2 = self.layerB(out1)
        out3 = self.layerC(out2)
        return out1, out2, out3

    # One stride-2 Conv followed by ``count`` residual blocks.
    def MultiResidual(self, inputC, outputC, count):
        """Build a down-sampling stage.

        Args:
            inputC: channels entering the stage.
            outputC: channels produced by the stride-2 Conv and kept by every
                residual block.
            count: number of residual blocks after the stride-2 Conv.
        """
        blocks = [Conv(inputC, outputC, 3, 2, 1)]
        blocks.extend(Residual(outputC) for _ in range(count))
        return nn.Sequential(*blocks)
6.最终的yolo模型
将上面的几个部分合在一起,就是最终的yolo模型了
#==================================#
# YOLO model definition
#==================================#
class MyYOLO(nn.Module):
    """YOLOv3 network: Darknet-53 backbone, three convSets and three heads.

    Shape remarks in the comments below are the ones given by the original
    author and apply to a 416x416 input.
    """

    def __init__(self) -> None:
        super().__init__()
        # Backbone; deepest output is 1024 x 13 x 13.
        self.darknet53 = Darknet53()
        # -> 512 x 13 x 13
        self.convSet1 = convSet(1024, 512, 1024)
        # -> 256 x 26 x 26, later concatenated with the 512-channel backbone
        # map to give (512 + 256) x 26 x 26.
        self.layerA = nn.Sequential(
            Conv(512, 256, 1),
            nn.Upsample(scale_factor=2, mode='nearest')
        )
        # -> 256 x 26 x 26
        self.convSet2 = convSet(256 + 512, 256, 512)
        # -> 128 x 52 x 52, later concatenated with the 256-channel backbone
        # map to give (128 + 256) x 52 x 52.
        self.layerB = nn.Sequential(
            Conv(256, 128, 1),
            nn.Upsample(scale_factor=2, mode='nearest')
        )
        # -> 128 x 52 x 52
        self.convSet3 = convSet(128 + 256, 128, 256)

        # Every head emits anchorNumber * (5 + classNumber) channels.
        headC = CONST.anchorNumber * (5 + CONST.classNumber)
        # 13 x 13 head
        self.predict1 = LastLayer(512, headC, anchor=CONST.anchor[0])
        # 26 x 26 head
        self.predict2 = LastLayer(256, headC, anchor=CONST.anchor[1])
        # 52 x 52 head
        self.predict3 = LastLayer(128, headC, anchor=CONST.anchor[2])
        # Plain list for index-based access to the heads.
        self.lastLayers = [self.predict1, self.predict2, self.predict3]

    def forward(self, x):
        """Return the raw prediction maps of the three heads, coarsest first."""
        shallow, middle, deep = self.darknet53(x)

        neck1 = self.convSet1(deep)
        out1 = self.predict1(neck1)

        # Upsample and fuse with the middle backbone feature map.
        fused = torch.cat([self.layerA(neck1), middle], 1)
        neck2 = self.convSet2(fused)
        out2 = self.predict2(neck2)

        # Upsample again and fuse with the shallow backbone feature map.
        fused = torch.cat([self.layerB(neck2), shallow], 1)
        neck3 = self.convSet3(fused)
        out3 = self.predict3(neck3)

        return out1, out2, out3
到目前为止,整个yolo模型构建完成
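模型输出的是一个长度为3的tuple,每个元素的大小分别是 (batch, anchorNumber*(5+classNumber), 13, 13)、(batch, anchorNumber*(5+classNumber), 26, 26)、(batch, anchorNumber*(5+classNumber), 52, 52)(输入为416*416时)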
使用pytorch自带的dataset
与dataloader
进行数据集的加载,先从网上下载coco数据集,在data文件夹里如下图所示进行放置。
─coco
├─images
│ ├─train2014
│ └─val2014
└─instances_train2014.json
└─instances_val2014.json
然后使用handleData.py
来进行数据集的预处理,生成标签与对应的目标框,并将目标框变为[x,y,w,h],中心点与宽高模式。在label文件夹下生成对应的label。
16 0.6066874999999999 0.3413807531380753 0.54415625 0.51
利用imgaug
模块进行数据的预处理与增强,包括将图片保持宽高比地缩放为(416,416)大小,位移,镜像等操作
# Base class for transforms that apply an imgaug (iaa) augmenter.
class imgAug():
    """Apply ``self.argument`` to an image together with its boxes.

    ``self.argument`` is ``None`` here and is called as
    ``self.argument(image=..., bounding_boxes=...)``, so it has to be set
    before the transform is used (presumably by subclasses — confirm).
    """

    def __init__(self) -> None:
        self.argument = None

    def __call__(self, data):
        """Augment one sample.

        Args:
            data: ``(img, boxes)`` where ``boxes`` is an ``(N, 5)`` array with
                rows ``[label, x1, y1, x2, y2]``.

        Returns:
            ``(img, boxes)`` with the augmented image and a new ``(M, 5)``
            array in the same layout and dtype. ``M`` may be smaller than
            ``N`` because boxes that leave the image are dropped.
        """
        img, boxes = data
        bbs = BoundingBoxesOnImage(
            [BoundingBox(*item[1:], label=item[0]) for item in boxes],
            shape=img.shape,
        )
        img, bbs = self.argument(image=img, bounding_boxes=bbs)
        # Clipping also removes boxes that ended up fully outside the image.
        survivors = list(bbs.clip_out_of_image())
        # Build a fresh array sized to the surviving boxes. Writing back into
        # the input array left stale, un-augmented rows at its tail whenever
        # a box was removed.
        augmented = np.zeros((len(survivors), 5), dtype=np.asarray(boxes).dtype)
        for row, item in enumerate(survivors):
            augmented[row, :] = np.array([item.label, item.x1, item.y1, item.x2, item.y2])
        return img, augmented
# Training-set preprocessing / augmentation pipeline; steps run in list order.
# NOTE(review): the step classes are defined outside this excerpt. Judging only
# by their names the flow is presumably: labels to absolute pixels -> xywh to
# corner format -> augmentation -> centring -> resize -> labels back to
# relative units -> corner to xywh format -> tensor conversion. Confirm against
# the class definitions.
TRAIN_TRANSFORMS = transforms.Compose([
    AbsoluteLabel(),
    xywh2xyxy(),
    ImgUp(),
    CenterPlcae(),
    ReSize(),
    RelativeLabel(),
    xyxy2xywh(),
    ToTensor()
])
损失函数实际上就是:期望yolo模型的输出与实际yolo模型输出的损失,分别计算输出的x/y/w/h/置信度/分类的交叉熵损失,合并,再反向传播即可。
[对应batch中第几张图,类别,x,y,w,h,先验框索引]
i
与j
#==================================#
# Loss function
#==================================#
def getLoss(yoloOut, yolo,bboxes):
    """Sum the x/y/w/h/class/confidence losses over all prediction heads.

    Args:
        yoloOut: iterable of raw head outputs, each ``(batch, C, h, w)``.
        yolo: model exposing ``lastLayers``, indexed in the same order as
            ``yoloOut``.
        bboxes: ``(N, 6)`` tensor with rows ``[batch index, class, x, y, w, h]``.
            NOTE(review): w/h are compared with anchors divided by 416, so
            they are presumably normalised to the input size — confirm.

    Returns:
        The accumulated loss (the int ``0`` when ``yoloOut`` is empty).
    """
    bce = nn.BCELoss()
    mse = nn.MSELoss()

    # Append a 7th column that receives the index of the best-matching anchor.
    anchorColumn = torch.zeros(bboxes.shape[0], 1, device=CONST.device)
    bboxes = torch.cat([bboxes, anchorColumn], 1)

    # Anchors and targets become boxes anchored at the origin, so the IoU
    # depends on width and height only.
    anchorWH = torch.tensor(CONST.anchor, device=CONST.device).view(-1,2) / 416
    anchorBoxes = torch.cat([torch.zeros_like(anchorWH), anchorWH], 1)
    targetBoxes = torch.cat([torch.zeros_like(bboxes[:,4:6]), bboxes[:,4:6]], 1)
    for row, targetBox in enumerate(targetBoxes):
        bboxes[row][6] = torch.argmax(iou(targetBox, anchorBoxes))  # [bs, cls, x,y,w,h,an]

    total = 0
    for layerIndex, rawOut in enumerate(yoloOut):
        head = yolo.lastLayers[layerIndex]
        batch, _, height, width = rawOut.shape
        # (batch, C, h, w) -> (batch, anchors, h, w, per-anchor attributes)
        pred = (
            rawOut.view(batch, len(head.anchor), -1, height, width)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )
        b, cls, boxesScaled, an, i, j = buildTarget(bboxes, head, layerIndex)
        tConf = torch.zeros_like(pred[..., 4], device=CONST.device)

        xLoss = yLoss = wLoss = hLoss = clsLoss = 0
        if b.shape[0] != 0:
            # Predictions at the cells/anchors responsible for a target.
            matched = pred[b, an, i, j]  # type:torch.Tensor
            tConf[b, an, i, j] = 1
            matched[:, :2] = matched[:, :2].sigmoid()
            xLoss = bce(matched[..., 0], boxesScaled[..., 0])
            yLoss = bce(matched[..., 1], boxesScaled[..., 1])
            wLoss = mse(matched[..., 2], boxesScaled[..., 2]) * 0.5
            hLoss = mse(matched[..., 3], boxesScaled[..., 3]) * 0.5
            clsLoss = bce(matched[:, 5:].sigmoid(), cls)

        # The confidence loss covers every cell, matched or not.
        confLoss = bce(pred[..., 4].sigmoid(), tConf)
        total = total + xLoss + yLoss + wLoss + hLoss + clsLoss + confLoss
    return total
利用pytorch提供的optim进行训练即可
#==================================#
# IoU between one box and many boxes
#==================================#
def iou(box1: torch.Tensor, box2:torch.Tensor, isleftT2rightD = True) -> torch.Tensor:
    """Compute the intersection-over-union of ``box1`` with each box in ``box2``.

    Args:
        box1: a single box, any shape reshapeable to ``(1, 4)``. A ``(K, 4)``
            tensor also works when it broadcasts against ``box2``.
        box2: boxes reshapeable to ``(N, 4)``.
        isleftT2rightD: ``True`` when boxes are ``[x1, y1, x2, y2]`` corners;
            ``False`` when they are ``[cx, cy, w, h]``.

    Returns:
        1-D tensor of IoU values on ``box2``'s device. Pairs whose union is
        zero yield ``0`` instead of NaN.
    """
    box2 = box2.view(-1, 4)
    # Work on box2's device instead of forcing CUDA, so CPU-only runs work.
    box1 = box1.view(-1, 4).to(box2.device)
    if not isleftT2rightD:
        # Centre/size -> corner representation.
        box1 = torch.cat([box1[:, :2] - box1[:, 2:4] / 2, box1[:, :2] + box1[:, 2:4] / 2], 1)
        box2 = torch.cat([box2[:, :2] - box2[:, 2:4] / 2, box2[:, :2] + box2[:, 2:4] / 2], 1)
    # Top-left and bottom-right corners of the intersection; box1 broadcasts
    # against every row of box2.
    lu = torch.max(box1[:, :2], box2[:, :2])
    rd = torch.min(box1[:, 2:], box2[:, 2:])
    # Negative extents mean no overlap.
    overlapWH = (rd - lu).clamp(min=0)
    rectsN = overlapWH[:, 0] * overlapWH[:, 1]
    area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
    area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
    rectsU = area1 + area2 - rectsN
    # Guard against 0 / 0 for degenerate (zero-area) boxes.
    return rectsN / rectsU.clamp(min=1e-16)
#==================================#
# Non-maximum suppression
#==================================#
def nms(box: torch.Tensor = None, score: torch.Tensor = None,threshold: float = 0.3) -> list:
    """Greedy non-maximum suppression.

    Args:
        box: ``(N, 4)`` boxes in the corner format expected by ``iou``.
        score: ``(N,)`` confidence of each box.
        threshold: a box is discarded when its IoU with an already kept box
            is greater than or equal to this value.

    Returns:
        Indices of the kept boxes, ordered by descending score. Empty when
        ``box`` or ``score`` is ``None`` or there are no boxes.
    """
    if box is None or score is None:
        return []
    _, sortIndex = score.sort(0, descending = True)
    res = []
    while sortIndex.size(0):
        best = sortIndex[0]
        res.append(best.item())
        rest = sortIndex[1:]
        if rest.size(0) == 0:
            break
        # Keep only the candidates that overlap the chosen box weakly.
        ious = iou(box[best], box[rest])
        sortIndex = rest[ious < threshold]
    return res
对模型进行验证,使用的指标为R(召回率)、P(精确率)、mAP
先进行预测
遍历所有框,将预测框与真实框的交并比大于阈值,并且类别正确的标记为1,其余标记为0,整理成list,并且以置信度进行排序
对list做累积求和(cumsum),累积和除以已遍历的预测框数量为P,累积和除以真实标签的总数为R,对多组P、R求P-R曲线下的面积即为AP,各类别AP的平均值即为mAP
#==================================#
# Compute evaluation metrics
#==================================#
def calMap(isCor, preConf, preLabels, targetLabels):
    """Compute per-class recall, precision and average precision.

    Args:
        isCor: array, 1 where a prediction is counted as correct, else 0.
        preConf: array of prediction confidences.
        preLabels: array of predicted class labels.
        targetLabels: array of ground-truth class labels.

    Returns:
        ``(R, P, AP, uClasses)``: three lists aligned with ``uClasses``, the
        sorted unique ground-truth classes. Classes without any prediction
        get 0 for all three metrics.
    """
    # Rank predictions by descending confidence.
    order = np.argsort(-preConf)
    isCor = isCor[order]
    preLabels = preLabels[order]

    uClasses = np.unique(targetLabels)
    R, P, AP = [], [], []
    for oneCls in uClasses:
        hits = isCor[preLabels == oneCls]
        if hits.size == 0:
            R.append(0)
            P.append(0)
            AP.append(0)
            continue

        gtCount = targetLabels[targetLabels == oneCls].size
        truePos = hits.cumsum()
        falsePos = (1 - hits).cumsum()
        recall = truePos / gtCount
        precision = truePos / (truePos + falsePos)
        R.append(recall[-1])
        P.append(precision[-1])

        # Leading sentinel starts the curve at recall 0; trailing sentinel
        # lets recall reach 1.
        recall = np.concatenate(([0.0], recall, [1.0]))
        precision = np.concatenate(([0.0], precision, [0.0]))
        # Make precision non-increasing (running maximum from the right).
        precision = np.maximum.accumulate(precision[::-1])[::-1]
        # Sum rectangle areas where recall changes.
        steps = np.where(recall[1:] != recall[:-1])[0]
        AP.append(np.sum((recall[steps + 1] - recall[steps]) * precision[steps + 1]))
    return R, P, AP, uClasses
#==================================#
# show mAP
#==================================#
def showMap(R,P,AP,uClasses):
    """Print a table of per-class AP, recall and precision plus the mean AP.

    Args:
        R: per-class recall values.
        P: per-class precision values.
        AP: per-class average precision values.
        uClasses: class indices aligned with the three lists; each is looked
            up in ``CONST.classes`` for the row label.
    """
    rows = [["class","AP", "R", "P"]]
    rows.extend(
        [CONST.classes[int(uClasses[k])], "%.4f" % AP[k], "%.4f" % R[k], "%.4f" % P[k]]
        for k in range(len(uClasses))
    )
    rows.append([])
    rows.append(["MAP", "%.4f" % np.average(AP)])
    print(AsciiTable(rows).table)