PyTorch | YOLOv3: Principles and Code Walkthrough (Part 1)


Background reading on YOLO:

https://blog.csdn.net/leviopku/article/details/82660381

https://www.jianshu.com/p/d13ae1055302

https://blog.csdn.net/qq_34199326/article/details/84874409
https://blog.csdn.net/chandanyan8568/article/details/81089083

Code analyzed:

https://github.com/eriklindernoren/PyTorch-YOLOv3

Note: these are personal notes made while studying PyTorch and YOLOv3; corrections of any errors are welcome.

1. detect.py

The walkthrough of the code flow starts from detect.py.

1.1 Model initialization (detect.py, part 1)

 
```python
from __future__ import division

from models import *
from utils.utils import *
from utils.datasets import *

import os
import sys
import time
import datetime
import argparse

from PIL import Image

import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torch.autograd import Variable

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.ticker import NullLocator

"""
(1) import argparse                     -- import the module
(2) parser = argparse.ArgumentParser()  -- create a parser object
(3) parser.add_argument()               -- register the command-line arguments and options you care about
(4) parser.parse_args()                 -- parse them
"""

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--image_folder", type=str, default="data/samples", help="path to dataset")
    parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file")
    parser.add_argument("--weights_path", type=str, default="weights/yolov3.weights", help="path to weights file")
    parser.add_argument("--class_path", type=str, default="data/coco.names", help="path to class label file")
    parser.add_argument("--conf_thres", type=float, default=0.8, help="object confidence threshold")
    parser.add_argument("--nms_thres", type=float, default=0.4, help="iou threshold for non-maximum suppression")
    parser.add_argument("--batch_size", type=int, default=1, help="size of the batches")
    parser.add_argument("--n_cpu", type=int, default=0, help="number of cpu threads to use during batch generation")
    parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
    parser.add_argument("--checkpoint_model", type=str, help="path to checkpoint model")
    opt = parser.parse_args()
    print(opt)

    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the output directory (including parents) if it does not exist
    os.makedirs("output", exist_ok=True)

    # Set up model: instantiate the Darknet (YOLOv3) network
    model = Darknet(opt.model_def, img_size=opt.img_size).to(device)
```

1.1.1 Parsing the YOLOv3 model

The statement model = Darknet(opt.model_def, img_size=opt.img_size).to(device) loads the Darknet model, i.e. the YOLOv3 model. The Darknet class is defined in models.py. Its full definition is:

 
```python
class Darknet(nn.Module):
    """YOLOv3 object detection model"""

    def __init__(self, config_path, img_size=416):
        super(Darknet, self).__init__()
        # Parse the .cfg file into a list of block definitions
        self.module_defs = parse_model_config(config_path)
        # Build the network modules from those definitions
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        # hasattr() checks whether an object has a given attribute;
        # only YOLO layers carry a "metrics" attribute
        self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")]
        self.img_size = img_size
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None):
        img_dim = x.shape[2]
        loss = 0
        layer_outputs, yolo_outputs = [], []
        print("x.shape: ", x.shape)  # author's debug output
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
                x = module(x)
            elif module_def["type"] == "route":
                # Author's debug output: which layers get routed, and the current shape
                print("i: ", i, " x.shape: ", x.shape)
                for layer_i in module_def["layers"].split(","):
                    print("layer_i:\n", layer_i)
                x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)
            elif module_def["type"] == "shortcut":
                layer_i = int(module_def["from"])
                x = layer_outputs[-1] + layer_outputs[layer_i]
            elif module_def["type"] == "yolo":
                x, layer_loss = module[0](x, targets, img_dim)
                loss += layer_loss
                yolo_outputs.append(x)
            layer_outputs.append(x)
        yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
        return yolo_outputs if targets is None else (loss, yolo_outputs)

    def load_darknet_weights(self, weights_path):
        """Parses and loads the weights stored in 'weights_path'"""
        # Open the weights file
        with open(weights_path, "rb") as f:
            header = np.fromfile(f, dtype=np.int32, count=5)  # First five are header values
            self.header_info = header  # Needed to write header when saving weights
            self.seen = header[3]  # number of images seen during training
            weights = np.fromfile(f, dtype=np.float32)  # The rest are weights

        # Establish cutoff for loading backbone weights
        cutoff = None
        if "darknet53.conv.74" in weights_path:
            cutoff = 75

        ptr = 0
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if i == cutoff:
                break
            if module_def["type"] == "convolutional":
                conv_layer = module[0]
                if module_def["batch_normalize"]:
                    # Load BN bias, weights, running mean and running variance
                    bn_layer = module[1]
                    num_b = bn_layer.bias.numel()  # Number of biases
                    # Bias
                    bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias)
                    bn_layer.bias.data.copy_(bn_b)
                    ptr += num_b
                    # Weight
                    bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight)
                    bn_layer.weight.data.copy_(bn_w)
                    ptr += num_b
                    # Running Mean
                    bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean)
                    bn_layer.running_mean.data.copy_(bn_rm)
                    ptr += num_b
                    # Running Var
                    bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var)
                    bn_layer.running_var.data.copy_(bn_rv)
                    ptr += num_b
                else:
                    # Load conv. bias
                    num_b = conv_layer.bias.numel()
                    conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias)
                    conv_layer.bias.data.copy_(conv_b)
                    ptr += num_b
                # Load conv. weights
                num_w = conv_layer.weight.numel()
                conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight)
                conv_layer.weight.data.copy_(conv_w)
                ptr += num_w

    def save_darknet_weights(self, path, cutoff=-1):
        """
        @:param path    - path of the new weights file
        @:param cutoff  - save layers between 0 and cutoff (cutoff = -1 -> all are saved)
        """
        fp = open(path, "wb")
        self.header_info[3] = self.seen
        self.header_info.tofile(fp)
        # Iterate through layers
        for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])):
            if module_def["type"] == "convolutional":
                conv_layer = module[0]
                # If batch norm, save the bn parameters first
                if module_def["batch_normalize"]:
                    bn_layer = module[1]
                    bn_layer.bias.data.cpu().numpy().tofile(fp)
                    bn_layer.weight.data.cpu().numpy().tofile(fp)
                    bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                    bn_layer.running_var.data.cpu().numpy().tofile(fp)
                # Otherwise save the conv bias
                else:
                    conv_layer.bias.data.cpu().numpy().tofile(fp)
                # Save the conv weights
                conv_layer.weight.data.cpu().numpy().tofile(fp)
        fp.close()
```

Starting with __init__(): the overall flow is to parse the .cfg file and then build the corresponding network structure from its contents.

Parsing produces a list that stores the attributes of each network block; iterating over this list yields the network structure. Part of the parsed list is shown below:

(Figure 1: part of the parsed module_defs list)
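For intuition, parse_model_config() simply turns each [block] of the cfg into a dict of its key-value pairs. The following is a hand-written illustration of the first few entries, not actual output; the keys come straight from yolov3.cfg:

```python
module_defs = [
    {"type": "net", "batch": "16", "channels": "3", "height": "416", "width": "416"},  # [net] hyperparameters
    {"type": "convolutional", "batch_normalize": "1", "filters": "32",
     "size": "3", "stride": "1", "pad": "1", "activation": "leaky"},
    {"type": "shortcut", "from": "-3", "activation": "linear"},
    # ... one dict per block, ending with the three yolo blocks
]
```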

The statement self.hyperparams, self.module_list = create_modules(self.module_defs) builds the network from that list. create_modules() is defined as follows:

 
```python
def create_modules(module_defs):
    """
    Constructs module list of layer blocks from module configuration in module_defs
    """
    # pop() removes an element from a list (the last one by default) and returns it;
    # here it removes and returns the first block, the [net] hyperparameters
    hyperparams = module_defs.pop(0)
    output_filters = [int(hyperparams["channels"])]
    module_list = nn.ModuleList()
    for module_i, module_def in enumerate(module_defs):
        modules = nn.Sequential()

        if module_def["type"] == "convolutional":
            bn = int(module_def["batch_normalize"])
            filters = int(module_def["filters"])
            kernel_size = int(module_def["size"])
            pad = (kernel_size - 1) // 2
            modules.add_module(
                f"conv_{module_i}",
                nn.Conv2d(
                    in_channels=output_filters[-1],
                    out_channels=filters,
                    kernel_size=kernel_size,
                    stride=int(module_def["stride"]),
                    padding=pad,
                    bias=not bn,
                ),
            )
            if bn:
                modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5))
            if module_def["activation"] == "leaky":
                modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))

        elif module_def["type"] == "maxpool":
            kernel_size = int(module_def["size"])
            stride = int(module_def["stride"])
            if kernel_size == 2 and stride == 1:
                # Pad so that the output size stays even
                modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
            maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2))
            modules.add_module(f"maxpool_{module_i}", maxpool)

        elif module_def["type"] == "upsample":
            upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
            modules.add_module(f"upsample_{module_i}", upsample)

        elif module_def["type"] == "route":
            layers = [int(x) for x in module_def["layers"].split(",")]
            filters = sum([output_filters[1:][i] for i in layers])
            modules.add_module(f"route_{module_i}", EmptyLayer())

        elif module_def["type"] == "shortcut":
            filters = output_filters[1:][int(module_def["from"])]
            modules.add_module(f"shortcut_{module_i}", EmptyLayer())

        elif module_def["type"] == "yolo":
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            # Extract anchors
            anchors = [int(x) for x in module_def["anchors"].split(",")]
            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
            anchors = [anchors[i] for i in anchor_idxs]
            num_classes = int(module_def["classes"])
            img_size = int(hyperparams["height"])
            # Define detection layer
            yolo_layer = YOLOLayer(anchors, num_classes, img_size)
            modules.add_module(f"yolo_{module_i}", yolo_layer)

        # Register module list and number of output filters
        module_list.append(modules)
        output_filters.append(filters)

    return hyperparams, module_list

From this list the function builds the corresponding convolutional, maxpool, upsample, route, shortcut, and yolo layers.

Convolutional layers are built conventionally: set the filter size and count, add a batch-normalization layer (when batch_normalize=1 in the .cfg), add padding, and use the LeakyReLU activation.

As for maxpool layers: YOLOv3 does not actually use max pooling for downsampling. It uses 3*3 convolutions with stride 2 instead (attentive readers will notice that yolov3.cfg contains no maxpool layers), five times in total, for a downsampling factor of 2^5 = 32.

(Figure 2: the stride-2 downsampling convolutions in yolov3.cfg)
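A quick sanity check of the stride-2 downsampling (a standalone sketch, not code from the repo):

```python
import torch
import torch.nn as nn

x = torch.randn(1, 3, 416, 416)
for _ in range(5):
    # Each 3x3, stride-2 convolution halves the spatial resolution
    x = nn.Conv2d(x.shape[1], 32, kernel_size=3, stride=2, padding=1)(x)
    print(x.shape[2:])
# 416 -> 208 -> 104 -> 52 -> 26 -> 13, a total factor of 2^5 = 32
```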

The upsample layer: since nn.Upsample is deprecated, a new class implements the operation.

 
```python
class Upsample(nn.Module):
    """ nn.Upsample is deprecated """

    def __init__(self, scale_factor, mode="nearest"):
        super(Upsample, self).__init__()
        self.scale_factor = scale_factor
        self.mode = mode

    def forward(self, x):
        x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
        return x
```
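A quick check of its behavior (a standalone sketch; assumes torch and torch.nn.functional as F are imported, as in models.py):

```python
import torch

up = Upsample(scale_factor=2)
print(up(torch.randn(1, 256, 13, 13)).shape)  # torch.Size([1, 256, 26, 26])
```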

Next is the route layer, which is very important: it fuses feature maps from earlier layers.

 
```
[route]
layers = -4        # one value: a single incoming path

[route]
layers = -1, 61    # two values: two paths whose feature maps are fused
```

The figure below comes from darknet-master (the Windows C implementation of YOLOv3); it is included only to aid understanding, and that code is not analyzed here.

(Figure 3: route layers in the darknet-master layer printout)

layers=-4 means counting back 4 from the current layer: for route layer 83, for example, 4 layers back is layer 79, so the route simply passes layer 79's feature map through (with a single value there is only one incoming path). The route layer's output can be viewed as the next layer's input, and the 13*13*512 map matches layer 79's feature map exactly. Likewise, layers=-1,61 fuses the feature maps of layer 85 (86-1) and layer 61: 26*26*512 + 26*26*256 = 26*26*768. As for why exactly these layers are fused, I am not sure (they happen to be the output layers of the downsampling blocks); pointers from anyone who knows are welcome.
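The channel arithmetic of that second route can be checked directly (a standalone sketch; a and b stand for the two feature maps being routed together):

```python
import torch

a = torch.randn(1, 512, 26, 26)
b = torch.randn(1, 256, 26, 26)
fused = torch.cat([a, b], 1)  # concatenate along the channel dimension
print(fused.shape)            # torch.Size([1, 768, 26, 26])
```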

The shortcut layer is a skip connection, borrowed from ResNet.

See https://cloud.tencent.com/developer/article/1148375; for more detail: https://blog.csdn.net/u014665013/article/details/81985082

ResNet was motivated by the degradation problem in deep models: the deeper the network, the more easily gradients disperse, the larger the error, and the harder training becomes. In theory a deeper model should have no higher error, because a deep model can always be constructed from a shallow one (copy the shallow model's weights into the corresponding layers and make the remaining layers identity mappings), so the deep model's error should be no larger than the shallow model's. In practice, however, deep plain networks show larger training and test error than shallow ones (as the CIFAR-10 curves in the linked posts illustrate), because deep models are hard to optimize and hard to converge to a good solution. The hypothesis is that, compared with directly optimizing the original plain-network mapping F(x) = y, the residual F(x) = y - x is easier to optimize. Note that the transform F can span multiple layers, i.e. a shortcut need not skip only one layer; in fact, since single-layer shortcuts showed no advantage, ResNet skips 2 or 3 layers.

The full YOLOv3 network has 100+ layers, so skip connections are used to make the network easier to train and faster to converge. Note that YOLOv3's shortcut layer adds feature maps element-wise without changing their size, so the input and output shapes before and after a shortcut layer are identical.

In this codebase, route and shortcut layers use EmptyLayer() as a placeholder; the actual concatenation and addition happen in Darknet.forward().
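For reference, the placeholder module really is empty; this is essentially how models.py defines it:

```python
class EmptyLayer(nn.Module):
    """Placeholder for 'route' and 'shortcut' layers"""

    def __init__(self):
        super(EmptyLayer, self).__init__()
```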

The key part: the yolo layer.

Look carefully at the five downsampling steps above: there are three scales, Scale 1 (downsampled 2^3 = 8×), Scale 2 (2^4 = 16×), and Scale 3 (2^5 = 32×). With the network's default input size of 416*416, the corresponding feature maps are 52*52, 26*26, and 13*13. The figure below (borrowed from the following post) illustrates this:

https://blog.csdn.net/leviopku/article/details/82660381

(Figure 4: YOLOv3 architecture diagram, from the post above)

Three scales are used to strengthen small-object detection, an idea presumably borrowed from SSD. The larger feature maps detect relatively small objects, while the small feature map is responsible for large objects.

With multiple scales in play, the authors used k-means clustering to obtain 9 prior box sizes (at 416*416). In the authors' own words:

We still use k-means clustering to determine our bounding box priors. We just sort of chose 9 clusters and 3 scales arbitrarily and then divide up the clusters evenly across scales. On the COCO dataset the 9 clusters were: (10×13), (16×30), (33×23), (30×61), (62×45), (59×119), (116×90), (156×198), (373×326).

The yolo-layer construction code:

 
```python
elif module_def["type"] == "yolo":
    anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
    # Extract anchors
    print("----------------------------------")
    print("anchor_idxs\n:", anchor_idxs)
    anchors = [int(x) for x in module_def["anchors"].split(",")]
    print("1. anchors \n:", anchors)
    anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
    print("2. anchors \n:", anchors)
    anchors = [anchors[i] for i in anchor_idxs]
    print("3. anchors \n:", anchors)
    num_classes = int(module_def["classes"])
    img_size = int(hyperparams["height"])
    # Define detection layer
    yolo_layer = YOLOLayer(anchors, num_classes, img_size)
    modules.add_module(f"yolo_{module_i}", yolo_layer)
```

The debug output:

(Figure 5: printed anchor_idxs and anchors for the three yolo layers)

The yolo layer is constructed three times; referring back to Figure 4, the first yolo layer comes after 2^5 = 32× downsampling, so its feature map is 13*13 (assuming the default 416*416 input, likewise below). This layer's mask selects IDs 6, 7, 8, i.e. anchor boxes (116, 90), (156, 198), (373, 326). This matches the point above: the small feature map detects large objects, so it uses the largest anchor boxes.

At this point the Darknet (YOLOv3) model is essentially built; next comes loading the .weights file and running inference.

1.2 Model inference (detect.py, part 2)

1.2.1 Obtaining detection boxes

 
```python
# If weights_path points at a .weights file, load darknet-format weights
if opt.weights_path.endswith(".weights"):
    # Load darknet weights
    model.load_darknet_weights(opt.weights_path)
else:
    # Load checkpoint weights
    model.load_state_dict(torch.load(opt.weights_path))

# model.eval() puts the model in evaluation mode; dropout and batch
# normalization behave differently in training and evaluation
model.eval()  # Set in evaluation mode

dataloader = DataLoader(
    ImageFolder(opt.image_folder, img_size=opt.img_size),
    batch_size=opt.batch_size,
    shuffle=False,
    num_workers=opt.n_cpu,
)

classes = load_classes(opt.class_path)  # Extracts class labels from file

Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

imgs = []  # Stores image paths
img_detections = []  # Stores detections for each image index

print("\nPerforming object detection:")
prev_time = time.time()  # timestamp of the current time
for batch_i, (img_paths, input_imgs) in enumerate(dataloader):
    # Configure input
    input_imgs = Variable(input_imgs.type(Tensor))

    # Get detections
    with torch.no_grad():
        # (52*52 + 26*26 + 13*13) * 3 = 10647 boxes
        # 5 + 80 = 85 values per box
        # detections: 10647 x 85
        detections = model(input_imgs)
        # Non-maximum suppression
        detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres)

    # Log progress
    current_time = time.time()
    # timedelta represents the difference between two datetimes
    inference_time = datetime.timedelta(seconds=current_time - prev_time)
    prev_time = current_time
    print("\t+ Batch %d, Inference Time: %s" % (batch_i, inference_time))

    # Save image paths and detections
    # extend() appends all items of another sequence to the end of the list
    imgs.extend(img_paths)
    img_detections.extend(detections)

# Bounding-box colors
cmap = plt.get_cmap("tab20b")
colors = [cmap(i) for i in np.linspace(0, 1, 20)]

print("\nSaving images:")
# Iterate through images and save plot of detections
for img_i, (path, detections) in enumerate(zip(imgs, img_detections)):
    print("(%d) Image: '%s'" % (img_i, path))

    # Create plot
    img = np.array(Image.open(path))
    plt.figure()
    fig, ax = plt.subplots(1)
    ax.imshow(img)

    # Draw bounding boxes and labels of detections
    if detections is not None:
        # Rescale boxes to original image
        detections = rescale_boxes(detections, opt.img_size, img.shape[:2])
        unique_labels = detections[:, -1].cpu().unique()
        n_cls_preds = len(unique_labels)
        bbox_colors = random.sample(colors, n_cls_preds)
        for x1, y1, x2, y2, conf, cls_conf, cls_pred in detections:
            print("\t+ Label: %s, Conf: %.5f" % (classes[int(cls_pred)], cls_conf.item()))
            box_w = x2 - x1
            box_h = y2 - y1
            color = bbox_colors[int(np.where(unique_labels == int(cls_pred))[0])]
            # Create a Rectangle patch
            bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=color, facecolor="none")
            # Add the bbox to the plot
            ax.add_patch(bbox)
            # Add label
            plt.text(
                x1,
                y1,
                s=classes[int(cls_pred)],
                color="white",
                verticalalignment="top",
                bbox={"color": color, "pad": 0},
            )

    # Save generated image with detections
    plt.axis("off")
    plt.gca().xaxis.set_major_locator(NullLocator())
    plt.gca().yaxis.set_major_locator(NullLocator())
    filename = path.split("/")[-1].split(".")[0]
    plt.savefig(f"output/{filename}.jpg", bbox_inches="tight", pad_inches=0.0)
    plt.show()
    plt.close()
```

model.load_darknet_weights(opt.weights_path) loads yolov3.weights; the full implementation is the load_darknet_weights() method shown in the Darknet class above.

 

That method parses the .weights file, which is a flat binary dump: five int32 header values (version information; the code treats the fourth, header[3], as the number of images seen during training), followed by every float32 parameter concatenated in network order. For each convolutional block this means the BN bias, weight, running mean, and running variance (or the conv bias when there is no BN), followed by the conv weights, which is why the loop simply advances a pointer ptr through the array. A quick inspection sketch follows below. Once the weights are loaded, the test images are loaded.
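A minimal way to peek at the file (a standalone sketch; note that this repo reads the header as five int32 values, while newer darknet releases store the `seen` counter as a 64-bit field):

```python
import numpy as np

with open("weights/yolov3.weights", "rb") as f:
    header = np.fromfile(f, dtype=np.int32, count=5)  # version info; header[3] is images seen
    weights = np.fromfile(f, dtype=np.float32)        # every parameter, flattened

print("header:", header)
print("parameter count:", weights.size)
```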

 
```python
dataloader = DataLoader(
    ImageFolder(opt.image_folder, img_size=opt.img_size),
    batch_size=opt.batch_size,
    shuffle=False,
    num_workers=opt.n_cpu,
)
```

ImageFolder iterates over the test images in a folder; its full definition is below. Its __getitem__() pads each image to a square and resizes it to img_size (default 416).

 
```python
class ImageFolder(Dataset):
    def __init__(self, folder_path, img_size=416):
        # sorted() sorts any iterable; glob collects every file in the folder
        self.files = sorted(glob.glob("%s/*.*" % folder_path))
        self.img_size = img_size

    def __getitem__(self, index):
        img_path = self.files[index % len(self.files)]
        # Extract image as PyTorch tensor
        img = transforms.ToTensor()(Image.open(img_path))
        # Pad to square resolution
        img, _ = pad_to_square(img, 0)
        # Resize
        img = resize(img, self.img_size)
        return img_path, img

    def __len__(self):
        return len(self.files)
```
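pad_to_square() and resize() live in utils/datasets.py; they are essentially the following (reproduced from memory, so treat it as a sketch):

```python
import numpy as np
import torch.nn.functional as F

def pad_to_square(img, pad_value):
    c, h, w = img.shape
    dim_diff = np.abs(h - w)
    # Split the padding between the two sides of the shorter dimension
    pad1, pad2 = dim_diff // 2, dim_diff - dim_diff // 2
    # F.pad takes (left, right, top, bottom) for the last two dimensions
    pad = (0, 0, pad1, pad2) if h <= w else (pad1, pad2, 0, 0)
    img = F.pad(img, pad, "constant", value=pad_value)
    return img, pad

def resize(image, size):
    # interpolate needs a batch dimension; add it, resize, then remove it
    image = F.interpolate(image.unsqueeze(0), size=size, mode="nearest").squeeze(0)
    return image
```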

detections = model(input_imgs) feeds the batch through the model and produces the detections via Darknet's forward(), whose full code was shown in the Darknet class above.

 

The forward pass iterates over self.module_defs and self.module_list in lockstep.

For "convolutional", "upsample", and "maxpool" layers, the module is simply applied to x.

For route layers, torch.cat() performs the feature-map fusion (concatenation). Let's test with one image:

This test image is 3*768*576; let's see how its shape changes as it passes through the model. Per the cfg, the image is first resized to 416*416.

Here are the route layers' indices and shapes:

(Figure 6: route-layer indices and tensor shapes during the forward pass)

Each layer's output is appended to the layer_outputs list via layer_outputs.append(x). The printed structure matches the earlier discussion exactly: if layers has one value, the route output is simply that layer's feature map; if it has two, the route output is the fusion of the two layers' feature maps.

The shortcut layer is especially clear: it directly adds the two corresponding layers:

 
```python
elif module_def["type"] == "shortcut":
    layer_i = int(module_def["from"])
    x = layer_outputs[-1] + layer_outputs[layer_i]
```

There are three yolo layers, with feature maps of 13*13, 26*26, and 52*52. Each cell of each feature map predicts 3 bounding boxes, and each bounding box predicts three kinds of values: (1) the box position (4 values: center coordinates tx and ty, box height bh and width bw); (2) an objectness score, i.e. how likely this location contains an object, computed before the class prediction so that unnecessary anchors can be discarded, reducing computation; (3) N class scores: COCO has 80 classes, VOC has 20.

So for COCO, the 13*13 feature map holds 13*13*3 = 507 bounding boxes, each predicting 4 + 1 + 80 = 85 values, represented as a tensor of shape [1, 507, 85], where the 1 is the batch size. The shapes of the other tensors follow the same pattern.

(Figure 7: yolo-layer output shapes, e.g. [1, 507, 85] at 13*13)
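The box counts are easy to verify (a standalone sketch):

```python
sizes = [13, 26, 52]
boxes = [s * s * 3 for s in sizes]  # [507, 2028, 8112] boxes per scale
print(boxes, sum(boxes))            # total: 10647
```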

How this tensor is produced depends mainly on the yolo layer's forward() and compute_grid_offsets(); the full code is:

 
```python
class YOLOLayer(nn.Module):
    """Detection layer"""

    def __init__(self, anchors, num_classes, img_dim=416):
        super(YOLOLayer, self).__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.ignore_thres = 0.5
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()
        self.obj_scale = 1
        self.noobj_scale = 100
        self.metrics = {}
        self.img_dim = img_dim
        self.grid_size = 0  # grid size

    def compute_grid_offsets(self, grid_size, cuda=True):
        self.grid_size = grid_size
        g = self.grid_size
        FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
        self.stride = self.img_dim / self.grid_size
        # Calculate offsets for each grid cell.
        # repeat(*sizes) tiles the tensor along the given dimensions;
        # unlike expand(), it copies the underlying data.
        self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
        self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
        self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))

    def forward(self, x, targets=None, img_dim=None):
        # Tensors for cuda support
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
        ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

        self.img_dim = img_dim
        num_samples = x.size(0)
        grid_size = x.size(2)

        # With a 416*416 input, the 32x-downsampled grid uses the anchor boxes
        # (116, 90), (156, 198), (373, 326); the 16x grid, suited to medium-sized
        # objects, uses (30, 61), (62, 45), (59, 119); and the 8x grid, with the
        # smallest receptive field and thus suited to small objects, uses
        # (10, 13), (16, 30), (33, 23). In total there are
        # (52*52 + 26*26 + 13*13) * 3 = 10647 proposal boxes.
        prediction = (
            x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )

        # Get outputs
        x = torch.sigmoid(prediction[..., 0])  # Center x
        y = torch.sigmoid(prediction[..., 1])  # Center y
        w = prediction[..., 2]  # Width
        h = prediction[..., 3]  # Height
        pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

        # If grid size does not match current we compute new offsets
        if grid_size != self.grid_size:
            self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

        # Add offset and scale with anchors
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = x.data + self.grid_x
        pred_boxes[..., 1] = y.data + self.grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

        # torch.cat: concatenate along the last dimension
        output = torch.cat(
            (
                pred_boxes.view(num_samples, -1, 4) * self.stride,
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )

        if targets is None:
            return output, 0
        else:
            iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
                pred_boxes=pred_boxes,
                pred_cls=pred_cls,
                target=targets,
                anchors=self.scaled_anchors,
                ignore_thres=self.ignore_thres,
            )

            # Loss: mask outputs to ignore non-existing objects (except with conf. loss)
            loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
            loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
            loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
            loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
            loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
            loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
            loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
            total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

            # Metrics
            cls_acc = 100 * class_mask[obj_mask].mean()
            conf_obj = pred_conf[obj_mask].mean()
            conf_noobj = pred_conf[noobj_mask].mean()
            conf50 = (pred_conf > 0.5).float()
            iou50 = (iou_scores > 0.5).float()
            iou75 = (iou_scores > 0.75).float()
            detected_mask = conf50 * class_mask * tconf
            precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
            recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
            recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

            self.metrics = {
                "loss": to_cpu(total_loss).item(),
                "x": to_cpu(loss_x).item(),
                "y": to_cpu(loss_y).item(),
                "w": to_cpu(loss_w).item(),
                "h": to_cpu(loss_h).item(),
                "conf": to_cpu(loss_conf).item(),
                "cls": to_cpu(loss_cls).item(),
                "cls_acc": to_cpu(cls_acc).item(),
                "recall50": to_cpu(recall50).item(),
                "recall75": to_cpu(recall75).item(),
                "precision": to_cpu(precision).item(),
                "conf_obj": to_cpu(conf_obj).item(),
                "conf_noobj": to_cpu(conf_noobj).item(),
                "grid_size": grid_size,
            }

            return output, total_loss
```

num_samples is the number of images per batch; grid_size is the size of the feature map.

(Figure 8: printed num_samples and grid_size values)

Tensor.view() reshapes the tensor entering the yolo layer; subsequent prediction processing works on the tensor named prediction.

(Figure 9: the shape of the prediction tensor)

Next, the bounding boxes are predicted; for details see https://blog.csdn.net/qq_34199326/article/details/84109828. The x and y coordinates pass through a sigmoid, and the objectness confidence and class probabilities are handled the same way.

Bounding-box prediction in the paper:

(Figure 10: bounding-box prediction with dimension priors and location prediction, from the paper)

Bounding boxes with dimension priors and location prediction. We predict the width and height of the box as offsets from cluster centroids. We predict the center coordinates of the box relative to the location of filter application using a sigmoid function. This figure blatantly self-plagiarized from.

 
```python
x = torch.sigmoid(prediction[..., 0])  # Center x
y = torch.sigmoid(prediction[..., 1])  # Center y
w = prediction[..., 2]  # Width
h = prediction[..., 3]  # Height
pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
```
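Together with the pred_boxes assignments shown further below, these lines implement the paper's box decoding:

$$b_x = \sigma(t_x) + c_x, \qquad b_y = \sigma(t_y) + c_y, \qquad b_w = p_w e^{t_w}, \qquad b_h = p_h e^{t_h}$$

where (c_x, c_y) is the cell's offset within the grid (grid_x, grid_y in the code) and (p_w, p_h) are the scaled anchor dimensions (anchor_w, anchor_h); multiplying by self.stride at the end maps the boxes back to input-image pixels.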

Coordinates, confidence, and class probabilities are predicted at each of the three scales, consistent with how the yolo layers were constructed (compare Figure 5).

(Figure 11: shapes of x, y, w, h, pred_conf, pred_cls at the three scales)

From the figure we can see that grid_size and self.grid_size differ (self.grid_size is initialized to 0), so the grid offsets must be computed via compute_grid_offsets(); the full code is in YOLOLayer above.

Take grid = 13 as an example. The feature map is 13*13 while the input image is 416*416, so the 416*416 image must be divided evenly into 13*13 cells, which requires the interval (stride) self.stride = 416/13 = 32. The anchors are scaled accordingly: 116/32 = 3.6250, 90/32 = 2.8125.

(Figure 12: stride and scaled anchors for the 13*13 grid)
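In code form (a standalone sketch of the same arithmetic):

```python
img_dim, grid_size = 416, 13
stride = img_dim / grid_size  # 32.0
anchors = [(116, 90), (156, 198), (373, 326)]
scaled_anchors = [(w / stride, h / stride) for w, h in anchors]
print(scaled_anchors)  # [(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]
```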

Per the paper and Figure 10, each grid cell predicts 3 bounding boxes; again take grid = 13 as the example. The first cell predicts 3 bounding boxes, each carrying coordinates + confidence + class probabilities. Hence x.shape = [1, 3, 13, 13] in the code below, and y, w, and h share the same shape.

 
```python
print("x.shape=", x.shape)
print("x.data=\n", x.data)
pred_boxes[..., 0] = x.data + self.grid_x
pred_boxes[..., 1] = y.data + self.grid_y
pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h
# torch.cat then concatenates along the last dimension
```
The final concatenation produces the output; its sizes 507 = 13*13*3, 2028 = 26*26*3, and 8112 = 52*52*3 should now be easy to follow.

(Figure 13: output shapes 507, 2028, and 8112 at the three scales)

Since targets=None at inference time, the returned total_loss is 0.

1.2.2 Non-maximum suppression

 
```python
# detections: 10647 x 85
detections = model(input_imgs)
# Non-maximum suppression
detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres)
```

After the detection boxes are obtained, non-maximum suppression filters them: detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres).

The full code:

 
```python
def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4):
    """
    Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression to further filter detections.
    Returns detections with shape:
        (x1, y1, x2, y2, object_conf, class_score, class_pred)
    """
    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])
    output = [None for _ in range(len(prediction))]
    for image_i, image_pred in enumerate(prediction):
        # Filter out confidence scores below threshold
        image_pred = image_pred[image_pred[:, 4] >= conf_thres]
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue
        # Object confidence times class confidence
        # .max(1) returns (values, indices) per row; [0] are the values, [1] the indices
        score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0]
        # Sort by it, in descending order
        image_pred = image_pred[(-score).argsort()]
        # With keepdim=True the reduced dimension is kept with size 1;
        # otherwise it is squeezed away and the result loses one dimension
        class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True)
        detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1)
        # Perform non-maximum suppression
        keep_boxes = []
        while detections.size(0):
            # unsqueeze(0) adds a dimension so the top box is compared against all boxes
            large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres
            label_match = detections[0, -1] == detections[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            weights = detections[invalid, 4:5]  # object confidences
            # Merge overlapping bboxes by order of confidence
            detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
            keep_boxes += [detections[0]]
            detections = detections[~invalid]
        if keep_boxes:
            output[image_i] = torch.stack(keep_boxes)
    return output
```

For background on the NMS algorithm, see:

https://www.cnblogs.com/makefile/p/nms.html

https://www.jianshu.com/p/d452b5615850

The one thing done differently from standard NMS here is a box "merging" strategy: instead of keeping the top-scoring box unchanged, overlapping boxes of the same class are averaged, weighted by confidence:

 
```python
# Merge overlapping bboxes by order of confidence
detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
```
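Numerically, the merge is just a confidence-weighted average of the coordinates of the suppressed boxes (a standalone sketch with made-up numbers):

```python
import torch

# Two overlapping detections of the same class, with confidences 0.9 and 0.6
boxes = torch.tensor([[100.0, 100.0, 200.0, 200.0],
                      [104.0,  98.0, 208.0, 196.0]])
weights = torch.tensor([[0.9], [0.6]])

merged = (weights * boxes).sum(0) / weights.sum()
print(merged)  # tensor([101.6000,  99.2000, 203.2000, 198.4000])
```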

The detection results after non-maximum suppression:

(Figure 14: detection results after non-maximum suppression)

This concludes the analysis of the detection part; the training part is covered in the follow-up post.

Part 2 is now complete: PyTorch | YOLOv3 Code Walkthrough (Part 2)

https://blog.csdn.net/qq_24739717/article/details/96705055
