YOLO相关原理 :
https://blog.csdn.net/leviopku/article/details/82660381
https://www.jianshu.com/p/d13ae1055302
https://blog.csdn.net/qq_34199326/article/details/84874409
https://blog.csdn.net/chandanyan8568/article/details/81089083
https://blog.csdn.net/leviopku/article/details/82660381
分析代码:
https://github.com/eriklindernoren/PyTorch-YOLOv3
注:这个是方便个人学习pytorch和yolov3所做的记录,有任何错误欢迎指出。
从detect.py开始分析代码的流程。
from __future__ import division
from models import *
from utils.utils import *
from utils.datasets import *
import os
import sys
import time
import datetime
import argparse
from PIL import Image
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torch.autograd import Variable
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.ticker import NullLocator
"""
(1)import argparse 首先导入模块
(2)parser = argparse.ArgumentParser() 创建一个解析对象
(3)parser.add_argument() 向该对象中添加你要关注的命令行参数和选项
(4)parser.parse_args() 进行解析
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--image_folder", type=str, default="data/samples", help="path to dataset")
parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file")
parser.add_argument("--weights_path", type=str, default="weights/yolov3.weights", help="path to weights file")
parser.add_argument("--class_path", type=str, default="data/coco.names", help="path to class label file")
parser.add_argument("--conf_thres", type=float, default=0.8, help="object confidence threshold")
parser.add_argument("--nms_thres", type=float, default=0.4, help="iou thresshold for non-maximum suppression")
parser.add_argument("--batch_size", type=int, default=1, help="size of the batches")
parser.add_argument("--n_cpu", type=int, default=0, help="number of cpu threads to use during batch generation")
parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
parser.add_argument("--checkpoint_model", type=str, help="path to checkpoint model")
opt = parser.parse_args()
print(opt)
#选择是否使用GPU设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#创建多级目录
os.makedirs("output", exist_ok=True)
# Set up model 调用darknet模型
model = Darknet(opt.model_def, img_size=opt.img_size).to(device)
model = Darknet(opt.model_def, img_size=opt.img_size).to(device),这条语句加载darkent模型,即YOLOv3模型。Darknet模型在model.py中进行定义。其完整定义如下:
class Darknet(nn.Module):
"""YOLOv3 object detection model"""
def __init__(self, config_path, img_size=416):
super(Darknet, self).__init__()
#解析cfg文件
self.module_defs = parse_model_config(config_path)
#print("module_defs : ",self.module_defs)
self.hyperparams, self.module_list = create_modules(self.module_defs)
#print("module_list : ",self.module_list)
# hasattr() 函数用于判断对象是否包含对应的属性。
# yolo层有 metrics 属性
self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")]
#print("self.yolo_layers:\n",self.yolo_layers)
self.img_size = img_size
self.seen = 0
self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)
def forward(self, x, targets=None):
img_dim = x.shape[2]
loss = 0
layer_outputs, yolo_outputs = [], []
print("x.shape: ",x.shape)
for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
#print("module_defs : ",module_def)
#print("module : ",module)
#print("i: ",i," x.shape: ",x.shape)
if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
x = module(x)
elif module_def["type"] == "route":
print("i: ",i," x.shape: ",x.shape)
for layer_i in module_def["layers"].split(","):
print("layer_i:\n",layer_i)
x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)
elif module_def["type"] == "shortcut":
layer_i = int(module_def["from"])
x = layer_outputs[-1] + layer_outputs[layer_i]
elif module_def["type"] == "yolo":
x, layer_loss = module[0](x, targets, img_dim)
loss += layer_loss
yolo_outputs.append(x)
layer_outputs.append(x)
yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
return yolo_outputs if targets is None else (loss, yolo_outputs)
def load_darknet_weights(self, weights_path):
"""Parses and loads the weights stored in 'weights_path'"""
# Open the weights file
with open(weights_path, "rb") as f:
header = np.fromfile(f, dtype=np.int32, count=5) # First five are header values
self.header_info = header # Needed to write header when saving weights
self.seen = header[3] # number of images seen during training
weights = np.fromfile(f, dtype=np.float32) # The rest are weights
"""
print("------------------------------------")
print("header:\n",header)
print("weights:\n",weights)
print("weights.shape:\n",weights.shape)
"""
# Establish cutoff for loading backbone weights
cutoff = None
if "darknet53.conv.74" in weights_path:
cutoff = 75
ptr = 0
for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
#print("i:\n",i)
#print("module_def:\n",module_def)
#print("module:\n",module)
if i == cutoff:
break
if module_def["type"] == "convolutional":
conv_layer = module[0]
if module_def["batch_normalize"]:
# Load BN bias, weights, running mean and running variance
bn_layer = module[1]
num_b = bn_layer.bias.numel() # Number of biases
#print("bn_layer:\n",bn_layer)
#print("num_b:\n",num_b)
# Bias
bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias)
bn_layer.bias.data.copy_(bn_b)
ptr += num_b
# Weight
bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight)
bn_layer.weight.data.copy_(bn_w)
ptr += num_b
# Running Mean
bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean)
bn_layer.running_mean.data.copy_(bn_rm)
ptr += num_b
# Running Var
bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var)
bn_layer.running_var.data.copy_(bn_rv)
ptr += num_b
else:
# Load conv. bias
num_b = conv_layer.bias.numel()
conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias)
conv_layer.bias.data.copy_(conv_b)
ptr += num_b
# Load conv. weights
num_w = conv_layer.weight.numel()
conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight)
conv_layer.weight.data.copy_(conv_w)
ptr += num_w
#print("conv_w:\n",conv_w)
#print("num_w:\n",num_w)
#print("ptr:\n",ptr)
def save_darknet_weights(self, path, cutoff=-1):
"""
@:param path - path of the new weights file
@:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved)
"""
fp = open(path, "wb")
self.header_info[3] = self.seen
self.header_info.tofile(fp)
# Iterate through layers
for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])):
if module_def["type"] == "convolutional":
conv_layer = module[0]
# If batch norm, load bn first
if module_def["batch_normalize"]:
bn_layer = module[1]
bn_layer.bias.data.cpu().numpy().tofile(fp)
bn_layer.weight.data.cpu().numpy().tofile(fp)
bn_layer.running_mean.data.cpu().numpy().tofile(fp)
bn_layer.running_var.data.cpu().numpy().tofile(fp)
# Load conv bias
else:
conv_layer.bias.data.cpu().numpy().tofile(fp)
# Load conv weights
conv_layer.weight.data.cpu().numpy().tofile(fp)
fp.close()
首先从__init__()函数开始,大致流程是从.cfg中解析文件,然后根据文件内容生成相关的网络结构。
解析后会生成一个列表,存储网络结构的各种属性,通过遍历这个列表便可以得到网络结构,解析后的列表如下图所示(部分):
图1
self.hyperparams, self.module_list = create_modules(self.module_defs),这条语句会根据生成的列表构建网络结构,create_modules()函数如下:
def create_modules(module_defs):
"""
Constructs module list of layer blocks from module configuration in module_defs
"""
#pop() 函数用于移除列表中的一个元素(默认最后一个元素),并且返回该元素的值。
hyperparams = module_defs.pop(0)
output_filters = [int(hyperparams["channels"])]
module_list = nn.ModuleList()
for module_i, module_def in enumerate(module_defs):
modules = nn.Sequential()
if module_def["type"] == "convolutional":
bn = int(module_def["batch_normalize"])
filters = int(module_def["filters"])
kernel_size = int(module_def["size"])
pad = (kernel_size - 1) // 2
modules.add_module(
f"conv_{module_i}",
nn.Conv2d(
in_channels=output_filters[-1],
out_channels=filters,
kernel_size=kernel_size,
stride=int(module_def["stride"]),
padding=pad,
bias=not bn,
),
)
if bn:
modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5))
if module_def["activation"] == "leaky":
modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))
elif module_def["type"] == "maxpool":
kernel_size = int(module_def["size"])
stride = int(module_def["stride"])
if kernel_size == 2 and stride == 1:
#保证输出是偶数
modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2))
modules.add_module(f"maxpool_{module_i}", maxpool)
elif module_def["type"] == "upsample":
upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
modules.add_module(f"upsample_{module_i}", upsample)
elif module_def["type"] == "route":
layers = [int(x) for x in module_def["layers"].split(",")]
filters = sum([output_filters[1:][i] for i in layers])
"""
print("------------------------------------")
print("layers: \n",layers)
print("output_filters:\n",output_filters)
print("output_filters[1:][i] :\n",[output_filters[1:][i] for i in layers])
print("output_filters[1:]:\n",output_filters[1:])
print("output_filters[1:][1]:\n",output_filters[1:][1])
print("output_filters[1:][3]:\n",output_filters[1:][3])
"""
modules.add_module(f"route_{module_i}", EmptyLayer())
elif module_def["type"] == "shortcut":
filters = output_filters[1:][int(module_def["from"])]
modules.add_module(f"shortcut_{module_i}", EmptyLayer())
elif module_def["type"] == "yolo":
anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
# Extract anchors
#print("----------------------------------")
#print("anchor_idxs\n:",anchor_idxs)
anchors = [int(x) for x in module_def["anchors"].split(",")]
#print("1. anchors \n:",anchors)
anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
#print("2. anchors \n:",anchors)
anchors = [anchors[i] for i in anchor_idxs]
#print("3. anchors \n:",anchors)
num_classes = int(module_def["classes"])
img_size = int(hyperparams["height"])
# Define detection layer
yolo_layer = YOLOLayer(anchors, num_classes, img_size)
modules.add_module(f"yolo_{module_i}", yolo_layer)
# Register module list and number of output filters
module_list.append(modules)
output_filters.append(filters)
return hyperparams, module_list
根据列表会生成相应的convolutional、maxpool、upsample、route、shortcut、yolo层。
convolutional层构建方法很常规:设置filter尺寸、数量,添加batch normalize层(在.cfg文件中batch_normalize=1),以及pad层,使用leaky激活函数。
maxpool层,不过在YOLOv3中没有使用最大池化来进行下采样,是使用的3*3的卷积核,步长=2的卷积操作进行下采样,(细心的同学会发现yolov3.cfg没有maxpool层),一共5次,下采样2^5=32倍数。
图2
upsample层,上采样层,由于nn.Upsample被弃用了,所以新建了一个类完成这个操作。
class Upsample(nn.Module):
""" nn.Upsample is deprecated """
def __init__(self, scale_factor, mode="nearest"):
super(Upsample, self).__init__()
self.scale_factor = scale_factor
self.mode = mode
def forward(self, x):
x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
return x
接下来是route层,这层十分重要。这层的作用相当于把前面的特征图进行相融合。
[route]
layers = -4 # 只有一个值,一个路径
[route]
layers = -1, 61 # 两个值,两个路径,两个特征图进行特征融合
下图来自darknet-master(windos下的yolov3实现,纯C语言,下图是方便理解,本文不对该代码分析)
图3
layer=-4,表示当前层的序号减4,如第83层route,-4之后是79层,把79层的特征层融合(layer值只有一个,相当于只有链接过来),route层的输出可以看作是下一层的输入,即13*13*512和79层的特征图是完全吻合的。同理,layer=-1,61,表示融合85(86-1)层和61层的特征图。即26*26*512+26*26*256=26*26*768。至于为什么选这几个层进行融合,我表示并不清楚,希望有了解的朋友指点一下。 这几层刚好下采样块的输出层。
shortcut层,直连层,借鉴于ResNet网络。
https://cloud.tencent.com/developer/article/1148375,更多细节可查看:https://blog.csdn.net/u014665013/article/details/81985082
ResNet 的动机依然是解决深度模型中的退化问题:层数越深,梯度越容易发散,误差越大,难以训练。理论上,模型层数越深,误差应该越小才对,因为我们总可以根据浅层模型的解构造出深层模型的解(将深层模型与浅层模型对应的层赋值为浅层模型的权重,将后面的层取为恒等映射),使得这个深层模型的误差不大于浅层模型的误差。但是实际上,深度模型的误差要比浅层模型的误差要大,在CIFAR-10上面的训练和测试误差如下图所示。产生这种现象的原因是深度模型难以优化,难以收敛到较优的解,并假设相比于直接优化最初的plain networks的模型F(x)=y,残差F(x)=y-x更容易优化。 需要注意的是,变换F可以是很多层,也就是说shortcut不一定只跨越1层。并且实际中,由于shortcut只跨越单层没有优势,ResNet中是跨越了2层或3层
YOLOv3完整的结构有100+层,所以采用直连的方式来优化网络结构,能使网络更好的训练、更快的收敛。值得注意的是,YOLOv3的shortcut层是把网络的值进行叠加,没有改变特征图的大小,所以仔细的人会发现在shortcut层的前后,输入输出大小没变。
在本代码中,route层和shortcut层使用EmptyLayer()来进行占位。
重点:yolo层。
仔细看上图的五次采样,会发现有三个Scale,分别是Scale1(下采样2^3=8倍),Scale2(下采样2^4=16倍),Scale3(下采样2^5=32倍),此时网络默认的尺寸是416*416,对应的feature map为52*52,26*26,13*13。这里借用一幅图:
https://blog.csdn.net/leviopku/article/details/82660381
图4
之所以使用3种尺度,是为了加强对小目标的检测,这个应该是借鉴SSD的思想。比较大的特征图来检测相对较小的目标,而小的特征图负责检测大目标。
在有多尺度的概念下,作者使用k-means得到9个先验框的尺寸(416*416的尺寸下)。作者原话为:
We still use k-means clustering to determine our bounding box priors. We just sort of chose 9 clusters and 3 scales arbitrarily and then divide up the clusters evenly across scales. On the COCO dataset the 9 clusters were: (10×13),(16×30),(33×23),(30×61),(62×45),(59× 119),(116×90),(156×198),(373×326).
解析yolo层代码:
elif module_def["type"] == "yolo":
anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
# Extract anchors
print("----------------------------------")
print("anchor_idxs\n:",anchor_idxs)
anchors = [int(x) for x in module_def["anchors"].split(",")]
print("1. anchors \n:",anchors)
anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
print("2. anchors \n:",anchors)
anchors = [anchors[i] for i in anchor_idxs]
print("3. anchors \n:",anchors)
num_classes = int(module_def["classes"])
img_size = int(hyperparams["height"])
# Define detection layer
yolo_layer = YOLOLayer(anchors, num_classes, img_size)
modules.add_module(f"yolo_{module_i}", yolo_layer)
可以看到输出:
图5
可以看到yolo层搭建了三次,可以看图4,第一个yolo层是下采样2^5=32倍,特征图尺寸是13*13(默认输入416*416,下同)。这层选择mask的ID是6,7,8,对应的anchor box尺寸是(116, 90)、(156, 198)、(373, 326)。这对应了上面所说的,小的特征图检测大目标,所以使用的anchor box最大。
至此,Darknet(YOLOv3)模型基本加载完毕,接下来就是,加载权重.weights文件,进行预测。
#查找weights_path路径下的.weights的文件
if opt.weights_path.endswith(".weights"):
# Load darknet weights
model.load_darknet_weights(opt.weights_path)
else:
# Load checkpoint weights
model.load_state_dict(torch.load(opt.weights_path))
# model.eval(),让model变成测试模式,这主要是对dropout和batch normalization的操作在训练和测试的时候是不一样的
model.eval() # Set in evaluation mode
dataloader = DataLoader(
ImageFolder(opt.image_folder, img_size=opt.img_size),
batch_size=opt.batch_size,
shuffle=False,
num_workers=opt.n_cpu,
)
classes = load_classes(opt.class_path) # Extracts class labels from file
Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
imgs = [] # Stores image paths
img_detections = [] # Stores detections for each image index
print("\nPerforming object detection:")
#返回当前时间的时间戳
prev_time = time.time()
for batch_i, (img_paths, input_imgs) in enumerate(dataloader):
# Configure input
input_imgs = Variable(input_imgs.type(Tensor))
#print("img_paths:\n",img_paths)
# Get detections
with torch.no_grad():
#52*52+26*26+13*13)*3=10647
# 5 + 80 =85
# detections : 10647*85
detections = model(input_imgs)
#非极大值抑制
detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres)
#print("detections:\n",detections)
# Log progress
current_time = time.time()
#timedelta代表两个datetime之间的时间差
inference_time = datetime.timedelta(seconds=current_time - prev_time)
prev_time = current_time
print("\t+ Batch %d, Inference Time: %s" % (batch_i, inference_time))
# Save image and detections
#extend() 函数用于在列表末尾一次性追加另一个序列中的多个值(用新列表扩展原来的列表)。
imgs.extend(img_paths)
img_detections.extend(detections)
# Bounding-box colors
cmap = plt.get_cmap("tab20b")
colors = [cmap(i) for i in np.linspace(0, 1, 20)]
print("\nSaving images:")
# Iterate through images and save plot of detections
for img_i, (path, detections) in enumerate(zip(imgs, img_detections)):
print("(%d) Image: '%s'" % (img_i, path))
# Create plot
img = np.array(Image.open(path))
plt.figure()
fig, ax = plt.subplots(1)
ax.imshow(img)
# Draw bounding boxes and labels of detections
if detections is not None:
# Rescale boxes to original image
detections = rescale_boxes(detections, opt.img_size, img.shape[:2])
unique_labels = detections[:, -1].cpu().unique()
n_cls_preds = len(unique_labels)
bbox_colors = random.sample(colors, n_cls_preds)
for x1, y1, x2, y2, conf, cls_conf, cls_pred in detections:
print("\t+ Label: %s, Conf: %.5f" % (classes[int(cls_pred)], cls_conf.item()))
box_w = x2 - x1
box_h = y2 - y1
color = bbox_colors[int(np.where(unique_labels == int(cls_pred))[0])]
# Create a Rectangle patch
bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=color, facecolor="none")
# Add the bbox to the plot
ax.add_patch(bbox)
# Add label
plt.text(
x1,
y1,
s=classes[int(cls_pred)],
color="white",
verticalalignment="top",
bbox={"color": color, "pad": 0},
)
# Save generated image with detections
plt.axis("off")
plt.gca().xaxis.set_major_locator(NullLocator())
plt.gca().yaxis.set_major_locator(NullLocator())
filename = path.split("/")[-1].split(".")[0]
plt.savefig(f"output/{filename}.jpg", bbox_inches="tight", pad_inches=0.0)
plt.show()
plt.close()
model.load_darknet_weights(opt.weights_path)通过这个语句加载yolov3.weights。加载的完整代码如下:
def load_darknet_weights(self, weights_path):
"""Parses and loads the weights stored in 'weights_path'"""
# Open the weights file
with open(weights_path, "rb") as f:
header = np.fromfile(f, dtype=np.int32, count=5) # First five are header values
self.header_info = header # Needed to write header when saving weights
self.seen = header[3] # number of images seen during training
weights = np.fromfile(f, dtype=np.float32) # The rest are weights
"""
print("------------------------------------")
print("header:\n",header)
print("weights:\n",weights)
print("weights.shape:\n",weights.shape)
"""
# Establish cutoff for loading backbone weights
cutoff = None
if "darknet53.conv.74" in weights_path:
cutoff = 75
ptr = 0
for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
#print("i:\n",i)
#print("module_def:\n",module_def)
#print("module:\n",module)
if i == cutoff:
break
if module_def["type"] == "convolutional":
conv_layer = module[0]
if module_def["batch_normalize"]:
# Load BN bias, weights, running mean and running variance
bn_layer = module[1]
num_b = bn_layer.bias.numel() # Number of biases
#print("bn_layer:\n",bn_layer)
#print("num_b:\n",num_b)
# Bias
bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias)
bn_layer.bias.data.copy_(bn_b)
ptr += num_b
# Weight
bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight)
bn_layer.weight.data.copy_(bn_w)
ptr += num_b
# Running Mean
bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean)
bn_layer.running_mean.data.copy_(bn_rm)
ptr += num_b
# Running Var
bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var)
bn_layer.running_var.data.copy_(bn_rv)
ptr += num_b
else:
# Load conv. bias
num_b = conv_layer.bias.numel()
conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias)
conv_layer.bias.data.copy_(conv_b)
ptr += num_b
# Load conv. weights
num_w = conv_layer.weight.numel()
conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight)
conv_layer.weight.data.copy_(conv_w)
ptr += num_w
#print("conv_w:\n",conv_w)
#print("num_w:\n",num_w)
#print("ptr:\n",ptr)
这一段的代码是解析.weights文件,这里我了解不够到位,欢迎有知道的人指点。主要是不知.weights的结构是怎样的,所以有点疑惑。加载完.weights文件之后,便开始加载测试图片数据。
dataloader = DataLoader(
ImageFolder(opt.image_folder, img_size=opt.img_size),
batch_size=opt.batch_size,
shuffle=False,
num_workers=opt.n_cpu,
)
ImageFolder是遍历文件夹下的测试图片,完整定义如下。ImageFolder中的__getitem__()函数会把图像归一化处理成img_size(默认416)大小的图片。
class ImageFolder(Dataset):
def __init__(self, folder_path, img_size=416):
#sorted(iterable[, cmp[, key[, reverse]]])
#sorted() 函数对所有可迭代的对象进行排序操作
##获取指定目录下的所有文件
self.files = sorted(glob.glob("%s/*.*" % folder_path))
self.img_size = img_size
def __getitem__(self, index):
img_path = self.files[index % len(self.files)]
# Extract image as PyTorch tensor
img = transforms.ToTensor()(Image.open(img_path))
# Pad to square resolution 变成方形
img, _ = pad_to_square(img, 0)
# Resize
img = resize(img, self.img_size)
return img_path, img
def __len__(self):
return len(self.files)
detections = model(input_imgs),把图像放进模型中,得到检测结果。这里是通过Darknet的forward()函数得到检测结果。其完整代码如下:
def forward(self, x, targets=None):
img_dim = x.shape[2]
loss = 0
layer_outputs, yolo_outputs = [], []
for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
x = module(x)
elif module_def["type"] == "route":
x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)
elif module_def["type"] == "shortcut":
layer_i = int(module_def["from"])
x = layer_outputs[-1] + layer_outputs[layer_i]
elif module_def["type"] == "yolo":
x, layer_loss = module[0](x, targets, img_dim)
loss += layer_loss
yolo_outputs.append(x)
layer_outputs.append(x)
yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
return yolo_outputs if targets is None else (loss, yolo_outputs)
通过遍历self.module_defs,与self.module_list,来完成网络的前向传播。
如果是"convolutional", "upsample", "maxpool"层,则直接使用前向传播即可。
如果是route层,则使用torch.cat()完成特征图的融合(拼接)。这里测试一张图:
这张图的尺寸为3*768*576,我们看看放进模型进行测试的时候,其shape是如何变化的。图像会根据cfg归一化成416*416.
接下来查看一下route层对应的ID以及shape:
图6
该模型的每一层的输出通过layer_outputs.append(x),保存在layer_outputs列表中,本次结构完全符合本文前面所论述的部分。如果layer只有一个值,那么该route层的输出就是该层。如果layer有两个值,则route层输出是对应两个层的特征图的融合。
shortcut层则特别清晰,直接对应两层相叠加即可:
elif module_def["type"] == "shortcut":
layer_i = int(module_def["from"])
x = layer_outputs[-1] + layer_outputs[layer_i]
yolo层有三个,分别对应的特征图大小为13*13,26*26,52*52。每一个特征图的每一个cell会预测3个bounding boxes。每一个bounding box会预测预测三类值:(1)每个框的位置(4个值,中心坐标tx和ty,,框的高度bh和宽度bw),(2)一个objectness prediction ,一个目标性评分(objectness score),即这块位置是目标的可能性有多大。这一步是在predict之前进行的,可以去掉不必要anchor,可以减少计算量(3)N个类别,COCO有80类,VOC有20类。
所以不难理解,在这里是COCO数据集,在13*13的特征图中,一共有13*13*3=507个bounding boxes,每一个bounding box预测(4+1+80=85)个值,用张量的形式表示为[1, 507, 85],那个1表示的是batch size。同理,其余张量的shape不难理解。
图7
至于如何得到这个张量的,主要需要了解yolo层的forward()和compute_grid_offsets(),其完整代码如下:
class YOLOLayer(nn.Module):
"""Detection layer"""
def __init__(self, anchors, num_classes, img_dim=416):
super(YOLOLayer, self).__init__()
self.anchors = anchors
self.num_anchors = len(anchors)
self.num_classes = num_classes
self.ignore_thres = 0.5
self.mse_loss = nn.MSELoss()
self.bce_loss = nn.BCELoss()
self.obj_scale = 1
self.noobj_scale = 100
self.metrics = {}
self.img_dim = img_dim
self.grid_size = 0 # grid size
def compute_grid_offsets(self, grid_size, cuda=True):
self.grid_size = grid_size
g = self.grid_size
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
self.stride = self.img_dim / self.grid_size
# Calculate offsets for each grid
#repeat 相当于一个broadcasting的机制repeat(*sizes)
#沿着指定的维度重复tensor。不同与expand(),本函数复制的是tensor中的数据。
self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))
def forward(self, x, targets=None, img_dim=None):
# Tensors for cuda support
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor
self.img_dim = img_dim
num_samples = x.size(0)
grid_size = x.size(2)
"""
所以在输入为416*416时,每个cell的三个anchor box为(116 ,90);
(156 ,198); (373 ,326)。16倍适合一般大小的物体,anchor box为
(30,61); (62,45); (59,119)。8倍的感受野最小,适合检测小目标,
因此anchor box为(10,13); (16,30); (33,23)。所以当输入为416*416时,
实际总共有(52*52+26*26+13*13)*3=10647个proposal box。
"""
prediction = (
x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
.permute(0, 1, 3, 4, 2)
.contiguous()
)
"""
print("----------------------------------")
print("num_samples:\n",num_samples)
print("self.num_anchors:\n",self.num_anchors)
print("self.grid_size:\n",self.grid_size)
print("grid_size:\n",grid_size)
"""
#print("x:\n",x)
#print("prediction:\n",prediction)
# Get outputs
#print("prediction\n:",prediction)
#print("prediction.shape:\n",prediction.shape)
x = torch.sigmoid(prediction[..., 0]) # Center x
y = torch.sigmoid(prediction[..., 1]) # Center y
w = prediction[..., 2] # Width
h = prediction[..., 3] # Height
pred_conf = torch.sigmoid(prediction[..., 4]) # Conf
pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred.
"""
print("anchors \n:",self.anchors)
print("x.shape\n:",x.shape)
print("y.shape\n:",y.shape)
print("w.shape\n:",w.shape)
print("h.shape\n:",h.shape)
print("pred_conf.shape\n:",pred_conf.shape)
print("pred_cls.shape\n:",pred_cls.shape)
"""
# If grid size does not match current we compute new offsets
if grid_size != self.grid_size:
print("··················different··················")
self.compute_grid_offsets(grid_size, cuda=x.is_cuda)
# Add offset and scale with anchors
pred_boxes = FloatTensor(prediction[..., :4].shape)
"""
print("prediction[..., :4].shape:\n",prediction[..., :4].shape)
print("self.grid_x:\n",self.grid_x)
print("self.grid_y:\n",self.grid_y)
print("self.anchor_w:\n",self.anchor_w)
print("self.anchor_h:\n",self.anchor_h)
print("self.anchors:\n",self.anchors)
print("self.stride:\n",self.stride)
"""
pred_boxes[..., 0] = x.data + self.grid_x
pred_boxes[..., 1] = y.data + self.grid_y
pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h
#torch.cat 按最后一维拼接
"""
print("pred_boxes.view(num_samples, -1, 4).shape:\n",pred_boxes.view(num_samples, -1, 4).shape)
print("pred_conf.view(num_samples, -1, 1).shape:\n",pred_conf.view(num_samples, -1, 1).shape)
print("pred_cls.view(num_samples, -1, self.num_classes).shape:\n",pred_cls.view(num_samples, -1, self.num_classes).shape)
"""
output = torch.cat(
(
pred_boxes.view(num_samples, -1, 4) * self.stride,
pred_conf.view(num_samples, -1, 1),
pred_cls.view(num_samples, -1, self.num_classes),
),
-1,
)
#print("output.shape:\n",output.shape)
#print("targets:\n",targets)
if targets is None:
return output, 0
else:
iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
pred_boxes=pred_boxes,
pred_cls=pred_cls,
target=targets,
anchors=self.scaled_anchors,
ignore_thres=self.ignore_thres,
)
# Loss : Mask outputs to ignore non-existing objects (except with conf. loss)
loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
# Metrics
cls_acc = 100 * class_mask[obj_mask].mean()
conf_obj = pred_conf[obj_mask].mean()
conf_noobj = pred_conf[noobj_mask].mean()
conf50 = (pred_conf > 0.5).float()
iou50 = (iou_scores > 0.5).float()
iou75 = (iou_scores > 0.75).float()
detected_mask = conf50 * class_mask * tconf
precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)
self.metrics = {
"loss": to_cpu(total_loss).item(),
"x": to_cpu(loss_x).item(),
"y": to_cpu(loss_y).item(),
"w": to_cpu(loss_w).item(),
"h": to_cpu(loss_h).item(),
"conf": to_cpu(loss_conf).item(),
"cls": to_cpu(loss_cls).item(),
"cls_acc": to_cpu(cls_acc).item(),
"recall50": to_cpu(recall50).item(),
"recall75": to_cpu(recall75).item(),
"precision": to_cpu(precision).item(),
"conf_obj": to_cpu(conf_obj).item(),
"conf_noobj": to_cpu(conf_noobj).item(),
"grid_size": grid_size,
}
return output, total_loss
num_samples是每一批有多少张图片,grid_size是特征图的大小。
图8
使用torch.view,改变输入yolo层的张量结构(shape),以prediction命名的张量进行预测处理。
图9
接下来是便是对边框进行预测,具体细节可以参考:https://blog.csdn.net/qq_34199326/article/details/84109828。x,y坐标都是使用了sigmoid函数进行处理,置信度和类别概率使用同样的方法处理。
论文中的边界框预测:
图10
Bounding boxes with dimension priors and location prediction. We predict the width and height of the box as offsets from cluster centroids. We predict the center coordinates of the box relative to the location of filter application using a sigmoid function. This figure blatantly self-plagiarized from.
x = torch.sigmoid(prediction[..., 0]) # Center x
y = torch.sigmoid(prediction[..., 1]) # Center y
w = prediction[..., 2] # Width
h = prediction[..., 3] # Height
pred_conf = torch.sigmoid(prediction[..., 4]) # Conf
pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred.
在3个尺度下,分别进行预测坐标、置信度、类别概率,这和在搭建yolo层一直,可对比图5。
图11
从图中我们发现grid_size和self.grid_size是不相等的,所以需要进行计算偏移,即compute_grid_offsets。完整代码在YOLOLayer中。
以gird=13为例。此时特征图是13*13,但原图shape尺寸是416*416,所以要把416*416评价切成13*13个方格,需要得到间隔(步距self.stride=416/13=32)。相应的并把anchor的尺寸进行缩放,即116/32=3.6250,90/32=2.8125。
图12
根据论文和图10可知,每一个小方格(cell),都会预测3个边界框,同样以gird=13为列。第一个小方格(cell),会预测3个边界框,每个边界框都有坐标+置信度+类别概率。所以以下代码中的x.shape=[1, 3, 13, 13],并且与y,w,h的shape一致。
print("x.shape=",x.shape)
print("x.data=\n",x.data)
pred_boxes[..., 0] = x.data + self.grid_x
pred_boxes[..., 1] = y.data + self.grid_y
pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h
#torch.cat 按最后一维拼接
同时由于在最后进行拼接,得到输出output 。其507=13*13*3,2028=26*26*3,8112=52*52*3不难理解。
图13
由于target=None(推演的时候设置为None),所以输出的total_loss=0。
# detections : 10647*85
detections = model(input_imgs)
#非极大值抑制
detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres)
在获取检测框之后,需要使用非极大值抑制来筛选框。即 detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres)
完整代码如下:
def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4):
"""
Removes detections with lower object confidence score than 'conf_thres' and performs
Non-Maximum Suppression to further filter detections.
Returns detections with shape:
(x1, y1, x2, y2, object_conf, class_score, class_pred)
"""
# From (center x, center y, width, height) to (x1, y1, x2, y2)
prediction[..., :4] = xywh2xyxy(prediction[..., :4])
output = [None for _ in range(len(prediction))]
for image_i, image_pred in enumerate(prediction):
# Filter out confidence scores below threshold
print("------------------------------")
#print("image_i:\n",image_i)
print("image_pred.shape:\n",image_pred.shape)
image_pred = image_pred[image_pred[:, 4] >= conf_thres]#保留大于置信度的边界框
print("image_pred.size(0)",image_pred.size(0))
# If none are remaining => process next image
if not image_pred.size(0):
continue
# Object confidence times class confidence
# .max(1) 返回每行tensor的最大值 .max(1)[0]具体的最大值 .max(1)[1] 最大值对应的索引
score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0]
"""
print("image_pred[:, 5:]:\n",image_pred[:, 5:])
print("image_pred[:, 5:].max(1):\n",image_pred[:, 5:].max(1))
print("image_pred[:, 5:].max(1)[0]:\n",image_pred[:, 5:].max(1)[0])
"""
# Sort by it
# 完成从大到小排序
image_pred = image_pred[(-score).argsort()]
"""
print("score:\n",score)
print("(-score).argsort():\n",(-score).argsort())
print("image_pred:\n",image_pred)\
"""
#若keepdim值为True,则在输出张量中,除了被操作的dim维度值降为1,其它维度与输入张量input相同。
#否则,dim维度相当于被执行torch.squeeze()维度压缩操作,导致此维度消失,
#最终输出张量会比输入张量少一个维度。
class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True)
#print("image_pred[:, 5:].max(1, keepdim=True):\n",image_pred[:, 5:].max(1, keepdim=True))
#print("image_pred[:, 5:].max(1, keepdim=False):\n",image_pred[:, 5:].max(1, keepdim=False))
detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1)
# Perform non-maximum suppression
#print("detections.size():\n",detections.size())
#print("detections.size(0):\n",detections.size(0))
#print("image_pred[:, :5]:\n",image_pred[:, :5])
keep_boxes = []
while detections.size(0):
#torch.unsqueeze()这个函数主要是对数据维度进行扩充
large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres
label_match = detections[0, -1] == detections[:, -1]
# Indices of boxes with lower confidence scores, large IOUs and matching labels
invalid = large_overlap & label_match
weights = detections[invalid, 4:5]#置信度
"""
print("1.detections:\n",detections)
print("large_overlap:\n",large_overlap)
print("detections[0, -1]:\n",detections[0, -1])
print("detections[:, -1]:\n",detections[:, -1])
print("label_match:\n",label_match)
print("invalid:\n",invalid)
print("weights:\n",weights)
"""
# Merge overlapping bboxes by order of confidence
detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
"""
print("detections[invalid, :4]:\n",detections[invalid, :4])
print("weights * detections[invalid, :4]:\n",weights * detections[invalid, :4])
print("detections[invalid, :4].sum(0):\n",detections[invalid, :4].sum(0))
print("weights * detections[invalid, :4].sum(0):\n",weights * detections[invalid, :4].sum(0))
print("2.detections:\n",detections)
"""
keep_boxes += [detections[0]]
detections = detections[~invalid]
#print("3.detections:\n",detections)
if keep_boxes:
output[image_i] = torch.stack(keep_boxes)
return output
非极大值抑制算法可参考:
https://www.cnblogs.com/makefile/p/nms.html
https://www.jianshu.com/p/d452b5615850
在经过非极大值抑制处理之后,在这里唯一有一点不同的是,这里采取了边界框“融合”的策略:
# Merge overlapping bboxes by order of confidence
detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
显示非极大值抑制过后的目标检测效果。
图14
至此第一部分检测分析完毕,剩下关于训练的部分还在更新中。
Pytorch | yolov3代码详解(二)(更新中)
已更完
Pytorch | yolov3代码详解(二)
https://blog.csdn.net/qq_24739717/article/details/96705055