def to_cpu(tensor):  # move a tensor to the CPU
    return tensor.detach().cpu()  # .detach() creates a tensor cut off from the autograd graph (no gradient); use tensor.clone() instead if the gradient must be kept
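A quick check of that distinction (an illustrative snippet, not part of utils.py; assumes import torch):

x = torch.ones(2, requires_grad=True)
y = to_cpu(x)      # y.requires_grad is False: detach() cut it from the graph
z = x.clone()      # z.requires_grad is True: clone() is differentiable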
def load_classes(path):  # read the file at `path` into a list, e.g. data/coco.names
    """
    Loads class labels at 'path'
    """
    fp = open(path, "r")
    names = fp.read().split("\n")[:-1]  # read the COCO class names line by line; [:-1] drops the empty string left after the trailing newline
    return names  # returns the list of COCO class names
person
bicycle
car
……  # 80 lines in total; the COCO dataset has 80 classes
Output:
['person', 'bicycle', 'car', 'motorbike', ……]
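Typical usage (the path is an assumption; adjust it to your repo layout):

classes = load_classes("data/coco.names")
print(len(classes))   # 80
print(classes[0])     # 'person'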
def weights_init_normal(m):  # custom weight-initialization function
    # m is one submodule of the network; weights_init_normal is called once per
    # submodule to initialize its parameters.
    # model.apply(fn) applies fn to each submodule.
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:  # != -1 means the class name contains "Conv" (find returns -1 when the substring is absent)
        torch.nn.init.normal_(m.weight.data, 0.0, 0.02)  # fill every element of m.weight.data with a draw from the normal distribution N(0, 0.02)
    elif classname.find("BatchNorm2d") != -1:
        torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
        torch.nn.init.constant_(m.bias.data, 0.0)  # set m.bias.data to 0
The weights_init_normal function defined here is passed to model.apply(fn) as the argument fn.
It is used in train.py as follows:
model.apply(weights_init_normal)  # initialize the model parameters in train.py
apply is a standard method of torch's Module class, defined roughly as:
class Module(object):
    def apply(self, fn):
        for module in self.children():
            module.apply(fn)  # recurse into every child module
        fn(self)  # fn is the weights_init_normal function we defined
        return self
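A minimal sketch of the traversal (hypothetical layers; assumes torch.nn is imported as nn):

import torch.nn as nn

model = nn.Sequential(
    nn.Conv2d(3, 16, 3),  # class name "Conv2d" contains "Conv" -> weights ~ N(0, 0.02)
    nn.BatchNorm2d(16),   # -> weights ~ N(1, 0.02), bias set to 0
    nn.Linear(16, 10),    # matches neither branch -> left at its default init
)
model.apply(weights_init_normal)  # visits the children first, then the Sequential itself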
def rescale_boxes(boxes, current_dim, original_shape):  # map boxes from network-input coordinates back to original-image coordinates
    """ Rescales bounding boxes to the original shape
    boxes: boxes in the coordinate system of the (letterboxed) image fed to the network, not of the original image, so they must be converted back
    current_dim: usually 416
    original_shape: the original COCO images all have different shapes
    """
    orig_h, orig_w = original_shape
    # The amount of padding that was added
    pad_x = max(orig_h - orig_w, 0) * (current_dim / max(original_shape))  # total horizontal padding (left + right), added when the image is taller than it is wide
    pad_y = max(orig_w - orig_h, 0) * (current_dim / max(original_shape))  # total vertical padding (top + bottom)
    # Image height and width after padding is removed
    unpad_h = current_dim - pad_y
    unpad_w = current_dim - pad_x
    # Rescale bounding boxes to dimension of original image
    boxes[:, 0] = ((boxes[:, 0] - pad_x // 2) / unpad_w) * orig_w
    boxes[:, 1] = ((boxes[:, 1] - pad_y // 2) / unpad_h) * orig_h
    boxes[:, 2] = ((boxes[:, 2] - pad_x // 2) / unpad_w) * orig_w
    boxes[:, 3] = ((boxes[:, 3] - pad_y // 2) / unpad_h) * orig_h
    return boxes
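A worked example with made-up numbers: a 375x500 (h, w) image letterboxed to 416x416. The long side (500) scales to 416, the short side to 375 * 416/500 = 312, so pad_y = 416 - 312 = 104 (52 on top, 52 on the bottom) and pad_x = 0:

boxes = torch.tensor([[0.0, 52.0, 416.0, 364.0]])  # a box spanning the whole image in 416-space
rescale_boxes(boxes, 416, (375, 500))
# -> tensor([[  0.,   0., 500., 375.]]): it recovers the full original image extent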
def xywh2xyxy(x):  # convert (center x, center y, width, height) boxes to (top-left, bottom-right) coordinates
    y = x.new(x.shape)  # allocate a new tensor with the same shape and type
    y[..., 0] = x[..., 0] - x[..., 2] / 2
    y[..., 1] = x[..., 1] - x[..., 3] / 2
    y[..., 2] = x[..., 0] + x[..., 2] / 2
    y[..., 3] = x[..., 1] + x[..., 3] / 2
    return y
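A quick sanity check (hypothetical values): a 10x10 box centered at (5, 5):

b = torch.tensor([[5.0, 5.0, 10.0, 10.0]])
xywh2xyxy(b)  # -> tensor([[ 0.,  0., 10., 10.]])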
""" Compute the average precision, given the recall and precision curves.
Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
# Arguments
tp: True positives (list).
conf: Objectness value from 0-1 (list).
pred_cls: Predicted object classes (list).
target_cls: True object classes (list).
# Returns
The average precision as computed in py-faster-rcnn.
"""
""" Compute the average precision, given the recall and precision curves.
Code originally from https://github.com/rbgirshick/py-faster-rcnn.
# Arguments
recall: The recall curve (list).
precision: The precision curve (list).
# Returns
The average precision as computed in py-faster-rcnn.
"""
def bbox_wh_iou(wh1, wh2):  # IoU of boxes given only width and height, as if aligned at a common corner
    wh2 = wh2.t()  # transpose so that wh2[0] holds all the widths and wh2[1] all the heights; the data itself is unchanged
    w1, h1 = wh1[0], wh1[1]
    w2, h2 = wh2[0], wh2[1]
    inter_area = torch.min(w1, w2) * torch.min(h1, h2)  # take the smaller width and the smaller height
    union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area
    return inter_area / union_area
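For example (hypothetical anchor and targets, widths/heights only):

anchor = torch.tensor([10.0, 20.0])
targets = torch.tensor([[10.0, 20.0], [20.0, 10.0]])
bbox_wh_iou(anchor, targets)
# -> tensor([1.0000, 0.3333]): an identical shape gives 1; with w/h swapped the
#    intersection is 10*10 = 100 and the union 200 + 200 - 100 = 300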
def bbox_iou(box1, box2, x1y1x2y2=True):
    """
    Returns the IoU of two bounding boxes
    """
    if not x1y1x2y2:  # convert from the xywh representation to the xyxy representation
        # Transform from center and width to exact coordinates
        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
    else:
        # Get the coordinates of bounding boxes
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
    # Get the coordinates of the intersection rectangle
    inter_rect_x1 = torch.max(b1_x1, b2_x1)
    inter_rect_y1 = torch.max(b1_y1, b2_y1)
    inter_rect_x2 = torch.min(b1_x2, b2_x2)
    inter_rect_y2 = torch.min(b1_y2, b2_y2)
    # Intersection area; torch.clamp truncates: values below min are set to min
    inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp(
        inter_rect_y2 - inter_rect_y1 + 1, min=0
    )
    # Union Area
    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)  # + 1 because the boundary itself is one pixel wide; note this convention
    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
    iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
    return iou
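A sanity check (hypothetical boxes in x1y1x2y2 form):

a = torch.tensor([[0.0, 0.0, 9.0, 9.0]])
b = torch.tensor([[0.0, 0.0, 9.0, 9.0],
                  [5.0, 5.0, 14.0, 14.0]])
bbox_iou(a, b)
# -> tensor([1.0000, 0.1429]): with the +1 convention each box covers 10x10 = 100
#    cells; the second pair overlaps 5x5 = 25, union = 100 + 100 - 25 = 175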
def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4):  # non-maximum suppression
    """
    Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression to further filter detections.
    Returns detections with shape:
        (x1, y1, x2, y2, object_conf, class_score, class_pred)
    """
    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])  # for every predicted box of every image in the batch, convert only the first four values (the coordinates); everything else is unchanged
# prediction = tensor([[
# [1.2897e+01, 1.8118e+01, 1.7194e+02, ..., 1.1081e-02,5.1385e-03, 3.2795e-03],
# [5.7479e+01, 2.5666e+01, 1.1065e+02, ..., 6.1178e-03,3.3721e-03, 9.3959e-04],
# [7.7897e+01, 2.4972e+01, 1.4381e+02, ..., 1.4635e-03,3.0854e-03, 8.7795e-04],
# ...,
# [3.9590e+02, 4.1341e+02, 7.7788e+01, ..., 3.0758e-04, 1.1135e-04, 1.2358e-04],
# [4.0272e+02, 4.1227e+02, 4.9530e+01, ..., 3.6771e-04, 3.1682e-04, 4.5721e-04],
# [4.1132e+02, 4.1317e+02, 7.5396e+01, ..., 1.0301e-03, 9.6391e-04, 1.7760e-03]]])
    output = [None for _ in range(len(prediction))]  # one slot per image in the batch
    for image_i, image_pred in enumerate(prediction):  # iterate over the images; the loop body processes one image at a time
        # Filter out confidence scores below threshold
        image_pred = image_pred[image_pred[:, 4] >= conf_thres]  # drop boxes whose objectness confidence is below the threshold; columns 0-3 are coordinates, 4 is objectness, 5: are the classes
        # If none are remaining => process next image
        if not image_pred.size(0):  # if no box passes the threshold, move on to the next image
            continue
        # Object confidence times class confidence
        score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0]  # .max(1) takes the maximum along dim 1, i.e. for each row (box) the largest of the 80 class confidences; [0] selects the values
        # objectness confidence of the box * best class confidence of the box (one value per box, i.e. per row)
        # Sort by it
        image_pred = image_pred[(-score).argsort()]  # .argsort() returns ascending-order indices, so sorting the negated scores orders the boxes from highest to lowest score
# image_pred = tensor([
# [6.9979e+01, 1.7314e+02, 1.7263e+02, 3.4342e+02, 9.9952e-01, 2.7360e-06,
# 3.1124e-05, 7.5148e-08, 2.1126e-07, 7.7389e-09, 2.3194e-08, 5.6478e-10,
# 3.2973e-08, 4.0248e-07, 9.3334e-09, 9.8726e-08, 1.9962e-08, 1.4138e-08,
# 4.4086e-07, 1.7810e-05, 3.2029e-03, 9.9335e-01, 1.4822e-06, 1.3331e-05,
# 2.8374e-06, 2.3131e-07, 3.8589e-06, 6.9625e-08, 1.2171e-06, 3.3805e-06,
# 1.5626e-07, 9.1728e-08, 8.7323e-08, 1.3251e-07, 1.4588e-06, 2.2201e-06,
# 7.4246e-08, 1.9406e-07, 1.9214e-07, 1.8509e-08, 9.3417e-08, 2.9386e-06,
# 2.2876e-07, 2.0008e-07, 1.7397e-08, 1.9292e-09, 9.9117e-09, 1.0033e-09,
# 1.1846e-08, 4.8930e-08, 1.7767e-09, 2.1066e-07, 1.0837e-07, 1.8093e-07,
# 2.0299e-09, 4.5130e-07, 2.2242e-09, 1.1962e-07, 2.4569e-05, 2.8944e-07,
# 3.8133e-08, 1.9768e-07, 5.1894e-07, 1.6013e-06, 5.0550e-06, 6.7384e-08,
# 3.8338e-08, 1.3275e-07, 2.6257e-08, 3.8255e-08, 2.3908e-08, 1.3826e-07,
# 2.6799e-09, 3.1228e-08, 6.6355e-08, 2.0321e-08, 4.7900e-07, 1.1243e-08,
# 1.9466e-08, 2.2574e-07, 5.0917e-10, 2.6649e-08, 7.6042e-07, 7.5514e-08,
# 1.9578e-08],
# [……],
# [……],
# ……])
        class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True)  # for each row, the maximum over columns 5: (the class confidences) and its index; the index of the maximum is the class id
# class_confs, class_preds =
# tensor([[0.9933],
# [0.9998],
# [0.9996],
# [0.9880],
# [0.9423],
# [0.9894],
# [0.9210],
# [0.9850],
# [0.9857]])
# tensor([[16],
# [ 1],
# [ 1],
# [16],
# [ 7],
# [16],
# [ 7],
# [16],
# [16]])
#
        detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1)  # keep only the largest of the 80 class confidences (class_confs) and its index (class_preds); the other 79 classes are discarded
        # .cat concatenates along dim 1: columns 0-4 (image_pred[:, :5]) are the 4 coordinates plus objectness, column 5 (class_confs) is the class confidence, column 6 (class_preds) is the class id
# detections = tensor([
# [ 69.9790, 173.1447, 172.6293, 343.4247, 0.9995, 0.9933, 16.0000],
# [ 92.8356, 111.2542, 305.3051, 294.4642, 0.9927, 0.9998, 1.0000],
# [ 63.1184, 116.6243, 311.2260, 294.3138, 0.9900, 0.9996, 1.0000],
# [ 68.8240, 157.3179, 172.9849, 340.8053, 0.9996, 0.9880, 16.0000],
# [254.1283, 98.5848, 373.9042, 144.4332, 0.9970, 0.9423, 7.0000],
# [ 70.7442, 184.7154, 168.4275, 335.5417, 0.9453, 0.9894, 16.0000],
# [253.6593, 95.9781, 372.7262, 144.6945, 0.9973, 0.9210, 7.0000],
# [ 70.6210, 173.2062, 169.2387, 327.5236, 0.9228, 0.9850, 16.0000],
# [ 72.2745, 174.2344, 167.7682, 325.6776, 0.8158, 0.9857, 16.0000]])
        # Perform non-maximum suppression
        keep_boxes = []
        while detections.size(0):
            # detections[0, :4].unsqueeze(0) is tensor([[ 69.9790, 173.1447, 172.6293, 343.4247]])
            large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres  # IoU of every box with box[0], thresholded
            # detections[0, :4]: the four coordinates of the first detection; .unsqueeze(0) adds a dimension so the shapes broadcast in the computation
            # detections[:, :4]: the four coordinates of every detection in the image
            label_match = detections[0, -1] == detections[:, -1]  # whether each box has the same class as box[0]
            # label_match is tensor([1, 0, 0, 1, 0, 1, 0, 1, 1], dtype=torch.uint8)
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match  # AND of the two masks: boxes that satisfy both the IoU threshold and the class match with box[0]
            # invalid = tensor([1, 0, 0, 1, 0, 1, 0, 1, 1], dtype=torch.uint8)
            # i.e. the boxes at indices 3, 5, 7, 8 have the same class as box 0 and exceed the IoU threshold
            weights = detections[invalid, 4:5]  # use the objectness confidence of each box as its weight (how reliable it is)
            # weights = tensor([[0.9995], [0.9996], [0.9453], [0.9228], [0.8158]])
            # Merge overlapping bboxes by order of confidence
            detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()  # confidence-weighted average of the coordinates
# detections =
#tensor([[ 70.4133, 172.3040, 170.3421, 335.0493, 0.9995, 0.9933, 16.0000],
# [ 92.8356, 111.2542, 305.3051, 294.4642, 0.9927, 0.9998, 1.0000],
# [ 63.1184, 116.6243, 311.2260, 294.3138, 0.9900, 0.9996, 1.0000],
# [ 68.8240, 157.3179, 172.9849, 340.8053, 0.9996, 0.9880, 16.0000],
# [254.1283, 98.5848, 373.9042, 144.4332, 0.9970, 0.9423, 7.0000],
# [ 70.7442, 184.7154, 168.4275, 335.5417, 0.9453, 0.9894, 16.0000],
# [253.6593, 95.9781, 372.7262, 144.6945, 0.9973, 0.9210, 7.0000],
# [ 70.6210, 173.2062, 169.2387, 327.5236, 0.9228, 0.9850, 16.0000],
# [ 72.2745, 174.2344, 167.7682, 325.6776, 0.8158, 0.9857, 16.0000]])
            keep_boxes += [detections[0]]  # append the merged box just produced to keep_boxes; in the end keep_boxes holds every box we keep
            # keep_boxes = [tensor([ 70.4133, 172.3040, 170.3421, 335.0493, 0.9995, 0.9933, 16.0000])]
            detections = detections[~invalid]  # invalid is a mask of the boxes that overlap box[0], e.g. [1, 0, 0, 1, 1, 0];
            # negating it and re-assigning keeps only the boxes not yet analyzed, so the loop keeps finding and merging overlaps
            # until detections contains no box at all
# detections = tensor([[ 92.8356, 111.2542, 305.3051, 294.4642, 0.9927, 0.9998, 1.0000],
# [ 63.1184, 116.6243, 311.2260, 294.3138, 0.9900, 0.9996, 1.0000],
# [254.1283, 98.5848, 373.9042, 144.4332, 0.9970, 0.9423, 7.0000],
# [253.6593, 95.9781, 372.7262, 144.6945, 0.9973, 0.9210, 7.0000]])
        if keep_boxes:
            output[image_i] = torch.stack(keep_boxes)  # stack the keep_boxes (several boxes) into the detection result of this one image
            # output[image_i] is the detection result of one image
    return output  # the detection results of all images in the batch
Arguments of non_max_suppression:
prediction: the predictions for a batch of images; each image yields many candidate boxes, and each box is a row of values: x, y, w, h, objectness confidence, then 80 class confidences, e.g. [[120, 120, 15, 15, 0.8, 0.1, 0.2, 0.3, ……], [……], [……]]. Each box has a single objectness confidence and 80 class confidences; the final score is the product of the objectness confidence and the best class confidence.
conf_thres: confidence threshold, applied to the objectness confidence of each box (column 4); the objectness-times-class product is then used only for sorting.
nms_thres: IoU threshold; boxes of the same class are merged only when their IoU with the current best box is at least this value.
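Putting it together, a usage sketch (model, imgs and the surrounding loop are placeholders, not defined in these notes):

with torch.no_grad():
    prediction = model(imgs)  # e.g. shape (batch, num_boxes, 85) for COCO: 4 coords + objectness + 80 classes
    detections = non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4)
for image_i, det in enumerate(detections):
    if det is None:  # no box of this image survived the confidence filter
        continue
    for x1, y1, x2, y2, obj_conf, class_conf, class_pred in det:
        print(image_i, int(class_pred), float(obj_conf * class_conf))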