1.这部分代码,看了比较长时间。原因是以前一直不懂目标检测,感觉一学就废
2.文章思想部分借鉴了很多大佬
3.代码部分直接看的Bubbliiing佬
4.我只记录我看懂的部分,博客写的不好轻喷,欢迎指正
def get_target(self, l, targets, anchors, in_h, in_w):
#-----------------------------------------------------#
# 计算一共有多少张图片,targets是归一化之后的坐标
#-----------------------------------------------------#
bs = len(targets)
#-----------------------------------------------------#
# 用于选取哪些先验框不包含物体
#-----------------------------------------------------#
noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
#-----------------------------------------------------#
# 让网络更加去关注小目标
#-----------------------------------------------------#
box_loss_scale = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
#-----------------------------------------------------#
# batch_size, 3, 13, 13, 5 + num_classes
# zj-y_true就是将真实框的tensor变成这样
#-----------------------------------------------------#
y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad = False)
for b in range(bs):
#zj-当前图片目标数为0就不用进行处理了
if len(targets[b])==0:
continue
#构造batch_target,下面用来存储针对于当前预测特征层的真实值的状态
batch_target = torch.zeros_like(targets[b])
#-------------------------------------------------------#
# 计算出正样本在特征层上的中心点
# zj---将正式框从归一化状态(targets的归一化是除以图片的宽和高得到的)调整为相对与当前预测特征层的状态
# zj---计算方式是x,y,w,h的状态
#-------------------------------------------------------#
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target[:, 4] = targets[b][:, 4]
batch_target = batch_target.cpu()
#-------------------------------------------------------#
# 将真实框转换一个形式
# num_true_box, 4
#-------------------------------------------------------#
gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1))
#-------------------------------------------------------#
# 将先验框转换一个形式
# 9, 4
#-------------------------------------------------------#
anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1))
#-------------------------------------------------------#
# 计算交并比
# self.calculate_iou(gt_box, anchor_shapes) = [num_true_box, 9]每一个真实框和9个先验框的重合情况
# best_ns:
# [每个真实框最大的重合度max_iou, 每一个真实框最重合的先验框的序号]
#-------------------------------------------------------#
best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1)
for t, best_n in enumerate(best_ns):
if best_n not in self.anchors_mask[l]:
continue
#----------------------------------------#
# 判断这个先验框是当前特征点的哪一个先验框
#----------------------------------------#
k = self.anchors_mask[l].index(best_n)
#----------------------------------------#
# 获得真实框属于哪个网格点
#----------------------------------------#
i = torch.floor(batch_target[t, 0]).long()
j = torch.floor(batch_target[t, 1]).long()
#----------------------------------------#
# 取出真实框的种类
#----------------------------------------#
c = batch_target[t, 4].long()
#----------------------------------------#
# zj-noobj_mask代表无目标的特征点,初始化全为1
#----------------------------------------#
noobj_mask[b, k, j, i] = 0
#----------------------------------------#
# tx、ty代表中心调整参数的真实值
#----------------------------------------#
y_true[b, k, j, i, 0] = batch_target[t, 0]
y_true[b, k, j, i, 1] = batch_target[t, 1]
y_true[b, k, j, i, 2] = batch_target[t, 2]
y_true[b, k, j, i, 3] = batch_target[t, 3]
y_true[b, k, j, i, 4] = 1
y_true[b, k, j, i, c + 5] = 1
#----------------------------------------#
# 用于获得xywh的比例
# 大目标loss权重小,小目标loss权重大
#----------------------------------------#
box_loss_scale[b, k, j, i] = batch_target[t, 2] * batch_target[t, 3] / in_w / in_h
return y_true, noobj_mask, box_loss_scale
上面这一部分,我感觉最核心的部分,是对真实值进行编码。真实值编码的结果就是将真实值编码成网络预测出的样子。
1.首先将归一化后的真实值targets,调整成相对于网络当前预测特征层的数值,并且存入到batch_target 这个tensor张量之中。
2.对目标物体真实值(相对于当前预测特征层)的宽高取出来,除此之外将使用k-means聚类得到的先验框的宽高取出来。(其实只需要取出相对于当前预测特征层的锚框宽高取出即可,但是b佬这里都取出来了)。将取出的真实框的宽高和锚框的宽高进行iou计算。得出iou值最大的那个锚框(也可以设定某一个阀值,只需要大于阀值也可)。
3.以1*3*13*13*25(batchsize,锚框种类,宽,长,参数(4个回归参数+置信度+20个类别概率)),接上面。如果第二个锚框和真实值的iou最大。y_true[1,1,真实框所在网格的宽,真实框所在网格的高,25)。将真实框的相对于预测特征层值填入y_true中。
def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask):
#-----------------------------------------------------#
# 计算一共有多少张图片
#-----------------------------------------------------#
bs = len(targets)
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
#-----------------------------------------------------#
# 生成网格,先验框中心,网格左上角
#-----------------------------------------------------#
grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type(FloatTensor)
grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type(FloatTensor)
# 生成先验框的宽高
scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]
anchor_w = FloatTensor(scaled_anchors_l).index_select(1, LongTensor([0]))
anchor_h = FloatTensor(scaled_anchors_l).index_select(1, LongTensor([1]))
anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
#-------------------------------------------------------#
# 计算调整后的先验框中心与宽高
#-------------------------------------------------------#
pred_boxes_x = torch.unsqueeze(x + grid_x, -1)
pred_boxes_y = torch.unsqueeze(y + grid_y, -1)
pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1)
pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1)
pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim = -1)
for b in range(bs):
#-------------------------------------------------------#
# 将预测结果转换一个形式
# pred_boxes_for_ignore num_anchors, 4
#-------------------------------------------------------#
pred_boxes_for_ignore = pred_boxes[b].view(-1, 4)
#-------------------------------------------------------#
# 计算真实框,并把真实框转换成相对于特征层的大小
# gt_box num_true_box, 4
#-------------------------------------------------------#
if len(targets[b]) > 0:
batch_target = torch.zeros_like(targets[b])
#-------------------------------------------------------#
# 计算出正样本在特征层上的中心点
#-------------------------------------------------------#
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target = batch_target[:, :4]
#-------------------------------------------------------#
# 计算交并比
# anch_ious num_true_box, num_anchors
#-------------------------------------------------------#
anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore)
#-------------------------------------------------------#
# 每个先验框对应真实框的最大重合度
# anch_ious_max num_anchors
#-------------------------------------------------------#
anch_ious_max, _ = torch.max(anch_ious, dim = 0)
anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3])
noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0
return noobj_mask, pred_boxes
这一部分代码的核心思想就是筛选出正样本和负样本(其实正样本没啥好筛选的)
首先就是计算在1*3*13*13个网络中会产生多少个锚框,显然是507个。
将507个先验框的的x,y,w,h分别取出来与每个真实框的x,y,w,h进行iou计算。如果iou大于某个阀值就将noobj_mask里面对应位置置为0,代表该先验框既不是正样本,也不是负样本。
def forward(self, l, input, targets=None):
#----------------------------------------------------#
# l 代表使用的是第几个有效特征层
# input的shape为 bs, 3*(5+num_classes), 13, 13
# bs, 3*(5+num_classes), 26, 26
# bs, 3*(5+num_classes), 52, 52
# targets 真实框的标签情况 [batch_size, num_gt, 5]
#----------------------------------------------------#
#--------------------------------#
# 获得图片数量,特征层的高和宽
#--------------------------------#
bs = input.size(0)
in_h = input.size(2)
in_w = input.size(3)
#-----------------------------------------------------------------------#
# 计算步长
# 每一个特征点对应原来的图片上多少个像素点
#
# 如果特征层为13x13的话,一个特征点就对应原来的图片上的32个像素点
# 如果特征层为26x26的话,一个特征点就对应原来的图片上的16个像素点
# 如果特征层为52x52的话,一个特征点就对应原来的图片上的8个像素点
# stride_h = stride_w = 32、16、8
#-----------------------------------------------------------------------#
stride_h = self.input_shape[0] / in_h
stride_w = self.input_shape[1] / in_w
#-------------------------------------------------#
# 此时获得的scaled_anchors大小是相对于特征层的
#-------------------------------------------------#
scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
#-----------------------------------------------#
# 输入的input一共有三个,他们的shape分别是
# bs, 3 * (5+num_classes), 13, 13 => bs, 3, 5 + num_classes, 13, 13 => batch_size, 3, 13, 13, 5 + num_classes
# batch_size, 3, 13, 13, 5 + num_classes
# batch_size, 3, 26, 26, 5 + num_classes
# batch_size, 3, 52, 52, 5 + num_classes
#-----------------------------------------------#
prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
#-----------------------------------------------#
# 先验框的中心位置的调整参数
#-----------------------------------------------#
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
#-----------------------------------------------#
# 先验框的宽高调整参数
#-----------------------------------------------#
w = prediction[..., 2]
h = prediction[..., 3]
#-----------------------------------------------#
# 获得置信度,是否有物体
#-----------------------------------------------#
conf = torch.sigmoid(prediction[..., 4])
#-----------------------------------------------#
# 种类置信度
#-----------------------------------------------#
pred_cls = torch.sigmoid(prediction[..., 5:])
#-----------------------------------------------#
# 获得网络应该有的预测结果
# noobj_mask代表无目标的特征点,无目标为0
#-----------------------------------------------#
y_true, noobj_mask, box_loss_scale = self.get_target(l, targets, scaled_anchors, in_h, in_w)
#---------------------------------------------------------------#
# 将预测结果进行解码,判断预测结果和真实值的重合程度
# 如果重合程度过大则忽略,因为这些特征点属于预测比较准确的特征点
# 作为负样本不合适
#----------------------------------------------------------------#
noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask)
if self.cuda:
y_true = y_true.cuda()
noobj_mask = noobj_mask.cuda()
box_loss_scale = box_loss_scale.cuda()
#-----------------------------------------------------------#
# reshape_y_true[...,2:3]和reshape_y_true[...,3:4]
# 表示真实框的宽高,二者均在0-1之间
# 真实框越大,比重越小,小框的比重更大。
#-----------------------------------------------------------#
box_loss_scale = 2 - box_loss_scale
#---------------------------------------------------------------#
# 计算预测结果和真实结果的CIOU
#----------------------------------------------------------------#
ciou = (1 - self.box_ciou(pred_boxes[y_true[..., 4] == 1], y_true[..., :4][y_true[..., 4] == 1])) * box_loss_scale[y_true[..., 4] == 1]
loss_loc = torch.sum(ciou)
#-----------------------------------------------------------#
# zj-计算置信度的loss= 正样本的置信度损失 + 负样本的置信度损失
#-----------------------------------------------------------#
loss_conf = torch.sum(self.BCELoss(conf, y_true[..., 4]) * y_true[..., 4]) + \
torch.sum(self.BCELoss(conf, y_true[..., 4]) * noobj_mask)
# zj-这里根本没有做标签平滑(用来防止在训练集上过拟合的操作)
loss_cls = torch.sum(self.BCELoss(pred_cls[y_true[..., 4] == 1], self.smooth_labels(y_true[..., 5:][y_true[..., 4] == 1], self.label_smoothing, self.num_classes)))
loss = loss_loc + loss_conf + loss_cls
# zj-num_pos计算正样本的数量
num_pos = torch.sum(y_true[..., 4])
num_pos = torch.max(num_pos, torch.ones_like(num_pos))
return loss, num_pos
这部分是损失函数的部分,
首先是x,y,w,h的回归损失,使用的是ciou损失。显然,只计算了正样本的回归损失。除此之外通过box_loss_scale来增加小样本的损失,减小大样本损失。最后torch.sum 得到损失值。从tensor取出真实值,进行计算。从tensor取出正样本值,进行计算。
其次是置信度损失,采用了Bceloss 。计算了正样本和负样本损失。这里noobj_mask的作用了,可以将计算的非正本和非负样本对应位置的损失进行置0。最后torch.sum 得到损失值。
未从tensor中取出,根据真实值编码在对应位置进行计算。
最后是分类损失,分类损失采用了Bceloss。计算了正样本损失。从tensor取出真实值,进行计算。从tensor取出正样本值,进行计算。同时使用了标签平滑,防止训练过程过拟合。标签平滑的意思比如真实值编码为[1,0,0],现在将真实值变为[0.95,0.025,0.025]其实就是对训练过程进行惩罚。
import torch
import torch.nn as nn
import math
import numpy as np
class YOLOLoss(nn.Module):
def __init__(self, anchors, num_classes, input_shape, cuda, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]], label_smoothing = 0):
super(YOLOLoss, self).__init__()
#-----------------------------------------------------------#
# 13x13的特征层对应的anchor是[142, 110],[192, 243],[459, 401]
# 26x26的特征层对应的anchor是[36, 75],[76, 55],[72, 146]
# 52x52的特征层对应的anchor是[12, 16],[19, 36],[40, 28]
#-----------------------------------------------------------#
self.anchors = anchors
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
self.anchors_mask = anchors_mask
self.label_smoothing = label_smoothing
self.ignore_threshold = 0.5
self.cuda = cuda
def clip_by_tensor(self, t, t_min, t_max):
t = t.float()
result = (t >= t_min).float() * t + (t < t_min).float() * t_min
result = (result <= t_max).float() * result + (result > t_max).float() * t_max
return result
def MSELoss(self, pred, target):
return torch.pow(pred - target, 2)
def BCELoss(self, pred, target):
epsilon = 1e-7
pred = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon)
output = - target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred)
return output
def box_ciou(self, b1, b2):
"""
输入为:
----------
b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
返回为:
-------
ciou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
"""
#----------------------------------------------------#
# 求出预测框左上角右下角
#----------------------------------------------------#
b1_xy = b1[..., :2]
b1_wh = b1[..., 2:4]
b1_wh_half = b1_wh/2.
b1_mins = b1_xy - b1_wh_half
b1_maxes = b1_xy + b1_wh_half
#----------------------------------------------------#
# 求出真实框左上角右下角
#----------------------------------------------------#
b2_xy = b2[..., :2]
b2_wh = b2[..., 2:4]
b2_wh_half = b2_wh/2.
b2_mins = b2_xy - b2_wh_half
b2_maxes = b2_xy + b2_wh_half
#----------------------------------------------------#
# 求真实框和预测框所有的iou
# zj-首先计算两个矩形的外包边
#----------------------------------------------------#
intersect_mins = torch.max(b1_mins, b2_mins)
intersect_maxes = torch.min(b1_maxes, b2_maxes)
intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes))
# intersect_area为相交矩形的面积
intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
b1_area = b1_wh[..., 0] * b1_wh[..., 1]
b2_area = b2_wh[..., 0] * b2_wh[..., 1]
union_area = b1_area + b2_area - intersect_area
iou = intersect_area / torch.clamp(union_area,min = 1e-6)
#----------------------------------------------------#
# 计算中心的差距
#----------------------------------------------------#
center_distance = torch.sum(torch.pow((b1_xy - b2_xy), 2), axis=-1)
#----------------------------------------------------#
# 找到包裹两个框的最小框的左上角和右下角
#----------------------------------------------------#
enclose_mins = torch.min(b1_mins, b2_mins)
enclose_maxes = torch.max(b1_maxes, b2_maxes)
enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes))
#----------------------------------------------------#
# 计算对角线距离
#----------------------------------------------------#
enclose_diagonal = torch.sum(torch.pow(enclose_wh,2), axis=-1)
ciou = iou - 1.0 * (center_distance) / torch.clamp(enclose_diagonal,min = 1e-6)
v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(b1_wh[..., 0] / torch.clamp(b1_wh[..., 1],min = 1e-6)) - torch.atan(b2_wh[..., 0] / torch.clamp(b2_wh[..., 1], min = 1e-6))), 2)
alpha = v / torch.clamp((1.0 - iou + v), min=1e-6)
ciou = ciou - alpha * v
return ciou
#---------------------------------------------------#
# 平滑标签
#---------------------------------------------------#
def smooth_labels(self, y_true, label_smoothing, num_classes):
return y_true * (1.0 - label_smoothing) + label_smoothing / num_classes
def forward(self, l, input, targets=None):
#----------------------------------------------------#
# l 代表使用的是第几个有效特征层
# input的shape为 bs, 3*(5+num_classes), 13, 13
# bs, 3*(5+num_classes), 26, 26
# bs, 3*(5+num_classes), 52, 52
# targets 真实框的标签情况 [batch_size, num_gt, 5]
#----------------------------------------------------#
#--------------------------------#
# 获得图片数量,特征层的高和宽
#--------------------------------#
bs = input.size(0)
in_h = input.size(2)
in_w = input.size(3)
#-----------------------------------------------------------------------#
# 计算步长
# 每一个特征点对应原来的图片上多少个像素点
#
# 如果特征层为13x13的话,一个特征点就对应原来的图片上的32个像素点
# 如果特征层为26x26的话,一个特征点就对应原来的图片上的16个像素点
# 如果特征层为52x52的话,一个特征点就对应原来的图片上的8个像素点
# stride_h = stride_w = 32、16、8
#-----------------------------------------------------------------------#
stride_h = self.input_shape[0] / in_h
stride_w = self.input_shape[1] / in_w
#-------------------------------------------------#
# 此时获得的scaled_anchors大小是相对于特征层的
#-------------------------------------------------#
scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
#-----------------------------------------------#
# 输入的input一共有三个,他们的shape分别是
# bs, 3 * (5+num_classes), 13, 13 => bs, 3, 5 + num_classes, 13, 13 => batch_size, 3, 13, 13, 5 + num_classes
# batch_size, 3, 13, 13, 5 + num_classes
# batch_size, 3, 26, 26, 5 + num_classes
# batch_size, 3, 52, 52, 5 + num_classes
#-----------------------------------------------#
prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
#-----------------------------------------------#
# 先验框的中心位置的调整参数
#-----------------------------------------------#
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
#-----------------------------------------------#
# 先验框的宽高调整参数
#-----------------------------------------------#
w = prediction[..., 2]
h = prediction[..., 3]
#-----------------------------------------------#
# 获得置信度,是否有物体
#-----------------------------------------------#
conf = torch.sigmoid(prediction[..., 4])
#-----------------------------------------------#
# 种类置信度
#-----------------------------------------------#
pred_cls = torch.sigmoid(prediction[..., 5:])
#-----------------------------------------------#
# 获得网络应该有的预测结果
# noobj_mask代表无目标的特征点,无目标为0
#-----------------------------------------------#
y_true, noobj_mask, box_loss_scale = self.get_target(l, targets, scaled_anchors, in_h, in_w)
#---------------------------------------------------------------#
# 将预测结果进行解码,判断预测结果和真实值的重合程度
# 如果重合程度过大则忽略,因为这些特征点属于预测比较准确的特征点
# 作为负样本不合适
#----------------------------------------------------------------#
noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask)
if self.cuda:
y_true = y_true.cuda()
noobj_mask = noobj_mask.cuda()
box_loss_scale = box_loss_scale.cuda()
#-----------------------------------------------------------#
# reshape_y_true[...,2:3]和reshape_y_true[...,3:4]
# 表示真实框的宽高,二者均在0-1之间
# 真实框越大,比重越小,小框的比重更大。
#-----------------------------------------------------------#
box_loss_scale = 2 - box_loss_scale
#---------------------------------------------------------------#
# 计算预测结果和真实结果的CIOU
#----------------------------------------------------------------#
ciou = (1 - self.box_ciou(pred_boxes[y_true[..., 4] == 1], y_true[..., :4][y_true[..., 4] == 1])) * box_loss_scale[y_true[..., 4] == 1]
loss_loc = torch.sum(ciou)
#-----------------------------------------------------------#
# zj-计算置信度的loss= 正样本的置信度损失 + 负样本的置信度损失
#-----------------------------------------------------------#
loss_conf = torch.sum(self.BCELoss(conf, y_true[..., 4]) * y_true[..., 4]) + \
torch.sum(self.BCELoss(conf, y_true[..., 4]) * noobj_mask)
# zj-这里根本没有做标签平滑(用来防止在训练集上过拟合的操作)
loss_cls = torch.sum(self.BCELoss(pred_cls[y_true[..., 4] == 1], self.smooth_labels(y_true[..., 5:][y_true[..., 4] == 1], self.label_smoothing, self.num_classes)))
loss = loss_loc + loss_conf + loss_cls
# zj-num_pos计算正样本的数量
num_pos = torch.sum(y_true[..., 4])
num_pos = torch.max(num_pos, torch.ones_like(num_pos))
return loss, num_pos
def calculate_iou(self, _box_a, _box_b):
#-----------------------------------------------------------#
# 计算真实框的左上角和右下角
#-----------------------------------------------------------#
b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2
b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2
#-----------------------------------------------------------#
# 计算先验框获得的预测框的左上角和右下角
#-----------------------------------------------------------#
b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2
b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2
#-----------------------------------------------------------#
# 将真实框和预测框都转化成左上角右下角的形式
#-----------------------------------------------------------#
box_a = torch.zeros_like(_box_a)
box_b = torch.zeros_like(_box_b)
box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2
box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2
#-----------------------------------------------------------#
# A为真实框的数量,B为先验框的数量
#-----------------------------------------------------------#
A = box_a.size(0)
B = box_b.size(0)
#-----------------------------------------------------------#
# 计算交的面积
#-----------------------------------------------------------#
max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), box_b[:, :2].unsqueeze(0).expand(A, B, 2))
# zj--torch.clamp()输入一个张量,使张量里面的类容都不小于最小值或者大于最大值
inter = torch.clamp((max_xy - min_xy), min=0)
inter = inter[:, :, 0] * inter[:, :, 1]
#-----------------------------------------------------------#
# 计算预测框和真实框各自的面积
#-----------------------------------------------------------#
area_a = ((box_a[:, 2]-box_a[:, 0]) * (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
area_b = ((box_b[:, 2]-box_b[:, 0]) * (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
#-----------------------------------------------------------#
# 求IOU
#-----------------------------------------------------------#
union = area_a + area_b - inter
return inter / union # [A,B]
def get_target(self, l, targets, anchors, in_h, in_w):
#-----------------------------------------------------#
# 计算一共有多少张图片,targets是归一化之后的坐标
#-----------------------------------------------------#
bs = len(targets)
#-----------------------------------------------------#
# 用于选取哪些先验框不包含物体
#-----------------------------------------------------#
noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
#-----------------------------------------------------#
# 让网络更加去关注小目标
#-----------------------------------------------------#
box_loss_scale = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
#-----------------------------------------------------#
# batch_size, 3, 13, 13, 5 + num_classes
# zj-y_true就是将真实框的tensor变成这样
#-----------------------------------------------------#
y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad = False)
for b in range(bs):
#zj-当前图片目标数为0就不用进行处理了
if len(targets[b])==0:
continue
#构造batch_target,下面用来存储针对于当前预测特征层的真实值的状态
batch_target = torch.zeros_like(targets[b])
#-------------------------------------------------------#
# 计算出正样本在特征层上的中心点
# zj---将正式框从归一化状态(targets的归一化是除以图片的宽和高得到的)调整为相对与当前预测特征层的状态
# zj---计算方式是x,y,w,h的状态
#-------------------------------------------------------#
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target[:, 4] = targets[b][:, 4]
batch_target = batch_target.cpu()
#-------------------------------------------------------#
# 将真实框转换一个形式
# num_true_box, 4
#-------------------------------------------------------#
gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1))
#-------------------------------------------------------#
# 将先验框转换一个形式
# 9, 4
#-------------------------------------------------------#
anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1))
#-------------------------------------------------------#
# 计算交并比
# self.calculate_iou(gt_box, anchor_shapes) = [num_true_box, 9]每一个真实框和9个先验框的重合情况
# best_ns:
# [每个真实框最大的重合度max_iou, 每一个真实框最重合的先验框的序号]
#-------------------------------------------------------#
best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1)
for t, best_n in enumerate(best_ns):
if best_n not in self.anchors_mask[l]:
continue
#----------------------------------------#
# 判断这个先验框是当前特征点的哪一个先验框
#----------------------------------------#
k = self.anchors_mask[l].index(best_n)
#----------------------------------------#
# 获得真实框属于哪个网格点
#----------------------------------------#
i = torch.floor(batch_target[t, 0]).long()
j = torch.floor(batch_target[t, 1]).long()
#----------------------------------------#
# 取出真实框的种类
#----------------------------------------#
c = batch_target[t, 4].long()
#----------------------------------------#
# zj-noobj_mask代表无目标的特征点,初始化全为1
#----------------------------------------#
noobj_mask[b, k, j, i] = 0
#----------------------------------------#
# tx、ty代表中心调整参数的真实值
#----------------------------------------#
y_true[b, k, j, i, 0] = batch_target[t, 0]
y_true[b, k, j, i, 1] = batch_target[t, 1]
y_true[b, k, j, i, 2] = batch_target[t, 2]
y_true[b, k, j, i, 3] = batch_target[t, 3]
y_true[b, k, j, i, 4] = 1
y_true[b, k, j, i, c + 5] = 1
#----------------------------------------#
# 用于获得xywh的比例
# 大目标loss权重小,小目标loss权重大
#----------------------------------------#
box_loss_scale[b, k, j, i] = batch_target[t, 2] * batch_target[t, 3] / in_w / in_h
return y_true, noobj_mask, box_loss_scale
def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask):
#-----------------------------------------------------#
# 计算一共有多少张图片
#-----------------------------------------------------#
bs = len(targets)
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
#-----------------------------------------------------#
# 生成网格,先验框中心,网格左上角
#-----------------------------------------------------#
grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type(FloatTensor)
grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type(FloatTensor)
# 生成先验框的宽高
scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]
anchor_w = FloatTensor(scaled_anchors_l).index_select(1, LongTensor([0]))
anchor_h = FloatTensor(scaled_anchors_l).index_select(1, LongTensor([1]))
anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
#-------------------------------------------------------#
# 计算调整后的先验框中心与宽高
#-------------------------------------------------------#
pred_boxes_x = torch.unsqueeze(x + grid_x, -1)
pred_boxes_y = torch.unsqueeze(y + grid_y, -1)
pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1)
pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1)
pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim = -1)
for b in range(bs):
#-------------------------------------------------------#
# 将预测结果转换一个形式
# pred_boxes_for_ignore num_anchors, 4
#-------------------------------------------------------#
pred_boxes_for_ignore = pred_boxes[b].view(-1, 4)
#-------------------------------------------------------#
# 计算真实框,并把真实框转换成相对于特征层的大小
# gt_box num_true_box, 4
#-------------------------------------------------------#
if len(targets[b]) > 0:
batch_target = torch.zeros_like(targets[b])
#-------------------------------------------------------#
# 计算出正样本在特征层上的中心点
#-------------------------------------------------------#
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target = batch_target[:, :4]
#-------------------------------------------------------#
# 计算交并比
# anch_ious num_true_box, num_anchors
#-------------------------------------------------------#
anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore)
#-------------------------------------------------------#
# 每个先验框对应真实框的最大重合度
# anch_ious_max num_anchors
#-------------------------------------------------------#
anch_ious_max, _ = torch.max(anch_ious, dim = 0)
anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3])
noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0
return noobj_mask, pred_boxes
def weights_init(net, init_type='normal', init_gain = 0.02):
def init_func(m):
classname = m.__class__.__name__
if hasattr(m, 'weight') and classname.find('Conv') != -1:
if init_type == 'normal':
torch.nn.init.normal_(m.weight.data, 0.0, init_gain)
elif init_type == 'xavier':
torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain)
elif init_type == 'kaiming':
torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
elif init_type == 'orthogonal':
torch.nn.init.orthogonal_(m.weight.data, gain=init_gain)
else:
raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
elif classname.find('BatchNorm2d') != -1:
torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
torch.nn.init.constant_(m.bias.data, 0.0)
print('initialize network with %s type' % init_type)
net.apply(init_func)
完整代码如上