The main idea of YOLOV is to match the boxes produced by YOLOX with the features from the other branch (reg_feat and cls_feat2), and to feed the matched features together with the predicted boxes into an attention module that produces refined predictions.
Pipeline:
1. The 16x6804x35 tensor of predicted boxes is first passed to self.postpro_woclass (this function keeps 30 boxes per frame, filtered by confidence and non-maximum suppression; 37 has two more columns than 35, namely the class confidence and the class index), giving 30x37 per frame (see the size check after this list).
2. On the other branch, cls_feat2 and reg_feat are each flattened to 16x6804x256.
3. Using the pred_idx returned for the YOLOX boxes, gather the corresponding cls/reg features, cls_scores (class scores) and fg_scores (objectness / foreground scores).
4. Feed the gathered features and scores into the attention module.
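A quick check of where the sizes quoted above come from (my own arithmetic, not part of the YOLOV code):
# 6804 locations = sum of the three FPN feature maps listed in the code below (72x72, 36x36, 18x18)
print(72 * 72 + 36 * 36 + 18 * 18)  # 6804
# 35 channels = 4 box coords + 1 objectness + 30 class scores (num_classes = 30 here)
print(4 + 1 + 30)                   # 35
# 37 = 35 + class_conf + class_pred appended by postpro_woclass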
self.hw = [x.shape[-2:] for x in outputs_decode] # 72x72 36x36 18x18
# The coordinates in output have already been converted back to the original image, while the coordinates in output_decode are still at the feature-map scale and unchanged; the box format is center point + width/height
outputs_decode = torch.cat([x.flatten(start_dim=2) for x in outputs_decode], dim=2 #16x6804x35
).permute(0, 2, 1)
decode_res = self.decode_outputs(outputs_decode, dtype=xin[0].type()) #find topK predictions, play the same role as RPN; roughly the same handling as output above
#confidence filtering + NMS, keep the top 30 after NMS; 37 columns = (x1, y1, x2, y2, obj_conf, class_conf, class_pred) + the 30 raw class scores
pred_result, pred_idx = self.postpro_woclass(decode_res, num_classes=self.num_classes, nms_thre=self.nms_thresh,
topK=self.Afternum) # postprocess(decode_res,num_classes=30)
#return pred_result
if not self.training and imgs.shape[0] == 1:
return self.postprocess_single_img(pred_result, self.num_classes)
#the new YOLOV branch
cls_feat_flatten = torch.cat( #16x6804x256
[x.flatten(start_dim=2) for x in before_nms_features], dim=2 #cls_feat2
).permute(0, 2, 1) # [b,features,channels]
reg_feat_flatten = torch.cat( #16x6804x256 #reg_feat
[x.flatten(start_dim=2) for x in before_nms_regf], dim=2
).permute(0, 2, 1)
#use pred_idx to gather the corresponding cls/reg features, cls_scores (class scores) and fg_scores (objectness / foreground scores), indexed by the IDs of the boxes YOLOX kept
features_cls, features_reg, cls_scores, fg_scores = self.find_feature_score(cls_feat_flatten, pred_idx,
reg_feat_flatten, imgs,
pred_result)
features_reg = features_reg.unsqueeze(0) #1x480x256 480=30*batchsize(16)
features_cls = features_cls.unsqueeze(0) #1x480x256
if not self.training:
cls_scores = cls_scores.to(cls_feat_flatten.dtype)
fg_scores = fg_scores.to(cls_feat_flatten.dtype)
if self.use_score:
trans_cls = self.trans(features_cls, features_reg, cls_scores, fg_scores, sim_thresh=self.sim_thresh,
ave=self.ave, use_mask=self.use_mask)
else:
trans_cls = self.trans(features_cls, features_reg, None, None, sim_thresh=self.sim_thresh, ave=self.ave)
fc_output = self.linear_pred(trans_cls)
fc_output = torch.reshape(fc_output, [outputs_decode.shape[0], -1, self.num_classes + 1])[:, :, :-1]
The postpro_woclass function: first convert the predicted boxes from (center, width, height) to top-left / bottom-right corners and write them back into prediction; then take the maximum over the 30 class scores, returning the best value and its index, and append them to the 35 columns to form 37; next keep the 750 boxes with the highest objectness; finally filter with torchvision.ops.batched_nms and keep the top 30 NMS survivors.
def postpro_woclass(self, prediction, num_classes, nms_thre=0.75, topK=75, features=None):
# find topK predictions, play the same role as RPN
'''
Args:
prediction: [batch,feature_num,5+clsnum]
num_classes:
conf_thre:
conf_thre_high:
nms_thre:
Returns:
[batch,topK,5+clsnum]
'''
self.topK = topK
box_corner = prediction.new(prediction.shape) # (cx, cy, w, h) -> (x1, y1, x2, y2)
box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
prediction[:, :, :4] = box_corner[:, :, :4] #
output = [None for _ in range(len(prediction))]
output_index = [None for _ in range(len(prediction))]
features_list = []
for i, image_pred in enumerate(prediction):
if not image_pred.size(0):
continue
# Get score and class with highest confidence
class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True) # per frame, max over the 30 class scores in each row, returning the value and its index
# Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
detections = torch.cat(
(image_pred[:, :5], class_conf, class_pred.float(), image_pred[:, 5: 5 + num_classes]), 1)
#keep the top Prenum (750) boxes by objectness
conf_score = image_pred[:, 4]
top_pre = torch.topk(conf_score, k=self.Prenum)
sort_idx = top_pre.indices[:self.Prenum]
detections_temp = detections[sort_idx, :]
#class-aware NMS: IoU is computed and thresholded only between boxes of the same class
# boxes: Tensor, predicted boxes
# scores: Tensor, prediction confidences
# idxs: Tensor, predicted class of each box
# iou_threshold: float, IoU threshold
nms_out_index = torchvision.ops.batched_nms(
detections_temp[:, :4], #x1, y1, x2, y2,
detections_temp[:, 4] * detections_temp[:, 5], #obj_conf * class_conf (objectness * class confidence)
detections_temp[:, 6], #class_pred (predicted class)
nms_thre,
)
topk_idx = sort_idx[nms_out_index[:self.topK]] # top topK (30) survivors of NMS
output[i] = detections[topk_idx, :]
output_index[i] = topk_idx
return output, output_index
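To see why batched_nms is called class-aware, here is a tiny standalone example (made-up boxes, not from YOLOV): of two heavily overlapping boxes of the same class only the higher-scoring one is kept, while an identical box of a different class survives.
import torch
import torchvision

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],   # IoU ~0.68 with the first box
                      [1., 1., 11., 11.]])
scores = torch.tensor([0.9, 0.8, 0.7])
idxs = torch.tensor([0, 0, 1])              # class of each box
keep = torchvision.ops.batched_nms(boxes, scores, idxs, iou_threshold=0.5)
print(keep)  # tensor([0, 2]): box 1 is suppressed by box 0 (same class), box 2 survives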
find_feature_score: gather the features that correspond to the selected box IDs and return them together with their scores (a tiny indexing illustration follows the code).
def find_feature_score(self, features, idxs, reg_features, imgs=None, predictions=None, roi_features=None):
features_cls = []
features_reg = []
cls_scores = []
fg_scores = []
for i, feature in enumerate(features):
features_cls.append(feature[idxs[i][:self.simN]]) #30x256
features_reg.append(reg_features[i, idxs[i][:self.simN]]) #30x256
cls_scores.append(predictions[i][:self.simN, 5]) #30 class scores per frame
fg_scores.append(predictions[i][:self.simN, 4]) #30 objectness scores per frame
features_cls = torch.cat(features_cls) #240x256 = (30 per frame * batch of 8) x 256
features_reg = torch.cat(features_reg)
cls_scores = torch.cat(cls_scores)
fg_scores = torch.cat(fg_scores)
return features_cls, features_reg, cls_scores, fg_scores
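The indexing above is plain tensor gathering; a minimal illustration with made-up shapes (one frame, assuming simN=30):
import torch

feature = torch.randn(6804, 256)       # flattened cls features of one frame
idx = torch.randint(0, 6804, (30,))    # the indices stored in pred_idx[i]
picked = feature[idx]                  # [30, 256] features of the selected proposals
print(picked.shape)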
The attention module:
1. The input first goes into Attention_msa: a fully connected layer maps 1x240x256 -> 1x240x768 (240 = 8 frames per batch * 30 proposals; some comments in the code assume a batch of 16), which is then reshaped to 3x1x4x240x64 (3: q/k/v; 4: four heads; 240: 240 proposals). Roughly speaking, the single 1x240x256 tensor becomes three tensors (q, k, v) through the linear layer, and the 256 channels are split into four chunks handled by four heads and fused again later (this is my understanding and may not be exact); a shape check follows after this list.
2. Then the q/k/v attention is computed; the class and objectness scores are multiplied in as well, so higher-scoring proposals receive larger weights.
3. I do not fully understand the final if ave: branch. My reading is that it averages the attention weights over the heads; later, find_similar_round2 applies these weights to the output once more, concatenates the result with the original output, and passes it through one more fully connected layer (see the sketch after the forward code below).
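A quick shape check of step 1 (my own toy snippet, assuming dim=256 and num_heads=4, which matches the shapes quoted above):
import torch
import torch.nn as nn

B, N, C, num_heads = 1, 240, 256, 4
qkv = nn.Linear(C, C * 3, bias=False)
x_cls = torch.randn(B, N, C)                           # 1x240x256
out = qkv(x_cls)                                       # 1x240x768
out = out.reshape(B, N, 3, num_heads, C // num_heads)  # 1x240x3x4x64
out = out.permute(2, 0, 3, 1, 4)                       # 3x1x4x240x64
q, k, v = out[0], out[1], out[2]                       # each 1x4x240x64
print(q.shape)  # torch.Size([1, 4, 240, 64])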
def forward(self, x_cls, x_reg, cls_score=None, fg_score=None, sim_thresh=0.75, ave=True, use_mask=False):
trans_cls, trans_reg, sim_round2 = self.msa(x_cls, x_reg, cls_score, fg_score, sim_thresh=sim_thresh, ave=ave,
use_mask=use_mask)
msa = self.linear1(trans_cls)
msa = self.find_similar_round2(msa, sim_round2)
out = self.linear2(msa)
return out
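find_similar_round2 is not listed here; based on the description above, a minimal sketch of what it might do (my own reconstruction, not the verbatim YOLOV code; names and shapes are assumptions):
import torch

def find_similar_round2_sketch(features, sim_round2):
    # features:   [1, N, C]  output of self.linear1(trans_cls)
    # sim_round2: [N, N]     masked, row-normalized similarity weights from Attention_msa
    key_feature = features[0]                     # [N, C]
    soft_sim_feature = sim_round2 @ key_feature   # [N, C] weighted average over similar proposals
    # concatenate reweighted and original features; the result then goes through linear2
    return torch.cat([soft_sim_feature, key_feature], dim=-1)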
class Attention_msa(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., scale=25):
# dim :input[batchsize,sequence length, input dimension]-->output[batchsize, sequence length, dim]
# qkv_bias : Is it matter?
# qk_scale, attn_drop,proj_drop will not be used
# object = Attention(dim,num head)
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = scale # qk_scale or head_dim ** -0.5
self.qkv_cls = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.qkv_reg = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
def forward(self, x_cls, x_reg, cls_score=None, fg_score=None, return_attention=False, ave=True, sim_thresh=0.75,
use_mask=False):
B, N, C = x_cls.shape
# 1x240x768 (256*3) -> 1x240x3x4x64 -> 3x1x4x240x64; 3: q/k/v, 4: four heads, 240: 240 proposals
# roughly: the single 1x240x256 tensor becomes three (q, k, v) through the linear layer; the 256 channels are split across four heads and fused again later (may not be exact)
#q: "I want a snack"; k: 1, v: cookies; k: 2, v: potato chips
qkv_cls = self.qkv_cls(x_cls).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1,4) # 3, B, num_head, N, c
qkv_reg = self.qkv_reg(x_reg).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q_cls, k_cls, v_cls = qkv_cls[0], qkv_cls[1], qkv_cls[2] # make torchscript happy (cannot use tensor as tuple)
q_reg, k_reg, v_reg = qkv_reg[0], qkv_reg[1], qkv_reg[2]
#L2-normalize
q_cls = q_cls / torch.norm(q_cls, dim=-1, keepdim=True) # L2 norm over the last dimension, keeping the dimension
k_cls = k_cls / torch.norm(k_cls, dim=-1, keepdim=True)
q_reg = q_reg / torch.norm(q_reg, dim=-1, keepdim=True)
k_reg = k_reg / torch.norm(k_reg, dim=-1, keepdim=True)
v_cls_normed = v_cls / torch.norm(v_cls, dim=-1, keepdim=True)
if cls_score == None:
cls_score = 1
else: # multiplied into the attention weights, so higher-scoring proposals get larger weights
cls_score = torch.reshape(cls_score, [1, 1, 1, -1]).repeat(1, self.num_heads, N, 1) # repeat each dimension by the given factor
if fg_score == None:
fg_score = 1
else: # 240-> 1x1x1x240-> 1x4x240x240
fg_score = torch.reshape(fg_score, [1, 1, 1, -1]).repeat(1, self.num_heads, N, 1)
attn_cls_raw = v_cls_normed @ v_cls_normed.transpose(-2, -1)
if use_mask:
# only reference object with higher confidence..
cls_score_mask = (cls_score > (cls_score.transpose(-2, -1) - 0.1)).type_as(cls_score)
fg_score_mask = (fg_score > (fg_score.transpose(-2, -1) - 0.1)).type_as(fg_score)
else:
cls_score_mask = fg_score_mask = 1
# cls_score_mask = (cls_score < (cls_score.transpose(-2, -1) + 0.1)).type_as(cls_score)
# fg_score_mask = (fg_score < (fg_score.transpose(-2, -1) + 0.1)).type_as(fg_score)
# visual_attention(cls_score[0, 0, :, :])
# visual_attention(cls_score_mask[0,0,:,:])
attn_cls = (q_cls @ k_cls.transpose(-2, -1)) * self.scale * cls_score * cls_score_mask # lower scores get smaller weights
attn_cls = attn_cls.softmax(dim=-1)
attn_cls = self.attn_drop(attn_cls) # attention weights
attn_reg = (q_reg @ k_reg.transpose(-2, -1)) * self.scale * fg_score * fg_score_mask
attn_reg = attn_reg.softmax(dim=-1)
attn_reg = self.attn_drop(attn_reg)
attn = (attn_reg + attn_cls) / 2
x = (attn @ v_cls).transpose(1, 2).reshape(B, N, C) # 1x4x240x64 -> 1x240x256, features output by the attention
x_ori = v_cls.permute(0, 2, 1, 3).reshape(B, N, C)
x_cls = torch.cat([x, x_ori], dim=-1) # concatenate the attended features with the original v
#
if ave:
ones_matrix = torch.ones(attn.shape[2:]).to('cuda')
zero_matrix = torch.zeros(attn.shape[2:]).to('cuda')
#sim_mask: a proposal is always highly similar to itself,
attn_cls_raw = torch.sum(attn_cls_raw, dim=1, keepdim=False)[0] / self.num_heads
sim_mask = torch.where(attn_cls_raw > sim_thresh, ones_matrix, zero_matrix) # threshold: zero out similarities below sim_thresh (0.75)
sim_attn = torch.sum(attn, dim=1, keepdim=False)[0] / self.num_heads
sim_round2 = torch.softmax(sim_attn, dim=-1)
sim_round2 = sim_mask * sim_round2 / (torch.sum(sim_mask * sim_round2, dim=-1, keepdim=True)) # renormalize the surviving weights so each row sums to 1 again (softmax-like)
return x_cls, None, sim_round2
else:
return x_cls, None, None
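The masking plus renormalization at the end of the ave branch can be illustrated with made-up numbers: entries filtered out by sim_mask are zeroed and the rest are rescaled so each row still sums to 1.
import torch

sim_round2 = torch.tensor([[0.5, 0.3, 0.2]])
sim_mask = torch.tensor([[1.0, 1.0, 0.0]])    # the third proposal fails the similarity threshold
out = sim_mask * sim_round2 / torch.sum(sim_mask * sim_round2, dim=-1, keepdim=True)
print(out)  # tensor([[0.6250, 0.3750, 0.0000]])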
If you are not familiar with the YOLOX code, this Bilibili walkthrough is helpful: yolox代码逐行讲解-train_哔哩哔哩_bilibili
For the attention mechanism, I recommend Prof. Hung-yi Lee's lectures: 强烈推荐!台大李宏毅自注意力机制和Transformer详解!_哔哩哔哩_bilibili
If you want my annotated code, send me a private message and I will share it; feel free to raise any questions so we can discuss them together.