YOLOV: Building on YOLOX to Make Still-Image Object Detectors Great at Video Object Detection (the Charm of Attention)

[Figure 1]

[Figure 2]

The main idea of YOLOV is to match the boxes obtained from YOLOX with the features from the other branch (reg_feat and cls_feat2), and to feed the matched features together with the predicted boxes into an attention module that produces refined predictions.

Pipeline:

1. First, the 16x6804x35 predictions (the predicted boxes) are passed to self.postpro_woclass (this function keeps 30 boxes per frame via confidence ranking and non-maximum suppression; 37 is 35 plus two extra entries, the best class confidence and its class index), giving a 30x37 tensor per frame.

2. The cls_feat2 and reg_feat features from the other branch are each flattened to 16x6804x256.

3. Using the pred_idx returned by the YOLOX head, the matching cls/reg features, cls_scores (class scores) and fg_scores (objectness, i.e. foreground/background scores) are gathered.

4. The gathered features and scores are fed into the attention module (a shape-only sketch of this flow follows below).
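
Before diving into the actual code, here is a minimal, runnable shape sketch of that flow; the tensor contents are random placeholders and only the dimensions follow the annotations in this post:

import torch

B, A, num_classes = 16, 6804, 30                # frames in the clip, anchors, classes (ImageNet VID has 30)
preds = torch.randn(B, A, 5 + num_classes)      # 16x6804x35: 4 box coords + objectness + 30 class scores
cls_feat2 = torch.randn(B, A, 256)              # flattened classification-branch features
reg_feat = torch.randn(B, A, 256)               # flattened regression-branch features
# step 1: confidence ranking + NMS keeps topK=30 boxes per frame -> 30x37 (35 plus class_conf and class_pred)
# step 3: the same kept indices select the matching 30x256 cls/reg feature rows per frame
# step 4: the gathered features and scores go through the attention module to refine the class predictions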

        self.hw = [x.shape[-2:] for x in outputs_decode]  # 72x72  36x36  18x18
        # the coordinates in output have already been mapped back to the original image, while outputs_decode
        # still lives at the feature-map scale; the box format here is center point + width/height
        outputs_decode = torch.cat(
            [x.flatten(start_dim=2) for x in outputs_decode], dim=2
        ).permute(0, 2, 1)  # 16x6804x35
        decode_res = self.decode_outputs(outputs_decode, dtype=xin[0].type())  # find topK predictions, play the same role as RPN; similar to the output handling above
        # confidence ranking + NMS, keep the top 30 boxes; each row has 37 entries:
        # (x1, y1, x2, y2, obj_conf, class_conf, class_pred) followed by the 30 raw class scores
        pred_result, pred_idx = self.postpro_woclass(decode_res, num_classes=self.num_classes, nms_thre=self.nms_thresh,
                                                     topK=self.Afternum)   # postprocess(decode_res,num_classes=30)
        #return pred_result
        if not self.training and imgs.shape[0] == 1:
            return self.postprocess_single_img(pred_result, self.num_classes)
        # YOLOV's new branch
        cls_feat_flatten = torch.cat(   #16x6804x256
            [x.flatten(start_dim=2) for x in before_nms_features], dim=2   #cls_feat2
        ).permute(0, 2, 1)  # [b,features,channels]
        reg_feat_flatten = torch.cat(    #16x6804x256                      #reg_feat
            [x.flatten(start_dim=2) for x in before_nms_regf], dim=2
        ).permute(0, 2, 1)
        # use pred_idx (the box IDs kept by the YOLOX head) to gather the matching cls/reg features,
        # cls_scores (class scores) and fg_scores (objectness / foreground-background scores)
        features_cls, features_reg, cls_scores, fg_scores = self.find_feature_score(cls_feat_flatten, pred_idx,
                                                                                    reg_feat_flatten, imgs,
                                                                                    pred_result)
        features_reg = features_reg.unsqueeze(0)    #1x480x256      480=30*batchsize(16)
        features_cls = features_cls.unsqueeze(0)    #1x480x256
        if not self.training:
            cls_scores = cls_scores.to(cls_feat_flatten.dtype)
            fg_scores = fg_scores.to(cls_feat_flatten.dtype)
        if self.use_score:
            trans_cls = self.trans(features_cls, features_reg, cls_scores, fg_scores, sim_thresh=self.sim_thresh,
                                   ave=self.ave, use_mask=self.use_mask)
        else:
            trans_cls = self.trans(features_cls, features_reg, None, None, sim_thresh=self.sim_thresh, ave=self.ave)
        fc_output = self.linear_pred(trans_cls)
        fc_output = torch.reshape(fc_output, [outputs_decode.shape[0], -1, self.num_classes + 1])[:, :, :-1]

The postpro_woclass function: it first converts the predicted boxes from center + width/height to top-left / bottom-right corners and writes them back into prediction; it then takes the maximum over the 30 class scores, returning the best value and its index, which are appended to the 35 entries to form 37; next it keeps the 750 boxes with the highest objectness, and finally filters them with torchvision.ops.batched_nms, keeping the top 30 survivors.

    def postpro_woclass(self, prediction, num_classes, nms_thre=0.75, topK=75, features=None):
        # find topK predictions, play the same role as RPN
        '''

        Args:
            prediction: [batch,feature_num,5+clsnum]
            num_classes:
            conf_thre:
            conf_thre_high:
            nms_thre:

        Returns:
            [batch,topK,5+clsnum]
        '''
        self.topK = topK
        box_corner = prediction.new(prediction.shape)  # center + width/height -> top-left / bottom-right corners
        box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
        box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
        box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
        box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
        prediction[:, :, :4] = box_corner[:, :, :4]  #
        output = [None for _ in range(len(prediction))]
        output_index = [None for _ in range(len(prediction))]
        features_list = []
        for i, image_pred in enumerate(prediction):

            if not image_pred.size(0):
                continue
            # Get score and class with highest confidence
            class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)  # per frame, the max over the 30 class scores in each row and its index

            # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
            detections = torch.cat(
                (image_pred[:, :5], class_conf, class_pred.float(), image_pred[:, 5: 5 + num_classes]), 1)
            # keep the top 750 boxes by objectness
            conf_score = image_pred[:, 4]
            top_pre = torch.topk(conf_score, k=self.Prenum)
            sort_idx = top_pre.indices[:self.Prenum]
            detections_temp = detections[sort_idx, :]
            # class-aware filtering: IoU / threshold suppression is only computed between boxes of the same class
            # boxes: Tensor, predicted boxes
            # scores: Tensor, prediction confidences
            # idxs: Tensor, predicted class of each box
            # iou_threshold: float, IoU threshold
            nms_out_index = torchvision.ops.batched_nms(
                detections_temp[:, :4],                          #x1, y1, x2, y2,
                detections_temp[:, 4] * detections_temp[:, 5],   # obj_conf * class_conf (objectness times class confidence)
                detections_temp[:, 6],                           # class_pred (predicted class)
                nms_thre,
            )

            topk_idx = sort_idx[nms_out_index[:self.topK]]  # top 30 survivors after NMS
            output[i] = detections[topk_idx, :]
            output_index[i] = topk_idx

        return output, output_index
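
To make the pre-filter plus class-aware NMS concrete, here is a small self-contained example on random boxes; the shapes and thresholds are illustrative, only the mechanism mirrors postpro_woclass:

import torch
import torchvision

# random candidates for a single frame; 1000 boxes and 30 classes are illustrative
num_boxes, num_cls = 1000, 30
boxes = torch.rand(num_boxes, 4) * 100
boxes[:, 2:] += boxes[:, :2]                        # make x2 > x1 and y2 > y1
obj_conf = torch.rand(num_boxes)                    # objectness
class_conf, class_pred = torch.rand(num_boxes, num_cls).max(dim=1)

Prenum, topK = 750, 30
sort_idx = torch.topk(obj_conf, k=Prenum).indices   # pre-filter: top 750 by objectness

keep = torchvision.ops.batched_nms(                 # class-aware NMS: different classes never suppress each other
    boxes[sort_idx],
    (obj_conf * class_conf)[sort_idx],              # ranking score = obj_conf * class_conf
    class_pred[sort_idx],
    0.75,                                           # IoU threshold (nms_thre)
)
topk_idx = sort_idx[keep[:topK]]                    # indices of the final 30 boxes in the original 1000
print(topk_idx.shape)                               # torch.Size([30]) (fewer if NMS removed more)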

find_feature_score: gathers the features that match each kept box ID and returns them together with their scores.

 def find_feature_score(self, features, idxs, reg_features, imgs=None, predictions=None, roi_features=None):
        features_cls = []
        features_reg = []
        cls_scores = []
        fg_scores = []
        for i, feature in enumerate(features):
            features_cls.append(feature[idxs[i][:self.simN]])           #30x256
            features_reg.append(reg_features[i, idxs[i][:self.simN]])   #30x256
            cls_scores.append(predictions[i][:self.simN, 5])            #30x1   class scores
            fg_scores.append(predictions[i][:self.simN, 4])             #30x1   objectness
        features_cls = torch.cat(features_cls)     #240x256 = (30*8)x256, i.e. 30 boxes per frame times the batch size
        features_reg = torch.cat(features_reg)
        cls_scores = torch.cat(cls_scores)
        fg_scores = torch.cat(fg_scores)
        return features_cls, features_reg, cls_scores, fg_scores
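
The gathering itself is plain index selection; a toy version with random tensors (all shapes are illustrative):

import torch

# 2 frames, 100 candidate features each, 256-dim, keep simN=30 per frame
B, A, D, simN = 2, 100, 256, 30
cls_feat_flatten = torch.randn(B, A, D)
reg_feat_flatten = torch.randn(B, A, D)
pred_idx = [torch.randperm(A)[:simN] for _ in range(B)]   # stand-in for the indices from postpro_woclass

features_cls = torch.cat([cls_feat_flatten[i, pred_idx[i]] for i in range(B)])  # (B*simN) x 256
features_reg = torch.cat([reg_feat_flatten[i, pred_idx[i]] for i in range(B)])
print(features_cls.shape)   # torch.Size([60, 256]); with 16 frames this becomes 480x256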

The attention mechanism:

1. The inputs first go into Attention_msa. A fully-connected layer maps 1x240x256 to 1x240x768 (240 is 8 frames in the batch times 30 boxes; some annotations in the code assume a batch of 16), which is then reshaped to 3x1x4x240x64 (3: q, k, v; 4: four heads; 240: 240 boxes). Roughly speaking, the single 1x240x256 tensor is expanded by the linear layer into three tensors playing the roles of q, k and v, and the 256 channels are split into four chunks handled by four heads and fused again afterwards (my understanding, not guaranteed to be exact); a small shape sketch is given after this list.

2. Then the q/k/v attention is computed; the scores are also multiplied in here, so boxes with higher scores receive larger weights.

3. I do not fully understand the final if ave: part. My understanding is that it computes the head-averaged attention weights; later, find_similar_round2 applies these weights to the output once more, concatenates the result with the original output, and the final prediction is produced after one more fully-connected layer.
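
A minimal sketch of the qkv reshape/permute from step 1, using the shapes annotated in the code; the Linear layer here is just a stand-in for self.qkv_cls:

import torch
import torch.nn as nn

B, N, C, num_heads = 1, 240, 256, 4               # 240 = 30 boxes x 8 frames
x_cls = torch.randn(B, N, C)

qkv = nn.Linear(C, C * 3, bias=False)             # stand-in for self.qkv_cls
out = qkv(x_cls)                                  # 1x240x768: q, k and v produced by one linear layer
out = out.reshape(B, N, 3, num_heads, C // num_heads)   # 1x240x3x4x64
out = out.permute(2, 0, 3, 1, 4)                  # 3x1x4x240x64: [qkv, batch, heads, boxes, head_dim]
q, k, v = out[0], out[1], out[2]                  # each 1x4x240x64
print(q.shape, k.shape, v.shape)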

    def forward(self, x_cls, x_reg, cls_score=None, fg_score=None, sim_thresh=0.75, ave=True, use_mask=False):
        trans_cls, trans_reg, sim_round2 = self.msa(x_cls, x_reg, cls_score, fg_score, sim_thresh=sim_thresh, ave=ave,
                                                    use_mask=use_mask)
        msa = self.linear1(trans_cls)
        msa = self.find_similar_round2(msa, sim_round2)

        out = self.linear2(msa)
        return out
class Attention_msa(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., scale=25):
        # dim :input[batchsize,sequence length, input dimension]-->output[batchsize, sequence lenght, dim]
        # qkv_bias : Is it matter?
        # qk_scale, attn_drop,proj_drop will not be used
        # object = Attention(dim,num head)
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = scale  # qk_scale or head_dim ** -0.5

        self.qkv_cls = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.qkv_reg = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)

    def forward(self, x_cls, x_reg, cls_score=None, fg_score=None, return_attention=False, ave=True, sim_thresh=0.75,
                use_mask=False):
        B, N, C = x_cls.shape
        # 1x240x768 (256*3) -> 1x240x3x4x64 -> 3x1x4x240x64;  3: q/k/v, 4: heads, 240: boxes
        # roughly: the linear layer turns the single 1x240x256 input into three tensors (q, k, v), and the 256
        # channels are split across the four heads and fused again afterwards (not guaranteed to be exact)
        # analogy: q = "I want a snack"; k=1, v=biscuits; k=2, v=crisps
        qkv_cls = self.qkv_cls(x_cls).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1,4)  # 3, B, num_head, N, c
        qkv_reg = self.qkv_reg(x_reg).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q_cls, k_cls, v_cls = qkv_cls[0], qkv_cls[1], qkv_cls[2]  # make torchscript happy (cannot use tensor as tuple)
        q_reg, k_reg, v_reg = qkv_reg[0], qkv_reg[1], qkv_reg[2]
        # normalization
        q_cls = q_cls / torch.norm(q_cls, dim=-1, keepdim=True)  # L2 norm over the last dimension, keeping the shape
        k_cls = k_cls / torch.norm(k_cls, dim=-1, keepdim=True)
        q_reg = q_reg / torch.norm(q_reg, dim=-1, keepdim=True)
        k_reg = k_reg / torch.norm(k_reg, dim=-1, keepdim=True)
        v_cls_normed = v_cls / torch.norm(v_cls, dim=-1, keepdim=True)

        if cls_score == None:
            cls_score = 1
        else:  # multiplied into the attention weights, so higher-scoring boxes get larger weights
            cls_score = torch.reshape(cls_score, [1, 1, 1, -1]).repeat(1, self.num_heads, N, 1)  # repeat along each dimension by the given factor

        if fg_score == None:
            fg_score = 1
        else:    # 240-> 1x1x1x240-> 1x4x240x240
            fg_score = torch.reshape(fg_score, [1, 1, 1, -1]).repeat(1, self.num_heads, N, 1)

        attn_cls_raw = v_cls_normed @ v_cls_normed.transpose(-2, -1)
        if use_mask:
            # only reference object with higher confidence..
            cls_score_mask = (cls_score > (cls_score.transpose(-2, -1) - 0.1)).type_as(cls_score)
            fg_score_mask = (fg_score > (fg_score.transpose(-2, -1) - 0.1)).type_as(fg_score)
        else:
            cls_score_mask = fg_score_mask = 1

        # cls_score_mask = (cls_score < (cls_score.transpose(-2, -1) + 0.1)).type_as(cls_score)
        # fg_score_mask = (fg_score < (fg_score.transpose(-2, -1) + 0.1)).type_as(fg_score)
        # visual_attention(cls_score[0, 0, :, :])
        # visual_attention(cls_score_mask[0,0,:,:])

        attn_cls = (q_cls @ k_cls.transpose(-2, -1)) * self.scale * cls_score * cls_score_mask  # lower scores get smaller weights
        attn_cls = attn_cls.softmax(dim=-1)
        attn_cls = self.attn_drop(attn_cls)   # attention weights

        attn_reg = (q_reg @ k_reg.transpose(-2, -1)) * self.scale * fg_score * fg_score_mask
        attn_reg = attn_reg.softmax(dim=-1)
        attn_reg = self.attn_drop(attn_reg)

        attn = (attn_reg + attn_cls) / 2
        x = (attn @ v_cls).transpose(1, 2).reshape(B, N, C)  # 1x4x240x64 -> 1x240x256, the features output by the attention

        x_ori = v_cls.permute(0, 2, 1, 3).reshape(B, N, C)
        x_cls = torch.cat([x, x_ori], dim=-1)   # concatenate the attention output with the original v

        if ave:
            ones_matrix = torch.ones(attn.shape[2:]).to('cuda')
            zero_matrix = torch.zeros(attn.shape[2:]).to('cuda')
            # sim_mask: a box's similarity with itself is of course large
            attn_cls_raw = torch.sum(attn_cls_raw, dim=1, keepdim=False)[0] / self.num_heads
            sim_mask = torch.where(attn_cls_raw > sim_thresh, ones_matrix, zero_matrix)     # threshold: drop similarities below sim_thresh (0.75)
            sim_attn = torch.sum(attn, dim=1, keepdim=False)[0] / self.num_heads

            sim_round2 = torch.softmax(sim_attn, dim=-1)
            sim_round2 = sim_mask * sim_round2 / (torch.sum(sim_mask * sim_round2, dim=-1, keepdim=True))  # renormalize over the kept entries (softmax-like)
            return x_cls, None, sim_round2
        else:
            return x_cls, None, None
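
A toy call to the Attention_msa class defined above; all inputs are random placeholders, and ave=False is used so the example also runs on CPU (the ave branch allocates helper tensors on 'cuda'):

import torch

attn = Attention_msa(dim=256, num_heads=4)        # the class defined above
x_cls = torch.randn(1, 240, 256)                  # 240 = 30 boxes x 8 frames
x_reg = torch.randn(1, 240, 256)
cls_score = torch.rand(240)                       # per-box class confidence
fg_score = torch.rand(240)                        # per-box objectness

out, _, _ = attn(x_cls, x_reg, cls_score, fg_score, ave=False)
print(out.shape)                                  # torch.Size([1, 240, 512]): attention output concatenated with v_cls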

If the YOLOX code is unclear, you can watch the walkthrough on Bilibili: yolox代码逐行讲解-train_哔哩哔哩_bilibili

For the attention mechanism, Hung-yi Lee's lectures are highly recommended: 强烈推荐!台大李宏毅自注意力机制和Transformer详解!_哔哩哔哩_bilibili

If you would like my annotated copy of the code, send me a private message and I will share it; feel free to discuss any questions together.
