3D Point Cloud Learning: SECOND Object Detection ③ Source Code Annotations

Source code: PCDet.
According to the network structure diagram given in the paper, the network consists of three main parts.

1 Network structure

The main framework of the network lives in PCDet-master\pcdet\models\detectors\second_net.py, where the overall structure is easy to follow.

  1. self.vfe: corresponds to the VFE in the paper; unlike the paper, the source code averages the points to obtain the feature instead of using the conv + maxpool approach.
  2. spconv.SparseConvTensor: the inputs are the raw voxel data, the number of points in each voxel, and the voxel coordinates of the points (the third is not actually used); the output is one [N, 4] feature per voxel.
  3. self.rpn_net: voxel feature downsampling; the feature channels go from 4 to 256 and the voxel grid shrinks by 8x.
  4. self.rpn_head: anchor generation, target assignment, and loss computation (a rough shape sketch follows this list).
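Before reading the class, here is a toy-tensor sketch of the shapes flowing through the three stages. The sizes come from the shape comments later in this post; the batch and voxel counts are made up:

import torch

# hypothetical voxelized input: N non-empty voxels, up to 5 points each, 4 features (x, y, z, intensity)
N = 10000
voxels = torch.rand(N, 5, 4)                    # [N, max_points_of_each_voxel, 4]
num_points = torch.randint(1, 6, (N,)).float()  # actual number of points in each voxel

# 1. MeanVoxelFeatureExtractor: average the points -> one 4-dim feature per voxel
voxel_features = voxels.sum(dim=1) / num_points.view(-1, 1)   # [N, 4]

# 2. BackBone8x on the sparse tensor: 8x spatial downsampling, channels 4 -> 128,
#    then dense() + reshape folds depth into channels -> BEV map [batch, 128 * 2, 200, 176]
spatial_features = torch.rand(1, 256, 200, 176)  # stand-in for the backbone output

# 3. RPN head: 1x1 convs over the BEV map yield cls / box / direction predictions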
class SECONDNet(Detector3D):
    # dataset provides all of the input data
    def __init__(self, num_class, dataset):
        super().__init__(num_class, dataset)

        self.sparse_shape = dataset.voxel_generator.grid_size[::-1] + [1, 0, 0]
        self.build_networks(cfg.MODEL)  # Detector3D's initialization happens here

    def forward_rpn(self, voxels, num_points, coordinates, batch_size, voxel_centers, **kwargs):
        # voxels originally holds the features of up to max_points_of_each_voxel points per voxel
        # the VFE turns this into a single 4-dim feature per voxel
        voxel_features = self.vfe(
            features=voxels,
            num_voxels=num_points,
            coords=coordinates
        )
        # here voxel_features and coordinates are combined into the easiest-to-understand 3D sparse-tensor form
        input_sp_tensor = spconv.SparseConvTensor(
            features=voxel_features,
            indices=coordinates,
            spatial_shape=self.sparse_shape,
            batch_size=batch_size
        )
        # backbone downsampling
        backbone_ret_dict = self.rpn_net(
            input_sp_tensor,
            **{'voxel_centers': voxel_centers}
        )
        # RPN head: compute predictions (targets for the loss are assigned inside during training)
        rpn_preds_dict = self.rpn_head(
            backbone_ret_dict['spatial_features'],
            **{'gt_boxes': kwargs.get('gt_boxes', None)}
        )
        rpn_preds_dict.update(backbone_ret_dict)

        rpn_ret_dict = {
            'rpn_cls_preds': rpn_preds_dict['cls_preds'],
            'rpn_box_preds': rpn_preds_dict['box_preds'],
            'rpn_dir_cls_preds': rpn_preds_dict.get('dir_cls_preds', None),
            'anchors': rpn_preds_dict['anchors']
        }
        return rpn_ret_dict

    def forward(self, input_dict):
        rpn_ret_dict = self.forward_rpn(**input_dict)

        if self.training:
            loss, tb_dict, disp_dict = self.get_training_loss()

            ret_dict = {
                'loss': loss
            }
            return ret_dict, tb_dict, disp_dict
        else:
            pred_dicts, recall_dicts = self.predict_boxes(rpn_ret_dict, rcnn_ret_dict=None, input_dict=input_dict)
            return pred_dicts, recall_dicts

    def get_training_loss(self):
        disp_dict = {}

        loss_anchor_box, tb_dict = self.rpn_head.get_loss()
        loss_rpn = loss_anchor_box
        tb_dict = {
            'loss_rpn': loss_rpn.item(),
            **tb_dict
        }

        loss = loss_rpn
        # disp_dict is empty
        return loss, tb_dict, disp_dict
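A quick note on the sparse_shape line in __init__ above: grid_size comes back from the voxel generator in (x, y, z) order, so it is reversed to (z, y, x) and one extra cell is padded in z. A minimal numeric check, assuming the KITTI-style grid that matches the backbone shape comments below:

import numpy as np

grid_size = np.array([1600, 1408, 40])      # (x, y, z) voxel counts, assumed KITTI-style
sparse_shape = grid_size[::-1] + [1, 0, 0]  # reverse to (z, y, x), pad one cell in z
print(sparse_shape)                         # [41, 1408, 1600]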

2 MeanVoxelFeatureExtractor

A first rough feature for each voxel: average the coordinates and intensity of all points inside the voxel block.

class MeanVoxelFeatureExtractor(VoxelFeatureExtractor):
    def __init__(self, **kwargs):
        super().__init__()

    def get_output_feature_dim(self):
        return cfg.DATA_CONFIG.NUM_POINT_FEATURES['use']    # 4

    # features are obtained by averaging: the coordinates and intensity of all points in a voxel are averaged
    # averaging over all points in each voxel yields points_mean [N, 4]
    def forward(self, features, num_voxels, **kwargs):
        """
        :param features: (N, max_points_of_each_voxel, 3 + C) [batch_size * num_voxels, max_points_of_each_voxel, 4]
        :param num_voxels: (N)(batch_size * num_voxels)
        :param kwargs:
        :return:
        """
        points_mean = features[:, :, :].sum(dim=1, keepdim=False) / num_voxels.type_as(features).view(-1, 1)
        return points_mean.contiguous()
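A quick sanity check of the averaging with toy tensors (hypothetical values): padded zero-points do not distort the result, because the sum is divided by the real point count rather than by max_points_of_each_voxel.

import torch

# two voxels, up to 3 points each; the second voxel holds only 1 real point (the rest is zero padding)
features = torch.tensor([[[1., 2., 3., 0.5],
                          [3., 4., 5., 0.7],
                          [5., 6., 7., 0.9]],
                         [[2., 2., 2., 0.4],
                          [0., 0., 0., 0.0],
                          [0., 0., 0., 0.0]]])
num_voxels = torch.tensor([3, 1])

points_mean = features.sum(dim=1) / num_voxels.type_as(features).view(-1, 1)
print(points_mean)  # [[3.0, 4.0, 5.0, 0.7], [2.0, 2.0, 2.0, 0.4]]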

3 BackBone8x

Slightly different from the structure in the paper, but largely the same.

class BackBone8x(nn.Module):
    def __init__(self, input_channels):
        super().__init__()
        # fix a few default arguments of nn.BatchNorm1d
        norm_fn = partial(nn.BatchNorm1d, eps=1e-3, momentum=0.01)

        # one submanifold conv
        self.conv_input = spconv.SparseSequential(
            spconv.SubMConv3d(input_channels, 16, 3, padding=1, bias=False, indice_key='subm1'),
            norm_fn(16),
            nn.ReLU(),
        )
        block = self.post_act_block
        # another submanifold conv
        # [1600, 1408, 41]
        self.conv1 = spconv.SparseSequential(
            block(16, 16, 3, norm_fn=norm_fn, padding=1, indice_key='subm1'),
        )

        # spconv_stride2 + subm + subm
        self.conv2 = spconv.SparseSequential(
            # [1600, 1408, 41] -> [800, 704, 21]
            block(16, 32, 3, norm_fn=norm_fn, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'),
            block(32, 32, 3, norm_fn=norm_fn, padding=1, indice_key='subm2'),
            block(32, 32, 3, norm_fn=norm_fn, padding=1, indice_key='subm2'),
        )

        # spconv_stride2 + subm + subm
        self.conv3 = spconv.SparseSequential(
            # [800, 704, 21] -> [400, 352, 11]
            block(32, 64, 3, norm_fn=norm_fn, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'),
            block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm3'),
            block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm3'),
        )

        # spconv_stride2 + subm + subm
        self.conv4 = spconv.SparseSequential(
            # [400, 352, 11] -> [200, 176, 5]
            block(64, 64, 3, norm_fn=norm_fn, stride=2, padding=(0, 1, 1), indice_key='spconv4', conv_type='spconv'),
            block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm4'),
            block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm4'),
        )

        last_pad = 0 if cfg.DATA_CONFIG.VOXEL_GENERATOR.VOXEL_SIZE[-1] in [0.1, 0.2] else (1, 0, 0)

        # spconv_stride(2, 1, 1)
        self.conv_out = spconv.SparseSequential(
            # [200, 176, 5] -> [200, 176, 2]
            spconv.SparseConv3d(64, 128, (3, 1, 1), stride=(2, 1, 1), padding=last_pad,
                                bias=False, indice_key='spconv_down2'),
            norm_fn(128),
            nn.ReLU(),
        )

    def forward(self, input_sp_tensor, **kwargs):
        """
        :param voxel_features:  (N, C) N voxels, each with a C-dim feature
        :param coors:   (N, 4)  [batch_idx, z_idx, y_idx, x_idx], voxel grid coordinates of the N voxels; sparse_shape: (z_size, y_size, x_size)
        :param batch_size:
        :return:
        """
        x = self.conv_input(input_sp_tensor)    # [16, 4, [41, 1408, 1600]] --> [16, 16, [41, 1408, 1600]]

        x_conv1 = self.conv1(x)                 # [16, 16, [41, 1408, 1600]] --> [16, 16, [41, 1408, 1600]]
        x_conv2 = self.conv2(x_conv1)           # [16, 16, [41, 1408, 1600]] --> [16, 32, [21, 704, 800]]
        x_conv3 = self.conv3(x_conv2)           # [16, 32, [21, 704, 800]]--> [16, 64, [11, 352, 400]]
        x_conv4 = self.conv4(x_conv3)           # [16, 64, [11, 352, 400]] --> [16, 64, [5, 176, 200]]
        # unlike conv2d, the result of a sparse conv is not a single tensor: batch_size, channels and spatial shape are stored separately; out.dense() below combines them into a dense tensor
        # the input of a sparse conv works the same way: inputs and outputs behave more like a dict or tuple
        # note the difference between padded and unpadded convolutions
        # for detection head
        # [200, 176, 5] -> [200, 176, 2]
        out = self.conv_out(x_conv4)            # [16, 64, [5, 176, 200]] --> [16, 128, [2, 176, 200]]
        spatial_features = out.dense()          # convert to a dense tensor, [16, 128, 2, 176, 200]

        N, C, D, H, W = spatial_features.shape
        spatial_features = spatial_features.view(N, C * D, H, W)    # reshape into a 2D bird's-eye-view feature map

        ret = {'spatial_features': spatial_features}
        return ret
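post_act_block is used throughout the backbone above but is not shown in this excerpt. Based on how it is called, it plausibly looks like the sketch below; the actual helper in PCDet may differ in details (e.g. support for inverse convolutions):

    def post_act_block(self, in_channels, out_channels, kernel_size, indice_key,
                       stride=1, padding=0, conv_type='subm', norm_fn=None):
        # 'subm' keeps the input sparsity pattern unchanged; 'spconv' is a regular
        # sparse conv that can stride and therefore downsample the grid
        if conv_type == 'subm':
            conv = spconv.SubMConv3d(in_channels, out_channels, kernel_size,
                                     padding=padding, bias=False, indice_key=indice_key)
        elif conv_type == 'spconv':
            conv = spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=stride,
                                       padding=padding, bias=False, indice_key=indice_key)
        else:
            raise NotImplementedError
        return spconv.SparseSequential(conv, norm_fn(out_channels), nn.ReLU())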

4 RPN_head

This part also differs slightly from the paper: there the RPN head concatenates three branches, but the source code drops one of them and concatenates only two.

class AnchorHead(nn.Module):
    def __init__(self, grid_size, anchor_target_cfg):
        super().__init__()

        anchor_cfg = anchor_target_cfg.ANCHOR_GENERATOR
        anchor_generators = []

        self.num_class = len(cfg.CLASS_NAMES)
        # build a dedicated anchor_generator for each class and collect them in a list
        for cur_name in cfg.CLASS_NAMES:
            cur_cfg = None
            # fetch the cfg matching this class_name
            for a_cfg in anchor_cfg:
                if a_cfg['class_name'] == cur_name:
                    cur_cfg = a_cfg
                    break
            assert cur_cfg is not None, 'Not found anchor config: %s' % cur_name
            # initialize an anchor_generator from this class's cfg parameters
            anchor_generator = AnchorGeneratorRange(
                anchor_ranges=cur_cfg['anchor_range'],
                sizes=cur_cfg['sizes'],
                rotations=cur_cfg['rotations'],
                class_name=cur_cfg['class_name'],
                match_threshold=cur_cfg['matched_threshold'],
                unmatch_threshold=cur_cfg['unmatched_threshold']
            )
            anchor_generators.append(anchor_generator)

        # ResidualCoder
        # initialize the box coder, used to convert gt_boxes into regression targets
        self.box_coder = getattr(box_coder_utils, anchor_target_cfg.BOX_CODER)()

        # target_assigner bundles a series of functions: the anchor_generators plus target generation
        self.target_assigner = TargetAssigner(
            anchor_generators=anchor_generators,
            pos_fraction=anchor_target_cfg.SAMPLE_POS_FRACTION,
            sample_size=anchor_target_cfg.SAMPLE_SIZE,
            region_similarity_fn_name=anchor_target_cfg.REGION_SIMILARITY_FN,
            box_coder=self.box_coder
        )
        # each location can hold several anchors, differing in rotation and size
        self.num_anchors_per_location = self.target_assigner.num_anchors_per_location
        self.box_code_size = self.box_coder.code_size   # 7
        
        # grid_size is the original voxel grid; DOWNSAMPLED_FACTOR is the downsampling rate; feature_map_size is the resolution of the network output, which is also where the anchors are generated
        feature_map_size = grid_size[:2] // anchor_target_cfg.DOWNSAMPLED_FACTOR    # [176,200]
        feature_map_size = [*feature_map_size, 1][::-1]                             # [1,200,176]
        # ret:{       'anchors': anchors,
        #             'matched_thresholds': matched_thresholds,
        #             'unmatched_thresholds': unmatched_thresholds
        #         }
        ret = self.target_assigner.generate_anchors(feature_map_size)
        # anchors_dict stores the anchors separately for each class; its content is essentially the same as ret above
        anchors_dict = self.target_assigner.generate_anchors_dict(feature_map_size)
        anchors = ret['anchors'].reshape([-1, 7])
        self.anchor_cache = {
            'anchors': anchors,
            'anchors_dict': anchors_dict,
        }

        self.forward_ret_dict = None
        # initialize the three loss functions
        self.build_losses(cfg.MODEL.LOSSES)

    def build_losses(self, losses_cfg):
        # loss function definition
        self.cls_loss_func = loss_utils.SigmoidFocalClassificationLoss(alpha=0.25, gamma=2.0)
        code_weights = losses_cfg.LOSS_WEIGHTS['code_weights']

        rpn_code_weights = code_weights[3:7] if losses_cfg.RPN_REG_LOSS == 'bin-based' else code_weights
        self.reg_loss_func = loss_utils.WeightedSmoothL1LocalizationLoss(sigma=3.0, code_weights=rpn_code_weights)
        self.dir_loss_func = loss_utils.WeightedSoftmaxClassificationLoss()

    def assign_targets(self, gt_boxes):
        """
        :param gt_boxes: (B, N, 8)
        :return:
        """
        gt_boxes = gt_boxes.cpu().numpy()
        batch_size = gt_boxes.shape[0]
        # class_label
        gt_classes = gt_boxes[:, :, 7]
        # bbox_label
        gt_boxes = gt_boxes[:, :, :7]
        targets_dict_list = []
        for k in range(batch_size):
            cur_gt = gt_boxes[k]
            cnt = cur_gt.__len__() - 1
            while cnt > 0 and cur_gt[cnt].sum() == 0:
                cnt -= 1
            cur_gt = cur_gt[:cnt + 1]

            cur_gt_classes = gt_classes[k][:cnt + 1]
            cur_gt_names = np.array(cfg.CLASS_NAMES)[cur_gt_classes.astype(np.int32) - 1]
            # a target_dict for each class
            cur_target_dict = self.target_assigner.assign_v2(
                anchors_dict=self.anchor_cache['anchors_dict'], # per-class anchors and thresholds
                gt_boxes=cur_gt,
                gt_classes=cur_gt_classes,
                gt_names=cur_gt_names
            )
            targets_dict_list.append(cur_target_dict)

        # targets_dict_list is a list of per-frame dicts; below, entries with the same key are stacked,
        # turning the list of small dicts into one big dict of batched arrays (see the toy example at the end of this section)
        # in other words, the per-class target_dicts are merged into targets for the whole batch
        targets_dict = {}
        for key in targets_dict_list[0].keys():
            val = np.stack([x[key] for x in targets_dict_list], axis=0)
            targets_dict[key] = val

        return targets_dict

    @staticmethod
    def add_sin_difference(boxes1, boxes2, dim=6):
        assert dim != -1
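        # Regress sin(pred - target) instead of the raw angle difference:
        # sin(a - b) = sin(a)cos(b) - cos(a)sin(b), applied to the angle channel below.
        # This makes headings that differ by pi indistinguishable in the regression
        # loss, which is why the separate direction classifier exists.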
        rad_pred_encoding = torch.sin(boxes1[..., dim:dim+1]) * torch.cos(boxes2[..., dim:dim+1])
        rad_tg_encoding = torch.cos(boxes1[..., dim:dim+1]) * torch.sin(boxes2[..., dim:dim+1])
        boxes1 = torch.cat([boxes1[..., :dim], rad_pred_encoding, boxes1[..., dim+1:]], dim=-1)
        boxes2 = torch.cat([boxes2[..., :dim], rad_tg_encoding, boxes2[..., dim+1:]], dim=-1)
        return boxes1, boxes2

    @staticmethod
    def get_direction_target(anchors, reg_targets, one_hot=True, dir_offset=0, num_bins=2):
        batch_size = reg_targets.shape[0]
        anchors = anchors.view(batch_size, -1, anchors.shape[-1])
        # the true rotation angle
        rot_gt = reg_targets[..., 6] + anchors[..., 6]
        # limit to the range [0, 2*pi)
        offset_rot = common_utils.limit_period_torch(rot_gt - dir_offset, 0, 2 * np.pi)
        # convert the heading into a bin index
        dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long()
        # clamp to a valid bin (0 or 1 when num_bins=2)
        dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)

        if one_hot:
            dir_targets = torch.zeros(*list(dir_cls_targets.shape), num_bins, dtype=anchors.dtype,
                                      device=dir_cls_targets.device)
            dir_targets.scatter_(-1, dir_cls_targets.unsqueeze(dim=-1).long(), 1.0)
            dir_cls_targets = dir_targets
        return dir_cls_targets

    def get_loss(self, forward_ret_dict=None):
        loss_cfgs = cfg.MODEL.LOSSES
        # forward_ret_dict holds the results computed in forward()
        forward_ret_dict = self.forward_ret_dict if forward_ret_dict is None else forward_ret_dict
        anchors = forward_ret_dict['anchors']
        box_preds = forward_ret_dict['box_preds']
        cls_preds = forward_ret_dict['cls_preds']
        box_dir_cls_preds = forward_ret_dict['dir_cls_preds']
        box_cls_labels = forward_ret_dict['box_cls_labels']
        box_reg_targets = forward_ret_dict['box_reg_targets']
        batch_size = int(box_preds.shape[0])

        anchors = anchors.view(1, -1, anchors.shape[-1]).repeat(batch_size, 1, 1)

        # rpn head losses
        # per-anchor loss weights
        cared = box_cls_labels >= 0  # [batch_size, num_anchors]
        positives = box_cls_labels > 0  # [batch_size, num_anchors]
        negatives = box_cls_labels == 0 # [batch_size, num_anchors]
        negative_cls_weights = negatives * 1.0  # [batch_size, num_anchors]
        cls_weights = (negative_cls_weights + 1.0 * positives).float()  # [batch_size, num_anchors]
        reg_weights = positives.float() # only anchors with box_cls_labels > 0 contribute to the regression loss; all other weights are 0

        # normalize
        pos_normalizer = positives.sum(1, keepdim=True).float()
        reg_weights /= torch.clamp(pos_normalizer, min=1.0)
        cls_weights /= torch.clamp(pos_normalizer, min=1.0)
        cls_targets = box_cls_labels * cared.type_as(box_cls_labels)
        cls_targets = cls_targets.unsqueeze(dim=-1)

        num_class = self.num_class

        cls_targets = cls_targets.squeeze(dim=-1)
        one_hot_targets = torch.zeros(
            *list(cls_targets.shape), num_class + 1, dtype=box_preds.dtype, device=cls_targets.device
        )
        one_hot_targets.scatter_(-1, cls_targets.unsqueeze(dim=-1).long(), 1.0)

        if cfg.MODEL.RPN.RPN_HEAD.ARGS['encode_background_as_zeros']:
            # treat the background as class 0
            # box_cls_labels assigns label 0 to the background; the first slot of the last dim of one_hot_targets is the background, so it is dropped
            cls_preds = cls_preds.view(batch_size, -1, num_class)
            one_hot_targets = one_hot_targets[..., 1:]
        else:
            cls_preds = cls_preds.view(batch_size, -1, num_class + 1)

        loss_weights_dict = loss_cfgs.LOSS_WEIGHTS
        cls_loss = self.cls_loss_func(cls_preds, one_hot_targets, weights=cls_weights)  # [N, M][batch_size, num_anchors, num_classes]
        cls_loss_reduced = cls_loss.sum() / batch_size
        # weight of the cls loss within the total loss
        cls_loss_reduced = cls_loss_reduced * loss_weights_dict['rpn_cls_weight']

        box_preds = box_preds.view(batch_size, -1, box_preds.shape[-1] // self.num_anchors_per_location)
        if loss_cfgs.RPN_REG_LOSS == 'smooth-l1':
            # sin(a - b) = sin(a)cos(b) - cos(a)sin(b)
            box_preds_sin, reg_targets_sin = self.add_sin_difference(box_preds, box_reg_targets)
            loc_loss = self.reg_loss_func(box_preds_sin, reg_targets_sin, weights=reg_weights)  # [N, M][batch_size, num_anchors]
            loc_loss_reduced = loc_loss.sum() / batch_size
        else:
            raise NotImplementedError
        # weight of the loc loss within the total loss
        loc_loss_reduced = loc_loss_reduced * loss_weights_dict['rpn_loc_weight']

        rpn_loss = loc_loss_reduced + cls_loss_reduced

        tb_dict = {
            'rpn_loss_loc': loc_loss_reduced.item(),
            'rpn_loss_cls': cls_loss_reduced.item()
        }
        if box_dir_cls_preds is not None:
            dir_targets = self.get_direction_target(
                anchors, box_reg_targets,
                dir_offset=cfg.MODEL.RPN.RPN_HEAD.ARGS['dir_offset'],
                num_bins=cfg.MODEL.RPN.RPN_HEAD.ARGS['num_direction_bins']
            )

            dir_logits = box_dir_cls_preds.view(batch_size, -1, cfg.MODEL.RPN.RPN_HEAD.ARGS['num_direction_bins'])
            # again, only label > 0 contributes, and the weights are normalized
            weights = positives.type_as(dir_logits)
            weights /= torch.clamp(weights.sum(-1, keepdim=True), min=1.0)
            # compute the direction loss
            dir_loss = self.dir_loss_func(dir_logits, dir_targets, weights=weights)
            dir_loss = dir_loss.sum() / batch_size
            dir_loss = dir_loss * loss_weights_dict['rpn_dir_weight']
            rpn_loss += dir_loss
            tb_dict['rpn_loss_dir'] = dir_loss.item()

        tb_dict['rpn_loss'] = rpn_loss.item()
        # rpn_loss is the total loss; tb_dict is the loss log
        return rpn_loss, tb_dict


class RPNV2(AnchorHead):
    def __init__(self, num_class, args, anchor_target_cfg, grid_size, **kwargs):
        super().__init__(grid_size=grid_size, anchor_target_cfg=anchor_target_cfg)  # this initializes the AnchorHead defined above
        self._use_direction_classifier = args['use_direction_classifier']   # True
        self._concat_input = args['concat_input']                           # False
        assert len(args['layer_strides']) == len(args['layer_nums'])
        assert len(args['num_filters']) == len(args['layer_nums'])
        assert len(args['num_upsample_filters']) == len(args['layer_nums'])

        # partial: fix some of a function's arguments so later calls don't need to pass them
        if args['use_norm']:
            BatchNorm2d = partial(nn.BatchNorm2d, eps=1e-3, momentum=0.01)
            Conv2d = partial(nn.Conv2d, bias=False)
            ConvTranspose2d = partial(nn.ConvTranspose2d, bias=False)
        else:
            BatchNorm2d = Empty
            Conv2d = partial(nn.Conv2d, bias=True)
            ConvTranspose2d = partial(nn.ConvTranspose2d, bias=True)

        in_filters = [args['num_input_features'], *args['num_filters'][:-1]]    # [256, 256, 128]
        blocks = []
        deblocks = []
        # ARGS: {
        #                 'use_norm': True,
        #                 'concat_input': False,
        #                 'num_input_features': 256,
        #                 'layer_nums': [5, 5],
        #                 'layer_strides': [1, 2],
        #                 'num_filters': [128, 256],
        #                 'upsample_strides': [1, 2],
        #                 'num_upsample_filters': [256, 256],
        #                 'encode_background_as_zeros': True,
        #
        #                 'use_direction_classifier': True,
        #                 'num_direction_bins': 2,
        #                 'dir_offset': 0.78539,
        #                 'dir_limit_offset': 0.0,
        #                 'use_binary_dir_classifier': False
        #             }
        # [5, 5]
        for i, layer_num in enumerate(args['layer_nums']):
            # i = 0: in_filters[i] = 256, args['num_filters'][i] = 128, stride=args['layer_strides'][i] = 1
            # i = 1: in_filters[i] = 256, args['num_filters'][i] = 128, stride=args['layer_strides'][i] = 2
            block = Sequential(
                nn.ZeroPad2d(1),
                Conv2d(in_filters[i], args['num_filters'][i], 3, stride=args['layer_strides'][i]),
                BatchNorm2d(args['num_filters'][i]),
                nn.ReLU(),
            )
            # i = 0: append 5 x Conv2d(128, 128, 3, p=1)
            # i = 1: append 5 x Conv2d(256, 256, 3, p=1)
            for j in range(layer_num):
                block.add(Conv2d(args['num_filters'][i], args['num_filters'][i], 3, padding=1))
                block.add(BatchNorm2d(args['num_filters'][i]))
                block.add(nn.ReLU())
            blocks.append(block)
            # i = 0: ConvTranspose2d(128, 256, 1)
            # i = 1: ConvTranspose2d(256, 256, 2)
            deblock = Sequential(
                ConvTranspose2d(
                    args['num_filters'][i], args['num_upsample_filters'][i], args['upsample_strides'][i],
                    stride=args['upsample_strides'][i]
                ),
                BatchNorm2d(args['num_upsample_filters'][i]),
                nn.ReLU(),
            )
            deblocks.append(deblock)
        # blocks = [Conv2d(256,128,s=1)+Conv2d(128,128)*5, Conv2d(256,256,s=2)+Conv2d*5]
        # deblock = [ConvTranspose2d(128, 256, s=1),ConvTranspose2d(256, 256, s=2)]
        c_in = sum(args['num_upsample_filters'])    # 256+256=512
        if self._concat_input:  # False
            c_in += args['num_input_features']
        
        # an extra deblock; not present in the current cfg
        if len(args['upsample_strides']) > len(args['num_filters']):
            deblock = Sequential(
                ConvTranspose2d(c_in, c_in, args['upsample_strides'][-1], stride=args['upsample_strides'][-1]),
                BatchNorm2d(c_in),
                nn.ReLU(),
            )
            deblocks.append(deblock)
        self.blocks = nn.ModuleList(blocks)
        self.deblocks = nn.ModuleList(deblocks)

        # every location has num_anchors_per_location anchors and num_class classes
        if args['encode_background_as_zeros']:
            num_cls = self.num_anchors_per_location * num_class
        else:
            num_cls = self.num_anchors_per_location * (num_class + 1)
        # 1x1 conv producing the classification predictions
        self.conv_cls = nn.Conv2d(c_in, num_cls, 1)
        # 1x1 conv producing the box regression predictions
        reg_channels = self.num_anchors_per_location * self.box_code_size
        self.conv_box = nn.Conv2d(c_in, reg_channels, 1)
        if args['use_direction_classifier']:
            self.conv_dir_cls = nn.Conv2d(c_in, self.num_anchors_per_location * args['num_direction_bins'], 1)
            
        # roughly the last initialization step: run the torch weight initialization
        self.init_weights()

    def init_weights(self):
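        # Focal-loss style prior initialization (the RetinaNet trick): setting the
        # cls bias to -log((1 - pi) / pi) makes the initial sigmoid output ~pi (0.01),
        # which keeps early training stable when nearly all anchors are background.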
        pi = 0.01
        nn.init.constant_(self.conv_cls.bias, -np.log((1 - pi) / pi))

    def forward(self, x_in, bev=None, **kwargs):
        ups = []
        x = x_in
        ret_dict = {}
        # blocks contains two blocks
        for i in range(len(self.blocks)):
            x = self.blocks[i](x)
            # stride relative to the input
            stride = int(x_in.shape[2] / x.shape[2])
            ret_dict['spatial_features_%dx' % stride] = x
            # after the deconv, append to ups
            ups.append(self.deblocks[i](x))
        # one branch: conv stride 1, deconv factor 1 -> one result
        # the other branch: conv stride 2, deconv factor 2 -> another result
        # the two results are concatenated
        if self._concat_input:
            ups.append(x_in)
        # concatenate the outputs of the two blocks
        if len(ups) > 1:
            x = torch.cat(ups, dim=1)
        else:
            x = ups[0]
        if len(self.deblocks) > len(self.blocks):
            x = self.deblocks[-1](x)
        ret_dict['spatial_features_last'] = x

        # classification predictions and box regression predictions
        box_preds = self.conv_box(x)
        cls_preds = self.conv_cls(x)
        # [N, C, y(H), x(W)]
        box_preds = box_preds.permute(0, 2, 3, 1).contiguous()
        cls_preds = cls_preds.permute(0, 2, 3, 1).contiguous()
        ret_dict.update({
            'box_preds': box_preds,
            'cls_preds': cls_preds,
        })
        # direction classification predictions
        if self._use_direction_classifier:
            dir_cls_preds = self.conv_dir_cls(x)
            dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).contiguous()
            ret_dict['dir_cls_preds'] = dir_cls_preds

        ret_dict['anchors'] = torch.from_numpy(self.anchor_cache['anchors']).cuda()
        if self.training:
            # generate the targets, i.e. what the network outputs are compared
            # against when computing the loss -- the hardest part to follow
            targets_dict = self.assign_targets(
                gt_boxes=kwargs['gt_boxes'],
            )

            ret_dict.update({
                'box_cls_labels': torch.from_numpy(targets_dict['labels']).cuda(),
                'box_reg_targets': torch.from_numpy(targets_dict['bbox_targets']).cuda(),
                'reg_src_targets': torch.from_numpy(targets_dict['bbox_src_targets']).cuda(),
                'reg_weights': torch.from_numpy(targets_dict['bbox_outside_weights']).cuda(),
            })

        self.forward_ret_dict = ret_dict
        return ret_dict
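To close out this section, the list-of-dicts merge at the end of assign_targets is easy to misread, so here is a toy illustration with made-up values (only the 'labels' and 'bbox_targets' keys are shown):

import numpy as np

# one small dict per frame in the batch, as produced inside assign_targets
targets_dict_list = [
    {'labels': np.array([1, 0, 2]), 'bbox_targets': np.zeros((3, 7))},
    {'labels': np.array([0, 1, 0]), 'bbox_targets': np.ones((3, 7))},
]

# stack each key across the batch: list of per-frame dicts -> one dict of batched arrays
targets_dict = {key: np.stack([x[key] for x in targets_dict_list], axis=0)
                for key in targets_dict_list[0]}

print(targets_dict['labels'].shape)        # (2, 3)    [batch_size, num_anchors]
print(targets_dict['bbox_targets'].shape)  # (2, 3, 7)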

5 Loss computation

Loss computation is one of the more important parts of a neural network.
First, be clear about the four ingredients of a loss: ① predictions ② targets ③ weights ④ the loss function

  1. Predictions: the direct outputs of the network; per the RPN head above, there are three: box_preds, cls_preds, dir_cls_preds
  2. Targets: the gt_box labels cannot be used directly, since they live in the point cloud coordinate frame and are not in anchor form, so they do not line up with the network outputs; the conversion goes through the self.target_assigner machinery mentioned above (the most complex and headache-inducing part)
  3. Weights: different tasks and loss functions need different weights; for example, the box regression loss uses weights to mark which boxes actually count, and the cls loss may weight different classes differently
  4. Loss functions: there are many kinds; the ones used in this codebase are
def build_losses(self, losses_cfg):
    # loss function definition
    self.cls_loss_func = loss_utils.SigmoidFocalClassificationLoss(alpha=0.25, gamma=2.0)
    code_weights = losses_cfg.LOSS_WEIGHTS['code_weights']

    rpn_code_weights = code_weights[3:7] if losses_cfg.RPN_REG_LOSS == 'bin-based' else code_weights
    self.reg_loss_func = loss_utils.WeightedSmoothL1LocalizationLoss(sigma=3.0, code_weights=rpn_code_weights)
    self.dir_loss_func = loss_utils.WeightedSoftmaxClassificationLoss()

The names alone reveal which loss functions are used; reading the loss source code is tedious, so it can be easier to look up the underlying math directly.
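For reference, here is the math behind SigmoidFocalClassificationLoss in a minimal, self-contained form. This is a sketch of the standard focal loss, not the exact implementation in loss_utils:

import torch
import torch.nn.functional as F

def sigmoid_focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    """Per-element focal loss: FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)."""
    prob = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p_t = prob * targets + (1 - prob) * (1 - targets)        # probability of the true class
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)  # class-balance factor
    return alpha_t * (1 - p_t) ** gamma * ce                 # down-weight easy examples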
In fact, all of the above is only a small slice of the whole network; the classes and functions it calls cover nearly everything in the codebase, so work through them slowly.

Reference blog: point pillars.
