3D Point Cloud Learning: SA-SSD (Part 3) Source Code Annotations

1 Reference Blogs

While annotating the source code I mainly relied on several posts by another author, which helped a great deal; many thanks to the author.
Reference blog: 简析SA-SSD在预处理训练评估的框架.
Author's blog: 小白科研笔记.
Since both the code volume and the amount of annotation are large, I only excerpt a few pieces of code that I either met for the first time or found inspiring.

2 Source Code Annotations

1 create_data.py

SA-SSD shares a lot of its data handling with PointPillars, PointRCNN and SECOND; the author also points this out on the repository page, and the create_data part is almost identical across them.
After reading it, the purpose of this script seems to be generating the data used for augmentation together with kitti.py; I have not found any other use yet.
Reference blog: 简析SA-SSD在预处理训练评估的框架.
Reference blog: Pointpillars --creat_date篇.
create_data.py consists of three main parts:

create_kitti_info_file('E:/KITTI/detection')
create_reduced_point_cloud('E:/KITTI/detection')

create_groundtruth_database(data_path='E:/KITTI/detection', \
                            info_path='E:/KITTI/detection/kitti_infos_trainval.pkl', \
                            db_info_save_path='E:/KITTI/detection/kitti_dbinfos_trainval.pkl')

Part 1: create_kitti_info_file

# Collect the base info and label info of every frame of the dataset into one file; essentially kitti.get_kitti_image_info plus num_points_in_gt
def create_kitti_info_file(data_path,
                           save_path=None,
                           relative_path=True):
    # The txt files hold the frame indices of the train / val / test splits
    train_img_ids = _read_imageset_file(osp.join(data_path, "ImageSets/train.txt"))
    val_img_ids = _read_imageset_file(osp.join(data_path, "ImageSets/val.txt"))
    trainval_img_ids = _read_imageset_file(osp.join(data_path, "ImageSets/trainval.txt"))
    test_img_ids = _read_imageset_file(osp.join(data_path, "ImageSets/test.txt"))

    print("Generate info. this may take several minutes.")
    if save_path is None:
        save_path = pathlib.Path(data_path)
    else:
        save_path = pathlib.Path(save_path)
    # image_info is a dict containing 'image_idx', pointcloud_num_features (4), velodyne_path, img_path, img_shape and annos.
    # It also holds the calibration parameters: the camera intrinsics calib/P0 to calib/P3, and the lidar-camera extrinsics calib/R0_rect and calib/Tr_velo_to_cam.
    kitti_infos_train = kitti.get_kitti_image_info(
        data_path,
        training=True,
        velodyne=True,
        calib=True,
        image_ids=train_img_ids,
        relative_path=relative_path)
    # Adds num_points_in_gt to each anno, though it does not seem to be used later
    _calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path)
    filename = save_path / 'kitti_infos_train.pkl'
    print(f"Kitti info train file is saved to {filename}")
    with open(filename, 'wb') as f:
        # Serialize the object into the file; the file must be opened in binary write mode ('wb')
        pickle.dump(kitti_infos_train, f)

The main job here is to consolidate the velodyne, image, label and calibration data of every frame into a single pkl file, which makes it easy for kitti.py in the data loader to use later. Several of the functions it calls deserve detailed annotation and are covered below.
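Before digging into those functions, a quick sanity check of the generated file may help; a minimal sketch, assuming the pkl was created at the path used in the calls above:

import pickle

with open('E:/KITTI/detection/kitti_infos_train.pkl', 'rb') as f:
    infos = pickle.load(f)          # a list with one dict per frame

info = infos[0]
print(info['image_idx'], info['velodyne_path'], info['img_shape'])
print(info['calib/P2'].shape, info['calib/Tr_velo_to_cam'].shape)   # (4, 4) each, since extend_matrix=True
print(info['annos'].keys())         # name, truncated, ..., difficulty, num_points_in_gt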
get_kitti_image_info

def get_kitti_image_info(path,
                         training=True,
                         label_info=True,
                         velodyne=False,
                         calib=False,
                         image_ids=7481,
                         extend_matrix=True,
                         num_worker=8,
                         relative_path=True,
                         with_imageshape=True):
    # image_infos = []
    root_path = pathlib.Path(path)
    if not isinstance(image_ids, list):
        image_ids = list(range(image_ids))
    # Start filling image_info
    def map_func(idx):
        # Write the frame index and the number of point cloud features
        image_info = {'image_idx': idx, 'pointcloud_num_features': 4}
        annotations = None
        # Write the point cloud path
        if velodyne:
            image_info['velodyne_path'] = get_velodyne_path(
                idx, path, training, relative_path)
        # Write the image path
        image_info['img_path'] = get_image_path(idx, path, training,
                                                relative_path)

        if with_imageshape:
            img_path = image_info['img_path']
            if relative_path:
                img_path = str(root_path / img_path)
            # Write the image shape
            image_info['img_shape'] = np.array(
                io.imread(img_path).shape[:2], dtype=np.int32)
        if label_info:
            # Locate the detection label file
            label_path = get_label_path(idx, path, training, relative_path)
            if relative_path:
                # If relative, prepend the root to get the absolute path
                label_path = str(root_path / label_path)
            # annotations is a dict holding the detection annotations
            # Read the detection label info
            annotations = get_label_anno(label_path)
        if calib:
            calib_path = get_calib_path(
                idx, path, training, relative_path=False)
            with open(calib_path, 'r') as f:
                lines = f.readlines()
            P0 = np.array(
                [float(info) for info in lines[0].split(' ')[1:13]]).reshape(
                    [3, 4])
            P1 = np.array(
                [float(info) for info in lines[1].split(' ')[1:13]]).reshape(
                    [3, 4])
            P2 = np.array(
                [float(info) for info in lines[2].split(' ')[1:13]]).reshape(
                    [3, 4])
            P3 = np.array(
                [float(info) for info in lines[3].split(' ')[1:13]]).reshape(
                    [3, 4])
            if extend_matrix:
                P0 = _extend_matrix(P0)
                P1 = _extend_matrix(P1)
                P2 = _extend_matrix(P2)
                P3 = _extend_matrix(P3)
            # Read and store the camera intrinsics
            image_info['calib/P0'] = P0
            image_info['calib/P1'] = P1
            image_info['calib/P2'] = P2
            image_info['calib/P3'] = P3
            R0_rect = np.array([
                float(info) for info in lines[4].split(' ')[1:10]
            ]).reshape([3, 3])
            if extend_matrix:
                rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
                rect_4x4[3, 3] = 1.
                rect_4x4[:3, :3] = R0_rect
            else:
                rect_4x4 = R0_rect
            # Store the rectification matrix
            image_info['calib/R0_rect'] = rect_4x4
            Tr_velo_to_cam = np.array([
                float(info) for info in lines[5].split(' ')[1:13]
            ]).reshape([3, 4])
            Tr_imu_to_velo = np.array([
                float(info) for info in lines[6].split(' ')[1:13]
            ]).reshape([3, 4])
            if extend_matrix:
                Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
                Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo)
            image_info['calib/Tr_velo_to_cam'] = Tr_velo_to_cam
            image_info['calib/Tr_imu_to_velo'] = Tr_imu_to_velo
        if annotations is not None:
            image_info['annos'] = annotations
            # annos["difficulty"] = 0(Easy), 1(Mid),2(Hard)
            # Easy: Min. bounding box height: 40 Px, Max. occlusion level: Fully visible, Max. truncation: 15 %
            # Moderate: Min. bounding box height: 25 Px, Max. occlusion level: Partly occluded, Max. truncation: 30 %
            # Hard: Min. bounding box height: 25 Px, Max. occlusion level: Difficult to see, Max. truncation: 50 %
            add_difficulty_to_annos(image_info)  # adds a new field: difficulty
        return image_info

It in turn calls get_label_anno:

# Collect the label info into a single anno dict
def get_label_anno(label_path):
    annotations = {}
    annotations.update({
        'name': [],         # object class
        'truncated': [],    # truncation level, 0-1
        'occluded': [],     # occlusion: fully visible, partly occluded, or largely occluded
        'alpha': [],        # observation angle
        'bbox': [],         # 2D box in the image: left, top, right, bottom
        'dimensions': [],   # 3D box dimensions: length, width, height
        'location': [],     # 3D box center: x, y, z
        'rotation_y': []    # yaw angle of the 3D box
    })
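For reference, one line of a KITTI label_2 file looks like the example below; a minimal sketch of how it maps onto these fields (not the real get_label_anno, which parses all lines at once and also adds index and group_ids):

line = "Car 0.00 0 -1.58 587.01 173.33 614.12 200.12 1.65 1.67 3.64 -0.65 1.71 46.70 -1.59"
v = line.split(' ')
anno = {
    'name': v[0],                               # object class
    'truncated': float(v[1]),                   # truncation, 0-1
    'occluded': int(v[2]),                      # occlusion level
    'alpha': float(v[3]),                       # observation angle
    'bbox': [float(x) for x in v[4:8]],         # left, top, right, bottom
    'dimensions': [float(x) for x in v[8:11]],  # h, w, l in the raw file (get_label_anno reorders them)
    'location': [float(x) for x in v[11:14]],   # x, y, z in camera coordinates
    'rotation_y': float(v[14]),                 # yaw
}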

Part 2: create_reduced_point_cloud

# Remove the points outside the camera field of view and save the reduced point clouds to file
def create_reduced_point_cloud(data_path,
                               train_info_path=None,
                               val_info_path=None,
                               test_info_path=None,
                               save_path=None,
                               with_back=False):
    # Read the info files generated in Part 1
    if train_info_path is None:
        train_info_path = pathlib.Path(data_path) / 'kitti_infos_train.pkl'
    if val_info_path is None:
        val_info_path = pathlib.Path(data_path) / 'kitti_infos_val.pkl'
    if test_info_path is None:
        test_info_path = pathlib.Path(data_path) / 'kitti_infos_test.pkl'
    # Using the calibration, keep only the points inside the camera field of view, so that every remaining point projects onto the image
    _create_reduced_point_cloud(data_path, train_info_path, save_path)
    _create_reduced_point_cloud(data_path, val_info_path, save_path)
    _create_reduced_point_cloud(data_path, test_info_path, save_path)
    if with_back:
        _create_reduced_point_cloud(
            data_path, train_info_path, save_path, back=True)
        _create_reduced_point_cloud(
            data_path, val_info_path, save_path, back=True)
        _create_reduced_point_cloud(
            data_path, test_info_path, save_path, back=True)

This part is fairly simple and does not seem to be used later. It first reads the pkl files produced in Part 1, then uses the point cloud, the image and the calibration to drop the points outside the camera field of view (outside the image), and saves the result to files whose names carry the _reduced suffix.
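The idea behind dropping points outside the camera view is a standard lidar-to-image projection; a simplified sketch of that idea (the real remove_outside_points builds a camera frustum instead, but the effect is the same):

import numpy as np

def keep_points_in_image(points, rect, Trv2c, P2, img_shape):
    # points: (N, 4) lidar points; rect, Trv2c, P2: 4x4 calibration matrices; img_shape: (h, w)
    n = points.shape[0]
    pts_hom = np.hstack([points[:, :3], np.ones((n, 1), dtype=points.dtype)])  # homogeneous lidar coords
    cam = pts_hom @ (rect @ Trv2c).T        # lidar -> rectified camera coordinates
    img = cam @ P2.T                        # camera -> image plane (homogeneous)
    u, v = img[:, 0] / img[:, 2], img[:, 1] / img[:, 2]
    h, w = img_shape
    mask = (cam[:, 2] > 0) & (u >= 0) & (u < w) & (v >= 0) & (v < h)  # in front of the camera and inside the image
    return points[mask]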
Part 3: create_groundtruth_database

# Build the ground-truth database. "Ground truth" here means point clouds that contain only objects of specific classes (cars, pedestrians, etc.) together with their base info and label info, used for data augmentation on the KITTI data
def create_groundtruth_database(data_path,
                                info_path=None,
                                used_classes=None,
                                database_save_path=None,
                                db_info_save_path=None,
                                relative_path=True,
                                lidar_only=False,
                                bev_only=False,
                                coors_range=None):
    root_path = pathlib.Path(data_path)
    # Open the train pkl file generated in Part 1
    if info_path is None:
        info_path = root_path / 'kitti_infos_train.pkl'
    # Path where the cropped object point clouds are saved
    if database_save_path is None:
        database_save_path = root_path / 'gt_database'
    else:
        database_save_path = pathlib.Path(database_save_path)
    # Path where the db_info file is saved, similar in spirit to Part 1
    if db_info_save_path is None:
        db_info_save_path = root_path / "kitti_dbinfos_train.pkl"
    database_save_path.mkdir(parents=True, exist_ok=True)
    with open(info_path, 'rb') as f:
        kitti_infos = pickle.load(f)
    all_db_infos = {}
    # Get all detection classes
    if used_classes is None:
        used_classes = list(kitti.get_classes())
        # class_to_label = {
        #         'Car': 0,
        #         'Pedestrian': 1,
        #         'Cyclist': 2,
        #         'Van': 3,
        #         'Person_sitting': 4,
        #         'Truck': 5,
        #         'Tram': 6,
        #         'Misc': 7,
        #         'DontCare': -1,
        #     }
        used_classes.pop(used_classes.index('DontCare'))    # drop the DontCare class
    for name in used_classes:   # reserve a slot for every valid class
        all_db_infos[name] = []
    group_counter = 0
    for info in prog_bar(kitti_infos):
        # The reduced clouds from Part 2 are not used here; the full original point clouds are
        velodyne_path = info['velodyne_path']
        if relative_path:
            velodyne_path = str(root_path / velodyne_path)
        num_features = 4
        if 'pointcloud_num_features' in info:
            num_features = info['pointcloud_num_features']
        # Load the point cloud, N x 4, and the other data
        points = np.fromfile(
            velodyne_path, dtype=np.float32, count=-1).reshape([-1, num_features])
        image_idx = info["image_idx"]
        rect = info['calib/R0_rect']
        P2 = info['calib/P2']
        Trv2c = info['calib/Tr_velo_to_cam']
        if not lidar_only:  # lidar_only is False, so the points outside the image are removed here
            points = remove_outside_points(points, rect, Trv2c, P2,
                                                      info["img_shape"])

        annos = info["annos"]
        names = annos["name"]
        bboxes = annos["bbox"]
        difficulty = annos["difficulty"]
        gt_idxes = annos["index"]
        num_obj = np.sum(annos["index"] >= 0)
        # Recover the 3D label boxes in camera coordinates from the annos in info
        rbbox_cam = kitti.anno_to_rbboxes(annos)[:num_obj]
        # Convert the camera-frame boxes into 3D boxes in lidar coordinates
        rbbox_lidar = box_camera_to_lidar(rbbox_cam, rect, Trv2c)
        if bev_only:  # set z and h to limits
            # Pin the z coordinate to the lower bound of the range and set the box height to the full z extent of the range
            assert coors_range is not None
            rbbox_lidar[:, 2] = coors_range[2]
            rbbox_lidar[:, 5] = coors_range[5] - coors_range[2]

        group_dict = {}
        group_ids = np.full([bboxes.shape[0]], -1, dtype=np.int64)
        if "group_ids" in annos:
            group_ids = annos["group_ids"]
        else:
            group_ids = np.arange(bboxes.shape[0], dtype=np.int64)
        # point_indices is N x m for N points and m boxes; entry (i, j) says whether point i lies inside box j
        point_indices = points_in_rbbox(points, rbbox_lidar)
        for i in range(num_obj):
            # Output file name: one file per box, per class, per frame, holding the points that fall inside that 3D bbox
            filename = f"{image_idx}_{names[i]}_{gt_idxes[i]}.bin"
            filepath = database_save_path / filename
            gt_points = points[point_indices[:, i]]
            # Shift from the lidar frame into the box frame, i.e. the box center becomes the origin
            gt_points[:, :3] -= rbbox_lidar[i, :3]
            # Write to file
            with open(filepath, 'w') as f:
                gt_points.tofile(f)
            if names[i] in used_classes:
                if relative_path:
                    db_path = str(database_save_path.stem + "/" + filename)
                else:
                    db_path = str(filepath)
                db_info = {
                    "name": names[i],
                    "path": db_path,
                    "image_idx": image_idx,
                    "gt_idx": gt_idxes[i],
                    "box3d_lidar": rbbox_lidar[i],
                    "num_points_in_gt": gt_points.shape[0],
                    "difficulty": difficulty[i],
                }

                local_group_id = group_ids[i]
                # if local_group_id >= 0:
                if local_group_id not in group_dict:
                    group_dict[local_group_id] = group_counter
                    group_counter += 1
                db_info["group_id"] = group_dict[local_group_id]
                if "score" in annos:
                    db_info["score"] = annos["score"][i]
                all_db_infos[names[i]].append(db_info)
    for k, v in all_db_infos.items():
        print(f"load {len(v)} {k} database infos")

    with open(db_info_save_path, 'wb') as f:
        pickle.dump(all_db_infos, f)

From the training set, the bboxes and the points inside each box are extracted separately; one box corresponds to one file, whose name encodes idx + class_name + gt_idx. The saved data is mainly used for data augmentation; the augmentation itself lives in mmdet\datasets\kitti.py and is covered later.
The augmentation method is simple and brute-force: take the car points from other point clouds and paste them into the current cloud before training. I like it.
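As a preview, a minimal sketch of how one saved sample would be pasted back into a scene (hypothetical helper; the real logic lives in PointAugmentor.sample_all and in the KittiLiDAR code shown later):

import numpy as np

def paste_sample(scene_points, db_info, root='E:/KITTI/detection'):
    # db_info is one entry of kitti_dbinfos_train.pkl, scene_points the (N, 4) cloud of the current frame
    obj_points = np.fromfile(f"{root}/{db_info['path']}", dtype=np.float32).reshape(-1, 4)
    obj_points[:, :3] += db_info['box3d_lidar'][:3]   # the saved points are box-centered, move them back to the box position
    return np.concatenate([obj_points, scene_points], axis=0)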

2 Generating the Model Input

The model input is generated in mmdet\datasets\kitti.py. On top of the raw data, kitti.py adds data augmentation, voxel generation and anchor generation. For a detection task these are very practical, so if I later build my own voxel-based, anchor-based 3D detector I intend to use SA-SSD's input generation as a template. A brief analysis follows.

① Initialization of the KittiLiDAR class

First, a look at how the mmdetection framework produces the input data.
tools\train.py is the entry point of the whole training pipeline; the code that produces the input data is:

# get_dataset uses the parameters in cfg to build, via the datasets package, one dataset object containing all the data
# In this instance obj_from_dict deserves a closer look: it instantiates an object of a parent class according to a dict variable info. If the parent class is abstract,
# it automatically matches a child class based on info and instantiates that child class instead.
# Here it will, of course, end up instantiating the KittiLiDAR subclass of datasets
train_dataset = get_dataset(cfg.data.train)

The get_dataset function it calls lives in mmdet\datasets\utils.py:

def get_dataset(data_cfg):
    # 'ann_file' is data_root + 'ImageSets/train.txt', the split index file
    # num_dset is the number of training datasets: 1 for KITTI alone, 2 for KITTI + COCO, and so on
    if isinstance(data_cfg['ann_file'], (list, tuple)):
        ann_files = data_cfg['ann_file']
        num_dset = len(ann_files)
    else:
        ann_files = [data_cfg['ann_file']]
        num_dset = 1
    # SA-SSD does not use proposal_file; the else branch yields N Nones
    if 'proposal_file' in data_cfg.keys():
        if isinstance(data_cfg['proposal_file'], (list, tuple)):
            proposal_files = data_cfg['proposal_file']
        else:
            proposal_files = [data_cfg['proposal_file']]
    else:
        proposal_files = [None] * num_dset
    assert len(proposal_files) == num_dset
    # SA-SSD does not use images either, so 'img_prefix' is None
    # and the else branch yields N Nones
    # If RGB were needed, img_prefix=data_root + 'train2017/' (or a similar path) could be set in the cfg
    if isinstance(data_cfg['img_prefix'], (list, tuple)):
        img_prefixes = data_cfg['img_prefix']
    else:
        img_prefixes = [data_cfg['img_prefix']] * num_dset
    assert len(img_prefixes) == num_dset

    # Initialize voxel_generator from data_cfg['generator']; it voxelizes the point cloud during preprocessing
    if 'generator' in data_cfg.keys() and data_cfg['generator'] is not None:
        generator = obj_from_dict(data_cfg['generator'], voxel_generator)
    else:
        generator = None
    # Initialize point_augmentor from data_cfg['augmentor']; it provides the 3D ground truth for augmentation
    if 'augmentor' in data_cfg.keys() and data_cfg['augmentor'] is not None:
        augmentor = obj_from_dict(data_cfg['augmentor'], point_augmentor)
    else:
        augmentor = None
    # Initialize anchor3d_generator from data_cfg['anchor_generator']; it provides the 3D anchors
    if 'anchor_generator' in data_cfg.keys() and data_cfg['anchor_generator'] is not None:
        anchor_generator = obj_from_dict(data_cfg['anchor_generator'], anchor3d_generator)
    else:
        anchor_generator = None
    # Initialize bbox3d_target from data_cfg['target_encoder']
    # SA-SSD does not seem to use it, so this stays None
    if 'target_encoder' in data_cfg.keys() and data_cfg['target_encoder'] is not None:
        target_encoder = obj_from_dict(data_cfg['target_encoder'], bbox3d_target)
    else:
        target_encoder = None
    dsets = []
    for i in range(num_dset):
        # Define the dict data_info that drives how the training data is loaded
        # deepcopy copies every hyperparameter from the cfg, then a few entries are overwritten
        data_info = copy.deepcopy(data_cfg)
        data_info['ann_file'] = ann_files[i]
        data_info['proposal_file'] = proposal_files[i]
        data_info['img_prefix'] = img_prefixes[i]
        if generator is not None:
            data_info['generator'] = generator
        if anchor_generator is not None:
            data_info['anchor_generator'] = anchor_generator
        if augmentor is not None:
            data_info['augmentor'] = augmentor
        if target_encoder is not None:
            data_info['target_encoder'] = target_encoder
        # Instantiate a datasets class from data_info; the concrete dset is built according to the settings inside data_info
        # This is where the KittiLiDAR class gets instantiated
        dset = obj_from_dict(data_info, datasets)
        dsets.append(dset)
    if len(dsets) > 1:
        dset = ConcatDataset(dsets)
    else:
        dset = dsets[0]
    # After the loop, every training dataset is a datasets instance
    # ConcatDataset merges all of them into one dataset
    return dset
    # dset is built from data_info, which contains: ann_file (the split file), proposal_file (None), img_prefix (None), generator (the initialized voxel generator),
    # augmentor (the data augmentor), anchor_generator (the initialized anchor generator) and target_encoder (None)
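For orientation, the cfg.data.train dict that get_dataset consumes looks roughly like the sketch below (the field values are the ones quoted elsewhere in this post; the sampling and noise parameters are illustrative and the actual car_cfg may differ in detail):

data = dict(
    train=dict(
        type='KittiLiDAR',
        root='E:/KITTI/detection/training',
        ann_file='E:/KITTI/detection/ImageSets/train.txt',
        img_prefix=None,
        img_scale=(1242, 375),
        img_norm_cfg=dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True),
        class_names=['Car', 'Van'],
        generator=dict(
            type='VoxelGenerator',
            voxel_size=[0.05, 0.05, 0.1],
            point_cloud_range=[0, -40., -3., 70.4, 40., 1.],
            max_num_points=5,                 # illustrative
            max_voxels=20000),
        augmentor=dict(
            type='PointAugmentor',
            root_path='E:/KITTI/detection',
            info_path='E:/KITTI/detection/kitti_dbinfos_train.pkl',
            sample_classes=['Car'],
            min_num_points=5,                 # illustrative
            sample_max_num=15,
            removed_difficulties=[-1],
            gt_rot_range=[-0.78, 0.78],       # illustrative
            global_rot_range=[-0.78, 0.78],   # illustrative
            center_noise_std=[1., 1., 0.5],   # illustrative
            scale_range=[0.95, 1.05]),        # illustrative
        anchor_generator=dict(
            type='AnchorGeneratorStride',
            sizes=[1.6, 3.9, 1.56],
            anchor_strides=[0.4, 0.4, 1.0],
            anchor_offsets=[0.2, -39.8, -1.78],
            rotations=[0, 1.57]),
        anchor_area_threshold=1,
        with_label=True,
        test_mode=False))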

The function above reads the hyperparameters from the data/train section of car_cfg; since type=KittiLiDAR there, obj_from_dict ends up instantiating the KittiLiDAR(Dataset) class in mmdet\datasets\kitti.py:

class KittiLiDAR(Dataset):
    def __init__(self, root, ann_file,             # data root, split file path
                 img_prefix,
                 img_norm_cfg,                     # image normalization config
                 img_scale=(1242, 375),            # image size
                 size_divisor=32,                  # padded image sizes are multiples of 32
                 proposal_file=None,
                 flip_ratio=0.5,                   # flip probability
                 with_point=False,                 # use point clouds
                 with_mask=False,                  # use masks
                 with_label=True,                  # use labels
                 class_names = ['Car', 'Van'],     # only these classes are kept
                 augmentor=None,                   # data augmentor
                 generator=None,                   # voxel generator turning points into voxels
                 anchor_generator=None,            # anchor generator
                 anchor_area_threshold=1,          # an anchor covering more points than this threshold counts as valid
                 target_encoder=None,              # not yet sure what this is for
                 out_size_factor=2,                # ratio between the output feature map and the voxel grid
                 test_mode=False):
        self.root = root
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg
        # flip ratio
        self.flip_ratio = flip_ratio
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor
        self.class_names = class_names
        self.test_mode = test_mode
        self.with_label = with_label
        self.with_mask = with_mask
        self.with_point = with_point
        # Prefix paths for the various KITTI data
        self.img_prefix = osp.join(root, 'image_2')
        self.right_prefix = osp.join(root, 'image_3')   # not used
        self.lidar_prefix = osp.join(root, 'velodyne_reduced')  # note: the reduced point clouds ARE used here, contradicting what I said earlier
        self.calib_prefix = osp.join(root, 'calib') # calibration
        self.label_prefix = osp.join(root, 'label_2')   # labels

        with open(ann_file, 'r') as f:
            self.sample_ids = list(map(int, f.read().splitlines()))
        # Set a group flag based on the image aspect ratio: images with aspect ratio > 1 go to group 1, the rest to group 0
        if not self.test_mode:
            self._set_group_flag()

        # transforms
        self.img_transform = ImageTransform(
            size_divisor=self.size_divisor, **self.img_norm_cfg)
        # 1. rescale the image to the expected size
        # 2. normalize the image
        # 3. flip the image (if needed)
        # 4. pad the image (if needed)
        # 5. transpose to (c, h, w)
        # voxel-related members
        self.augmentor = augmentor
        self.generator = generator
        self.target_encoder = target_encoder
        self.out_size_factor = out_size_factor
        self.anchor_area_threshold = anchor_area_threshold
        # anchors: generate the anchor boxes; SSD has its own way of generating fixed boxes, and the 2D anchors in the bird's-eye view are generated as well
        if anchor_generator is not None:
            feature_map_size = self.generator.grid_size[:2] // self.out_size_factor     # feature_map_size presumably refers to the spatial extent of the xy plane, noted as [1408, 1600]
            feature_map_size = [*feature_map_size, 1][::-1]                             # [1408, 1600] => [1408, 1600, 1] => [1, 1600, 1408]
            # generate the anchors
            anchors = anchor_generator(feature_map_size)                                # feed [1, 1600, 1408] to generate the anchors;
                                                                                        # the result is a (1, 1600, 1408, 1, 2, 7) tensor,
                                                                                        # 2 being the rotation classes (0 and 90 degrees) and 7 the anchor parameters: xyzwlh plus the yaw angle
            # one anchor set is generated at every grid location
            # 3D anchors, 7 parameters each
            self.anchors = anchors.reshape([-1, 7])                                     # the 7 parameters are xyzwlh and the yaw angle
                                                                                        # self.anchors is a (1600*1408*2, 7) tensor
            self.anchors_bv = rbbox2d_to_near_bbox(
                self.anchors[:, [0, 1, 3, 4, 6]])
            # self.anchors_bv is [N, 4 (xmin, ymin, xmax, ymax)]: the axis-aligned extremes of each box in the BEV, used to represent the box
        else:
            self.anchors=None

The KittiLiDAR initializer does quite a bit of work; the individual pieces are described below.

② Data augmentation

mmdet\datasets\utils.py initializes the augmentor:

if 'augmentor' in data_cfg.keys() and data_cfg['augmentor'] is not None:
    augmentor = obj_from_dict(data_cfg['augmentor'], point_augmentor)

Combined with the augmentor hyperparameters in the cfg, its initialization is defined as:

class PointAugmentor:
    def __init__(self, root_path, \
                 info_path, \
                 sample_classes, # classes to sample
                 min_num_points, # a box is only sampled if it contains at least this many points
                 sample_max_num, \
                 removed_difficulties,
                 gt_rot_range=None, \
                 global_rot_range=None,
                 center_noise_std=None, \
                 scale_range=None):
        with open(info_path, 'rb') as f:
            db_infos_all = pickle.load(f)   # the db entries of every class
        self._samplers = list()
        for sample_class in sample_classes:
            db_infos = db_infos_all[sample_class]   # db entries of one specific class
            print(f"load {len(db_infos)} {sample_class} database infos")
            # First filter out the boxes that contain too few points
            filtered_infos = []
            for info in db_infos:
                if info["num_points_in_gt"] >= min_num_points:
                    filtered_infos.append(info)
            db_infos = filtered_infos
            # Then filter out entries whose difficulty is in removed_difficulties, e.g. -1, which belongs to none of the easy / moderate / hard levels
            new_db_infos = [
                info for info in db_infos
                if info["difficulty"] not in removed_difficulties
            ]
            print("After filter database:")
            print(f"load {len(new_db_infos)} {sample_class} database infos")
            self._samplers.append(BatchSampler(new_db_infos, sample_class))
        self.root_path = root_path
        # self._db_infos = new_db_infos
        self._sample_classes = sample_classes
        self._sample_max_num = sample_max_num
        # self._sampler = BatchSampler(self._db_infos, sample_class)
        self._global_rot_range = global_rot_range
        self._gt_rot_range = gt_rot_range
        self._center_noise_std = center_noise_std
        self._min_scale = scale_range[0]
        self._max_scale = scale_range[1]

During initialization the db is filtered twice to keep only the valid bboxes; the filtered new_db_infos and the sample_class are stored in self._samplers. BatchSampler is a thin wrapper that adds a few convenience functions for drawing samples from the data, roughly along the lines of the sketch below.
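A minimal sketch of what such a wrapper provides (illustrative only; the real BatchSampler also shuffles per epoch and tracks its position in the list):

import numpy as np

class SimpleBatchSampler:
    # draws num db_info entries of one class at a time
    def __init__(self, db_infos, name):
        self._infos = db_infos
        self._name = name

    def sample(self, num):
        idx = np.random.choice(len(self._infos), num, replace=False)
        return [self._infos[i] for i in idx]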
The augmentation itself is then performed in the KittiLiDAR(Dataset) class:

if self.augmentor is not None and self.test_mode is False:
    sampled_gt_boxes, sampled_gt_types, sampled_points = self.augmentor.sample_all(gt_bboxes, gt_types)
    # box info, box classes, and the points belonging to each sampled box
    ###########################################
    # The samples are drawn from kitti_dbinfos_train.pkl; the sampling makes sure the pasted boxes do not overlap the existing ones
    # When a class has fewer than 15 gt boxes in this frame, extra db samples are taken from the file to top it up
    ###########################################
    assert sampled_points.dtype == np.float32
    # concatenate the boxes
    gt_bboxes = np.concatenate([gt_bboxes, sampled_gt_boxes])
    # concatenate the class names
    gt_types = gt_types + sampled_gt_types
    assert len(gt_types) == len(gt_bboxes)

    # to avoid overlapping point (option)
    masks = points_in_rbbox(points, sampled_gt_boxes)
    # [num_points, num_box]: whether each point lies inside each box; points are all scene points, sampled_gt_boxes the sampled boxes
    #masks = points_op_cpu.points_in_bbox3d_np(points[:,:3], sampled_gt_boxes)

    points = points[np.logical_not(masks.any(-1))]
    # Keep only the points that fall inside none of the sampled boxes. The sampled boxes are guaranteed not to overlap the existing gt boxes; this step keeps background points from overlapping the pasted object points

    # paste sampled points to the scene
    points = np.concatenate([sampled_points, points], axis=0)
    # After this concatenation these are all the points fed into the model

    # select the interest classes
    selected = [i for i in range(len(gt_types)) if gt_types[i] in self.class_names]
    gt_bboxes = gt_bboxes[selected, :]
    gt_types = [gt_types[i] for i in range(len(gt_types)) if gt_types[i] in self.class_names]

    # force van to have same label as car
    gt_types = ['Car' if n == 'Van' else n for n in gt_types]
    gt_labels = np.array([self.class_names.index(n) + 1 for n in gt_types], dtype=np.int64)

    # With augmentation enabled, the input points and the ground-truth labels are transformed together
    # Below are four common augmentation operations:
    self.augmentor.noise_per_object_(gt_bboxes, points, num_try=100)
    gt_bboxes, points = self.augmentor.random_flip(gt_bboxes, points)
    gt_bboxes, points = self.augmentor.global_rotation(gt_bboxes, points)
    gt_bboxes, points = self.augmentor.global_scaling(gt_bboxes, points)

In fact the augmentor covers two stages. The first stage is sampling: it makes sure every frame contains at least 15 bboxes of the classes of interest; while topping up the boxes it avoids collisions between the pasted boxes and the original ones, and afterwards it also removes the background points that fall inside the pasted boxes. The second stage is the usual set of augmentations: per-object noise, flipping, rotation and scaling. The sketch below illustrates the point that inputs and labels are transformed consistently.
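A minimal sketch of a global rotation applied to both the points and the boxes (a simplified stand-in for augmentor.global_rotation; boxes assumed to be [x, y, z, w, l, h, yaw]):

import numpy as np

def global_rotation(gt_boxes, points, rot_range=(-np.pi / 4, np.pi / 4)):
    angle = np.random.uniform(*rot_range)
    c, s = np.cos(angle), np.sin(angle)
    rot = np.array([[c, -s], [s, c]], dtype=points.dtype)
    points[:, :2] = points[:, :2] @ rot.T        # rotate the point xy coordinates around the z axis
    gt_boxes[:, :2] = gt_boxes[:, :2] @ rot.T    # rotate the box centers the same way
    gt_boxes[:, 6] += angle                      # and shift the yaw angle of every box
    return gt_boxes, points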

③ Voxel generation

mmdet\datasets\utils.py initializes the generator:

# Initialize voxel_generator from data_cfg['generator']; it voxelizes the point cloud during preprocessing
if 'generator' in data_cfg.keys() and data_cfg['generator'] is not None:
    generator = obj_from_dict(data_cfg['generator'], voxel_generator)

Combined with the generator hyperparameters in the cfg, its initialization is defined as:

class VoxelGenerator:
    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels=20000):
        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
        # [0, -40, -3, 70.4, 40, 1]
        voxel_size = np.array(voxel_size, dtype=np.float32)
        grid_size = (
            point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size
        # The voxel grid dimensions come from the range and the voxel resolution; grid_size is 1408*1600*40
        grid_size = np.round(grid_size).astype(np.int64) # grid_size rounded to integers
        self._voxel_size = voxel_size
        self._point_cloud_range = point_cloud_range
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = grid_size

This initializer is quite plain, nothing fancy. But note that grid_size appears here for the first time: 1408*1600*40, the dimensions of the voxelized point cloud. This dimension, or scaled-down versions of it, shows up again and again inside the network.
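A quick check of that number with the values quoted from the cfg:

import numpy as np

pc_range = np.array([0, -40., -3., 70.4, 40., 1.])
voxel_size = np.array([0.05, 0.05, 0.1])
grid_size = np.round((pc_range[3:] - pc_range[:3]) / voxel_size).astype(np.int64)
print(grid_size)   # [1408 1600   40]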
Voxel generation then takes place in the KittiLiDAR(Dataset) class:

if isinstance(self.generator, VoxelGenerator):
    #voxels, coordinates, num_points = self.generator.generate(points)
    voxel_size = self.generator.voxel_size          # voxel resolution, the size of a single voxel [0.05, 0.05, 0.1]
    pc_range = self.generator.point_cloud_range     # valid point cloud range; only points inside it are voxelized [0, -40., -3., 70.4, 40., 1.]
    grid_size = self.generator.grid_size            # dimensions of the resulting voxel grid, 1408*1600*40

    keep = points_op_cpu.points_bound_kernel(points, pc_range[:3], pc_range[3:])
    voxels = points[keep, :]                        # the points inside the range, an N*4 tensor
    coordinates = ((voxels[:, [2, 1, 0]] - np.array(pc_range[[2,1,0]], dtype=np.float32)) / np.array(
                        voxel_size[::-1], dtype=np.float32)).astype(np.int32)    # plain division followed by truncation gives each point's voxel index; an N*3 tensor, the 3 being the (z, y, x) voxel coordinates of the point
    num_points = np.ones(len(keep)).astype(np.int32)                # one entry per point: in SA-SSD each point is treated as its own voxel
    data['voxels'] = DC(to_tensor(voxels.astype(np.float32)))
    data['coordinates'] = DC(to_tensor(coordinates))
    data['num_points'] = DC(to_tensor(num_points))
    # SA-SSD computes the voxels in a rather crude way here and does not call the standard points_to_voxel
    # anchor_mask
    # Lidar point clouds are sparse, yet the anchors cover the whole BEV area. Obviously, 3D objects can only exist where there are points.
    # Anchors over empty regions are useless. The anchor mask marks the anchors that actually cover some points.
    if self.anchor_area_threshold >= 0 and self.anchors is not None:
        # coordinates is an N*3 tensor: the voxel coordinates of every point in the cloud
        # grid_size is [1408, 1600, 40]
        # tuple(grid_size[::-1][1:]) is the tuple (1600, 1408)
        # dense_voxel_map is a 1600*1408 matrix,
        # dense_voxel_map[i][j] = a means cell (i, j) contains a voxels
        # dense_voxel_map can be viewed as a density map of the voxels, but only over the xy plane; the z dimension is ignored
        dense_voxel_map = sparse_sum_for_anchors_mask(
                        coordinates, tuple(grid_size[::-1][1:]))    # the z axis is dropped here
        dense_voxel_map = dense_voxel_map.cumsum(0)     # cumulative sum along axis 0
        dense_voxel_map = dense_voxel_map.cumsum(1)     # cumulative (not plain) sum along axis 1; the shape stays 1600*1408
        # self.anchors_bv are the anchors in the BEV view, a (1600*1408*2, 4) tensor
        # voxel_size is [0.05, 0.05, 0.1]
        # pc_range is [0, -40., -3., 70.4, 40., 1.]
        # grid_size is [1408, 1600, 40]
        # anchors_area is a vector of length 1408*1600*2; the *2 is because every location has a 0-degree and a 90-degree anchor; it holds the number of voxels covered by each anchor
        anchors_area = fused_get_anchors_area(
            dense_voxel_map, self.anchors_bv, voxel_size, pc_range, grid_size)
        # The anchors are generated purely by stride, without any knowledge of the point cloud, and cover the whole area; the mask filters out the anchors that contain no points
        anchors_mask = anchors_area > self.anchor_area_threshold
        data['anchors_mask'] = DC(to_tensor(anchors_mask.astype(np.uint8)))
    # filter gt_bbox out of range
    bv_range = self.generator.point_cloud_range[[0, 1, 3, 4]]
    mask = filter_gt_box_outside_range(gt_bboxes, bv_range)
    gt_bboxes = gt_bboxes[mask]
    gt_labels = gt_labels[mask]
    # filter out gt boxes beyond the range
else:
    NotImplementedError

Since this is my first contact with voxel networks, I was quite puzzled about what the input of a voxel-based method actually is; by definition it should be how many points fall into each small voxel. Here I roughly figured it out. voxel_generator does not actually produce the final input, but it produces two important arrays: the first is voxels = points[keep, :], which despite its name is still points, just one step away from being voxels; the second is coordinates, which records which voxel every point in voxels belongs to. Then, at the entrance of the network's forward pass in mmdet\models\necks\cmn.py, there is this call:

x = spconv.SparseConvTensor(voxel_features, coors, self.sparse_shape, batch_size)   # this is where the data truly becomes voxels

This is the input that really matches the definition of a voxel grid; its dense shape is [batch_size, channels, *sparse_shape], one feature per voxel cell.
Reference link: github.
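A minimal sketch of how the per-frame coordinates typically get a batch index prepended before the sparse tensor is built (a sketch of the idea, not SA-SSD's exact collate code; spconv 1.x style API):

import numpy as np
import torch
import spconv

def build_sparse_input(features_list, coords_list, sparse_shape):
    # features_list: per-frame (N_i, C) point features; coords_list: per-frame (N_i, 3) voxel coords in zyx order
    feats, coors = [], []
    for batch_idx, (f, c) in enumerate(zip(features_list, coords_list)):
        pad = np.full((c.shape[0], 1), batch_idx, dtype=c.dtype)   # prepend the batch index: (N_i, 4) = [batch, z, y, x]
        coors.append(np.hstack([pad, c]))
        feats.append(f)
    features = torch.from_numpy(np.concatenate(feats)).float()
    coors = torch.from_numpy(np.concatenate(coors)).int()
    return spconv.SparseConvTensor(features, coors, sparse_shape, batch_size=len(features_list))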

④ Anchor generation

Anchor-based detection is also new to me. The rough idea: at every grid location two 3D bboxes are pre-generated with a fixed 3D size, one with yaw 0 and one with yaw 90 degrees; the network then extracts features and outputs a tensor with the same layout as the anchors, interpreted as offsets of the anchor boxes, so that the 3D box loss compares gt_labels against anchors plus the predicted offsets.
My understanding: the anchors forcibly give the voxelized point cloud an extra representation in the form of 3D bboxes, so that the box regression loss has two quantities that correspond to each other; the network never has to produce bboxes from scratch, which lowers the demands placed on it. In other words, the network is handed the standard form of the answer and only has to move towards it, rather than producing the full answer directly.
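For reference, the SECOND-style residual encoding that such anchor-offset regression typically uses looks like the sketch below (a sketch only; SA-SSD's own target assignment may differ in detail):

import numpy as np

def box_encode(gt, anchors):
    # gt and anchors are both [N, 7] = (x, y, z, w, l, h, yaw); the network regresses these residuals
    xa, ya, za, wa, la, ha, ra = np.split(anchors, 7, axis=-1)
    xg, yg, zg, wg, lg, hg, rg = np.split(gt, 7, axis=-1)
    diag = np.sqrt(wa ** 2 + la ** 2)          # BEV diagonal, used to normalize the center offsets
    xt = (xg - xa) / diag
    yt = (yg - ya) / diag
    zt = (zg - za) / ha
    wt = np.log(wg / wa)
    lt = np.log(lg / la)
    ht = np.log(hg / ha)
    rt = rg - ra
    return np.concatenate([xt, yt, zt, wt, lt, ht, rt], axis=-1)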

if anchor_generator is not None:
    feature_map_size = self.generator.grid_size[:2] // self.out_size_factor
    # feature_map_size presumably refers to the spatial extent of the xy plane, noted as [1408, 1600]
    feature_map_size = [*feature_map_size, 1][::-1]                         # [1408, 1600] => [1408, 1600, 1] => [1, 1600, 1408]
    # generate the anchors
    anchors = anchor_generator(feature_map_size)    # feed [1, 1600, 1408] to generate the anchors;
                                                    # the result is a (1, 1600, 1408, 1, 2, 7) tensor,
                                                    # 2 being the rotation classes (0 and 90 degrees) and 7 the anchor parameters: xyzwlh plus the yaw angle
    # one anchor set is generated at every grid location
    # 3D anchors, 7 parameters each
    self.anchors = anchors.reshape([-1, 7])                                     # the 7 parameters are xyzwlh and the yaw angle
                                                                                # self.anchors is a (1600*1408*2, 7) tensor
    self.anchors_bv = rbbox2d_to_near_bbox(
        self.anchors[:, [0, 1, 3, 4, 6]])
    # self.anchors_bv is [N, 4 (xmin, ymin, xmax, ymax)]: the axis-aligned extremes of each box in the BEV, used to represent the box
else:
    self.anchors=None

anchor_generator lives in mmdet\core\anchor\anchor3d_generator.py:

class AnchorGeneratorStride:
    def __init__(self,
                 sizes=[1.6, 3.9, 1.56],
                 anchor_strides=[0.4, 0.4, 1.0],
                 anchor_offsets=[0.2, -39.8, -1.78],
                 rotations=[0, np.pi / 2],
                 dtype=np.float32):
        self._sizes = sizes
        self._anchor_strides = anchor_strides
        self._anchor_offsets = anchor_offsets
        self._rotations = rotations
        self._dtype = dtype

    @property
    def num_anchors_per_localization(self):
        num_rot = len(self._rotations)
        num_size = np.array(self._sizes).reshape([-1, 3]).shape[0]
        return num_rot * num_size

    def __call__(self, feature_map_size):
        return create_anchors_3d_stride(
            feature_map_size, self._sizes, self._anchor_strides,
            self._anchor_offsets, self._rotations, self._dtype)

The rest needs little comment; num_anchors_per_localization computes how many anchors sit at each location (grid cell). The core is create_anchors_3d_stride:

def create_anchors_3d_stride(feature_size,                          # [1, 1600, 1408], i.e. z, y, x
                             sizes=[1.6, 3.9, 1.56],                # size of a single anchor
                             anchor_strides=[0.4, 0.4, 0.0],        # spacing between anchors; the cfg uses [0.4, 0.4, 1.0]
                             anchor_offsets=[0.2, -39.8, -1.78],    # lower bounds of the anchor centers; generation starts here
                             rotations=[0, np.pi / 2],              # rotations: 0 and 90 degrees in radians
                             dtype=np.float32):
    """
    Args:
        feature_size: list [D, H, W](zyx)
        sizes: [N, 3] list of list or array, size of anchors, xyz

    Returns:
        anchors: [*feature_size, num_sizes, num_rots, 7] tensor.
    """
    # almost 2x faster than v1

    # strides and lower bounds
    x_stride, y_stride, z_stride = anchor_strides                   # 0.4, 0.4, 1.0
    x_offset, y_offset, z_offset = anchor_offsets                   # 0.2, -39.8, -1.78

    # index arrays
    z_centers = np.arange(feature_size[0], dtype=dtype)             # array: 0
    y_centers = np.arange(feature_size[1], dtype=dtype)             # array: 0, 1, ..., 1600-1
    x_centers = np.arange(feature_size[2], dtype=dtype)             # array: 0, 1, ..., 1408-1

    # actual z, y, x center coordinates
    z_centers = z_centers * z_stride + z_offset                     # -1.78
    y_centers = y_centers * y_stride + y_offset                     # -39.8, -39.4, ..., 599.8
    x_centers = x_centers * x_stride + x_offset                     # 0.2, 0.6, ..., 563.0

    # number of anchor sizes: N x 3 ----> 1 x 3
    sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3])       # becomes a 1*3 tensor; with N anchor sizes it would be N*3
    # number of anchor rotations: N
    rotations = np.array(rotations, dtype=dtype)
    # Build a grid: from the 1D center arrays above, generate the center coordinates of every grid cell
    rets = np.meshgrid(
        x_centers, y_centers, z_centers, rotations, indexing='ij')

    tile_shape = [1] * 5    # equivalent to [1, 1, 1, 1, 1]
    # [1, 1, 1, N, 1]; here N = 1, so it stays [1, 1, 1, 1, 1]
    tile_shape[-2] = int(sizes.shape[0])    # becomes [1, 1, 1, N, 1], N being the number of anchor sizes
    # Each rets[i] from meshgrid has shape (1408, 1600, 1, 2), indexed by (x, y, z, rotation).
    # Insert a size axis before the rotation axis, tile it N times (N anchor sizes),
    # then append a trailing axis so everything can later be concatenated along the last dimension:
    # (1408, 1600, 1, 2) -> (1408, 1600, 1, N, 2) -> (1408, 1600, 1, N, 2, 1)
    for i in range(len(rets)):
        rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape)
        rets[i] = rets[i][..., np.newaxis]  # for concat

    # sizes goes from [N, 3] ----> [1, 1, 1, N, 1, 3]
    sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3])
    tile_size_shape = list(rets[0].shape)
    tile_size_shape[3] = 1
    sizes = np.tile(sizes, tile_size_shape)
    rets.insert(3, sizes)
    ret = np.concatenate(rets, axis=-1)
    # The result is a (1, 1600, 1408, 1, 2, 7) tensor
    # Dimension 0 is the z dimension of the feature map (size 1 here)
    # Dimension 1 is the anchor's index along the y axis, 0 ~ 1600-1
    # Dimension 2 is the anchor's index along the x axis, 0 ~ 1408-1
    # Dimension 3 is the anchor size class; only one size is generated, so it is 1
    # Dimension 4 is the anchor rotation, only 0 and 90 degrees, hence 2
    # Dimension 5 holds the 7 anchor values; the 7th is the yaw angle, the first six are xyz and wlh
    return np.transpose(ret, [2, 1, 0, 3, 4, 5]) # the first five dimensions index the anchor, the last one holds its actual parameters
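A small standalone check of the shape logic on a tiny [1, 4, 3] (z, y, x) feature map, mirroring the function above:

import numpy as np

feature_size = [1, 4, 3]
sizes = np.reshape(np.array([1.6, 3.9, 1.56], dtype=np.float32), [-1, 3])   # one anchor size
rotations = np.array([0, np.pi / 2], dtype=np.float32)                      # two rotations

z = np.arange(feature_size[0], dtype=np.float32) * 1.0 - 1.78
y = np.arange(feature_size[1], dtype=np.float32) * 0.4 - 39.8
x = np.arange(feature_size[2], dtype=np.float32) * 0.4 + 0.2

rets = list(np.meshgrid(x, y, z, rotations, indexing='ij'))
tile_shape = [1, 1, 1, sizes.shape[0], 1]
for i in range(len(rets)):
    rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape)[..., np.newaxis]
tile_size_shape = list(rets[0].shape)
tile_size_shape[3] = 1
rets.insert(3, np.tile(np.reshape(sizes, [1, 1, 1, -1, 1, 3]), tile_size_shape))
ret = np.concatenate(rets, axis=-1)
print(np.transpose(ret, [2, 1, 0, 3, 4, 5]).shape)   # (1, 4, 3, 1, 2, 7)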

Going through it all once more made things a lot clearer than before; the key is to follow the cfg file closely and to start from the cfg.
As for the core of the network, the novel parts were already covered when reading the paper, so that part of the code reads fairly smoothly; next I will still write down its most essential pieces.
