While annotating the source code I relied heavily on several blog posts by another author; they helped enormously, so thanks to the original blogger.
Reference post: 简析SA-SSD在预处理训练评估的框架.
Author's blog: 小白科研笔记.
Since both the code and the amount of annotation are fairly large, I only excerpt a few pieces here that were new to me or that I found instructive.
SA-SSD shares a lot of its processing with PointPillars, PointRCNN and SECOND; the author mentions this in the repository as well, and the create_data part is almost exactly the same approach.
After reading through it, this script's job is essentially to work together with kitti.py and generate the data used for augmentation; I have not found any other purpose for it so far.
Reference post: 简析SA-SSD在预处理训练评估的框架.
Reference post: Pointpillars --creat_date篇.
create_data.py splits into three main parts:
create_kitti_info_file('E:/KITTI/detection')
create_reduced_point_cloud('E:/KITTI/detection')
create_groundtruth_database(data_path='E:/KITTI/detection', \
info_path='E:/KITTI/detection/kitti_infos_trainval.pkl', \
db_info_save_path='E:/KITTI/detection/kitti_dbinfos_trainval.pkl')
The first part, create_kitti_info_file:
# Gather every frame's base info and label info into a single file; in short,
# this is kitti.get_kitti_image_info plus num_points_in_gt.
def create_kitti_info_file(data_path,
                           save_path=None,
                           relative_path=True):
    # the ImageSets txt files hold the frame indices of the train / val / trainval / test splits
    train_img_ids = _read_imageset_file(osp.join(data_path, "ImageSets/train.txt"))
    val_img_ids = _read_imageset_file(osp.join(data_path, "ImageSets/val.txt"))
    trainval_img_ids = _read_imageset_file(osp.join(data_path, "ImageSets/trainval.txt"))
    test_img_ids = _read_imageset_file(osp.join(data_path, "ImageSets/test.txt"))
    print("Generate info. this may take several minutes.")
    if save_path is None:
        save_path = pathlib.Path(data_path)
    else:
        save_path = pathlib.Path(save_path)
    # each image_info is a dict holding 'image_idx', pointcloud_num_features (4),
    # velodyne_path, img_path, img_shape and annos.
    # It also stores the calibration parameters: the camera projections calib/P0 to calib/P3,
    # plus the LiDAR-camera extrinsics calib/R0_rect and calib/Tr_velo_to_cam.
    kitti_infos_train = kitti.get_kitti_image_info(
        data_path,
        training=True,
        velodyne=True,
        calib=True,
        image_ids=train_img_ids,
        relative_path=relative_path)
    # adds num_points_in_gt to each anno, though it does not seem to be used later
    _calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path)
    filename = save_path / 'kitti_infos_train.pkl'
    print(f"Kitti info train file is saved to {filename}")
    with open(filename, 'wb') as f:
        # serialize kitti_infos_train into the file; pickle.dump needs the file opened in binary write mode ('wb')
        pickle.dump(kitti_infos_train, f)
Its main job is to consolidate each frame's velodyne, image, label and other data into one pkl file, which makes things convenient for the later data loader, kitti.py. It mainly calls the following functions, which deserve closer annotation.
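As a quick sanity check, the generated info file can be loaded back and inspected directly. This is only an illustrative sketch using the example path from above:

import pickle

# minimal sketch: peek into the info pkl written by create_kitti_info_file
with open('E:/KITTI/detection/kitti_infos_train.pkl', 'rb') as f:
    infos = pickle.load(f)
print(len(infos))                   # number of frames in the train split
info = infos[0]
print(info['image_idx'], info['velodyne_path'], info['img_shape'])
print(info['calib/P2'].shape)       # (4, 4) after _extend_matrix
print(info['annos'].keys())         # name, bbox, dimensions, location, rotation_y, ...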
get_kitti_image_info:
def get_kitti_image_info(path,
                         training=True,
                         label_info=True,
                         velodyne=False,
                         calib=False,
                         image_ids=7481,
                         extend_matrix=True,
                         num_worker=8,
                         relative_path=True,
                         with_imageshape=True):
    # image_infos = []
    root_path = pathlib.Path(path)
    if not isinstance(image_ids, list):
        image_ids = list(range(image_ids))

    # fill one image_info per frame
    def map_func(idx):
        # frame index and number of point features
        image_info = {'image_idx': idx, 'pointcloud_num_features': 4}
        annotations = None
        # point cloud path
        if velodyne:
            image_info['velodyne_path'] = get_velodyne_path(
                idx, path, training, relative_path)
        # image path
        image_info['img_path'] = get_image_path(idx, path, training,
                                                relative_path)
        if with_imageshape:
            img_path = image_info['img_path']
            if relative_path:
                img_path = str(root_path / img_path)
            # image shape
            image_info['img_shape'] = np.array(
                io.imread(img_path).shape[:2], dtype=np.int32)
        if label_info:
            # locate the detection label file
            label_path = get_label_path(idx, path, training, relative_path)
            if relative_path:
                # prepend the root to turn the relative path into an absolute one
                label_path = str(root_path / label_path)
            # annotations is a dict holding the detection labels
            annotations = get_label_anno(label_path)
        if calib:
            calib_path = get_calib_path(
                idx, path, training, relative_path=False)
            with open(calib_path, 'r') as f:
                lines = f.readlines()
            P0 = np.array(
                [float(info) for info in lines[0].split(' ')[1:13]]).reshape(
                    [3, 4])
            P1 = np.array(
                [float(info) for info in lines[1].split(' ')[1:13]]).reshape(
                    [3, 4])
            P2 = np.array(
                [float(info) for info in lines[2].split(' ')[1:13]]).reshape(
                    [3, 4])
            P3 = np.array(
                [float(info) for info in lines[3].split(' ')[1:13]]).reshape(
                    [3, 4])
            if extend_matrix:
                P0 = _extend_matrix(P0)
                P1 = _extend_matrix(P1)
                P2 = _extend_matrix(P2)
                P3 = _extend_matrix(P3)
            # read and store the camera projection matrices
            image_info['calib/P0'] = P0
            image_info['calib/P1'] = P1
            image_info['calib/P2'] = P2
            image_info['calib/P3'] = P3
            R0_rect = np.array([
                float(info) for info in lines[4].split(' ')[1:10]
            ]).reshape([3, 3])
            if extend_matrix:
                rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
                rect_4x4[3, 3] = 1.
                rect_4x4[:3, :3] = R0_rect
            else:
                rect_4x4 = R0_rect
            # store the rectification / extrinsic matrices
            image_info['calib/R0_rect'] = rect_4x4
            Tr_velo_to_cam = np.array([
                float(info) for info in lines[5].split(' ')[1:13]
            ]).reshape([3, 4])
            Tr_imu_to_velo = np.array([
                float(info) for info in lines[6].split(' ')[1:13]
            ]).reshape([3, 4])
            if extend_matrix:
                Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
                Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo)
            image_info['calib/Tr_velo_to_cam'] = Tr_velo_to_cam
            image_info['calib/Tr_imu_to_velo'] = Tr_imu_to_velo
        if annotations is not None:
            image_info['annos'] = annotations
            # annos["difficulty"] = 0 (Easy), 1 (Moderate), 2 (Hard)
            # Easy: min. bounding box height 40 px, max. occlusion level: fully visible, max. truncation 15 %
            # Moderate: min. bounding box height 25 px, max. occlusion level: partly occluded, max. truncation 30 %
            # Hard: min. bounding box height 25 px, max. occlusion level: difficult to see, max. truncation 50 %
            add_difficulty_to_annos(image_info)  # adds a new "difficulty" field
        return image_info
Inside it, get_label_anno is called in turn:
# collect the label info and put it all into one anno dict
def get_label_anno(label_path):
    annotations = {}
    annotations.update({
        'name': [],        # object class of each label
        'truncated': [],   # truncation level, 0 to 1
        'occluded': [],    # fully visible / partly occluded / largely occluded
        'alpha': [],       # observation angle
        'bbox': [],        # 2D image box: left, top, right, bottom
        'dimensions': [],  # 3D box size: length, width, height
        'location': [],    # 3D box position: x, y, z
        'rotation_y': []   # yaw angle of the 3D box
    })
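For reference, this is how a single line of a KITTI label_2 file maps onto those fields; the real get_label_anno parses the whole file at once and may reorder the dimensions, so treat this only as a reminder of the raw format:

# hedged sketch: one line of a label_2 .txt file in the order KITTI stores it
def parse_kitti_label_line(line):
    x = line.strip().split(' ')
    return {
        'name': x[0],                               # e.g. 'Car', 'Pedestrian', 'DontCare'
        'truncated': float(x[1]),                   # 0 (not truncated) .. 1 (fully truncated)
        'occluded': int(x[2]),                      # 0/1/2/3 occlusion level
        'alpha': float(x[3]),                       # observation angle
        'bbox': [float(v) for v in x[4:8]],         # 2D box: left, top, right, bottom (pixels)
        'dimensions': [float(v) for v in x[8:11]],  # 3D size: height, width, length (file order)
        'location': [float(v) for v in x[11:14]],   # 3D center x, y, z in camera coordinates
        'rotation_y': float(x[14]),                 # yaw around the camera y axis
    }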
The second part, create_reduced_point_cloud:
# remove the points outside the camera field of view and save the reduced clouds to files
def create_reduced_point_cloud(data_path,
                               train_info_path=None,
                               val_info_path=None,
                               test_info_path=None,
                               save_path=None,
                               with_back=False):
    # read the info files generated in the first part
    if train_info_path is None:
        train_info_path = pathlib.Path(data_path) / 'kitti_infos_train.pkl'
    if val_info_path is None:
        val_info_path = pathlib.Path(data_path) / 'kitti_infos_val.pkl'
    if test_info_path is None:
        test_info_path = pathlib.Path(data_path) / 'kitti_infos_test.pkl'
    # use the calibration to keep only the points inside the camera field of view,
    # so that every remaining point can be projected onto the image
    _create_reduced_point_cloud(data_path, train_info_path, save_path)
    _create_reduced_point_cloud(data_path, val_info_path, save_path)
    _create_reduced_point_cloud(data_path, test_info_path, save_path)
    if with_back:
        _create_reduced_point_cloud(
            data_path, train_info_path, save_path, back=True)
        _create_reduced_point_cloud(
            data_path, val_info_path, save_path, back=True)
        _create_reduced_point_cloud(
            data_path, test_info_path, save_path, back=True)
This part is fairly simple, and at first glance it does not even seem to be used (that impression turns out to be wrong, as noted later). It first reads the pkl files produced in part one, then uses the point cloud, the image and the calibration to drop the points outside the camera field of view (i.e. outside the image), and saves the result to files whose names carry the _reduced suffix.
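The geometry behind the reduction is worth a quick sketch. The actual work is done by remove_outside_points and the kitti utilities; the helper below is only an illustration of the idea, assuming the 4x4 extended calibration matrices stored in the info pkl:

import numpy as np

def keep_points_in_image(points, Tr_velo_to_cam, R0_rect, P2, img_shape):
    """Illustrative camera-FOV filter, not the actual SA-SSD implementation."""
    h, w = img_shape
    xyz1 = np.hstack([points[:, :3], np.ones((points.shape[0], 1), dtype=points.dtype)])
    cam = xyz1 @ (R0_rect @ Tr_velo_to_cam).T          # LiDAR -> rectified camera frame
    in_front = cam[:, 2] > 0                            # drop points behind the camera
    img = cam @ P2.T                                     # project onto the image plane
    u, v = img[:, 0] / img[:, 2], img[:, 1] / img[:, 2]
    mask = in_front & (u >= 0) & (u < w) & (v >= 0) & (v < h)
    return points[mask]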
The third part, create_groundtruth_database:
# Build the ground-truth database. "Ground truth" here means data that only contains the points of
# specific classes (cars, pedestrians and so on) together with their base info and label info;
# it is used for data augmentation on the KITTI data.
def create_groundtruth_database(data_path,
                                info_path=None,
                                used_classes=None,
                                database_save_path=None,
                                db_info_save_path=None,
                                relative_path=True,
                                lidar_only=False,
                                bev_only=False,
                                coors_range=None):
    root_path = pathlib.Path(data_path)
    # open the pkl file generated in the first part (train split)
    if info_path is None:
        info_path = root_path / 'kitti_infos_train.pkl'
    # directory where the per-box point files will be stored
    if database_save_path is None:
        database_save_path = root_path / 'gt_database'
    else:
        database_save_path = pathlib.Path(database_save_path)
    # path of the db_info pkl, similar in spirit to the infos from part one
    if db_info_save_path is None:
        db_info_save_path = root_path / "kitti_dbinfos_train.pkl"
    database_save_path.mkdir(parents=True, exist_ok=True)
    with open(info_path, 'rb') as f:
        kitti_infos = pickle.load(f)
    all_db_infos = {}
    # collect all detection classes
    if used_classes is None:
        used_classes = list(kitti.get_classes())
        # class_to_label = {
        #     'Car': 0,
        #     'Pedestrian': 1,
        #     'Cyclist': 2,
        #     'Van': 3,
        #     'Person_sitting': 4,
        #     'Truck': 5,
        #     'Tram': 6,
        #     'Misc': 7,
        #     'DontCare': -1,
        # }
        used_classes.pop(used_classes.index('DontCare'))  # drop the DontCare class
    for name in used_classes:  # reserve a slot for every valid class
        all_db_infos[name] = []
    group_counter = 0
    for info in prog_bar(kitti_infos):
        # note: the reduced clouds from part two are not used here; the full raw cloud is loaded
        velodyne_path = info['velodyne_path']
        if relative_path:
            velodyne_path = str(root_path / velodyne_path)
        num_features = 4
        if 'pointcloud_num_features' in info:
            num_features = info['pointcloud_num_features']
        # load the N x 4 point cloud plus the calibration data
        points = np.fromfile(
            velodyne_path, dtype=np.float32, count=-1).reshape([-1, num_features])
        image_idx = info["image_idx"]
        rect = info['calib/R0_rect']
        P2 = info['calib/P2']
        Trv2c = info['calib/Tr_velo_to_cam']
        if not lidar_only:  # lidar_only is False, so this branch runs
            points = remove_outside_points(points, rect, Trv2c, P2,
                                           info["img_shape"])
        annos = info["annos"]
        names = annos["name"]
        bboxes = annos["bbox"]
        difficulty = annos["difficulty"]
        gt_idxes = annos["index"]
        num_obj = np.sum(annos["index"] >= 0)
        # recover the camera-frame label boxes from the annos stored in the info
        rbbox_cam = kitti.anno_to_rbboxes(annos)[:num_obj]
        # convert the camera-frame label boxes into 3D boxes in LiDAR coordinates
        rbbox_lidar = box_camera_to_lidar(rbbox_cam, rect, Trv2c)
        if bev_only:  # set z and h to limits
            # pin the z coordinate to a fixed value and clamp the height to the range above it,
            # i.e. cut off the part of the box below the given z
            assert coors_range is not None
            rbbox_lidar[:, 2] = coors_range[2]
            rbbox_lidar[:, 5] = coors_range[5] - coors_range[2]
        group_dict = {}
        group_ids = np.full([bboxes.shape[0]], -1, dtype=np.int64)
        if "group_ids" in annos:
            group_ids = annos["group_ids"]
        else:
            group_ids = np.arange(bboxes.shape[0], dtype=np.int64)
        # point_indices is N x m for N points and m boxes: 1 if the point lies inside the box, else 0
        point_indices = points_in_rbbox(points, rbbox_lidar)
        for i in range(num_obj):
            # file name for the points of this single box: one file per box of every class in every frame
            filename = f"{image_idx}_{names[i]}_{gt_idxes[i]}.bin"
            filepath = database_save_path / filename
            gt_points = points[point_indices[:, i]]
            # shift into the box's own frame: the box center becomes the origin
            gt_points[:, :3] -= rbbox_lidar[i, :3]
            # write the points to the file
            with open(filepath, 'w') as f:
                gt_points.tofile(f)
            if names[i] in used_classes:
                if relative_path:
                    db_path = str(database_save_path.stem + "/" + filename)
                else:
                    db_path = str(filepath)
                db_info = {
                    "name": names[i],
                    "path": db_path,
                    "image_idx": image_idx,
                    "gt_idx": gt_idxes[i],
                    "box3d_lidar": rbbox_lidar[i],
                    "num_points_in_gt": gt_points.shape[0],
                    "difficulty": difficulty[i],
                }
                local_group_id = group_ids[i]
                # if local_group_id >= 0:
                if local_group_id not in group_dict:
                    group_dict[local_group_id] = group_counter
                    group_counter += 1
                db_info["group_id"] = group_dict[local_group_id]
                if "score" in annos:
                    db_info["score"] = annos["score"][i]
                all_db_infos[names[i]].append(db_info)
    for k, v in all_db_infos.items():
        print(f"load {len(v)} {k} database infos")
    with open(db_info_save_path, 'wb') as f:
        pickle.dump(all_db_infos, f)
From the training set, the bboxes and the points inside each box are extracted separately, one file per box, with the file name encoding image_idx + class_name + box index. The saved data exists mainly for data augmentation; the augmentation itself happens in mmdet\datasets\kitti.py and is covered later.
This augmentation method is simple and brute force: take the car points from other point clouds and paste them into the current cloud before training. I like it. A minimal sketch of the copy-paste idea follows.
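This is only an illustrative sketch, assuming a db_info entry saved as above; the real sampling with its collision checks lives in the augmentor discussed below:

import numpy as np

# hedged sketch of GT-database copy-paste: load the points saved for one
# ground-truth box and drop them back into a scene at the box position
def paste_gt_sample(scene_points, db_info, root_path):
    box = db_info['box3d_lidar']                       # box center (x, y, z), size and yaw in the LiDAR frame
    obj = np.fromfile(f"{root_path}/{db_info['path']}",
                      dtype=np.float32).reshape(-1, 4)  # points stored relative to the box center
    obj[:, :3] += box[:3]                               # move them to the box location
    return np.concatenate([obj, scene_points], axis=0), box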
The model's input data is produced in mmdet\datasets\kitti.py. Besides the raw data, kitti.py also adds data augmentation, voxel generation and anchor generation. For a detection task these features are extremely practical, so if I ever build my own voxel-based, anchor-based 3D detector I plan to use SA-SSD's input pipeline as a template. A brief analysis follows.
First, the call chain that produces the input data in the mmdetection framework.
tools\train.py is the entry point of the whole training process; the code that creates the input data is:
# get_dataset builds one dataset holding all the data, driven by the parameters in cfg
# and the classes in the datasets package.
# obj_from_dict deserves a closer look: it instantiates an object of a "parent" namespace from a
# dict-style config. If the parent is a package of classes, it matches the subclass named by the
# config and instantiates that subclass.
# Here it will, of course, construct the KittiLiDAR subclass of the datasets package.
train_dataset = get_dataset(cfg.data.train)
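For readers unfamiliar with mmcv, obj_from_dict behaves roughly like the sketch below (the real implementation has extra checks): the cfg dict's 'type' field picks the class and the remaining fields become constructor arguments.

# hedged sketch of obj_from_dict, not the actual mmcv code
def obj_from_dict_sketch(info, parent_module):
    args = dict(info)
    obj_type = args.pop('type')           # e.g. 'KittiLiDAR', 'VoxelGenerator'
    cls = getattr(parent_module, obj_type)
    return cls(**args)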
The get_dataset function itself lives in mmdet\datasets\utils.py:
def get_dataset(data_cfg):
    # 'ann_file' is the index file, data_root + 'ImageSets/train.txt'
    # num_dset is the number of training datasets, e.g. 1 for KITTI only, 2 for KITTI + COCO, ...
    if isinstance(data_cfg['ann_file'], (list, tuple)):
        ann_files = data_cfg['ann_file']
        num_dset = len(ann_files)
    else:
        ann_files = [data_cfg['ann_file']]
        num_dset = 1
    # SA-SSD does not use proposal files, so the else branch produces N times None
    if 'proposal_file' in data_cfg.keys():
        if isinstance(data_cfg['proposal_file'], (list, tuple)):
            proposal_files = data_cfg['proposal_file']
        else:
            proposal_files = [data_cfg['proposal_file']]
    else:
        proposal_files = [None] * num_dset
    assert len(proposal_files) == num_dset
    # SA-SSD does not need images either, 'img_prefix' = None,
    # so the else branch again produces N times None.
    # If RGB were needed, something like img_prefix=data_root + 'train2017/' could be set in the cfg
    if isinstance(data_cfg['img_prefix'], (list, tuple)):
        img_prefixes = data_cfg['img_prefix']
    else:
        img_prefixes = [data_cfg['img_prefix']] * num_dset
    assert len(img_prefixes) == num_dset
    # initialize the voxel_generator from data_cfg['generator']; it voxelizes the point cloud during preprocessing
    if 'generator' in data_cfg.keys() and data_cfg['generator'] is not None:
        generator = obj_from_dict(data_cfg['generator'], voxel_generator)
    else:
        generator = None
    # initialize the point_augmentor from data_cfg['augmentor']; it supplies 3D ground-truth samples
    if 'augmentor' in data_cfg.keys() and data_cfg['augmentor'] is not None:
        augmentor = obj_from_dict(data_cfg['augmentor'], point_augmentor)
    else:
        augmentor = None
    # initialize the anchor3d_generator from data_cfg['anchor_generator']; it supplies the 3D anchors
    if 'anchor_generator' in data_cfg.keys() and data_cfg['anchor_generator'] is not None:
        anchor_generator = obj_from_dict(data_cfg['anchor_generator'], anchor3d_generator)
    else:
        anchor_generator = None
    # initialize the bbox3d_target encoder from data_cfg['target_encoder'];
    # SA-SSD does not seem to use it, so this stays None
    if 'target_encoder' in data_cfg.keys() and data_cfg['target_encoder'] is not None:
        target_encoder = obj_from_dict(data_cfg['target_encoder'], bbox3d_target)
    else:
        target_encoder = None
    dsets = []
    for i in range(num_dset):
        # data_info is a dict that steers how the training data is assembled:
        # deepcopy all hyperparameters from the cfg, then overwrite a few of them
        data_info = copy.deepcopy(data_cfg)
        data_info['ann_file'] = ann_files[i]
        data_info['proposal_file'] = proposal_files[i]
        data_info['img_prefix'] = img_prefixes[i]
        if generator is not None:
            data_info['generator'] = generator
        if anchor_generator is not None:
            data_info['anchor_generator'] = anchor_generator
        if augmentor is not None:
            data_info['augmentor'] = augmentor
        if target_encoder is not None:
            data_info['target_encoder'] = target_encoder
        # instantiate a dataset from data_info; this is where the KittiLiDAR class gets constructed
        dset = obj_from_dict(data_info, datasets)
        dsets.append(dset)
    if len(dsets) > 1:
        dset = ConcatDataset(dsets)
    else:
        dset = dsets[0]
    # each training dataset built above is a datasets-style class;
    # ConcatDataset merges them all into a single dataset
    return dset

# dset is built from data_info, which contains: ann_file (the split file), proposal_file (None),
# img_prefix (None), generator (the initialized voxel generator), augmentor (the data augmentor),
# anchor_generator (the initialized anchor generator) and target_encoder (None)
The function above reads the hyperparameters from car_cfg/data/train; among them type=KittiLiDAR, so obj_from_dict ends up constructing the KittiLiDAR(Dataset) class in mmdet\datasets\kitti.py:
class KittiLiDAR(Dataset):
    def __init__(self, root, ann_file,  # data root and split-file path
                 img_prefix,
                 img_norm_cfg,  # image normalization parameters
                 img_scale=(1242, 375),  # image size
                 size_divisor=32,  # after padding, the image size is a multiple of 32
                 proposal_file=None,
                 flip_ratio=0.5,  # flip probability
                 with_point=False,  # use the point cloud
                 with_mask=False,  # use masks
                 with_label=True,  # use labels
                 class_names=['Car', 'Van'],  # only these label classes are kept
                 augmentor=None,  # data augmentor
                 generator=None,  # voxel generator that turns points into voxels
                 anchor_generator=None,  # anchor generator
                 anchor_area_threshold=1,  # an anchor covering more points than this is considered valid
                 target_encoder=None,  # purpose still unclear to me
                 out_size_factor=2,  # ratio between the output anchor grid and the voxel grid
                 test_mode=False):
        self.root = root
        self.img_scales = img_scale if isinstance(img_scale,
                                                  list) else [img_scale]
        assert mmcv.is_list_of(self.img_scales, tuple)
        # normalization configs
        self.img_norm_cfg = img_norm_cfg
        # flip ratio
        self.flip_ratio = flip_ratio
        # size_divisor (used for FPN)
        self.size_divisor = size_divisor
        self.class_names = class_names
        self.test_mode = test_mode
        self.with_label = with_label
        self.with_mask = with_mask
        self.with_point = with_point
        # prefixes of the various KITTI data directories
        self.img_prefix = osp.join(root, 'image_2')
        self.right_prefix = osp.join(root, 'image_3')  # not used
        self.lidar_prefix = osp.join(root, 'velodyne_reduced')  # note: the reduced clouds ARE used here, contrary to my earlier assumption
        self.calib_prefix = osp.join(root, 'calib')  # calibration
        self.label_prefix = osp.join(root, 'label_2')  # labels
        with open(ann_file, 'r') as f:
            self.sample_ids = list(map(int, f.read().splitlines()))
        # set a group flag based on the image aspect ratio:
        # images with aspect ratio greater than 1 go to group 1, the rest to group 0
        if not self.test_mode:
            self._set_group_flag()
        # transforms
        self.img_transform = ImageTransform(
            size_divisor=self.size_divisor, **self.img_norm_cfg)
        # 1. rescale the image to the expected size
        # 2. normalize the image
        # 3. flip the image (if needed)
        # 4. pad the image (if needed)
        # 5. transpose to (c, h, w)
        # voxel-related members
        self.augmentor = augmentor
        self.generator = generator
        self.target_encoder = target_encoder
        self.out_size_factor = out_size_factor
        self.anchor_area_threshold = anchor_area_threshold
        # anchors: generate the anchor boxes, much like SSD's prior boxes, plus their 2D BEV counterparts
        if anchor_generator is not None:
            feature_map_size = self.generator.grid_size[:2] // self.out_size_factor  # grid_size[:2] is [1408, 1600]; divided by out_size_factor this is the BEV feature-map size, with the 0.4 m anchor stride [176, 200]
            feature_map_size = [*feature_map_size, 1][::-1]  # [176, 200] => [176, 200, 1] => [1, 200, 176]
            # generate the anchors
            anchors = anchor_generator(feature_map_size)  # feed [1, 200, 176] and get the anchors back
            # the result is a (1, 200, 176, 1, 2, 7) tensor:
            # 2 is the number of rotation classes (0 and 90 degrees), 7 are the anchor parameters,
            # xyz, the box size and the yaw angle;
            # one pair of anchors is generated at every BEV grid cell
            self.anchors = anchors.reshape([-1, 7])  # 7 parameters: xyz, box size and yaw
            # self.anchors is a (200*176*2, 7) tensor
            self.anchors_bv = rbbox2d_to_near_bbox(
                self.anchors[:, [0, 1, 3, 4, 6]])
            # self.anchors_bv is [N, 4] as (xmin, ymin, xmax, ymax), the BEV extremes that stand in for each box
        else:
            self.anchors = None
The KittiLiDAR initializer does quite a bit of work; the individual features are described below. (One correction along the way: the [1408, 1600] feature-map size quoted in the reference blog skips the division by out_size_factor; with the 0.4 m anchor stride from the cfg the anchors live on a 176 x 200 BEV grid, and the shapes in the code comments here are written with that corrected size.)
In mmdet\datasets\utils.py the augmentor is initialized:

if 'augmentor' in data_cfg.keys() and data_cfg['augmentor'] is not None:
    augmentor = obj_from_dict(data_cfg['augmentor'], point_augmentor)
Combined with the augmentor hyperparameters in the cfg, the augmentor is defined as:
class PointAugmentor:
    def __init__(self, root_path,
                 info_path,
                 sample_classes,        # classes that get sampled
                 min_num_points,        # a box is only sampled if it contains at least this many points
                 sample_max_num,
                 removed_difficulties,
                 gt_rot_range=None,
                 global_rot_range=None,
                 center_noise_std=None,
                 scale_range=None):
        with open(info_path, 'rb') as f:
            db_infos_all = pickle.load(f)  # load the db infos of all classes
        self._samplers = list()
        for sample_class in sample_classes:
            db_infos = db_infos_all[sample_class]  # take the db infos of one class
            print(f"load {len(db_infos)} {sample_class} database infos")
            # first filter out the boxes that contain too few points
            filtered_infos = []
            for info in db_infos:
                if info["num_points_in_gt"] >= min_num_points:
                    filtered_infos.append(info)
            db_infos = filtered_infos
            # then filter out entries whose difficulty is -1,
            # i.e. boxes that fall into none of the easy / moderate / hard levels
            new_db_infos = [
                info for info in db_infos
                if info["difficulty"] not in removed_difficulties
            ]
            print("After filter database:")
            print(f"load {len(new_db_infos)} {sample_class} database infos")
            self._samplers.append(BatchSampler(new_db_infos, sample_class))
        self.root_path = root_path
        # self._db_infos = new_db_infos
        self._sample_classes = sample_classes
        self._sample_max_num = sample_max_num
        # self._sampler = BatchSampler(self._db_infos, sample_class)
        self._global_rot_range = global_rot_range
        self._gt_rot_range = gt_rot_range
        self._center_noise_std = center_noise_std
        self._min_scale = scale_range[0]
        self._max_scale = scale_range[1]
During initialization the db is filtered twice, keeping only the valid bboxes; the surviving new_db_infos together with their sample_class go into self._samplers. BatchSampler is just a thin wrapper that adds a few convenience methods for drawing samples from the data, roughly as sketched below.
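A hedged sketch of what such a sampler wrapper provides (the class name here is made up; the real BatchSampler lives in the SA-SSD/SECOND code):

import numpy as np

# hedged sketch: shuffled, epoch-style sampling of db_infos without replacement
class SimpleGTSampler:
    def __init__(self, db_infos):
        self._infos = db_infos
        self._order = np.random.permutation(len(db_infos))
        self._pos = 0

    def sample(self, num):
        if self._pos + num > len(self._infos):   # reshuffle once an "epoch" is used up
            self._order = np.random.permutation(len(self._infos))
            self._pos = 0
        idx = self._order[self._pos:self._pos + num]
        self._pos += num
        return [self._infos[i] for i in idx]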
The augmentation itself then happens inside the KittiLiDAR(Dataset) class:
if self.augmentor is not None and self.test_mode is False:
    sampled_gt_boxes, sampled_gt_types, sampled_points = self.augmentor.sample_all(gt_bboxes, gt_types)
    # returns the sampled box parameters, their classes and the corresponding points
    ###########################################
    # sampling draws from kitti_dbinfos_train.pkl and makes sure the pasted samples
    # do not overlap the existing boxes or points;
    # whenever a class has fewer than 15 gt boxes in this frame, extra db entries are pulled in to top it up
    ###########################################
    assert sampled_points.dtype == np.float32
    # merge the boxes
    gt_bboxes = np.concatenate([gt_bboxes, sampled_gt_boxes])
    # merge the class names
    gt_types = gt_types + sampled_gt_types
    assert len(gt_types) == len(gt_bboxes)
    # to avoid overlapping point (option)
    masks = points_in_rbbox(points, sampled_gt_boxes)
    # masks is [num_points, num_box]: whether each original point lies inside one of the sampled boxes
    # masks = points_op_cpu.points_in_bbox3d_np(points[:,:3], sampled_gt_boxes)
    points = points[np.logical_not(masks.any(-1))]
    # keep only the points that fall inside none of the sampled boxes: the sampled boxes are already
    # guaranteed not to overlap other gt boxes, and this step stops background points from poking
    # into the pasted objects
    # paste sampled points to the scene
    points = np.concatenate([sampled_points, points], axis=0)
    # after this concatenation we have the full point set that goes into the model
    # select the interest classes
    selected = [i for i in range(len(gt_types)) if gt_types[i] in self.class_names]
    gt_bboxes = gt_bboxes[selected, :]
    gt_types = [gt_types[i] for i in range(len(gt_types)) if gt_types[i] in self.class_names]
    # force van to have same label as car
    gt_types = ['Car' if n == 'Van' else n for n in gt_types]
    gt_labels = np.array([self.class_names.index(n) + 1 for n in gt_types], dtype=np.int64)
    # during augmentation the input points and the ground-truth labels are transformed together;
    # below are the four usual augmentation steps:
    self.augmentor.noise_per_object_(gt_bboxes, points, num_try=100)
    gt_bboxes, points = self.augmentor.random_flip(gt_bboxes, points)
    gt_bboxes, points = self.augmentor.global_rotation(gt_bboxes, points)
    gt_bboxes, points = self.augmentor.global_scaling(gt_bboxes, points)
So the augmentor really has two stages. The first is sampling: it tops up every frame so that there are at least 15 boxes of the classes of interest, while avoiding collisions between the pasted boxes and the original ones and, afterwards, between the pasted points and the non-object points. The second stage is the usual set of augmentations: per-object noise, flipping, global rotation and global scaling; a rough sketch of the global transforms follows.
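This is a generic sketch of the standard recipe, not the exact SA-SSD implementation; the point is that boxes and points must receive the same transform:

import numpy as np

def global_rotation(gt_boxes, points, rot_range=(-np.pi / 4, np.pi / 4)):
    angle = np.random.uniform(*rot_range)
    c, s = np.cos(angle), np.sin(angle)
    rot = np.array([[c, -s], [s, c]], dtype=points.dtype)
    points[:, :2] = points[:, :2] @ rot.T    # rotate the point xy coordinates around the z axis
    gt_boxes[:, :2] = gt_boxes[:, :2] @ rot.T  # rotate the box centers the same way
    gt_boxes[:, 6] += angle                    # and shift the yaw angles
    return gt_boxes, points

def global_scaling(gt_boxes, points, scale_range=(0.95, 1.05)):
    scale = np.random.uniform(*scale_range)
    points[:, :3] *= scale                     # scale the coordinates, keep the intensity
    gt_boxes[:, :6] *= scale                   # scale the centers and sizes, keep the yaw
    return gt_boxes, points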
Likewise, mmdet\datasets\utils.py initializes the generator:

# initialize the voxel_generator from data_cfg['generator']; it voxelizes the point cloud during preprocessing
if 'generator' in data_cfg.keys() and data_cfg['generator'] is not None:
    generator = obj_from_dict(data_cfg['generator'], voxel_generator)
Combined with the generator hyperparameters in the cfg, the generator is defined as:
class VoxelGenerator:
    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels=20000):
        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
        # [0, -40, -3, 70.4, 40, 1]
        voxel_size = np.array(voxel_size, dtype=np.float32)
        grid_size = (
            point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size
        # the point-cloud range divided by the voxel resolution gives the voxel grid dimensions, 1408 x 1600 x 40
        grid_size = np.round(grid_size).astype(np.int64)  # round to the nearest integer
        self._voxel_size = voxel_size
        self._point_cloud_range = point_cloud_range
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = grid_size
This initializer is straightforward, nothing fancy. But note that grid_size appears here for the first time: 1408 x 1600 x 40, the dimensions of the voxelized point cloud. This shape, and scaled-down versions of it, comes up again and again inside the network; the arithmetic is spelled out below.
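The computation with the cfg values quoted above:

import numpy as np

point_cloud_range = np.array([0, -40., -3., 70.4, 40., 1.], dtype=np.float32)
voxel_size = np.array([0.05, 0.05, 0.1], dtype=np.float32)
grid_size = np.round((point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size).astype(np.int64)
print(grid_size)   # [1408 1600 40]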
Voxel generation then happens inside the KittiLiDAR(Dataset) class:
if isinstance(self.generator, VoxelGenerator):
    # voxels, coordinates, num_points = self.generator.generate(points)
    voxel_size = self.generator.voxel_size  # voxel resolution, the size of a single voxel [0.05, 0.05, 0.1]
    pc_range = self.generator.point_cloud_range  # valid point range; only points inside [0, -40., -3., 70.4, 40., 1.] are voxelized
    grid_size = self.generator.grid_size  # dimensions of the resulting voxel grid, 1408 x 1600 x 40
    keep = points_op_cpu.points_bound_kernel(points, pc_range[:3], pc_range[3:])
    voxels = points[keep, :]  # keep the points inside the range; an N x 4 tensor
    coordinates = ((voxels[:, [2, 1, 0]] - np.array(pc_range[[2, 1, 0]], dtype=np.float32)) / np.array(
        voxel_size[::-1], dtype=np.float32)).astype(np.int32)  # divide and truncate to get each point's voxel index; an N x 3 tensor of (z, y, x) indices
    num_points = np.ones(len(keep)).astype(np.int32)  # each "voxel" here holds a single point, so the per-voxel point count is just ones
    data['voxels'] = DC(to_tensor(voxels.astype(np.float32)))
    data['coordinates'] = DC(to_tensor(coordinates))
    data['num_points'] = DC(to_tensor(num_points))
    # SA-SSD computes the voxels in this rather bare-bones way instead of calling the usual points_to_voxel
    # anchor_mask
    # LiDAR point clouds are sparse, while the anchors cover the whole BEV area. A 3D object can only
    # exist where there are points, so anchors sitting on empty regions are useless;
    # the anchor mask marks the anchors that actually cover some points.
    if self.anchor_area_threshold >= 0 and self.anchors is not None:
        # coordinates is an N x 3 tensor: the voxel index of every kept point
        # grid_size is [1408, 1600, 40]
        # tuple(grid_size[::-1][1:]) is the (1600, 1408) tuple
        # dense_voxel_map is a 1600 x 1408 matrix:
        # dense_voxel_map[i][j] = a means cell (i, j) contains a occupied voxels,
        # so dense_voxel_map acts as a density map of the voxels over the xy plane only;
        # the z dimension is collapsed
        dense_voxel_map = sparse_sum_for_anchors_mask(
            coordinates, tuple(grid_size[::-1][1:]))  # the z axis is dropped here
        dense_voxel_map = dense_voxel_map.cumsum(0)  # cumulative sum along one axis
        dense_voxel_map = dense_voxel_map.cumsum(1)  # then along the other; the shape stays 1600 x 1408
        # self.anchors_bv are the BEV boxes of the anchors, a (200*176*2, 4) tensor
        # voxel_size is [0.05, 0.05, 0.1]
        # pc_range is [0, -40., -3., 70.4, 40., 1.]
        # grid_size is [1408, 1600, 40]
        # anchors_area has one entry per anchor (two anchors per BEV cell, one at 0 and one at 90 degrees),
        # counting the occupied voxels under that anchor's footprint
        anchors_area = fused_get_anchors_area(
            dense_voxel_map, self.anchors_bv, voxel_size, pc_range, grid_size)
        # the anchors themselves are generated purely from the stride, without ever looking at the points,
        # and cover the whole area; the mask is what filters out the anchors that contain no points
        anchors_mask = anchors_area > self.anchor_area_threshold
        data['anchors_mask'] = DC(to_tensor(anchors_mask.astype(np.uint8)))
    # filter gt_bbox out of range
    bv_range = self.generator.point_cloud_range[[0, 1, 3, 4]]
    mask = filter_gt_box_outside_range(gt_bboxes, bv_range)
    gt_bboxes = gt_bboxes[mask]
    gt_labels = gt_labels[mask]
    # drop ground-truth boxes that fall outside the range
else:
    NotImplementedError
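The two cumsum calls turn dense_voxel_map into a 2D integral image (summed-area table), which is what lets fused_get_anchors_area count the occupied voxels under every anchor in constant time. A hedged sketch of that lookup, assuming the anchor's BEV footprint has already been converted to grid indices (x0, y0, x1, y1); the helper name is illustrative, not the actual fused_get_anchors_area:

# hedged sketch of the integral-image lookup behind the anchor mask:
# after the two cumsums, dense_voxel_map[y, x] is the number of occupied voxels
# in the rectangle [0..y, 0..x], so any BEV rectangle needs only four lookups
def count_in_bev_rect(dense_voxel_map, x0, y0, x1, y1):
    total = dense_voxel_map[y1, x1]
    if y0 > 0:
        total -= dense_voxel_map[y0 - 1, x1]
    if x0 > 0:
        total -= dense_voxel_map[y1, x0 - 1]
    if y0 > 0 and x0 > 0:
        total += dense_voxel_map[y0 - 1, x0 - 1]
    return total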
Since this is my first contact with voxel networks, I was confused about what the input to a voxel-based method actually is; by definition it should be how many points fall into each small voxel. Here it becomes roughly clear: the voxel_generator still does not produce the final input, but it yields two important pieces of data. The first is voxels = points[keep, :], which despite the name is still just points, only one step away from the final form; the second is coordinates, which records which voxel each of those points belongs to. Then, at the entry of the network's forward pass in mmdet\models\necks\cmn.py, there is one call:
x = spconv.SparseConvTensor(voxel_features, coors, self.sparse_shape, batch_size)  # this is where the data truly becomes voxels
Only here does the input match the textbook definition of a voxel grid: conceptually a [batch_size, channels, sparse_shape] tensor that carries a feature for every occupied voxel.
Reference link: github.
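A hedged sketch of the collation step implied by that call: each sample's coordinates array of shape [N_i, 3] (z, y, x) gets a leading batch-index column, so the whole batch can be stacked into one [sum(N_i), 4] tensor that SparseConvTensor accepts as indices:

import numpy as np

# hedged sketch of the batch-index padding done at collate time
def collate_coordinates(coordinates_list):
    padded = []
    for batch_idx, coors in enumerate(coordinates_list):
        pad = np.full((coors.shape[0], 1), batch_idx, dtype=coors.dtype)
        padded.append(np.hstack([pad, coors]))   # [N_i, 4] as (batch, z, y, x)
    return np.concatenate(padded, axis=0)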
This is also my first time looking at anchor-based detection. The broad idea: at every BEV location two 3D bboxes with a fixed 3D size are pre-generated, one with yaw 0 and one with yaw 90 degrees; after feature extraction the network outputs a tensor with the same layout as the anchors, interpreted as offsets from those pre-selected boxes. When computing the 3D box loss, the two things being compared are therefore gt_labels and anchors + regressed offsets.
My own understanding: the anchors force the voxelized point cloud into an extra representation shaped like 3D bboxes, so that the box regression loss has two directly comparable quantities. The network no longer has to produce bboxes from scratch, which lowers the demand on it; in other words, the anchors hand the network a template of the answer and let it nudge towards the answer instead of producing it outright. The standard residual encoding is sketched below.
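For completeness, the usual SECOND-style residual encoding that makes "anchor plus regression output" comparable with the ground truth looks roughly like this (a sketch of the standard formulation, not necessarily the exact code in SA-SSD):

import numpy as np

# hedged sketch of the standard anchor residual encoding: the network regresses
# these deltas, and decoding adds them back onto the fixed anchors
def encode_box(gt, anchor):
    xg, yg, zg, wg, lg, hg, rg = gt
    xa, ya, za, wa, la, ha, ra = anchor
    diag = np.sqrt(wa ** 2 + la ** 2)        # normalize the xy offsets by the BEV diagonal
    return np.array([
        (xg - xa) / diag, (yg - ya) / diag, (zg - za) / ha,
        np.log(wg / wa), np.log(lg / la), np.log(hg / ha),
        rg - ra,
    ])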
if anchor_generator is not None:
    feature_map_size = self.generator.grid_size[:2] // self.out_size_factor
    # feature_map_size is the xy extent of the BEV feature map, [176, 200]
    feature_map_size = [*feature_map_size, 1][::-1]  # [176, 200] => [176, 200, 1] => [1, 200, 176]
    # generate the anchors
    anchors = anchor_generator(feature_map_size)  # feed [1, 200, 176] and get the anchors back
    # the result is a (1, 200, 176, 1, 2, 7) tensor:
    # 2 is the number of rotation classes (0 and 90 degrees), 7 are the anchor parameters,
    # xyz, the box size and the yaw angle;
    # one pair of anchors is generated at every BEV grid cell
    self.anchors = anchors.reshape([-1, 7])  # 7 parameters: xyz, box size and yaw
    # self.anchors is a (200*176*2, 7) tensor
    self.anchors_bv = rbbox2d_to_near_bbox(
        self.anchors[:, [0, 1, 3, 4, 6]])
    # self.anchors_bv is [N, 4] as (xmin, ymin, xmax, ymax), the BEV extremes that stand in for each box
else:
    self.anchors = None
The anchor_generator lives in mmdet\core\anchor\anchor3d_generator.py:
class AnchorGeneratorStride:
    def __init__(self,
                 sizes=[1.6, 3.9, 1.56],
                 anchor_strides=[0.4, 0.4, 1.0],
                 anchor_offsets=[0.2, -39.8, -1.78],
                 rotations=[0, np.pi / 2],
                 dtype=np.float32):
        self._sizes = sizes
        self._anchor_strides = anchor_strides
        self._anchor_offsets = anchor_offsets
        self._rotations = rotations
        self._dtype = dtype

    @property
    def num_anchors_per_localization(self):
        num_rot = len(self._rotations)
        num_size = np.array(self._sizes).reshape([-1, 3]).shape[0]
        return num_rot * num_size

    def __call__(self, feature_map_size):
        return create_anchors_3d_stride(
            feature_map_size, self._sizes, self._anchor_strides,
            self._anchor_offsets, self._rotations, self._dtype)
The rest is unremarkable; num_anchors_per_localization counts how many anchors sit at each location (BEV cell), and the real work is in create_anchors_3d_stride:
def create_anchors_3d_stride(feature_size,  # [1, 200, 176] as (z, y, x)
                             sizes=[1.6, 3.9, 1.56],  # size of a single anchor
                             anchor_strides=[0.4, 0.4, 0.0],  # spacing between anchors; the cfg uses [0.4, 0.4, 1.0]
                             anchor_offsets=[0.2, -39.8, -1.78],  # lower bounds of the anchor centers; generation starts here
                             rotations=[0, np.pi / 2],  # rotations, 0 and 90 degrees in radians
                             dtype=np.float32):
    """
    Args:
        feature_size: list [D, H, W](zyx)
        sizes: [N, 3] list of list or array, size of anchors, xyz
    Returns:
        anchors: [*feature_size, num_sizes, num_rots, 7] tensor.
    """
    # almost 2x faster than v1
    # strides and lower bounds
    x_stride, y_stride, z_stride = anchor_strides  # 0.4, 0.4, 1.0
    x_offset, y_offset, z_offset = anchor_offsets  # 0.2, -39.8, -1.78
    # index grids along each axis
    z_centers = np.arange(feature_size[0], dtype=dtype)  # 0
    y_centers = np.arange(feature_size[1], dtype=dtype)  # 0, 1, ..., 200-1
    x_centers = np.arange(feature_size[2], dtype=dtype)  # 0, 1, ..., 176-1
    # actual zyx center coordinates
    z_centers = z_centers * z_stride + z_offset  # -1.78
    y_centers = y_centers * y_stride + y_offset  # -39.8, -39.4, ..., 39.8
    x_centers = x_centers * x_stride + x_offset  # 0.2, 0.6, ..., 70.2
    # number of distinct anchor sizes: N x 3 ----> here 1 x 3
    sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3])  # a 1*3 tensor; with N anchor sizes this would be N*3
    # number of distinct anchor rotations: N
    rotations = np.array(rotations, dtype=dtype)
    # build the grid: combine the per-axis centers above into the coordinates of every grid point
    rets = np.meshgrid(
        x_centers, y_centers, z_centers, rotations, indexing='ij')
    tile_shape = [1] * 5  # i.e. [1, 1, 1, 1, 1]
    tile_shape[-2] = int(sizes.shape[0])  # [1, 1, 1, N, 1], N being the number of anchor sizes (here 1)
    # the block below inserts the size dimension and the size values themselves;
    # it took me a while to follow, so a toy-sized rerun of it is shown after this function
    for i in range(len(rets)):
        rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape)
        rets[i] = rets[i][..., np.newaxis]  # for concat
    # sizes goes from [N, 3] ----> [1, 1, 1, N, 1, 3]
    sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3])
    tile_size_shape = list(rets[0].shape)
    tile_size_shape[3] = 1
    sizes = np.tile(sizes, tile_size_shape)
    rets.insert(3, sizes)
    ret = np.concatenate(rets, axis=-1)
    # the result is a (1, 200, 176, 1, 2, 7) tensor:
    # dimension 0 is the z index (a single z center, so size 1)
    # dimension 1 is the anchor's index along the y axis, 0 .. 200-1
    # dimension 2 is the anchor's index along the x axis, 0 .. 176-1
    # dimension 3 is the anchor size class; only one size is generated, so 1
    # dimension 4 is the anchor rotation, the two classes 0 and 90 degrees
    # dimension 5 holds the 7 anchor values: xyz and the box size first, yaw last
    return np.transpose(ret, [2, 1, 0, 3, 4, 5])  # the first five dimensions identify the box, the last holds its actual parameters
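To make the meshgrid/tile block easier to follow, here is the same recipe on a toy grid (2 x centers, 3 y centers, 1 z center, 1 anchor size, 2 rotations); the shapes printed along the way show exactly what each step adds:

import numpy as np

x_centers = np.array([0.2, 0.6], dtype=np.float32)
y_centers = np.array([-39.8, -39.4, -39.0], dtype=np.float32)
z_centers = np.array([-1.78], dtype=np.float32)
rotations = np.array([0, np.pi / 2], dtype=np.float32)
sizes = np.array([[1.6, 3.9, 1.56]], dtype=np.float32)   # one (w, l, h) size

rets = list(np.meshgrid(x_centers, y_centers, z_centers, rotations, indexing='ij'))
print(rets[0].shape)            # (2, 3, 1, 2): one value per (x, y, z, rotation) combination
for i in range(len(rets)):
    rets[i] = np.tile(rets[i][..., np.newaxis, :], [1, 1, 1, sizes.shape[0], 1])
    rets[i] = rets[i][..., np.newaxis]      # now (2, 3, 1, 1, 2, 1), ready for concatenation
sizes_tiled = np.tile(sizes.reshape(1, 1, 1, -1, 1, 3), [2, 3, 1, 1, 2, 1])
rets.insert(3, sizes_tiled)                 # slot the (w, l, h) block in between z and rotation
anchors = np.concatenate(rets, axis=-1)
print(anchors.shape)            # (2, 3, 1, 1, 2, 7): x, y, z, w, l, h, yaw for every anchor
anchors = np.transpose(anchors, [2, 1, 0, 3, 4, 5])
print(anchors.shape)            # (1, 3, 2, 1, 2, 7): reordered to (z, y, x, size, rotation, 7)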
Writing all of this down has made things much clearer than before; the key is to keep tracking the cfg file and always start from the cfg.
As for the core of the network, the innovations were already clear from reading the paper, so that part of the source reads fairly smoothly; next I will note down its more interesting parts.