Building on the success of LSS, PhiGent Robotics proposed BEVDet, which has now reached version 2.0 and currently sits at the top of the nuScenes leaderboard with mAP = 0.586. This article briefly explains how BEVDet works and then walks through the code in depth.
repo: https://github.com/HuangJunJie2017/BEVDet
paper: https://arxiv.org/abs/2211.17111
BEVDet consists of four main steps, as shown in the figure below: an image-view encoder, a view transformer, a BEV encoder, and a task-specific head.
Because the view transformer produces an explicit BEV feature, similar in form to a point-cloud feature, augmentation can be applied to the BEV feature directly; in addition, the voxel-pooling step is accelerated with CUDA, and the head uses an improved NMS. With that overview, let's dive into the code.
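As a minimal sketch of this four-step pipeline (the function and module names below are illustrative placeholders, not the actual classes in the repo):

def bevdet_forward(imgs, cam_params):
    # 1. image-view encoder: extract features from the surround-view images
    img_feats = image_encoder(imgs)                            # e.g. [B, 6, C, H, W]
    # 2. view transformer: lift the image features into an explicit BEV feature
    bev_feat, depth = view_transformer(img_feats, cam_params)  # e.g. [B, C', 128, 128]
    # 3. BEV encoder: further encode (and optionally augment) the BEV feature
    bev_feat = bev_encoder(bev_feat)
    # 4. head: CenterPoint-style detection head with the improved NMS
    return head(bev_feat)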
1、tools/test.py
if not distributed:
outputs = single_gpu_test(...)
# -> mmdet3d/apis/test.py
else:
...
2、mmdet3d/apis/test.py
if return_loss:
return self.forward_train(**kwargs)
else:
return self.forward_test(**kwargs)
# -> mmdet3d/models/detectors/base.py
3、mmdet3d/models/detectors/bevdet.py
class BEVDet(...):
def __init__(...):
...
def forward_test(...):
if not isinstance(img_inputs[0][0], list):
return self.simple_test(...)
def simple_test(...):
img_feats, _, _ = self.extract_feat(...)
        # follows CenterPoint
bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale)
def extract_feat(...):
img_feats, depth = self.extract_img_feat(...)
pts_feats = None
return (img_feats, pts_feats, depth)
    def extract_img_feat(...):
        # extract features from the surround-view images
        x = self.image_encoder(img[0])
        # lift to BEV features
        x, depth = self.img_view_transformer([x] + img[1:7])
        # -> mmdet3d/models/necks/view_transformer.py
        x = self.bev_encoder(x)
        return [x], depth
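For reference, img[0] above is the image tensor, while img[1:7] carries the geometric inputs consumed by the view transformer. The packing below is inferred from the view-transformer code in the next section (including the bda matrix used by get_lidar_coor):

# img_inputs packing (inferred from the code below):
# img[0]: imgs        surround-view images
# img[1]: rots        camera-to-ego rotations
# img[2]: trans       camera-to-ego translations
# img[3]: intrins     camera intrinsics
# img[4]: post_rots   rotations from image augmentation
# img[5]: post_trans  translations from image augmentation
# img[6]: bda         BEV-space augmentation matrix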
4、mmdet3d/models/necks/view_transformer.py
The key step of voxel pooling is voxel_pooling_prepare_v2; to make it easier to follow, a small worked example is provided after the code below.
class LSSViewTransformer(...):
def create_frustum(...):
...
def forward(self, input):
""" Transform image-view feature into bird-eye-view feature.
Args:
input: [image-view feature,rots,trans,intrins,post_rots,post_trans]
image-view feature:环视图片特征
rots:由相机坐标系->车身坐标系的旋转矩阵
trans:相机坐标系->车身坐标系的平移矩阵
intrinsic:相机内参
post_rots:由图像增强引起的旋转矩阵
post_trans:由图像增强引起的平移矩阵
"""
        # LIFT: x [6, 139, 16, 44]
        # the first self.D channels are depth logits, the next self.out_channels are context features
        x = self.depth_net(x)
        # depth logits
        depth_digit = x[:, :self.D, ...]
        # context features
        tran_feat = x[:, self.D:self.D + self.out_channels, ...]
        # depth probability distribution
        depth = depth_digit.softmax(dim=1)
        # transform to BEV space
        return self.view_transform(input, depth, tran_feat)
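        # To make the lift step explicit: written out with dummy tensors, the outer
        # product of the depth distribution and the context features would be (a sketch
        # only; the repo never materializes this volume and instead fuses the
        # multiplication into the CUDA voxel pooling, see bev_pool_v2 below):
        #   x = torch.randn(6, 139, 16, 44)                  # 59 depth bins + 80 context channels
        #   depth = x[:, :59].softmax(dim=1)                 # [6, 59, 16, 44]
        #   feat = x[:, 59:139]                              # [6, 80, 16, 44]
        #   volume = depth.unsqueeze(1) * feat.unsqueeze(2)  # [6, 80, 59, 16, 44]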
def view_transform(...):
return self.view_transform_core(input, depth, tran_feat)
def view_transform_core(...):
        '''
        Args:
            input: [1, 6, 512, 16, 44], surround-view camera features
            depth: [6, 59, 16, 44], depth probability distribution
            tran_feat: [6, 80, 16, 44], context features
        '''
if ...:
...
else:
            # compute the frustum points in ego (vehicle) coordinates
            coor = self.get_lidar_coor(*input[1:7])
            # splat the frustum points onto the BEV grid
            # a detailed walkthrough: https://zhuanlan.zhihu.com/p/586637783
            bev_feat = self.voxel_pooling_v2(...)
        # bev_feat: [1, 80, 128, 128]  depth: [6, 59, 16, 44]
        return bev_feat, depth
    def get_lidar_coor(...):
        # self.frustum: the precomputed frustum grid in image coordinates
        # subtract the translation introduced by image augmentation
        points = self.frustum.to(rots) - post_trans.view(B, N, 1, 1, 1, 3)
        # multiply by the inverse of the image-augmentation rotation
        points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))
        # image coords -> normalized camera coords -> camera coords -> ego coords
        # lamda * [xs, ys, 1] -> [lamda * xs, lamda * ys, lamda]: the standard
        # pixel-to-camera unprojection, which appears in many projects
        points = torch.cat((points[..., :2, :] * points[..., 2:3, :], points[..., 2:3, :]), 5)
        # fold the inverse camera intrinsics into the camera-to-ego rotation
        combine = rots.matmul(torch.inverse(cam2imgs))
        # camera coordinates -> ego coordinates
        points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
        points += trans.view(B, N, 1, 1, 1, 3)
        # bda is the augmentation matrix in BEV space; identity at test time
        # (see https://github.com/Megvii-BaseDetection/BEVDepth/issues/44)
        points = bda.view(B, 1, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)).squeeze(-1)
        return points
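        # The whole chain above in one formula (a summary sketch): with (u', v', lamda)
        # a frustum point (augmented-image pixel coordinates plus candidate depth), K the
        # camera intrinsics (cam2imgs) and R, t the camera-to-ego rotation and translation:
        #   [u, v, lamda]^T = post_rot^{-1} . ([u', v', lamda]^T - post_trans)
        #   p_ego           = bda . (R . K^{-1} . [lamda*u, lamda*v, lamda]^T + t)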
def voxel_pooling_v2(self, coor, depth, feat):
"""
Args:
coor:车身坐标系下的视锥点坐标
depth:离散深度概率分布
feat:深度特征
"""
ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths = self.voxel_pooling_prepare_v2(coor)
def voxel_pooling_prepare_v2(...):
"""Data preparation for voxel pooling
"""
B, N, D, H, W, _ = coor.shape
        num_points = B * N * D * H * W  # total number of frustum points
        ranks_depth = torch.range(0, num_points - 1, dtype=torch.int, device=coor.device)  # 0~249215
        # position index into each camera's feature map, repeated across the D depth bins:
        # [0, 1, 2, ..., 4223, 0, 1, 2, ..., 4223, ...]
        ranks_feat = ...
        # shift the origin to the lower-left corner and rescale to the BEV grid:
        # [-51.2, 51.2] -> [0, 102.4] -> [0, 128]
        coor = ((coor - self.grid_lower_bound.to(coor)) / self.grid_interval.to(coor))
        coor = coor.long().view(num_points, 3)
        # record which batch each frustum point belongs to
        batch_idx = torch.range(0, B - 1).reshape(B, 1).expand(B, num_points // B).reshape(num_points, 1).to(coor)
        coor = torch.cat((coor, batch_idx), 1)
        # filter out frustum points that fall outside the BEV grid
kept = (coor[:, 0] >= 0) & (coor[:, 0] < self.grid_size[0]) & \
(coor[:, 1] >= 0) & (coor[:, 1] < self.grid_size[1]) & \
(coor[:, 2] >= 0) & (coor[:, 2] < self.grid_size[2])
if len(kept) == 0:
return None, None, None, None, None
        # keep only the frustum points inside the BEV grid
        coor, ranks_depth, ranks_feat = coor[kept], ranks_depth[kept], ranks_feat[kept]
        # compute each frustum point's flat index into the BEV grid (128*128) from its batch, z, y, x
        ranks_bev = coor[:, 3] * (self.grid_size[2] * self.grid_size[1] * self.grid_size[0])
        ranks_bev += coor[:, 2] * (self.grid_size[1] * self.grid_size[0])
        ranks_bev += coor[:, 1] * self.grid_size[0] + coor[:, 0]
        # sort so that points mapping to the same BEV cell become adjacent
order = ranks_bev.argsort()
ranks_bev, ranks_depth, ranks_feat = ranks_bev[order], ranks_depth[order], ranks_feat[order]
kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
        # shifted comparison: within each run of identical indices, only the FIRST
        # position is True (see the worked example below)
kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
interval_starts = torch.where(kept)[0].int()
if len(interval_starts) == 0:
return None, None, None, None, None
interval_lengths = torch.zeros_like(interval_starts)
        # length of each interval: the distance from one True position to the next
interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
        return ranks_bev.int().contiguous(), ranks_depth.int().contiguous(), \
            ranks_feat.int().contiguous(), interval_starts.int().contiguous(), \
            interval_lengths.int().contiguous()
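To make the sorting-plus-shifted-comparison trick above concrete, here is a tiny worked example (the values are made up for illustration):

import torch

ranks_bev = torch.tensor([3, 1, 3, 0, 1, 3])        # BEV cell index of each kept frustum point
order = ranks_bev.argsort()                         # sort so equal cell indices become adjacent
ranks_bev = ranks_bev[order]                        # [0, 1, 1, 3, 3, 3]
kept = torch.ones(ranks_bev.shape[0], dtype=torch.bool)
kept[1:] = ranks_bev[1:] != ranks_bev[:-1]          # [T, T, F, T, F, F]: True at the start of each run
interval_starts = torch.where(kept)[0].int()        # [0, 1, 3]
interval_lengths = torch.zeros_like(interval_starts)
interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
print(interval_starts, interval_lengths)            # [0, 1, 3], [1, 2, 3]

Cell 0 receives one frustum point, cell 1 two, and cell 3 three; the CUDA kernel below sums the depth-weighted features within each such interval.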
5、mmdet3d/ops/bev_pool_v2/src/bev_pool_cuda.cu
void bev_pool_v2(...) {
"""
Args:
c:80,bev特征channel维度
n_intervals:Nd,位置为true的索引的集合
其他参数见上方的 voxel_pooling_prepare_v2函数
"""
# 索引位置为True的视锥点,每个视锥点的特征深度为80 一共开辟 视锥点个数*80个thread
# 共有(int)ceil(((double)n_intervals * c / 256)) 个block ,每个block有 256个线程 ,为每个深度特征的每一层(80层)创建一个thread
bev_pool_v2_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(...);
}
__global__ void bev_pool_v2_kernel(...) {
  // out: the output BEV feature [1, 1, 128, 128, 80]
  int idx = blockIdx.x * blockDim.x + threadIdx.x; // global index of the current thread
  int index = idx / c;  // which interval (non-empty BEV cell) this thread handles
  int cur_c = idx % c;  // which of the c (80) channels this thread handles
  if (index >= n_intervals) return;
  int interval_start = interval_starts[index];   // start of this interval in the sorted rank arrays
  int interval_length = interval_lengths[index]; // number of frustum points in this interval
  float psum = 0; // accumulated depth-weighted feature for this channel
  const float* cur_depth;
  const float* cur_feat;
  // accumulate over all frustum points falling into this BEV cell
  for(int i = 0; i < interval_length; i++){
    cur_depth = depth + ranks_depth[interval_start+i];          // predicted depth probability of the point
    cur_feat = feat + ranks_feat[interval_start+i] * c + cur_c; // context feature of the point
    psum += *cur_feat * *cur_depth;                             // weight the feature by its depth probability
  }
  const int* cur_rank = ranks_bev + interval_start; // flat index of this cell within the 128*128 BEV grid
  float* cur_out = out + *cur_rank * c + cur_c;     // output location within the 128*128*80 BEV tensor
  *cur_out = psum;
}
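As a cross-check of what the kernel computes, here is an illustrative plain-PyTorch re-implementation (a sketch under the shapes above, assuming batch size 1 and a single z level; bev_pool_v2_reference is a hypothetical helper, not code from the repo):

import torch

def bev_pool_v2_reference(depth, feat, ranks_depth, ranks_feat, ranks_bev,
                          interval_starts, interval_lengths, bev_h=128, bev_w=128):
    # depth: [N*D*H*W] flattened depth probabilities
    # feat:  [N*H*W, C] flattened context features
    c = feat.shape[1]
    out = torch.zeros(bev_h * bev_w, c)
    for start, length in zip(interval_starts.tolist(), interval_lengths.tolist()):
        # sum the depth-weighted features of all frustum points landing in one BEV cell
        d = depth[ranks_depth[start:start + length].long()]   # [length]
        f = feat[ranks_feat[start:start + length].long()]     # [length, C]
        out[ranks_bev[start].long()] = (d.unsqueeze(1) * f).sum(dim=0)
    return out.view(1, 1, bev_h, bev_w, c)                    # [1, 1, 128, 128, 80]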
Working through BEVDet gives a deeper understanding of the LSS idea, as well as of every operation inside voxel pooling. Next, I hope to dig into a few more top-down (query-based) papers such as PETR and PETRv2.