Among open-source BEV models, the lineage traces back to Lift-Splat-Shoot, the paper NVIDIA open-sourced in 2020. Its core idea is to explicitly estimate a depth distribution for the image features and transform them into BEV features. As the pioneering work for the BEV view, it is well worth reading both the paper and the code, so this post annotates the key code. Readers who are studying (or want to study) BEV models are welcome to join the discussion group to talk through questions about the paper or the implementation, WeChat: Rex1586662742 (the digits are also the QQ number).
Project: https://github.com/nv-tlabs/lift-splat-shoot
Companion video: 手撕BEV的开山之作 lift, splat, shoot (a hands-on walkthrough of the pioneering BEV work)
The main contribution of LSS is the Lift step: within the camera frustum, it explicitly predicts a discrete depth distribution for every feature point, as shown in the figure below.
Walking through the inference path makes the idea of the paper and its implementation in the code clear; the forward pass is traced below.
1. src/explore.py
def viz_model_preds(...):
'''
xbound=[-50.0, 50.0, 0.5],  BEV grid range and cell size along x
ybound=[-50.0, 50.0, 0.5],  BEV grid range and cell size along y
zbound=[-10.0, 10.0, 20.0], a single height bin: LSS does not resolve object height
dbound=[4.0, 45.0, 1.0],    discrete depth range, 1 m spacing (41 bins)
(see the sketch after this snippet for how these bounds become grid parameters)
'''
out = model(...)
# execution continues in src/models.py -> LiftSplatShoot.forward(...)
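Before going into the model, it helps to see how these bounds turn into grid parameters. Below is a minimal sketch mirroring the gen_dx_bx helper in src/tools.py; the values in the comments are derived from the config quoted above.

import torch

# A minimal sketch of how the bound config becomes grid parameters
# (mirroring the gen_dx_bx helper in src/tools.py).
def gen_dx_bx(xbound, ybound, zbound):
    # dx: voxel size per axis -> [0.5, 0.5, 20.0]
    dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]])
    # bx: center of the first voxel -> [-49.75, -49.75, 0.0]
    bx = torch.Tensor([row[0] + row[2] / 2.0 for row in [xbound, ybound, zbound]])
    # nx: number of voxels per axis -> [200, 200, 1]
    nx = torch.LongTensor([int((row[1] - row[0]) / row[2]) for row in [xbound, ybound, zbound]])
    return dx, bx, nx

dx, bx, nx = gen_dx_bx([-50.0, 50.0, 0.5], [-50.0, 50.0, 0.5], [-10.0, 10.0, 20.0])
# nx = [200, 200, 1]: a 200 x 200 BEV grid with a single height bin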
2. src/models.py
class LiftSplatShoot(...):
def __init__(...):
# build the frustum (pixel grid x candidate depths)
self.frustum = self.create_frustum()
# image feature + depth extraction (the Lift step)
self.camencode = CamEncode(self.D, self.camC, self.downsample)
# BEV feature encoder
self.bevencode = BevEncode(inC=self.camC, outC=outC)
def create_frustum(...):
# input image height and width (final_dim after augmentation)
ogfH, ogfW = self.data_aug_conf['final_dim']
# feature map height and width
fH, fW = ogfH // self.downsample, ogfW // self.downsample
# candidate depths ds: [41, 8, 22], 41 discrete depths per feature cell
# (a standalone sketch of building ds/xs/ys follows this method)
ds = ...
# frustum x (pixel u) coordinates xs: [41, 8, 22]
xs = ...
# frustum y (pixel v) coordinates ys: [41, 8, 22]
ys = ...
# frustum point cloud frustum: [41, 8, 22, 3]
frustum = torch.stack((xs, ys, ds), -1)
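The elided ds/xs/ys are built with torch.arange and torch.linspace. Here is a standalone sketch of that construction, assuming final_dim = (128, 352), downsample = 16 and dbound = [4.0, 45.0, 1.0] as quoted above.

import torch

# Standalone sketch of frustum construction.
def create_frustum(final_dim=(128, 352), downsample=16, dbound=(4.0, 45.0, 1.0)):
    ogfH, ogfW = final_dim
    fH, fW = ogfH // downsample, ogfW // downsample          # 8, 22
    # 41 candidate depths from 4 m to 44 m, broadcast over the feature map
    ds = torch.arange(*dbound, dtype=torch.float).view(-1, 1, 1).expand(-1, fH, fW)
    D = ds.shape[0]                                          # 41
    # pixel coordinates (in the input image) of each feature cell
    xs = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view(1, 1, fW).expand(D, fH, fW)
    ys = torch.linspace(0, ogfH - 1, fH, dtype=torch.float).view(1, fH, 1).expand(D, fH, fW)
    # each frustum point is (u, v, d): pixel position plus candidate depth
    return torch.stack((xs, ys, ds), -1)                     # [41, 8, 22, 3]

print(create_frustum().shape)  # torch.Size([41, 8, 22, 3])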
def get_geometry(...):
"""
param:rots, trans, intrins, post_rots, post_trans
"""
# undo the image-preprocessing (augmentation) translation on the frustum points
points = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
# undo the image-preprocessing rotation (multiply by its inverse)
points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))
# image coordinates -> camera coordinates: scale (xs, ys) by the depth lambda,
# turning (xs, ys, lambda) into (lambda*xs, lambda*ys, lambda) so that the inverse
# intrinsics can be applied (a single-point numerical sketch follows this method)
points = torch.cat((points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
points[:, :, :, :, :, 2:3]
), 5)
# camera coordinates -> ego-vehicle coordinates
combine = rots.matmul(torch.inverse(intrins))
points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
points += trans.view(B, N, 1, 1, 1, 3)
return points
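To make the transform concrete, here is a single-point numerical sketch with hypothetical intrinsics and extrinsics: a frustum point (u, v, d) is scaled to (d*u, d*v, d), mapped to camera coordinates with the inverse intrinsics, then into the ego frame with the rotation and translation.

import torch

# Hypothetical values, chosen only to illustrate the math:
#   p_cam = K^-1 @ [d*u, d*v, d]^T,  p_ego = R @ p_cam + t
u, v, d = 176.0, 64.0, 10.0                          # pixel coords + candidate depth
K = torch.tensor([[1250.0, 0.0, 176.0],
                  [0.0, 1250.0, 64.0],
                  [0.0, 0.0, 1.0]])                  # assumed intrinsics
R, t = torch.eye(3), torch.tensor([1.5, 0.0, 1.6])   # assumed cam -> ego extrinsics

p_img = torch.tensor([u * d, v * d, d])              # scale (u, v, 1) by the depth
p_cam = torch.inverse(K) @ p_img                     # camera coordinates
p_ego = R @ p_cam + t                                # ego-vehicle coordinates
print(p_ego)                                         # tensor([ 1.5000,  0.0000, 11.6000])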
def get_cam_feats(...):
"""
x: surround-view images [4, 6, 3, 128, 352]
"""
# image + depth feature extraction -> [24, 64, 41, 8, 22]
x = self.camencode(x)
x = x.view(B, N, self.camC, self.D, imH//self.downsample, imW//self.downsample)
# [4, 6, 41, 8, 22, 64]
x = x.permute(0, 1, 3, 4, 5, 2)
return x
def voxel_pooling(...):
"""
geom_feats: frustum points in the ego frame [4, 6, 41, 8, 22, 3]
x: surround-view image features [4, 6, 41, 8, 22, 64]
"""
Nprime = B*N*D*H*W # total number of frustum points
# x:[173184, 64]
x = x.reshape(Nprime, C)
# shift the origin from the center of the BEV space to its bottom-left corner and convert metric coordinates to voxel indices
geom_feats = ((geom_feats - (self.bx - self.dx/2.)) / self.dx).long()
# voxel indices of all frustum points geom_feats: [173184, 3]
geom_feats = geom_feats.view(Nprime, 3)
# batch_ix: [173184, 1], e.g. [0,0,0,...,1,1,1,...,2,2,2], records which batch each point belongs to
batch_ix = ...
# filter out points that fall outside the BEV grid
kept = ...
x = x[kept]
geom_feats = geom_feats[kept]
# encode (x, y, z, b) into a single rank so that points from different cameras
# that land in the same voxel of the same batch share one value, i.e.
# ranks = x * (nx[1]*nx[2]*B) + y * (nx[2]*B) + z * B + b
ranks = ...
sorts = ranks.argsort()
# sort so that points with identical (x, y, z, b) become adjacent
x, geom_feats, ranks = x[sorts], geom_feats[sorts], ranks[sorts]
# for identical rank values, keep only the last occurrence
# e.g. ranks [1,1,2,2,2,3,3,3,3,3] give the mask [0,1,0,0,1,0,0,0,0,1]
# and only the points where the mask is 1 are kept (see the sketch after this method)
if not self.use_quickcumsum:
x, geom_feats = cumsum_trick(x, geom_feats, ranks)
else:
...
# final: [4, 64, 1, 200, 200] BEV features; the single z bin is collapsed before returning, giving [4, 64, 200, 200]
final = ...
return final
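The "keep only the last of each rank" step is the sum-pooling cumsum trick: take a running sum of the sorted features, keep the cumulative value at the last point of each rank, then difference consecutive kept values to recover per-voxel sums. A sketch following cumsum_trick in src/tools.py, with a toy example:

import torch

# Sum-pool all points that fall into the same BEV voxel.
def cumsum_trick(x, geom_feats, ranks):
    x = x.cumsum(0)                                    # running sum over sorted points
    kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
    kept[:-1] = ranks[1:] != ranks[:-1]                # True at the last point of each rank
    x, geom_feats = x[kept], geom_feats[kept]
    x = torch.cat((x[:1], x[1:] - x[:-1]))             # difference -> per-voxel sums
    return x, geom_feats

# toy example: ranks [1,1,2,2,2] -> two voxels, features summed per voxel
x = torch.ones(5, 2)
ranks = torch.tensor([1, 1, 2, 2, 2])
geom = torch.zeros(5, 4, dtype=torch.long)
out, _ = cumsum_trick(x, geom, ranks)
print(out)  # tensor([[2., 2.], [3., 3.]])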
def get_voxels(...):
geom = self.get_geometry(rots, trans, intrins, post_rots, post_trans)
x = self.get_cam_feats(x)
# x: [4, 64, 200, 200] BEV features
x = self.voxel_pooling(geom, x)
return x
def forward(...):
"""
x: imgs, surround-view images (bs, N, 3, H, W)
rots: rotation from the camera frame to the ego-vehicle frame, rots = (bs, N, 3, 3)
trans: translation from the camera frame to the ego-vehicle frame, trans = (bs, N, 3)
intrins: camera intrinsics, intrinsic = (bs, N, 3, 3)
post_rots: rotation induced by image augmentation, post_rots = (bs, N, 3, 3)
post_trans: translation induced by image augmentation, post_trans = (bs, N, 3)
(a dummy-tensor sketch of calling forward follows this method)
"""
x = self.get_voxels(x, rots, trans, intrins, post_rots, post_trans)
# encode the BEV features
x = self.bevencode(x)
return x
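For reference, a hypothetical smoke test with dummy tensors makes the input shapes above concrete; grid_conf and data_aug_conf follow the values quoted in step 1. The actual model call (shown as comments) requires the repo and pretrained EfficientNet weights; compile_model is the factory used by the repo's scripts.

import torch

grid_conf = {'xbound': [-50.0, 50.0, 0.5], 'ybound': [-50.0, 50.0, 0.5],
             'zbound': [-10.0, 10.0, 20.0], 'dbound': [4.0, 45.0, 1.0]}
data_aug_conf = {'final_dim': (128, 352)}

B, N = 4, 6
imgs = torch.randn(B, N, 3, 128, 352)                  # surround-view images
rots = torch.eye(3).expand(B, N, 3, 3).contiguous()    # dummy cam -> ego rotations
trans = torch.zeros(B, N, 3)                           # dummy cam -> ego translations
intrins = torch.eye(3).expand(B, N, 3, 3).contiguous() # dummy intrinsics
post_rots = torch.eye(3).expand(B, N, 3, 3).contiguous()
post_trans = torch.zeros(B, N, 3)

# from src.models import compile_model
# model = compile_model(grid_conf, data_aug_conf, outC=1)
# preds = model(imgs, rots, trans, intrins, post_rots, post_trans)
# preds.shape -> torch.Size([4, 1, 200, 200]), one BEV logit map per sample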
# LIFT
class CamEncode(...):
def __init__(...):
self.D = D # number of discrete depth bins, 41
self.C = C # number of feature channels, 64
# image backbone (EfficientNet)
self.trunk = ...
# upsampling / feature-fusion module
self.up1 = ...
# head that predicts the depth distribution and context features (D + C channels)
self.depthnet = ...
def get_depth_feat(...):
# extract features with EfficientNet, x: 24 x 512 x 8 x 22
x = self.get_eff_depth(x)
# predict D depth logits + C context channels
x = self.depthnet(x)
# depth distribution (softmax over the first D channels)
depth = self.get_depth_dist(x[:, :self.D])
# frustum features: outer product of the depth distribution and the context features (see the sketch after this method)
new_x = ...
return depth, new_x
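The core of the Lift step is this outer product: the depthnet output has D + C channels, the first D become a per-pixel depth distribution via softmax, and multiplying it with the remaining C context channels yields a feature for every (depth bin, pixel) frustum point. A minimal sketch with dummy tensors, shapes as in the comments above:

import torch

D, C = 41, 64
x = torch.randn(24, D + C, 8, 22)                 # depthnet output (B*N, D+C, fH, fW)
depth = x[:, :D].softmax(dim=1)                   # [24, 41, 8, 22] depth distribution
feats = x[:, D:D + C]                             # [24, 64, 8, 22] context features
new_x = depth.unsqueeze(1) * feats.unsqueeze(2)   # [24, 64, 41, 8, 22] frustum features
print(new_x.shape)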
# The Splat step itself happens in voxel_pooling above; BevEncode encodes the pooled BEV features
class BevEncode(...):
def __init__(...):
...
def forward(...):
...
3. src/tools.py
class SimpleLoss(torch.nn.Module):
def __init__(self, pos_weight):
super(SimpleLoss, self).__init__()
# sigmoid + binary cross-entropy loss, with a weight on the positive class (a tiny usage sketch follows this class)
self.loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([pos_weight]))
def forward(self, ypred, ytgt):
loss = self.loss_fn(ypred, ytgt)
return loss
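A tiny usage sketch of the loss, assuming the SimpleLoss class above is in scope; the pos_weight value here is only illustrative. Predictions are raw BEV logits and targets are the binary ground-truth maps.

import torch

loss_fn = SimpleLoss(pos_weight=2.13)                    # illustrative pos_weight
ypred = torch.randn(4, 1, 200, 200)                      # BEV logits from the model
ytgt = torch.randint(0, 2, (4, 1, 200, 200)).float()     # binary BEV target
print(loss_fn(ypred, ytgt))                              # scalar loss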
The above covers the basic flow of the LSS implementation; corrections are welcome if there are any mistakes. Next up are other models in the BEV family, which should deepen the understanding of BEV models further.