Sparse convolution has been used successfully in 3D object detection networks such as SECOND, Part-A², and PV-RCNN, which demonstrates its effectiveness. Compared with dense 3D convolution, it has a huge advantage in both speed and GPU memory consumption. The sparse convolution used here was introduced in the SECOND paper, which also describes how to implement it; Part-A² and PV-RCNN, however, use a different version, the spconv library, which is very easy to use from PyTorch.
This article walks through the OpenPCDet and spconv code to see how spconv is actually implemented. OpenPCDet mainly serves to show how spconv is used.
The most important sentence comes from the spconv README: "This implementation use gather-gemm-scatter framework to do sparse convolution." After reading this article, you should be able to understand it.
Let us first look at how spconv is used, specifically the VoxelBackBone8x class in pcdet/models/backbones_3d/spconv_backbone.py of OpenPCDet.
class VoxelBackBone8x(nn.Module):
    def __init__(self, model_cfg, input_channels, grid_size, **kwargs):
        super().__init__()
        self.model_cfg = model_cfg
        norm_fn = partial(nn.BatchNorm1d, eps=1e-3, momentum=0.01)
        self.sparse_shape = grid_size[::-1] + [1, 0, 0]  # self.sparse_shape = [41, 1600, 1408]
        self.conv_input = spconv.SparseSequential(
            spconv.SubMConv3d(input_channels, 16, 3, padding=1, bias=False, indice_key='subm1'),  # the call to focus on
            norm_fn(16),
            nn.ReLU(),
        )
        ...

    def forward(self, batch_dict):
        """
        Args:
            batch_dict:
                batch_size: int
                vfe_features: (num_voxels, C)
                voxel_coords: (num_voxels, 4), [batch_idx, z_idx, y_idx, x_idx]
        Returns:
            batch_dict:
                encoded_spconv_tensor: sparse tensor
        """
        voxel_features, voxel_coords = batch_dict['voxel_features'], batch_dict['voxel_coords']
        batch_size = batch_dict['batch_size']
        # first construct the sparse tensor
        input_sp_tensor = spconv.SparseConvTensor(
            features=voxel_features,          # [32000, 3]; 32000 is the number of voxels in the two point clouds, 3 is xyz
            indices=voxel_coords.int(),       # [32000, 4]; voxel coordinates, the first column is the frame (batch) index, the rest are the three spatial indices
            spatial_shape=self.sparse_shape,  # self.sparse_shape = [41, 1600, 1408], determined by the KITTI range and voxel size
            batch_size=batch_size             # batch_size = 2, i.e. two point clouds
        )
        x = self.conv_input(input_sp_tensor)  # run the submanifold convolution
        ...
Before running the sparse convolution we need to build a SparseConvTensor. Let us see how it is constructed, in spconv/__init__.py of spconv:
class SparseConvTensor(object):
    def __init__(self, features, indices, spatial_shape, batch_size, grid=None):
        """
        Args:
            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
                is very large.
        """
        self.features = features      # the features of the non-empty voxels, stored densely
        self.indices = indices        # for each feature row, its coordinate in the voxel grid
        if self.indices.dtype != torch.int32:
            self.indices.int()
        self.spatial_shape = spatial_shape
        self.batch_size = batch_size  # the batch size
        self.indice_dict = {}         # caches the index correspondences between layers
        self.grid = grid
        ...
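To make the features/indices relationship concrete, here is a tiny hand-written illustration (toy numbers and plain PyTorch, not spconv API): the pair (features, indices) is just a compact encoding of a mostly-zero dense tensor of shape [batch, C, D, H, W].

import torch

# toy example: batch_size = 1, spatial_shape = [2, 3, 3], C = 2, three non-empty voxels
features = torch.tensor([[1., 1.],
                         [2., 2.],
                         [3., 3.]])                        # [num_voxels, C]
indices = torch.tensor([[0, 0, 0, 0],
                        [0, 0, 1, 2],
                        [0, 1, 2, 1]], dtype=torch.int32)  # [num_voxels, 4] = [batch_idx, z, y, x]

# what the SparseConvTensor conceptually represents: a mostly-zero dense tensor
dense = torch.zeros(1, 2, 2, 3, 3)                         # [batch, C, D, H, W]
for f, (b, z, y, x) in zip(features, indices.tolist()):
    dense[b, :, z, y, x] = f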
Continuing from above: the submanifold convolution takes this sparse tensor as input. Look at spconv/conv.py in spconv:
class SparseConvolution(SparseModule):
    def __init__(self,
                 ndim,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 subm=False,
                 output_padding=0,
                 transposed=False,
                 inverse=False,
                 indice_key=None):
        super(SparseConvolution, self).__init__()
        ...
        # weight and bias below are this layer's filter weights and bias
        self.weight = Parameter(
            torch.Tensor(*kernel_size, in_channels, out_channels))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        ...

    def forward(self, input):
        ...
        # The code above is preparation: 1) compute out_spatial_shape from kernel size, padding, etc.,
        # and handle the special case of a 1x1 kernel.
        # input is a sparse tensor; look up the indice data this layer depends on (cached by a previous layer)
        datas = input.find_indice_pair(self.indice_key)
        ...
        if self.indice_key is not None and datas is not None:
            outids, _, indice_pairs, indice_pair_num, _ = datas
        else:
            # Compute the input and output indices of this layer and their correspondence.
            # The input indices are simply input.indices; the output indices are outids.
            # indice_pairs describes how indices and outids correspond; indice_pair_num is a helper count.
            # indices: [32000, 4]
            # outids: [32000, 4]; because of the submanifold property, outids is identical to indices
            #   (for a regular sparse convolution they would differ)
            # indice_pairs: [27, 2, 32000]; 27 because of the 3x3x3 kernel; along dim 1,
            #   position 0 stores row indices into indices (input) and position 1 stores row indices into outids (output)
            # indice_pair_num: [27]; indice_pair_num[i] is the number of valid (input, output) pairs for
            #   kernel offset i (the remaining slots of indice_pairs are padding); it is used to size the
            #   buffers during the convolution
            outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
                indices, batch_size, spatial_shape, self.kernel_size,
                self.stride, self.padding, self.dilation, self.output_padding, self.subm, self.transposed, grid=input.grid)
            input.indice_dict[self.indice_key] = (outids, indices, indice_pairs, indice_pair_num, spatial_shape)
        ...
        if self.subm:
            # run the submanifold convolution
            out_features = Fsp.indice_subm_conv(features, self.weight,
                                                indice_pairs.to(device),
                                                indice_pair_num,
                                                outids.shape[0])
        ...
        # add the bias
        if self.bias is not None:
            out_features += self.bias
        # build the output sparse tensor
        out_tensor = spconv.SparseConvTensor(out_features, outids,
                                             out_spatial_shape, batch_size)
        out_tensor.indice_dict = input.indice_dict
        out_tensor.grid = input.grid
        return out_tensor
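One thing worth noting in forward() is the indice_key cache: submanifold layers that share the same indice_key (and kernel size) compute (outids, indice_pairs, indice_pair_num) only once; the next layer gets a cache hit in find_indice_pair and skips get_indice_pairs entirely. OpenPCDet stacks its submanifold convolutions in exactly this pattern, roughly like the following sketch (simplified, using the norm_fn defined above, not the literal OpenPCDet code):

# both layers use indice_key='subm1': the first one builds the indice pairs and stores them
# in the sparse tensor's indice_dict, the second one reuses them via find_indice_pair('subm1')
conv_block = spconv.SparseSequential(
    spconv.SubMConv3d(16, 16, 3, padding=1, bias=False, indice_key='subm1'),
    norm_fn(16),
    nn.ReLU(),
    spconv.SubMConv3d(16, 16, 3, padding=1, bias=False, indice_key='subm1'),
    norm_fn(16),
    nn.ReLU(),
)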
Two questions remain open from the code above: 1) where do the output indices (outids) and the input-output correspondence indice_pairs come from; 2) how is the submanifold convolution itself computed.
First, how outids and indice_pairs are computed; this lives in spconv/ops.py of spconv:
def get_indice_pairs(indices,
                     batch_size,
                     spatial_shape,
                     ksize=3,
                     stride=1,
                     padding=0,
                     dilation=1,
                     out_padding=0,
                     subm=False,
                     transpose=False,
                     grid=None):
    ...
    # The code above computes out_shape from the input indices, kernel size, etc.
    if grid is None:
        if ndim == 2:
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_2d
        elif ndim == 3:
            # here get_indice_pairs_func is get_indice_pairs_3d (the key function!)
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_3d
        else:
            raise NotImplementedError
        # the call to focus on, with the values from our example:
        # indices: [32000, 4]
        # batch_size = 2
        # out_shape = [41, 1600, 1408]
        # spatial_shape = [41, 1600, 1408]
        # ksize = [3, 3, 3]
        # stride = [1, 1, 1]
        # int(subm) = 1
        # int(transpose) = 0
        return get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
                                     stride, padding, dilation, out_padding, int(subm), int(transpose))
    ...
Next comes the get_indice_pairs_3d function, which is registered in src/spconv/all.cc of spconv:
static auto registry =
    ...
        .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)  // get_indice_pairs_3d is registered here
        .op("spconv::indice_conv_fp32", &spconv::indiceConv<float>)
    ...
So we have to continue in include/spconv/spconv_ops.h of spconv:
template <unsigned NDim>
std::vector<torch::Tensor>
getIndicePair(torch::Tensor indices, int64_t batchSize,
              std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
              std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
              std::vector<int64_t> padding, std::vector<int64_t> dilation,
              std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  // Not expanding the details here; read the code yourself if interested.
  // What it does: given the input indices, the convolution type and the kernel size, determine which
  // output voxels are non-empty, i.e. compute outids. Because of the submanifold property, outids is
  // in fact identical to indices here.
  // With outids and indices in hand, we still need to know, for each voxel in indices, which of its
  // 27 neighbouring positions maps to which entry of outids. That is indice_pairs.
}
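To build intuition for what getIndicePair returns in the submanifold case, here is a brute-force Python sketch (for understanding only; the real implementation is a CUDA kernel and packs the pairs into fixed-size tensors, while this toy version just uses Python lists):

import itertools

def subm_indice_pairs(indices, kernel_size=3):
    """Toy version of get_indice_pairs for a submanifold conv (stride 1, 'same' padding).
    indices: list of [batch, z, y, x] integer lists; returns outids and a dict
    kernel_offset -> list of (input_row, output_row) pairs."""
    outids = [list(row) for row in indices]        # submanifold: output locations == input locations
    lookup = {tuple(row): i for i, row in enumerate(indices)}
    r = kernel_size // 2
    offsets = list(itertools.product(range(-r, r + 1), repeat=3))  # 27 displacements for a 3x3x3 kernel
    pairs = {k: [] for k in range(len(offsets))}
    for out_row, (b, z, y, x) in enumerate(indices):
        for k, (dz, dy, dx) in enumerate(offsets):
            neighbour = (b, z + dz, y + dy, x + dx)
            if neighbour in lookup:                # only non-empty input voxels contribute
                pairs[k].append((lookup[neighbour], out_row))
    return outids, pairs

In this toy version, the indice_pair_num[i] of the real code is simply len(pairs[i]).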
The second question, how the submanifold convolution is computed, starts in spconv/functional.py of spconv:
class SubMConvFunction(Function):
    @staticmethod
    def forward(ctx,
                features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out):
        ctx.save_for_backward(indice_pairs,
                              indice_pair_num,
                              features,
                              filters)
        # the important call
        return ops.indice_conv(features, filters, indice_pairs, indice_pair_num, num_activate_out, False, True)

    @staticmethod
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        input_bp, filters_bp = ops.indice_conv_backward(features, filters, grad_output, indice_pairs, indice_pair_num, False, True)
        return input_bp, filters_bp, None, None, None

indice_subm_conv = SubMConvFunction.apply
For indice_conv we go back to spconv/ops.py:
def indice_conv(features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out,
                inverse=False,
                subm=False):
    if filters.dtype == torch.float32:
        # the important call
        return torch.ops.spconv.indice_conv_fp32(features, filters, indice_pairs,
                                                 indice_pair_num, num_activate_out,
                                                 int(inverse), int(subm))
    elif filters.dtype == torch.half:
        return torch.ops.spconv.indice_conv_half(features, filters, indice_pairs,
                                                 indice_pair_num, num_activate_out,
                                                 int(inverse), int(subm))
    else:
        raise NotImplementedError
From src/spconv/all.cc we know that indice_conv_fp32 maps to spconv::indiceConv<float>, so we look at include/spconv/spconv_ops.h:
template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
  ...
  // kernelVolume = 27
  // i.e. this loop handles each of the 27 cells of the 3x3x3 kernel separately
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
    // views over the pre-allocated input/output buffers
    auto outputBufferBlob =
        torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
    ...
    // With indices, outids and the indice_pairs relating them, gather the input voxels
    // that this kernel cell has to process.
    functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
    gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
               tv::torch2tv<const T>(features),
               tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
    TV_CHECK_CUDA_ERR();
    // Matrix multiply: for this kernel cell, compute the partial output features produced
    // by this cell's weights.
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
    ...
    // Scatter-add the partial outputs to the output rows given by outids.
    functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
    scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                tv::torch2tv<const T>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
    TV_CHECK_CUDA_ERR();
  }
  return output;
}
At this point, the sentence from the README becomes clear: "This implementation use gather-gemm-scatter framework to do sparse convolution."
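Stripped of buffers and CUDA details, the loop in indiceConv above boils down to the following Python sketch (a conceptual re-implementation for illustration, not the library code); per kernel offset it is literally gather, then gemm, then scatter:

import torch

def gather_gemm_scatter(features, weight, pairs, num_out):
    """features: [N_in, C_in]; weight: [K, C_in, C_out]; num_out: number of output voxels;
    pairs: dict kernel_offset -> list of (input_row, output_row), e.g. from the toy
    subm_indice_pairs sketch earlier."""
    out = torch.zeros(num_out, weight.shape[-1])
    for k, kp in pairs.items():
        if not kp:                                 # nHot <= 0: nothing to do for this offset
            continue
        in_idx = torch.tensor([p[0] for p in kp])
        out_idx = torch.tensor([p[1] for p in kp])
        gathered = features[in_idx]                # gather: the input rows this offset touches
        partial = gathered @ weight[k]             # gemm:   one dense matmul per kernel offset
        out.index_add_(0, out_idx, partial)        # scatter: accumulate into the output rows
    return out

For a submanifold convolution every offset scatters into the same set of output rows as the input rows; for a regular sparse convolution the same sketch works, only outids and the pairs differ.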
Here "gemm" refers to optimized general matrix multiplication. This code simply calls torch::mm_out, i.e. it relies on the GEMM that PyTorch already provides. For background on GEMM, see these posts:
https://blog.csdn.net/qq_20880415/article/details/104332743
https://zhuanlan.zhihu.com/p/66958390
The short version: GEMM optimization does not change the amount of arithmetic; by splitting (tiling) the loops it replaces main-memory accesses with cache accesses as much as possible, which is what saves time.
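As a toy illustration of the loop-splitting idea (written for clarity, not speed; real GEMM kernels do this in C/assembly with vectorized inner loops), the arithmetic of A @ B is unchanged, but each tile of A, B and C is small enough to stay in cache while it is reused:

import numpy as np

def blocked_matmul(A, B, tile=64):
    """Same result as A @ B, but the i/j/k loops are split into tiles so that the
    working set of each innermost update fits in cache and is reused many times."""
    M, K = A.shape
    _, N = B.shape
    C = np.zeros((M, N), dtype=A.dtype)
    for i0 in range(0, M, tile):
        for j0 in range(0, N, tile):
            for k0 in range(0, K, tile):
                C[i0:i0 + tile, j0:j0 + tile] += (
                    A[i0:i0 + tile, k0:k0 + tile] @ B[k0:k0 + tile, j0:j0 + tile])
    return C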