Point-Voxel CNN for Efficient 3D Deep Learning, 2019 NIPS
code:https://github.com/mit-han-lab/pvcnn
文章的解读可以看我的另一篇博客。
PVconv的具体实现在pvcnn-master/modules/pvconv.py
# Excerpt from PVConv.forward: voxel branch, then fusion with the point branch.
# Voxelize the per-point features; the call also returns per-point grid coordinates.
voxel_features, voxel_coords = self.voxelization(features, coords)
# Process the voxel grid with self.voxel_layers.
voxel_features = self.voxel_layers(voxel_features)
# Map the grid features back onto the points at their grid coordinates.
voxel_features = F.trilinear_devoxelize(voxel_features, voxel_coords, self.resolution, self.training)
# Fuse with the features computed directly on the points.
fused_features = voxel_features + self.point_features(features)
首先,根据上面代码可以看到,是把point-wise的feature和coords传入,那我们也跟进去看,pvcnn-master/modules/voxelization.py:
class Voxelization(nn.Module):
    """Scatter per-point features onto an ``r x r x r`` voxel grid by averaging.

    Args:
        resolution: Voxel resolution ``r``; converted with ``int()``.
        normalize: If true, scale each centred cloud by twice its largest
            point norm so it fits in a unit cube; otherwise map the centred
            coordinates with ``(x + 1) / 2``.
        eps: Added to the normalisation denominator. With the default ``0``
            a cloud whose points all coincide divides by zero.
    """

    def __init__(self, resolution, normalize=True, eps=0):
        super().__init__()
        self.r = int(resolution)
        self.normalize = normalize
        self.eps = eps

    def forward(self, features, coords):
        """Voxelize ``features`` according to ``coords``.

        Args:
            features: Point features, indexed ``[B, C, N]``.
            coords: Point coordinates, indexed ``[B, 3, N]`` (the mean is taken
                over dim 2 and the norm over dim 1).

        Returns:
            A tuple ``(voxel_features, norm_coords)`` where ``voxel_features``
            is the result of ``F.avg_voxelize`` and ``norm_coords`` are the
            float grid coordinates of every point, clamped to ``[0, r - 1]``.
        """
        # Coordinates only steer the scatter; no gradient flows through them.
        coords = coords.detach()
        # Move each cloud into a local frame by subtracting its centroid.
        norm_coords = coords - coords.mean(2, keepdim=True)
        if self.normalize:
            # Divide by 2 * (largest point norm) so values lie in [-0.5, 0.5],
            # then shift to [0, 1].
            norm_coords = norm_coords / (norm_coords.norm(dim=1, keepdim=True).max(dim=2, keepdim=True).values * 2.0 + self.eps) + 0.5
        else:
            norm_coords = (norm_coords + 1) / 2.0
        # Scale by r, then clamp so every coordinate lies in [0, r - 1].
        norm_coords = torch.clamp(norm_coords * self.r, 0, self.r - 1)
        # Rounding yields integer voxel coordinates in {0, ..., r - 1}.
        vox_coords = torch.round(norm_coords).to(torch.int32)
        # Average the features of all points that share a voxel.
        return F.avg_voxelize(features, vox_coords, self.r), norm_coords
把feature和vox_coords传入,那么我们也跟进去看,pvcnn-master/modules/functional/voxelization.py:
class AvgVoxelization(Function):
    """Autograd function wrapping the average-pool voxelization backend."""

    @staticmethod
    def forward(ctx, features, coords, resolution):
        """Average point features into voxels.

        :param ctx: autograd context
        :param features: Features of the point cloud, FloatTensor[B, C, N]
        :param coords: Voxelized Coordinates of each point, IntTensor[B, 3, N]
        :param resolution: Voxel resolution
        :return:
            Voxelized Features, FloatTensor[B, C, R, R, R]
        """
        point_feats = features.contiguous()
        voxel_idx3d = coords.int().contiguous()
        batch, channels, _ = point_feats.shape
        # The backend also returns each point's flat voxel index and the
        # per-voxel point counts; both are kept for the backward pass.
        pooled, indices, counts = _backend.avg_voxelize_forward(point_feats, voxel_idx3d, resolution)
        ctx.save_for_backward(indices, counts)
        grid_shape = (batch, channels) + (resolution,) * 3
        return pooled.view(*grid_shape)

    @staticmethod
    def backward(ctx, grad_output):
        """Propagate the gradient from the voxel grid back to the points.

        :param ctx: autograd context
        :param grad_output: gradient of output, FloatTensor[B, C, R, R, R]
        :return:
            gradient of inputs, FloatTensor[B, C, N]
        """
        batch, channels = grad_output.shape[:2]
        indices, counts = ctx.saved_tensors
        # Flatten the three spatial axes before handing over to the backend.
        flat_grad = grad_output.contiguous().view(batch, channels, -1)
        grad_features = _backend.avg_voxelize_backward(flat_grad, indices, counts)
        # coords and resolution receive no gradient.
        return grad_features, None, None
前向计算中的coords其实是vox_coords,我们再次跟到c++的程序中,pvcnn-master/modules/functional/src/voxelization/vox.cpp:
/*
  Function: average pool voxelization (forward), host-side entry point
  Args:
    features   : features, FloatTensor[b, c, n]
    coords     : coords of each point, IntTensor[b, 3, n]
    resolution : voxel resolution
  Return:
    out : outputs, FloatTensor[b, c, s], s = r ** 3
    ind : voxel index of each point, IntTensor[b, n]
    cnt : #points in each voxel index, IntTensor[b, s]
*/
std::vector<at::Tensor> avg_voxelize_forward(const at::Tensor features,
                                             const at::Tensor coords,
                                             const int resolution) {
  // Input validation (order preserved: device, contiguity, dtype).
  CHECK_CUDA(features);
  CHECK_CUDA(coords);
  CHECK_CONTIGUOUS(features);
  CHECK_CONTIGUOUS(coords);
  CHECK_IS_FLOAT(features);
  CHECK_IS_INT(coords);

  const int batch = features.size(0);
  const int channels = features.size(1);
  const int num_points = features.size(2);
  const int res = resolution;
  const int res_sq = res * res;
  const int res_cu = res_sq * res;

  // All outputs live on the same device as the input features.
  const auto int_opts =
      at::device(features.device()).dtype(at::ScalarType::Int);
  const auto float_opts =
      at::device(features.device()).dtype(at::ScalarType::Float);

  // Zero-initialised output buffers.
  at::Tensor ind = torch::zeros({batch, num_points}, int_opts);
  at::Tensor out = torch::zeros({batch, channels, res_cu}, float_opts);
  at::Tensor cnt = torch::zeros({batch, res_cu}, int_opts);

  // Hand raw device pointers to the CUDA launcher.
  avg_voxelize(batch, channels, num_points, res, res_sq, res_cu,
               coords.data_ptr<int>(), features.data_ptr<float>(),
               ind.data_ptr<int>(), cnt.data_ptr<int>(),
               out.data_ptr<float>());
  return {out, ind, cnt};
}
我们再次跟到控制GPU的函数中,pvcnn-master/modules/functional/src/voxelization/vox.cu:
// Launches the two kernels of average-pool voxelization: first the per-voxel
// point count, then the feature averaging that uses those counts.
void avg_voxelize(int b, int c, int n, int r, int r2, int r3, const int *coords,
                  const float *feat, int *ind, int *cnt, float *out) {
  const auto num_blocks = b;

  // Pass 1: voxel index of every point and number of points per voxel
  // (needed to compute the mean).
  const auto stats_threads = optimal_num_threads(n);
  grid_stats_kernel<<<num_blocks, stats_threads>>>(b, n, r, r2, r3, coords, ind,
                                                   cnt);

  // Pass 2: accumulate the averaged feature of every voxel.
  const auto avg_threads = optimal_num_threads(n);
  avg_voxelize_kernel<<<num_blocks, avg_threads>>>(b, c, n, r3, ind, cnt, feat,
                                                   out);

  CUDA_CHECK_ERRORS();
}
先看第一个函数:
/*
Function: get how many points in each voxel grid, and the voxel index of each point
Args:
b : batch size
n : number of points
r : voxel resolution
r2 : = r * r
r3 : s, voxel cube size = r ** 3
coords : coords of each point, IntTensor[b, 3, n]
ind : (written) voxel index of each point, IntTensor[b, n]
cnt : (incremented atomically) #points in each voxel index, IntTensor[b, s]
*/
__global__ void grid_stats_kernel(int b, int n, int r, int r2, int r3,
const int *__restrict__ coords,
int *__restrict__ ind, int *cnt) {
// The block index selects the batch element.
int batch_index = blockIdx.x;
// stride is the number of threads in the block; the threads of one block
// split the n points of one batch element between them.
int stride = blockDim.x;
int index = threadIdx.x;
// Advance the pointers to the slice that belongs to this batch element.
coords += batch_index * n * 3;
ind += batch_index * n;
cnt += batch_index * r3;
for (int i = index; i < n; i += stride) {
// if (ind[i] == -1)
// continue;
// Flatten the (x, y, z) voxel coordinate of point i into one index;
// x is read from coords[i], y from coords[i + n], z from coords[i + 2n].
ind[i] = coords[i] * r2 + coords[i + n] * r + coords[i + n + n];
// One more point falls into that voxel.
atomicAdd(cnt + ind[i], 1);
}
}
再看第二个函数
/*
Function: average pool voxelization (forward)
Args:
b : batch size
c : #channels
n : number of points
s : voxel cube size = voxel resolution ** 3
ind : voxel index of each point, IntTensor[b, n]
cnt : #points in each voxel index, IntTensor[b, s]
feat: features, FloatTensor[b, c, n]
out : (accumulated atomically) outputs, FloatTensor[b, c, s]
*/
__global__ void avg_voxelize_kernel(int b, int c, int n, int s,
const int *__restrict__ ind,
const int *__restrict__ cnt,
const float *__restrict__ feat,
float *__restrict__ out) {
// The block index selects the batch element; threads stride over its points.
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
// Advance the pointers to the slice that belongs to this batch element.
ind += batch_index * n;
feat += batch_index * c * n;
out += batch_index * c * s;
cnt += batch_index * s;
for (int i = index; i < n; i += stride) {
// Flat index of the voxel that contains point i.
int pos = ind[i];
// if (pos == -1)
// continue;
// Number of points that share this voxel.
int cur_cnt = cnt[pos];
if (cur_cnt > 0) {
float div_cur_cnt = 1.0 / static_cast<float>(cur_cnt);
for (int j = 0; j < c; j++) {
// Add this point's share (feature / count) to channel j of the voxel;
// summed over all points of the voxel this gives the mean.
atomicAdd(out + j * s + pos, feat[j * n + i] * div_cur_cnt);
}
}
}
}
接下来来看看反向传播,有了每个点所在voxel的信息,其实反向传播也很简单理解:
/*
Function: average pool voxelization (backward)
Args:
b : batch size
c : #channels
n : number of points
r3 : voxel cube size = voxel resolution ** 3
ind : voxel index of each point, IntTensor[b, n]
cnt : #points in each voxel index, IntTensor[b, s]
grad_y : grad outputs, FloatTensor[b, c, s]
grad_x : (accumulated) grad inputs, FloatTensor[b, c, n]
*/
__global__ void avg_voxelize_grad_kernel(int b, int c, int n, int r3,
const int *__restrict__ ind,
const int *__restrict__ cnt,
const float *__restrict__ grad_y,
float *__restrict__ grad_x) {
// The block index selects the batch element; threads stride over its points.
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
// Advance the pointers to the slice that belongs to this batch element.
ind += batch_index * n;
grad_x += batch_index * c * n;
grad_y += batch_index * c * r3;
cnt += batch_index * r3;
for (int i = index; i < n; i += stride) {
// Voxel that point i was averaged into during the forward pass.
int pos = ind[i];
// if (pos == -1)
// continue;
int cur_cnt = cnt[pos];
if (cur_cnt > 0) {
float div_cur_cnt = 1.0 / static_cast<float>(cur_cnt);
for (int j = 0; j < c; j++) {
// NOTE (the line the article marks with "注意这里"): chain rule. The voxel
// value is (1 / cnt) * sum of its point features, so each point receives
// the voxel gradient scaled by 1 / cnt.
atomicAdd(grad_x + j * n + i, grad_y[j * r3 + pos] * div_cur_cnt);
}
}
}
}
上面代码中我标注了“注意这里”,这个就是链式法则计算的结果。例如 $c = 0.5 \times (a + b)$,把损失 $L$ 对 $c$ 的梯度记为 $dL/dc$,那么 $dL/da = dL/dc \times dc/da = dL/dc \times 0.5$。
devoxelization用的是三线性插值,也就是用三个维度上的八个临近点插值得到要求取的点。算法可以参考这个博客。这里直接说代码。
首先看pvcnn-master/modules/pvconv.py,是直接调用的trilinear_devoxelize函数,这部分没有封装成模块。
def forward(self, inputs):
    """Run the voxel branch and the point branch and add their outputs.

    ``inputs`` is a ``(features, coords)`` pair. Returns the fused per-point
    features together with the unchanged ``coords``.
    """
    features, coords = inputs
    # Voxel branch: voxelize, process the grid, then interpolate back to points.
    grid, grid_coords = self.voxelization(features, coords)
    grid = self.voxel_layers(grid)
    from_voxels = F.trilinear_devoxelize(grid, grid_coords, self.resolution, self.training)
    # Point branch works directly on the per-point features.
    from_points = self.point_features(features)
    return from_voxels + from_points, coords
跟进去看pvcnn-master/modules/functional/devoxelization.py,这块代码没啥,就是直接调用的cpp。
class TrilinearDevoxelization(Function):
    """Autograd function wrapping the trilinear devoxelization backend."""

    @staticmethod
    def forward(ctx, features, coords, resolution, is_training=True):
        """Interpolate voxel features at the given point coordinates.

        :param ctx: autograd context
        :param coords: the coordinates of points, FloatTensor[B, 3, N]
        :param features: FloatTensor[B, C, R, R, R]
        :param resolution: int, the voxel resolution
        :param is_training: bool, training mode
        :return:
            FloatTensor[B, C, N]
        """
        batch, channels = features.shape[:2]
        # The backend expects the three spatial axes flattened into one.
        flat_features = features.contiguous().view(batch, channels, -1)
        point_coords = coords.contiguous()
        outs, inds, wgts = _backend.trilinear_devoxelize_forward(resolution, is_training, point_coords, flat_features)
        if is_training:
            # Corner indices and weights are reused by backward().
            ctx.save_for_backward(inds, wgts)
            ctx.r = resolution
        return outs

    @staticmethod
    def backward(ctx, grad_output):
        """Scatter point gradients back onto the voxel grid.

        :param ctx: autograd context
        :param grad_output: gradient of outputs, FloatTensor[B, C, N]
        :return:
            gradient of inputs, FloatTensor[B, C, R, R, R]
        """
        inds, wgts = ctx.saved_tensors
        res = ctx.r
        flat_grad = _backend.trilinear_devoxelize_backward(grad_output.contiguous(), inds, wgts, res)
        batch, channels = grad_output.size(0), grad_output.size(1)
        # Only the voxel features receive a gradient.
        return flat_grad.view(batch, channels, res, res, res), None, None, None


trilinear_devoxelize = TrilinearDevoxelization.apply
接下来来看cpp,pvcnn-master/modules/functional/src/interpolate/trilinear_devox.cpp
/*
  Function: trilinear devoxelization (forward), host-side entry point
  Args:
    r           : voxel resolution
    is_training : whether is training mode
    coords      : the coordinates of points, FloatTensor[b, 3, n]
    features    : features, FloatTensor[b, c, s], s = r ** 3
  Return:
    outs : outputs, FloatTensor[b, c, n]
    inds : the voxel coordinates of point cube, IntTensor[b, 8, n]
           (a 1-element placeholder when not training)
    wgts : weight for trilinear interpolation, FloatTensor[b, 8, n]
           (a 1-element placeholder when not training)
*/
std::vector<at::Tensor>
trilinear_devoxelize_forward(const int r, const bool is_training,
                             const at::Tensor coords,
                             const at::Tensor features) {
  // Input validation (order preserved: device, contiguity, dtype).
  CHECK_CUDA(features);
  CHECK_CUDA(coords);
  CHECK_CONTIGUOUS(features);
  CHECK_CONTIGUOUS(coords);
  CHECK_IS_FLOAT(features);
  CHECK_IS_FLOAT(coords);

  const int batch = features.size(0);
  const int channels = features.size(1);
  const int num_points = coords.size(2);
  const int r_sq = r * r;
  const int r_cu = r_sq * r;

  const auto int_opts =
      at::device(features.device()).dtype(at::ScalarType::Int);
  const auto float_opts =
      at::device(features.device()).dtype(at::ScalarType::Float);

  // Output buffer.
  at::Tensor outs = torch::zeros({batch, channels, num_points}, float_opts);

  // Corner indices and weights are only needed by the backward pass, so they
  // get full size in training mode and a 1-element placeholder otherwise.
  at::Tensor inds = is_training
                        ? torch::zeros({batch, 8, num_points}, int_opts)
                        : torch::zeros({1}, int_opts);
  at::Tensor wgts = is_training
                        ? torch::zeros({batch, 8, num_points}, float_opts)
                        : torch::zeros({1}, float_opts);

  trilinear_devoxelize(batch, channels, num_points, r, r_sq, r_cu, is_training,
                       coords.data_ptr<float>(), features.data_ptr<float>(),
                       inds.data_ptr<int>(), wgts.data_ptr<float>(),
                       outs.data_ptr<float>());
  return {outs, inds, wgts};
}
pvcnn-master/modules/functional/src/interpolate/trilinear_devox.cu
// Launches the trilinear devoxelization kernel with b blocks and a block size
// chosen by optimal_num_threads(n), then checks for CUDA errors.
void trilinear_devoxelize(int b, int c, int n, int r, int r2, int r3,
                          bool training, const float *coords, const float *feat,
                          int *inds, float *wgts, float *outs) {
  const auto num_blocks = b;
  const auto threads_per_block = optimal_num_threads(n);
  trilinear_devoxelize_kernel<<<num_blocks, threads_per_block>>>(
      b, c, n, r, r2, r3, training, coords, feat, inds, wgts, outs);
  CUDA_CHECK_ERRORS();
}
接下来就到了最核心的代码。这一块首先得理解如下:voxel_feature是[R, R, R]的,对应着[R, R, R]个voxel。但具体对应的是voxel中的哪个点呢?从代码中看,是对应着每个voxel在三个维度上最小的角点。例如,第一个voxel所覆盖的范围是([0,1), [0,1), [0,1)),最后一个voxel覆盖的范围是([r-1,r), [r-1,r), [r-1,r))。那么第一个voxel_feature对应的点是(0,0,0),最后一个feature所对应的点是(r-1,r-1,r-1)。这其实就有个问题:边界点怎么处理,也就是每个维度上第R个voxel中的点怎么处理。该voxel只有部分角点有feature,例如最后一个voxel只有一个角点有feature。这个就是代码中要解决的问题。
这个特殊情况的解决方法是,在得到点云的坐标的时候,坐标就已经经过clamp,都落在[0,r-1]内。当一个点的坐标是r-1的时候,就说明它在这个维度上的最后一个voxel中。反之亦然,当一个点在某个维度的第R个voxel中,它这个维度的坐标必然是r-1。
/*
Function: trilinear devoxelization (forward)
Args:
b : batch size
c : #channels
n : number of points
r : voxel resolution
r2 : r ** 2
r3 : r ** 3
is_training : when false, inds and wgts are not written
coords : the coordinates of points, FloatTensor[b, 3, n]
feat : features, FloatTensor[b, c, r3]
inds : (written when training) the voxel indices of point cube, IntTensor[b, 8, n]
wgts : (written when training) weight for trilinear interpolation, FloatTensor[b, 8, n]
outs : (written) outputs, FloatTensor[b, c, n]
*/
__global__ void trilinear_devoxelize_kernel(int b, int c, int n, int r, int r2,
int r3, bool is_training,
const float *__restrict__ coords,
const float *__restrict__ feat,
int *__restrict__ inds,
float *__restrict__ wgts,
float *__restrict__ outs) {
// The block index selects the batch element; threads stride over its points.
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
// Advance the pointers to the slice that belongs to this batch element.
coords += batch_index * n * 3;
// When is_training is false the offsets below are still applied, but inds and
// wgts are never dereferenced (see the is_training guard further down).
inds += batch_index * n * 8;
wgts += batch_index * n * 8;
feat += batch_index * c * r3;
outs += batch_index * c * n;
for (int i = index; i < n; i += stride) {
// Coordinates of point i.
float x = coords[i];
float y = coords[i + n];
float z = coords[i + n + n];
// Low corner used for interpolation: (floor(x), floor(y), floor(z)).
float x_lo_f = floorf(x);
float y_lo_f = floorf(y);
float z_lo_f = floorf(z);
// Grid cells have edge length 1, so the fractional part is the distance to
// the low corner and (1 - fractional part) the distance to the high corner.
// NOTE(review): this relies on the caller clamping coords to [0, r - 1]; a
// value that was clamped to r - 1 has fractional part 0, which is what
// keeps the high-corner lookups below inside the grid. Confirm at call site.
float x_d_1 = x - x_lo_f; // / (x_hi_f - x_lo_f + 1e-8f)
float y_d_1 = y - y_lo_f;
float z_d_1 = z - z_lo_f;
float x_d_0 = 1.0f - x_d_1;
float y_d_0 = 1.0f - y_d_1;
float z_d_0 = 1.0f - z_d_1;
// Interpolation weights. In 1-D:
// f(x) = (1 - frac) * f(floor(x)) + frac * f(floor(x) + 1).
// wgtXYZ uses *_d_1 for an axis whose digit is 1 (high corner) and *_d_0
// for a digit of 0 (low corner); wgt000 therefore belongs to
// (floor(x), floor(y), floor(z)).
float wgt000 = x_d_0 * y_d_0 * z_d_0;
float wgt001 = x_d_0 * y_d_0 * z_d_1;
float wgt010 = x_d_0 * y_d_1 * z_d_0;
float wgt011 = x_d_0 * y_d_1 * z_d_1;
float wgt100 = x_d_1 * y_d_0 * z_d_0;
float wgt101 = x_d_1 * y_d_0 * z_d_1;
float wgt110 = x_d_1 * y_d_1 * z_d_0;
float wgt111 = x_d_1 * y_d_1 * z_d_1;
// Integer coordinates of the low corner.
int x_lo = static_cast<int>(x_lo_f);
int y_lo = static_cast<int>(y_lo_f);
int z_lo = static_cast<int>(z_lo_f);
// x_hi and y_hi are bit masks, not offsets: -1 (all bits set) when the
// fractional part is positive, 0 when it is exactly 0.
int x_hi = (x_d_1 > 0) ? -1 : 0;
int y_hi = (y_d_1 > 0) ? -1 : 0;
// z_hi is a plain offset: step 1 to the next z cell, or 0 when the
// fractional part is 0, so no cell past z_lo is read.
int z_hi = (z_d_1 > 0) ? 1 : 0;
int idx000 = x_lo * r2 + y_lo * r + z_lo;
int idx001 = idx000 + z_hi; // x_lo * r2 + y_lo * r + z_hi;
// y_hi == 0 (fractional part 0): y_hi & r == 0, the index stays in the same
// y row and cannot run past the grid.
// y_hi == -1 (all bits set): y_hi & r == r, i.e. a step of one y row.
// x_hi & r2 works the same way for the x axis.
int idx010 = idx000 + (y_hi & r); // x_lo * r2 + y_hi * r + z_lo;
int idx011 = idx010 + z_hi; // x_lo * r2 + y_hi * r + z_hi;
int idx100 = idx000 + (x_hi & r2); // x_hi * r2 + y_lo * r + z_lo;
int idx101 = idx100 + z_hi; // x_hi * r2 + y_lo * r + z_hi;
int idx110 = idx100 + (y_hi & r); // x_hi * r2 + y_hi * r + z_lo;
int idx111 = idx110 + z_hi; // x_hi * r2 + y_hi * r + z_hi;
if (is_training) {
// Only in training mode are the weights and indices stored; the backward
// kernel reads them.
wgts[i] = wgt000;
wgts[i + n] = wgt001;
wgts[i + n * 2] = wgt010;
wgts[i + n * 3] = wgt011;
wgts[i + n * 4] = wgt100;
wgts[i + n * 5] = wgt101;
wgts[i + n * 6] = wgt110;
wgts[i + n * 7] = wgt111;
inds[i] = idx000;
inds[i + n] = idx001;
inds[i + n * 2] = idx010;
inds[i + n * 3] = idx011;
inds[i + n * 4] = idx100;
inds[i + n * 5] = idx101;
inds[i + n * 6] = idx110;
inds[i + n * 7] = idx111;
}
for (int j = 0; j < c; j++) {
int jr3 = j * r3;
// Channel j of point i: weighted sum over the eight corner features.
outs[j * n + i] =
wgt000 * feat[jr3 + idx000] + wgt001 * feat[jr3 + idx001] +
wgt010 * feat[jr3 + idx010] + wgt011 * feat[jr3 + idx011] +
wgt100 * feat[jr3 + idx100] + wgt101 * feat[jr3 + idx101] +
wgt110 * feat[jr3 + idx110] + wgt111 * feat[jr3 + idx111];
}
}
}
/*
Function: trilinear devoxelization (backward)
Args:
b : batch size
c : #channels
n : number of points
r3 : voxel cube size = voxel resolution ** 3
inds : the voxel indices of point cube, IntTensor[b, 8, n]
wgts : weight for trilinear interpolation, FloatTensor[b, 8, n]
grad_y : grad outputs, FloatTensor[b, c, n]
grad_x : (accumulated atomically) grad inputs, FloatTensor[b, c, r3]
*/
__global__ void trilinear_devoxelize_grad_kernel(
int b, int c, int n, int r3, const int *__restrict__ inds,
const float *__restrict__ wgts, const float *__restrict__ grad_y,
float *__restrict__ grad_x) {
// The block index selects the batch element; threads stride over its points.
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
// inds and wgts are the values stored by the forward kernel.
inds += batch_index * n * 8;
wgts += batch_index * n * 8;
grad_x += batch_index * c * r3;
grad_y += batch_index * c * n;
for (int i = index; i < n; i += stride) {
// Flat voxel indices of the eight corners used for point i.
int idx000 = inds[i];
int idx001 = inds[i + n];
int idx010 = inds[i + n * 2];
int idx011 = inds[i + n * 3];
int idx100 = inds[i + n * 4];
int idx101 = inds[i + n * 5];
int idx110 = inds[i + n * 6];
int idx111 = inds[i + n * 7];
// Matching interpolation weights.
float wgt000 = wgts[i];
float wgt001 = wgts[i + n];
float wgt010 = wgts[i + n * 2];
float wgt011 = wgts[i + n * 3];
float wgt100 = wgts[i + n * 4];
float wgt101 = wgts[i + n * 5];
float wgt110 = wgts[i + n * 6];
float wgt111 = wgts[i + n * 7];
for (int j = 0; j < c; j++) {
int jr3 = j * r3;
// Gradient of channel j at point i; each corner receives it scaled by
// the weight that corner had in the forward interpolation.
float g = grad_y[j * n + i];
atomicAdd(grad_x + jr3 + idx000, wgt000 * g);
atomicAdd(grad_x + jr3 + idx001, wgt001 * g);
atomicAdd(grad_x + jr3 + idx010, wgt010 * g);
atomicAdd(grad_x + jr3 + idx011, wgt011 * g);
atomicAdd(grad_x + jr3 + idx100, wgt100 * g);
atomicAdd(grad_x + jr3 + idx101, wgt101 * g);
atomicAdd(grad_x + jr3 + idx110, wgt110 * g);
atomicAdd(grad_x + jr3 + idx111, wgt111 * g);
}
}
}