最近准备入门3D视觉,主要应用于3D点云的深度学习检测。所以从点云处理的开篇之作pointnet入手,定期做做笔记,不然容易忘记,与大家共勉哈。
论文地址:pointnet
源码地址:源码
点云是某个坐标系下的点的数据集。点包含了丰富的信息,包括三维坐标X、Y、Z,以及颜色、分类值、强度值、时间等等。论文中作者主要介绍了点云的三个主要特征:无序性、点与点之间的相互关系、变换不变性。
class STN3d(nn.Module):
    """Predict one 3x3 transform matrix per input point cloud.

    The input is laid out as (B, 3, N), e.g. (32, 3, 2500); the output is
    (B, 3, 3).
    """

    def __init__(self):
        super(STN3d, self).__init__()
        # Conv1d with kernel size 1 applies the same linear map to every point.
        self.conv1 = torch.nn.Conv1d(3, 64, 1)
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 1024, 1)
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 9)
        # Not used by forward() (which calls F.relu); kept so the attribute
        # remains available to existing callers.
        self.relu = nn.ReLU()

        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)

    def forward(self, x):
        """Map a (B, 3, N) batch of point clouds to (B, 3, 3) matrices."""
        x = F.relu(self.bn1(self.conv1(x)))  # (B, 3, N)   -> (B, 64, N)
        x = F.relu(self.bn2(self.conv2(x)))  # (B, 64, N)  -> (B, 128, N)
        x = F.relu(self.bn3(self.conv3(x)))  # (B, 128, N) -> (B, 1024, N)
        # Max over the point axis: one maximum per feature channel.
        x = torch.max(x, 2, keepdim=True)[0]  # (B, 1024, N) -> (B, 1024, 1)
        x = x.view(-1, 1024)                  # (B, 1024, 1) -> (B, 1024)

        x = F.relu(self.bn4(self.fc1(x)))  # (B, 1024) -> (B, 512)
        x = F.relu(self.bn5(self.fc2(x)))  # (B, 512)  -> (B, 256)
        x = self.fc3(x)                    # (B, 256)  -> (B, 9)

        # The flattened 3x3 identity is added to the prediction, so an
        # all-zero fc3 output yields the identity matrix. It is created with
        # x's dtype and device so that half/double models and any device work;
        # the (1, 9) shape broadcasts over the batch.
        identity = torch.eye(3, dtype=x.dtype, device=x.device).view(1, 9)
        x = x + identity
        return x.view(-1, 3, 3)  # (B, 9) -> (B, 3, 3)
class STNkd(nn.Module):
    """Predict one k x k transform matrix per input feature map.

    The input is laid out as (B, k, N); the output is (B, k, k).
    """

    def __init__(self, k=64):
        super(STNkd, self).__init__()
        # Conv1d with kernel size 1 applies the same linear map to every point.
        self.conv1 = torch.nn.Conv1d(k, 64, 1)
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 1024, 1)
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k*k)
        # Not used by forward() (which calls F.relu); kept so the attribute
        # remains available to existing callers.
        self.relu = nn.ReLU()

        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.bn4 = nn.BatchNorm1d(512)
        self.bn5 = nn.BatchNorm1d(256)

        self.k = k

    def forward(self, x):
        """Map a (B, k, N) feature batch to (B, k, k) matrices."""
        x = F.relu(self.bn1(self.conv1(x)))  # (B, k, N)   -> (B, 64, N)
        x = F.relu(self.bn2(self.conv2(x)))  # (B, 64, N)  -> (B, 128, N)
        x = F.relu(self.bn3(self.conv3(x)))  # (B, 128, N) -> (B, 1024, N)
        # Max over the point axis: one maximum per feature channel.
        x = torch.max(x, 2, keepdim=True)[0]  # (B, 1024, N) -> (B, 1024, 1)
        x = x.view(-1, 1024)                  # (B, 1024, 1) -> (B, 1024)

        x = F.relu(self.bn4(self.fc1(x)))  # (B, 1024) -> (B, 512)
        x = F.relu(self.bn5(self.fc2(x)))  # (B, 512)  -> (B, 256)
        x = self.fc3(x)                    # (B, 256)  -> (B, k*k); 4096 for k=64

        # The flattened k x k identity is added to the prediction, so an
        # all-zero fc3 output yields the identity matrix. It is created with
        # x's dtype and device so that half/double models and any device work;
        # the (1, k*k) shape broadcasts over the batch.
        identity = torch.eye(self.k, dtype=x.dtype, device=x.device)
        x = x + identity.view(1, self.k * self.k)
        return x.view(-1, self.k, self.k)  # (B, k*k) -> (B, k, k)
# Max pool, then `global_feat` selects the form of the output -- mainly so
# that later code can tell whether it is doing classification or segmentation.
class PointNetfeat(nn.Module):
    """Feature extractor shared by the classification and segmentation heads.

    With ``global_feat=True`` forward() returns one 1024-d vector per cloud;
    otherwise that vector is copied to every point and concatenated with the
    64-d per-point features, giving 1088 channels per point.
    """

    def __init__(self, global_feat = True, feature_transform = False):
        super().__init__()
        self.stn = STN3d()
        self.conv1 = torch.nn.Conv1d(3, 64, 1)
        self.conv2 = torch.nn.Conv1d(64, 128, 1)
        self.conv3 = torch.nn.Conv1d(128, 1024, 1)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(1024)
        self.global_feat = global_feat
        self.feature_transform = feature_transform
        # The 64x64 feature transform net only exists when requested.
        if self.feature_transform:
            self.fstn = STNkd(k=64)

    @staticmethod
    def _apply_transform(feats, matrices):
        """Right-multiply each point's channel vector by its batch matrix."""
        # (B, C, N) -> (B, N, C) @ (B, C, C) -> (B, N, C) -> (B, C, N)
        return torch.bmm(feats.transpose(2, 1), matrices).transpose(2, 1)

    def forward(self, x):
        """Return ``(features, trans, trans_feat)`` for a (B, 3, N) input.

        ``trans`` is the (B, 3, 3) output of ``self.stn``; ``trans_feat`` is
        the output of ``self.fstn``, or None when the feature transform is
        disabled.
        """
        num_points = x.size(2)  # e.g. 2500

        trans = self.stn(x)                    # (B, 3, 3)
        x = self._apply_transform(x, trans)    # (B, 3, N)
        x = F.relu(self.bn1(self.conv1(x)))    # (B, 64, N)

        trans_feat = None
        if self.feature_transform:
            trans_feat = self.fstn(x)
            x = self._apply_transform(x, trans_feat)

        per_point = x                          # (B, 64, N)
        x = F.relu(self.bn2(self.conv2(x)))    # (B, 128, N)
        x = self.bn3(self.conv3(x))            # (B, 1024, N); no ReLU here
        # Max over the point axis, then flatten to one vector per cloud.
        pooled = torch.max(x, 2, keepdim=True)[0].view(-1, 1024)  # (B, 1024)

        if self.global_feat:
            return pooled, trans, trans_feat

        # Copy the pooled vector to every point and stack it on top of the
        # per-point features: 1024 + 64 = 1088 channels.
        tiled = pooled.view(-1, 1024, 1).repeat(1, 1, num_points)  # (B, 1024, N)
        return torch.cat([tiled, per_point], 1), trans, trans_feat
这里补充个知识点:`max pool`主要是为了解决点云的无序问题。点云在点的维度上被任意打乱时,为了仍然表述同一个物体,最简单的做法就是使用对称函数。论文使用的是Max:无论点的顺序如何变化,最大值都是不会变的。
还有个问题:如果点云特征为2500*3,直接在x、y、z这三个空间维度上进行最大池化,结果就变为1*3,这样做损失的特征太多了。所以论文先将点云的每个点映射到一个冗余的高维空间(例如1024维),再进行max这一对称函数操作,损失的特征就没那么多了。代码中是从[32, 1024, 2500]变为[32, 1024, 1]:32是batchsize的大小,在每个特征维度上对所有点选取最大值。
class PointNetCls(nn.Module):
    """Classification head: global feature -> log-probabilities over k classes."""

    def __init__(self, k=2, feature_transform=False):
        super().__init__()
        self.feature_transform = feature_transform
        self.feat = PointNetfeat(global_feat=True, feature_transform=feature_transform)
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, k)
        self.dropout = nn.Dropout(p=0.3)
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``(log_probs, trans, trans_feat)``; ``log_probs`` is (B, k)."""
        # global_vec is (B, 1024), trans is (B, 3, 3); trans_feat is None
        # unless the feature transform is enabled.
        global_vec, trans, trans_feat = self.feat(x)

        hidden = self.fc1(global_vec)
        hidden = F.relu(self.bn1(hidden))   # (B, 512)

        # Dropout is applied to the fc2 output before batch norm.
        hidden = self.dropout(self.fc2(hidden))
        hidden = F.relu(self.bn2(hidden))   # (B, 256)

        logits = self.fc3(hidden)           # (B, k)
        log_probs = F.log_softmax(logits, dim=1)
        return log_probs, trans, trans_feat
class PointNetDenseCls(nn.Module):
    """Segmentation head: per-point log-probabilities over k classes."""

    def __init__(self, k = 2, feature_transform=False):
        super().__init__()
        self.k = k
        self.feature_transform = feature_transform
        # global_feat=False makes the extractor return 1088 channels per point.
        self.feat = PointNetfeat(global_feat=False, feature_transform=feature_transform)
        self.conv1 = torch.nn.Conv1d(1088, 512, 1)
        self.conv2 = torch.nn.Conv1d(512, 256, 1)
        self.conv3 = torch.nn.Conv1d(256, 128, 1)
        self.conv4 = torch.nn.Conv1d(128, self.k, 1)
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(128)

    def forward(self, x):
        """Return ``(log_probs, trans, trans_feat)``; ``log_probs`` is (B, N, k)."""
        num_clouds = x.size(0)   # e.g. 32
        num_points = x.size(2)   # e.g. 2500

        feats, trans, trans_feat = self.feat(x)       # feats: (B, 1088, N)

        feats = F.relu(self.bn1(self.conv1(feats)))   # (B, 512, N)
        feats = F.relu(self.bn2(self.conv2(feats)))   # (B, 256, N)
        feats = F.relu(self.bn3(self.conv3(feats)))   # (B, 128, N)
        scores = self.conv4(feats)                    # (B, k, N)

        # Put the class axis last, flatten every point into its own row,
        # normalise each row, then restore the (B, N, k) layout.
        scores = scores.transpose(2, 1).contiguous()  # (B, N, k)
        flat = scores.view(-1, self.k)                # (B*N, k)
        log_probs = F.log_softmax(flat, dim=-1)
        log_probs = log_probs.view(num_clouds, num_points, self.k)
        return log_probs, trans, trans_feat
这里其实是自己有些不理解的地方哈哈。下面这段代码,我看有的解释是通过控制最后的loss来对变换矩阵进行调整,但不是太理解具体是怎么调整的。从代码本身看,它计算的是 $\lVert AA^{T}-I\rVert_F$ 在一个batch上的均值($A$ 为变换矩阵,$I$ 为单位矩阵);当 $A$ 为正交矩阵时该值为0。
def feature_transform_regularizer(trans):
    """Return the batch mean of the Frobenius norm of ``trans @ trans^T - I``.

    Args:
        trans: tensor of shape (B, d, d) holding one square matrix per sample
            (d is 3 or k depending on which transform is passed in).

    Returns:
        A scalar tensor. It is 0 when every matrix in the batch is orthogonal.
    """
    d = trans.size()[1]
    # Build the identity with the same dtype and device as `trans` so that the
    # subtraction works for any precision and any device; the leading axis of
    # size 1 broadcasts over the batch.
    identity = torch.eye(d, dtype=trans.dtype, device=trans.device)[None, :, :]
    gram = torch.bmm(trans, trans.transpose(2, 1))  # (B, d, d)
    # torch.norm over dims (1, 2) defaults to the Frobenius norm per matrix.
    return torch.mean(torch.norm(gram - identity, dim=(1, 2)))
同样的还有个地方:生成单位矩阵加到输入上就能起到旋转的作用了?
iden = Variable(torch.from_numpy(np.eye(self.k).flatten().astype(np.float32))).view(1,self.k*self.k).repeat(batchsize,1)
这两个应该是同一类型的问题,脑子里完全没这方面的概念哈哈,下去再找点资料看看吧。也希望有大佬能帮忙解决下,感谢感谢
链接: PointNet原理详解.
点云的无序性_三维点云分类与分割-PointNet
三维深度学习之pointnet系列详解(一)
PointNet:论文总结及pytorch源码详解