The ST-GCN source code comes from https://github.com/yysijie/st-gcn; the comments in the code below reflect my personal understanding, and corrections are welcome.
A Chinese translation of the ST-GCN paper is available at https://blog.csdn.net/qq_46579456/article/details/123756030?spm=1001.2014.3001.5502
graph.py is mainly responsible for building the graph structure, i.e. the adjacency matrix A.
(1) __init__()
def __init__(self,
             layout='openpose',
             strategy='uniform',
             max_hop=1,
             dilation=1):
    # 1-hop neighborhood (the paper only considers 1-hop neighbors)
    self.max_hop = max_hop
    self.dilation = dilation
    # default layout: keypoints produced by the OpenPose toolbox
    self.get_edge(layout)
    # hop distance between every pair of joints
    self.hop_dis = get_hop_distance(
        self.num_node, self.edge, max_hop=max_hop)
    # default strategy: single-subset (uniform) adjacency matrix
    self.get_adjacency(strategy)
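For orientation, here is a minimal usage sketch (assuming the Graph class is importable from net/utils/graph.py, as in the repo layout); with the spatial strategy the adjacency tensor holds one matrix per neighbor subset:

from net.utils.graph import Graph

g = Graph(layout='openpose', strategy='spatial', max_hop=1, dilation=1)
print(g.A.shape)        # (3, 18, 18): one adjacency matrix per neighbor subset
print(g.hop_dis.shape)  # (18, 18): pairwise hop distances (inf beyond max_hop)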
(2) get_edge(): builds the set of all connecting edges (self-links plus 1-hop bone links)
def get_edge(self, layout):
    # skeleton graph for the Kinetics dataset (OpenPose keypoints)
    if layout == 'openpose':
        # 18 joints in total
        self.num_node = 18
        # self-connections of each joint
        self_link = [(i, i) for i in range(self.num_node)]
        neighbor_link = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11),
                         (10, 9), (9, 8), (11, 5), (8, 2), (5, 1), (2, 1),
                         (0, 1), (15, 0), (14, 0), (17, 15), (16, 14)]
        # full edge list
        self.edge = self_link + neighbor_link
        # joint 1 is the center joint
        self.center = 1
    # skeleton graph for the NTU RGB+D dataset
    elif layout == 'ntu-rgb+d':
        # 25 joints in total
        self.num_node = 25
        # self-connections of each joint
        self_link = [(i, i) for i in range(self.num_node)]
        neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21),
                          (6, 5), (7, 6), (8, 7), (9, 21), (10, 9),
                          (11, 10), (12, 11), (13, 1), (14, 13), (15, 14),
                          (16, 15), (17, 1), (18, 17), (19, 18), (20, 19),
                          (22, 23), (23, 8), (24, 25), (25, 12)]
        # convert 1-based joint indices to 0-based
        neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]
        # full edge list
        self.edge = self_link + neighbor_link
        # joint 20 (0-based) is the center joint
        self.center = 21 - 1
    elif layout == 'ntu_edge':
        self.num_node = 24
        self_link = [(i, i) for i in range(self.num_node)]
        neighbor_1base = [(1, 2), (3, 2), (4, 3), (5, 2), (6, 5), (7, 6),
                          (8, 7), (9, 2), (10, 9), (11, 10), (12, 11),
                          (13, 1), (14, 13), (15, 14), (16, 15), (17, 1),
                          (18, 17), (19, 18), (20, 19), (21, 22), (22, 8),
                          (23, 24), (24, 12)]
        neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base]
        self.edge = self_link + neighbor_link
        self.center = 2
    # elif layout == 'customer settings':
    #     pass
    else:
        raise ValueError("Do Not Exist This Layout.")
Taking the Kinetics dataset as an example: the OpenPose toolbox extracts 18 joints, numbered 0-17 (the joint-numbering figure from the original post is omitted here). self_link holds the self-connections (the 0-hop edges), neighbor_link holds the 1-hop bone connections, and their concatenation is the full edge set; joint 1 is chosen as the center joint.
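A quick sanity check of the edge set (same hypothetical import as in the earlier sketch):

g = Graph(layout='openpose')
print(len(g.edge))  # 35 = 18 self-links + 17 bone links
print(g.center)     # 1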
(3) get_hop_distance(): computes the hop-distance matrix between joints
def get_hop_distance(num_node, edge, max_hop=1):
    # adjacency matrix, initialized to zero
    A = np.zeros((num_node, num_node))
    for i, j in edge:
        A[j, i] = 1
        A[i, j] = 1
    # compute hop steps
    hop_dis = np.zeros((num_node, num_node)) + np.inf
    transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)]
    arrive_mat = (np.stack(transfer_mat) > 0)
    # fill in the distance matrix from the largest hop down to 0
    for d in range(max_hop, -1, -1):
        hop_dis[arrive_mat[d]] = d
    return hop_dis
Taking Kinetics as an example: an 18×18 zero matrix is created first, and the entries corresponding to the 0-hop and 1-hop edges from get_edge() are set to 1. np.linalg.matrix_power(A, d) raises A to the d-th power; entry (i, j) is nonzero exactly when there is a walk of length d between i and j. The list comprehension therefore yields the d = 0 and d = 1 reachability matrices (both 18×18), which np.stack() stacks into a 2×18×18 array; the final loop then fills in the distance matrix from the largest d down to 0. (The reverse order matters: a joint can reach itself by walks of length 0, 2, 4, ..., but the shortest distance 0 must win, so larger d values are written first and then overwritten by smaller ones.)
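A minimal worked example on a 3-node chain (assuming get_hop_distance from graph.py is in scope):

import numpy as np

edge = [(0, 0), (1, 1), (2, 2), (0, 1), (1, 2)]  # self-links plus a 3-node chain
print(get_hop_distance(3, edge, max_hop=1))
# [[ 0.  1. inf]
#  [ 1.  0.  1.]
#  [inf  1.  0.]]   node 2 is unreachable from node 0 within max_hop=1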
(4) get_adjacency(): builds the normalized, partitioned adjacency matrix
def get_adjacency(self, strategy):
    # the hop values to keep
    valid_hop = range(0, self.max_hop + 1, self.dilation)
    adjacency = np.zeros((self.num_node, self.num_node))
    # set entries whose hop distance is 0 or 1 to 1
    for hop in valid_hop:
        adjacency[self.hop_dis == hop] = 1
    normalize_adjacency = normalize_digraph(adjacency)
    # uni-labeling partition
    if strategy == 'uniform':
        A = np.zeros((1, self.num_node, self.num_node))
        A[0] = normalize_adjacency
        self.A = A
    # distance partition
    elif strategy == 'distance':
        # split the neighbors into two subsets: hop distance 0 and hop distance 1
        A = np.zeros((len(valid_hop), self.num_node, self.num_node))
        for i, hop in enumerate(valid_hop):
            A[i][self.hop_dis == hop] = normalize_adjacency[self.hop_dis == hop]
        self.A = A
    # spatial-configuration partition
    elif strategy == 'spatial':
        A = []
        for hop in valid_hop:
            a_root = np.zeros((self.num_node, self.num_node))
            a_close = np.zeros((self.num_node, self.num_node))
            a_further = np.zeros((self.num_node, self.num_node))
            for i in range(self.num_node):
                for j in range(self.num_node):
                    if self.hop_dis[j, i] == hop:
                        if self.hop_dis[j, self.center] == self.hop_dis[i, self.center]:
                            a_root[j, i] = normalize_adjacency[j, i]
                        elif self.hop_dis[j, self.center] > self.hop_dis[i, self.center]:
                            a_close[j, i] = normalize_adjacency[j, i]
                        else:
                            a_further[j, i] = normalize_adjacency[j, i]
            if hop == 0:
                # when hop == 0 only j == i is possible, so only a_root has entries
                A.append(a_root)
            else:
                # when hop == 1, j and i may be equidistant from the center,
                # so a_root is merged into a_close
                A.append(a_root + a_close)
                A.append(a_further)
        A = np.stack(A)
        self.A = A
    else:
        raise ValueError("Do Not Exist This Strategy")
Taking Kinetics as an example: the required hop values valid_hop are collected first, then every entry of the adjacency matrix whose hop distance is in valid_hop is set to 1 (here, distances 0 and 1), and the matrix is normalized by normalize_digraph, which divides each column by the number of 1s in it, i.e. by the node degree. Uni-labeling puts all neighboring nodes into a single subset (a 1×18×18 matrix). Distance partitioning splits them into two subsets, the nodes at distance 0 and the nodes at distance 1 (2×18×18). Spatial-configuration partitioning uses three subsets: the node itself, the neighbors closer to the center than the node, and the neighbors further from the center (3×18×18). Note that the graph convolution used in the paper is $f_{out} = \Lambda^{-\frac{1}{2}}(A + I)\Lambda^{-\frac{1}{2}} f_{in} W$, and the output of normalize_digraph here is already the normalized adjacency, computed as $(A + I)\Lambda^{-1}$.
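For reference, the normalization used by the repo (in net/utils/graph.py) is essentially the following; reproduced from memory, so treat it as a sketch:

import numpy as np

def normalize_digraph(A):
    # column sums = degree of each node (self-links included)
    Dl = np.sum(A, 0)
    num_node = A.shape[0]
    Dn = np.zeros((num_node, num_node))
    for i in range(num_node):
        if Dl[i] > 0:
            Dn[i, i] = Dl[i]**(-1)
    # right-multiplying by Dn divides each column of A by its degree
    AD = np.dot(A, Dn)
    return AD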
st_gcn.py contains two classes, Model and st_gcn: Model is the overall network structure, and st_gcn is a single ST-GCN block.
(1)__init__()
def __init__(self, in_channels, num_class, graph_args,
             edge_importance_weighting, **kwargs):
    super().__init__()
    # build the graph and its adjacency matrices
    self.graph = Graph(**graph_args)
    A = torch.tensor(self.graph.A, dtype=torch.float32, requires_grad=False)
    # register A as a buffer: saved with the module, but a constant that is never trained
    self.register_buffer('A', A)
    # build the network
    # the spatial kernel size equals the number of subsets, i.e. 3
    spatial_kernel_size = A.size(0)
    # the temporal kernel size is 9
    temporal_kernel_size = 9
    kernel_size = (temporal_kernel_size, spatial_kernel_size)
    # input batch normalization
    self.data_bn = nn.BatchNorm1d(in_channels * A.size(1))
    kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'}
    # the backbone: 10 stacked ST-GCN blocks
    self.st_gcn_networks = nn.ModuleList((
        st_gcn(in_channels, 64, kernel_size, 1, residual=False, **kwargs0),
        st_gcn(64, 64, kernel_size, 1, **kwargs),
        st_gcn(64, 64, kernel_size, 1, **kwargs),
        st_gcn(64, 64, kernel_size, 1, **kwargs),
        st_gcn(64, 128, kernel_size, 2, **kwargs),
        st_gcn(128, 128, kernel_size, 1, **kwargs),
        st_gcn(128, 128, kernel_size, 1, **kwargs),
        st_gcn(128, 256, kernel_size, 2, **kwargs),
        st_gcn(256, 256, kernel_size, 1, **kwargs),
        st_gcn(256, 256, kernel_size, 1, **kwargs),
    ))
    # edge importance weights
    if edge_importance_weighting:
        # initialized to 1, trainable
        self.edge_importance = nn.ParameterList([
            nn.Parameter(torch.ones(self.A.size()))
            for i in self.st_gcn_networks
        ])
    else:
        # fixed at 1, not trainable, i.e. no weighting w
        self.edge_importance = [1] * len(self.st_gcn_networks)
    # final prediction layer
    self.fcn = nn.Conv2d(256, num_class, kernel_size=1)
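A hypothetical end-to-end usage sketch (assuming the Model class sits in net/st_gcn.py as in the repo layout; num_class=400 matches Kinetics):

import torch
from net.st_gcn import Model

model = Model(in_channels=3, num_class=400,
              graph_args={'layout': 'openpose', 'strategy': 'spatial'},
              edge_importance_weighting=True)
x = torch.randn(2, 3, 300, 18, 2)  # (N, C, T, V, M)
print(model(x).shape)              # torch.Size([2, 400])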
(2)forward()
def forward(self, x):
    # data preprocessing
    N, C, T, V, M = x.size()
    x = x.permute(0, 4, 3, 1, 2).contiguous()
    x = x.view(N * M, V * C, T)
    x = self.data_bn(x)
    x = x.view(N, M, V, C, T)
    x = x.permute(0, 1, 3, 4, 2).contiguous()
    x = x.view(N * M, C, T, V)
    # forward pass
    # note that each block receives self.A * importance, not the raw adjacency self.A,
    # and each "gcn" in the loop is actually a full st_gcn block
    for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):
        x, _ = gcn(x, self.A * importance)
    # global pooling
    x = F.avg_pool2d(x, x.size()[2:])
    x = x.view(N, M, -1, 1, 1).mean(dim=1)
    # prediction
    x = self.fcn(x)
    x = x.view(x.size(0), -1)
    return x
In the preprocessing part of the forward pass, the tensor dimensions are N = batch size, C = number of channels, T = number of frames, V = number of joints, and M = number of persons.
The input to the whole network is a tensor of shape (N = batch_size, C = 3, T = 300, V = 18, M = 2). To feed it to 2D convolutions, whose inputs have shape (n, c, h, w), N and M are merged into (N * M, C, T, V); in this format everything is completely analogous to the 2D case: a CNN kernel covers the two dimensions (h, w), while an ST-GCN kernel covers (T, V).
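Tracing the reshapes with a dummy tensor makes the preprocessing concrete (a self-contained sketch; shapes follow the code above):

import torch

N, C, T, V, M = 2, 3, 300, 18, 2
x = torch.randn(N, C, T, V, M)
x = x.permute(0, 4, 3, 1, 2).contiguous()  # (N, M, V, C, T)
x = x.view(N * M, V * C, T)                # (4, 54, 300): BatchNorm1d input
x = x.view(N, M, V, C, T)
x = x.permute(0, 1, 3, 4, 2).contiguous()  # (N, M, C, T, V)
x = x.view(N * M, C, T, V)                 # (4, 3, 300, 18): 2D-conv input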
(1)class st_gcn()
An ST-GCN block consists of one GCN and one TCN. The GCN is implemented in tgcn.py (see 2.3 tgcn.py below). The TCN has the following structure:
self.tcn = nn.Sequential(
    nn.BatchNorm2d(out_channels),
    nn.ReLU(inplace=True),
    nn.Conv2d(
        out_channels,
        out_channels,
        (kernel_size[0], 1),
        (stride, 1),
        padding,
    ),
    nn.BatchNorm2d(out_channels),
    nn.Dropout(dropout, inplace=True),
)
A TCN layer is therefore BatchNorm2d → ReLU → a (kernel_size[0], 1) temporal convolution → BatchNorm2d → Dropout: it convolves along the time axis only, treating each joint independently.
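In the repo, padding is derived from the temporal kernel size earlier in st_gcn.__init__, so that stride-1 blocks preserve the sequence length; reproduced from memory as a sketch:

assert len(kernel_size) == 2
assert kernel_size[0] % 2 == 1            # the temporal kernel must be odd
padding = ((kernel_size[0] - 1) // 2, 0)  # pad time only; T is preserved when stride == 1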
In addition, the paper also adopts a residual structure:
# residual branch
if not residual:
    self.residual = lambda x: 0
elif (in_channels == out_channels) and (stride == 1):
    self.residual = lambda x: x
else:
    self.residual = nn.Sequential(
        nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=(stride, 1)),
        nn.BatchNorm2d(out_channels),
    )
The forward pass of an ST-GCN block is as follows:
def forward(self, x, A):
    # residual branch
    res = self.residual(x)
    # spatial graph convolution
    x, A = self.gcn(x, A)
    # temporal convolution, plus the residual
    x = self.tcn(x) + res
    # activation
    return self.relu(x), A
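A small sketch of running a single block (assuming st_gcn is importable from net/st_gcn.py; the constructor signature follows the code above):

import torch
from net.st_gcn import st_gcn

block = st_gcn(3, 64, kernel_size=(9, 3), stride=1, residual=False)
x = torch.randn(4, 3, 300, 18)  # (N * M, C, T, V)
A = torch.rand(3, 18, 18)       # one adjacency matrix per neighbor subset
out, A = block(x, A)
print(out.shape)                # torch.Size([4, 64, 300, 18])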
tgcn.py mainly implements the spatial graph convolution.
(1)__init__()
def __init__(self,
             in_channels,
             out_channels,
             kernel_size,
             t_kernel_size=1,
             t_stride=1,
             t_padding=0,
             t_dilation=1,
             bias=True):
    super().__init__()
    self.kernel_size = kernel_size
    self.conv = nn.Conv2d(
        in_channels,
        out_channels * kernel_size,
        kernel_size=(t_kernel_size, 1),
        padding=(t_padding, 0),
        stride=(t_stride, 1),
        dilation=(t_dilation, 1),
        bias=bias)
Because the third (spatial-configuration) partitioning strategy is used, kernel_size here takes the value 3.
(2)forward()
def forward(self, x, A):
    # raise an error if A.size(0) != self.kernel_size
    assert A.size(0) == self.kernel_size
    x = self.conv(x)
    n, kc, t, v = x.size()
    # // is floor division; this reshapes x into kernel_size channel groups
    x = x.view(n, self.kernel_size, kc // self.kernel_size, t, v)
    # matrix-multiply x with A (contraction over the joint dimension)
    x = torch.einsum('nkctv,kvw->nctw', (x, A))
    return x.contiguous(), A
The GCN convolves only the spatial information within each frame. The 1×1 convolution expands the channels into kernel_size = 3 groups, one per neighbor subset, each with its own weights; weighting the three subsets and summing them into the target position is what corresponds to a (3, 1)-kernel convolution. The spatial size of x is unchanged, so the output is still (N * M, C, T, V).
In the forward pass the author uses einsum, an abstract tensor-product notation. The expression 'nkctv,kvw->nctw' computes

$$x^{out}_{n,c,t,w} = \sum_{k}\sum_{v} x_{n,k,c,t,v}\, A_{k,v,w},$$

that is, $f_{out} = \sum_k X_k A_k$: for each neighbor subset k, a matrix product with $A_k$ along the joint dimension, summed over the subsets.
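The equivalence is easy to verify against an explicit loop over the subsets (a self-contained check):

import torch

n, k, c, t, v = 2, 3, 4, 5, 18
x = torch.randn(n, k, c, t, v)
A = torch.randn(k, v, v)

out1 = torch.einsum('nkctv,kvw->nctw', x, A)
out2 = sum(torch.matmul(x[:, i], A[i]) for i in range(k))  # sum_k X_k @ A_k
print(torch.allclose(out1, out2, atol=1e-5))  # True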
In summary, the overall network is: input batch normalization, 10 stacked ST-GCN blocks (channels 64-64-64-64-128-128-128-256-256-256, with temporal stride 2 at the 64→128 and 128→256 transitions), global average pooling, and a 1×1 convolution classifier. (The architecture figure from the original post is omitted here.)