First, download the ST-GCN training code and set up the environment with the following commands:
git clone https://github.com/yysijie/st-gcn.git
cd st-gcn
pip install -r requirements.txt
cd torchlight
python setup.py install
cd ..
Before training, you need to extract the behavior data of the targets in your own dataset following the format of the kinetics-skeleton dataset. The kinetics-skeleton dataset is organized as follows: the kinetics_train and kinetics_val folders store the behavior information of each video, including the pose in every frame and the action label, while the other two JSON files list all file names in the corresponding folder together with the mapping between action labels and indices. Example files are available for download.
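For reference, each per-video JSON file in kinetics_train/kinetics_val is structured roughly as follows (field names follow the public kinetics-skeleton release; the values below are invented):

# one JSON file per video clip, e.g. kinetics_train/xxxxx.json
video_info = {
    "data": [                            # one entry per frame
        {
            "frame_index": 1,
            "skeleton": [                # one entry per detected person
                {
                    "pose": [0.51, 0.32, 0.49, 0.40],  # x1, y1, x2, y2, ... per keypoint
                    "score": [0.83, 0.79],             # confidence per keypoint
                },
            ],
        },
    ],
    "label": "walking",
    "label_index": 1,
}

# kinetics_train_label.json maps each sample name to its label
labels = {
    "xxxxx": {"has_skeleton": True, "label": "walking", "label_index": 1},
}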
The method for extracting and generating a kinetics-skeleton style human behavior dataset is described at the link below:
How to extract behavior data in the kinetics-skeleton format
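As a minimal sketch of that extraction step, assuming you already have per-frame keypoints and confidence scores from your own pose estimator (the function name and arguments are hypothetical):

import json

def save_kinetics_sample(keypoints_per_frame, scores_per_frame,
                         label, label_index, out_path):
    # keypoints_per_frame: list over frames, each a list of (x, y) per keypoint,
    # normalized to [0, 1]; scores_per_frame: the matching confidence values
    data = []
    for idx, (kpts, scores) in enumerate(zip(keypoints_per_frame, scores_per_frame)):
        pose = [v for (x, y) in kpts for v in (x, y)]  # flatten to x1, y1, x2, y2, ...
        data.append({
            "frame_index": idx + 1,
            "skeleton": [{"pose": pose, "score": list(scores)}],  # single target here
        })
    video_info = {"data": data, "label": label, "label_index": label_index}
    with open(out_path, "w") as f:
        json.dump(video_info, f)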
The ST-GCN training code ships with a data conversion script, tools/kinetics_gendata.py; use it to convert the kinetics-skeleton dataset into the .npy and .pkl files used for training.
python tools/kinetics_gendata.py
Note that kinetics_gendata.py must be adapted to your own dataset: num_person_in, num_person_out, and max_frame on lines 37-39, the number of keypoints on line 55, and the data paths on lines 72-83. feeder_kinetics.py has to be modified accordingly, as in the excerpts below.
# excerpt from tools/kinetics_gendata.py
num_person_in=5,    # observe the first 5 persons
num_person_out=2,   # then choose 2 persons with the highest score
max_frame=300):

shape=(len(sample_name), 3, max_frame, 18, num_person_out))  # 18 = keypoint count

# excerpt from feeder/feeder_kinetics.py
# output data shape (N, C, T, V, M)
self.N = len(self.sample_name)   # sample
self.C = 3                       # channel
self.T = 300                     # frame
self.V = 18                      # joint; change to match your layout
self.M = self.num_person_out     # person
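After the conversion it is worth sanity-checking the generated files; a small sketch (the ./data/my_pose paths are placeholders for wherever you wrote the output):

import pickle
import numpy as np

data = np.load('./data/my_pose/train_data.npy')
with open('./data/my_pose/train_label.pkl', 'rb') as f:
    sample_names, labels = pickle.load(f)

# expect (N, C=3, T=max_frame, V=number of keypoints, M=num_person_out)
print(data.shape)
print(len(sample_names), len(labels))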
In the get_edge function in net/utils/graph.py, add an elif branch in which num_node is the number of keypoints and self_link/neighbor_link describe the connections. The snippet below adds a 'my_pose' layout with 20 keypoints (the default pose layout has 18).
Note that if one of the default layouts already matches your pose definition, nothing needs to change; otherwise define your own. Here num_node is the number of keypoints and neighbor_link is the list of keypoint connections. If the new layout does not use 18 points, the later conversion steps may need matching changes to stay consistent.
elif layout == 'my_pose':
    self.num_node = 20
    self_link = [(i, i) for i in range(self.num_node)]
    neighbor_link = [(0, 1), (0, 3), (1, 2), (3, 4), (0, 5), (0, 11),
                     (5, 6), (6, 7), (11, 12), (12, 13),
                     (5, 8), (11, 14), (8, 9), (9, 10), (14, 15), (15, 16),
                     (17, 18), (8, 19), (14, 19), (17, 5), (17, 8),
                     (17, 11), (17, 14)]
    self.edge = self_link + neighbor_link
    self.center = 1
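You can quickly verify the new layout by building the graph and inspecting the adjacency tensor (a sketch; the shape assumes the 20-point 'my_pose' layout above with the spatial strategy):

from net.utils.graph import Graph

graph = Graph(layout='my_pose', strategy='spatial')
# A has shape (K, V, V): K spatial kernels, V = 20 nodes
print(graph.A.shape)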
Modify the relevant parameters in config/st_gcn/kinetics-skeleton/train.yaml (a sketch of the edited file follows this list):
Set data_path and label_path to the paths of the files generated earlier;
set num_class to the number of action classes in your dataset;
set the layout parameter to the layout added above;
set strategy to spatial;
because multi-GPU training raised errors for me, I set device: [0];
window_size can be increased somewhat;
adjust batch_size, the learning rate, the number of epochs, and so on.
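Putting those changes together, the edited train.yaml looks roughly like this (the keys follow the stock kinetics-skeleton config; the paths and hyperparameter values are placeholders from my setup):

work_dir: ./work_dir/recognition/my_pose

# feeder
feeder: feeder.feeder.Feeder
train_feeder_args:
  data_path: ./data/my_pose/train_data.npy
  label_path: ./data/my_pose/train_label.pkl
  random_choose: True
  random_move: True
  window_size: 150            # can be increased for longer clips
test_feeder_args:
  data_path: ./data/my_pose/val_data.npy
  label_path: ./data/my_pose/val_label.pkl

# model
model: net.st_gcn.Model
model_args:
  in_channels: 3
  num_class: 3                # number of action classes in your dataset
  edge_importance_weighting: True
  graph_args:
    layout: 'my_pose'
    strategy: 'spatial'

# training
device: [0]                   # single GPU
batch_size: 32
base_lr: 0.1
num_epoch: 300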
Run the training:
python main.py recognition -c config/st_gcn/kinetics-skeleton/train.yaml
The trained models are written to work_dir; by default a checkpoint is saved every 10 epochs.
After training, the following script can be used to evaluate the model on the validation set. The Model and st_gcn classes are adapted from net/st_gcn.py, with a small test routine added at the bottom.
import pickle

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from net.utils.graph import Graph
from net.utils.tgcn import ConvTemporalGraphical
# from net.st_gcn import Model
class Model(nn.Module):
r"""Spatial temporal graph convolutional networks.
Args:
in_channels (int): Number of channels in the input data
num_class (int): Number of classes for the classification task
edge_importance_weighting (bool): If ``True``, adds a learnable
importance weighting to the edges of the graph
**kwargs (optional): Other parameters for graph convolution units
Shape:
- Input: :math:`(N, in_channels, T_{in}, V_{in}, M_{in})`
- Output: :math:`(N, num_class)` where
            :math:`N` is the batch size,
            :math:`T_{in}` is the length of the input sequence,
            :math:`V_{in}` is the number of graph nodes,
            :math:`M_{in}` is the number of instances in a frame.
"""
def __init__(self, in_channels=3, num_class=3,
edge_importance_weighting=True, **kwargs):
super().__init__()
        # load graph; Graph() uses the default layout here. If you added a custom
        # layout, pass it in, e.g. Graph(layout='my_pose', strategy='spatial')
        self.graph = Graph()
A = torch.tensor(self.graph.A, dtype=torch.float32, requires_grad=False)
self.register_buffer('A', A)
# build networks
spatial_kernel_size = A.size(0)
temporal_kernel_size = 9
kernel_size = (temporal_kernel_size, spatial_kernel_size)
self.data_bn = nn.BatchNorm1d(in_channels * A.size(1))
kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'}
self.st_gcn_networks = nn.ModuleList((
st_gcn(in_channels, 64, kernel_size, 1, residual=False, **kwargs0),
st_gcn(64, 64, kernel_size, 1, **kwargs),
st_gcn(64, 64, kernel_size, 1, **kwargs),
st_gcn(64, 64, kernel_size, 1, **kwargs),
st_gcn(64, 128, kernel_size, 2, **kwargs),
st_gcn(128, 128, kernel_size, 1, **kwargs),
st_gcn(128, 128, kernel_size, 1, **kwargs),
st_gcn(128, 256, kernel_size, 2, **kwargs),
st_gcn(256, 256, kernel_size, 1, **kwargs),
st_gcn(256, 256, kernel_size, 1, **kwargs),
))
# initialize parameters for edge importance weighting
if edge_importance_weighting:
self.edge_importance = nn.ParameterList([
nn.Parameter(torch.ones(self.A.size()))
for i in self.st_gcn_networks
])
else:
self.edge_importance = [1] * len(self.st_gcn_networks)
# fcn for prediction
self.fcn = nn.Conv2d(256, num_class, kernel_size=1)
def forward(self, x):
# data normalization
N, C, T, V, M = x.size()
x = x.permute(0, 4, 3, 1, 2).contiguous()
x = x.view(N * M, V * C, T)
x = self.data_bn(x)
x = x.view(N, M, V, C, T)
x = x.permute(0, 1, 3, 4, 2).contiguous()
x = x.view(N * M, C, T, V)
        # forward
for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):
x, _ = gcn(x, self.A * importance)
# global pooling
x = F.avg_pool2d(x, x.size()[2:])
x = x.view(N, M, -1, 1, 1).mean(dim=1)
# prediction
x = self.fcn(x)
x = x.view(x.size(0), -1)
return x
def extract_feature(self, x):
# data normalization
N, C, T, V, M = x.size()
x = x.permute(0, 4, 3, 1, 2).contiguous()
x = x.view(N * M, V * C, T)
x = self.data_bn(x)
x = x.view(N, M, V, C, T)
x = x.permute(0, 1, 3, 4, 2).contiguous()
x = x.view(N * M, C, T, V)
        # forward
for gcn, importance in zip(self.st_gcn_networks, self.edge_importance):
x, _ = gcn(x, self.A * importance)
_, c, t, v = x.size()
feature = x.view(N, M, c, t, v).permute(0, 2, 3, 4, 1)
# prediction
x = self.fcn(x)
output = x.view(N, M, -1, t, v).permute(0, 2, 3, 4, 1)
return output, feature
class st_gcn(nn.Module):
r"""Applies a spatial temporal graph convolution over an input graph sequence.
Args:
in_channels (int): Number of channels in the input sequence data
out_channels (int): Number of channels produced by the convolution
kernel_size (tuple): Size of the temporal convolving kernel and graph convolving kernel
stride (int, optional): Stride of the temporal convolution. Default: 1
        dropout (float, optional): Dropout rate of the final output. Default: 0
residual (bool, optional): If ``True``, applies a residual mechanism. Default: ``True``
Shape:
- Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format
- Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format
        - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format
- Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format
where
            :math:`N` is the batch size,
            :math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`,
            :math:`T_{in}/T_{out}` is the length of the input/output sequence,
            :math:`V` is the number of graph nodes.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
dropout=0,
residual=True):
super().__init__()
assert len(kernel_size) == 2
assert kernel_size[0] % 2 == 1
padding = ((kernel_size[0] - 1) // 2, 0)
self.gcn = ConvTemporalGraphical(in_channels, out_channels,
kernel_size[1])
self.tcn = nn.Sequential(
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(
out_channels,
out_channels,
(kernel_size[0], 1),
(stride, 1),
padding,
),
nn.BatchNorm2d(out_channels),
nn.Dropout(dropout, inplace=True),
)
if not residual:
self.residual = lambda x: 0
elif (in_channels == out_channels) and (stride == 1):
self.residual = lambda x: x
else:
self.residual = nn.Sequential(
nn.Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=(stride, 1)),
nn.BatchNorm2d(out_channels),
)
self.relu = nn.ReLU(inplace=True)
def forward(self, x, A):
res = self.residual(x)
x, A = self.gcn(x, A)
x = self.tcn(x) + res
return self.relu(x), A
if __name__ == '__main__':
    # set params
    weights_path = './work_dir/recognition/pig18/epoch300_model.pt'
    label_list = ['standing', 'walking', 'laying']
    data_path = "./data/pig18/val_data.npy"
    label_path = "./data/pig18/val_label.pkl"

    model = Model().to('cuda:0')
    weights = torch.load(weights_path)
    model.load_state_dict(weights)
    model.eval()

    val_data = np.load(data_path)
    with open(label_path, 'rb') as f:
        label_data = pickle.load(f)  # (sample_names, labels)

    # per-class counters; the last slot holds the overall totals
    Num_true = [0] * (len(label_list) + 1)
    Num_total = [0] * (len(label_list) + 1)
    '''
    data1 = torch.tensor(val_data[0]).unsqueeze(0)
    data1 = data1.float().to("cuda:0")
    traced_model = torch.jit.trace(model, data1)  # trace the model
    '''
    for i, data in enumerate(val_data):
        data = torch.tensor(data).unsqueeze(0)
        data = data.float().to("cuda:0")
        output = model(data).data.cpu().numpy()[0].tolist()
        pred_index = output.index(max(output))
        label_index = label_data[1][i]
        print("Label/Pred: {}/{}".format(label_list[label_index],
                                         label_list[pred_index]))
        Num_total[label_index] += 1
        Num_total[-1] += 1
        if pred_index == label_index:
            Num_true[label_index] += 1
            Num_true[-1] += 1
    for idx in range(len(label_list)):
        print("Accuracy for {}-{}: {}, TP: {}, Total_num: {}".format(
            idx, label_list[idx], Num_true[idx] / Num_total[idx],
            Num_true[idx], Num_total[idx]))
    print("Total Accuracy: {}".format(Num_true[-1] / Num_total[-1]))
    # traced_model.save("stgcn_torchscript.pt")  # save the traced model