SimDR(现在已经改名叫SimCC,后文还是称SimDR)将姿态估计的Heatmap方法转换为分类方法,在HRNet上实现了涨点,并且减小了显存占用。作者已经在github上开源了代码,但是在MMPose上目前还没有实现,所以本篇文章就使用HRNet在MMPose上实现SimDR。
SimDR原文: Is 2D Heatmap Representation Even Necessary for Human Pose Estimation?
SimDR开源代码:SimCC
因为在MMPose上修改的部分较多,所以文章会分为以下几个部分:
处理头主要负责对网络输出进行处理,例如最后的线性层和损失函数等。拷贝mmpose\models\heads\topdown_heatmap_simple_head.py并重命名为mmpose\models\heads\simDR_head.py,接下来代码会在此基础上进行修改。
创建继承TopdownHeatmapBaseHead的类simDRHead:
import torch
import torch.nn as nn
from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
constant_init, normal_init)
from mmpose.models.builder import build_loss
from mmpose.models.utils.ops import resize
from ..builder import HEADS
from .topdown_heatmap_base_head import TopdownHeatmapBaseHead
from einops import rearrange, repeat
import torch.nn.functional as F
import numpy as np
import cv2
@HEADS.register_module()
class simDRHead(TopdownHeatmapBaseHead):
    """SimDR (SimCC) keypoint head.

    NOTE(review): at this point the class body is only a placeholder
    (``pass``); the arguments listed below describe the constructor that is
    meant to be added to this class and should be confirmed against it.

    Args:
        in_channels (int): Number of input channels
        out_channels (int): Number of output channels
        num_deconv_layers (int): Number of deconv layers.
            num_deconv_layers should >= 0. Note that 0 means
            no deconv layers.
        num_deconv_filters (list|tuple): Number of filters.
            NOTE(review): the original sentence about the required length
            when num_deconv_layers > 0 was truncated -- confirm the intended
            constraint.
        num_deconv_kernels (list|tuple): Kernel sizes.
        in_index (int|Sequence[int]): Input feature index. Default: 0
        input_transform (str|None): Transformation type of input features.
            Options: 'resize_concat', 'multiple_select', None.
            Default: None.

            - 'resize_concat': Multiple feature maps will be resized to the
                same size as the first one and then concat together.
                Usually used in FCN head of HRNet.
            - 'multiple_select': Multiple feature maps will be bundle into
                a list and passed into decode head.
            - None: Only one select feature map is allowed.
        align_corners (bool): align_corners argument of F.interpolate.
            Default: False.
        loss_keypoint (dict): Config for keypoint loss. Default: None.
    """
    pass
同样在mmpose\models\heads\__init__.py添加创建的处理头:
# Copyright (c) OpenMMLab. All rights reserved.
from .ae_higher_resolution_head import AEHigherResolutionHead
from .ae_multi_stage_head import AEMultiStageHead
from .ae_simple_head import AESimpleHead
from .cid_head import CIDHead
from .deconv_head import DeconvHead
from .deeppose_regression_head import DeepposeRegressionHead
from .dekr_head import DEKRHead
from .hmr_head import HMRMeshHead
from .interhand_3d_head import Interhand3DHead
from .mtut_head import MultiModalSSAHead
from .temporal_regression_head import TemporalRegressionHead
from .topdown_heatmap_base_head import TopdownHeatmapBaseHead
from .topdown_heatmap_multi_stage_head import (TopdownHeatmapMSMUHead,
TopdownHeatmapMultiStageHead)
from .topdown_heatmap_simple_head import TopdownHeatmapSimpleHead
from .vipnas_heatmap_simple_head import ViPNASHeatmapSimpleHead
from .voxelpose_head import CuboidCenterHead, CuboidPoseHead
from .simDR_head import simDRHead
__all__ = [
'TopdownHeatmapSimpleHead', 'TopdownHeatmapMultiStageHead',
'TopdownHeatmapMSMUHead', 'TopdownHeatmapBaseHead',
'AEHigherResolutionHead', 'AESimpleHead', 'AEMultiStageHead', 'CIDHead',
'DeepposeRegressionHead', 'TemporalRegressionHead', 'Interhand3DHead',
'HMRMeshHead', 'DeconvHead', 'ViPNASHeatmapSimpleHead', 'CuboidCenterHead',
'CuboidPoseHead', 'MultiModalSSAHead', 'DEKRHead','simDRHead'
]
这样就能在配置文件中直接调用我们创建的处理头了。
2.添加evaluation需要的函数
因为处理头涉及到损失和验证函数,所以需要对验证函数做些许修改。验证函数本来位于mmpose.core.evaluation中,但是为了不修改mmpose源码,我将验证函数直接写到了处理头文件里。想要更加标准化的话,可以自定义一个evaluation.py放在core文件夹里。
def transform_preds(coords, center, scale, output_size):
    """Map keypoint coordinates through the inverse affine transform.

    The transform is obtained from ``get_affine_transform`` with ``rot=0``
    and ``inv=1`` and applied to the first two columns of every row.

    Args:
        coords: Array-like with a ``shape`` attribute; rows are keypoints and
            the first two columns are the coordinates to transform.
        center: Forwarded to ``get_affine_transform``.
        scale: Forwarded to ``get_affine_transform``.
        output_size: Forwarded to ``get_affine_transform``.

    Returns:
        np.ndarray: Array with the same shape as ``coords``; columns 0-1 hold
        the transformed coordinates, remaining columns (if any) stay zero.
    """
    inverse_trans = get_affine_transform(
        center, scale, 0, output_size, inv=1)
    mapped = np.zeros(coords.shape)
    for row in range(coords.shape[0]):
        point = coords[row, 0:2]
        mapped[row, 0:2] = affine_transform(point, inverse_trans)
    return mapped
def get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    """Build the 2x3 affine matrix between a source box and the output grid.

    Three point pairs are constructed (box centre, a point offset from the
    centre along the rotated "up" direction, and a third point perpendicular
    to those two) and handed to ``cv2.getAffineTransform``.

    Args:
        center: Box centre; added to NumPy arrays, so it must broadcast
            against a length-2 array.
        scale: Box scale. A scalar is duplicated for both axes; a list or
            tuple is converted to an array. It is multiplied by ``200.0``
            before use.
        rot: Rotation in degrees.
        output_size: Indexable; ``output_size[0]`` and ``output_size[1]``
            are the destination width and height.
        shift: Offset, multiplied by the scaled box size and added to the
            source points. The default array is only read, never modified.
        inv: When truthy, return the destination -> source transform
            instead of source -> destination.

    Returns:
        The matrix returned by ``cv2.getAffineTransform``.
    """
    # A list passed the original type check untouched and then failed on
    # ``scale * 200.0``; convert every non-array input to an array instead.
    if isinstance(scale, (list, tuple)):
        scale = np.asarray(scale)
    elif not isinstance(scale, np.ndarray):
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    # The third pair is derived from the first two so the points are not
    # collinear.
    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        return cv2.getAffineTransform(np.float32(dst), np.float32(src))
    return cv2.getAffineTransform(np.float32(src), np.float32(dst))
def affine_transform(pt, t):
    """Apply the affine matrix ``t`` to the 2-D point ``pt``.

    The point is lifted to homogeneous form ``[x, y, 1]`` before the
    product; the first two entries of the result are returned.
    """
    homogeneous = np.array([pt[0], pt[1], 1.]).T
    transformed = np.dot(t, homogeneous)
    return transformed[:2]
def get_3rd_point(a, b):
    """Return ``b`` plus the vector ``a - b`` rotated by 90 degrees.

    The rotated vector is ``(-dy, dx)`` for ``(dx, dy) = a - b`` and is
    built as a float32 array.
    """
    delta = a - b
    perpendicular = np.array([-delta[1], delta[0]], dtype=np.float32)
    return b + perpendicular
def get_dir(src_point, rot_rad):
    """Rotate the 2-D point ``src_point`` by ``rot_rad`` radians.

    Returns:
        list: ``[x * cos - y * sin, x * sin + y * cos]``.
    """
    sin_r = np.sin(rot_rad)
    cos_r = np.cos(rot_rad)
    x, y = src_point[0], src_point[1]
    rotated_x = x * cos_r - y * sin_r
    rotated_y = x * sin_r + y * cos_r
    return [rotated_x, rotated_y]
def flip_back_simdr(output_flipped, matched_parts, type='x'):
    """Undo a horizontal flip on 1-D SimDR outputs.

    For ``type == 'x'`` the last axis is reversed first. For every pair in
    ``matched_parts`` the two joint rows are then exchanged in place.

    Args:
        output_flipped (np.ndarray): Array of shape
            (batch_size, num_joints, onehot).
        matched_parts: Iterable of index pairs ``(left, right)``.
        type (str): ``'x'`` reverses the last axis; any other value only
            swaps the paired joints.

    Returns:
        np.ndarray: The (possibly reversed view of the) input with paired
        joints swapped.
    """
    assert output_flipped.ndim == 3,\
        'output_flipped should be [batch_size, num_joints, onehot]'

    if type == 'x':
        output_flipped = output_flipped[:, :, ::-1]

    for pair in matched_parts:
        first, second = pair[0], pair[1]
        # Fancy indexing on the right-hand side produces a copy, so both
        # rows can be exchanged in a single assignment.
        output_flipped[:, [first, second], :] = \
            output_flipped[:, [second, first], :]

    return output_flipped
def _calc_distances(preds, targets, mask, normalize):
"""Calculate the normalized distances between preds and target.
Note:
batch_size: N
num_keypoints: K
dimension of keypoints: D (normally, D=2 or D=3)
Args:
preds (np.ndarray[N, K, D]): Predicted keypoint location.
targets (np.ndarray[N, K, D]): Groundtruth keypoint location.
mask (np.ndarray[N, K]): Visibility of the target. False for invisible
joints, and True for visible. Invisible joints will be ignored for
accuracy calculation.
normalize (np.ndarray[N, D]): Typical value is heatmap_size
Returns:
np.ndarray[K, N]: The normalized distances. \
If target keypoints are missing, the distance is -1.
"""
N, K, _ = preds.shape
# set mask=0 when normalize==0
_mask = mask.copy()
_mask[np.where((normalize == 0).sum(1))[0], :] = False
distances = np.full((N, K), -1, dtype=np.float32)
# handle invalid values
normalize[np.where(normalize <= 0)] = 1e6
distances[_mask] = np.linalg.norm(
((preds - targets) / normalize[:, None, :])[_mask], axis=-1)
return distances.T
def _distance_acc(distances, thr=0.5):
"""Return the percentage below the distance threshold, while ignoring
distances values with -1.
Note:
batch_size: N
Args:
distances (np.ndarray[N, ]): The normalized distances.
thr (float): Threshold of the distances.
Returns:
float: Percentage of distances below the threshold. \
If all target keypoints are missing, return -1.
"""
distance_valid = distances != -1
num_distance_valid = distance_valid.sum()
if num_distance_valid > 0:
return (distances[distance_valid] < thr).sum() / num_distance_valid
return -1
def keypoint_pck_accuracy(pred, gt, mask, thr, normalize):
    """Compute per-keypoint and averaged PCK accuracy from coordinates.

    Note:
        PCK metric measures accuracy of the localization of the body joints.
        The distances between predicted positions and the ground-truth ones
        are typically normalized by the bounding box size.
        The threshold (thr) of the normalized distance is commonly set
        as 0.05, 0.1 or 0.2 etc.

        - batch_size: N
        - num_keypoints: K

    Args:
        pred (np.ndarray[N, K, 2]): Predicted keypoint location.
        gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        thr (float): Threshold of PCK calculation.
        normalize (np.ndarray[N, 2]): Normalization factor for H&W.

    Returns:
        tuple: A tuple containing keypoint accuracy.

        - acc (np.ndarray[K]): Accuracy of each keypoint.
        - avg_acc (float): Averaged accuracy across all keypoints.
        - cnt (int): Number of valid keypoints.
    """
    per_keypoint = _calc_distances(pred, gt, mask, normalize)
    acc = np.array(
        [_distance_acc(keypoint_dist, thr) for keypoint_dist in per_keypoint])

    # Keypoints without any valid sample report -1 and are left out of the
    # average.
    usable = acc[acc >= 0]
    cnt = len(usable)
    if cnt > 0:
        avg_acc = usable.mean()
    else:
        avg_acc = 0
    return acc, avg_acc, cnt
def _get_max_preds(heatmaps):
"""Get keypoint predictions from score maps.
Note:
batch_size: N
num_keypoints: K
heatmap height: H
heatmap width: W
Args:
heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps.
Returns:
tuple: A tuple containing aggregated results.
- preds (np.ndarray[N, K, 2]): Predicted keypoint location.
- maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
"""
assert isinstance(heatmaps,
np.ndarray), ('heatmaps should be numpy.ndarray')
assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'
N, K, _, W = heatmaps.shape
heatmaps_reshaped = heatmaps.reshape((N, K, -1))
idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1))
maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1))
preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
preds[:, :, 0] = preds[:, :, 0] % W
preds[:, :, 1] = preds[:, :, 1] // W
preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1)
return preds, maxvals
def _simdr_argmax_coords(vec_x, vec_y, split_ratio):
    """Turn per-axis score vectors into coordinates (argmax / split_ratio).

    Args:
        vec_x (torch.Tensor[N, K, Lx]): Scores along the first axis.
        vec_y (torch.Tensor[N, K, Ly]): Scores along the second axis.
        split_ratio: Divisor applied to the argmax indices.

    Returns:
        np.ndarray[N, K, 2]: Coordinates, first axis then second axis.
    """
    # argmax keeps the (N, K) shape for every N and K, unlike an
    # argument-less squeeze which drops any axis of length 1.
    idx = torch.stack((vec_x.argmax(dim=2), vec_y.argmax(dim=2)), dim=-1)
    return torch.true_divide(idx, split_ratio).detach().cpu().numpy()


def pose_pck_accuracy_DR(output, target, mask, cfg, thr=0.05, normalize=None):
    """Calculate PCK accuracy for SimDR style (per-axis) predictions.

    Note:
        PCK metric measures accuracy of the localization of the body joints.
        The distances between predicted positions and the ground-truth ones
        are typically normalized by the bounding box size.
        The threshold (thr) of the normalized distance is commonly set
        as 0.05, 0.1 or 0.2 etc.

        - batch_size: N
        - num_keypoints: K

    Args:
        output (tuple|list): ``(output_x, output_y)`` tensors of shape
            [N, K, L]; the predicted index along each axis is the argmax
            over the last dimension.
        target (tuple|list|torch.Tensor): Either ``(target_x, target_y)``
            tensors decoded like ``output``, or a heatmap tensor
            [N, K, H, W] decoded with ``_get_max_preds``.
            NOTE(review): in the heatmap case the ground truth is expressed
            in heatmap indices while predictions are indices divided by the
            split ratio -- confirm both use the same coordinate scale.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        cfg (dict): Must provide ``'SIMDR_SPLIT_RATIO'``; ``'image_size'``
            is read when ``normalize`` is None.
        thr (float): Threshold of PCK calculation. Default 0.05.
        normalize (np.ndarray[N, 2]): Normalization factor per coordinate.
            Defaults to the two entries of ``cfg['image_size']`` repeated
            for every sample.

    Returns:
        tuple: A tuple containing keypoint accuracy.

        - np.ndarray[K]: Accuracy of each keypoint.
        - float: Averaged accuracy across all keypoints.
        - int: Number of valid keypoints.

    Raises:
        TypeError: If ``output`` is not an ``(output_x, output_y)`` pair.
    """
    if not isinstance(output, (tuple, list)):
        # The original fell through both branches here and failed later on
        # undefined names; fail early with an actionable message instead.
        raise TypeError(
            'pose_pck_accuracy_DR expects output to be a tuple '
            f'(output_x, output_y), got {type(output).__name__}')

    split_ratio = cfg['SIMDR_SPLIT_RATIO']
    output_x, output_y = output
    # Softmax is monotonic, so the argmax of the raw scores is used directly.
    pred = _simdr_argmax_coords(output_x, output_y, split_ratio)

    if isinstance(target, (tuple, list)):
        target_x, target_y = target
        gt = _simdr_argmax_coords(target_x, target_y, split_ratio)
    else:
        gt, _ = _get_max_preds(target.detach().cpu().numpy())

    N, K, _ = pred.shape
    if K == 0:
        return None, 0, 0
    if normalize is None:
        size_0, size_1 = cfg['image_size']
        normalize = np.tile(np.array([[size_0, size_1]]), (N, 1))
    return keypoint_pck_accuracy(pred, gt, mask, thr, normalize)
以上函数我写在了类外,其实更建议写在类里。下面要开始编写处理头类的内容了。
因为simDR需要传入额外的几个参数,所以在编写配置文件时,我们在extra中加入了几行:
keypoint_head=dict(
type='simDRHead',
in_channels=40,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(
final_conv_kernel=1,
HEAD_INPUT=data_cfg['heatmap_size'][0]*data_cfg['heatmap_size'][1],
image_size=data_cfg['image_size'],
SIMDR_SPLIT_RATIO=simdr_split_ratio,
coord_representation='sa-simdr',
NUM_JOINTS=channel_cfg['dataset_joints']),
loss_keypoint=dict(type='KLDiscretLoss')),
配置文件的参数会传入处理头的__init__函数:
def __init__(self,
             in_channels,
             out_channels,
             num_deconv_layers=3,
             num_deconv_filters=(256, 256, 256),
             num_deconv_kernels=(4, 4, 4),
             extra=None,
             in_index=0,
             input_transform=None,
             align_corners=False,
             loss_keypoint=None,
             train_cfg=None,
             test_cfg=None):
    """Build the deconv stack, the final conv layers and the SimDR heads.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels of the final conv.
        num_deconv_layers (int): Number of deconv layers; 0 disables them,
            negative values raise ``ValueError``.
        num_deconv_filters (list|tuple): Filters of the deconv layers; the
            last entry is the input width of the final conv when deconv
            layers are used.
        num_deconv_kernels (list|tuple): Kernel sizes of the deconv layers.
        extra (dict): Head options. ``'coord_representation'`` is required
            and must be one of ``'simdr'``, ``'sa-simdr'``, ``'heatmap'``.
            For the two SimDR variants ``'HEAD_INPUT'``, ``'image_size'``
            and ``'SIMDR_SPLIT_RATIO'`` are read to size the two linear
            layers. Optional keys: ``'final_conv_kernel'`` (0, 1 or 3),
            ``'num_conv_layers'``, ``'num_conv_kernels'``.
        in_index (int|Sequence[int]): Input feature index.
        input_transform (str|None): Forwarded to ``_init_inputs``.
        align_corners (bool): Stored on the instance.
        loss_keypoint (dict): Config passed to ``build_loss``.
        train_cfg (dict|None): Stored; ``None`` becomes ``{}``.
        test_cfg (dict|None): Stored; ``None`` becomes ``{}``.

    Raises:
        TypeError: If ``extra`` is neither a dict nor None.
        ValueError: If ``extra`` does not provide ``'coord_representation'``
            or if ``num_deconv_layers`` is negative.
    """
    super().__init__()

    # Validate ``extra`` before building anything. The original accepted
    # ``extra=None`` and then failed with an opaque TypeError on
    # ``extra['coord_representation']`` after all layers had been built.
    if extra is not None and not isinstance(extra, dict):
        raise TypeError('extra should be dict or None.')
    if extra is None or 'coord_representation' not in extra:
        raise ValueError(
            "extra must be a dict providing 'coord_representation' "
            "('simdr', 'sa-simdr' or 'heatmap').")

    self.in_channels = in_channels
    self.loss = build_loss(loss_keypoint)

    self.train_cfg = {} if train_cfg is None else train_cfg
    self.test_cfg = {} if test_cfg is None else test_cfg
    self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap')

    self._init_inputs(in_channels, in_index, input_transform)
    self.in_index = in_index
    self.align_corners = align_corners

    if num_deconv_layers > 0:
        self.deconv_layers = self._make_deconv_layer(
            num_deconv_layers,
            num_deconv_filters,
            num_deconv_kernels,
        )
    elif num_deconv_layers == 0:
        self.deconv_layers = nn.Identity()
    else:
        raise ValueError(
            f'num_deconv_layers ({num_deconv_layers}) should >= 0.')

    identity_final_layer = False
    if 'final_conv_kernel' in extra:
        assert extra['final_conv_kernel'] in [0, 1, 3]
        if extra['final_conv_kernel'] == 3:
            padding = 1
        elif extra['final_conv_kernel'] == 1:
            padding = 0
        else:
            # 0 for Identity mapping.
            identity_final_layer = True
        kernel_size = extra['final_conv_kernel']
    else:
        kernel_size = 1
        padding = 0

    if identity_final_layer:
        self.final_layer = nn.Identity()
    else:
        conv_channels = num_deconv_filters[
            -1] if num_deconv_layers > 0 else self.in_channels

        layers = []
        num_conv_layers = extra.get('num_conv_layers', 0)
        num_conv_kernels = extra.get('num_conv_kernels',
                                     [1] * num_conv_layers)

        for i in range(num_conv_layers):
            layers.append(
                build_conv_layer(
                    dict(type='Conv2d'),
                    in_channels=conv_channels,
                    out_channels=conv_channels,
                    kernel_size=num_conv_kernels[i],
                    stride=1,
                    padding=(num_conv_kernels[i] - 1) // 2))
            layers.append(
                build_norm_layer(dict(type='BN'), conv_channels)[1])
            layers.append(nn.ReLU(inplace=True))

        layers.append(
            build_conv_layer(
                cfg=dict(type='Conv2d'),
                in_channels=conv_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=1,
                padding=padding))

        if len(layers) > 1:
            self.final_layer = nn.Sequential(*layers)
        else:
            self.final_layer = layers[0]

    self.extra = extra
    self.coord_representation = extra['coord_representation']
    assert self.coord_representation in ['simdr', 'sa-simdr', 'heatmap'], 'only simdr and sa-simdr and heatmap supported for pose_resnet_upfree'
    if self.coord_representation == 'simdr' or self.coord_representation == 'sa-simdr':
        # One linear classifier per axis; the number of bins is the image
        # extent along that axis times the split ratio.
        self.mlp_head_x = nn.Linear(self.extra['HEAD_INPUT'], int(self.extra['image_size'][0]*self.extra['SIMDR_SPLIT_RATIO']))
        self.mlp_head_y = nn.Linear(self.extra['HEAD_INPUT'], int(self.extra['image_size'][1]*self.extra['SIMDR_SPLIT_RATIO']))
其实与TopdownHeatmapSimpleHead类的内容大同小异,只是根据simDR的源码添加了几行,主要是线性层的创建。
simDR类中的get_loss()与TopdownHeatmapSimpleHead类一样不需要修改,这里要修改的是损失函数。
在mmpose\models\losses路径新建simDR_loss.py文件:
import torch
import torch.nn as nn
from ..builder import LOSSES
@LOSSES.register_module()
class KLDiscretLoss(nn.Module):
    """KL-divergence loss between per-axis logits and target distributions.

    For every joint the logits along each axis go through ``LogSoftmax``
    and are compared with the target using ``KLDivLoss(reduction='none')``.
    The element-wise loss is averaged over the bins, weighted per sample,
    averaged over the batch, summed over both axes and all joints and
    finally divided by the number of joints.
    """

    def __init__(self):
        super(KLDiscretLoss, self).__init__()
        self.LogSoftmax = nn.LogSoftmax(dim=1)  # [B,LOGITS]
        self.criterion_ = nn.KLDivLoss(reduction='none')

    def criterion(self, dec_outs, labels):
        """Return the per-sample KL loss, averaged over the bin axis.

        Args:
            dec_outs (torch.Tensor[B, L]): Raw logits.
            labels (torch.Tensor[B, L]): Target values, same shape.

        Returns:
            torch.Tensor[B]: One loss value per sample.
        """
        scores = self.LogSoftmax(dec_outs)
        return torch.mean(self.criterion_(scores, labels), dim=1)

    def forward(self, output, target, target_weight):
        """Compute the joint-averaged loss.

        Args:
            output: Pair ``(output_x, output_y)`` of tensors [B, K, L].
            target: Pair ``(target_x, target_y)`` with matching shapes.
            target_weight (torch.Tensor): Per-joint weights; each
                ``target_weight[:, idx]`` is flattened to one weight per
                sample.

        Returns:
            torch.Tensor: Scalar loss.
        """
        output_x, output_y = output[0], output[1]
        target_x, target_y = target[0], target[1]
        num_joints = output_x.size(1)
        loss = 0

        for idx in range(num_joints):
            # Index without squeeze(): an argument-less squeeze also removes
            # the batch axis when B == 1 and then breaks LogSoftmax(dim=1).
            weight = target_weight[:, idx].reshape(-1)
            loss_x = self.criterion(output_x[:, idx], target_x[:, idx])
            loss_y = self.criterion(output_y[:, idx], target_y[:, idx])
            loss += loss_x.mul(weight).mean()
            loss += loss_y.mul(weight).mean()
        return loss / num_joints
@LOSSES.register_module()
class NMTNORMCritierion(nn.Module):
    """Classification loss over coordinate bins, normalised by bin count.

    With ``label_smoothing > 0`` the integer labels are turned into
    smoothed one-hot distributions and compared with the log-softmax scores
    through ``KLDivLoss``; the element-wise loss is averaged over the bins.
    Without smoothing ``NLLLoss`` is used and its per-sample value is
    divided by the number of bins, which is what the smoothed loss tends to
    as the smoothing goes to zero.

    Args:
        label_smoothing (float): Probability mass spread over the
            non-target bins. Default: 0.0.
    """

    def __init__(self, label_smoothing=0.0):
        super(NMTNORMCritierion, self).__init__()
        self.label_smoothing = label_smoothing
        self.LogSoftmax = nn.LogSoftmax(dim=1)  # [B,LOGITS]

        if label_smoothing > 0:
            self.criterion_ = nn.KLDivLoss(reduction='none')
        else:
            self.criterion_ = nn.NLLLoss(reduction='none', ignore_index=100000)
        self.confidence = 1.0 - label_smoothing

    def _smooth_label(self, num_tokens):
        """Return a [1, num_tokens] row filled with the smoothing mass."""
        return torch.full((1, num_tokens),
                          self.label_smoothing / (num_tokens - 1))

    def _bottle(self, v):
        """Collapse all leading dimensions of ``v`` into one."""
        return v.view(-1, v.size(2))

    def criterion(self, dec_outs, labels):
        """Per-sample loss for one axis of one joint.

        Args:
            dec_outs (torch.Tensor[B, L]): Raw logits.
            labels (torch.Tensor[B]): Integer bin index per sample.

        Returns:
            torch.Tensor[B]: One loss value per sample.
        """
        scores = self.LogSoftmax(dec_outs)
        num_tokens = scores.size(-1)
        # scatter_ and NLLLoss both need int64 indices.
        gtruth = labels.reshape(-1).long()

        if self.label_smoothing > 0:
            # Build the smoothed target on the same device as the labels
            # instead of assuming the default CUDA device.
            one_hot = self._smooth_label(num_tokens).to(gtruth.device)
            smoothed = one_hot.repeat(gtruth.size(0), 1)  # [N, M]
            smoothed.scatter_(1, gtruth.detach().unsqueeze(1),
                              self.confidence)
            return torch.mean(
                self.criterion_(scores, smoothed.detach()), dim=1)

        # NLLLoss(reduction='none') is already one value per sample, so the
        # original mean over dim=1 failed here; normalise by the bin count
        # to stay consistent with the smoothed branch.
        return self.criterion_(scores, gtruth) / num_tokens

    def forward(self, output, target, target_weight):
        """Compute the joint-averaged loss.

        Args:
            output: Pair ``(output_x, output_y)`` of tensors [B, K, L].
            target (torch.Tensor): Integer bin indices; ``target[:, idx]``
                is indexed with ``[:, 0]`` (x) and ``[:, 1]`` (y).
            target_weight (torch.Tensor): Per-joint weights; each
                ``target_weight[:, idx]`` is flattened to one weight per
                sample.

        Returns:
            torch.Tensor: Scalar loss.
        """
        output_x, output_y = output
        num_joints = output_x.size(1)
        loss = 0

        for idx in range(num_joints):
            # No squeeze(): it would drop the batch axis when B == 1.
            coord_gt = target[:, idx]
            weight = target_weight[:, idx].reshape(-1)
            loss_x = self.criterion(output_x[:, idx], coord_gt[:, 0])
            loss_y = self.criterion(output_y[:, idx], coord_gt[:, 1])
            loss += loss_x.mul(weight).mean()
            loss += loss_y.mul(weight).mean()
        return loss / num_joints
@LOSSES.register_module()
class NMTCritierion(nn.Module):
    """Classification loss over coordinate bins, summed over the bins.

    With ``label_smoothing > 0`` the integer labels are turned into
    smoothed one-hot distributions and compared with the log-softmax scores
    through ``KLDivLoss``; the element-wise loss is summed over the bins.
    Without smoothing ``NLLLoss`` is used directly. The final value is the
    weighted sum over samples, axes and joints divided by the batch size.

    Args:
        label_smoothing (float): Probability mass spread over the
            non-target bins. Default: 0.0.
    """

    def __init__(self, label_smoothing=0.0):
        super(NMTCritierion, self).__init__()
        self.label_smoothing = label_smoothing
        self.LogSoftmax = nn.LogSoftmax(dim=1)  # [B,LOGITS]

        if label_smoothing > 0:
            self.criterion_ = nn.KLDivLoss(reduction='none')
        else:
            self.criterion_ = nn.NLLLoss(reduction='none', ignore_index=100000)
        self.confidence = 1.0 - label_smoothing

    def _smooth_label(self, num_tokens):
        """Return a [1, num_tokens] row filled with the smoothing mass."""
        return torch.full((1, num_tokens),
                          self.label_smoothing / (num_tokens - 1))

    def _bottle(self, v):
        """Collapse all leading dimensions of ``v`` into one."""
        return v.view(-1, v.size(2))

    def criterion(self, dec_outs, labels):
        """Per-sample loss for one axis of one joint.

        Args:
            dec_outs (torch.Tensor[B, L]): Raw logits.
            labels (torch.Tensor[B]): Integer bin index per sample.

        Returns:
            torch.Tensor[B]: One loss value per sample.
        """
        scores = self.LogSoftmax(dec_outs)
        num_tokens = scores.size(-1)
        # scatter_ and NLLLoss both need int64 indices.
        gtruth = labels.reshape(-1).long()

        if self.label_smoothing > 0:
            # Build the smoothed target on the same device as the labels
            # instead of assuming the default CUDA device.
            one_hot = self._smooth_label(num_tokens).to(gtruth.device)
            smoothed = one_hot.repeat(gtruth.size(0), 1)  # [N, M]
            smoothed.scatter_(1, gtruth.detach().unsqueeze(1),
                              self.confidence)
            return torch.sum(
                self.criterion_(scores, smoothed.detach()), dim=1)

        # NLLLoss(reduction='none') is already one value per sample; the
        # original sum over dim=1 failed on this 1-D result.
        return self.criterion_(scores, gtruth)

    def forward(self, output, target, target_weight):
        """Compute the batch-averaged loss.

        Args:
            output: Pair ``(output_x, output_y)`` of tensors [B, K, L].
            target (torch.Tensor): Integer bin indices; ``target[:, idx]``
                is indexed with ``[:, 0]`` (x) and ``[:, 1]`` (y).
            target_weight (torch.Tensor): Per-joint weights; each
                ``target_weight[:, idx]`` is flattened to one weight per
                sample.

        Returns:
            torch.Tensor: Scalar loss.
        """
        output_x, output_y = output
        batch_size = output_x.size(0)
        num_joints = output_x.size(1)
        loss = 0

        for idx in range(num_joints):
            # No squeeze(): it would drop the batch axis when B == 1.
            coord_gt = target[:, idx]
            weight = target_weight[:, idx].reshape(-1)
            loss_x = self.criterion(output_x[:, idx], coord_gt[:, 0])
            loss_y = self.criterion(output_y[:, idx], coord_gt[:, 1])
            loss += loss_x.mul(weight).sum()
            loss += loss_y.mul(weight).sum()
        return loss / batch_size
这些损失函数从simDR源码得来。然后同样别忘了在mmpose\models\losses\__init__.py中做修改:
from .classfication_loss import BCELoss
from .heatmap_loss import AdaptiveWingLoss, FocalHeatmapLoss
from .mesh_loss import GANLoss, MeshLoss
from .mse_loss import JointsMSELoss, JointsOHKMMSELoss
from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory
from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, RLELoss,
SemiSupervisionLoss, SmoothL1Loss,
SoftWeightSmoothL1Loss, SoftWingLoss, WingLoss)
from .simDR_loss import NMTCritierion,NMTNORMCritierion,KLDiscretLoss
__all__ = [
'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss',
'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss',
'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss',
'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss',
'SoftWeightSmoothL1Loss', 'FocalHeatmapLoss','NMTCritierion',
'NMTNORMCritierion','KLDiscretLoss'
]
主要会用到前面定义的验证函数:
def get_accuracy(self, output, target, target_weight):
    """Compute the PCK accuracy reported during training.

    Nothing is computed unless ``self.target_type`` is
    ``'GaussianHeatmap'``; in that case ``pose_pck_accuracy_DR`` is called
    with ``self.extra`` as its configuration.

    Note:
        - batch_size: N
        - num_keypoints: K

    Args:
        output: Head output, forwarded unchanged to
            ``pose_pck_accuracy_DR``.
        target: Training target, forwarded unchanged to
            ``pose_pck_accuracy_DR``.
        target_weight (torch.Tensor[N,K,1]):
            Weights across different joint types; joints with a weight
            greater than zero are treated as visible.

    Returns:
        dict: ``{'acc_pose': float}``, or an empty dict when the target
        type is not ``'GaussianHeatmap'``.
    """
    accuracy = dict()
    if self.target_type != 'GaussianHeatmap':
        return accuracy

    visible = target_weight.detach().cpu().numpy().squeeze(-1) > 0
    _, avg_acc, _ = pose_pck_accuracy_DR(output, target, visible, self.extra)
    accuracy['acc_pose'] = float(avg_acc)
    return accuracy
根据simDR源码,在final_layer之后将heatmap拆分为两个一维矩阵
def forward(self, x):
    """Run the head and return its prediction.

    Returns:
        The final-layer feature map when ``coord_representation`` is
        ``'heatmap'``; a tuple ``(pred_x, pred_y)`` produced by the two
        linear heads on the spatially flattened feature map for
        ``'simdr'`` / ``'sa-simdr'``; ``None`` for any other value.
    """
    feats = self._transform_inputs(x)
    feats = self.deconv_layers(feats)
    feats = self.final_layer(feats)

    mode = self.coord_representation
    if mode == 'heatmap':
        return feats
    if mode in ('simdr', 'sa-simdr'):
        # Merge the two spatial axes so each channel becomes one vector.
        flat = rearrange(feats, 'b c h w -> b c (h w)')
        return (self.mlp_head_x(flat), self.mlp_head_y(flat))
    return None
因为输出由一个变为两个,为了避免报错使用元组进行传输。
因为heatmap改变了,所以推理时的flip和shift操作有所变化。
def inference_model(self, x, flip_pairs=None):
    """Inference function.

    Args:
        x: Input features, forwarded to ``self.forward``.
        flip_pairs (None | list[tuple]):
            Pairs of keypoints which are mirrored. When given, the outputs
            are treated as predictions for a flipped image and flipped back.

    Returns:
        tuple: ``(output_x, output_y)`` tensors on the same device as the
        forward outputs.
    """
    output_x, output_y = self.forward(x)

    if flip_pairs is not None:
        # Remember where the tensors live so the result can be moved back
        # there; a hard-coded .cuda() breaks CPU-only inference.
        device = output_x.device
        flipped_x = flip_back_simdr(output_x.detach().cpu().numpy(),
                                    flip_pairs, type='x')
        flipped_y = flip_back_simdr(output_y.detach().cpu().numpy(),
                                    flip_pairs, type='y')
        # copy() turns the reversed NumPy view into a contiguous array.
        output_x = torch.from_numpy(flipped_x.copy()).to(device)
        output_y = torch.from_numpy(flipped_y.copy()).to(device)

        # feature is not aligned, shift flipped heatmap for higher accuracy
        if self.test_cfg.get('shift_heatmap', False):
            # Source and destination are overlapping views of one tensor;
            # clone the source so the shifted values are well defined.
            output_x[:, :, 0:-1] = output_x[:, :, 1:].clone()

    return (output_x, output_y)
def decode(self, img_metas, output, **kwargs):
    """Decode keypoints from the per-axis SimDR outputs.

    Args:
        img_metas (list(dict)): Information about data augmentation
            By default this includes:

            - "image_file: path to the image file
            - "center": center of the bbox
            - "scale": scale of the bbox
            - "rotation": rotation of the bbox
            - "bbox_score": score of bbox
        output (tuple): ``(output_x, output_y)`` tensors of shape
            [N, K, L]; softmax is applied over the last dimension.

    Returns:
        dict: ``'preds'`` (np.ndarray[N, K, 3]: transformed x, y and
        score), ``'boxes'`` (np.ndarray[N, 6]: center, scale, area,
        bbox score), ``'image_paths'`` (list) and ``'bbox_ids'``
        (list or None).
    """
    output_x, output_y = output
    prob_x = F.softmax(output_x, dim=2)
    prob_y = F.softmax(output_y, dim=2)
    max_val_x, idx_x = prob_x.max(2, keepdim=True)
    max_val_y, idx_y = prob_y.max(2, keepdim=True)

    # The keypoint score is the smaller of the two per-axis peaks.
    maxvals = torch.min(max_val_x, max_val_y).detach().cpu().numpy()

    # Concatenating the keepdim indices keeps the (N, K, 2) shape for any
    # N and K; an argument-less squeeze would drop axes of length 1.
    idx = torch.cat((idx_x, idx_y), dim=2)
    coords = torch.true_divide(
        idx, self.extra['SIMDR_SPLIT_RATIO']).detach().cpu().numpy()

    batch_size = len(img_metas)
    bbox_ids = [] if 'bbox_id' in img_metas[0] else None

    c = np.zeros((batch_size, 2), dtype=np.float32)
    s = np.zeros((batch_size, 2), dtype=np.float32)
    image_paths = []
    score = np.ones(batch_size)
    for i, meta in enumerate(img_metas):
        c[i, :] = meta['center']
        s[i, :] = meta['scale']
        image_paths.append(meta['image_file'])

        if 'bbox_score' in meta:
            # item() extracts the single value explicitly instead of
            # assigning a 1-element array to a scalar slot.
            score[i] = np.array(meta['bbox_score']).reshape(-1).item()
        if bbox_ids is not None:
            bbox_ids.append(meta['bbox_id'])

    # Hand NumPy rows (not torch tensors) to the NumPy affine helpers.
    input_size = [self.extra['image_size'][0], self.extra['image_size'][1]]
    preds = np.zeros_like(coords)
    for i in range(coords.shape[0]):
        preds[i] = transform_preds(coords[i], c[i], s[i], input_size)

    all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32)
    all_boxes = np.zeros((batch_size, 6), dtype=np.float32)
    all_preds[:, :, 0:2] = preds[:, :, 0:2]
    all_preds[:, :, 2:3] = maxvals
    all_boxes[:, 0:2] = c[:, 0:2]
    all_boxes[:, 2:4] = s[:, 0:2]
    all_boxes[:, 4] = np.prod(s * 200.0, axis=1)
    all_boxes[:, 5] = score

    result = {}
    result['preds'] = all_preds
    result['boxes'] = all_boxes
    result['image_paths'] = image_paths
    result['bbox_ids'] = bbox_ids
    return result
因为heatmap有所变化,所以对关节点的decode需要做一定修改。
9.simDR_head.py文件一览
由于篇幅原因,还有几个没有修改过的函数没有提及,请参考TopdownHeatmapSimpleHead类自行添加。文章更新完成后,我会上传项目的完整文件到github。
如果文章对你有帮助,请动动手指点点收藏和赞,谢谢。