This paper adds embedding learning on top of YOLOv3: by outputting boxes, classes and embeddings simultaneously, it achieves one-stage tracking and improves speed. However, once the embeddings are obtained, a matching algorithm is still required to form tracks, so strictly speaking it remains a two-stage pipeline. Since the model closely follows YOLOv3, this article focuses on the tracking part of the method and its code.
Paper: Towards Real-Time Multi-Object Tracking
Code: Zhongdao/Towards-Realtime-MOT
To address the drawbacks of the SDE (Separate Detection and Embedding) paradigm, the paper proposes the one-stage JDE (Joint Detection and Embedding) tracking paradigm.
The network adopts the YOLOv3 architecture almost unchanged, except that each prediction head gains an extra branch for learning embeddings, and each head is modeled as a multi-task learning problem. A prediction head outputs a map of size (6A+D)×H×W, where A is the number of anchors at that scale and D is the embedding dimension, allocated as follows.
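A quick arithmetic sketch of this channel allocation (assuming A = 4 anchors and D = 512 embedding dimensions, the values used by the released cfgs and by the code below):
# Channel allocation of one prediction head, assuming A = 4 anchors per scale
# and D = 512 embedding dims (the defaults of the released cfgs/code).
A, D = 4, 512
box_channels = 4 * A            # 4 box offsets per anchor
conf_channels = 2 * A           # 2 foreground/background logits per anchor
emb_channels = D                # one shared embedding vector per grid cell
total = box_channels + conf_channels + emb_channels
assert total == 6 * A + D       # (6A + D) channels = 24 + 512 = 536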
Every object in a trajectory has a unique track ID. To obtain classification targets, a fully connected layer is attached after the embedding: borrowing the idea of object classification (treating each track ID as a class), the embedding is mapped to track-ID classification logits. The figure in the paper does not show this part; the completed structure works as follows.
After masking, the embeddings at target locations are extracted and passed through the fully connected classifier, which produces nID outputs (nID is the number of identities in the dataset); the loss is then computed during training.
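A minimal sketch of this ID-classification branch (illustrative only; emb_dim = 512 and nID = 14455 are the values used by the released code, and the real implementation is shown in YOLOLayer.forward and Darknet below):
import torch
import torch.nn as nn
import torch.nn.functional as F

emb_dim, nID = 512, 14455                    # values used by the released code
classifier = nn.Linear(emb_dim, nID)         # shared FC layer: embedding -> track-ID logits

p_emb = torch.randn(1, 10, 18, emb_dim)      # per-cell embeddings (nB, nGh, nGw, D)
mask = torch.zeros(1, 10, 18, dtype=torch.bool)
mask[0, 3, 7] = True                         # pretend one grid cell contains a target
tids = torch.tensor([42])                    # its ground-truth track ID

embedding = F.normalize(p_emb[mask])         # keep only target cells, L2-normalize
logits = classifier(embedding)               # (num_targets, nID)
lid = F.cross_entropy(logits, tids)          # each track ID is treated as a class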
The loss has three components:
Foreground/background classification: cross-entropy
Bounding-box regression: smooth-L1
Embedding: the paper defines an L_CE loss (similar to cross-entropy); the code simply uses cross-entropy
The code is in models.py:
class YOLOLayer(nn.Module):
def __init__(self, anchors, nC, nID, nE, img_size, yolo_layer):
super(YOLOLayer, self).__init__()
self.layer = yolo_layer
nA = len(anchors)
self.anchors = torch.FloatTensor(anchors)
        self.nA = nA  # number of anchors (3); note: both the paper and the code actually use 4 here
self.nC = nC # number of classes (80)
self.nID = nID # number of identities
self.img_size = 0
self.emb_dim = nE
self.shift = [1, 3, 5]
self.SmoothL1Loss = nn.SmoothL1Loss()
self.SoftmaxLoss = nn.CrossEntropyLoss(ignore_index=-1)
self.CrossEntropyLoss = nn.CrossEntropyLoss()
        self.IDLoss = nn.CrossEntropyLoss(ignore_index=-1)  # loss function for the ID loss
self.s_c = nn.Parameter(-4.15*torch.ones(1)) # -4.15
self.s_r = nn.Parameter(-4.85*torch.ones(1)) # -4.85
self.s_id = nn.Parameter(-2.3*torch.ones(1)) # -2.3
self.emb_scale = math.sqrt(2) * math.log(self.nID-1) if self.nID>1 else 1
This defines a few parameters and loss functions; nA is the number of anchors at the current scale. The inherited comment says 3, but both the paper and the code use 4.
def forward(self, p_cat, img_size, targets=None, classifier=None, test_emb=False):
        p, p_emb = p_cat[:, :24, ...], p_cat[:, 24:, ...]  # the first 24 channels are used for box and conf
        nB, nGh, nGw = p.shape[0], p.shape[-2], p.shape[-1]  # batch size and grid height/width
if self.img_size != img_size:
create_grids(self, img_size, nGh, nGw)
if p.is_cuda:
self.grid_xy = self.grid_xy.cuda()
self.anchor_wh = self.anchor_wh.cuda()
        p = p.view(nB, self.nA, self.nC + 5, nGh, nGw).permute(0, 1, 3, 4, 2).contiguous()  # prediction: split the 24 channels into per-anchor [box, conf], 4 values for the box and 2 for conf
        p_emb = p_emb.permute(0,2,3,1).contiguous()  # embedding, shape (nB, nGh, nGw, 512)
        p_box = p[..., :4]  # boxes, shape (nB, 4, nGh, nGw, 4)
        p_conf = p[..., 4:6].permute(0, 4, 1, 2, 3)  # conf, shape (nB, 2, 4, nGh, nGw); 4 = number of anchors
Here the head output is split into its parts according to the allocation above; each part is annotated in the code.
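A worked shape example for one head may help (assuming the 1088x608 input cfg, the coarsest 1/32-scale grid, nA = 4, nC = 1 and 512-dim embeddings; the numbers are illustrative):
nB, nA, nC, nE = 8, 4, 1, 512
nGh, nGw = 608 // 32, 1088 // 32              # 19 x 34 grid at the 1/32 scale
p_cat_shape  = (nB, 6 * nA + nE, nGh, nGw)    # (8, 536, 19, 34)  raw head output
p_shape      = (nB, nA, nGh, nGw, nC + 5)     # (8, 4, 19, 34, 6)  after view/permute
p_emb_shape  = (nB, nGh, nGw, nE)             # (8, 19, 34, 512)
p_box_shape  = (nB, nA, nGh, nGw, 4)          # (8, 4, 19, 34, 4)
p_conf_shape = (nB, 2, nA, nGh, nGw)          # (8, 2, 4, 19, 34)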
# Training
if targets is not None:
if test_emb:
tconf, tbox, tids = build_targets_max(targets, self.anchor_vec.cuda(), self.nA, self.nC, nGh, nGw)
else:
tconf, tbox, tids = build_targets_thres(targets, self.anchor_vec.cuda(), self.nA, self.nC, nGh, nGw)
            tconf, tbox, tids = tconf.cuda(), tbox.cuda(), tids.cuda()  # confidence / box / id targets; each is replicated across the four anchors
mask = tconf > 0
This builds the training targets.
# Compute losses
nT = sum([len(x) for x in targets]) # number of targets
nM = mask.sum().float() # number of anchors (assigned to targets)
nP = torch.ones_like(mask).sum().float()
if nM > 0:
lbox = self.SmoothL1Loss(p_box[mask], tbox[mask])
else:
FT = torch.cuda.FloatTensor if p_conf.is_cuda else torch.FloatTensor
lbox, lconf = FT([0]), FT([0])
lconf = self.SoftmaxLoss(p_conf, tconf)
lid = torch.Tensor(1).fill_(0).squeeze().cuda()
emb_mask,_ = mask.max(1)
This computes the conf and box losses; the mask restricts the box loss (and, below, the embedding loss) to locations assigned to targets, i.e., locations that actually contain an object.
            # For convenience we use max(1) to decide the id, TODO: more reasonable strategy
tids,_ = tids.max(1)
tids = tids[emb_mask]
embedding = p_emb[emb_mask].contiguous()
embedding = self.emb_scale * F.normalize(embedding)
nI = emb_mask.sum().float()
if test_emb:
if np.prod(embedding.shape)==0 or np.prod(tids.shape) == 0:
return torch.zeros(0, self.emb_dim+1).cuda()
emb_and_gt = torch.cat([embedding, tids.float()], dim=1)
return emb_and_gt
if len(embedding) > 1:
                logits = classifier(embedding).contiguous()  # fully connected classification
lid = self.IDLoss(logits, tids.squeeze())
This is the ID loss: as in the completed structure sketched earlier, the embeddings are extracted with the mask, classified by the Linear layer, and a cross-entropy loss is computed.
# Sum loss components
loss = torch.exp(-self.s_r)*lbox + torch.exp(-self.s_c)*lconf + torch.exp(-self.s_id)*lid + \
(self.s_r + self.s_c + self.s_id)
loss *= 0.5
return loss, loss.item(), lbox.item(), lconf.item(), lid.item(), nT
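The weighted sum in the last few lines is the automatic loss balancing with learnable task-uncertainty parameters (s_c, s_r, s_id, the nn.Parameters defined in __init__), i.e. roughly
loss = 0.5 * [ exp(-s_c) * lconf + exp(-s_r) * lbox + exp(-s_id) * lid + (s_c + s_r + s_id) ]
so the relative weight of each task is learned jointly with the network instead of being hand-tuned.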
Finally, the total loss is assembled and returned. The forward pass of the whole network runs as follows:
class Darknet(nn.Module):
"""YOLOv3 object detection model"""
def __init__(self, cfg_dict, nID=0, test_emb=False):
super(Darknet, self).__init__()
if isinstance(cfg_dict, str):
cfg_dict = parse_model_cfg(cfg_dict)
self.module_defs = cfg_dict
self.module_defs[0]['nID'] = nID
self.img_size = [int(self.module_defs[0]['width']), int(self.module_defs[0]['height'])]
self.emb_dim = int(self.module_defs[0]['embedding_dim'])
self.hyperparams, self.module_list = create_modules(self.module_defs)
self.loss_names = ['loss', 'box', 'conf', 'id', 'nT']
self.losses = OrderedDict()
for ln in self.loss_names:
self.losses[ln] = 0
self.test_emb = test_emb
self.classifier = nn.Linear(self.emb_dim, nID) if nID>0 else None
        # this is the embedding classifier described above; its output size is nID
def forward(self, x, targets=None, targets_len=None):
self.losses = OrderedDict()
for ln in self.loss_names:
self.losses[ln] = 0
is_training = (targets is not None) and (not self.test_emb)
#img_size = x.shape[-1]
layer_outputs = []
output = []
for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
mtype = module_def['type']
if mtype in ['convolutional', 'upsample', 'maxpool']:
x = module(x)
elif mtype == 'route':
layer_i = [int(x) for x in module_def['layers'].split(',')]
if len(layer_i) == 1:
x = layer_outputs[layer_i[0]]
else:
x = torch.cat([layer_outputs[i] for i in layer_i], 1)
elif mtype == 'shortcut':
layer_i = int(module_def['from'])
x = layer_outputs[-1] + layer_outputs[layer_i]
elif mtype == 'yolo':
if is_training: # get loss
targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)]
x, *losses = module[0](x, self.img_size, targets, self.classifier)
for name, loss in zip(self.loss_names, losses):
self.losses[name] += loss
elif self.test_emb:
if targets is not None:
targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)]
x = module[0](x, self.img_size, targets, self.classifier, self.test_emb)
else: # get detections
x = module[0](x, self.img_size)
output.append(x)
layer_outputs.append(x)
if is_training:
self.losses['nT'] /= 3
output = [o.squeeze() for o in output]
return sum(output), torch.Tensor(list(self.losses.values())).cuda()
elif self.test_emb:
return torch.cat(output, 0)
return torch.cat(output, 1)
The tracking procedure mainly lives in multitracker.py.
class JDETracker(object):
def __init__(self, opt, frame_rate=30):
self.opt = opt
self.model = Darknet(opt.cfg, nID=14455)
# load_darknet_weights(self.model, opt.weights)
self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model'], strict=False)
self.model.cuda().eval()
self.tracked_stracks = [] # type: list[STrack]
self.lost_stracks = [] # type: list[STrack]
self.removed_stracks = [] # type: list[STrack]
self.frame_id = 0
self.det_thresh = opt.conf_thres
self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer)
self.max_time_lost = self.buffer_size
self.kalman_filter = KalmanFilter()
Note that the code uses nID = 14455. This constructor also defines the containers that hold tracks in different states (the track class, STrack, is in the same file) and the Kalman filter (KF). The tracking procedure is as follows.
with torch.no_grad():
pred = self.model(im_blob)
# pred is tensor of all the proposals (default number of proposals: 54264). Proposals have information associated with the bounding box and embeddings
        pred = pred[pred[:, :, 4] > self.opt.conf_thres]  # [p_box, p_conf, p_cls, p_emb]; index 4 is the conf score, thresholded here
# pred now has lesser number of proposals. Proposals rejected on basis of object confidence score
if len(pred) > 0:
            dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0].cpu()  # non-maximum suppression
# Final proposals are obtained in dets. Information of bounding box and embeddings also included
# Next step changes the detection scales
scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round()
'''Detections is list of (x1, y1, x2, y2, object_conf, class_score, class_pred)'''
# class_pred is the embeddings.
detections = [STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f.numpy(), 30) for
(tlbrs, f) in zip(dets[:, :5], dets[:, 6:])]
else:
detections = []
First, the predicted detections are extracted; the inline comments explain each step in detail.
''' Add newly detected tracklets to tracked_stracks'''
unconfirmed = []
tracked_stracks = [] # type: list[STrack]
for track in self.tracked_stracks:
if not track.is_activated:
# previous tracks which are not active in the current frame are added in unconfirmed list
unconfirmed.append(track)
# print("Should not be here, in unconfirmed")
else:
# Active tracks are added to the local list 'tracked_stracks'
tracked_stracks.append(track)
Then the tracks stored in the containers are gathered, both normal tracks and unconfirmed ones (mainly tracks that contain only a starting frame, i.e., whose detection has appeared exactly once).
''' Step 2: First association, with embedding'''
# Combining currently tracked_stracks and lost_stracks
strack_pool = joint_stracks(tracked_stracks, self.lost_stracks)
# Predict the current location with KF
STrack.multi_predict(strack_pool, self.kalman_filter)
dists = matching.embedding_distance(strack_pool, detections)
# dists = matching.gate_cost_matrix(self.kalman_filter, dists, strack_pool, detections)
dists = matching.fuse_motion(self.kalman_filter, dists, strack_pool, detections)
# The dists is the list of distances of the detection with the tracks in strack_pool
matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7)
# The matches is the array for corresponding matches of the detection with the corresponding strack_pool
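The joint_stracks helper used above simply merges the currently tracked and lost track lists while de-duplicating by track_id; a minimal sketch of what it does (the repo's version behaves the same way):
def joint_stracks(tlista, tlistb):
    # merge two track lists, keeping each track_id at most once
    seen, res = {}, []
    for t in tlista + tlistb:
        if t.track_id not in seen:
            seen[t.track_id] = True
            res.append(t)
    return res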
Here embedding_distance uses cosine distance; the function is shown below.
def embedding_distance(tracks, detections, metric='cosine'):
"""
:param tracks: list[STrack]
:param detections: list[BaseTrack]
:param metric:
:return: cost_matrix np.ndarray
"""
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float)
if cost_matrix.size == 0:
return cost_matrix
det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float)
track_features = np.asarray([track.smooth_feat for track in tracks], dtype=np.float)
    cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))  # normalized features, cosine distance by default
return cost_matrix
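Note that embedding_distance compares each detection's current feature (curr_feat) against each track's smoothed feature (smooth_feat). In the repo, STrack maintains smooth_feat as an exponential moving average of the per-frame embeddings; a rough sketch of the idea (alpha = 0.9 as in the released code, but treat this as an illustration rather than the exact source):
import numpy as np

class STrackFeatureSketch:
    # illustrative sketch of STrack's appearance-feature smoothing
    def __init__(self, feat, alpha=0.9):
        self.alpha = alpha
        self.curr_feat = feat / np.linalg.norm(feat)   # L2-normalize the new embedding
        self.smooth_feat = self.curr_feat.copy()

    def update_features(self, feat):
        feat = feat / np.linalg.norm(feat)
        self.curr_feat = feat
        # EMA: the track's appearance model changes slowly over time
        self.smooth_feat = self.alpha * self.smooth_feat + (1 - self.alpha) * feat
        self.smooth_feat /= np.linalg.norm(self.smooth_feat)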
Next, motion information is fused in with the Kalman filter:
def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98):
if cost_matrix.size == 0:
return cost_matrix
gating_dim = 2 if only_position else 4
gating_threshold = kalman_filter.chi2inv95[gating_dim]
measurements = np.asarray([det.to_xyah() for det in detections])
for row, track in enumerate(tracks):
gating_distance = kf.gating_distance(
track.mean, track.covariance, measurements, only_position, metric='maha')
cost_matrix[row, gating_distance > gating_threshold] = np.inf
cost_matrix[row] = lambda_ * cost_matrix[row] + (1-lambda_)* gating_distance
return cost_matrix
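In words: detection-track pairs whose Mahalanobis distance exceeds the chi-square gate are made impossible (cost set to inf), and the remaining costs blend appearance and motion as
cost[i, j] = lambda_ * appearance_cost[i, j] + (1 - lambda_) * gating_distance[i, j], with lambda_ = 0.98.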
Matching uses the Jonker-Volgenant algorithm (linear assignment via lap.lapjv, which is reportedly faster than the Hungarian algorithm) to find the detection-track assignment with the minimum total distance/cost:
def linear_assignment(cost_matrix, thresh):
if cost_matrix.size == 0:
return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1]))
matches, unmatched_a, unmatched_b = [], [], []
    cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh)  # Jonker-Volgenant assignment
for ix, mx in enumerate(x):
if mx >= 0:
matches.append([ix, mx])
unmatched_a = np.where(x < 0)[0]
unmatched_b = np.where(y < 0)[0]
matches = np.asarray(matches)
return matches, unmatched_a, unmatched_b
The matching results are saved;
then the still-unmatched tracks and detections are associated a second time using IoU distance (a sketch of this step follows after this list);
unconfirmed tracks that remain unmatched (usually tracks with a detection only in their starting frame) are removed;
new tracks are created for the remaining unmatched detections;
tracks that have gone unmatched for too long are removed;
and finally all track states are updated.
All of the matching steps above use the same Jonker-Volgenant assignment; since they are similar, the full code is not repeated here.
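For reference, the IoU-based second association mentioned in the list above looks roughly like this (a sketch following the repo's matching helpers; variable names are illustrative and the full update() also handles re-activation of lost tracks):
# second association on IoU, for tracks/detections left over by the embedding step
r_tracked_stracks = [strack_pool[i] for i in u_track
                     if strack_pool[i].state == TrackState.Tracked]
detections = [detections[i] for i in u_detection]
dists = matching.iou_distance(r_tracked_stracks, detections)       # cost = 1 - IoU
matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5)
for itracked, idet in matches:
    r_tracked_stracks[itracked].update(detections[idet], self.frame_id)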
The network structure is simple: essentially an embedding-learning branch added after the YOLOv3 prediction heads. Compared with tracking-by-detection, the network outputs the bounding boxes and the embeddings of the objects inside them in a single pass, which speeds up MOT. But JDE only produces detections and embeddings jointly; matching still has to follow, so it is effectively still two-stage. Some analyses have also pointed out that combining detection and ReID in one network still lags behind doing them separately, because detection fundamentally needs category features while ReID needs identity features, and a single network struggles to learn both good class and good identity features at the same time.
The code is fairly easy to run, though a few small issues may come up: cython-bbox needs to be installed (it can be built from source if necessary); remember to adjust the code and cfg/ccmcpe.json to match the downloaded data and weights; and it is best to use the data provided by the author, which is already preprocessed into the expected format, since datasets downloaded from the official sites cannot be used directly.
Comments and corrections are welcome.
References
【MOT】对JDE的深度解析
One Shot Multi-Object Tracking Overview
多目标跟踪算法(JDE)Towards Real-Time Multi-Object Tracking训练方法