A note up front: judging from the SiamRPN (pysot) code, the author's coding skill is honestly just average (then again, mine may be even more average). The code as a whole is not very clean, readability is not great, and it diverges from the paper in quite a few details. But since I need it for work, I have no choice but to study it carefully.
1. The core function: track
This section assumes init has already been run: it crops the template image and initializes the bbox, i.e., it selects the tracking target on the first frame. track then operates on every frame after the first.
part one
w_z = self.size[0] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
h_z = self.size[1] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
s_z = np.sqrt(w_z * h_z)
scale_z = cfg.TRACK.EXEMPLAR_SIZE / s_z
s_x = s_z * (cfg.TRACK.INSTANCE_SIZE / cfg.TRACK.EXEMPLAR_SIZE)
x_crop = self.get_subwindow(img, self.center_pos,
cfg.TRACK.INSTANCE_SIZE,
round(s_x), self.channel_average)
outputs = self.model.track(x_crop)
This part selects the search region around the previous frame's target estimate: it computes the padded template size s_z, the template-to-crop scale factor scale_z, and the search-region size s_x, and then crops the search window out of the current frame.
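To make the numbers concrete, here is a minimal sketch of the same computation under assumed default config values (CONTEXT_AMOUNT = 0.5, EXEMPLAR_SIZE = 127, INSTANCE_SIZE = 255 come from pysot's configs; the target size is made up):
import numpy as np
CONTEXT_AMOUNT = 0.5
EXEMPLAR_SIZE = 127   # template crop side, in pixels
INSTANCE_SIZE = 255   # search crop side, in pixels
size = np.array([100.0, 60.0])  # hypothetical previous target (w, h)
# pad the target by half its perimeter, then take the geometric mean
# of the padded sides to get a square template region
w_z = size[0] + CONTEXT_AMOUNT * np.sum(size)   # 100 + 0.5*160 = 180
h_z = size[1] + CONTEXT_AMOUNT * np.sum(size)   # 60  + 0.5*160 = 140
s_z = np.sqrt(w_z * h_z)                        # ~158.7
scale_z = EXEMPLAR_SIZE / s_z                   # ~0.80, crop -> 127 px
s_x = s_z * (INSTANCE_SIZE / EXEMPLAR_SIZE)     # ~318.7 px search region
print(s_z, scale_z, s_x)
So the search window is simply the template region blown up by the 255/127 ratio.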
part two
score = self._convert_score(outputs['cls'])
pred_bbox = self._convert_bbox(outputs['loc'], self.anchors)
This part post-processes the SiamRPN model output. outputs is a dict: outputs['cls'] is a (1, 10, 17, 17) tensor (2 class scores for each of the 5 anchors per position), and outputs['loc'] is a (1, 20, 17, 17) tensor (4 box offsets for each of the 5 anchors per position).
def _convert_score(self, score):
score = score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0)
score = F.softmax(score, dim=1).data[:, 1].cpu().numpy()
return score
_convert_score reshapes the (1, 10, 17, 17) input into (1445, 2), applies softmax, and keeps the second column, which is the probability of each anchor being a positive (foreground) sample.
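A quick shape check on a dummy tensor, assuming 5 anchors and a 17x17 score map (5 * 17 * 17 = 1445 candidates):
import torch
import torch.nn.functional as F
cls = torch.randn(1, 10, 17, 17)  # (batch, 2*anchor_num, h, w)
# (1,10,17,17) -> (10,17,17,1) -> (2,1445) -> (1445,2)
score = cls.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0)
print(score.shape)                     # torch.Size([1445, 2])
prob = F.softmax(score, dim=1)[:, 1]   # column 1 = foreground probability
print(prob.shape)                      # torch.Size([1445])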
_convert_bbox applies the predicted offsets to the anchors. The anchors here are produced by generate_anchor with score_size = 17; I have put comments in the code below. One more gripe: SenseTime clearly released this code purely for the sake of open-sourcing it, and did not bother to polish it much, which leaves me slightly disappointed. After all, SenseTime gave me so many good memories (but I digress).
def generate_anchor(self, score_size):
anchors = Anchors(cfg.ANCHOR.STRIDE,
cfg.ANCHOR.RATIOS,
cfg.ANCHOR.SCALES)
#cfg.ANCHOR.STRIDE=8
#cfg.ANCHOR.RATIOS=[0.33,0.5,1,2,3]
#cfg.ANCHOR.SCALES=[8]
anchor = anchors.anchors
x1, y1, x2, y2 = anchor[:, 0], anchor[:, 1], anchor[:, 2], anchor[:, 3]
anchor = np.stack([(x1+x2)*0.5, (y1+y2)*0.5, x2-x1, y2-y1], 1)
total_stride = anchors.stride
anchor_num = anchor.shape[0]
anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))
ori = - (score_size // 2) * total_stride
xx, yy = np.meshgrid([ori + total_stride * dx for dx in range(score_size)],
[ori + total_stride * dy for dy in range(score_size)])
xx, yy = np.tile(xx.flatten(), (anchor_num, 1)).flatten(), \
np.tile(yy.flatten(), (anchor_num, 1)).flatten()
anchor[:, 0], anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
return anchor
generate_anchor returns a (1445, 4) matrix. Each row is [anchor center x, anchor center y, anchor width, anchor height]; the centers repeat over the 17x17 grid with stride 8, centered on the crop (origin at 0).
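Here is a self-contained sketch of the same anchor grid, with pysot's Anchors class replaced by a direct computation (stride 8, ratios [0.33, 0.5, 1, 2, 3], scale 8 and score_size 17 as in the config comments above; the base-anchor formula mirrors pysot's Anchors.generate_anchors):
import numpy as np
stride, scales, ratios, score_size = 8, [8], [0.33, 0.5, 1, 2, 3], 17
# one base anchor per (ratio, scale) pair, as [cx, cy, w, h] centered at 0
base = []
for r in ratios:
    ws = int(np.sqrt(stride * stride / r))
    hs = int(ws * r)
    for s in scales:
        base.append([0.0, 0.0, ws * s, hs * s])
base = np.array(base)                            # (5, 4)
# replicate each base anchor over the 17x17 grid of centers
anchor = np.tile(base, score_size * score_size).reshape((-1, 4))
ori = -(score_size // 2) * stride                # leftmost/topmost center
xx, yy = np.meshgrid([ori + stride * dx for dx in range(score_size)],
                     [ori + stride * dy for dy in range(score_size)])
anchor_num = base.shape[0]
anchor[:, 0] = np.tile(xx.flatten(), (anchor_num, 1)).flatten()
anchor[:, 1] = np.tile(yy.flatten(), (anchor_num, 1)).flatten()
print(anchor.shape)                              # (1445, 4)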
def _convert_bbox(self, delta, anchor):
delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1)
delta = delta.data.cpu().numpy()
delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
return delta
This is actually the update of the anchors' positions and sizes by the predicted offsets (dx, dy, dw, dh), i.e. the standard RPN decoding: x = dx * w_a + x_a, y = dy * h_a + y_a, w = exp(dw) * w_a, h = exp(dh) * h_a. The returned delta holds the decoded boxes, one column per anchor, as a (4, 1445) array.
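A toy decode of a single anchor, assuming a base anchor [cx, cy, w, h] = [0, 0, 64, 64] and made-up offsets (0.1, -0.2, 0.3, 0.0):
import numpy as np
anchor = np.array([[0.0, 0.0, 64.0, 64.0]])       # (1, 4)
delta = np.array([[0.1], [-0.2], [0.3], [0.0]])   # (4, 1)
x = delta[0] * anchor[:, 2] + anchor[:, 0]        # 0.1*64   = 6.4
y = delta[1] * anchor[:, 3] + anchor[:, 1]        # -0.2*64  = -12.8
w = np.exp(delta[2]) * anchor[:, 2]               # e^0.3*64 ~ 86.4
h = np.exp(delta[3]) * anchor[:, 3]               # e^0.0*64 = 64.0
print(x, y, w, h)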
part three
This part adds penalty terms. They come in two kinds, a scale/aspect-ratio penalty and a spatial penalty; the scale penalty is computed by the code below.
def change(r):
return np.maximum(r, 1. / r)
def sz(w, h):
pad = (w + h) * 0.5
return np.sqrt((w + pad) * (h + pad))
# scale penalty
s_c = change(sz(pred_bbox[2, :], pred_bbox[3, :]) /
(sz(self.size[0]*scale_z, self.size[1]*scale_z)))
# output shape: (1445,)
# aspect ratio penalty
r_c = change((self.size[0]/self.size[1]) /
(pred_bbox[2, :]/pred_bbox[3, :]))
penalty = np.exp(-(r_c * s_c - 1) * cfg.TRACK.PENALTY_K)
pscore = penalty * score
This part of the code differs from the paper: the paper writes the penalty as e^(k * max(r/r', r'/r) * max(s/s', s'/s)), while the code computes e^(-(r_c * s_c - 1) * k).
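A toy run of the penalty, assuming PENALTY_K = 0.16 (a value used in some pysot configs; it varies per model) and a previous 100x60 target at scale_z = 0.8:
import numpy as np
def change(r):
    return np.maximum(r, 1. / r)
def sz(w, h):
    pad = (w + h) * 0.5
    return np.sqrt((w + pad) * (h + pad))
PENALTY_K = 0.16
prev_w, prev_h, scale_z = 100.0, 60.0, 0.8
# two candidates in crop coordinates: one matching the previous shape,
# one twice as wide
pred_w = np.array([80.0, 160.0])
pred_h = np.array([48.0, 48.0])
s_c = change(sz(pred_w, pred_h) / sz(prev_w * scale_z, prev_h * scale_z))
r_c = change((prev_w / prev_h) / (pred_w / pred_h))
penalty = np.exp(-(r_c * s_c - 1) * PENALTY_K)
print(penalty)   # ~[1.0, 0.7]: the distorted box is suppressed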
Next, a cosine window is applied as a spatial filter on the scores, the assumption being that the target in the next frame will not be far from where it was in this one:
pscore = pscore * (1 - cfg.TRACK.WINDOW_INFLUENCE) +\
self.window * cfg.TRACK.WINDOW_INFLUENCE
best_idx = np.argmax(pscore)
For reference, the Hanning window behind self.window is w(n) = 0.5 - 0.5*cos(2*pi*n / (N-1)), n = 0, ..., N-1; the 2-D window is the outer product of two 1-D windows (see __init__ in the full source below). Long time no see, SAS.
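A minimal sketch of how self.window is built in __init__, assuming score_size = 17 and anchor_num = 5:
import numpy as np
score_size, anchor_num = 17, 5
hanning = np.hanning(score_size)                # 1-D window, peak 1.0 at center
window = np.outer(hanning, hanning)             # (17, 17) 2-D window
window = np.tile(window.flatten(), anchor_num)  # (1445,), one weight per anchor
print(window.shape, window.max())
After the blend pscore * (1 - WINDOW_INFLUENCE) + window * WINDOW_INFLUENCE, candidates near the crop center keep more of their score.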
part four
After the highest-scoring anchor is obtained from the first three parts, some smoothing is applied.
bbox = pred_bbox[:, best_idx] / scale_z
# map the box back to the original image scale
lr = penalty[best_idx] * score[best_idx] * cfg.TRACK.LR
cx = bbox[0] + self.center_pos[0]
cy = bbox[1] + self.center_pos[1]
# smooth bbox
width = self.size[0] * (1 - lr) + bbox[2] * lr
height = self.size[1] * (1 - lr) + bbox[3] * lr
# see the paper for the formulation of this update
Note that the code does not use NMS here; it simply takes the argmax, which again differs from the paper.
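A toy example of the size smoothing, assuming cfg.TRACK.LR = 0.3 and that the best candidate had score 0.9 and penalty 0.95; it is a simple linear interpolation between the old and new size:
prev_w, prev_h = 100.0, 60.0   # previous frame's estimate
new_w, new_h = 120.0, 70.0     # decoded box, back at image scale
lr = 0.95 * 0.9 * 0.3                      # penalty * score * LR ~ 0.257
width = prev_w * (1 - lr) + new_w * lr     # ~105.1, a damped update
height = prev_h * (1 - lr) + new_h * lr    # ~62.6
print(lr, width, height)
The more trustworthy the detection (high score, low distortion penalty), the faster the size adapts.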
part five
This part clips the selected box, updates the tracker state, and returns the result.
cx, cy, width, height = self._bbox_clip(cx, cy, width, height, img.shape[:2])
# update state
self.center_pos = np.array([cx, cy])
self.size = np.array([width, height])
bbox = [cx - width / 2,
cy - height / 2,
width,
height]
best_score = score[best_idx]
return {
'bbox': bbox,
'best_score': best_score
}
self._bbox_clip is analyzed below. Last one, hang in there.
def _bbox_clip(self, cx, cy, width, height, boundary):
cx = max(0, min(cx, boundary[1]))
cy = max(0, min(cy, boundary[0]))
width = max(10, min(width, boundary[1]))
height = max(10, min(height, boundary[0]))
return cx, cy, width, height
It is just a simple function that keeps the bbox from going out of the image, and it also clamps the width and height to a minimum of 10 pixels.
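A quick check of its behavior on a hypothetical 480x640 (h, w) frame: an out-of-range center is pulled back to the border, and a degenerate size is clamped to the 10-pixel minimum:
boundary = (480, 640)                        # img.shape[:2] = (h, w)
cx = max(0, min(700.0, boundary[1]))         # -> 640
cy = max(0, min(-5.0, boundary[0]))          # -> 0
width = max(10, min(3.0, boundary[1]))       # -> 10
height = max(10, min(200.0, boundary[0]))    # -> 200
print(cx, cy, width, height)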
Appendix: full source code
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import torch.nn.functional as F
from pysot.core.config import cfg
from pysot.utils.anchor import Anchors
from pysot.tracker.base_tracker import SiameseTracker
class SiamRPNTracker(SiameseTracker):
def __init__(self, model):
super(SiamRPNTracker, self).__init__()
self.score_size = (cfg.TRACK.INSTANCE_SIZE - cfg.TRACK.EXEMPLAR_SIZE) // \
cfg.ANCHOR.STRIDE + 1 + cfg.TRACK.BASE_SIZE
self.anchor_num = len(cfg.ANCHOR.RATIOS) * len(cfg.ANCHOR.SCALES)
hanning = np.hanning(self.score_size)
window = np.outer(hanning, hanning)
self.window = np.tile(window.flatten(), self.anchor_num)
self.anchors = self.generate_anchor(self.score_size)
self.model = model
self.model.eval()
def generate_anchor(self, score_size):
anchors = Anchors(cfg.ANCHOR.STRIDE,
cfg.ANCHOR.RATIOS,
cfg.ANCHOR.SCALES)
anchor = anchors.anchors
x1, y1, x2, y2 = anchor[:, 0], anchor[:, 1], anchor[:, 2], anchor[:, 3]
anchor = np.stack([(x1+x2)*0.5, (y1+y2)*0.5, x2-x1, y2-y1], 1)
total_stride = anchors.stride
anchor_num = anchor.shape[0]
anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))
ori = - (score_size // 2) * total_stride
xx, yy = np.meshgrid([ori + total_stride * dx for dx in range(score_size)],
[ori + total_stride * dy for dy in range(score_size)])
xx, yy = np.tile(xx.flatten(), (anchor_num, 1)).flatten(), \
np.tile(yy.flatten(), (anchor_num, 1)).flatten()
anchor[:, 0], anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
return anchor
def _convert_bbox(self, delta, anchor):
delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1)
delta = delta.data.cpu().numpy()
delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
return delta
def _convert_score(self, score):
score = score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0)
score = F.softmax(score, dim=1).data[:, 1].cpu().numpy()
return score
def _bbox_clip(self, cx, cy, width, height, boundary):
cx = max(0, min(cx, boundary[1]))
cy = max(0, min(cy, boundary[0]))
width = max(10, min(width, boundary[1]))
height = max(10, min(height, boundary[0]))
return cx, cy, width, height
def init(self, img, bbox):
"""
args:
img(np.ndarray): BGR image
bbox: (x, y, w, h) bbox
"""
self.center_pos = np.array([bbox[0]+(bbox[2]-1)/2,
bbox[1]+(bbox[3]-1)/2])
self.size = np.array([bbox[2], bbox[3]])
# calculate z crop size
w_z = self.size[0] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
h_z = self.size[1] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
s_z = round(np.sqrt(w_z * h_z))
# calculate channel average
self.channel_average = np.mean(img, axis=(0, 1))
# get crop
z_crop = self.get_subwindow(img, self.center_pos,
cfg.TRACK.EXEMPLAR_SIZE,
s_z, self.channel_average)
self.model.template(z_crop)
def track(self, img):
"""
args:
img(np.ndarray): BGR image
return:
bbox(list):[x, y, width, height]
"""
w_z = self.size[0] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
h_z = self.size[1] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
s_z = np.sqrt(w_z * h_z)
scale_z = cfg.TRACK.EXEMPLAR_SIZE / s_z
s_x = s_z * (cfg.TRACK.INSTANCE_SIZE / cfg.TRACK.EXEMPLAR_SIZE)
x_crop = self.get_subwindow(img, self.center_pos,
cfg.TRACK.INSTANCE_SIZE,
round(s_x), self.channel_average)
outputs = self.model.track(x_crop)
score = self._convert_score(outputs['cls'])
pred_bbox = self._convert_bbox(outputs['loc'], self.anchors)
def change(r):
return np.maximum(r, 1. / r)
def sz(w, h):
pad = (w + h) * 0.5
return np.sqrt((w + pad) * (h + pad))
# scale penalty
s_c = change(sz(pred_bbox[2, :], pred_bbox[3, :]) /
(sz(self.size[0]*scale_z, self.size[1]*scale_z)))
# aspect ratio penalty
r_c = change((self.size[0]/self.size[1]) /
(pred_bbox[2, :]/pred_bbox[3, :]))
penalty = np.exp(-(r_c * s_c - 1) * cfg.TRACK.PENALTY_K)
pscore = penalty * score
# window penalty
pscore = pscore * (1 - cfg.TRACK.WINDOW_INFLUENCE) + \
self.window * cfg.TRACK.WINDOW_INFLUENCE
best_idx = np.argmax(pscore)
bbox = pred_bbox[:, best_idx] / scale_z
lr = penalty[best_idx] * score[best_idx] * cfg.TRACK.LR
cx = bbox[0] + self.center_pos[0]
cy = bbox[1] + self.center_pos[1]
# smooth bbox
width = self.size[0] * (1 - lr) + bbox[2] * lr
height = self.size[1] * (1 - lr) + bbox[3] * lr
# clip boundary
cx, cy, width, height = self._bbox_clip(cx, cy, width,
height, img.shape[:2])
# update state
self.center_pos = np.array([cx, cy])
self.size = np.array([width, height])
bbox = [cx - width / 2,
cy - height / 2,
width,
height]
best_score = score[best_idx]
return {
'bbox': bbox,
'best_score': best_score
}