Paper link: https://arxiv.org/pdf/1804.04273.pdf
Code link: https://github.com/abnerwang/py-Vital
The top-level directories of the repo:
datasets
gnet
models
modules
pretrain
tracking
When I first looked at the project code I had no idea where to start, so let's simply begin with the tracking part.
The import section:
import numpy as np
import os
import sys
import time
import argparse
import yaml, json
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.utils.data as data
import torch.optim as optim
sys.path.insert(0, '.')
from modules.model import MDNet, BCELoss, set_optimizer
from modules.sample_generator import SampleGenerator
from modules.utils import overlap_ratio
from data_prov import RegionExtractor
from bbreg import BBRegressor
from gen_config import gen_config
sys.path.insert(0, './gnet')  # dynamically add gnet to the module search path
from gnet.g_init import NetG, set_optimizer_g
from gnet.g_pretrain import *
opts = yaml.safe_load(open('./tracking/options.yaml','r'))
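options.yaml itself is not reproduced in this post. As a rough sketch, the kind of entries the tracker reads looks like the dict below; only values that are quoted later in this walkthrough are filled in, and the real file contains more keys:
opts_sketch = {
    'use_gpu': True,
    'ft_layers': ['fc'],               # only the fc layers are updated online
    'lr_init': 0.0001,                 # base learning rate; fc6 is scaled by lr_mult
    'lr_mult': {'fc6': 10},
    'img_size': 107, 'padding': 16,    # crop size / padding used by RegionExtractor
    'trans_pos': 0.1, 'scale_pos': 1.3,
    'n_pos_init': 500, 'overlap_pos_init': [0.7, 1],
    'batch_pos': 32, 'batch_neg': 96,
    'batch_neg_cand': 1024, 'batch_test': 256,
    'n_samples': 256,                  # candidates drawn per frame
    'long_interval': 10,               # long-term update every 10 frames
}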
The main entry point:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--seq', default='', help='input seq')
parser.add_argument('-j', '--json', default='', help='input json')
parser.add_argument('-f', '--savefig', action='store_true')
parser.add_argument('-d', '--display', action='store_true')
# action='store_true' means the corresponding attribute is set to True when the flag appears on the command line
args = parser.parse_args()
assert args.seq != '' or args.json != ''
np.random.seed(0)
torch.manual_seed(0)
# Generate sequence config
img_list, init_bbox, gt, savefig_dir, display, result_path = gen_config(args)
This line processes the command-line arguments and returns the image sequence to be tracked img_list, the initial ground-truth box init_bbox, the full ground truth gt, the directory for saving figures savefig_dir, the display flag controlling whether results are visualized, and the path result_path where results are saved. Let's analyze gen_config right away:
import os
import json
import numpy as np
def gen_config(args):
# invocation: python tracking/run_tracker.py -s DragonBaby [-d (display fig)] [-f (save fig)]
if args.seq != '':
# generate config from a sequence name
seq_home = '/home/xpwang/datasets/OTB100'
result_home = './results'
seq_name = args.seq
img_dir = os.path.join(seq_home, seq_name, 'img')
gt_path = os.path.join(seq_home, seq_name, 'groundtruth_rect.txt')
img_list = os.listdir(img_dir)
img_list.sort()
img_list = [os.path.join(img_dir, x) for x in img_list]
with open(gt_path) as f:
gt = np.loadtxt((x.replace('\t',',') for x in f), delimiter=',')
init_bbox = gt[0]  # the initial box is simply gt[0]
result_dir = os.path.join(result_home, seq_name)
if not os.path.exists(result_dir):
os.makedirs(result_dir)
savefig_dir = os.path.join(result_dir, 'figs')
result_path = os.path.join(result_dir, 'result.json')
elif args.json != '':
# load config from a json file
param = json.load(open(args.json, 'r'))
seq_name = param['seq_name']
img_list = param['img_list']
init_bbox = param['init_bbox']
savefig_dir = param['savefig_dir']
result_path = param['result_path']
gt = None
if args.savefig:
if not os.path.exists(savefig_dir):
os.makedirs(savefig_dir)
else:
savefig_dir = ''
return img_list, init_bbox, gt, savefig_dir, args.display, result_path
# Run tracker
result, result_bb, fps = run_vital(img_list, init_bbox, gt=gt, savefig_dir=savefig_dir, display=display)
# Save result
res = {}
res['res'] = result_bb.round().tolist()
res['type'] = 'rect'
res['fps'] = fps
json.dump(res, open(result_path, 'w'), indent=2)
This step runs the tracking framework and saves the results.
Next, the run_vital() function.
It first initializes the target box and the result arrays:
def run_vital(img_list, init_bbox, gt=None, savefig_dir='', display=False):
# Init bbox
target_bbox = np.array(init_bbox)
result = np.zeros((len(img_list), 4))
result_bb = np.zeros((len(img_list), 4))
result[0] = target_bbox
result_bb[0] = target_bbox
if gt is not None:
overlap = np.zeros(len(img_list))
overlap[0] = 1
Next comes the model initialization:
model = MDNet(opts['model_path'])
model_g = NetG()
if opts['use_gpu']:
model = model.cuda()
model_g = model_g.cuda()
Here MDNet is loaded from a pretrained model, while the G network is freshly initialized. Let's look at the MDNet source, with comments added:
class MDNet(nn.Module):
def __init__(self, model_path=None, K=1):
super(MDNet, self).__init__()
self.K = K # K is the number of branches, i.e. how many videos are trained jointly (one fc6 branch per video)
#build the shared backbone
self.layers = nn.Sequential(OrderedDict([
('conv1', nn.Sequential(nn.Conv2d(3, 96, kernel_size=7, stride=2),
nn.ReLU(inplace=True),
nn.LocalResponseNorm(2),
nn.MaxPool2d(kernel_size=3, stride=2))),
('conv2', nn.Sequential(nn.Conv2d(96, 256, kernel_size=5, stride=2),
nn.ReLU(inplace=True),
nn.LocalResponseNorm(2),
nn.MaxPool2d(kernel_size=3, stride=2))),
('conv3', nn.Sequential(nn.Conv2d(256, 512, kernel_size=3, stride=1),
nn.ReLU(inplace=True))),
('fc4', nn.Sequential(nn.Linear(512 * 3 * 3, 512),
nn.ReLU(inplace=True))),
('fc5', nn.Sequential(nn.Dropout(0.5),
nn.Linear(512, 512),
nn.ReLU(inplace=True)))]))
# after fc5, build K branch fc layers; each video gets its own fully connected branch
self.branches = nn.ModuleList([nn.Sequential(nn.Dropout(0.5),
nn.Linear(512, 2)) for _ in range(K)])
for m in self.layers.modules():
if isinstance(m, nn.Linear):
nn.init.normal_(m.weight, 0, 0.01)
nn.init.constant_(m.bias, 0.1)
for m in self.branches.modules():
if isinstance(m, nn.Linear):
nn.init.normal_(m.weight, 0, 0.01)
nn.init.constant_(m.bias, 0)
# if pretrained weights are given, load them directly
if model_path is not None:
if os.path.splitext(model_path)[1] == '.pth':
self.load_model(model_path)
elif os.path.splitext(model_path)[1] == '.mat':
self.load_mat_model(model_path)
else:
raise RuntimeError('Unkown model format: {:s}'.format(model_path))
self.build_param_dict()
def build_param_dict(self):
self.params = OrderedDict()
for name, module in self.layers.named_children():
# name is the layer's name, e.g. conv1, fc4
# module is that layer's structure, e.g.:
# Sequential(
# (0): Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
# (1): ReLU()
# (2): LRN()
# (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
# )
append_params(self.params, module, name)
for k, module in enumerate(self.branches):
# k is the branch index (0..K-1)
# module as above
append_params(self.params, module, 'fc6_{:d}'.format(k))
def append_params(params, module, prefix):
# params is the parameter dict
# module is the submodule to scan
# prefix is the layer name, e.g. conv1, fc4, fc6_0
for child in module.children(): # iterate over the module's children
for k,p in child._parameters.items():
if p is None: continue
if isinstance(child, nn.BatchNorm2d):
name = prefix + '_bn_' + k
else:
name = prefix + '_' + k
if name not in params:
params[name] = p
else:
raise RuntimeError('Duplicated param name: {:s}'.format(name))
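As a quick sanity check of what build_param_dict/append_params produce, the keys simply follow the prefix rule above; a minimal sketch (shapes follow from the layer definitions):
model = MDNet()   # no model_path: randomly initialized weights
for name, p in model.params.items():
    print(name, tuple(p.shape))
# conv1_weight (96, 3, 7, 7)    conv1_bias (96,)
# conv2_weight (256, 96, 5, 5)  conv2_bias (256,)
# conv3_weight (512, 256, 3, 3) conv3_bias (512,)
# fc4_weight (512, 4608)        fc4_bias (512,)
# fc5_weight (512, 512)         fc5_bias (512,)
# fc6_0_weight (2, 512)         fc6_0_bias (2,)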
Now back to the run_tracker part:
# Init criterion and optimizer
criterion = BCELoss()
criterion_g = torch.nn.MSELoss(reduction='mean')
model.set_learnable_params(opts['ft_layers'])
model_g.set_learnable_params(opts['ft_layers'])
init_optimizer = set_optimizer(model, opts['lr_init'], opts['lr_mult'])
update_optimizer = set_optimizer(model, opts['lr_update'], opts['lr_mult'])
BCELoss is the loss function newly proposed by the authors; see Section 3.2 of the paper for details:
class BCELoss(nn.Module):
def forward(self, pos_score, neg_score, average=True):
pos_loss = -F.log_softmax(pos_score, dim=1)[:, 1]
pos_loss_p = (torch.ones(pos_loss.size()).cuda() - F.softmax(pos_score, dim=1)[:,1]) * pos_loss
neg_loss = -F.log_softmax(neg_score, dim=1)[:, 0]
neg_loss_p = (torch.ones(neg_loss.size()).cuda() - F.softmax(neg_score, dim=1)[:,0]) * neg_loss
loss = pos_loss_p.sum() + neg_loss_p.sum()
# loss = pos_loss.sum() + neg_loss.sum()
if average:
# loss /= (pos_loss.size(0) + neg_loss.size(0))
loss /= (pos_loss_p.size(0) + neg_loss_p.size(0))
return loss
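To see what the extra (1 - softmax) factor does, here is a small standalone check (the logits are made up): an easy positive contributes much less to the loss than a hard one, which down-weights easy examples, similar in spirit to a focal loss.
import torch
import torch.nn.functional as F
pos_score = torch.tensor([[0.0, 3.0],    # easy positive, p ~ 0.95
                          [0.0, 0.2]])   # hard positive, p ~ 0.55
p = F.softmax(pos_score, dim=1)[:, 1]
ce = -F.log_softmax(pos_score, dim=1)[:, 1]   # plain cross-entropy terms
print(ce)            # ~[0.049, 0.598]
print((1 - p) * ce)  # ~[0.002, 0.269]: the easy positive is heavily down-weighted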
Next the code sets up the optimizer and the trainable parameters. This deserves emphasis: during tracking the network does not update the convolutional layers. Checking opts['ft_layers'] shows ft_layers: ['fc'], so let's look at the set_learnable_params function:
def set_learnable_params(self, layers):
for k, p in self.params.items():
if any([k.startswith(l) for l in layers]):
p.requires_grad = True
else:
p.requires_grad = False
def get_learnable_params(self):
params = OrderedDict()
for k, p in self.params.items():
if p.requires_grad:
params[k] = p
return params
In short, only the fc layers are set to be updated.
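A quick way to confirm this, continuing the parameter listing above (a sketch):
model.set_learnable_params(['fc'])   # opts['ft_layers']
print(list(model.get_learnable_params().keys()))
# ['fc4_weight', 'fc4_bias', 'fc5_weight', 'fc5_bias', 'fc6_0_weight', 'fc6_0_bias']
# the conv1/conv2/conv3 parameters keep requires_grad = False and are never updated online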
Next, the optimizer setup:
def set_optimizer(model, lr_base, lr_mult=opts['lr_mult'], momentum=opts['momentum'], w_decay=opts['w_decay']):
params = model.get_learnable_params() # dict of all parameters that require gradients
param_list = []
for k, p in params.items():
lr = lr_base # base learning rate, 0.0001
for l, m in lr_mult.items():
if k.startswith(l):
lr = lr_base * m # e.g. the fc6 layer gets 0.001
param_list.append({'params': [p], 'lr': lr})
# train with stochastic gradient descent
optimizer = optim.SGD(param_list, lr=lr, momentum=momentum, weight_decay=w_decay)
return optimizer
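So the returned SGD optimizer holds one parameter group per learnable tensor, each with its own learning rate. A short check, assuming lr_mult = {'fc6': 10} as implied by the comments above:
init_optimizer = set_optimizer(model, 0.0001, {'fc6': 10})
for group in init_optimizer.param_groups:
    print(group['lr'])
# 0.0001 for the fc4/fc5 groups, 0.001 for the fc6_0 groups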
Good, that's all clear. Back to run_tracker: first open the first frame, then draw positive and negative samples around the bbox:
tic = time.time()
# Load first image
image = Image.open(img_list[0]).convert('RGB')
# Draw pos/neg samples
pos_examples = SampleGenerator('gaussian', image.size, opts['trans_pos'], opts['scale_pos'])(
target_bbox, opts['n_pos_init'], opts['overlap_pos_init'])
neg_examples = np.concatenate([
SampleGenerator('uniform', image.size, opts['trans_neg_init'], opts['scale_neg_init'])(
target_bbox, int(opts['n_neg_init'] * 0.5), opts['overlap_neg_init']),
SampleGenerator('whole', image.size)(
target_bbox, int(opts['n_neg_init'] * 0.5), opts['overlap_neg_init'])])
neg_examples = np.random.permutation(neg_examples)
Another helper to look at: SampleGenerator is a class, initialized as:
class SampleGenerator():
def __init__(self, type_, img_size, trans=1, scale=1, aspect=None, valid=False):
self.type = type_ # which distribution to draw boxes from: 'gaussian', 'uniform' or 'whole'
self.img_size = np.array(img_size) # (w, h)
self.trans = trans
self.scale = scale
self.aspect = aspect
self.valid = valid
The call we just made has the form:
pos_examples = SampleGenerator('gaussian', image.size, opts['trans_pos'], opts['scale_pos'])(target_bbox, opts['n_pos_init'], opts['overlap_pos_init'])
Looking up these option values and substituting them, the call becomes:
pos_examples = SampleGenerator('gaussian', image.size, 0.1, 1.3)(target_bbox, 500, [0.7,1])
Now the function below: overlap_range=[0.7, 1] is an IoU range, and scale_range is a scale range; see the comments:
def __call__(self, bbox, n, overlap_range=None, scale_range=None):
if overlap_range is None and scale_range is None:
return self._gen_samples(bbox, n)
else:
samples = None
remain = n # we still need to generate n candidate boxes
factor = 2
while remain > 0 and factor < 16:
samples_ = self._gen_samples(bbox, remain * factor) # generate remain * factor candidate boxes
idx = np.ones(len(samples_), dtype=bool)
if overlap_range is not None: # keep only candidates whose IoU with bbox lies in the range
r = overlap_ratio(samples_, bbox)
idx *= (r >= overlap_range[0]) * (r <= overlap_range[1])
if scale_range is not None:
s = np.prod(samples_[:, 2:], axis=1) / np.prod(bbox[2:])
idx *= (s >= scale_range[0]) * (s <= scale_range[1])
samples_ = samples_[idx, :] # keep the candidates that pass the filters
samples_ = samples_[:min(remain, len(samples_))]
if samples is None:
samples = samples_
else:
samples = np.concatenate([samples, samples_])
remain = n - len(samples)
factor = factor * 2
return samples
Here is the _gen_samples function that was just called:
def _gen_samples(self, bb, n):
#generate n samples around bb
# bb: target bbox (min_x,min_y,w,h)
bb = np.array(bb, dtype='float32')
#e.g. suppose bbox = [1, 2, 10, 5]
# (center_x, center_y, w, h)
sample = np.array([bb[0] + bb[2] / 2, bb[1] + bb[3] / 2, bb[2], bb[3]], dtype='float32')
samples = np.tile(sample[None, :], (n, 1)) # repeat (center_x, center_y, w, h) n times
# vary the aspect ratio
if self.aspect is not None:
ratio = np.random.rand(n, 2) * 2 - 1 # uniform samples in [0, 1), mapped to [-1, 1)
samples[:, 2:] *= self.aspect ** ratio # w**ratio , h**(-ratio)
# sample generation
if self.type == 'gaussian':
samples[:, :2] += self.trans * np.mean(bb[2:]) * np.clip(0.5 * np.random.randn(n, 2), -1, 1)
samples[:, 2:] *= self.scale ** np.clip(0.5 * np.random.randn(n, 1), -1, 1)
elif self.type == 'uniform':
samples[:, :2] += self.trans * np.mean(bb[2:]) * (np.random.rand(n, 2) * 2 - 1)
samples[:, 2:] *= self.scale ** (np.random.rand(n, 1) * 2 - 1)
elif self.type == 'whole':
m = int(2 * np.sqrt(n))
xy = np.dstack(np.meshgrid(np.linspace(0, 1, m), np.linspace(0, 1, m))).reshape(-1, 2)
xy = np.random.permutation(xy)[:n]
samples[:, :2] = bb[2:] / 2 + xy * (self.img_size - bb[2:] / 2 - 1)
samples[:, 2:] *= self.scale ** (np.random.rand(n, 1) * 2 - 1)
# adjust bbox range
samples[:, 2:] = np.clip(samples[:, 2:], 10, self.img_size - 10)
if self.valid:
samples[:, :2] = np.clip(samples[:, :2], samples[:, 2:] / 2, self.img_size - samples[:, 2:] / 2 - 1)
else:
samples[:, :2] = np.clip(samples[:, :2], 0, self.img_size)
# (min_x, min_y, w, h)
samples[:, :2] -= samples[:, 2:] / 2
return samples
The function that computes the IoU between bounding boxes and the ground truth:
def overlap_ratio(rect1, rect2):
'''
Compute overlap ratio between two rects
- rect: 1d array of [x,y,w,h] or
2d array of N x [x,y,w,h]
'''
if rect1.ndim == 1:
rect1 = rect1[None, :]
if rect2.ndim == 1:
rect2 = rect2[None, :]
left = np.maximum(rect1[:, 0], rect2[:, 0])
right = np.minimum(rect1[:, 0] + rect1[:, 2], rect2[:, 0] + rect2[:, 2])
top = np.maximum(rect1[:, 1], rect2[:, 1])
bottom = np.minimum(rect1[:, 1] + rect1[:, 3], rect2[:, 1] + rect2[:, 3])
intersect = np.maximum(0, right - left) * np.maximum(0, bottom - top)
union = rect1[:, 2] * rect1[:, 3] + rect2[:, 2] * rect2[:, 3] - intersect
iou = np.clip(intersect / union, 0, 1)
return iou
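A quick numeric example:
r1 = np.array([0, 0, 10, 10])   # (x, y, w, h)
r2 = np.array([5, 5, 10, 10])
print(overlap_ratio(r1, r2))
# intersection = 5*5 = 25, union = 100 + 100 - 25 = 175, so IoU ~ 0.143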
OK, back to run_vital.py. This next step computes features for the sample boxes; in other words, the samples are passed through the network, but only up to the layer just before the fully connected part:
# Extract pos/neg features
pos_feats = forward_samples(model, image, pos_examples)
neg_feats = forward_samples(model, image, neg_examples)
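What comes back is one flattened conv3 feature vector per sample: 512 channels on a 3x3 map, i.e. 512*3*3 = 4608 dimensions, which is exactly the input size of fc4 (nn.Linear(512 * 3 * 3, 512)). A quick shape check, assuming the sampler delivered all 500 initial positives:
print(pos_feats.shape)   # torch.Size([500, 4608])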
Let's also take a look at forward_samples(); we already know what it does, here are the details:
def forward_samples(model, image, samples, out_layer='conv3'):
model.eval()
extractor = RegionExtractor(image, samples, opts)
for i, regions in enumerate(extractor):
if opts['use_gpu']:
regions = regions.cuda()
with torch.no_grad():
feat = model(regions, out_layer=out_layer)
if i==0:
feats = feat.detach().clone()
else:
feats = torch.cat((feats, feat.detach().clone()), 0)
return feats
In the function above, RegionExtractor is used to crop the corresponding regions out of the image.
class RegionExtractor():
def __init__(self, image, samples, opts):
self.image = np.asarray(image)
self.samples = samples
self.crop_size = opts['img_size']
self.padding = opts['padding']
self.batch_size = opts['batch_test']
self.index = np.arange(len(samples))
self.pointer = 0
def __iter__(self):
return self
def __next__(self):
if self.pointer == len(self.samples): # all sample regions have been consumed
self.pointer = 0
raise StopIteration
else:
next_pointer = min(self.pointer + self.batch_size, len(self.samples))
index = self.index[self.pointer:next_pointer]
self.pointer = next_pointer
regions = self.extract_regions(index) # crop the regions specified by samples
regions = torch.from_numpy(regions)
return regions
next = __next__
def extract_regions(self, index):
regions = np.zeros((len(index), self.crop_size, self.crop_size, 3), dtype='uint8')
for i, sample in enumerate(self.samples[index]): # crop each sample region from the image
regions[i] = crop_image2(self.image, sample, self.crop_size, self.padding)
regions = regions.transpose(0, 3, 1, 2)
regions = regions.astype('float32') - 128.
return regions
crop_image2 crops the bbox region out of the image and warps it into a 107*107 patch, converting from (min_x, min_y, w, h) to (center_x, center_y, w, h) internally:
def crop_image2(img, bbox, img_size=107, padding=16, flip=False, rotate_limit=0, blur_limit=0):
x, y, w, h = np.array(bbox, dtype='float32')
cx, cy = x + w/2, y + h/2
if padding > 0:
w += 2 * padding * w/img_size
h += 2 * padding * h/img_size
# List of transformation matrices
matrices = []
# Translation matrix to move patch center to origin
translation_matrix = np.asarray([[1, 0, -cx],
[0, 1, -cy],
[0, 0, 1]], dtype=np.float32)
matrices.append(translation_matrix)
# Scaling matrix according to image size
scaling_matrix = np.asarray([[img_size / w, 0, 0],
[0, img_size / h, 0],
[0, 0, 1]], dtype=np.float32)
matrices.append(scaling_matrix)
# Define flip matrix
if flip and np.random.binomial(1, 0.5):
flip_matrix = np.eye(3, dtype=np.float32)
flip_matrix[0, 0] = -1
matrices.append(flip_matrix)
# Define rotation matrix
if rotate_limit and np.random.binomial(1, 0.5):
angle = np.random.uniform(-rotate_limit, rotate_limit)
alpha = np.cos(np.deg2rad(angle))
beta = np.sin(np.deg2rad(angle))
rotation_matrix = np.asarray([[alpha, -beta, 0],
[beta, alpha, 0],
[0, 0, 1]], dtype=np.float32)
matrices.append(rotation_matrix)
# Translation matrix to move patch center from origin
revert_t_matrix = np.asarray([[1, 0, img_size / 2],
[0, 1, img_size / 2],
[0, 0, 1]], dtype=np.float32)
matrices.append(revert_t_matrix)
# Aggregate all transformation matrices
matrix = np.eye(3)
for m_ in matrices:
matrix = np.matmul(m_, matrix)
# Warp image, padded value is set to 128
patch = cv2.warpPerspective(img,
matrix,
(img_size, img_size),
borderValue=128)
if blur_limit and np.random.binomial(1, 0.5):
blur_size = np.random.choice(np.arange(1, blur_limit + 1, 2))
patch = cv2.GaussianBlur(patch, (blur_size, blur_size), 0)
return patch
Moving on:
# Initial training
train(model, None, criterion, init_optimizer, pos_feats, neg_feats, opts['maxiter_init'])
del init_optimizer, neg_feats
torch.cuda.empty_cache()
g_pretrain(model, model_g, criterion_g, pos_feats)
torch.cuda.empty_cache()
This is the training step. Note that the framework from the paper is visible here: first update D, then update G. Next, let's analyze train and g_pretrain separately (in g_pretrain.py, t is the torch module):
def g_pretrain(model, model_g, criterion_g, pos_data):
# Evaluate mask
n = pos_data.size(0)
if n % opts['batch_gnet'] == 0:
nBatches = n // opts['batch_gnet']
else:
nBatches = n // opts['batch_gnet'] + 1
print(nBatches)
pos_data = pos_data.view(n, 512, 3, 3)
prob = t.zeros(n)
prob_k = t.zeros(9)
for k in range(9):
row = k // 3
col = k % 3
model.eval()
for i in range(nBatches):
batch = pos_data[opts['batch_gnet'] * i:min(n, opts['batch_gnet'] * (i + 1)), :, :, :]
batch[:, :, col, row] = 0
batch = batch.view(batch.size(0), -1)
if opts['use_gpu']:
batch = batch.cuda()
score = model(batch, in_layer='fc4', out_layer='fc6_softmax')[:, 1]
prob[opts['batch_gnet'] * i:min(n, opts['batch_gnet'] * (i + 1))] = score
model.train()
prob_k[k] = prob.sum()
_, idx = t.min(prob_k, 0) # find the mask position that hurts the positive score the most
idx = idx.item()
row = idx // 3
col = idx % 3
batch_pos = opts['batch_pos']
maxiter_g = opts['maxiter_g']
pos_idx = np.random.permutation(pos_data.size(0))
while len(pos_idx) < batch_pos * maxiter_g:
pos_idx = np.concatenate([pos_idx, np.random.permutation(pos_data.size(0))])
pos_pointer = 0
objective = t.zeros(maxiter_g)
optimizer = set_optimizer_g(model_g)
for iter in range(maxiter_g):
start = time.time()
# select pos idx
pos_next = pos_pointer + batch_pos
pos_cur_idx = pos_idx[pos_pointer:pos_next]
pos_cur_idx = pos_data.new(pos_cur_idx).long()
pos_pointer = pos_next
labels = t.ones(batch_pos, 1, 3, 3)
labels[:, :, col, row] = 0
batch_pos_data = pos_data.index_select(0, pos_cur_idx)
batch_pos_data = batch_pos_data.view(batch_pos_data.size(0), -1)
res = model_g(batch_pos_data)
labels = labels.view(batch_pos, -1)
loss_g = criterion_g(res.float(), labels.cuda().float())
model_g.zero_grad()
loss_g.backward()
optimizer.step()
objective[iter] = loss_g
end = time.time()
print('asdn objective %.3f, %.2f s' % (objective.mean(), end - start))
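The G network itself (gnet/g_init.py) is not quoted in this post. From how it is used here — it takes a flattened 512*3*3 feature vector, outputs 9 values that are reshaped to a 3x3 mask, and is trained with MSE against a 0/1 target mask — a minimal sketch could look like the following; the real NetG may differ in depth, activations and helper methods such as set_learnable_params:
import torch.nn as nn

class NetGSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(512 * 3 * 3, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 9),
            nn.Sigmoid(),            # mask values in [0, 1]
        )

    def forward(self, x):            # x: (N, 4608) flattened conv3 features
        return self.fc(x)            # (N, 9), later viewed as (N, 3, 3)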
Then the train function:
def train(model, model_g, criterion, optimizer, pos_feats, neg_feats, maxiter, in_layer='fc4'):
model.train()
batch_pos = opts['batch_pos'] # 32
batch_neg = opts['batch_neg'] # 96
batch_test = opts['batch_test'] # 256
batch_neg_cand = max(opts['batch_neg_cand'], batch_neg) # 1024 candidates generated for hard negative mining, from which 96 are kept
In every iteration, positive and negative samples are drawn at random for training:
pos_idx = np.random.permutation(pos_feats.size(0)) # shuffle the positive sample indices
neg_idx = np.random.permutation(neg_feats.size(0)) # shuffle the negative sample indices
while(len(pos_idx) < batch_pos * maxiter): # 32 positives are drawn per iteration
pos_idx = np.concatenate([pos_idx, np.random.permutation(pos_feats.size(0))])
while(len(neg_idx) < batch_neg_cand * maxiter): # 1024 negative candidates are drawn per iteration (96 kept after mining)
neg_idx = np.concatenate([neg_idx, np.random.permutation(neg_feats.size(0))])
pos_pointer = 0 # pointer into the positive indices
neg_pointer = 0 # pointer into the negative indices
Train for maxiter (50) iterations:
for i in range(maxiter):
# select positive sample indices
pos_next = pos_pointer + batch_pos
pos_cur_idx = pos_idx[pos_pointer:pos_next]
pos_cur_idx = pos_feats.new(pos_cur_idx).long()
pos_pointer = pos_next
# select neg idx
neg_next = neg_pointer + batch_neg_cand
neg_cur_idx = neg_idx[neg_pointer:neg_next]
neg_cur_idx = neg_feats.new(neg_cur_idx).long()
neg_pointer = neg_next
# build the mini-batch
batch_pos_feats = pos_feats[pos_cur_idx]
if model_g is not None:
batch_asdn_feats = pos_feats.index_select(0, pos_cur_idx)
batch_neg_feats = neg_feats[neg_cur_idx]
# hard negative mining
if batch_neg_cand > batch_neg:
model.eval()
for start in range(0, batch_neg_cand, batch_test):
end = min(start + batch_test, batch_neg_cand)
with torch.no_grad():
score = model(batch_neg_feats[start:end], in_layer=in_layer)
if start==0:
neg_cand_score = score.detach()[:, 1].clone()
else:
neg_cand_score = torch.cat((neg_cand_score, score.detach()[:, 1].clone()), 0)
_, top_idx = neg_cand_score.topk(batch_neg) # keep the 96 highest-scoring negatives, i.e. the hard negatives
batch_neg_feats = batch_neg_feats[top_idx]
model.train()
if model_g is not None:
model_g.eval()
res_asdn = model_g(batch_asdn_feats) # pass through the G network; res_asdn is a 2-D tensor
model_g.train()
num = res_asdn.size(0) # num = batch size
mask_asdn = torch.ones(num, 512, 3, 3) # the mask has shape (num, 512, 3, 3)
res_asdn = res_asdn.view(num, 3, 3) # reshape the G output to (num, 3, 3)
for i in range(num): # loop over the batch
feat_ = res_asdn[i, :, :] # a 3x3 map
featlist = feat_.view(1, 9).squeeze() # flatten to a 1-D vector of 9 values
feat_list = featlist.detach().cpu().numpy()
idlist = feat_list.argsort() # sort indices by value
idxlist = idlist[:3] # take the three smallest entries
for k in range(len(idxlist)):
idx = idxlist[k]
row = idx // 3
col = idx % 3
mask_asdn[:, :, col, row] = 0
mask_asdn = mask_asdn.view(mask_asdn.size(0), -1) # flatten the mask to (num, 4608)
if opts['use_gpu']:
batch_asdn_feats = batch_asdn_feats.cuda()
mask_asdn = mask_asdn.cuda()
batch_asdn_feats = batch_asdn_feats * mask_asdn # element-wise masking of the features
# forward pass: compute sample scores
if model_g is None:
pos_score = model(batch_pos_feats, in_layer=in_layer)
else:
pos_score = model(batch_asdn_feats, in_layer=in_layer)
neg_score = model(batch_neg_feats, in_layer=in_layer)
# optimize
loss = criterion(pos_score, neg_score)
model.zero_grad()
loss.backward()
if 'grad_clip' in opts:
torch.nn.utils.clip_grad_norm_(model.parameters(), opts['grad_clip'])
optimizer.step()
if model_g is not None:
start = time.time()
prob_k = torch.zeros(9)
for k in range(9):
row = k // 3
col = k % 3
model.eval()
batch = batch_pos_feats.view(batch_pos, 512, 3, 3)
batch[:, :, col, row] = 0
batch = batch.view(batch.size(0), -1)
if opts['use_gpu']:
batch = batch.cuda()
prob = model(batch, in_layer='fc4', out_layer='fc6_softmax')[:, 1]
model.train()
prob_k[k] = prob.sum()
_, idx = torch.min(prob_k, 0)
idx = idx.item()
row = idx // 3
col = idx % 3
optimizer_g = set_optimizer_g(model_g)
labels = torch.ones(batch_pos, 1, 3, 3)
labels[:, :, col, row] = 0
#train the G network
batch_pos_feats = batch_pos_feats.view(batch_pos_feats.size(0), -1)
res = model_g(batch_pos_feats)
labels = labels.view(batch_pos, -1)
criterion_g = torch.nn.MSELoss(reduction='mean')
loss_g_2 = criterion_g(res.float(), labels.cuda().float())
model_g.zero_grad()
loss_g_2.backward()
optimizer_g.step()
end = time.time()
print('asdn objective %.3f, %.2f s' % (loss_g_2, end - start))
OK, keep going, we're almost done:
# train the bounding box regressor
# generate 1000 bboxes that satisfy the overlap and scale constraints
bbreg_examples = gen_samples(SampleGenerator('uniform', image.size, 0.3, 1.5, 1.1),
target_bbox, opts['n_bbreg'], opts['overlap_bbreg'], opts['scale_bbreg'])
bbreg_feats = forward_samples(model, image, bbreg_examples) # conv3 features, already flattened into fc4's input format
bbreg = BBRegressor(image.size) # initialize the bounding box regressor
bbreg.train(bbreg_feats, bbreg_examples, target_bbox) # train the bounding box regressor
This is a part I did not examine very closely. Bounding box regression improves localization accuracy: given the first frame of the video, a simple linear ridge regression model is trained on conv3 features to predict the target's position. In subsequent frames, if the predicted target is reliable, bounding box regression can be applied to refine the final target location. The paper trains the bounding box regressor on the first frame only, because it is time-consuming and incremental learning is not necessarily helpful given its risk.
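bbreg.py is not reproduced here. A minimal sketch of such a ridge-regression bounding-box regressor is shown below; the class name, the R-CNN-style offset parametrization and the use of sklearn's Ridge are my assumptions, not necessarily what the repo's BBRegressor does exactly:
import numpy as np
from sklearn.linear_model import Ridge

class SimpleBBRegressor:
    def __init__(self, alpha=1000):
        self.model = Ridge(alpha=alpha)

    @staticmethod
    def _targets(boxes, gt):
        # boxes: (N, 4) in (min_x, min_y, w, h); gt: (4,) ground-truth box
        cx, cy = boxes[:, 0] + boxes[:, 2] / 2, boxes[:, 1] + boxes[:, 3] / 2
        gcx, gcy = gt[0] + gt[2] / 2, gt[1] + gt[3] / 2
        dx = (gcx - cx) / boxes[:, 2]          # normalized center offsets
        dy = (gcy - cy) / boxes[:, 3]
        dw = np.log(gt[2] / boxes[:, 2])       # log scale ratios
        dh = np.log(gt[3] / boxes[:, 3])
        return np.stack([dx, dy, dw, dh], axis=1)

    def train(self, feats, boxes, gt):
        # feats: (N, D) conv3 features of the sample boxes
        self.model.fit(feats, self._targets(boxes, np.asarray(gt, dtype=float)))

    def predict(self, feats, boxes):
        d = self.model.predict(feats)          # predicted (dx, dy, dw, dh)
        cx = boxes[:, 0] + boxes[:, 2] / 2 + boxes[:, 2] * d[:, 0]
        cy = boxes[:, 1] + boxes[:, 3] / 2 + boxes[:, 3] * d[:, 1]
        w = boxes[:, 2] * np.exp(d[:, 2])
        h = boxes[:, 3] * np.exp(d[:, 3])
        return np.stack([cx - w / 2, cy - h / 2, w, h], axis=1)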
# Init sample generators for update
sample_generator = SampleGenerator('gaussian', image.size, opts['trans'], opts['scale'])
pos_generator = SampleGenerator('gaussian', image.size, opts['trans_pos'], opts['scale_pos'])
neg_generator = SampleGenerator('uniform', image.size, opts['trans_neg'], opts['scale_neg'])
# Init pos/neg features for update
neg_examples = neg_generator(target_bbox, opts['n_neg_update'], opts['overlap_neg_init'])
neg_feats = forward_samples(model, image, neg_examples)
pos_feats_all = [pos_feats]
neg_feats_all = [neg_feats]
Next is the visualization part:
spf_total = time.time() - tic
# Display
savefig = savefig_dir != ''
if display or savefig:
dpi = 80.0
figsize = (image.size[0] / dpi, image.size[1] / dpi)
fig = plt.figure(frameon=False, figsize=figsize, dpi=dpi)
ax = plt.Axes(fig, [0., 0., 1., 1.])
ax.set_axis_off()
fig.add_axes(ax)
im = ax.imshow(image, aspect='auto')
if gt is not None:
gt_rect = plt.Rectangle(tuple(gt[0, :2]), gt[0, 2], gt[0, 3],
linewidth=3, edgecolor="#00ff00", zorder=1, fill=False)
ax.add_patch(gt_rect)
rect = plt.Rectangle(tuple(result_bb[0, :2]), result_bb[0, 2], result_bb[0, 3],
linewidth=3, edgecolor="#ff0000", zorder=1, fill=False)
ax.add_patch(rect)
if display:
plt.pause(.01)
plt.draw()
if savefig:
fig.savefig(os.path.join(savefig_dir, '0000.jpg'), dpi=dpi)
This part is unrelated to the paper itself, so I won't go into it; this write-up already feels long enough.
The main loop:
for i in range(1, len(img_list)): # track every frame
tic = time.time()
# Load image
image = Image.open(img_list[i]).convert('RGB') # load the frame
# Estimate target bbox
samples = sample_generator(target_bbox, opts['n_samples']) # draw 256 candidate regions for this frame
sample_scores = forward_samples(model, image, samples, out_layer='fc6')
# score the 256 candidate windows
top_scores, top_idx = sample_scores[:, 1].topk(5) # take the 5 highest-scoring candidates and their indices
top_idx = top_idx.cpu()
target_score = top_scores.mean() # the target score for this frame is the mean of the top-5 scores
target_bbox = samples[top_idx]
if top_idx.shape[0] > 1:
target_bbox = target_bbox.mean(axis=0) # average them; this target_bbox is used for the next frame
success = target_score > 0 # tracking succeeds when the score is positive
# on failure, expand the search region
if success:
sample_generator.set_trans(opts['trans'])
else:
sample_generator.expand_trans(opts['trans_limit'])
# Bbox regression: refine the bbox
if success:
bbreg_samples = samples[top_idx]
if top_idx.shape[0] == 1:
bbreg_samples = bbreg_samples[None,:]
bbreg_feats = forward_samples(model, image, bbreg_samples)
bbreg_samples = bbreg.predict(bbreg_feats, bbreg_samples)
bbreg_bbox = bbreg_samples.mean(axis=0)
else:
bbreg_bbox = target_bbox # on failure, fall back to the unrefined estimate
# Save result
result[i] = target_bbox
result_bb[i] = bbreg_bbox
# Data collect
if success:
# draw positive/negative samples
pos_examples = pos_generator(target_bbox, opts['n_pos_update'], opts['overlap_pos_update'])
pos_feats = forward_samples(model, image, pos_examples)
pos_feats_all.append(pos_feats)
if len(pos_feats_all) > opts['n_frames_long']:
del pos_feats_all[0]
neg_examples = neg_generator(target_bbox, opts['n_neg_update'], opts['overlap_neg_update'])
# extract features of the pos/neg samples
neg_feats = forward_samples(model, image, neg_examples)
neg_feats_all.append(neg_feats)
if len(neg_feats_all) > opts['n_frames_short']:
del neg_feats_all[0]
# Short term update: performed when tracking fails
if not success:
nframes = min(opts['n_frames_short'], len(pos_feats_all))
pos_data = torch.cat(pos_feats_all[-nframes:], 0)
neg_data = torch.cat(neg_feats_all, 0)
train(model, None, criterion, update_optimizer, pos_data, neg_data, opts['maxiter_update'])
# Long term update: performed every 10 frames
elif i % opts['long_interval'] == 0:
pos_data = torch.cat(pos_feats_all, 0)
neg_data = torch.cat(neg_feats_all, 0)
train(model, model_g, criterion, update_optimizer, pos_data, neg_data, opts['maxiter_update'])
torch.cuda.empty_cache()
spf = time.time() - tic
spf_total += spf
# Display
if display or savefig:
im.set_data(image)
if gt is not None:
gt_rect.set_xy(gt[i, :2])
gt_rect.set_width(gt[i, 2])
gt_rect.set_height(gt[i, 3])
rect.set_xy(result_bb[i, :2])
rect.set_width(result_bb[i, 2])
rect.set_height(result_bb[i, 3])
if display:
plt.pause(.01)
plt.draw()
if savefig:
fig.savefig(os.path.join(savefig_dir, '{:04d}.jpg'.format(i)), dpi=dpi)
The final part:
if gt is None:
print("Frame %d/%d, Score %.3f, Time %.3f" % (i, len(img_list), target_score, spf))
else:
print("Frame %d/%d, Overlap %.3f, Score %.3f, Time %.3f" %
(i, len(img_list), overlap_ratio(gt[i], result_bb[i])[0], target_score, spf))
fps = len(img_list) / spf_total
return result, result_bb, fps
OK, that finally wraps up this part.
As for the offline pretraining part, I couldn't bring myself to write it up in detail; there is nothing hard about it, it simply trains the earlier conv layers as well. Key point: why? Because during offline training, training the conv layers is enough to obtain a shared feature representation, while the later fully connected layers are learned separately per video.
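For reference, one offline pretraining step would then look roughly like this sketch; seq_loaders and the k argument that selects the fc6 branch in MDNet's forward pass are assumptions on my part, since the pretrain code is not covered in this post:
for k, seq_loader in enumerate(seq_loaders):      # one fc6_k branch per training video
    pos_regions, neg_regions = next(seq_loader)   # mini-batch of crops from that video
    pos_score = model(pos_regions, k=k)           # shared conv/fc4/fc5, branch-specific fc6_k
    neg_score = model(neg_regions, k=k)
    loss = criterion(pos_score, neg_score)        # the same BCELoss as in online training
    model.zero_grad()
    loss.backward()
    optimizer.step()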
It took me about a day to work through this code; I hope it helps both me and others. ^ ^
Python tidbits: