Single-Object Tracking with SiamMask: Tracking a Specific Vehicle, Part 1
Single-Object Tracking with SiamMask: Tracking a Specific Vehicle, Part 2
Single-Object Tracking with the Siamese Network Family: SiamFC, SiamRPN, one-shot tracking, one-shot learning, DaSiamRPN, SiamRPN++, SiamMask
Single-Object Tracking: Tracking Results
Single-Object Tracking: Dataset Processing
Single-Object Tracking: Model Construction
Single-Object Tracking: Model Training
Single-Object Tracking: Model Testing
Project overview:
Tracking a moving target has long been a challenging problem and remains an active research topic. With steadily improving hardware and the rapid progress of artificial intelligence, object tracking is becoming ever more important. It has many real-world applications, including traffic video surveillance, analysis of athletes during competition, intelligent human-computer interaction, and the design of tracking systems. Because trackers must cope with appearance changes, low image resolution, cluttered backgrounds, and similar difficulties, building a tracker with robust performance is essential.
Early tracking algorithms worked either by modeling the target or by tracking its features. The main approaches were:
1) Methods based on modeling the target: build a model of the target's appearance, then locate the target in subsequent frames. Examples include region matching, feature-point tracking, active-contour tracking, and optical flow. The most common is feature matching: extract the target's features, then find the most similar features in later frames to localize it. Commonly used features include SIFT, SURF, and Harris corners.
2) Search-based methods: as research progressed, it became clear that model-based methods process the entire image and are therefore too slow for real time. Adding a prediction step lets the tracker search only near the predicted position, which shrinks the search region; typical predictors are the Kalman filter and the particle filter. Another way to shrink the search region is the kernel (mean-shift style) approach: following the principle of steepest descent, the target template is shifted iteratively along the gradient direction until it converges to the optimal position.
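Before turning to deep learning, here is a minimal sketch of the prediction idea behind these search-based trackers: a constant-velocity Kalman filter built with OpenCV. The state layout, noise values, and detected centers below are illustrative assumptions, not part of this project:

import numpy as np
import cv2

# State (x, y, vx, vy), measurement (x, y): a constant-velocity model
kalman = cv2.KalmanFilter(4, 2)
kalman.transitionMatrix = np.array([[1, 0, 1, 0],
                                    [0, 1, 0, 1],
                                    [0, 0, 1, 0],
                                    [0, 0, 0, 1]], np.float32)
kalman.measurementMatrix = np.array([[1, 0, 0, 0],
                                     [0, 1, 0, 0]], np.float32)
kalman.processNoiseCov = np.eye(4, dtype=np.float32) * 1e-2

for cx, cy in [(100, 100), (104, 102), (108, 104)]:   # detected target centers
    prediction = kalman.predict()                      # search only near this prediction
    kalman.correct(np.array([[cx], [cy]], np.float32))
    print('predicted center:', prediction[:2].ravel())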
The approach we introduce here tracks the target with a deep-learning method. The model used is SiamMask:
The workflow of the network is shown below:
Select the target to track in the first frame, then track it through the video:
The project's code lives mainly in the following folders, shown below:
Where:
data: the required data, both training and test sets
utils: helper functions, including bounding-box utilities and the learning-rate scheduler
Datasets: data processing, augmentation, and training-set construction
models: model construction
experiments: the experiment configurations that tie the models together
tools: scripts for training and testing the models
Next, we walk through the project.
# --------------------------------------------------------
# SiamMask
# Licensed under The MIT License
# Written by Qiang Wang (wangqiang2015 at ia.ac.cn)
# Dataset processing
# --------------------------------------------------------
from __future__ import division
from torch.utils.data import Dataset
import numpy as np
import json
import random
import logging
from os.path import join
from utils.bbox_helper import *
from utils.anchors import Anchors
import math
import sys
pyv = sys.version[0]
import cv2
if pyv == '3':
    cv2.ocl.setUseOpenCL(False)
logger = logging.getLogger('global')
sample_random = random.Random()
sample_random.seed(123456)
class SubDataSet(object):
    def __init__(self, cfg):
        for string in ['root', 'anno']:
            if string not in cfg:
                raise Exception('SubDataSet need "{}"'.format(string))

        # Open the annotation file
        with open(cfg['anno']) as fin:
            logger.info("loading " + cfg['anno'])
            # Load the annotations, dropping degenerate boxes
            self.labels = self.filter_zero(json.load(fin), cfg)

            # Check whether a key is an integer (i.e. a frame id)
            def isint(x):
                try:
                    int(x)
                    return True
                except:
                    return False

            # add frames args into labels: attach the sorted frame ids to each track
            to_del = []
            # Iterate over the videos in the annotations
            for video in self.labels:
                # Iterate over the tracks in each video
                for track in self.labels[video]:
                    # Frames in which this track's target appears
                    frames = self.labels[video][track]
                    frames = list(map(int, filter(lambda x: isint(x), frames.keys())))
                    # Sort the frame ids
                    frames.sort()
                    # Store them back into the labels
                    self.labels[video][track]['frames'] = frames
                    # If a track has no frames at all, schedule it for deletion
                    if len(frames) <= 0:
                        logger.info("warning {}/{} has no frames.".format(video, track))
                        to_del.append((video, track))

            # delete tracks with no frames
            for video, track in to_del:
                del self.labels[video][track]

            # delete videos with no valid track
            to_del = []
            # Check whether each video still contains any track
            for video in self.labels:
                if len(self.labels[video]) <= 0:
                    logger.info("warning {} has no tracks".format(video))
                    to_del.append(video)
            # Remove videos without any tracked target
            for video in to_del:
                del self.labels[video]

            # Keep the list of videos that carry annotations
            self.videos = list(self.labels.keys())
            logger.info(cfg['anno'] + " loaded.")

        # default args
        self.root = "/"
        self.start = 0
        # Number of annotated videos
        self.num = len(self.labels)
        self.num_use = self.num
        self.frame_range = 100
        self.mark = "vid"
        # Image filename pattern
        self.path_format = "{}.{}.{}.jpg"
        # Mask filename pattern
        self.mask_format = "{}.{}.m.png"
        self.pick = []

        # input args
        self.__dict__.update(cfg)

        # Whether this sub-dataset provides masks
        self.has_mask = self.mark in ['coco', 'ytb_vos']
        self.num_use = int(self.num_use)

        # shuffle
        self.shuffle()
    def filter_zero(self, anno, cfg):
        """Filter out annotations whose bounding box has zero width or height."""
        name = cfg.get('mark', '')

        # Annotations that survive the filtering
        out = {}
        # Total number of boxes seen
        tot = 0
        # Number of valid (non-zero) boxes
        new = 0
        # Number of zero-sized boxes
        zero = 0

        # Iterate over the annotations
        for video, tracks in anno.items():
            new_tracks = {}
            # Iterate over the tracks
            for trk, frames in tracks.items():
                new_frames = {}
                # Iterate over the frames to get each bounding box
                for frm, bbox in frames.items():
                    tot += 1
                    # The bbox is corner- or size-encoded; recover its width and height
                    if len(bbox) == 4:
                        x1, y1, x2, y2 = bbox
                        w, h = x2 - x1, y2 - y1
                    else:
                        w, h = bbox
                    # Skip boxes with zero width or height
                    if w == 0 or h == 0:
                        logger.info('Error, {name} {video} {trk} {bbox}'.format(**locals()))
                        zero += 1
                        continue
                    new += 1
                    new_frames[frm] = bbox

                if len(new_frames) > 0:
                    new_tracks[trk] = new_frames

            # Keep only videos that still have valid tracks
            if len(new_tracks) > 0:
                out[video] = new_tracks

        return out
    def log(self):
        """Log this sub-dataset's basic information."""
        logger.info('SubDataSet {name} start-index {start} select [{select}/{num}] path {format}'.format(
            name=self.mark, start=self.start, select=self.num_use, num=self.num, format=self.path_format
        ))

    def shuffle(self):
        """Shuffle the sample indices."""
        # Index list to shuffle
        lists = list(range(self.start, self.start + self.num))

        m = 0
        pick = []
        # Shuffle in chunks until enough indices are collected
        while m < self.num_use:
            sample_random.shuffle(lists)
            pick += lists
            m += self.num

        # Keep only the first num_use shuffled indices
        self.pick = pick[:self.num_use]
        return self.pick
    def get_image_anno(self, video, track, frame):
        """Return the image path, its annotation, and the mask path."""
        frame = "{:06d}".format(frame)
        # Image path
        image_path = join(self.root, video, self.path_format.format(frame, track, 'x'))
        # Image annotation
        image_anno = self.labels[video][track][frame]
        # Mask path
        mask_path = join(self.root, video, self.mask_format.format(frame, track))
        return image_path, image_anno, mask_path

    def get_positive_pair(self, index):
        """Sample a positive template/search pair from one track."""
        # Look up the video and pick a random track
        video_name = self.videos[index]
        video = self.labels[video_name]
        track = random.choice(list(video.keys()))
        track_info = video[track]

        frames = track_info['frames']

        # No hard examples recorded for this track
        if 'hard' not in track_info:
            template_frame = random.randint(0, len(frames)-1)

            # Restrict the search frame to a window around the template
            left = max(template_frame - self.frame_range, 0)
            right = min(template_frame + self.frame_range, len(frames)-1) + 1
            search_range = frames[left:right]
            # Pick the template and search frames
            template_frame = frames[template_frame]
            search_frame = random.choice(search_range)
        else:
            # Sample the search frame from the hard examples
            search_frame = random.choice(track_info['hard'])
            # Window around the search frame
            left = max(search_frame - self.frame_range, 0)
            right = min(search_frame + self.frame_range, len(frames)-1) + 1  # python [left:right+1) = [left:right]
            template_range = frames[left:right]
            # Pick the template and search frames
            template_frame = random.choice(template_range)
            search_frame = frames[search_frame]

        # Return the template frame and the search frame
        return self.get_image_anno(video_name, track, template_frame), \
               self.get_image_anno(video_name, track, search_frame)

    def get_random_target(self, index=-1):
        """Sample an arbitrary target (used for negative pairs)."""
        if index == -1:
            index = random.randint(0, self.num-1)
        # Look up the video and pick a random track
        video_name = self.videos[index]
        video = self.labels[video_name]
        track = random.choice(list(video.keys()))
        track_info = video[track]
        # Pick a random frame from the track
        frames = track_info['frames']
        frame = random.choice(frames)
        # Return the image/annotation for that frame
        return self.get_image_anno(video_name, track, frame)
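A hedged sketch of how a SubDataSet could be configured; the paths mirror the COCO entries from the JSON config shown later, and the data must exist on disk for this to run:

cfg = {
    'root': '../../data/coco/crop511',
    'anno': '../../data/coco/train2017.json',
    'frame_range': 1,
    'mark': 'coco',
    'start': 0,
}
subset = SubDataSet(cfg)
subset.log()
image_path, image_anno, mask_path = subset.get_random_target()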
def crop_hwc(image, bbox, out_sz, padding=(0, 0, 0)):
    """
    Crop a region out of an image and warp it to out_sz x out_sz
    :param image: the image to crop
    :param bbox: the crop region as (x1, y1, x2, y2)
    :param out_sz: output size in pixels
    :param padding: border fill value
    :return: the cropped patch
    """
    # Affine mapping that sends the bbox corners onto the output square
    bbox = [float(x) for x in bbox]
    a = (out_sz-1) / (bbox[2]-bbox[0])
    b = (out_sz-1) / (bbox[3]-bbox[1])
    c = -a * bbox[0]
    d = -b * bbox[1]
    mapping = np.array([[a, 0, c],
                        [0, b, d]]).astype(float)
    # Affine warp of the bbox region onto the output square
    crop = cv2.warpAffine(image, mapping, (out_sz, out_sz), borderMode=cv2.BORDER_CONSTANT, borderValue=padding)
    return crop
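A minimal usage sketch for crop_hwc (dummy image, illustrative box; not from the repo):

img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
patch = crop_hwc(img, (100, 50, 300, 250), 127)   # crop (x1, y1, x2, y2) and warp to 127x127
print(patch.shape)   # (127, 127, 3)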
"""
图像增强
"""
class Augmentation:
def __init__(self, cfg):
# 默认参数 default args
self.shift = 0
# 尺度变化
self.scale = 0
self.blur = 0 # False
self.resize = False
self.rgbVar = np.array([[-0.55919361, 0.98062831, - 0.41940627],
[1.72091413, 0.19879334, - 1.82968581],
[4.64467907, 4.73710203, 4.88324118]], dtype=np.float32)
self.flip = 0
# # 本证向量
# self.eig_vec = np.array([
# [0.4009, 0.7192, -0.5675],
# [-0.8140, -0.0045, -0.5808],
# [0.4203, -0.6948, -0.5836],
# ], dtype=np.float32)
# # 本征值
# self.eig_val = np.array([[0.2175, 0.0188, 0.0045]], np.float32)
self.__dict__.update(cfg)
    @staticmethod
    def random():
        return random.random() * 2 - 1.0

    # Image blur
    def blur_image(self, image):
        # Build a random blur kernel
        def rand_kernel():
            size = np.random.randn(1)
            size = int(np.round(size)) * 2 + 1
            if size < 0: return None
            if random.random() < 0.5: return None
            size = min(size, 45)
            kernel = np.zeros((size, size))
            c = int(size/2)
            # Random horizontal/vertical mixing weight
            wx = random.random()
            # Fill the kernel: one column and one row, weighted by wx
            kernel[:, c] += 1. / size * wx
            kernel[c, :] += 1. / size * (1-wx)
            return kernel

        kernel = rand_kernel()

        # Apply the blur if a kernel was generated
        if kernel is not None:
            image = cv2.filter2D(image, -1, kernel)
        return image
    def __call__(self, image, bbox, size, gray=False, mask=None):
        # Grayscale augmentation
        if gray:
            # Convert the color image to grayscale
            grayed = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            # Rebuild a 3-channel image whose channels all hold the gray values
            image = np.zeros((grayed.shape[0], grayed.shape[1], 3), np.uint8)
            image[:, :, 0] = image[:, :, 1] = image[:, :, 2] = grayed

        # Image size
        shape = image.shape
        # Generate a crop box at the image center
        crop_bbox = center2corner((shape[0]//2, shape[1]//2, size-1, size-1))

        param = {}
        # Random shift
        if self.shift:
            param['shift'] = (Augmentation.random() * self.shift, Augmentation.random() * self.shift)
        # Random scale change
        if self.scale:
            param['scale'] = ((1.0 + Augmentation.random() * self.scale), (1.0 + Augmentation.random() * self.scale))

        # Shift and scale the crop box accordingly
        crop_bbox, _ = aug_apply(Corner(*crop_bbox), param, shape)

        # Top-left corner of the crop box
        x1 = crop_bbox.x1
        y1 = crop_bbox.y1
        # Express the target bbox in crop coordinates
        bbox = BBox(bbox.x1 - x1, bbox.y1 - y1,
                    bbox.x2 - x1, bbox.y2 - y1)

        # Scale the bbox as well
        if self.scale:
            scale_x, scale_y = param['scale']
            bbox = Corner(bbox.x1 / scale_x, bbox.y1 / scale_y, bbox.x2 / scale_x, bbox.y2 / scale_y)

        # Crop the image
        image = crop_hwc(image, crop_bbox, size)
        if not mask is None:
            # Crop the mask the same way
            mask = crop_hwc(mask, crop_bbox, size)

        # Random color offset
        offset = np.dot(self.rgbVar, np.random.randn(3, 1))
        offset = offset[::-1]  # bgr 2 rgb
        offset = offset.reshape(3)
        # Subtract the offset from the image
        image = image - offset

        # Random blur
        if self.blur > random.random():
            image = self.blur_image(image)

        # Random down/up resize
        if self.resize:
            imageSize = image.shape[:2]
            # Resize ratio
            ratio = max(math.pow(random.random(), 0.5), 0.2)  # 25 ~ 255
            rand_size = (int(round(ratio*imageSize[0])), int(round(ratio*imageSize[1])))
            # Downscale...
            image = cv2.resize(image, rand_size)
            # ...then scale back to the original size
            image = cv2.resize(image, tuple(imageSize))

        # Random horizontal flip
        if self.flip and self.flip > Augmentation.random():
            # Flip the image
            image = cv2.flip(image, 1)
            # Flip the mask
            mask = cv2.flip(mask, 1)
            width = image.shape[1]
            # Flip the bbox as well
            bbox = Corner(width - 1 - bbox.x2, bbox.y1, width - 1 - bbox.x1, bbox.y2)

        return image, bbox, mask
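A hedged usage sketch for this augmentation. It assumes it runs inside this module, where Corner comes from utils.bbox_helper via the star import; the image and box are dummy data, and the shift/scale/blur values are taken from the search-branch settings in the JSON config below:

aug = Augmentation({'shift': 64, 'scale': 0.18, 'blur': 0.18})
img = np.random.randint(0, 255, (511, 511, 3), dtype=np.uint8)
box = Corner(200.0, 200.0, 310.0, 310.0)
mask = np.zeros(img.shape[:2], dtype=np.float32)
out_img, out_box, out_mask = aug(img, box, size=255, mask=mask)
print(out_img.shape, out_box)   # (255, 255, 3) plus the shifted/scaled box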
class AnchorTargetLayer:
    # Build the anchor targets: class labels, RPN regression targets, and IoU
    def __init__(self, cfg):
        # IoU thresholds
        self.thr_high = 0.6
        self.thr_low = 0.3
        # Positive/negative sample budget
        self.negative = 16
        self.rpn_batch = 64
        self.positive = 16

        self.__dict__.update(cfg)

    def __call__(self, anchor, target, size, neg=False, need_iou=False):
        # Number of anchor shapes
        anchor_num = anchor.anchors.shape[0]

        # Anchor class labels
        cls = np.zeros((anchor_num, size, size), dtype=np.int64)
        cls[...] = -1  # -1 ignore 0 negative 1 positive
        # Regression targets the RPN should predict
        delta = np.zeros((4, anchor_num, size, size), dtype=np.float32)
        delta_weight = np.zeros((anchor_num, size, size), dtype=np.float32)

        # Randomly keep at most keep_num of the given positions
        def select(position, keep_num=16):
            num = position[0].shape[0]
            if num <= keep_num:
                return position, num
            slt = np.arange(num)
            np.random.shuffle(slt)
            slt = slt[:keep_num]
            return tuple(p[slt] for p in position), keep_num

        # Negative pair: sample negatives only near the center
        if neg:
            # Bounds of the central window
            l = size // 2 - 3
            r = size // 2 + 3 + 1
            # Mark the central window as negative
            cls[:, l:r, l:r] = 0
            # Subsample the negatives
            neg, neg_num = select(np.where(cls == 0), self.negative)
            cls[:] = -1
            cls[neg] = 0

            # Return the targets
            if not need_iou:
                # class labels, RPN regression targets, regression weights
                return cls, delta, delta_weight
            else:
                overlap = np.zeros((anchor_num, size, size), dtype=np.float32)
                return cls, delta, delta_weight, overlap

        # Convert the target box from corner to center form
        tcx, tcy, tw, th = corner2center(target)

        # Anchor coordinates
        anchor_box = anchor.all_anchors[0]
        anchor_center = anchor.all_anchors[1]
        x1, y1, x2, y2 = anchor_box[0], anchor_box[1], anchor_box[2], anchor_box[3]
        cx, cy, w, h = anchor_center[0], anchor_center[1], anchor_center[2], anchor_center[3]

        # Regression targets for the RPN
        delta[0] = (tcx - cx) / w
        delta[1] = (tcy - cy) / h
        delta[2] = np.log(tw / w)
        delta[3] = np.log(th / h)

        # IoU (intersection over union) between the anchors and the target
        overlap = IoU([x1, y1, x2, y2], target)

        # Positive/negative anchors by IoU threshold
        pos = np.where(overlap > self.thr_high)
        neg = np.where(overlap < self.thr_low)

        # Subsample positives and negatives
        pos, pos_num = select(pos, self.positive)
        neg, neg_num = select(neg, self.rpn_batch - pos_num)

        # Positives get class 1
        cls[pos] = 1
        # Regression weights, normalized over the positives
        delta_weight[pos] = 1. / (pos_num + 1e-6)

        # Negatives get class 0
        cls[neg] = 0

        # Return the targets
        if not need_iou:
            # class labels, RPN regression targets, regression weights
            return cls, delta, delta_weight
        else:
            return cls, delta, delta_weight, overlap
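To make the box encoding above concrete, here is a small self-contained sketch (the helper names encode/decode are illustrative, not from the repo) showing that the (dx, dy, dw, dh) parameterization round-trips:

import numpy as np

def encode(target, anchor):
    # target and anchor as (cx, cy, w, h), matching delta[0..3] above
    tcx, tcy, tw, th = target
    cx, cy, w, h = anchor
    return np.array([(tcx - cx) / w, (tcy - cy) / h, np.log(tw / w), np.log(th / h)])

def decode(delta, anchor):
    cx, cy, w, h = anchor
    return np.array([delta[0] * w + cx, delta[1] * h + cy, w * np.exp(delta[2]), h * np.exp(delta[3])])

anchor = (64.0, 64.0, 32.0, 32.0)
target = (70.0, 60.0, 40.0, 24.0)
print(decode(encode(target, anchor), anchor))   # -> [70. 60. 40. 24.]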
"""
获取数据集
"""
class DataSets(Dataset):
def __init__(self, cfg, anchor_cfg, num_epoch=1):
super(DataSets, self).__init__()
global logger
logger = logging.getLogger('global')
# anchors
self.anchors = Anchors(anchor_cfg)
# size
self.template_size = 127
self.origin_size = 127
self.search_size = 255
self.size = 17
self.base_size = 0
self.crop_size = 0
# 根据配置文件更新参数
if 'template_size' in cfg:
self.template_size = cfg['template_size']
if 'origin_size' in cfg:
self.origin_size = cfg['origin_size']
if 'search_size' in cfg:
self.search_size = cfg['search_size']
if 'base_size' in cfg:
self.base_size = cfg['base_size']
if 'size' in cfg:
self.size = cfg['size']
if (self.search_size - self.template_size) / self.anchors.stride + 1 + self.base_size != self.size:
raise Exception("size not match!") # TODO: calculate size online
if 'crop_size' in cfg:
self.crop_size = cfg['crop_size']
self.template_small = False
if 'template_small' in cfg and cfg['template_small']:
self.template_small = True
# 生成anchor
self.anchors.generate_all_anchors(im_c=self.search_size//2, size=self.size)
if 'anchor_target' not in cfg:
cfg['anchor_target'] = {}
# 生成anchor的信息:cls,reg,mask
self.anchor_target = AnchorTargetLayer(cfg['anchor_target'])
# data sets
if 'datasets' not in cfg:
raise(Exception('DataSet need "{}"'.format('datasets')))
self.all_data = []
start = 0
self.num = 0
for name in cfg['datasets']:
dataset = cfg['datasets'][name]
dataset['mark'] = name
dataset['start'] = start
# 加载数据
dataset = SubDataSet(dataset)
dataset.log()
self.all_data.append(dataset)
# 数据数量
start += dataset.num # real video number
# 打乱的数据数量
self.num += dataset.num_use # the number used for subset shuffle
# 数据增强data augmentation
aug_cfg = cfg['augmentation']
self.template_aug = Augmentation(aug_cfg['template'])
self.search_aug = Augmentation(aug_cfg['search'])
self.gray = aug_cfg['gray']
self.neg = aug_cfg['neg']
self.inner_neg = 0 if 'inner_neg' not in aug_cfg else aug_cfg['inner_neg']
self.pick = None # list to save id for each img
if 'num' in cfg: # number used in training for all dataset
self.num = int(cfg['num'])
self.num *= num_epoch
self.shuffle()
self.infos = {
'template': self.template_size,
'search': self.search_size,
'template_small': self.template_small,
'gray': self.gray,
'neg': self.neg,
'inner_neg': self.inner_neg,
'crop_size': self.crop_size,
'anchor_target': self.anchor_target.__dict__,
'num': self.num // num_epoch
}
logger.info('dataset informations: \n{}'.format(json.dumps(self.infos, indent=4)))
    def imread(self, path):
        # Read an image from disk
        img = cv2.imread(path)

        if self.origin_size == self.template_size:
            # No rescaling needed
            return img, 1.0

        def map_size(exe, size):
            return int(round(((exe + 1) / (self.origin_size + 1) * (size+1) - 1)))

        # Target size
        nsize = map_size(self.template_size, img.shape[1])

        # Resize the image
        img = cv2.resize(img, (nsize, nsize))

        # Return the image and the scale factor
        return img, nsize / img.shape[1]
    def shuffle(self):
        """Shuffle across all sub-datasets."""
        pick = []
        m = 0
        # Collect shuffled indices until we have enough
        while m < self.num:
            p = []
            for subset in self.all_data:
                sub_p = subset.shuffle()
                p += sub_p

            # Shuffle across the sub-datasets
            sample_random.shuffle(p)

            # Append this round's indices
            pick += p
            m = len(pick)

        # Store the shuffled index list
        self.pick = pick
        logger.info("shuffle done!")
        logger.info("dataset length {}".format(self.num))

    def __len__(self):
        return self.num

    def find_dataset(self, index):
        """Map a global index to (sub-dataset, local index)."""
        for dataset in self.all_data:
            if dataset.start + dataset.num > index:
                # index falls inside this sub-dataset
                return dataset, index - dataset.start
    def __getitem__(self, index, debug=False):
        # Look up the shuffled index
        index = self.pick[index]
        # Find the sub-dataset it belongs to
        dataset, index = self.find_dataset(index)

        # Grayscale augmentation?
        gray = self.gray and self.gray > random.random()
        # Negative pair?
        neg = self.neg and self.neg > random.random()

        if neg:
            # Template from the indexed track
            template = dataset.get_random_target(index)
            # Depending on the config, draw the negative search image from the
            # same sub-dataset or from any sub-dataset
            if self.inner_neg and self.inner_neg > random.random():
                search = dataset.get_random_target()
            else:
                search = random.choice(self.all_data).get_random_target()
        else:
            # Positive template/search pair
            template, search = dataset.get_positive_pair(index)

        # Crop the central size x size patch of an image
        def center_crop(img, size):
            # Image width
            shape = img.shape[1]
            # Already the right size: return as-is
            if shape == size: return img
            # Otherwise crop the central size x size region
            c = shape // 2
            l = c - size // 2
            r = c + size // 2 + 1
            return img[l:r, l:r]

        # Read the template image
        template_image, scale_z = self.imread(template[0])

        # With a small template, crop it out of the template image
        if self.template_small:
            template_image = center_crop(template_image, self.template_size)

        # Read the search image
        search_image, scale_x = self.imread(search[0])

        # A mask exists and this is not a negative pair
        if dataset.has_mask and not neg:
            # Read the mask
            search_mask = (cv2.imread(search[2], 0) > 0).astype(np.float32)
        else:
            # Use an all-zero array in place of the mask
            search_mask = np.zeros(search_image.shape[:2], dtype=np.float32)

        # If a crop size is configured, crop the search image and its mask
        if self.crop_size > 0:
            search_image = center_crop(search_image, self.crop_size)
            search_mask = center_crop(search_mask, self.crop_size)

        # Build a bbox from the image size; shape is the annotated box
        def toBBox(image, shape):
            # Image size
            imh, imw = image.shape[:2]
            # Width/height of the annotated box
            if len(shape) == 4:
                w, h = shape[2]-shape[0], shape[3]-shape[1]
            else:
                w, h = shape
            # Context ratio
            context_amount = 0.5
            # Template size
            exemplar_size = self.template_size  # 127
            # Add context around the box
            wc_z = w + context_amount * (w+h)
            hc_z = h + context_amount * (w+h)
            # Equivalent square side length
            s_z = np.sqrt(wc_z * hc_z)
            # Scale factor
            scale_z = exemplar_size / s_z
            # Scaled width/height
            w = w*scale_z
            h = h*scale_z
            # Center coordinates
            cx, cy = imw//2, imh//2
            bbox = center2corner(Center(cx, cy, w, h))
            return bbox

        # Boxes in the template and search images
        template_box = toBBox(template_image, template[1])
        search_box = toBBox(search_image, search[1])

        # Augment the template
        template, _, _ = self.template_aug(template_image, template_box, self.template_size, gray=gray)
        # Augment the search image
        search, bbox, mask = self.search_aug(search_image, search_box, self.search_size, gray=gray, mask=search_mask)

        # def draw(image, box, name):
        #     image = image.copy()
        #     x1, y1, x2, y2 = map(lambda x: int(round(x)), box)
        #     cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0))
        #     cv2.imwrite(name, image)
        #
        # if debug:
        #     draw(template_image, template_box, "debug/{:06d}_ot.jpg".format(index))
        #     draw(search_image, search_box, "debug/{:06d}_os.jpg".format(index))
        #     draw(template, _, "debug/{:06d}_t.jpg".format(index))
        #     draw(search, bbox, "debug/{:06d}_s.jpg".format(index))

        # Build the anchor targets
        cls, delta, delta_weight = self.anchor_target(self.anchors, bbox, self.size, neg)

        if dataset.has_mask and not neg:
            # Mask loss weights
            mask_weight = cls.max(axis=0, keepdims=True)
        else:
            mask_weight = np.zeros([1, cls.shape[1], cls.shape[2]], dtype=np.float32)

        # HWC -> CHW for the template and search images
        template, search = map(lambda x: np.transpose(x, (2, 0, 1)).astype(np.float32), [template, search])

        # Binarize the mask to {-1, 1}
        mask = (np.expand_dims(mask, axis=0) > 0.5) * 2 - 1  # 1*H*W

        # Return all training targets
        return template, search, cls, delta, delta_weight, np.array(bbox, np.float32), \
               np.array(mask, np.float32), np.array(mask_weight, np.float32)
{
"network": {
"arch": "Custom"
},
"hp": {
"instance_size": 255,
"base_size": 8
},
"lr": {
"type": "log",
"start_lr": 0.005,
"end_lr": 0.0025,
"warmup": {
"start_lr": 0.001,
"end_lr": 0.005,
"type": "step",
"step": 1,
"epoch": 5
}
},
"loss": {
"weight": [1.0, 1.2, 36]
},
"train_datasets": {
"datasets": {
"coco": {
"root": "../../data/coco/crop511",
"anno": "../../data/coco/train2017.json",
"frame_range": 1
}
},
"template_size": 127,
"search_size": 255,
"base_size": 8,
"size": 25,
"num" : 6000,
"augmentation": {
"template": {
"shift": 4, "scale": 0.05
},
"search": {
"shift": 64, "scale": 0.18, "blur": 0.18
},
"neg": 0.2,
"gray": 0.25
}
},
"val_datasets": {
"datasets": {
"vid": {
"root": "../../data/coco/crop511",
"anno": "../../data/coco/val2017.json",
"num_use": 1000
}
},
"template_size": 127,
"search_size": 255,
"size": 17,
"num" : 10,
"augmentation": {
"template": {
"shift": 0, "scale": 0.00
},
"search": {
"shift": 12, "scale": 0.18
},
"neg": 0,
"gray": 0
}
},
"anchors": {
"stride": 8,
"ratios": [0.33, 0.5, 1, 2, 3],
"scales": [8],
"round_dight": 0
}
}
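For reference, a hedged sketch of how a config like the one above can drive the pipeline; 'config.json' is a placeholder path, and the data directories must exist:

import json
from torch.utils.data import DataLoader

with open('config.json') as f:
    cfg = json.load(f)

train_set = DataSets(cfg['train_datasets'], cfg['anchors'], num_epoch=1)
loader = DataLoader(train_set, batch_size=16, num_workers=4)

template, search, cls, delta, delta_weight, bbox, mask, mask_weight = next(iter(loader))
print(template.shape, search.shape)   # torch.Size([16, 3, 127, 127]) torch.Size([16, 3, 255, 255])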
from models.siammask import SiamMask
from models.features import MultiStageFeature
from models.rpn import RPN, DepthCorr
from models.mask import Mask
import torch
import torch.nn as nn
from utils.load_helper import load_pretrain
from resnet import resnet50
class ResDownS(nn.Module):
def __init__(self, inplane, outplane):
super(ResDownS, self).__init__()
self.downsample = nn.Sequential(
nn.Conv2d(inplane, outplane, kernel_size=1, bias=False),
nn.BatchNorm2d(outplane))
def forward(self, x):
x = self.downsample(x)
if x.size(3) < 20:
l = 4
r = -4
x = x[:, :, l:r, l:r]
return x
class ResDown(MultiStageFeature):
    """
    Siamese feature extractor: the ResNet-50 backbone plus the adjust layer
    """
def __init__(self, pretrain=False):
super(ResDown, self).__init__()
self.features = resnet50(layer3=True, layer4=False)
if pretrain:
load_pretrain(self.features, 'resnet.model')
self.downsample = ResDownS(1024, 256)
self.layers = [self.downsample, self.features.layer2, self.features.layer3]
self.train_nums = [1, 3]
self.change_point = [0, 0.5]
self.unfix(0.0)
def param_groups(self, start_lr, feature_mult=1):
lr = start_lr * feature_mult
def _params(module, mult=1):
params = list(filter(lambda x:x.requires_grad, module.parameters()))
if len(params):
return [{'params': params, 'lr': lr * mult}]
else:
return []
groups = []
groups += _params(self.downsample)
groups += _params(self.features, 0.1)
return groups
def forward(self, x):
output = self.features(x)
p3 = self.downsample(output[1])
return p3
class UP(RPN):
def __init__(self, anchor_num=5, feature_in=256, feature_out=256):
super(UP, self).__init__()
self.anchor_num = anchor_num
self.feature_in = feature_in
self.feature_out = feature_out
self.cls_output = 2 * self.anchor_num
self.loc_output = 4 * self.anchor_num
self.cls = DepthCorr(feature_in, feature_out, self.cls_output)
self.loc = DepthCorr(feature_in, feature_out, self.loc_output)
def forward(self, z_f, x_f):
cls = self.cls(z_f, x_f)
loc = self.loc(z_f, x_f)
return cls, loc
class MaskCorr(Mask):
def __init__(self, oSz=63):
super(MaskCorr, self).__init__()
self.oSz = oSz
self.mask = DepthCorr(256, 256, self.oSz**2)
def forward(self, z, x):
return self.mask(z, x)
class Custom(SiamMask):
    """
    Custom is the concrete SiamMask network
    """
    def __init__(self, pretrain=False, **kwargs):
        super(Custom, self).__init__(**kwargs)
        # Feature extraction
        self.features = ResDown(pretrain=pretrain)
        # RPN branch
        self.rpn_model = UP(anchor_num=self.anchor_num, feature_in=256, feature_out=256)
        # Mask branch
        self.mask_model = MaskCorr()

    def template(self, template):
        """
        Extract features from the template
        :param template:
        :return:
        """
        self.zf = self.features(template)

    def track(self, search):
        """
        Extract features from the search image and run the RPN
        :param search:
        :return:
        """
        search = self.features(search)
        rpn_pred_cls, rpn_pred_loc = self.rpn(self.zf, search)
        return rpn_pred_cls, rpn_pred_loc

    def track_mask(self, search):
        """
        Track the target and predict its mask
        :param search:
        :return:
        """
        # Feature extraction
        search = self.features(search)
        # RPN classification and localization
        rpn_pred_cls, rpn_pred_loc = self.rpn(self.zf, search)
        # Segmentation mask
        pred_mask = self.mask(self.zf, search)
        return rpn_pred_cls, rpn_pred_loc, pred_mask
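A hedged smoke test for this model (dummy tensors, run in this module's context; the anchor settings are copied from the JSON config above, and the project's utils must be importable):

anchors = {"stride": 8, "ratios": [0.33, 0.5, 1, 2, 3], "scales": [8], "round_dight": 0}
model = Custom(pretrain=False, anchors=anchors)
model.eval()

z = torch.randn(1, 3, 127, 127)   # template patch
x = torch.randn(1, 3, 255, 255)   # search patch
model.template(z)
cls, loc, mask = model.track_mask(x)
print(cls.shape, loc.shape, mask.shape)   # [1, 10, 25, 25] [1, 20, 25, 25] [1, 3969, 25, 25]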
import torch.nn as nn
import torch
from torch.autograd import Variable
import math
import torch.utils.model_zoo as model_zoo
from models.features import Features
from utils.log_helper import log_once
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(Features):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
# padding = (2 - stride) + (dilation // 2 - 1)
padding = 2 - stride
assert stride==1 or dilation==1, "stride and dilation must have one equals to zero at least"
if dilation > 1:
padding = dilation
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=padding, bias=False, dilation=dilation)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
if out.size() != residual.size():
print(out.size(), residual.size())
out += residual
out = self.relu(out)
return out
class Bottleneck_nop(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck_nop, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=0, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
s = residual.size(3)
residual = residual[:, :, 1:s-1, 1:s-1]
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, layer4=False, layer3=False):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=0, # 3
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2) # 31x31, 15x15
self.feature_size = 128 * block.expansion
if layer3:
self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2) # 15x15, 7x7
self.feature_size = (256 + 128) * block.expansion
else:
self.layer3 = lambda x:x # identity
if layer4:
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4) # 7x7, 3x3
self.feature_size = 512 * block.expansion
else:
self.layer4 = lambda x:x # identity
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
downsample = None
dd = dilation
if stride != 1 or self.inplanes != planes * block.expansion:
if stride == 1 and dilation == 1:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
else:
if dilation > 1:
dd = dilation // 2
padding = dd
else:
dd = 1
padding = 0
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=3, stride=stride, bias=False,
padding=padding, dilation=dd),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
# layers.append(block(self.inplanes, planes, stride, downsample, dilation=dilation))
layers.append(block(self.inplanes, planes, stride, downsample, dilation=dd))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, dilation=dilation))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
# print x.size()
x = self.maxpool(x)
# print x.size()
p1 = self.layer1(x)
p2 = self.layer2(p1)
p3 = self.layer3(p2)
# p3 = torch.cat([p2, p3], 1)
log_once("p3 {}".format(p3.size()))
p4 = self.layer4(p3)
return p2, p3, p4
class ResAdjust(nn.Module):
def __init__(self,
block=Bottleneck,
out_channels=256,
adjust_number=1,
fuse_layers=[2,3,4]):
super(ResAdjust, self).__init__()
self.fuse_layers = set(fuse_layers)
if 2 in self.fuse_layers:
self.layer2 = self._make_layer(block, 128, 1, out_channels, adjust_number)
if 3 in self.fuse_layers:
self.layer3 = self._make_layer(block, 256, 2, out_channels, adjust_number)
if 4 in self.fuse_layers:
self.layer4 = self._make_layer(block, 512, 4, out_channels, adjust_number)
self.feature_size = out_channels * len(self.fuse_layers)
def _make_layer(self, block, plances, dilation, out, number=1):
layers = []
for _ in range(number):
layer = block(plances * block.expansion, plances, dilation=dilation)
layers.append(layer)
downsample = nn.Sequential(
nn.Conv2d(plances * block.expansion, out, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(out)
)
layers.append(downsample)
return nn.Sequential(*layers)
def forward(self, p2, p3, p4):
outputs = []
if 2 in self.fuse_layers:
outputs.append(self.layer2(p2))
if 3 in self.fuse_layers:
outputs.append(self.layer3(p3))
if 4 in self.fuse_layers:
outputs.append(self.layer4(p4))
# return torch.cat(outputs, 1)
return outputs
def resnet18(pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
return model
def resnet34(pretrained=False, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
return model
def resnet50(pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def resnet101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
return model
def resnet152(pretrained=False, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
return model
if __name__ == '__main__':
net = resnet50()
print(net)
net = net.cuda()
var = torch.FloatTensor(1,3,127,127).cuda()
var = Variable(var)
net(var)
print('*************')
var = torch.FloatTensor(1,3,255,255).cuda()
var = Variable(var)
net(var)
{
"network": {
"arch": "Custom"
},
"hp": {
"instance_size": 255,
"base_size": 8,
"out_size": 127,
"seg_thr": 0.35,
"penalty_k": 0.04,
"window_influence": 0.4,
"lr": 1.0
},
"lr": {
"type": "log",
"start_lr": 0.01,
"end_lr": 0.0025
},
"loss": {
"weight": [0, 0, 36]
},
"train_datasets": {
"datasets": {
"coco": {
"root": "../../data/coco/crop511",
"anno": "../../data/coco/train2017.json",
"frame_range": 1
}
},
"template_size": 127,
"search_size": 143,
"base_size": 0,
"size": 3,
"num" : 2000,
"augmentation": {
"template": {
"shift": 4, "scale": 0.05
},
"search": {
"shift": 8, "scale": 0.18, "blur": 0.18
},
"neg": 0,
"gray": 0.25
}
},
"anchors": {
"stride": 8,
"ratios": [0.33, 0.5, 1, 2, 3],
"scales": [8],
"round_dight": 0
}
}
from models.siammask_sharp import SiamMask
from models.features import MultiStageFeature
from models.rpn import RPN, DepthCorr
from models.mask import Mask
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils.load_helper import load_pretrain
from resnet import resnet50
class ResDownS(nn.Module):
    """
    The adjust layer of the network
    """
    # inplane: input channels; outplane: output channels
    def __init__(self, inplane, outplane):
        super(ResDownS, self).__init__()
        self.downsample = nn.Sequential(
            nn.Conv2d(inplane, outplane, kernel_size=1, bias=False),
            nn.BatchNorm2d(outplane))

    def forward(self, x):
        # adjust
        x = self.downsample(x)
        # For small feature maps (width < 20), keep only the central part
        if x.size(3) < 20:
            l = 4
            r = -4
            x = x[:, :, l:r, l:r]
        return x
class ResDown(MultiStageFeature):
    """
    Siamese feature extraction: the ResNet-50 backbone plus the adjust layer
    """
    def __init__(self, pretrain=False):
        super(ResDown, self).__init__()
        # ResNet-50 feature extractor
        self.features = resnet50(layer3=True, layer4=False)
        # Load pretrained weights into the backbone if requested
        if pretrain:
            load_pretrain(self.features, 'resnet.model')

        # adjust layer
        self.downsample = ResDownS(1024, 256)

        # Layers, in the order they are unfrozen during training
        self.layers = [self.downsample, self.features.layer2, self.features.layer3]
        self.train_nums = [1, 3]
        self.change_point = [0, 0.5]

        self.unfix(0.0)

    def param_groups(self, start_lr, feature_mult=1):
        lr = start_lr * feature_mult

        def _params(module, mult=1):
            params = list(filter(lambda x: x.requires_grad, module.parameters()))
            if len(params):
                return [{'params': params, 'lr': lr * mult}]
            else:
                return []

        groups = []
        groups += _params(self.downsample)
        groups += _params(self.features, 0.1)
        return groups

    def forward(self, x):
        """
        Forward pass; returns the adjusted feature
        :param x:
        :return:
        """
        output = self.features(x)
        p3 = self.downsample(output[-1])
        return p3

    def forward_all(self, x):
        """
        Forward pass; returns both the backbone features and the adjusted feature
        :param x:
        :return:
        """
        output = self.features(x)
        p3 = self.downsample(output[-1])
        return output, p3
class UP(RPN):
    """
    Bounding-box regression and classification head
    """
    def __init__(self, anchor_num=5, feature_in=256, feature_out=256):
        super(UP, self).__init__()
        # Parameters
        self.anchor_num = anchor_num
        self.feature_in = feature_in
        self.feature_out = feature_out

        self.cls_output = 2 * self.anchor_num
        self.loc_output = 4 * self.anchor_num

        # Classification and regression branches
        self.cls = DepthCorr(feature_in, feature_out, self.cls_output)
        self.loc = DepthCorr(feature_in, feature_out, self.loc_output)

    def forward(self, z_f, x_f):
        """
        Return the classification and regression outputs
        :param z_f:
        :param x_f:
        :return:
        """
        cls = self.cls(z_f, x_f)
        loc = self.loc(z_f, x_f)
        return cls, loc

class MaskCorr(Mask):
    """
    Mask branch, implemented with DepthCorr
    """
    def __init__(self, oSz=63):
        super(MaskCorr, self).__init__()
        self.oSz = oSz
        self.mask = DepthCorr(256, 256, self.oSz**2)

    def forward(self, z, x):
        return self.mask(z, x)
class Refine(nn.Module):
    """
    Mask refinement module
    """
    def __init__(self):
        """
        Fusion module that refines the predicted mask
        """
        super(Refine, self).__init__()
        # self.v2, self.v1, self.v0 form the vertical branches, which compress channels
        self.v0 = nn.Sequential(nn.Conv2d(64, 16, 3, padding=1), nn.ReLU(),
                                nn.Conv2d(16, 4, 3, padding=1), nn.ReLU())
        self.v1 = nn.Sequential(nn.Conv2d(256, 64, 3, padding=1), nn.ReLU(),
                                nn.Conv2d(64, 16, 3, padding=1), nn.ReLU())
        self.v2 = nn.Sequential(nn.Conv2d(512, 128, 3, padding=1), nn.ReLU(),
                                nn.Conv2d(128, 32, 3, padding=1), nn.ReLU())

        # self.h2, self.h1, self.h0 form the horizontal branches, which digest the fused features
        self.h2 = nn.Sequential(nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
                                nn.Conv2d(32, 32, 3, padding=1), nn.ReLU())
        self.h1 = nn.Sequential(nn.Conv2d(16, 16, 3, padding=1), nn.ReLU(),
                                nn.Conv2d(16, 16, 3, padding=1), nn.ReLU())
        self.h0 = nn.Sequential(nn.Conv2d(4, 4, 3, padding=1), nn.ReLU(),
                                nn.Conv2d(4, 4, 3, padding=1), nn.ReLU())

        # 2D transposed convolution (also called fractionally strided convolution or
        # deconvolution); it can be seen as the gradient of Conv2d with respect to its input
        self.deconv = nn.ConvTranspose2d(256, 32, 15, 15)

        # post0, post1, post2 correspond to the upsampling stages U2, U3, U4
        self.post0 = nn.Conv2d(32, 16, 3, padding=1)
        self.post1 = nn.Conv2d(16, 4, 3, padding=1)
        self.post2 = nn.Conv2d(4, 1, 3, padding=1)

        # Initialize the conv layers with Kaiming initialization
        for modules in [self.v0, self.v1, self.v2, self.h2, self.h1, self.h0,
                        self.deconv, self.post0, self.post1, self.post2]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    nn.init.kaiming_uniform_(l.weight, a=1)

    def forward(self, f, corr_feature, pos=None, test=False):
        if test:
            # At test time:
            # f is the tuple of ResNet feature maps:
            #   f[0] has shape [1, 64, 125, 125],
            #   f[1] has shape [1, 256, 63, 63],
            #   f[2] has shape [1, 512, 31, 31].
            # p0, p1, p2 are the feature patches at the target position, after zero padding
            p0 = torch.nn.functional.pad(f[0], [16, 16, 16, 16])[:, :, 4*pos[0]:4*pos[0]+61, 4*pos[1]:4*pos[1]+61]
            p1 = torch.nn.functional.pad(f[1], [8, 8, 8, 8])[:, :, 2*pos[0]:2*pos[0]+31, 2*pos[1]:2*pos[1]+31]
            p2 = torch.nn.functional.pad(f[2], [4, 4, 4, 4])[:, :, pos[0]:pos[0]+15, pos[1]:pos[1]+15]
        else:
            # At training time:
            # extract the feature patches with a sliding window
            p0 = F.unfold(f[0], (61, 61), padding=0, stride=4).permute(0, 2, 1).contiguous().view(-1, 64, 61, 61)
            if not (pos is None): p0 = torch.index_select(p0, 0, pos)
            p1 = F.unfold(f[1], (31, 31), padding=0, stride=2).permute(0, 2, 1).contiguous().view(-1, 256, 31, 31)
            if not (pos is None): p1 = torch.index_select(p1, 0, pos)
            p2 = F.unfold(f[2], (15, 15), padding=0, stride=1).permute(0, 2, 1).contiguous().view(-1, 512, 15, 15)
            if not (pos is None): p2 = torch.index_select(p2, 0, pos)

        if not (pos is None):
            # Training: the correlation feature vector at position pos
            p3 = corr_feature[:, :, pos[0], pos[1]].view(-1, 256, 1, 1)
        else:
            # Testing
            p3 = corr_feature.permute(0, 2, 3, 1).contiguous().view(-1, 256, 1, 1)

        # Transposed convolution
        out = self.deconv(p3)
        # Fuse the features stage by stage, upsampling in between
        out = self.post0(F.upsample(self.h2(out) + self.v2(p2), size=(31, 31)))
        out = self.post1(F.upsample(self.h1(out) + self.v1(p1), size=(61, 61)))
        out = self.post2(F.upsample(self.h0(out) + self.v0(p0), size=(127, 127)))
        out = out.view(-1, 127*127)
        return out

    def param_groups(self, start_lr, feature_mult=1):
        """
        Parameter groups for the optimizer
        :param start_lr:
        :param feature_mult:
        :return:
        """
        params = filter(lambda x: x.requires_grad, self.parameters())
        params = [{'params': params, 'lr': start_lr * feature_mult}]
        return params
class Custom(SiamMask):
    def __init__(self, pretrain=False, **kwargs):
        super(Custom, self).__init__(**kwargs)
        self.features = ResDown(pretrain=pretrain)
        self.rpn_model = UP(anchor_num=self.anchor_num, feature_in=256, feature_out=256)
        self.mask_model = MaskCorr()
        self.refine_model = Refine()

    def refine(self, f, pos=None):
        """
        Feature fusion for mask refinement
        :param f:
        :param pos:
        :return:
        """
        return self.refine_model(f, pos)

    def template(self, template):
        """
        Extract features from the template
        :param template:
        :return:
        """
        self.zf = self.features(template)

    def track(self, search):
        """
        Track the target
        :param search: the image patch to track in
        :return: classification and regression outputs
        """
        # Feature extraction
        search = self.features(search)
        # RPN regression and classification: target class and location
        rpn_pred_cls, rpn_pred_loc = self.rpn(self.zf, search)
        return rpn_pred_cls, rpn_pred_loc

    def track_mask(self, search):
        """
        Track the target and segment it
        :param search: the image patch to track in
        :return: classification, regression, and segmentation outputs
        """
        # Feature extraction
        self.feature, self.search = self.features.forward_all(search)
        # Classification and regression
        rpn_pred_cls, rpn_pred_loc = self.rpn(self.zf, self.search)
        # Depthwise cross-correlation feature
        self.corr_feature = self.mask_model.mask.forward_corr(self.zf, self.search)
        # Mask prediction
        pred_mask = self.mask_model.mask.head(self.corr_feature)
        return rpn_pred_cls, rpn_pred_loc, pred_mask

    def track_refine(self, pos):
        # Refine the mask at position pos by fusing the backbone features
        pred_mask = self.refine_model(self.feature, self.corr_feature, pos=pos, test=True)
        return pred_mask
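A hedged end-to-end inference sketch for this refined variant (dummy tensors, run in this module's context; the anchor settings are copied from the JSON config above, and pos=(12, 12) is just an example response-map cell):

anchors = {"stride": 8, "ratios": [0.33, 0.5, 1, 2, 3], "scales": [8], "round_dight": 0}
net = Custom(pretrain=False, anchors=anchors)
net.eval()

net.template(torch.randn(1, 3, 127, 127))                       # template patch
cls, loc, mask = net.track_mask(torch.randn(1, 3, 255, 255))    # search patch
refined = net.track_refine((12, 12))
print(refined.shape)   # torch.Size([1, 16129]); reshaped to a 127x127 mask downstream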
import torch.nn as nn
import torch
from torch.autograd import Variable
import math
import torch.utils.model_zoo as model_zoo
from models.features import Features
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
# URLs of ResNet models pretrained on ImageNet
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """
    3x3 convolution with padding
    :param in_planes: input channels
    :param out_planes: output channels
    :param stride: stride
    :return:
    """
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)
class BasicBlock(nn.Module):
    """
    Basic residual block built from two stacked 3x3 convolutions; used by ResNet-18/34
    """
    # Multiplier for the output depth
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # 3x3 conv, BN, ReLU
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        # shortcut
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Output of the previous layer
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # When a shortcut projection exists
        if self.downsample is not None:
            # feed x through downsample so the residual matches the output depth
            residual = self.downsample(x)

        # Add the shortcut to the BN output
        out += residual
        # ReLU activation
        out = self.relu(out)

        return out
class Bottleneck(Features):
    """
    Bottleneck block built from 1x1, 3x3, 1x1 convolutions, which reduce the
    dimensionality, convolve, and restore the dimensionality, respectively.
    It inherits from Features and is used for feature extraction.
    """
    # Output depth multiplier (an input depth of 64 expands to 256)
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
        super(Bottleneck, self).__init__()
        # 1x1 conv
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        # BN
        self.bn1 = nn.BatchNorm2d(planes)
        # padding = (2 - stride) + (dilation // 2 - 1)
        padding = 2 - stride
        assert stride == 1 or dilation == 1, "stride and dilation must have one equals to zero at least"
        if dilation > 1:
            padding = dilation
        # 3x3 conv
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=padding, bias=False, dilation=dilation)
        self.bn2 = nn.BatchNorm2d(planes)
        # 1x1 conv
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        # shortcut
        self.downsample = downsample
        self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
if out.size() != residual.size():
print(out.size(), residual.size())
out += residual
out = self.relu(out)
return out
class Bottleneck_nop(nn.Module):
    """
    The original bottleneck block from the official implementation
    """
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck_nop, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=0, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
s = residual.size(3)
residual = residual[:, :, 1:s-1, 1:s-1]
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
    """
    The main body of ResNet
    """
    def __init__(self, block, layers, layer4=False, layer3=False):
        """
        :param block: the building block, BasicBlock or Bottleneck
        :param layers: the number of blocks in each major layer
        :param layer4: whether to add layer4
        :param layer3: whether to add layer3
        """
        # Input depth
        self.inplanes = 64
        super(ResNet, self).__init__()
        # Convolution
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=0,
                               bias=False)
        # BN
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        # Pooling
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Stack the blocks into layers
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)  # 31x31, 15x15

        self.feature_size = 128 * block.expansion

        # Optional layer3
        if layer3:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2)  # 15x15, 7x7
            self.feature_size = (256 + 128) * block.expansion
        else:
            self.layer3 = lambda x: x  # identity

        # Optional layer4
        if layer4:
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4)  # 7x7, 3x3
            self.feature_size = 512 * block.expansion
        else:
            self.layer4 = lambda x: x  # identity

        # Parameter initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
    def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
        """
        :param block: BasicBlock or Bottleneck
        :param planes: number of channels
        :param blocks: number of blocks to add
        :param stride:
        :param dilation:
        :return:
        """
        downsample = None
        dd = dilation
        # Shortcut projection when the shape or depth changes
        if stride != 1 or self.inplanes != planes * block.expansion:
            if stride == 1 and dilation == 1:
                downsample = nn.Sequential(
                    nn.Conv2d(self.inplanes, planes * block.expansion,
                              kernel_size=1, stride=stride, bias=False),
                    nn.BatchNorm2d(planes * block.expansion),
                )
            else:
                if dilation > 1:
                    dd = dilation // 2
                    padding = dd
                else:
                    dd = 1
                    padding = 0
                downsample = nn.Sequential(
                    nn.Conv2d(self.inplanes, planes * block.expansion,
                              kernel_size=3, stride=stride, bias=False,
                              padding=padding, dilation=dd),
                    nn.BatchNorm2d(planes * block.expansion),
                )

        layers = []
        # layers.append(block(self.inplanes, planes, stride, downsample, dilation=dilation))
        layers.append(block(self.inplanes, planes, stride, downsample, dilation=dd))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=dilation))

        # Assemble the layer
        return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
p0 = self.relu(x)
x = self.maxpool(p0)
p1 = self.layer1(x)
p2 = self.layer2(p1)
p3 = self.layer3(p2)
return p0, p1, p2, p3
class ResAdjust(nn.Module):
    """
    Adjust module for the backbone features (unused)
    """
def __init__(self,
block=Bottleneck,
out_channels=256,
adjust_number=1,
fuse_layers=[2,3,4]):
super(ResAdjust, self).__init__()
self.fuse_layers = set(fuse_layers)
if 2 in self.fuse_layers:
self.layer2 = self._make_layer(block, 128, 1, out_channels, adjust_number)
if 3 in self.fuse_layers:
self.layer3 = self._make_layer(block, 256, 2, out_channels, adjust_number)
if 4 in self.fuse_layers:
self.layer4 = self._make_layer(block, 512, 4, out_channels, adjust_number)
self.feature_size = out_channels * len(self.fuse_layers)
def _make_layer(self, block, plances, dilation, out, number=1):
layers = []
for _ in range(number):
layer = block(plances * block.expansion, plances, dilation=dilation)
layers.append(layer)
downsample = nn.Sequential(
nn.Conv2d(plances * block.expansion, out, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(out)
)
layers.append(downsample)
return nn.Sequential(*layers)
def forward(self, p2, p3, p4):
outputs = []
if 2 in self.fuse_layers:
outputs.append(self.layer2(p2))
if 3 in self.fuse_layers:
outputs.append(self.layer3(p3))
if 4 in self.fuse_layers:
outputs.append(self.layer4(p4))
# return torch.cat(outputs, 1)
return outputs
def resnet18(pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
return model
def resnet34(pretrained=False, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
return model
def resnet50(pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def resnet101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
return model
def resnet152(pretrained=False, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
return model
if __name__ == '__main__':
net = resnet50()
print(net)
# net = net.cuda()
#
# var = torch.FloatTensor(1,3,127,127).cuda()
# var = Variable(var)
#
# net(var)
# print('*************')
# var = torch.FloatTensor(1,3,255,255).cuda()
# var = Variable(var)
# net(var)
var = torch.FloatTensor(1,3,127,127)
var = Variable(var)
print(var)
out = net(var)
print(out)
# --------------------------------------------------------
# Base classes for feature extraction
# --------------------------------------------------------
import torch
import torch.nn as nn
import logging

logger = logging.getLogger('global')
class Features(nn.Module):
    """
    Feature-extraction base module, used by the ResNet backbone
    """
    def __init__(self):
        super(Features, self).__init__()
        self.feature_size = -1

    def forward(self, x):
        raise NotImplementedError

    def param_groups(self, start_lr, feature_mult=1):
        """
        Parameters that require gradient updates
        :param start_lr:
        :param feature_mult:
        :return:
        """
        params = filter(lambda x: x.requires_grad, self.parameters())
        params = [{'params': params, 'lr': start_lr * feature_mult}]
        return params

    def load_model(self, f='pretrain.model'):
        with open(f, 'rb') as f:  # binary mode, since torch.load reads bytes
            pretrained_dict = torch.load(f)
            model_dict = self.state_dict()
            print(pretrained_dict.keys())
            pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
            print(pretrained_dict.keys())
            model_dict.update(pretrained_dict)
            self.load_state_dict(model_dict)
class MultiStageFeature(Features):
    """
    Multi-stage feature extractor, used by ResDown in custom.py
    """
    def __init__(self):
        """
        Initialization
        """
        super(MultiStageFeature, self).__init__()

        self.layers = []
        self.train_num = -1
        self.change_point = []
        self.train_nums = []

    def unfix(self, ratio=0.0):
        """
        Decide how many layers are trainable at this point of training
        :param ratio: fraction of training completed
        :return: True if the set of trainable layers changed
        """
        if self.train_num == -1:
            self.train_num = 0
            self.unlock()
            self.eval()
        for p, t in reversed(list(zip(self.change_point, self.train_nums))):
            if ratio >= p:
                if self.train_num != t:
                    self.train_num = t
                    self.unlock()
                    return True
                break
        return False

    def train_layers(self):
        """
        Return the layers whose parameters are being updated
        :return:
        """
        return self.layers[:self.train_num]

    def unlock(self):
        """
        Enable gradient updates only for the unlocked layers
        :return:
        """
        # Freeze all parameters first
        for p in self.parameters():
            p.requires_grad = False

        logger.info('Current training {} layers:\n\t{}'.format(self.train_num, self.train_layers()))
        # Then unfreeze the parameters of the layers in train_layers
        for m in self.train_layers():
            for p in m.parameters():
                p.requires_grad = True

    def train(self, mode):
        """
        Set the training mode of the network
        :param mode:
        :return:
        """
        self.training = mode
        # mode == False: put everything into eval mode
        if mode == False:
            super(MultiStageFeature, self).train(False)
        else:
            # Otherwise train only the unlocked layers
            for m in self.train_layers():
                m.train(True)

        return self
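A small sketch of how this progressive unfreezing behaves with ResDown's settings (change_point=[0, 0.5], train_nums=[1, 3]): early in training only the adjust layer is trainable, and past 50% of the epochs layer2 and layer3 unfreeze as well. The epoch counts are made up, and ResDown from custom.py is assumed importable:

backbone = ResDown(pretrain=False)
for epoch, total in [(0, 20), (5, 20), (10, 20), (15, 20)]:
    changed = backbone.unfix(epoch / total)
    print(epoch, 'trainable layers:', backbone.train_num, 'changed:', changed)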
# --------------------------------------------------------
# Mask
# --------------------------------------------------------
import torch.nn as nn
class Mask(nn.Module):
    """
    Base class for the mask branch
    """
    def __init__(self):
        super(Mask, self).__init__()

    def forward(self, z_f, x_f):
        raise NotImplementedError

    def template(self, template):
        raise NotImplementedError

    def track(self, search):
        raise NotImplementedError

    def param_groups(self, start_lr, feature_mult=1):
        """
        Keep only the parameters that require gradients
        :param start_lr:
        :param feature_mult:
        :return:
        """
        params = filter(lambda x: x.requires_grad, self.parameters())
        params = [{'params': params, 'lr': start_lr * feature_mult}]
        return params
# --------------------------------------------------------
# SiamMask
# Licensed under The MIT License
# Written by Qiang Wang (wangqiang2015 at ia.ac.cn)
# --------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F
class RPN(nn.Module):
    """
    Base class for the RPN head
    """
    def __init__(self):
        """Initialization"""
        super(RPN, self).__init__()

    def forward(self, z_f, x_f):
        """Forward pass"""
        raise NotImplementedError

    def template(self, template):
        """Template branch"""
        raise NotImplementedError

    def track(self, search):
        """Tracking branch"""
        raise NotImplementedError

    def param_groups(self, start_lr, feature_mult=1, key=None):
        # If key is None, return all parameters that require gradients
        if key is None:
            params = filter(lambda x: x.requires_grad, self.parameters())
        # Otherwise return only the matching parameters that require gradients
        else:
            params = [v for k, v in self.named_parameters() if (key in k) and v.requires_grad]
        params = [{'params': params, 'lr': start_lr * feature_mult}]
        return params
def conv2d_dw_group(x, kernel):
    """
    Depthwise cross-correlation between the template and the search feature,
    with the necessary reshaping
    :param x: search features
    :param kernel: template features
    :return:
    """
    # Batch size and channel count
    batch, channel = kernel.shape[:2]
    # Reshape so each (batch, channel) pair becomes its own group
    x = x.view(1, batch*channel, x.size(2), x.size(3))  # 1 * (b*c) * k * k
    kernel = kernel.view(batch*channel, 1, kernel.size(2), kernel.size(3))  # (b*c) * 1 * H * W
    # Correlate the search feature with the template using grouped convolution
    out = F.conv2d(x, kernel, groups=batch*channel)
    # Reshape back to (batch, channel, h, w)
    out = out.view(batch, channel, out.size(2), out.size(3))
    return out
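A quick shape check of the depthwise correlation above (dummy tensors; the sizes mirror a 127/255 template/search pair after DepthCorr's 3x3 convolutions):

import torch
z_feat = torch.randn(2, 256, 5, 5)     # processed template features
x_feat = torch.randn(2, 256, 29, 29)   # processed search features
out = conv2d_dw_group(x_feat, z_feat)
print(out.shape)                        # torch.Size([2, 256, 25, 25])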
# Relate the template to the search image: target confidence and location
class DepthCorr(nn.Module):
    def __init__(self, in_channels, hidden, out_channels, kernel_size=3):
        super(DepthCorr, self).__init__()
        # adjust layer for asymmetrical features
        # Processes the template
        self.conv_kernel = nn.Sequential(
            nn.Conv2d(in_channels, hidden, kernel_size=kernel_size, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU(inplace=True),
        )
        # Processes the search image
        self.conv_search = nn.Sequential(
            nn.Conv2d(in_channels, hidden, kernel_size=kernel_size, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU(inplace=True),
        )

        # Output head: location, class, or mask logits
        self.head = nn.Sequential(
            nn.Conv2d(hidden, hidden, kernel_size=1, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden, out_channels, kernel_size=1)
        )

    def forward_corr(self, kernel, input):
        # Compute the template and search features
        kernel = self.conv_kernel(kernel)
        input = self.conv_search(input)
        # Correlate the template with the search features
        feature = conv2d_dw_group(input, kernel)
        return feature

    def forward(self, kernel, search):
        """
        Forward pass
        :param kernel: the template
        :param search: the search image
        :return:
        """
        feature = self.forward_corr(kernel, search)
        out = self.head(feature)
        return out
# --------------------------------------------------------
# SiamMask: defines the main modules of the network
# --------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from utils.anchors import Anchors
class SiamMask(nn.Module):
    """
    Defines the overall SiamMask framework and its main modules
    """
    def __init__(self, anchors=None, o_sz=63, g_sz=127):
        super(SiamMask, self).__init__()
        self.anchors = anchors  # anchor_cfg: the anchor configuration
        self.anchor_num = len(self.anchors["ratios"]) * len(self.anchors["scales"])  # number of anchors
        self.anchor = Anchors(anchors)  # anchor generator
        self.features = None  # feature-extraction network
        self.rpn_model = None  # RPN network
        self.mask_model = None  # segmentation network
        self.o_sz = o_sz  # mask input size
        self.g_sz = g_sz  # mask output size
        self.upSample = nn.UpsamplingBilinear2d(size=[g_sz, g_sz])  # 2D bilinear upsampling

        self.all_anchors = None

    def set_all_anchors(self, image_center, size):
        """
        Initialize all anchors (this method is unused)
        :param image_center: the image center
        :param size:
        :return:
        """
        # cx, cy, w, h
        if not self.anchor.generate_all_anchors(image_center, size):
            return
        all_anchors = self.anchor.all_anchors[1]  # cx, cy, w, h
        self.all_anchors = torch.from_numpy(all_anchors).float().cuda()
        self.all_anchors = [self.all_anchors[i] for i in range(4)]
    def feature_extractor(self, x):
        """
        Feature extraction
        :param x: input data
        :return: the extracted features
        """
        return self.features(x)

    def rpn(self, template, search):
        """
        RPN head
        :param template: the template features
        :param search: the search-image features
        :return:
        """
        # Predicted classification and location
        pred_cls, pred_loc = self.rpn_model(template, search)
        return pred_cls, pred_loc

    def mask(self, template, search):
        """
        Mask prediction
        :param template: the template features
        :param search: the search-image features
        :return: the predicted mask logits
        """
        pred_mask = self.mask_model(template, search)
        return pred_mask
    def _add_rpn_loss(self, label_cls, label_loc, lable_loc_weight, label_mask, label_mask_weight,
                      rpn_pred_cls, rpn_pred_loc, rpn_pred_mask):
        """
        RPN loss functions
        """
        # Classification loss (cross entropy)
        rpn_loss_cls = select_cross_entropy_loss(rpn_pred_cls, label_cls)
        # Regression loss
        rpn_loss_loc = weight_l1_loss(rpn_pred_loc, label_loc, lable_loc_weight)
        # Segmentation loss and IoU accuracies
        rpn_loss_mask, iou_m, iou_5, iou_7 = select_mask_logistic_loss(rpn_pred_mask, label_mask, label_mask_weight)

        return rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_m, iou_5, iou_7
    def run(self, template, search, softmax=False):
        """
        Run the network
        :param template: the template patch
        :param search: the search patch
        :param softmax:
        :return:
        """
        # Template features
        template_feature = self.feature_extractor(template)
        # Search-image features
        search_feature = self.feature_extractor(search)

        # Predictions
        rpn_pred_cls, rpn_pred_loc = self.rpn(template_feature, search_feature)
        rpn_pred_mask = self.mask(template_feature, search_feature)  # (b, 63*63, w, h)

        # Apply log-softmax to the classification scores
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature
    def softmax(self, cls):
        """
        Reshape the class scores and apply log-softmax
        :param cls: raw class scores, shape (b, 2*anchor_num, h, w)
        :return: log-probabilities, shape (b, anchor_num, h, w, 2)
        """
        # get the size of cls; a2 = 2 * anchor_num
        b, a2, h, w = cls.size()
        # split the channel dimension into (2, anchor_num)
        cls = cls.view(b, 2, a2//2, h, w)
        # move the 2-class dimension to the end
        cls = cls.permute(0, 2, 3, 4, 1).contiguous()
        # log-softmax over the two classes (dim 4)
        cls = F.log_softmax(cls, dim=4)
        return cls
    def forward(self, input):
        """
        Forward pass; subclasses override the component models
        :param input: dict of input with keys of:
                'template': [b, 3, h1, w1], input template image
                'search': [b, 3, h2, w2], input search image
                'label_cls': [b, max_num_gts, 5] or None(self.training==False),
                             each gt contains x1,y1,x2,y2,class
        :return: dict of loss, predict, accuracy
        """
        # unpack the inputs
        template = input['template']
        search = input['search']
        if self.training:
            label_cls = input['label_cls']
            label_loc = input['label_loc']
            lable_loc_weight = input['label_loc_weight']
            label_mask = input['label_mask']
            label_mask_weight = input['label_mask_weight']

        # run the network
        rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature = \
            self.run(template, search, softmax=self.training)

        outputs = dict()
        # predictions
        outputs['predict'] = [rpn_pred_loc, rpn_pred_cls, rpn_pred_mask, template_feature, search_feature]
        if self.training:
            # loss functions
            rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_acc_mean, iou_acc_5, iou_acc_7 = \
                self._add_rpn_loss(label_cls, label_loc, lable_loc_weight, label_mask, label_mask_weight,
                                   rpn_pred_cls, rpn_pred_loc, rpn_pred_mask)
            # output losses and accuracy measures
            outputs['losses'] = [rpn_loss_cls, rpn_loss_loc, rpn_loss_mask]
            outputs['accuracy'] = [iou_acc_mean, iou_acc_5, iou_acc_7]
        return outputs
    def template(self, z):
        """
        Process the template image
        :param z: template of the tracked target
        :return: classification and localization kernels computed from the template
        """
        self.zf = self.feature_extractor(z)
        cls_kernel, loc_kernel = self.rpn_model.template(self.zf)
        return cls_kernel, loc_kernel

    def track(self, x, cls_kernel=None, loc_kernel=None, softmax=False):
        """
        Track the target in a search image
        :param x: search image
        :param cls_kernel: classification kernel from the template
        :param loc_kernel: localization kernel from the template
        :param softmax: whether to apply log-softmax to the class scores
        :return:
        """
        # feature extraction
        xf = self.feature_extractor(x)
        # tracking results
        rpn_pred_cls, rpn_pred_loc = self.rpn_model.track(xf, cls_kernel, loc_kernel)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        # return classification and location results for the tracked target
        return rpn_pred_cls, rpn_pred_loc
def get_cls_loss(pred, label, select):
    """
    Classification loss over the selected positions
    :param pred: predictions
    :param label: ground-truth labels
    :param select: indices of the selected positions
    :return:
    """
    # if nothing is selected, return 0
    if select.nelement() == 0: return pred.sum()*0.
    # gather predictions and labels at the selected positions
    pred = torch.index_select(pred, 0, select)
    label = torch.index_select(label, 0, select)
    # negative log-likelihood loss
    return F.nll_loss(pred, label)
def select_cross_entropy_loss(pred, label):
    """
    Cross-entropy loss
    :param pred: predictions
    :param label: ground-truth labels
    :return: average of the positive- and negative-sample losses
    """
    # flatten the predictions to shape (..., 2)
    pred = pred.view(-1, 2)
    # flatten the labels to 1D
    label = label.view(-1)
    # indices of the positive and negative samples
    # GPU
    # pos = Variable(label.data.eq(1).nonzero().squeeze()).cuda()
    # neg = Variable(label.data.eq(0).nonzero().squeeze()).cuda()
    pos = Variable(label.data.eq(1).nonzero().squeeze())
    neg = Variable(label.data.eq(0).nonzero().squeeze())
    # classification losses of the positive and negative samples
    loss_pos = get_cls_loss(pred, label, pos)
    loss_neg = get_cls_loss(pred, label, neg)
    return loss_pos * 0.5 + loss_neg * 0.5
def weight_l1_loss(pred_loc, label_loc, loss_weight):
    """
    Weighted L1 loss
    :param pred_loc: [b, 4k, h, w]
    :param label_loc: [b, 4k, h, w]
    :param loss_weight: [b, k, h, w]
    :return: loc loss value
    """
    b, _, sh, sw = pred_loc.size()
    # reshape so the 4 box coordinates form their own dimension
    pred_loc = pred_loc.view(b, 4, -1, sh, sw)
    # absolute difference between prediction and ground truth
    diff = (pred_loc - label_loc).abs()
    # sum over the 4 coordinates
    diff = diff.sum(dim=1).view(b, -1, sh, sw)
    # weight each anchor's loss
    loss = diff * loss_weight
    return loss.sum().div(b)
def select_mask_logistic_loss(p_m, mask, weight, o_sz=63, g_sz=127):
    """
    Loss and accuracy measures for the segmentation (mask) branch
    :param p_m: predicted masks
    :param mask: ground-truth masks
    :param weight: mask weights; positions with weight 1 are positive samples
    :param o_sz: size of the predicted mask (63)
    :param g_sz: size of the ground-truth mask window (127)
    :return:
    """
    weight = weight.view(-1)
    # indices of the positive samples
    pos = Variable(weight.data.eq(1).nonzero().squeeze())
    if pos.nelement() == 0: return p_m.sum() * 0, p_m.sum() * 0, p_m.sum() * 0, p_m.sum() * 0
    # reshape the predictions and keep only the positive positions
    p_m = p_m.permute(0, 2, 3, 1).contiguous().view(-1, 1, o_sz, o_sz)
    p_m = torch.index_select(p_m, 0, pos)
    # 2D bilinear upsampling from o_sz to g_sz
    p_m = nn.UpsamplingBilinear2d(size=[g_sz, g_sz])(p_m)
    p_m = p_m.view(-1, g_sz * g_sz)
    # extract the g_sz x g_sz ground-truth window for each position
    mask_uf = F.unfold(mask, (g_sz, g_sz), padding=32, stride=8)
    mask_uf = torch.transpose(mask_uf, 1, 2).contiguous().view(-1, g_sz * g_sz)
    mask_uf = torch.index_select(mask_uf, 0, pos)
    # per-pixel logistic loss
    loss = F.soft_margin_loss(p_m, mask_uf)
    # accuracy measures
    iou_m, iou_5, iou_7 = iou_measure(p_m, mask_uf)
    # return the results
    return loss, iou_m, iou_5, iou_7
def iou_measure(pred, label):
    """
    IoU computation
    :param pred: predictions
    :param label: ground truth
    :return: mean IoU, fraction with IoU > 0.5, fraction with IoU > 0.7
    """
    # binarize predictions: values >= 0 become 1
    pred = pred.ge(0)
    # add the binarized prediction and the label elementwise
    mask_sum = pred.eq(1).add(label.eq(1))
    # positions equal to 2 belong to the intersection
    intxn = torch.sum(mask_sum == 2, dim=1).float()
    # positions greater than 0 belong to the union
    union = torch.sum(mask_sum > 0, dim=1).float()
    # intersection over union
    iou = intxn/union
    return torch.mean(iou), (torch.sum(iou > 0.5).float()/iou.shape[0]), (torch.sum(iou > 0.7).float()/iou.shape[0])
if __name__ == "__main__":
    p_m = torch.randn(4, 63*63, 25, 25)
    cls = torch.randn(4, 1, 25, 25) > 0.9
    mask = torch.randn(4, 1, 255, 255) * 2 - 1
    loss = select_mask_logistic_loss(p_m, mask, cls)
    print(loss)
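The key alignment trick in select_mask_logistic_loss is F.unfold: with a 127x127 window, padding=32 and stride=8, a 255x255 ground-truth mask yields exactly 25x25 = 625 windows, one per position of the RPN score map, so each spatial position can be compared against its own ground-truth crop. A quick standalone check (random values, mirroring the defaults above):

import torch
import torch.nn.functional as F

mask = torch.randn(4, 1, 255, 255)
patches = F.unfold(mask, (127, 127), padding=32, stride=8)
print(patches.shape)  # torch.Size([4, 16129, 625]): 16129 = 127*127, 625 = 25*25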
# --------------------------------------------------------
# SiamMask
# Licensed under The MIT License
# Written by Qiang Wang (wangqiang2015 at ia.ac.cn)
# --------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from utils.anchors import Anchors
"""
This file defines the SiamMask-refine network framework, which is essentially
the same architecture as in siammask.py; the difference is that run() adds the
refine_model, which combines backbone features with the correlation feature
to produce the mask prediction.
"""
class SiamMask(nn.Module):
    def __init__(self, anchors=None, o_sz=127, g_sz=127):
        super(SiamMask, self).__init__()
        self.anchors = anchors  # anchor_cfg
        self.anchor_num = len(self.anchors["ratios"]) * len(self.anchors["scales"])
        self.anchor = Anchors(anchors)
        self.features = None
        self.rpn_model = None
        self.mask_model = None
        self.o_sz = o_sz
        self.g_sz = g_sz
        self.upSample = nn.UpsamplingBilinear2d(size=[g_sz, g_sz])
        self.all_anchors = None

    def set_all_anchors(self, image_center, size):
        # cx,cy,w,h
        if not self.anchor.generate_all_anchors(image_center, size):
            return
        all_anchors = self.anchor.all_anchors[1]  # cx, cy, w, h
        self.all_anchors = torch.from_numpy(all_anchors).float().cuda()
        self.all_anchors = [self.all_anchors[i] for i in range(4)]
    def feature_extractor(self, x):
        return self.features(x)

    def rpn(self, template, search):
        pred_cls, pred_loc = self.rpn_model(template, search)
        return pred_cls, pred_loc

    def mask(self, template, search):
        pred_mask = self.mask_model(template, search)
        return pred_mask

    def _add_rpn_loss(self, label_cls, label_loc, lable_loc_weight, label_mask, label_mask_weight,
                      rpn_pred_cls, rpn_pred_loc, rpn_pred_mask):
        rpn_loss_cls = select_cross_entropy_loss(rpn_pred_cls, label_cls)
        rpn_loss_loc = weight_l1_loss(rpn_pred_loc, label_loc, lable_loc_weight)
        rpn_loss_mask, iou_m, iou_5, iou_7 = select_mask_logistic_loss(rpn_pred_mask, label_mask, label_mask_weight)
        return rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_m, iou_5, iou_7
    def run(self, template, search, softmax=False):
        """
        run network
        """
        template_feature = self.feature_extractor(template)
        # forward_all returns intermediate backbone features (used by the refine model) and the top feature
        feature, search_feature = self.features.forward_all(search)
        rpn_pred_cls, rpn_pred_loc = self.rpn(template_feature, search_feature)
        corr_feature = self.mask_model.mask.forward_corr(template_feature, search_feature)  # (b, 256, w, h)
        rpn_pred_mask = self.refine_model(feature, corr_feature)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature

    def softmax(self, cls):
        b, a2, h, w = cls.size()
        cls = cls.view(b, 2, a2//2, h, w)
        cls = cls.permute(0, 2, 3, 4, 1).contiguous()
        cls = F.log_softmax(cls, dim=4)
        return cls
    def forward(self, input):
        """
        :param input: dict of input with keys of:
                'template': [b, 3, h1, w1], input template image.
                'search': [b, 3, h2, w2], input search image.
                'label_cls':[b, max_num_gts, 5] or None(self.training==False),
                            each gt contains x1,y1,x2,y2,class.
        :return: dict of loss, predict, accuracy
        """
        template = input['template']
        search = input['search']
        if self.training:
            label_cls = input['label_cls']
            label_loc = input['label_loc']
            lable_loc_weight = input['label_loc_weight']
            label_mask = input['label_mask']
            label_mask_weight = input['label_mask_weight']

        rpn_pred_cls, rpn_pred_loc, rpn_pred_mask, template_feature, search_feature = \
            self.run(template, search, softmax=self.training)

        outputs = dict()
        outputs['predict'] = [rpn_pred_loc, rpn_pred_cls, rpn_pred_mask, template_feature, search_feature]
        if self.training:
            rpn_loss_cls, rpn_loss_loc, rpn_loss_mask, iou_acc_mean, iou_acc_5, iou_acc_7 = \
                self._add_rpn_loss(label_cls, label_loc, lable_loc_weight, label_mask, label_mask_weight,
                                   rpn_pred_cls, rpn_pred_loc, rpn_pred_mask)
            outputs['losses'] = [rpn_loss_cls, rpn_loss_loc, rpn_loss_mask]
            outputs['accuracy'] = [iou_acc_mean, iou_acc_5, iou_acc_7]
        return outputs
    def template(self, z):
        self.zf = self.feature_extractor(z)
        cls_kernel, loc_kernel = self.rpn_model.template(self.zf)
        return cls_kernel, loc_kernel

    def track(self, x, cls_kernel=None, loc_kernel=None, softmax=False):
        xf = self.feature_extractor(x)
        rpn_pred_cls, rpn_pred_loc = self.rpn_model.track(xf, cls_kernel, loc_kernel)
        if softmax:
            rpn_pred_cls = self.softmax(rpn_pred_cls)
        return rpn_pred_cls, rpn_pred_loc
def get_cls_loss(pred, label, select):
    if select.nelement() == 0: return pred.sum()*0.
    pred = torch.index_select(pred, 0, select)
    label = torch.index_select(label, 0, select)
    return F.nll_loss(pred, label)

def select_cross_entropy_loss(pred, label):
    pred = pred.view(-1, 2)
    label = label.view(-1)
    # GPU
    # pos = Variable(label.data.eq(1).nonzero().squeeze()).cuda()
    # neg = Variable(label.data.eq(0).nonzero().squeeze()).cuda()
    pos = Variable(label.data.eq(1).nonzero().squeeze())
    neg = Variable(label.data.eq(0).nonzero().squeeze())
    loss_pos = get_cls_loss(pred, label, pos)
    loss_neg = get_cls_loss(pred, label, neg)
    return loss_pos * 0.5 + loss_neg * 0.5

def weight_l1_loss(pred_loc, label_loc, loss_weight):
    """
    :param pred_loc: [b, 4k, h, w]
    :param label_loc: [b, 4k, h, w]
    :param loss_weight: [b, k, h, w]
    :return: loc loss value
    """
    b, _, sh, sw = pred_loc.size()
    pred_loc = pred_loc.view(b, 4, -1, sh, sw)
    diff = (pred_loc - label_loc).abs()
    diff = diff.sum(dim=1).view(b, -1, sh, sw)
    loss = diff * loss_weight
    return loss.sum().div(b)
def select_mask_logistic_loss(p_m, mask, weight, o_sz=63, g_sz=127):
    weight = weight.view(-1)
    pos = Variable(weight.data.eq(1).nonzero().squeeze())
    if pos.nelement() == 0: return p_m.sum() * 0, p_m.sum() * 0, p_m.sum() * 0, p_m.sum() * 0
    if len(p_m.shape) == 4:
        # raw (b, 63*63, h, w) predictions: select positives, then upsample to g_sz
        p_m = p_m.permute(0, 2, 3, 1).contiguous().view(-1, 1, o_sz, o_sz)
        p_m = torch.index_select(p_m, 0, pos)
        p_m = nn.UpsamplingBilinear2d(size=[g_sz, g_sz])(p_m)
        p_m = p_m.view(-1, g_sz * g_sz)
    else:
        # refined predictions are already flattened to (n, g_sz*g_sz)
        p_m = torch.index_select(p_m, 0, pos)
    mask_uf = F.unfold(mask, (g_sz, g_sz), padding=0, stride=8)
    mask_uf = torch.transpose(mask_uf, 1, 2).contiguous().view(-1, g_sz * g_sz)
    mask_uf = torch.index_select(mask_uf, 0, pos)
    loss = F.soft_margin_loss(p_m, mask_uf)
    iou_m, iou_5, iou_7 = iou_measure(p_m, mask_uf)
    return loss, iou_m, iou_5, iou_7

def iou_measure(pred, label):
    pred = pred.ge(0)
    mask_sum = pred.eq(1).add(label.eq(1))
    intxn = torch.sum(mask_sum == 2, dim=1).float()
    union = torch.sum(mask_sum > 0, dim=1).float()
    iou = intxn/union
    return torch.mean(iou), (torch.sum(iou > 0.5).float()/iou.shape[0]), (torch.sum(iou > 0.7).float()/iou.shape[0])
if __name__ == "__main__":
    p_m = torch.randn(4, 63*63, 25, 25)
    cls = torch.randn(4, 1, 25, 25) > 0.9
    mask = torch.randn(4, 1, 255, 255) * 2 - 1
    loss = select_mask_logistic_loss(p_m, mask, cls)
    print(loss)
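As a closing sanity check, the IoU logic in iou_measure can be traced by hand on a tiny example. The values below are made up, and the explicit .int() casts are our addition (the original relied on uint8 arithmetic from older PyTorch versions, which predate bool tensors):

import torch

pred = torch.tensor([[1.0, -1.0, 2.0, -3.0]])    # logits, binarized at 0 -> [1, 0, 1, 0]
label = torch.tensor([[1.0, 1.0, 0.0, 0.0]])     # binary ground-truth mask
mask_sum = pred.ge(0).int() + label.eq(1).int()  # [2, 1, 1, 0]
intxn = (mask_sum == 2).sum(dim=1).float()       # intersection: 1 pixel
union = (mask_sum > 0).sum(dim=1).float()        # union: 3 pixels
print((intxn / union).item())                    # 1/3 = 0.333...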