Hi everyone! I've been reading the DiMP code lately, and I kept noticing that by the time I reached the later parts I had forgotten the earlier ones. As my mom likes to say, "a good memory is no match for a worn pen", so today I decided to write down my notes on the code. There will surely be plenty of mistakes, so please bear with me and feel free to discuss. Comments and private messages are very welcome!
def initialize(self, image, info: dict) -> dict:
# Initialize some stuff
self.frame_num = 1
# The parameters in self.params come from pytracking/pytracking/parameter/dimp/dimp50.py (I use dimp50; other configs work the same way)
# Choose GPU or CPU
if not hasattr(self.params, 'device'):
self.params.device = 'cuda' if self.params.use_gpu else 'cpu'
# Initialize network
self.initialize_features() # jump to Section 2
# The DiMP network
self.net = self.params.net # define self.net
# Time initialization
tic = time.time() # record the start time
# Get target position and size. Initial state: ground_truth_rect[0,:] = [x1, y1, x2-x1, y2-y1], i.e. [x, y, w, h]
state = info['init_bbox']
self.pos = torch.Tensor([state[1] + (state[3] - 1)/2, state[0] + (state[2] - 1)/2]) # center coordinates, y first then x
self.target_sz = torch.Tensor([state[3], state[2]]) # target size, height first then width
# Set sizes
sz = self.params.image_sample_size # 18*16 = 288
# if sz is an int, img_sample_sz = [sz, sz]; otherwise keep sz as given
self.img_sample_sz = torch.Tensor([sz, sz] if isinstance(sz, int) else sz)
self.img_support_sz = self.img_sample_sz
# Set search area
# Search area: search_area_scale times the target box on each side; prod() multiplies the elements, item() extracts the scalar value
# area = (height*search_area_scale) * (width*search_area_scale)
search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
# target_scale is the side-length ratio between the search region in the image and the fixed-size sample (img_sample_sz), i.e. how much the 288x288 sample must be scaled to cover the search region (see the small numeric sketch after this function)
self.target_scale = math.sqrt(search_area) / self.img_sample_sz.prod().sqrt()
# Target size in base scale: the target size as it appears inside the resized img_sample_sz sample
self.base_target_sz = self.target_sz / self.target_scale
# Convert image
im = numpy_to_torch(image)
# Setup scale factors
if not hasattr(self.params, 'scale_factors'):
# this branch runs with the default settings
self.params.scale_factors = torch.ones(1)
elif isinstance(self.params.scale_factors, (list, tuple)):
self.params.scale_factors = torch.Tensor(self.params.scale_factors)
# Setup scale bounds
self.image_sz = torch.Tensor([im.shape[2], im.shape[3]]) # image_sz=[Height,Width]
self.min_scale_factor = torch.max(10 / self.base_target_sz)
self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)
# Extract and transform sample
init_backbone_feat = self.generate_init_samples(im) # jump to Section 3
# I haven't finished reading the code below yet; I'll update once I have
# Initialize classifier
self.init_classifier(init_backbone_feat)
# Initialize IoUNet
if getattr(self.params, 'use_iou_net', True):
self.init_iou_net(init_backbone_feat)
out = {'time': time.time() - tic}
return out
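Since the target_scale / base_target_sz lines above puzzled me at first, here is a small numeric sketch of what the initialization computes. The bounding box and search_area_scale values below are made up for illustration; check dimp50.py for the actual settings.

import math
import torch

# Hypothetical init_bbox in [x, y, w, h] format (made-up numbers)
state = [100., 50., 60., 40.]
pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])  # (y, x) center
target_sz = torch.Tensor([state[3], state[2]])                         # (height, width) = (40, 60)

img_sample_sz = torch.Tensor([288., 288.])                             # image_sample_size = 18*16
search_area_scale = 6                                                  # assumed value, see dimp50.py
search_area = torch.prod(target_sz * search_area_scale).item()         # 240 * 360 = 86400
target_scale = math.sqrt(search_area) / img_sample_sz.prod().sqrt()    # ~293.9 / 288 ~ 1.02
base_target_sz = target_sz / target_scale                              # target size inside the 288x288 sample

print(pos, target_scale, base_target_sz)
# target_scale says how much the fixed 288x288 sample has to be scaled to cover the
# search region in the original image; base_target_sz is the target size measured
# inside that sample.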
def initialize_features(self):
if not getattr(self, 'features_initialized', False):
self.params.net.initialize()
self.features_initialized = True
From dimp50.py we know that:
params.net = NetWithBackbone(net_path='dimp50.pth',
use_gpu=params.use_gpu)
So go to the initialize() function of the NetWithBackbone class in pytracking/pytracking/features/net_wrappers.py
def initialize(self):
super().initialize() # call the parent class's initialize(); the parent is the NetWrapper class below, so NetWrapper's initialize() runs
# the mean and std below are the ImageNet statistics; for a specific dataset you can compute your own
self._mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1)
self._std = torch.Tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1)
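The (1, -1, 1, 1) view makes the mean and std broadcast over a whole batch of images. Here is a minimal sketch of the standard ImageNet normalization these tensors enable (just an illustration, not the exact preprocessing call inside pytracking):

import torch

_mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1)
_std = torch.Tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1)

im = torch.rand(2, 3, 288, 288)          # dummy batch: (batch, channels, height, width)
im_normalized = (im - _mean) / _std      # per-channel normalization via broadcasting
print(im_normalized.shape)               # torch.Size([2, 3, 288, 288])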
class NetWrapper:
"""Used for wrapping networks in pytracking.
Network modules and functions can be accessed directly as if they were members of this class."""
_rec_iter=0
def __init__(self, net_path, use_gpu=True):
self.net_path = net_path
self.use_gpu = use_gpu
self.net = None
def __getattr__(self, name):
if self._rec_iter > 0:
self._rec_iter = 0
return None
self._rec_iter += 1
try:
ret_val = getattr(self.net, name)
except Exception as e:
self._rec_iter = 0
raise e
self._rec_iter = 0
return ret_val
def load_network(self):
self.net = load_network(self.net_path) # load the network model from net_path
if self.use_gpu:
self.cuda()
self.eval() # eval() here is nn.Module.eval(), which puts the network into evaluation mode (not Python's built-in eval for string expressions)
def initialize(self):
self.load_network()
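The interesting part of NetWrapper is __getattr__: any attribute lookup that fails on the wrapper falls through to the wrapped self.net, so the wrapper can be used almost as if it were the network itself (the _rec_iter counter seems to be there just to guard against runaway recursion before self.net is loaded). A stripped-down toy illustration of the same delegation pattern, not pytracking code:

class Inner:
    def greet(self):
        return 'hello from the wrapped object'

class Wrapper:
    def __init__(self):
        self.inner = Inner()

    def __getattr__(self, name):
        # only called when normal attribute lookup on Wrapper fails
        return getattr(self.inner, name)

w = Wrapper()
print(w.greet())   # delegated to Inner.greet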
Now go back to the line "# The DiMP network" in Section 1 and keep reading the code.
This block produces the initial training samples through data augmentation.
def generate_init_samples(self, im: torch.Tensor) -> TensorList:
"""Perform data augmentation to generate initial training samples."""
if getattr(self.params, 'border_mode', 'replicate') == 'inside':
# Get new sample size if forced inside the image
im_sz = torch.Tensor([im.shape[2], im.shape[3]])
sample_sz = self.target_scale * self.img_sample_sz
shrink_factor = (sample_sz.float() / im_sz).max().clamp(1)
sample_sz = (sample_sz.float() / shrink_factor)
self.init_sample_scale = (sample_sz / self.img_sample_sz).prod().sqrt()
tl = self.pos - (sample_sz - 1) / 2
br = self.pos + sample_sz / 2 + 1
global_shift = - ((-tl).clamp(0) - (br - im_sz).clamp(0)) / self.init_sample_scale
else:
# the else branch is executed
self.init_sample_scale = self.target_scale
global_shift = torch.zeros(2) # tensor([0., 0.])
self.init_sample_pos = self.pos.round() # round() rounds to the nearest integer
# Compute augmentation size
aug_expansion_factor = getattr(self.params, 'augmentation_expansion_factor', None) # 2
aug_expansion_sz = self.img_sample_sz.clone() # clone() makes an independent copy with the same values
aug_output_sz = None
if aug_expansion_factor is not None and aug_expansion_factor != 1: # executed
aug_expansion_sz = (self.img_sample_sz * aug_expansion_factor).long() # long() casts to int64
aug_expansion_sz += (aug_expansion_sz - self.img_sample_sz.long()) % 2
aug_expansion_sz = aug_expansion_sz.float()
aug_output_sz = self.img_sample_sz.long().tolist() # tolist() converts the tensor to a Python list: [288, 288]
# Random shift for each sample
get_rand_shift = lambda: None # a lambda function: inputs before the colon, expression after, e.g. lambda x, y: x*y
random_shift_factor = getattr(self.params, 'random_shift_factor', 0) # 1/3
if random_shift_factor > 0: # executed
get_rand_shift = lambda: ((torch.rand(2) - 0.5) * self.img_sample_sz * random_shift_factor + global_shift).long().tolist()
# Always put identity transformation first, since it is the unaugmented sample that is always used
# aug_output_sz=[288,288]
self.transforms = [augmentation.Identity(aug_output_sz, global_shift.long().tolist())]
# jump to the Identity class in Section 4
augs = self.params.augmentation if getattr(self.params, 'use_augmentation', True) else {}
# Add all augmentations
# extend: adds each element of the new list to the old list
# append: adds the object as a single element to the list
if 'shift' in augs: # translation
self.transforms.extend([augmentation.Translation(shift, aug_output_sz, global_shift.long().tolist()) for shift in augs['shift']]) # jump to the Translation class in Section 4
if 'relativeshift' in augs: # translation, given relative to the sample size
get_absolute = lambda shift: (torch.Tensor(shift) * self.img_sample_sz/2).long().tolist()
self.transforms.extend([augmentation.Translation(get_absolute(shift), aug_output_sz, global_shift.long().tolist()) for shift in augs['relativeshift']]) # jump to the Translation class in Section 4
if 'fliplr' in augs and augs['fliplr']: # horizontal flip
self.transforms.append(augmentation.FlipHorizontal(aug_output_sz, get_rand_shift()))
# jump to the FlipHorizontal class in Section 4
if 'blur' in augs: # blur
self.transforms.extend([augmentation.Blur(sigma, aug_output_sz, get_rand_shift()) for sigma in augs['blur']]) # jump to the Blur class in Section 4
if 'scale' in augs: # scaling
self.transforms.extend([augmentation.Scale(scale_factor, aug_output_sz, get_rand_shift()) for scale_factor in augs['scale']]) # jump to the Scale class in Section 4
if 'rotate' in augs: # rotation
self.transforms.extend([augmentation.Rotate(angle, aug_output_sz, get_rand_shift()) for angle in augs['rotate']]) # jump to the Rotate class in Section 4
# Extract augmented image patches
im_patches = sample_patch_transformed(im, self.init_sample_pos, self.init_sample_scale, aug_expansion_sz, self.transforms) # jump to the sample_patch_transformed function in Section 5
# Extract initial backbone features
with torch.no_grad(): # inside torch.no_grad() no gradients are tracked, since no backpropagation is needed here
init_backbone_feat = self.net.extract_backbone(im_patches)
"""关于特征提取这块,主要就是resnet的代码,在我的另一篇文章里有讲(https://blog.csdn.net/missyoudaisy/article/details/104512577),作者的代码跟官网的稍微有点不一样,但思路是一致的。这里就不讲resnet了"""
return init_backbone_feat # 返回“一”中的# Initialize classifier一行
class Transform:
"""Base data augmentation transform class."""
def __init__(self, output_sz = None, shift = None):
self.output_sz = output_sz
self.shift = (0,0) if shift is None else shift
def __call__(self, image):
raise NotImplementedError
def crop_to_output(self, image):
if isinstance(image, torch.Tensor):
imsz = image.shape[2:] # [height,width]
if self.output_sz is None:
pad_h = 0
pad_w = 0
else:
pad_h = (self.output_sz[0] - imsz[0]) / 2
pad_w = (self.output_sz[1] - imsz[1]) / 2
# rounding: floor rounds down, ceil rounds up
pad_left = math.floor(pad_w) + self.shift[1]
pad_right = math.ceil(pad_w) - self.shift[1]
pad_top = math.floor(pad_h) + self.shift[0]
pad_bottom = math.ceil(pad_h) - self.shift[0]
# positive pad values enlarge the image up to the output size; negative values (image larger than output_sz) crop it instead
return F.pad(image, (pad_left, pad_right, pad_top, pad_bottom), 'replicate') # replicate padding
else:
raise NotImplementedError
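To see how crop_to_output turns the pad amounts into a shift, here is a tiny demo with made-up sizes: padding a 4x4 image to 6x6 with shift = (1, 0) puts two rows on top and none on the bottom, so the content moves down by one row.

import torch
import torch.nn.functional as F

image = torch.arange(16, dtype=torch.float32).view(1, 1, 4, 4)
# output_sz = (6, 6), shift = (1, 0): pad_h = pad_w = 1
# pad_top = floor(1) + 1 = 2, pad_bottom = ceil(1) - 1 = 0
# pad_left = floor(1) + 0 = 1, pad_right = ceil(1) - 0 = 1
out = F.pad(image, (1, 1, 2, 0), 'replicate')
print(out.shape)   # torch.Size([1, 1, 6, 6]), content shifted one row down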
class Identity(Transform):
"""Identity transformation."""
def __call__(self, image):
return self.crop_to_output(image) # jump back to Section 3
class FlipHorizontal(Transform):
"""Flip along horizontal axis."""
def __call__(self, image):
if isinstance(image, torch.Tensor):
return self.crop_to_output(image.flip((3,))) # flip(dims) flips the tensor along the given dimensions; jump back to Section 3
else:
return np.fliplr(image)
class FlipVertical(Transform):
"""Flip along vertical axis."""
def __call__(self, image: torch.Tensor):
if isinstance(image, torch.Tensor):
return self.crop_to_output(image.flip((2,)))
else:
return np.flipud(image)
class Translation(Transform): # translation
"""Translate."""
def __init__(self, translation, output_sz = None, shift = None):
super().__init__(output_sz, shift) # call the parent (Transform) constructor so that output_sz and shift get initialized
self.shift = (self.shift[0] + translation[0], self.shift[1] + translation[1])
def __call__(self, image):
if isinstance(image, torch.Tensor):
# as I understand it, the translation is realized in crop_to_output by padding the four sides unequally
return self.crop_to_output(image) # back to Section 3
else:
raise NotImplementedError
class Scale(Transform):
"""Scale."""
def __init__(self, scale_factor, output_sz = None, shift = None):
super().__init__(output_sz, shift)
self.scale_factor = scale_factor
def __call__(self, image):
if isinstance(image, torch.Tensor):
# Calculate new size. Ensure that it is even so that crop/pad becomes easier.
# original image height and width
h_orig, w_orig = image.shape[2:]
if h_orig != w_orig:
raise NotImplementedError
h_new = round(h_orig / self.scale_factor) # round() returns the nearest integer
h_new += (h_new - h_orig) % 2 # % is the modulo operator; this keeps (h_new - h_orig) even so the crop/pad is symmetric
w_new = round(w_orig /self.scale_factor)
w_new += (w_new - w_orig) % 2
image_resized = F.interpolate(image, [h_new, w_new], mode='bilinear')
return self.crop_to_output(image_resized) # back to Section 3
else:
raise NotImplementedError
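A quick toy run of the Scale body with a made-up scale_factor, to show the even-size adjustment and the resize step (crop_to_output would afterwards replicate-pad the result back to output_sz, so the object looks zoomed out inside a fixed-size sample):

import torch
import torch.nn.functional as F

scale_factor = 1.5                      # made-up zoom factor
image = torch.rand(1, 3, 288, 288)
h_orig, w_orig = image.shape[2:]
h_new = round(h_orig / scale_factor)    # 192
h_new += (h_new - h_orig) % 2           # keep the crop/pad amount even
w_new = round(w_orig / scale_factor)
w_new += (w_new - w_orig) % 2
image_resized = F.interpolate(image, [h_new, w_new], mode='bilinear')
print(image_resized.shape)              # torch.Size([1, 3, 192, 192])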
class Affine(Transform):
"""Affine transformation."""
def __init__(self, transform_matrix, output_sz = None, shift = None):
super().__init__(output_sz, shift)
self.transform_matrix = transform_matrix
def __call__(self, image):
if isinstance(image, torch.Tensor):
return self.crop_to_output(numpy_to_torch(self(torch_to_numpy(image))))
else:
return cv.warpAffine(image, self.transform_matrix, image.shape[1::-1], borderMode=cv.BORDER_REPLICATE)
class Rotate(Transform):
"""Rotate with given angle."""
def __init__(self, angle, output_sz = None, shift = None):
super().__init__(output_sz, shift)
self.angle = math.pi * angle/180 # convert degrees to radians
def __call__(self, image):
if isinstance(image, torch.Tensor):
return self.crop_to_output(numpy_to_torch(self(torch_to_numpy(image))))
else:
# c is the image center; for the formula see: https://blog.csdn.net/missyoudaisy/article/details/104492020
c = (np.expand_dims(np.array(image.shape[:2]),1)-1)/2
# image.shape[:2] is the image height and width; shape is [height, width, channel]
R = np.array([[math.cos(self.angle), math.sin(self.angle)],
[-math.sin(self.angle), math.cos(self.angle)]])
H = np.concatenate([R, c - R @ c], 1) # @ is matrix multiplication; H is the 2x3 affine transformation matrix
return cv.warpAffine(image, H, image.shape[1::-1], borderMode=cv.BORDER_REPLICATE)
# the function above is explained in detail in the linked post; back to Section 3
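A quick sanity check on H with made-up numbers: because the translation part is c - R @ c, the center c is a fixed point of the transform, i.e. the rotation is performed about the image center.

import math
import numpy as np

angle = math.pi * 30 / 180                             # 30 degrees in radians
h, w = 101, 151                                        # made-up image size
c = (np.expand_dims(np.array([h, w]), 1) - 1) / 2      # center as a column vector
R = np.array([[math.cos(angle), math.sin(angle)],
              [-math.sin(angle), math.cos(angle)]])
H = np.concatenate([R, c - R @ c], 1)                  # 2x3 affine matrix

center_h = np.vstack([c, [[1.0]]])                     # center in homogeneous coordinates
print(H @ center_h)                                    # same as c -> the center stays fixed
print(c)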
class Blur(Transform):
"""Blur with given sigma (can be axis dependent)."""
""" 高斯模糊,推荐两个比较好的博客,看了就清楚下面是在做什么
https://www.cnblogs.com/invisible2/p/9177018.html
https://blog.csdn.net/lyl771857509/article/details/84113177
"""
def __init__(self, sigma, output_sz = None, shift = None):
# compute the Gaussian weights along each axis
super().__init__(output_sz, shift)
if isinstance(sigma, (float, int)):
sigma = (sigma, sigma)
self.sigma = sigma
self.filter_size = [math.ceil(2*s) for s in self.sigma]
x_coord = [torch.arange(-sz, sz+1, dtype=torch.float32) for sz in self.filter_size]
self.filter = [torch.exp(-(x**2)/(2*s**2)) for x, s in zip(x_coord, self.sigma)] # x**2 is x squared
self.filter[0] = self.filter[0].view(1,1,-1,1) / self.filter[0].sum()
self.filter[1] = self.filter[1].view(1,1,1,-1) / self.filter[1].sum()
def __call__(self, image):
if isinstance(image, torch.Tensor):
# the four dims of shape are batch size, channels, height, width; image.shape[2:] takes everything from the 3rd dim onward
sz = image.shape[2:]
# when one dimension given to view() is -1, that size is inferred from the remaining dimensions
im1 = F.conv2d(image.view(-1,1,sz[0],sz[1]), self.filter[0], padding=(self.filter_size[0],0))
return self.crop_to_output(F.conv2d(im1, self.filter[1], padding=(0,self.filter_size[1])).view(1,-1,sz[0],sz[1])) # back to Section 3
else:
raise NotImplementedError
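The Blur transform is a separable Gaussian: two 1-D kernels applied one after the other, which is equivalent to (and cheaper than) a full 2-D Gaussian convolution. A minimal sketch of building and applying such a kernel with a made-up sigma:

import math
import torch
import torch.nn.functional as F

sigma = (2.0, 2.0)                                   # made-up blur strength (vertical, horizontal)
filter_size = [math.ceil(2 * s) for s in sigma]      # kernel half-width per axis
x_coord = [torch.arange(-sz, sz + 1, dtype=torch.float32) for sz in filter_size]
filt = [torch.exp(-(x ** 2) / (2 * s ** 2)) for x, s in zip(x_coord, sigma)]
filt[0] = filt[0].view(1, 1, -1, 1) / filt[0].sum()  # vertical kernel, normalized to sum 1
filt[1] = filt[1].view(1, 1, 1, -1) / filt[1].sum()  # horizontal kernel, normalized to sum 1

image = torch.rand(1, 3, 64, 64)                     # dummy image batch
sz = image.shape[2:]
# treat each channel as a separate sample, blur along height, then along width
im1 = F.conv2d(image.view(-1, 1, sz[0], sz[1]), filt[0], padding=(filter_size[0], 0))
blurred = F.conv2d(im1, filt[1], padding=(0, filter_size[1])).view(1, -1, sz[0], sz[1])
print(blurred.shape)                                 # torch.Size([1, 3, 64, 64])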
def sample_patch_transformed(im, pos, scale, image_sz, transforms):
"""Extract transformed image samples.
args:
im: Image.
pos: Center position for extraction.
scale: Image scale to extract features from.
image_sz: Size to resize the image samples to before extraction.
transforms: A set of image transforms to apply.
"""
# Get image patch
im_patch, _ = sample_patch(im, pos, scale*image_sz, image_sz) # jump to the sample_patch function below
# Apply transforms
im_patches = torch.cat([T(im_patch) for T in transforms])
return im_patches
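So the whole augmentation pipeline boils down to: crop one (expanded) patch, then run every transform on it and stack the results along the batch dimension. A toy sketch of that last line, assuming the Identity, FlipHorizontal and Blur classes from Section 4 are in scope (in pytracking they live in pytracking/features/augmentation.py); the patch and sizes are made up:

import torch

im_patch = torch.rand(1, 3, 288, 288)      # dummy patch, already at the output size
output_sz = [288, 288]

transforms = [
    Identity(output_sz, [0, 0]),
    FlipHorizontal(output_sz, [0, 0]),
    Blur((1.0, 1.0), output_sz, [0, 0]),
]

# exactly what sample_patch_transformed does after cropping the patch
im_patches = torch.cat([T(im_patch) for T in transforms])
print(im_patches.shape)                    # torch.Size([3, 3, 288, 288])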
def sample_patch(im: torch.Tensor, pos: torch.Tensor, sample_sz: torch.Tensor, output_sz: torch.Tensor = None,
mode: str = 'replicate'):
"""Sample an image patch.
args:
im: Image
pos: center position of crop
sample_sz: size to crop
output_sz: size to resize to
mode: how to treat image borders: 'replicate' (default) or 'inside'
"""
if mode not in ['replicate', 'inside']:
raise ValueError('Unknown border mode \'{}\'.'.format(mode))
# copy and convert
posl = pos.long().clone()
# Get new sample size if forced inside the image
if mode == 'inside':
im_sz = torch.Tensor([im.shape[2], im.shape[3]])
shrink_factor = (sample_sz.float() / im_sz).max().clamp(1) # clamp(1) makes shrink_factor at least 1
sample_sz = (sample_sz.float() / shrink_factor).long()
# Compute pre-downsampling factor
if output_sz is not None:
resize_factor = torch.min(sample_sz.float() / output_sz.float()).item()
df = int(max(int(resize_factor - 0.1), 1)) # the -0.1 is a small tolerance so that a resize factor just above an integer does not trigger an extra level of downsampling; df is always >= 1
else:
df = int(1)
sz = sample_sz.float() / df # new size
# Do downsampling: take every df-th pixel starting at offset os = posl % df, so the crop center in the downsampled image becomes (posl - os) / df
if df > 1: # downsampling is needed
os = posl % df # offset
posl = (posl - os) / df # new position
im2 = im[..., os[0].item()::df, os[1].item()::df] # downsample
else:
# in practice this is the branch that runs!
im2 = im
# compute size to crop
szl = torch.max(sz.round(), torch.Tensor([2])).long() # at least 2
# Extract top and bottom coordinates
tl = posl - (szl - 1)/2 # top-left corner
br = posl + szl/2 + 1 # bottom-right corner
# Shift the crop to inside
if mode == 'inside':
im2_sz = torch.LongTensor([im2.shape[2], im2.shape[3]])
shift = (-tl).clamp(0) - (br - im2_sz).clamp(0)
tl += shift
br += shift
# Get image patch
im_patch = im2[...,tl[0].item():br[0].item(),tl[1].item():br[1].item()]
else:
# Get image patch
# negative values passed to F.pad crop instead of pad, so this one call both crops the in-image part of the region and replicate-pads whatever falls outside the image (see the small demo after this function)
im_patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3], -tl[0].item(), br[0].item() - im2.shape[2]), mode)
# Get image coordinates
patch_coord = df * torch.cat((tl, br)).view(1,4)
if output_sz is None or (im_patch.shape[-2] == output_sz[0] and im_patch.shape[-1] == output_sz[1]):
return im_patch.clone(), patch_coord
# Resample
im_patch = F.interpolate(im_patch, output_sz.long().tolist(), mode='bilinear')
return im_patch, patch_coord # back to Section 3
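About the negative pad arguments above: F.pad interprets negative values as cropping, so this single call crops the part of the [tl, br) region that lies inside the image and replicate-pads the part that falls outside it. A tiny demo with made-up coordinates:

import torch
import torch.nn.functional as F

im2 = torch.arange(100, dtype=torch.float32).view(1, 1, 10, 10)
tl = torch.LongTensor([-2, 3])    # crop starts 2 px above the image and 3 px from the left
br = torch.LongTensor([6, 12])    # crop ends inside vertically and 2 px past the right edge

# (left, right, top, bottom) = (-3, 2, 2, -4):
# the negative values crop columns/rows, the positive ones replicate-pad the missing border
patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3],
                    -tl[0].item(), br[0].item() - im2.shape[2]), 'replicate')
print(patch.shape)                # torch.Size([1, 1, 8, 9]) == (br - tl)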