[DiMP Tracking Algorithm] Code Study Notes 1: Initialization

Hi everyone! I've been reading the DiMP code lately, but I keep forgetting the earlier parts by the time I reach the later ones. As my mother likes to say, "a good memory is no match for a worn pen," so today I decided to write down my thoughts on the code. There are bound to be mistakes, so please bear with me, and feel free to comment or message me!

1. The initialize function in pytracking/pytracking/tracker/dimp/dimp.py

 def initialize(self, image, info: dict) -> dict:
        # Initialize some stuff
        self.frame_num = 1
        # self.params is loaded from pytracking/pytracking/parameter/dimp/dimp50.py (I use dimp50; other variants work the same way)
        # Decide whether to run on GPU or CPU
        if not hasattr(self.params, 'device'):
            self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

        # Initialize network
        self.initialize_features()  # see Section 2

        # The DiMP network
        self.net = self.params.net  # the network wrapper loaded in initialize_features

        # Time initialization
        tic = time.time()   # start timing the initialization

        # Get target position and size. The initial state is ground_truth_rect[0, :] in [x1, y1, x2-x1, y2-y1] = [x, y, w, h] format
        state = info['init_bbox']
        self.pos = torch.Tensor([state[1] + (state[3] - 1)/2, state[0] + (state[2] - 1)/2])     # center point, in (y, x) order
        self.target_sz = torch.Tensor([state[3], state[2]])     # target size, in (height, width) order

        # Set sizes
        sz = self.params.image_sample_size  # 18 * 16 = 288 in dimp50.py
        # If sz is an int, img_sample_sz = [sz, sz]; otherwise use sz as given
        self.img_sample_sz = torch.Tensor([sz, sz] if isinstance(sz, int) else sz)
        self.img_support_sz = self.img_sample_sz

        # Set search area
        # Search area: the target area scaled by search_area_scale in each dimension.
        # prod multiplies the elements together, item extracts the Python scalar, so
        # search_area = height * search_area_scale * width * search_area_scale
        search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
        # target_scale is the ratio between the side length of the (square) search
        # region and the side length of the resized sample image, i.e. how many
        # image pixels one pixel of the 288x288 sample covers
        self.target_scale = math.sqrt(search_area) / self.img_sample_sz.prod().sqrt()

        # Target size in base scale: the target size measured in the coordinate
        # frame of the resized sample (the search region mapped onto img_sample_sz)
        self.base_target_sz = self.target_sz / self.target_scale

        # Convert image
        im = numpy_to_torch(image)

        # Setup scale factors
        if not hasattr(self.params, 'scale_factors'):
            # this branch is taken (dimp50.py does not set scale_factors)
            self.params.scale_factors = torch.ones(1)
        elif isinstance(self.params.scale_factors, (list, tuple)):
            self.params.scale_factors = torch.Tensor(self.params.scale_factors)

        # Setup scale bounds
        self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])   # image_sz = [height, width]
        self.min_scale_factor = torch.max(10 / self.base_target_sz)            # keep the target at least ~10 px in the image
        self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz) # don't let the target grow larger than the image

        # Extract and transform sample
        init_backbone_feat = self.generate_init_samples(im)    # see Section 3
        # I haven't finished reading the code below yet; I'll update once I have
        # Initialize classifier
        self.init_classifier(init_backbone_feat)

        # Initialize IoUNet
        if getattr(self.params, 'use_iou_net', True):
            self.init_iou_net(init_backbone_feat)

        out = {'time': time.time() - tic}
        return out
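
To make the geometry above concrete, here is a minimal standalone sketch of the position/size/scale bookkeeping. The bounding box and search_area_scale are made-up values for illustration only (check your own dimp50.py for the real setting):

    import math
    import torch

    # Hypothetical values, for illustration only
    state = [100., 50., 60., 40.]         # init_bbox as [x, y, w, h]
    img_sample_sz = torch.Tensor([288., 288.])
    search_area_scale = 5                 # illustrative; see dimp50.py

    pos = torch.Tensor([state[1] + (state[3] - 1) / 2,    # center y = 69.5
                        state[0] + (state[2] - 1) / 2])   # center x = 129.5
    target_sz = torch.Tensor([state[3], state[2]])        # [40., 60.]

    search_area = torch.prod(target_sz * search_area_scale).item()   # 200 * 300 = 60000
    target_scale = math.sqrt(search_area) / img_sample_sz.prod().sqrt()
    base_target_sz = target_sz / target_scale

    print(target_scale)     # ~0.8505: one sample pixel covers ~0.85 image pixels
    print(base_target_sz)   # ~[47.0, 70.6]: the target size inside the 288x288 sample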

2. The initialize_features function in pytracking/pytracking/tracker/dimp/dimp.py

    def initialize_features(self):
        if not getattr(self, 'features_initialized', False):
            self.params.net.initialize()
        self.features_initialized = True

From dimp50.py we know that:

params.net = NetWithBackbone(net_path='dimp50.pth',
                                 use_gpu=params.use_gpu)

so we look at the initialize() function of the NetWithBackbone class in pytracking/pytracking/features/net_wrappers.py:

    def initialize(self):
        super().initialize() # call the parent class's initialize; the parent is the NetWrapper class below, so NetWrapper.initialize runs
        # The mean and std below are the ImageNet statistics; for a specific dataset you can compute your own
        self._mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1)
        self._std = torch.Tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1)

class NetWrapper:
    """Used for wrapping networks in pytracking.
    Network modules and functions can be accessed directly as if they were members of this class."""
    _rec_iter=0
    def __init__(self, net_path, use_gpu=True):
        self.net_path = net_path
        self.use_gpu = use_gpu
        self.net = None

    def __getattr__(self, name):
        if self._rec_iter > 0:
            self._rec_iter = 0
            return None
        self._rec_iter += 1
        try:
            ret_val = getattr(self.net, name)
        except Exception as e:
            self._rec_iter = 0
            raise e
        self._rec_iter = 0
        return ret_val

    def load_network(self):
        self.net = load_network(self.net_path)  # load the network checkpoint from disk
        if self.use_gpu:
            self.cuda()
        self.eval() # switch the wrapped network to evaluation mode (nn.Module.eval, which disables dropout and freezes batch-norm statistics; not Python's built-in eval)

    def initialize(self):
        self.load_network()

Now return to the "# The DiMP network" line in Section 1 and continue reading.

3. The generate_init_samples function in pytracking/pytracking/tracker/dimp/dimp.py

This function generates the initial training samples through data augmentation.

    def generate_init_samples(self, im: torch.Tensor) -> TensorList:
        """Perform data augmentation to generate initial training samples."""

        if getattr(self.params, 'border_mode', 'replicate') == 'inside':
            # Get new sample size if forced inside the image
            im_sz = torch.Tensor([im.shape[2], im.shape[3]])
            sample_sz = self.target_scale * self.img_sample_sz
            shrink_factor = (sample_sz.float() / im_sz).max().clamp(1)
            sample_sz = (sample_sz.float() / shrink_factor)
            self.init_sample_scale = (sample_sz / self.img_sample_sz).prod().sqrt()
            tl = self.pos - (sample_sz - 1) / 2
            br = self.pos + sample_sz / 2 + 1
            global_shift = - ((-tl).clamp(0) - (br - im_sz).clamp(0)) / self.init_sample_scale
        else:
            # this branch is taken (the default border_mode is 'replicate')
            self.init_sample_scale = self.target_scale
            global_shift = torch.zeros(2)   # tensor([0., 0.])

        self.init_sample_pos = self.pos.round()    # round to the nearest integer

        # Compute augmentation size
        aug_expansion_factor = getattr(self.params, 'augmentation_expansion_factor', None)  # 2 in dimp50.py
        aug_expansion_sz = self.img_sample_sz.clone()   # clone makes an independent copy with the same values
        aug_output_sz = None
        if aug_expansion_factor is not None and aug_expansion_factor != 1:  # taken
            aug_expansion_sz = (self.img_sample_sz * aug_expansion_factor).long()   # long casts to int64
            aug_expansion_sz += (aug_expansion_sz - self.img_sample_sz.long()) % 2
            aug_expansion_sz = aug_expansion_sz.float()
            aug_output_sz = self.img_sample_sz.long().tolist()  # convert the tensor to a Python list: [288, 288]

        # Random shift for each sample
        get_rand_shift = lambda: None   # a lambda takes its inputs before the colon and returns the expression after it, e.g. lambda x, y: x * y
        random_shift_factor = getattr(self.params, 'random_shift_factor', 0)    # 1/3 in dimp50.py
        if random_shift_factor > 0: # taken
            get_rand_shift = lambda: ((torch.rand(2) - 0.5) * self.img_sample_sz * random_shift_factor + global_shift).long().tolist()

        # Always put identity transformation first, since it is the unaugmented sample that is always used
        # aug_output_sz = [288, 288]
        self.transforms = [augmentation.Identity(aug_output_sz, global_shift.long().tolist())]
        # see the Identity class in Section 4

        augs = self.params.augmentation if getattr(self.params, 'use_augmentation', True) else {}

        # Add all augmentations
        # extend adds each element of a new list to the old list individually;
        # append adds one object to the list as a whole
        if 'shift' in augs:     # translation
            self.transforms.extend([augmentation.Translation(shift, aug_output_sz, global_shift.long().tolist()) for shift in augs['shift']])   # see Translation in Section 4
        if 'relativeshift' in augs:     # translation, given as a fraction of half the sample size
            get_absolute = lambda shift: (torch.Tensor(shift) * self.img_sample_sz/2).long().tolist()
            self.transforms.extend([augmentation.Translation(get_absolute(shift), aug_output_sz, global_shift.long().tolist()) for shift in augs['relativeshift']])   # see Translation in Section 4
        if 'fliplr' in augs and augs['fliplr']:     # horizontal flip
            self.transforms.append(augmentation.FlipHorizontal(aug_output_sz, get_rand_shift()))
            # see FlipHorizontal in Section 4
        if 'blur' in augs:  # blur
            self.transforms.extend([augmentation.Blur(sigma, aug_output_sz, get_rand_shift()) for sigma in augs['blur']])   # see Blur in Section 4
        if 'scale' in augs: # scale
            self.transforms.extend([augmentation.Scale(scale_factor, aug_output_sz, get_rand_shift()) for scale_factor in augs['scale']])   # see Scale in Section 4
        if 'rotate' in augs:    # rotation
            self.transforms.extend([augmentation.Rotate(angle, aug_output_sz, get_rand_shift()) for angle in augs['rotate']])   # see Rotate in Section 4

        # Extract augmented image patches
        im_patches = sample_patch_transformed(im, self.init_sample_pos, self.init_sample_scale, aug_expansion_sz, self.transforms)  # see sample_patch_transformed in Section 5

        # Extract initial backbone features
        with torch.no_grad():   # inside torch.no_grad() no gradients are computed and no graph is stored for backpropagation
            init_backbone_feat = self.net.extract_backbone(im_patches)
        """The feature extraction here is essentially the ResNet code, which I walk through in another post (https://blog.csdn.net/missyoudaisy/article/details/104512577). The author's code differs slightly from the official version, but the idea is the same, so I won't cover ResNet here."""

        return init_backbone_feat   # back to the "# Initialize classifier" line in Section 1
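
For reference, the augmentation settings come from the parameter file. The dict below is only illustrative (the real values live in pytracking/pytracking/parameter/dimp/dimp50.py), but it shows the shape of the configuration and how many transforms it produces:

    # Illustrative only -- check your own dimp50.py for the real settings.
    augmentation = {
        'fliplr': True,                               # one horizontally flipped sample
        'blur': [(3, 1), (1, 3), (2, 2)],             # axis-dependent Gaussian sigmas
        'relativeshift': [(0.6, 0.6), (-0.6, 0.6),
                          (0.6, -0.6), (-0.6, -0.6)], # fractions of img_sample_sz / 2
        'rotate': [10, -10, 45, -45],                 # degrees
    }
    # With these settings self.transforms holds 1 (Identity) + 4 (relativeshift)
    # + 1 (fliplr) + 3 (blur) + 4 (rotate) = 13 transforms, so
    # sample_patch_transformed returns 13 augmented 288x288 patches.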


4. The pytracking/pytracking/features/augmentation.py file

class Transform:
    """Base data augmentation transform class."""

    def __init__(self, output_sz = None, shift = None):
        self.output_sz = output_sz
        self.shift = (0,0) if shift is None else shift

    def __call__(self, image):
        raise NotImplementedError

    def crop_to_output(self, image):
        if isinstance(image, torch.Tensor):
            imsz = image.shape[2:] # [height,width]

            if self.output_sz is None:
                pad_h = 0
                pad_w = 0
            else:
                pad_h = (self.output_sz[0] - imsz[0]) / 2
                pad_w = (self.output_sz[1] - imsz[1]) / 2

            # rounding: floor rounds down, ceil rounds up
            pad_left = math.floor(pad_w) + self.shift[1]
            pad_right = math.ceil(pad_w) - self.shift[1]
            pad_top = math.floor(pad_h) + self.shift[0]
            pad_bottom = math.ceil(pad_h) - self.shift[0]
            # If the image is smaller than the requested output size, pad it;
            # negative padding values crop instead (see the example after this class)
            return F.pad(image, (pad_left, pad_right, pad_top, pad_bottom), 'replicate')    # replicate border padding

        else:
            raise NotImplementedError
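
Note that F.pad with negative padding crops rather than pads, so crop_to_output handles both directions in one call: if the incoming image is larger than output_sz, pad_h and pad_w are negative and a center crop is taken; if it is smaller, the border is replicated. A quick standalone check:

    import torch
    import torch.nn.functional as F

    im = torch.arange(36.).view(1, 1, 6, 6)

    # output_sz = (4, 4): pad_h = pad_w = (4 - 6) / 2 = -1,
    # so one pixel is cropped from each side
    cropped = F.pad(im, (-1, -1, -1, -1), 'replicate')
    print(cropped.shape)    # torch.Size([1, 1, 4, 4])

    # output_sz = (8, 8): pad_h = pad_w = 1,
    # so one replicated pixel is added on each side
    padded = F.pad(im, (1, 1, 1, 1), 'replicate')
    print(padded.shape)     # torch.Size([1, 1, 8, 8])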

class Identity(Transform):
    """Identity transformation."""
    def __call__(self, image):
        return self.crop_to_output(image)   # back to Section 3

class FlipHorizontal(Transform):
    """Flip along horizontal axis."""
    def __call__(self, image):
        if isinstance(image, torch.Tensor):
            return self.crop_to_output(image.flip((3,)))    # flip(dims) reverses the tensor along the given dimensions; dim 3 is width. Back to Section 3
        else:
            return np.fliplr(image)

class FlipVertical(Transform):
    """Flip along vertical axis."""
    def __call__(self, image: torch.Tensor):
        if isinstance(image, torch.Tensor):
            return self.crop_to_output(image.flip((2,)))
        else:
            return np.flipud(image)

class Translation(Transform):
    """Translate."""
    def __init__(self, translation, output_sz = None, shift = None):
        super().__init__(output_sz, shift)  # run the parent (Transform) constructor to set output_sz and the base shift
        self.shift = (self.shift[0] + translation[0], self.shift[1] + translation[1])

    def __call__(self, image):
        if isinstance(image, torch.Tensor):
            # the translation is realized in crop_to_output: the extra shift changes the
            # left/right/top/bottom padding amounts, which moves the crop window
            return self.crop_to_output(image)   # back to Section 3
        else:
            raise NotImplementedError

class Scale(Transform):
    """Scale."""
    def __init__(self, scale_factor, output_sz = None, shift = None):
        super().__init__(output_sz, shift)
        self.scale_factor = scale_factor

    def __call__(self, image):
        if isinstance(image, torch.Tensor):
            # Calculate new size. Ensure that it is even so that crop/pad becomes easier.
            h_orig, w_orig = image.shape[2:]    # original height and width of the image

            if h_orig != w_orig:
                raise NotImplementedError

            h_new = round(h_orig / self.scale_factor)   # round to the nearest integer
            h_new += (h_new - h_orig) % 2   # make (h_new - h_orig) even, so the crop/pad in crop_to_output is symmetric
            w_new = round(w_orig / self.scale_factor)
            w_new += (w_new - w_orig) % 2

            image_resized = F.interpolate(image, [h_new, w_new], mode='bilinear')

            return self.crop_to_output(image_resized)   # back to Section 3
        else:
            raise NotImplementedError
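
A quick numeric check of the size computation above, with illustrative values (the transforms receive the 576x576 aug-expansion patch, i.e. 2 x 288; the scale factor here is made up):

    h_orig = 576                          # aug_expansion patch side, 2 * 288
    scale_factor = 0.7                    # illustrative scale augmentation value
    h_new = round(h_orig / scale_factor)  # 823
    h_new += (h_new - h_orig) % 2         # 824: the size change 824 - 576 = 248 is even
    print(h_new)
    # crop_to_output then center-crops/pads the bilinearly resized image back
    # to output_sz, removing/adding equal amounts on both sides.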

class Affine(Transform):
    """Affine transformation."""
    def __init__(self, transform_matrix, output_sz = None, shift = None):
        super().__init__(output_sz, shift)
        self.transform_matrix = transform_matrix

    def __call__(self, image):
        if isinstance(image, torch.Tensor):
            return self.crop_to_output(numpy_to_torch(self(torch_to_numpy(image))))
        else:
            return cv.warpAffine(image, self.transform_matrix, image.shape[1::-1], borderMode=cv.BORDER_REPLICATE)

class Rotate(Transform):
    """Rotate with given angle."""
    def __init__(self, angle, output_sz = None, shift = None):
        super().__init__(output_sz, shift)
        self.angle = math.pi * angle/180    # convert degrees to radians

    def __call__(self, image):
        if isinstance(image, torch.Tensor):
            return self.crop_to_output(numpy_to_torch(self(torch_to_numpy(image))))
        else:
            # c is the center point of the image; for the derivation of the formula
            # see https://blog.csdn.net/missyoudaisy/article/details/104492020
            # image.shape[:2] is (height, width) of the HxWxC image
            c = (np.expand_dims(np.array(image.shape[:2]), 1) - 1) / 2
            R = np.array([[math.cos(self.angle), math.sin(self.angle)],
                          [-math.sin(self.angle), math.cos(self.angle)]])
            H = np.concatenate([R, c - R @ c], 1)    # @ is matrix multiplication; H is the 2x3 affine matrix
            return cv.warpAffine(image, H, image.shape[1::-1], borderMode=cv.BORDER_REPLICATE)
            # warpAffine is explained in the link above; back to Section 3
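
The matrix H = [R | c - Rc] is constructed so that the image center c is a fixed point of the rotation: Rc + (c - Rc) = c. A standalone numpy check (my own example, not repo code):

    import math
    import numpy as np

    angle = math.pi * 30 / 180
    c = (np.expand_dims(np.array([480, 640]), 1) - 1) / 2   # center of a 480x640 image
    R = np.array([[math.cos(angle),  math.sin(angle)],
                  [-math.sin(angle), math.cos(angle)]])
    H = np.concatenate([R, c - R @ c], 1)                   # 2x3 affine matrix

    # The center maps to itself under the affine transform:
    print(H @ np.vstack([c, [[1.]]]))                       # equals c = [[239.5], [319.5]]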

class Blur(Transform):
    """Blur with given sigma (can be axis dependent)."""
    # Gaussian blur. Two helpful posts on what the code below does:
    # https://www.cnblogs.com/invisible2/p/9177018.html
    # https://blog.csdn.net/lyl771857509/article/details/84113177
    def __init__(self, sigma, output_sz = None, shift = None):
        # build the 1-D Gaussian weights for each axis
        super().__init__(output_sz, shift)
        if isinstance(sigma, (float, int)):
            sigma = (sigma, sigma)
        self.sigma = sigma
        self.filter_size = [math.ceil(2*s) for s in self.sigma]
        x_coord = [torch.arange(-sz, sz+1, dtype=torch.float32) for sz in self.filter_size]
        self.filter = [torch.exp(-(x**2)/(2*s**2)) for x, s in zip(x_coord, self.sigma)]    # x**2 is x squared
        self.filter[0] = self.filter[0].view(1,1,-1,1) / self.filter[0].sum()
        self.filter[1] = self.filter[1].view(1,1,1,-1) / self.filter[1].sum()

    def __call__(self, image):
        if isinstance(image, torch.Tensor):
            # the four dims are (batch size, channels, height, width); image.shape[2:] takes dims from index 2 on, i.e. (height, width)
            sz = image.shape[2:]
            # a -1 dimension in view is inferred from the remaining dimensions
            im1 = F.conv2d(image.view(-1,1,sz[0],sz[1]), self.filter[0], padding=(self.filter_size[0],0))
            return self.crop_to_output(F.conv2d(im1, self.filter[1], padding=(0,self.filter_size[1])).view(1,-1,sz[0],sz[1]))   # back to Section 3
        else:
            raise NotImplementedError
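
The two conv2d calls implement a separable Gaussian blur: convolving with a (2k+1)x1 vertical kernel and then a 1x(2k+1) horizontal kernel is equivalent to a single 2-D Gaussian convolution, but cheaper. A standalone sketch verifying the equivalence (my own example values):

    import math
    import torch
    import torch.nn.functional as F

    sigma = 2.0
    k = math.ceil(2 * sigma)                 # filter half-size, as in Blur above
    x = torch.arange(-k, k + 1, dtype=torch.float32)
    g = torch.exp(-(x ** 2) / (2 * sigma ** 2))
    g = g / g.sum()

    image = torch.rand(1, 3, 32, 32)
    sz = image.shape[2:]

    # Vertical then horizontal 1-D convolution, each channel independently
    im1 = F.conv2d(image.view(-1, 1, sz[0], sz[1]), g.view(1, 1, -1, 1), padding=(k, 0))
    out = F.conv2d(im1, g.view(1, 1, 1, -1), padding=(0, k)).view(1, -1, sz[0], sz[1])

    # Equivalent single 2-D Gaussian kernel (outer product of the 1-D kernels)
    g2d = (g.view(-1, 1) @ g.view(1, -1)).view(1, 1, 2 * k + 1, 2 * k + 1)
    out2 = F.conv2d(image.view(-1, 1, sz[0], sz[1]), g2d, padding=k).view(1, -1, sz[0], sz[1])
    print(torch.allclose(out, out2, atol=1e-6))   # True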

5. The pytracking/pytracking/features/preprocessing.py file

def sample_patch_transformed(im, pos, scale, image_sz, transforms):
    """Extract transformed image samples.
    args:
        im: Image.
        pos: Center position for extraction.
        scale: Image scale to extract features from.
        image_sz: Size to resize the image samples to before extraction.
        transforms: A set of image transforms to apply.
    """

    # Get image patch
    im_patch, _ = sample_patch(im, pos, scale*image_sz, image_sz) # see the sample_patch function below

    # Apply transforms
    im_patches = torch.cat([T(im_patch) for T in transforms])

    return im_patches

def sample_patch(im: torch.Tensor, pos: torch.Tensor, sample_sz: torch.Tensor, output_sz: torch.Tensor = None,
                 mode: str = 'replicate'):
    """Sample an image patch.

    args:
        im: Image
        pos: center position of crop
        sample_sz: size to crop
        output_sz: size to resize to
        mode: how to treat image borders: 'replicate' (default) or 'inside'
    """

    if mode not in ['replicate', 'inside']:
        raise ValueError('Unknown border mode \'{}\'.'.format(mode))

    # copy and convert
    posl = pos.long().clone()

    # Get new sample size if forced inside the image
    if mode == 'inside':
        im_sz = torch.Tensor([im.shape[2], im.shape[3]])
        shrink_factor = (sample_sz.float() / im_sz).max().clamp(1)  # clamp(1) bounds shrink_factor below by 1
        sample_sz = (sample_sz.float() / shrink_factor).long()

    # Compute pre-downsampling factor
    if output_sz is not None:
        resize_factor = torch.min(sample_sz.float() / output_sz.float()).item()
        df = int(max(int(resize_factor - 0.1), 1))  # integer pre-downsampling stride, always >= 1; the -0.1 appears to be a safety margin so the stride-df patch never ends up smaller than output_sz due to rounding
    else:
        df = int(1)

    sz = sample_sz.float() / df     # new size
    # Do downsampling. Striding the image by df with offset os = posl % df means the
    # pixel at posl lands exactly on a kept pixel, at index (posl - os) / df in the strided image
    if df > 1:  # downsample before cropping
        os = posl % df              # offset
        posl = (posl - os) / df     # new position
        im2 = im[..., os[0].item()::df, os[1].item()::df]   # downsample
    else:
        # with df == 1 (the usual case here) this branch runs
        im2 = im

    # compute size to crop
    szl = torch.max(sz.round(), torch.Tensor([2])).long()   # crop size, at least 2

    # Extract top and bottom coordinates
    tl = posl - (szl - 1)/2 # top-left corner
    br = posl + szl/2 + 1   # bottom-right corner
    # Shift the crop to inside
    if mode == 'inside':
        im2_sz = torch.LongTensor([im2.shape[2], im2.shape[3]])
        shift = (-tl).clamp(0) - (br - im2_sz).clamp(0)
        tl += shift
        br += shift

        # Get image patch
        im_patch = im2[...,tl[0].item():br[0].item(),tl[1].item():br[1].item()]
    else:
        # Get image patch
        # F.pad with negative values crops: when tl is inside the image, -tl is negative and
        # rows/columns are removed; when the crop reaches past a border, the corresponding
        # entry is positive and 'replicate' padding fills in the missing pixels
        im_patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3], -tl[0].item(), br[0].item() - im2.shape[2]), mode)

    # Get image coordinates
    patch_coord = df * torch.cat((tl, br)).view(1,4)

    if output_sz is None or (im_patch.shape[-2] == output_sz[0] and im_patch.shape[-1] == output_sz[1]):
        return im_patch.clone(), patch_coord

    # Resample
    im_patch = F.interpolate(im_patch, output_sz.long().tolist(), mode='bilinear')

    return im_patch, patch_coord # back to Section 3
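
To see the cropping arithmetic in action, here is a standalone walk-through with made-up numbers where the crop lies fully inside the image, so all four pad entries are negative and F.pad performs a pure crop:

    import torch
    import torch.nn.functional as F

    im = torch.rand(1, 3, 480, 640)
    posl = torch.LongTensor([240, 320])   # crop center (y, x)
    szl = torch.LongTensor([200, 200])    # crop size

    tl = posl - (szl - 1) // 2            # top-left     = (141, 221)
    br = posl + szl // 2 + 1              # bottom-right = (341, 421)

    # pad order is (left, right, top, bottom); here all entries are negative:
    # (-221, -219, -141, -139), i.e. crop that many columns/rows from each side
    im_patch = F.pad(im, (-tl[1].item(), br[1].item() - im.shape[3],
                          -tl[0].item(), br[0].item() - im.shape[2]), 'replicate')
    print(im_patch.shape)                 # torch.Size([1, 3, 200, 200])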
