[拆轮子] PaddleDetection 中的预处理 PadMaskBatch

PadMaskBatch 与基类 BaseOperator 的关系不大,只是通过基类的 __init__ 初始化了变量 self._id

PadMaskBatch 用来给同一个 batch 中的图片补 0(padding),使同一 batch 内所有图片的 shape 一致;当 pad_to_stride > 0 时,还会把高和宽向上补齐到 pad_to_stride 的整数倍。输入图片的布局必须是 CHW

class PadMaskBatch(BaseOperator):
    """Zero-pad every image in a batch to one common spatial size.

    The common size is the per-axis maximum over all images in the batch,
    optionally rounded up so height and width are divisible by a stride.
    The layout of each image should be 'CHW'.

    Args:
        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
            height and width is divisible by `pad_to_stride`.
        return_pad_mask (bool): If `return_pad_mask = True`, return
            `pad_mask` for transformer.
    """

    def __init__(self, pad_to_stride=0, return_pad_mask=False):
        super(PadMaskBatch, self).__init__()
        self.pad_to_stride = pad_to_stride      # stride the padded H/W must be divisible by
        self.return_pad_mask = return_pad_mask  # whether to also emit a per-sample pad mask

    def __call__(self, samples, context=None):
        """Pad all samples in place and return the same list.

        Args:
            samples (list): a batch of sample, each is dict.
            context: Unused by this operator.

        Returns:
            list: `samples`, where each dict's 'image' (and 'semantic' /
            'gt_segm' when present and not None) has been replaced by a
            zero-padded array, plus a 'pad_mask' entry when
            `return_pad_mask` is set.
        """
        coarsest_stride = self.pad_to_stride

        # Element-wise maximum over the per-image (C, H, W) shapes.
        max_shape = np.array([data['image'].shape for data in samples]).max(
            axis=0)
        if coarsest_stride > 0:
            # np.ceil rounds up, so the target H/W becomes the smallest
            # multiple of the stride that still fits the largest image.
            max_shape[1] = int(
                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
            max_shape[2] = int(
                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)

        for data in samples:

            # ---------- zero-pad the current image ----------
            im = data['image']
            im_c, im_h, im_w = im.shape[:]
            padding_im = np.zeros(
                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)
            # The original pixels stay in the top-left corner; the padding
            # goes to the bottom and right.
            padding_im[:, :im_h, :im_w] = im
            data['image'] = padding_im

            # ---------- segmentation-related annotations ----------
            if 'semantic' in data and data['semantic'] is not None:
                semantic = data['semantic']
                padding_sem = np.zeros(
                    (1, max_shape[1], max_shape[2]), dtype=np.float32)
                padding_sem[:, :im_h, :im_w] = semantic
                data['semantic'] = padding_sem
            if 'gt_segm' in data and data['gt_segm'] is not None:
                gt_segm = data['gt_segm']
                # NOTE(review): dtype restored as uint8 to match upstream
                # PaddleDetection; confirm against the installed version.
                padding_segm = np.zeros(
                    (gt_segm.shape[0], max_shape[1], max_shape[2]),
                    dtype=np.uint8)
                padding_segm[:, :im_h, :im_w] = gt_segm
                data['gt_segm'] = padding_segm

            # ---------- optionally emit the matching pad mask ----------
            if self.return_pad_mask:
                padding_mask = np.zeros(
                    (max_shape[1], max_shape[2]), dtype=np.float32)
                # 1 marks the original image area, 0 marks the padding.
                padding_mask[:im_h, :im_w] = 1.
                data['pad_mask'] = padding_mask

        return samples

注意 PaddleDetection 中的 pad_mask 在有效(原图)区域为 1,在 padding 区域为 0:

>>> padding_mask
array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

而 Torch 中 NestedTensor 的 mask 含义正好相反:padding 区域为 True,有效(原图)区域为 False:

>>> samples.mask
tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True]],

        [[False, False, False,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True]]], device='cuda:0')
