DIDL笔记(pytorch版)(十二)

文章目录

  • 图像增广
  • 锚框
  • 锚框和真实框的对比
  • 标注训练集的锚框
    • 标注
    • 偏移量
    • 代码
  • 输出预测边界框
    • 非极大值抑制

图像增广

import d2l as d2l
from PIL import Image
from matplotlib import pyplot as plt    # 画布
import torchvision


def show_images(imgs, num_rows, num_cols, scale=2):
    """Draw images on a ``num_rows`` x ``num_cols`` grid with the axis ticks hidden.

    Args:
        imgs: indexable collection of images accepted by ``Axes.imshow``;
            cell (i, j) shows ``imgs[i * num_cols + j]``.
        num_rows: number of grid rows.
        num_cols: number of grid columns.
        scale: size of each cell, used to derive the figure size.
    Returns:
        The 2-D array of axes created by ``plt.subplots``.
    """
    figsize = (num_cols * scale, num_rows * scale)
    # squeeze=False keeps `axes` two-dimensional even when the grid has a
    # single row or column; otherwise axes[i][j] below would fail.
    _, axes = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)
    for i in range(num_rows):
        for j in range(num_cols):
            ax = axes[i][j]
            ax.imshow(imgs[i * num_cols + j])
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)
    return axes


def apply(img, aug, num_rows=2, num_cols=4, scale=1.5):
    """Apply ``aug`` to ``img`` once per grid cell and show all the results.

    ``aug`` is any callable taking the image and returning an image; it is
    called ``num_rows * num_cols`` times so that random augmentations
    produce a different sample in every cell.
    """
    cell_count = num_rows * num_cols
    augmented = []
    for _ in range(cell_count):
        augmented.append(aug(img))
    show_images(augmented, num_rows, num_cols, scale)


img = Image.open('outcome.png')

# Single-transform demos, one figure each, in this order: horizontal flip,
# vertical flip, random crop resized to 200 px, and colour jitter
# (brightness, contrast, saturation, hue).
for single_aug in (
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.RandomVerticalFlip(),
    torchvision.transforms.RandomResizedCrop(200, scale=(0.1, 1), ratio=(0.5, 2)),
    torchvision.transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
):
    apply(img, single_aug)

# Combined pipeline: flip, then colour jitter, then random resized crop.
augs = torchvision.transforms.Compose([
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
    torchvision.transforms.RandomResizedCrop(200, scale=(0.1, 1), ratio=(0.5, 2)),
])
apply(img, augs)

# Display every figure created above.
plt.show()

锚框

在检测出目标的真实边界框之前,目标检测算法通常会在输入图像中采样大量区域。采样区域的方法有很多种,其中一种是以每个像素为中心,生成多个大小和宽高比不同的边界框,我们把这样的边界框叫做锚框。

可以看出,我们需要大小 $s \in (0, 1]$ 和宽高比 $r > 0$。如果有 $n$ 个大小、$m$ 个宽高比,图像宽为 $w$、高为 $h$,那么一共会有 $w \cdot h \cdot n \cdot m$ 个锚框。但是为了简化计算,我们只考虑包含 $s_1$ 或 $r_1$ 的组合,所以只有 $w \cdot h \cdot (n + m - 1)$ 个锚框。

import torch
import d2l as d2l
from PIL import Image
import numpy as np
import math
from matplotlib import pyplot as plt

# Load the demo picture and keep its pixel dimensions for scaling anchors later.
img = Image.open('outcome.png')
w, h = img.width, img.height

def MultiBoxPrior(feature_map, sizes=(0.75, 0.5, 0.25), ratios=(1, 2, 0.5)):
    """Generate anchor boxes for every spatial position of a feature map.

    Only the combinations that contain ``sizes[0]`` or ``ratios[0]`` are
    generated, i.e. ``len(sizes) + len(ratios) - 1`` anchors per position,
    ordered as: (sizes[0], every ratio), then (every other size, ratios[0]).

    Args:
        feature_map: tensor whose last two dimensions are (H, W); only the
            shape is read.
        sizes: sequence of anchor sizes.
        ratios: sequence of aspect ratios (must be non-negative, since the
            square root is taken).
    Returns:
        float32 tensor of shape (1, H * W * anchors_per_position, 4) holding
        (xmin, ymin, xmax, ymax). The leading dimension is 1 because the
        anchors do not depend on the batch element.
    """
    # Immutable defaults avoid the shared-mutable-default pitfall; list()
    # still lets callers pass lists, tuples or arrays.
    sizes = list(sizes)
    ratios = list(ratios)

    # Each pair is (size, sqrt(ratio)).
    pairs = [(sizes[0], math.sqrt(r)) for r in ratios]
    pairs += [(s, math.sqrt(ratios[0])) for s in sizes[1:]]
    pairs = np.array(pairs)

    ss1 = pairs[:, 0] * pairs[:, 1]     # size * sqrt(ratio): box extent along x
    ss2 = pairs[:, 0] / pairs[:, 1]     # size / sqrt(ratio): box extent along y

    # Boxes centred on the origin: half the extent on each side.
    base_anchors = np.stack([-ss1, -ss2, ss1, ss2], axis=1) / 2

    h, w = feature_map.shape[-2:]
    # Dividing the column/row index by W/H gives each position's offset in
    # [0, 1), so the resulting anchors are independent of the resolution.
    shifts_x = np.arange(0, w) / w
    shifts_y = np.arange(0, h) / h
    shift_x, shift_y = np.meshgrid(shifts_x, shifts_y)
    shift_x = shift_x.reshape(-1)       # H * W entries, row-major
    shift_y = shift_y.reshape(-1)       # H * W entries, row-major
    shifts = np.stack((shift_x, shift_y, shift_x, shift_y), axis=1)

    # Broadcast (H*W, 1, 4) + (1, A, 4) -> (H*W, A, 4): every base anchor is
    # translated to every position.
    anchors = shifts.reshape((-1, 1, 4)) + base_anchors.reshape((1, -1, 4))
    return torch.tensor(anchors, dtype=torch.float32).view(1, -1, 4)

# Dummy input: MultiBoxPrior only reads its last two dimensions (h, w).
X = torch.Tensor(1, 3, h, w)
Y = MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
# Regroup as (row, column, anchor index, coordinate): 3 sizes + 3 ratios - 1
# gives 5 anchors per pixel.
boxes = Y.reshape(h, w, 5, 4)

画框

def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw each bounding box on ``axes``, optionally with a text label.

    Args:
        axes: matplotlib axes the rectangles and texts are added to.
        bboxes: iterable of tensors, each converted to a rectangle with
            ``d2l.bbox_to_rect``.
        labels: a single label or a list/tuple of labels; box ``i`` is
            labelled only if ``labels[i]`` exists.
        colors: a single colour or a list/tuple of colours, cycled over the
            boxes; defaults to blue, green, red, magenta, cyan.
    """
    def _as_sequence(value, fallback=None):
        # None -> fallback; scalar -> one-element list; list/tuple unchanged.
        if value is None:
            return fallback
        if isinstance(value, (list, tuple)):
            return value
        return [value]

    label_seq = _as_sequence(labels)
    palette = _as_sequence(colors, ['b', 'g', 'r', 'm', 'c'])
    for idx, box in enumerate(bboxes):
        edge_color = palette[idx % len(palette)]
        patch = d2l.bbox_to_rect(box.detach().cpu().numpy(), edge_color)
        axes.add_patch(patch)
        if not label_seq or idx >= len(label_seq):
            continue
        # Black text on a white box, white text on any other colour.
        font_color = 'w' if edge_color != 'w' else 'k'
        axes.text(patch.xy[0], patch.xy[1], label_seq[idx],
                  va='center', ha='center', fontsize=6, color=font_color,
                  bbox=dict(facecolor=edge_color, lw=0))

fig = plt.imshow(img)
# Anchors are normalised to [0, 1]; scale them back to pixel coordinates.
bbox_scale = torch.tensor([[w, h, w, h]], dtype=torch.float32)
# Labels follow the order in which MultiBoxPrior emits anchors: the first
# size (0.75) with every ratio, then the remaining sizes with the first
# ratio. The third label previously read 's=0.55', which matched no anchor.
show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,
            ['s=0.75, r=1', 's=0.75, r=2', 's=0.75, r=0.5', 's=0.5, r=1', 's=0.25, r=1'])
plt.show()

锚框和真实框的对比

使用交并比

def compute_intersection(set_1, set_2):
    """Pairwise intersection area between two sets of boxes.

    Args:
        set_1: tensor of shape (n1, 4), boxes as (xmin, ymin, xmax, ymax).
        set_2: tensor of shape (n2, 4), boxes as (xmin, ymin, xmax, ymax).
    Returns:
        Tensor of shape (n1, n2); entry (i, j) is the area shared by box i
        of set_1 and box j of set_2 (0 when they do not overlap).
    """
    # Insert singleton axes so every box of set_1 broadcasts against every
    # box of set_2.
    boxes_a = set_1[:, None, :]     # (n1, 1, 4)
    boxes_b = set_2[None, :, :]     # (1, n2, 4)

    # The overlap's top-left corner is the larger of the two (xmin, ymin);
    # its bottom-right corner is the smaller of the two (xmax, ymax).
    top_left = torch.max(boxes_a[..., :2], boxes_b[..., :2])        # (n1, n2, 2)
    bottom_right = torch.min(boxes_a[..., 2:], boxes_b[..., 2:])    # (n1, n2, 2)

    # Negative extents mean "no overlap"; clamp them to zero.
    extent = (bottom_right - top_left).clamp(min=0)                 # (n1, n2, 2)
    return extent[..., 0] * extent[..., 1]                          # (n1, n2)


def compute_jaccard(set_1, set_2):
    """Pairwise Jaccard index (IoU) between two sets of boxes.

    Args:
        set_1: tensor of shape (n1, 4), boxes as (xmin, ymin, xmax, ymax).
        set_2: tensor of shape (n2, 4), boxes as (xmin, ymin, xmax, ymax).
    Returns:
        Tensor of shape (n1, n2); entry (i, j) is intersection / union of
        box i of set_1 and box j of set_2.
    """
    overlap = compute_intersection(set_1, set_2)        # (n1, n2)

    # Area of every individual box: width * height.
    area_1 = (set_1[:, 2] - set_1[:, 0]) * (set_1[:, 3] - set_1[:, 1])  # (n1)
    area_2 = (set_2[:, 2] - set_2[:, 0]) * (set_2[:, 3] - set_2[:, 1])  # (n2)

    # |A ∪ B| = |A| + |B| - |A ∩ B|, broadcast to all (i, j) pairs.
    combined = area_1[:, None] + area_2[None, :] - overlap             # (n1, n2)

    return overlap / combined

标注训练集的锚框

一个锚框算是一个训练样本,我们需要标注锚框所包含的类别以及锚框相对于真实框的偏移量。那么如何为锚框添加标注和偏移量呢?

标注

假设 $A_i$ 代表第 $i$ 个锚框,共有 $n_a$ 个;$B_j$ 代表第 $j$ 个真实边界框,共有 $n_b$ 个,且 $n_a \ge n_b$。定义一个 $n_a \times n_b$ 大小的矩阵,矩阵第 $i$ 行第 $j$ 列的值为锚框 $A_i$ 与真实边界框 $B_j$ 的交并比。标注步骤如下:

  1. 找到矩阵中最大的元素,此处的锚框与真实边界框相似度最高。此处行对应的锚框分配为此处列对应的真实边界框。该行该列标记为匹配成功不再查找。
  2. 重复第一个操作。注意:避开之前标记为不能查找的行或列,当真实边界框都被标记完时,执行下一步。
  3. 此时还有剩余 $n_a - n_b$ 个锚框还没有标记。对其中每个锚框,在它所在的行中找到最大的交并比:如果这个交并比大于预先设定的阈值,则把该行对应的锚框分配给该列对应的真实边界框;否则把该锚框标记为背景类别。
    DIDL笔记(pytorch版)(十二)_第1张图片

偏移量

DIDL笔记(pytorch版)(十二)_第2张图片

代码

# The function below is also kept in the d2lzh_pytorch package for later reuse.
def assign_anchor(bb, anchor, jaccard_threshold=0.5):
    """Assign a ground-truth box index to each anchor.

    Boxes are (xmin, ymin, xmax, ymax) in normalised coordinates.

    Args:
        bb: ground-truth bounding boxes, shape (nb, 4).
        anchor: anchors to be assigned, shape (na, 4).
        jaccard_threshold: minimum IoU for an anchor that was not picked in
            the first pass to be matched to its best ground-truth box.
    Returns:
        Long tensor of shape (na,): the index of the ground-truth box each
        anchor is assigned to, or -1 if it is assigned to none.
    """
    num_anchors = anchor.shape[0]
    num_boxes = bb.shape[0]
    iou = compute_jaccard(anchor, bb).detach().cpu().numpy()    # (na, nb)
    assignment = np.full(num_anchors, -1.0)     # -1 means "unassigned"

    # Pass 1: every ground-truth box takes the anchor with the highest IoU
    # among those still available, regardless of the threshold.
    available = iou.copy()
    for box_id in range(num_boxes):
        winner = np.argmax(available[:, box_id])
        assignment[winner] = box_id
        # Blank out the anchor's whole row so later boxes cannot pick it.
        available[winner, :] = float("-inf")

    # Pass 2: each anchor left over is matched to its best ground-truth box,
    # but only if that IoU reaches the threshold.
    for anchor_id in np.flatnonzero(assignment == -1):
        best_box = np.argmax(iou[anchor_id, :])
        if iou[anchor_id, best_box] >= jaccard_threshold:
            assignment[anchor_id] = best_box

    return torch.tensor(assignment, dtype=torch.long)


# Corner coordinates -> centre / size coordinates.
def xy_to_cxcy(xy):
    """Convert boxes from (x_min, y_min, x_max, y_max) to (center_x, center_y, w, h).

    Args:
        xy: tensor of shape (n_boxes, 4) in corner coordinates.
    Returns:
        Tensor of shape (n_boxes, 4) in centre-size coordinates.
    """
    min_corner = xy[:, :2]
    max_corner = xy[:, 2:]
    centers = (max_corner + min_corner) / 2     # c_x, c_y
    extents = max_corner - min_corner           # w, h
    return torch.cat([centers, extents], 1)


def MultiBoxTarget(anchor, label):
    """Label every anchor with a class, a bounding-box offset and a mask.

    Boxes are (xmin, ymin, xmax, ymax) in normalised coordinates.

    Args:
        anchor: anchors, typically produced by MultiBoxPrior,
            shape (1, num_anchors, 4); only ``anchor[0]`` is used.
        label: ground truth, shape (bn, max boxes per image, 5). The last
            dimension is [class label, four coordinates]. Images with fewer
            boxes are padded with rows whose class label is -1; such rows
            are ignored.
    Returns:
        List [bbox_offset, bbox_mask, cls_labels]:
        bbox_offset: offsets of every anchor, shape (bn, num_anchors * 4).
        bbox_mask: same shape as bbox_offset; 0 for background anchors and
            1 for anchors assigned to a ground-truth box.
        cls_labels: class of every anchor, shape (bn, num_anchors);
            0 means background, k + 1 means ground-truth class k.
    """
    assert len(anchor.shape) == 3 and len(label.shape) == 3
    bn = label.shape[0]

    def MultiBoxTarget_one(anc, lab, eps=1e-6):
        """Process one element of the batch.

        Args:
            anc: shape (num_anchors, 4).
            lab: shape (num boxes, 5), rows are [class label, four coordinates].
            eps: small constant added inside the log to avoid log(0).
        Returns:
            offset: shape (num_anchors * 4,).
            bbox_mask: shape (num_anchors * 4,), 0 for background, 1 otherwise.
            cls_labels: shape (num_anchors,), 0 for background.
        """
        an = anc.shape[0]

        # Drop padding rows (class label < 0). Left in place, each padded
        # row would be force-assigned an anchor by assign_anchor's first
        # pass, producing a "positive" anchor with class 0 and offsets
        # towards a degenerate box.
        lab = lab[lab[:, 0] >= 0]
        if lab.shape[0] > 0:
            assigned_idx = assign_anchor(lab[:, 1:], anc)   # (num_anchors,)
        else:
            # No real ground truth: every anchor is background.
            assigned_idx = torch.full((an,), -1, dtype=torch.long)

        # One mask value per coordinate: (num_anchors,) -> (num_anchors, 4).
        bbox_mask = ((assigned_idx >= 0).float().unsqueeze(-1)).repeat(1, 4)

        cls_labels = torch.zeros(an, dtype=torch.long)              # 0 = background
        assigned_bb = torch.zeros((an, 4), dtype=torch.float32)     # matched box per anchor
        for i in range(an):
            bb_idx = assigned_idx[i]
            if bb_idx >= 0:     # anchor matched to a ground-truth box
                # Shift class ids by one because 0 is reserved for background.
                cls_labels[i] = lab[bb_idx, 0].long().item() + 1
                assigned_bb[i, :] = lab[bb_idx, 1:]

        center_anc = xy_to_cxcy(anc)    # (center_x, center_y, w, h)
        center_assigned_bb = xy_to_cxcy(assigned_bb)

        # Centre offsets are relative to the anchor size and scaled by 10;
        # size offsets are log ratios scaled by 5.
        offset_xy = 10.0 * (center_assigned_bb[:, :2] - center_anc[:, :2]) / center_anc[:, 2:]
        offset_wh = 5.0 * torch.log(eps + center_assigned_bb[:, 2:] / center_anc[:, 2:])
        # The mask zeroes the offsets of background anchors.
        offset = torch.cat([offset_xy, offset_wh], dim=1) * bbox_mask   # (num_anchors, 4)

        return offset.view(-1), bbox_mask.view(-1), cls_labels

    batch_offset = []
    batch_mask = []
    batch_cls_labels = []
    for b in range(bn):
        offset, bbox_mask, cls_labels = MultiBoxTarget_one(anchor[0, :, :], label[b, :, :])

        batch_offset.append(offset)
        batch_mask.append(bbox_mask)
        batch_cls_labels.append(cls_labels)

    bbox_offset = torch.stack(batch_offset)
    bbox_mask = torch.stack(batch_mask)
    cls_labels = torch.stack(batch_cls_labels)

    return [bbox_offset, bbox_mask, cls_labels]

# 这里为了方便直接设置了。
bbox_scale = torch.tensor((w, h, w, h), dtype=torch.float32)
# Two ground-truth boxes; each row is [class, xmin, ymin, xmax, ymax].
ground_truth = torch.tensor([
    [0, 0.1, 0.08, 0.52, 0.92],
    [1, 0.55, 0.2, 0.9, 0.88],
])
# Five hand-picked anchors; each row is [xmin, ymin, xmax, ymax].
anchors = torch.tensor([
    [0, 0.1, 0.2, 0.3],
    [0.15, 0.2, 0.4, 0.4],
    [0.63, 0.05, 0.88, 0.98],
    [0.66, 0.45, 0.8, 0.8],
    [0.57, 0.3, 0.92, 0.9],
])
# Add the leading dimension MultiBoxTarget expects on both inputs.
labels = MultiBoxTarget(anchors.unsqueeze(0), ground_truth.unsqueeze(0))

输出预测边界框

在上节我们分别给锚框分配了类别以及偏移量。锚框进入训练网络后,输出得到预测边界框。同一个目标往往会对应多个相似的预测边界框,要从中选出最合适的一个,需要用到非极大值抑制。

非极大值抑制

  1. 输出的边界框按照置信度(最大的预测类别概率)从高到低排序。
  2. 选取置信度最大的边界框作为基准,其他边界框如果与基准的交并比大于某个阈值,则将其删除。最后把这个基准边界框标记为"已成为过基准"。
  3. 重复第二步骤,选取没有成为过基准的边界框执行操作。
  4. 保留下来的边界框均成为过基准时,结束。
def non_max_suppression(bb_info_list, nms_threshold = 0.5):
    """Filter predicted bounding boxes with non-maximum suppression.

    Args:
        bb_info_list: list of Pred_BB_Info-like records; each must expose
            ``confidence`` and ``xyxy``.
        nms_threshold: boxes whose IoU with a kept box exceeds this value
            are discarded.
    Returns:
        List of the records that survive, in descending confidence order.
    """
    # Highest confidence first.
    candidates = sorted(bb_info_list, key=lambda info: info.confidence, reverse=True)
    kept = []

    while candidates:
        best, candidates = candidates[0], candidates[1:]
        kept.append(best)

        if not candidates:
            break

        candidate_xyxy = [info.xyxy for info in candidates]
        # IoU of the current best box against every remaining candidate,
        # shape (len(candidates),).
        iou = compute_jaccard(torch.tensor([best.xyxy]),
                              torch.tensor(candidate_xyxy))[0]

        # Keep only candidates that do not overlap the best box too much.
        candidates = [info for info, overlap in zip(candidates, iou)
                      if overlap <= nms_threshold]
    return kept
def MultiBoxDetection(cls_prob, loc_pred, anchor, nms_threshold = 0.5):
    """Turn class probabilities and predicted offsets into detections.

    Boxes are (xmin, ymin, xmax, ymax) in normalised coordinates.

    Args:
        cls_prob: per-anchor class probabilities after softmax,
            shape (bn, num_classes + 1, num_anchors); row 0 is background.
        loc_pred: predicted offsets, shape (bn, num_anchors * 4), using the
            encoding produced by MultiBoxTarget.
        anchor: default anchors from MultiBoxPrior, shape (1, num_anchors, 4).
        nms_threshold: IoU threshold for non-maximum suppression.
    Returns:
        Tensor of shape (bn, num_anchors, 6); each row is
        [class_id, confidence, xmin, ymin, xmax, ymax]. class_id is -1 for
        background and for boxes removed by non-maximum suppression.
    """
    # Local import: only this function needs it.
    from collections import namedtuple

    assert len(cls_prob.shape) == 3 and len(loc_pred.shape) == 2 and len(anchor.shape) == 3
    bn = cls_prob.shape[0]

    # Record type consumed by non_max_suppression (reads .confidence and
    # .xyxy). Defined here because nothing else in this file defines it.
    Pred_BB_Info = namedtuple("Pred_BB_Info", ["index", "class_id", "confidence", "xyxy"])

    def MultiBoxDetection_one(c_p, l_p, anc, nms_threshold = 0.5):
        """Process one element of the batch.

        Args:
            c_p: shape (num_classes + 1, num_anchors).
            l_p: shape (num_anchors * 4,).
            anc: shape (num_anchors, 4).
            nms_threshold: IoU threshold for non-maximum suppression.
        Returns:
            Tensor of shape (num_anchors, 6).
        """
        pred_bb_num = c_p.shape[1]

        # Invert the offset encoding used by MultiBoxTarget:
        #   offset_xy = 10 * (box_center - anc_center) / anc_wh
        #   offset_wh = 5 * log(box_wh / anc_wh)
        # Adding the raw offsets to the corner coordinates would not recover
        # the box. Zero offsets still reproduce the anchor itself.
        offsets = l_p.view(pred_bb_num, 4)
        anc_wh = anc[:, 2:] - anc[:, :2]
        anc_center = (anc[:, :2] + anc[:, 2:]) / 2
        pred_center = offsets[:, :2] * anc_wh / 10.0 + anc_center
        pred_wh = torch.exp(offsets[:, 2:] / 5.0) * anc_wh
        pred_xyxy = torch.cat([pred_center - pred_wh / 2,
                               pred_center + pred_wh / 2], dim=1)
        pred_xyxy = pred_xyxy.detach().cpu().numpy()

        # Most probable class per anchor and its probability.
        confidence, class_id = torch.max(c_p, 0)
        confidence = confidence.detach().cpu().numpy()
        class_id = class_id.detach().cpu().numpy()

        pred_bb_info = [Pred_BB_Info(
                            index=i,
                            class_id=class_id[i] - 1,   # foreground labels start at 0, background becomes -1
                            confidence=confidence[i],
                            xyxy=[*pred_xyxy[i]])       # xyxy is a plain list
                        for i in range(pred_bb_num)]

        # Indices of the boxes that survive NMS; a set gives O(1) lookups.
        kept_idx = {bb.index for bb in non_max_suppression(pred_bb_info, nms_threshold)}

        output = []
        for bb in pred_bb_info:
            output.append([
                (bb.class_id if bb.index in kept_idx else -1.0),
                bb.confidence,
                *bb.xyxy
            ])

        return torch.tensor(output)     # shape: (num_anchors, 6)

    batch_output = []
    for b in range(bn):
        batch_output.append(MultiBoxDetection_one(cls_prob[b], loc_pred[b], anchor[0], nms_threshold))

    return torch.stack(batch_output)

你可能感兴趣的:(深度学习,非极大值抑制,交并比,锚框)