Tensorflow版本yolo v3源码阅读笔记(5)

在上一篇文章中，我们已经分析了yolov3训练过程的代码，这就表示我们的神经网络已经训练好了，可以马上测试了，但是在测试之前，我们还是先搞清楚测试函数中出现的一些函数吧。

下面开始吧。

导入需要的库

import cv2
import random
import colorsys
import numpy as np
from core.config import cfg

load_weights

load_weights的作用是将预训练好的权值文件yolov3.weights 加载到模型中。
首先，我们需要知道模型中一共有多少个卷积层：

# Count the layers whose name begins with the letter 'c'; the article uses
# this to find out how many convolution layers the model contains.
count = sum(1 for layer in model.layers if layer.name[0] == 'c')

得到count=75，说明model中一共有75个卷积层，另外，通过打印所有层的名称，我们得知conv2d_58, conv2d_66和conv2d_74，这三层卷积层后面是不跟BN层的（但是这三层的卷积层上有偏置）

然后，分别设置卷积层和BN层的权值，这个model就可以直接使用了。

def load_weights(model, weights_file):
    """
    Load a pretrained Darknet weights file (e.g. yolov3.weights) into ``model``.

    The file is read sequentially: five int32 header values, then, for each
    of the 75 convolution layers in order, either the batch-normalization
    parameters or the convolution bias, followed by the convolution kernel.

    :param model: object exposing ``get_layer(name)``; the returned conv
        layers must provide ``filters``, ``kernel_size``, ``input_shape``
        and ``set_weights``, the BN layers ``set_weights``.
    :param weights_file: path of the binary weights file.
    :raises AssertionError: if data is left in the file after all layers
        have been loaded.
    """
    # Convolution layers that are NOT followed by a batch-normalization
    # layer; they carry a bias instead.
    conv_without_bn = (58, 66, 74)

    # The context manager guarantees the file is closed even when a read,
    # reshape or set_weights call raises.
    with open(weights_file, 'rb') as wf:
        # The first five int32 values form the file header. They are not
        # used, but must be consumed to reach the layer data.
        major, minor, revision, seen, _ = np.fromfile(wf, dtype=np.int32, count=5)

        j = 0  # index of the next batch-normalization layer
        for i in range(75):
            conv_layer_name = 'conv2d_%d' % i if i > 0 else 'conv2d'
            conv_layer = model.get_layer(conv_layer_name)
            filters = conv_layer.filters             # number of kernels
            k_size = conv_layer.kernel_size[0]       # kernel height (== width)
            in_dim = conv_layer.input_shape[-1]      # input channels
            has_bn = i not in conv_without_bn

            if has_bn:
                bn_layer_name = 'batch_normalization_%d' % j if j > 0 else 'batch_normalization'
                # darknet weights: [beta, gamma, mean, variance]
                bn_weights = np.fromfile(wf, dtype=np.float32, count=4 * filters)
                # tf weights: [gamma, beta, mean, variance] -> swap the first two rows
                bn_weights = bn_weights.reshape((4, filters))[[1, 0, 2, 3]]
                bn_layer = model.get_layer(bn_layer_name)
                j += 1
            else:
                conv_bias = np.fromfile(wf, dtype=np.float32, count=filters)

            # darknet shape (out_dim, in_dim, height, width)
            conv_shape = (filters, in_dim, k_size, k_size)
            # np.prod replaces np.product, which was removed in NumPy 2.0.
            conv_weights = np.fromfile(wf, dtype=np.float32, count=int(np.prod(conv_shape)))
            # tf shape (height, width, in_dim, out_dim)
            conv_weights = conv_weights.reshape(conv_shape).transpose([2, 3, 1, 0])

            if has_bn:
                conv_layer.set_weights([conv_weights])
                bn_layer.set_weights(bn_weights)
            else:
                conv_layer.set_weights([conv_weights, conv_bias])

        # Every byte must have been consumed, otherwise the file does not
        # match the layer layout assumed above.
        assert len(wf.read()) == 0, 'failed to read all data'

read_class_names

read_class_names 函数将yymnist.names 文件中的类别索引读取出来并放在一个字典中。

def read_class_names(class_file_name):
    """Map each zero-based line number of the file to that line's text.

    Only the newline character is stripped from each line; any other
    whitespace is kept.
    """
    with open(class_file_name, 'r') as handle:
        return {index: line.strip('\n') for index, line in enumerate(handle)}

将 “./data/classes/yymnist.names” 输入将返回：

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9'}

get_anchors

get_anchors的功能是获取先验框（anchor）的尺寸大小，一共有9个先验框，每个先验框包含宽和高2个数值（共18个数据），对应3个尺度、每个尺度3个先验框

def get_anchors(anchors_path):
    """Read the comma-separated anchor values from the first line of a file.

    Returns a float32 array reshaped to (3, 3, 2).
    """
    with open(anchors_path) as anchor_file:
        first_line = anchor_file.readline()
    values = first_line.split(',')
    return np.array(values, dtype=np.float32).reshape(3, 3, 2)

image_preporcess

image_preporcess函数我们在制作数据集的过程中见过，它的主要功能是把从文件中读出的图片转换成神经网络需要的尺寸大小。

def image_preporcess(image, target_size, gt_boxes=None):
    """Resize ``image`` to fit ``target_size`` keeping its aspect ratio, pad, and normalize.

    :param image: array whose shape unpacks as (height, width, channels).
    :param target_size: (height, width) required by the network.
    :param gt_boxes: optional array of boxes whose columns 0/2 are x values
        and columns 1/3 are y values. It is updated IN PLACE.
    :return: the padded image divided by 255, and additionally ``gt_boxes``
        when they were supplied.
    """
    target_h, target_w = target_size
    src_h, src_w, _ = image.shape

    # The smaller of the two ratios makes the whole image fit the target.
    ratio = min(target_w / src_w, target_h / src_h)
    new_w = int(ratio * src_w)
    new_h = int(ratio * src_h)
    resized = cv2.resize(image, (new_w, new_h))

    # Centre the resized image on a canvas filled with the value 128.
    pad_left = (target_w - new_w) // 2
    pad_top = (target_h - new_h) // 2
    canvas = np.full(shape=[target_h, target_w, 3], fill_value=128.0)
    canvas[pad_top:new_h + pad_top, pad_left:new_w + pad_left, :] = resized
    canvas = canvas / 255.

    if gt_boxes is None:
        return canvas

    # The boxes must follow the image: same scaling, same padding offset.
    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * ratio + pad_left
    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * ratio + pad_top
    return canvas, gt_boxes

draw_bbox

draw_bbox的功能是把预测框画到原图上去，并在图片上显示置信度和类别信息。

def draw_bbox(image, bboxes, classes=read_class_names(cfg.YOLO.CLASSES), show_label=True):
    """
    Draw the boxes, and optionally a "class: score" label, onto ``image``.

    bboxes: [x_min, y_min, x_max, y_max, probability, cls_id] format coordinates.

    :param image: array whose shape unpacks as (height, width, channels);
        it is drawn on in place and also returned.
    :param classes: mapping from class index to class name. The default is
        evaluated once, when this function is defined.
    :param show_label: when true, a filled label box with the class name
        and score is drawn at the top-left corner of each box.
    :return: ``image``.
    """
    num_classes = len(classes)
    image_h, image_w, _ = image.shape

    # One colour per class, with hues spread evenly over the colour wheel.
    hsv_tuples = [(1.0 * x / num_classes, 1., 1.) for x in range(num_classes)]
    colors = [colorsys.hsv_to_rgb(*hsv) for hsv in hsv_tuples]
    colors = [(int(r * 255), int(g * 255), int(b * 255)) for r, g, b in colors]

    # Shuffle with a private generator seeded with 0: this gives the same
    # colour order as seeding the module-level generator, without resetting
    # the global random state that callers may have seeded themselves.
    random.Random(0).shuffle(colors)

    # Neither value depends on the individual box.
    fontScale = 0.5
    bbox_thick = int(0.6 * (image_h + image_w) / 600)

    for bbox in bboxes:
        coor = np.array(bbox[:4], dtype=np.int32)
        score = bbox[4]
        class_ind = int(bbox[5])
        bbox_color = colors[class_ind]
        # Top-left and bottom-right corners of the box.
        c1, c2 = (coor[0], coor[1]), (coor[2], coor[3])
        cv2.rectangle(image, c1, c2, bbox_color, bbox_thick)

        if show_label:
            bbox_mess = '%s: %.2f' % (classes[class_ind], score)
            t_size = cv2.getTextSize(bbox_mess, 0, fontScale, thickness=bbox_thick//2)[0]
            # Filled background for the text, placed just above the box.
            cv2.rectangle(image, c1, (c1[0] + t_size[0], c1[1] - t_size[1] - 3), bbox_color, -1)  # filled
            cv2.putText(image, bbox_mess, (c1[0], c1[1]-2), cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale, (0, 0, 0), bbox_thick//2, lineType=cv2.LINE_AA)

    return image

bboxes_iou

bboxes_iou的功能是计算两个框框的iou值（交并比），通过iou值可以粗略地估计两个框框的重合程度。

def bboxes_iou(boxes1, boxes2):
    """Compute the IoU of boxes given as (xmin, ymin, xmax, ymax) along the last axis.

    The two inputs are combined with NumPy broadcasting. The result is
    never smaller than the float32 machine epsilon.
    """
    first = np.array(boxes1)
    second = np.array(boxes2)

    # Area of each box: width * height.
    area_first = (first[..., 2] - first[..., 0]) * (first[..., 3] - first[..., 1])
    area_second = (second[..., 2] - second[..., 0]) * (second[..., 3] - second[..., 1])

    # Corners of the intersection rectangle.
    top_left = np.maximum(first[..., :2], second[..., :2])
    bottom_right = np.minimum(first[..., 2:], second[..., 2:])

    # A negative extent means the boxes do not overlap on that axis.
    overlap_wh = np.maximum(bottom_right - top_left, 0.0)
    intersection = overlap_wh[..., 0] * overlap_wh[..., 1]
    union = area_first + area_second - intersection

    return np.maximum(1.0 * intersection / union, np.finfo(np.float32).eps)

postprocess_boxes

postprocess_boxes的功能是用来确定预测框的信息，即预测框位置、预测框的置信度、预测框的类别。
输入：

pred_bbox: 图片经过yolov3网络后得到的所有预测框信息的集合，因为经过yolov3网络后得到3个尺度的特征图，其形状分别为[1,52,52,3,85]，[1,26,26,3,85]，[1,13,13,3,85]，所以pred_bbox的形状为[13x13x3+26x26x3+52x52x3,85]
org_img_shape：原始图片的尺寸大小。
input_size：输入网络时的图片大小。
score_threshold：分数阈值，我们只留下分数大于分数阈值的预测框。

代码处理流程：

通过yolov3网络后我们得到的预测框信息是中心坐标+高宽信息，所以我们要将它们转换成左上角坐标+右下角坐标。即(x,y,w,h) -> (xmin,ymin,xmax,ymax)。
现在我们得到的预测框坐标是在网络输入图片（即经过缩放和填充后的图片）上的，还需要将它们转化为在原图上的坐标信息，即(xmin,ymin,xmax,ymax) -> (xmin_org,ymin_org,xmax_org,ymax_org)。
如果某预测框超出原图边界，即整个预测框在原图外面，则将这个预测框的四个角的坐标都设为0，那么它的面积也为0 。
计算所有预测框的面积，如果面积为0，表示此预测框超出原图边界，则弃之。
计算每个预测框的分数（类别概率*置信概率）。如果某预测框分数<阈值，则弃之。

输出：
所有符合要求的预测框在原始图像上的信息，格式为（预测框数量，预测框位置+分数+类别），形状为[-1，6] 。

下面看看代码的具体实现流程吧。

def postprocess_boxes(pred_bbox, org_img_shape, input_size, score_threshold):
    """Convert raw predictions into boxes on the original image and filter them.

    :param pred_bbox: 2-D array-like; columns 0-3 are (x, y, w, h), column 4
        is the confidence, the remaining columns are class probabilities.
    :param org_img_shape: (height, width) of the original image.
    :param input_size: side length of the image fed to the network.
    :param score_threshold: boxes whose score is not above it are dropped.
    :return: array with one row (xmin, ymin, xmax, ymax, score, class) per
        kept box.
    """
    min_scale, max_scale = 0, np.inf
    predictions = np.array(pred_bbox)

    centers = predictions[:, 0:2]
    half_sizes = predictions[:, 2:4] * 0.5
    objectness = predictions[:, 4]
    class_probs = predictions[:, 5:]

    # (1) (x, y, w, h) --> (xmin, ymin, xmax, ymax)
    corners = np.concatenate([centers - half_sizes, centers + half_sizes], axis=-1)

    # (2) Undo the resize and padding applied by image_preporcess so the
    #     corners are expressed in original-image coordinates.
    org_h, org_w = org_img_shape
    ratio = min(input_size / org_w, input_size / org_h)
    pad_x = (input_size - ratio * org_w) / 2
    pad_y = (input_size - ratio * org_h) / 2
    corners[:, 0::2] = 1.0 * (corners[:, 0::2] - pad_x) / ratio
    corners[:, 1::2] = 1.0 * (corners[:, 1::2] - pad_y) / ratio

    # (3) Clip to the image. A box lying completely outside ends up with
    #     min > max and is zeroed so that step (4) discards it.
    upper_left = np.maximum(corners[:, :2], [0, 0])
    lower_right = np.minimum(corners[:, 2:], [org_w - 1, org_h - 1])
    corners = np.concatenate([upper_left, lower_right], axis=-1)
    degenerate = (corners[:, 0] > corners[:, 2]) | (corners[:, 1] > corners[:, 3])
    corners[degenerate] = 0

    # (4) Keep boxes whose scale (square root of the area) is inside the
    #     open interval (min_scale, max_scale), i.e. strictly positive.
    box_scale = np.sqrt(np.multiply.reduce(corners[:, 2:4] - corners[:, 0:2], axis=-1))
    size_ok = (min_scale < box_scale) & (box_scale < max_scale)

    # (5) score = confidence * probability of the most likely class.
    best_class = np.argmax(class_probs, axis=-1)
    best_score = objectness * class_probs[np.arange(len(corners)), best_class]
    keep = size_ok & (best_score > score_threshold)

    return np.concatenate(
        [corners[keep], best_score[keep][:, np.newaxis], best_class[keep][:, np.newaxis]],
        axis=-1)

nms

nms的功能是用来清除冗余的预测框，也叫做非极大值抑制处理。

这个函数紧跟在 postprocess_boxes 函数后面，因为postprocess_boxes返回的预测框中，有一些重合的预测框，什么意思，就是几个预测框可能预测的都是同一个物体，但是一个物体我们只需要一个预测框，那我们就需要把其它多余的预测框排除掉，用什么方法呢，就是这个nms方法了。

输入:

bboxes：原图上的预测框信息。
iou_threshold： iou阈值，如果某预测框与同一个物体中的得分最高的预测框的iou值>此阈值，我们即认为此预测框是冗余的，应弃之。
method：非极大值抑制的2种方法，'nms'和'soft-nms'方法。
sigma: 用于'soft-nms'方法的参数。

程序流程：

1. 找出所有的预测框预测出的物体类别（yolov3中最多可以预测80种物体）
2. 遍历这些物体类别
  - 2.1. 首先取出一个类别，找出所有预测到这个类别的预测框
  - 2.2. 找出了想要的预测框，接着找出得分最大的一个预测框A，这个预测框就是我们想要的。
  - 2.3. 找出了一个A，余下的预测框和A计算iou值，如果有预测框和A的iou值>指定阈值，说明这个预测框和A预测的是同一个物体，这个预测框就丢弃掉。
  - 2.4. 到这一步，我们找出了A，丢弃了和A预测同一个物体的预测框。这时这个类别中很有可能还有一些预测框，我们接着重复2.2步，直到这个类别的预测框全部处理完毕。
3. 处理完所有类别，也找出了所有想要的预测框，且一个物体只有一个预测框。

程序代码实现如下：

def nms(bboxes, iou_threshold, sigma=0.3, method='nms'):
    """
    Remove redundant boxes class by class.

    :param bboxes: (xmin, ymin, xmax, ymax, score, class)
    :param iou_threshold: with method 'nms', a box whose IoU with the
        current best box exceeds this value is discarded.
    :param sigma: decay parameter used by method 'soft-nms'.
    :param method: 'nms' or 'soft-nms'.
    :return: list of the kept boxes (one row each).

    Note: soft-nms, https://arxiv.org/pdf/1704.04503.pdf
          https://github.com/bharatsingh430/soft-nms
    """
    kept = []

    for class_id in list(set(bboxes[:, 5])):
        # All boxes predicted as this class.
        candidates = bboxes[bboxes[:, 5] == class_id]

        while len(candidates) > 0:
            # The highest-scoring remaining box is always kept...
            top = np.argmax(candidates[:, 4])
            winner = candidates[top]
            kept.append(winner)
            # ...and removed from the candidates.
            candidates = np.delete(candidates, top, axis=0)

            overlap = bboxes_iou(winner[np.newaxis, :4], candidates[:, :4])
            decay = np.ones((len(overlap),), dtype=np.float32)

            assert method in ['nms', 'soft-nms']

            if method == 'nms':
                # Hard suppression: too much overlap -> score becomes 0.
                decay[overlap > iou_threshold] = 0.0
            elif method == 'soft-nms':
                # Soft suppression: scores decay with the overlap.
                decay = np.exp(-(1.0 * overlap ** 2 / sigma))

            candidates[:, 4] = candidates[:, 4] * decay
            # Only boxes that still have a positive score stay in play.
            candidates = candidates[candidates[:, 4] > 0.]

    return kept

到这里，yolov3的测试函数中所要用到的一些函数都讲完了，在下一篇中，我们开始分析测试函数的代码。