yolov3网络架构以及代码解析二

#! /usr/bin/env python3
# coding=utf-8
#================================================================
#   Copyright (C) 2018 * Ltd. All rights reserved.
#
#   Editor      : VIM
#   File name   : yolov3.py
#   Author      : YunYang1994
#   Created date: 2018-11-21 18:41:35
#   Description : YOLOv3: An Incremental Improvement
#
#================================================================

import tensorflow as tf
from core import common
slim = tf.contrib.slim

class darknet53(object):
    """Darknet-53 style backbone used for feature extraction.

    Instantiating the class builds the graph for ``inputs`` right away; the
    resulting ``(route_1, route_2, final)`` tuple is stored in ``self.outputs``.
    """

    def __init__(self, inputs):
        self.outputs = self.forward(inputs)

    def _darknet53_block(self, inputs, filters):
        """
        Residual unit: a 1x1 convolution, a 3x3 convolution, then the block
        input is added back onto the result.

        The two convolutions are requested with ``filters`` and ``2 * filters``
        respectively (assuming the second argument of
        ``common._conv2d_fixed_padding`` is the output channel count -- TODO
        confirm), so ``inputs`` must be addable to the 3x3 output.
        """
        residual = inputs
        net = common._conv2d_fixed_padding(inputs, filters * 1, 1)
        net = common._conv2d_fixed_padding(net, filters * 2, 3)
        return net + residual

    def _residual_stage(self, net, filters, repeats):
        """Chain ``repeats`` residual units that all use the same ``filters``."""
        for _ in range(repeats):
            net = self._darknet53_block(net, filters)
        return net

    def forward(self, inputs):
        """
        Build the backbone and return three feature maps.

        The network alternates stride-2 3x3 convolutions with stacks of
        1, 2, 8, 8 and 4 residual units.

        :param inputs: image batch tensor handed to the first convolution.
        :return: ``(route_1, route_2, net)`` -- the outputs of the first
                 8-unit stack, the second 8-unit stack and the final 4-unit
                 stack. Per the original author's note these are
                 [52, 52, 256], [26, 26, 512] and [13, 13, 1024]
                 (presumably for a 416x416 input -- TODO confirm).
        """
        conv = common._conv2d_fixed_padding

        net = conv(inputs, 32, 3, strides=1)
        net = conv(net, 64, 3, strides=2)
        net = self._residual_stage(net, 32, 1)

        net = conv(net, 128, 3, strides=2)
        net = self._residual_stage(net, 64, 2)

        net = conv(net, 256, 3, strides=2)
        route_1 = self._residual_stage(net, 128, 8)

        net = conv(route_1, 512, 3, strides=2)
        route_2 = self._residual_stage(net, 256, 8)

        net = conv(route_2, 1024, 3, strides=2)
        net = self._residual_stage(net, 512, 4)

        return route_1, route_2, net



class yolov3(object):

    def __init__(self, num_classes, anchors,
                 batch_norm_decay=0.9, leaky_relu=0.1):

        # self._ANCHORS = [[10 ,13], [16 , 30], [33 , 23],
                         # [30 ,61], [62 , 45], [59 ,119],
                         # [116,90], [156,198], [373,326]]
        self._ANCHORS = anchors
        self._BATCH_NORM_DECAY = batch_norm_decay
        self._LEAKY_RELU = leaky_relu
        self._NUM_CLASSES = num_classes
        self.feature_maps = [] # [[None, 13, 13, 255], [None, 26, 26, 255], [None, 52, 52, 255]]

    def _yolo_block(self, inputs, filters):
        """
        Stack of alternating 1x1 / 3x3 convolutions used in front of every
        detection layer.

        Layer order: (1x1, 3x3) twice, one more 1x1 whose output is kept as
        ``route``, then a final 3x3. The 1x1 layers are requested with
        ``filters`` and the 3x3 layers with ``2 * filters``.

        :return: ``(route, net)`` -- the output of the last 1x1 convolution
                 and the output of the final 3x3 convolution.
        """
        conv = common._conv2d_fixed_padding

        net = inputs
        for _ in range(2):
            net = conv(net, filters * 1, 1)
            net = conv(net, filters * 2, 3)

        route = conv(net, filters * 1, 1)
        net = conv(route, filters * 2, 3)
        return route, net

    # Produces a feature map of shape [grid, grid, num_anchors * (5 + num_classes)], e.g. 3 * (5 + 80) = 255 channels.
    def _detection_layer(self, inputs, anchors):
        """
        Project ``inputs`` to raw detection outputs with one 1x1 convolution.

        The layer has ``len(anchors) * (5 + num_classes)`` outputs and is
        created without a normaliser or activation, with a zero-initialised
        bias.
        """
        out_channels = len(anchors) * (5 + self._NUM_CLASSES)
        return slim.conv2d(
            inputs, out_channels, 1,
            stride=1,
            normalizer_fn=None,
            activation_fn=None,
            biases_initializer=tf.zeros_initializer())

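    # Takes a [grid, grid, num_anchors * (5 + num_classes)] feature map plus its anchors and returns the
    # per-cell top-left offsets [13*13*1*2], the boxes [13*13*3*4], the box confidence logits [13*13*3*1]
    # and the class logits [13*13*3*80] (shapes shown for a 13x13 grid, 3 anchors and 80 classes).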
    def _reorg_layer(self, feature_map, anchors):
        """
        Decode one raw detection feature map into boxes and logits.

        Reads ``self.img_size`` (set by ``forward``) and needs the feature
        map's height and width to be statically known.

        :param feature_map: raw output of ``_detection_layer``.
        :param anchors: anchors for this scale; multiplied onto the
               exponentiated size predictions ([w, h] order per the original
               comment).
        :return: ``(x_y_offset, boxes, conf_logits, prob_logits)`` where
                 ``x_y_offset`` is [grid_h, grid_w, 1, 2] and ``boxes`` holds
                 (center_x, center_y, width, height) on its last axis.
        """
        anchors_per_cell = len(anchors)
        # Static grid size as [height, width] (dims 1 and 2 of the feature map).
        grid_hw = feature_map.shape.as_list()[1:3]
        grid_h, grid_w = grid_hw

        # Down-sampling factor from input image to this grid, in [h, w] order.
        stride = tf.cast(self.img_size // grid_hw, tf.float32)

        # [batch, grid_h, grid_w, anchors_per_cell, 5 + num_classes]
        feature_map = tf.reshape(
            feature_map,
            [-1, grid_h, grid_w, anchors_per_cell, 5 + self._NUM_CLASSES])

        # Last axis layout: 2 center values, 2 size values, 1 confidence, num_classes scores.
        xy_logits, box_sizes, conf_logits, prob_logits = tf.split(
            feature_map, [2, 2, 1, self._NUM_CLASSES], axis=-1)
        # The sigmoid keeps each predicted center inside its own grid cell.
        box_centers = tf.nn.sigmoid(xy_logits)

        cols = tf.range(grid_w, dtype=tf.int32)
        rows = tf.range(grid_h, dtype=tf.int32)
        # col_grid holds the x index and row_grid the y index of every cell.
        col_grid, row_grid = tf.meshgrid(cols, rows)
        x_y_offset = tf.concat(
            [tf.reshape(col_grid, (-1, 1)), tf.reshape(row_grid, (-1, 1))],
            axis=-1)
        # (x, y) index of each cell's top-left corner, as float [grid_h, grid_w, 1, 2].
        x_y_offset = tf.reshape(x_y_offset, [grid_h, grid_w, 1, 2])
        x_y_offset = tf.cast(x_y_offset, tf.float32)

        # Cell-relative center + cell index, scaled by the stride;
        # stride is [h, w] so it is reversed to line up with (x, y).
        box_centers = (box_centers + x_y_offset) * stride[::-1]

        box_sizes = tf.exp(box_sizes) * anchors
        boxes = tf.concat([box_centers, box_sizes], axis=-1)
        return x_y_offset, boxes, conf_logits, prob_logits
    # Returned shapes for a 13x13 grid with 3 anchors and 80 classes:
    # x_y_offset [13*13*1*2], boxes [13*13*3*4], conf_logits [13*13*3*1], prob_logits [13*13*3*80]

    # Upsampling helper
    @staticmethod
    def _upsample(inputs, out_shape):
        """
        Resize ``inputs`` with nearest-neighbour interpolation.

        :param inputs: tensor to resize.
        :param out_shape: shape whose entries 1 and 2 give the target height
               and width.
        :return: the resized tensor, passed through an identity op named
                 ``'upsampled'``.
        """
        target_hw = (out_shape[1], out_shape[2])
        resized = tf.image.resize_nearest_neighbor(inputs, target_hw)
        return tf.identity(resized, name='upsampled')

    # @staticmethod
    # def _upsample(inputs, out_shape):
        # """
        # replace resize_nearest_neighbor with conv2d_transpose To support TensorRT 5 optimization
        # """
        # new_height, new_width = out_shape[1], out_shape[2]
        # filters = 256 if (new_height == 26 and new_width==26) else 128
        # inputs = tf.layers.conv2d_transpose(inputs, filters, kernel_size=3, padding='same',
                                            # strides=(2,2), kernel_initializer=tf.ones_initializer())
        # return inputs
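    # Builds three detection feature maps at different scales, fusing the upsampled deeper features with the route_2 / route_1 backbone outputs.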
    def forward(self, inputs, is_training=False, reuse=False):
        """
        Creates YOLO v3 model.

        Builds the darknet-53 backbone followed by three detection heads and,
        as a side effect, stores the dynamic input height/width in
        ``self.img_size`` (read later by ``_reorg_layer`` and ``loss_layer``).

        :param inputs: a 4-D tensor of size [batch_size, height, width, channels].
               Dimension batch_size may be undefined. The channel order is RGB.
        :param is_training: whether is training or not.
        :param reuse: whether or not the network and its variables should be reused.
        :return: tuple ``(feature_map_1, feature_map_2, feature_map_3)`` of raw
                 detection outputs, each with ``3 * (5 + num_classes)`` channels
                 when three anchors are used per scale.
        """
        # it will be needed later on
        self.img_size = tf.shape(inputs)[1:3]  # dims 1 and 2 of the input: image height and width
        # set batch norm params
        batch_norm_params = {
            'decay': self._BATCH_NORM_DECAY,
            'epsilon': 1e-05,
            'scale': True,
            'is_training': is_training,
            'fused': None,  # Use fused batch norm if possible.
        }

        # Set activation_fn and parameters for conv2d, batch_norm.
        with slim.arg_scope([slim.conv2d, slim.batch_norm, common._fixed_padding],reuse=reuse):
            with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm,
                                normalizer_params=batch_norm_params,
                                biases_initializer=None,
                                activation_fn=lambda x: tf.nn.leaky_relu(x, alpha=self._LEAKY_RELU)):
                with tf.variable_scope('darknet-53'):
                    route_1, route_2, inputs = darknet53(inputs).outputs
                    # The backbone returns three feature maps of decreasing
                    # spatial size (route_1, route_2, final output).

                with tf.variable_scope('yolo-v3'):
                    # First head: backbone output -> yolo block -> detection layer.
                    route, inputs = self._yolo_block(inputs, 512)
                    # Convert to the final detection layout
                    # [grid, grid, num_anchors * (5 + num_classes)].
                    feature_map_1 = self._detection_layer(inputs, self._ANCHORS[6:9])
                    # feature_map_1 comes straight from darknet + yolo block + detection layer;
                    # [13, 13, 255] for a 416x416 input and 80 classes (per the original notes).
                    feature_map_1 = tf.identity(feature_map_1, name='feature_map_1')

                    # Second head: upsample `route` to route_2's size, concatenate
                    # with route_2 on the channel axis, then yolo block + detection layer.
                    inputs = common._conv2d_fixed_padding(route, 256, 1)  # `route` is the yolo block's intermediate output
                    upsample_size = route_2.get_shape().as_list()   # static shape of route_2, e.g. [None, 26, 26, 512]
                    inputs = self._upsample(inputs, upsample_size)  # resize to route_2's height/width
                    inputs = tf.concat([inputs, route_2], axis=3)  # e.g. [26, 26, 256 + 512 = 768]

                    route, inputs = self._yolo_block(inputs, 256)
                    feature_map_2 = self._detection_layer(inputs, self._ANCHORS[3:6])
                    feature_map_2 = tf.identity(feature_map_2, name='feature_map_2')  # e.g. [26, 26, 255]

                    # Third head: same procedure again, this time fused with route_1.
                    inputs = common._conv2d_fixed_padding(route, 128, 1)
                    upsample_size = route_1.get_shape().as_list()
                    inputs = self._upsample(inputs, upsample_size)
                    inputs = tf.concat([inputs, route_1], axis=3)

                    route, inputs = self._yolo_block(inputs, 128)
                    feature_map_3 = self._detection_layer(inputs, self._ANCHORS[0:3])
                    feature_map_3 = tf.identity(feature_map_3, name='feature_map_3')  # e.g. [52, 52, 255]

            return feature_map_1, feature_map_2, feature_map_3

    def _reshape(self, x_y_offset, boxes, confs, probs):
        """
        Flatten the grid and anchor axes of one scale's predictions.

        :param x_y_offset: per-cell offsets; its first two static dimensions
               are used as the grid height and width.
        :param boxes: box tensor, [batch, grid_h, grid_w, num_anchors, 4] when
               it comes from ``_reorg_layer``.
        :param confs: confidence tensor with a trailing dimension of 1.
        :param probs: class tensor with a trailing dimension of num_classes.
        :return: ``(boxes, confs, probs)`` reshaped to
                 [batch, grid_h * grid_w * num_anchors, 4 | 1 | num_classes].
        """
        grid_h, grid_w = x_y_offset.shape.as_list()[:2]

        # Read the anchors-per-cell count from the tensor instead of assuming
        # 3, so this stays consistent with _detection_layer / _reorg_layer,
        # which size everything from len(anchors). Fall back to the previous
        # hard-coded value when the static shape does not reveal it.
        num_anchors = None
        if boxes.shape.ndims is not None and boxes.shape.ndims >= 2:
            num_anchors = boxes.shape.as_list()[-2]
        if num_anchors is None:
            num_anchors = 3

        boxes_per_image = grid_h * grid_w * num_anchors
        boxes = tf.reshape(boxes, [-1, boxes_per_image, 4])
        confs = tf.reshape(confs, [-1, boxes_per_image, 1])
        probs = tf.reshape(probs, [-1, boxes_per_image, self._NUM_CLASSES])

        return boxes, confs, probs



    def predict(self, feature_maps):
        """
        Decode the raw feature maps into boxes, confidences and class
        probabilities.

        :param feature_maps: exactly three tensors, e.g.
               [None, 13, 13, 255], [None, 26, 26, 255], [None, 52, 52, 255].
        :return: ``(boxes, confs, probs)`` concatenated over all three scales
                 along axis 1. For the example above that is
                 (13*13 + 26*26 + 52*52) * 3 rows per image; ``boxes`` holds
                 corner coordinates (x0, y0, x1, y1) = (top-left,
                 bottom-right), ``confs`` and ``probs`` are sigmoid outputs.
        """
        first_map, second_map, third_map = feature_maps
        # Each scale gets its own three anchors: [6:9] for the first map,
        # [0:3] for the last. With the example anchors listed in __init__ that
        # pairs the largest anchors with the first map.
        scale_inputs = (
            (first_map, self._ANCHORS[6:9]),
            (second_map, self._ANCHORS[3:6]),
            (third_map, self._ANCHORS[0:3]),
        )

        # One (x_y_offset, boxes, conf_logits, prob_logits) tuple per scale.
        decoded = [self._reorg_layer(fmap, anchor_set)
                   for fmap, anchor_set in scale_inputs]

        per_scale_boxes, per_scale_confs, per_scale_probs = [], [], []
        for x_y_offset, raw_boxes, raw_confs, raw_probs in decoded:
            # Flatten grid and anchor axes: [batch, grid_h * grid_w * 3, ...]
            flat_boxes, conf_logits, prob_logits = self._reshape(
                x_y_offset, raw_boxes, raw_confs, raw_probs)

            per_scale_boxes.append(flat_boxes)
            per_scale_confs.append(tf.sigmoid(conf_logits))
            per_scale_probs.append(tf.sigmoid(prob_logits))

        # Stack the three scales along the box axis.
        boxes = tf.concat(per_scale_boxes, axis=1)
        confs = tf.concat(per_scale_confs, axis=1)
        probs = tf.concat(per_scale_probs, axis=1)

        # Convert (center_x, center_y, width, height) to corner form.
        center_x, center_y, width, height = tf.split(boxes, [1, 1, 1, 1], axis=-1)
        x0 = center_x - width / 2.
        y0 = center_y - height / 2.
        x1 = center_x + width / 2.
        y1 = center_y + height / 2.

        boxes = tf.concat([x0, y0, x1, y1], axis=-1)
        return boxes, confs, probs

    def compute_loss(self, pred_feature_map, y_true, ignore_thresh=0.5, max_box_per_image=8):
        """
        Sum the loss terms over all prediction scales.

        :param pred_feature_map: sequence of raw feature maps, e.g.
               [feature_map_1, feature_map_2, feature_map_3] with shapes like
               [None, 13, 13, 3*85] etc.
        :param y_true: indexable collection of label tensors, one per entry of
               ``pred_feature_map`` and in the same order.
        :param ignore_thresh: IoU threshold forwarded to ``loss_layer``
               (previously accepted here but silently ignored).
        :param max_box_per_image: unused; kept so existing callers keep working.
        :return: list ``[total_loss, loss_xy, loss_wh, loss_conf, loss_class]``.
        """
        # Anchor groups in the same order as the feature maps produced by forward().
        anchor_groups = [self._ANCHORS[6:9], self._ANCHORS[3:6], self._ANCHORS[0:3]]
        loss_xy, loss_wh, loss_conf, loss_class = 0., 0., 0., 0.

        for i, feature_map_i in enumerate(pred_feature_map):
            result = self.loss_layer(feature_map_i, y_true[i], anchor_groups[i],
                                     ignore_thresh=ignore_thresh)
            loss_xy    += result[0]
            loss_wh    += result[1]
            loss_conf  += result[2]
            loss_class += result[3]

        total_loss = loss_xy + loss_wh + loss_conf + loss_class
        return [total_loss, loss_xy, loss_wh, loss_conf, loss_class]


    def loss_layer(self, feature_map_i, y_true, anchors, ignore_thresh=0.5):
        """
        Compute the four loss terms for a single prediction scale.

        Reads ``self.img_size``, so ``forward`` must have been called first.

        :param feature_map_i: raw feature map for this scale; its height and
               width must be statically known.
        :param y_true: labels for this scale, reshaped here to
               [N, grid_h, grid_w, len(anchors), 5 + num_classes] with the last
               axis laid out as (x, y, w, h, objectness, class scores...).
               Box values are presumably in input-image pixels, since they are
               divided by the image/grid ratio below -- TODO confirm.
        :param anchors: anchors used for this scale.
        :param ignore_thresh: predictions whose best IoU with any ground-truth
               box reaches this value are left out of the no-object
               confidence loss. Defaults to the previously hard-coded 0.5.
        :return: ``(xy_loss, wh_loss, conf_loss, class_loss)``, each summed
                 over the batch and divided by the batch size.
        """
        # size in [h, w] format! don't get messed up!
        grid_size = tf.shape(feature_map_i)[1:3]
        grid_size_ = feature_map_i.shape.as_list()[1:3]
        # Same anchor count that _reorg_layer uses for the predictions.
        num_anchors = len(anchors)

        # Bring the labels into the same layout as the decoded predictions.
        y_true = tf.reshape(
            y_true,
            [-1, grid_size_[0], grid_size_[1], num_anchors, 5 + self._NUM_CLASSES])

        # Down-scale ratio between input image and grid, in [h, w] order.
        ratio = tf.cast(self.img_size / grid_size, tf.float32)
        # N: batch size (first dimension of the feature map).
        N = tf.cast(tf.shape(feature_map_i)[0], tf.float32)

        # Decoded predictions: cell offsets, boxes (xy + wh), and raw logits.
        x_y_offset, pred_boxes, pred_conf_logits, pred_prob_logits = self._reorg_layer(feature_map_i, anchors)

        # [N, grid_h, grid_w, A, 1]: channel 4 of the labels marks object presence.
        object_mask = y_true[..., 4:5]
        # [N, grid_h, grid_w, A, 4] masked by [N, grid_h, grid_w, A] ==> [V, 4]
        # V: number of ground-truth boxes, i.e. entries whose mask is non-zero.
        valid_true_boxes = tf.boolean_mask(y_true[..., 0:4], tf.cast(object_mask[..., 0], 'bool'))

        # [V, 2] each
        valid_true_box_xy = valid_true_boxes[:, 0:2]  # ground-truth centers
        valid_true_box_wh = valid_true_boxes[:, 2:4]  # ground-truth sizes
        # [N, grid_h, grid_w, A, 2] each
        pred_box_xy = pred_boxes[..., 0:2]  # predicted centers
        pred_box_wh = pred_boxes[..., 2:4]  # predicted sizes

        # [N, grid_h, grid_w, A, V]: IoU of every prediction with every ground-truth box.
        iou = self._broadcast_iou(valid_true_box_xy, valid_true_box_wh, pred_box_xy, pred_box_wh)

        # [N, grid_h, grid_w, A]: best IoU of each prediction over all ground-truth boxes.
        best_iou = tf.reduce_max(iou, axis=-1)
        # 1.0 where the best IoU is below the threshold. Only those predictions
        # are penalised as background below; the others are ignored.
        ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32)
        # [N, grid_h, grid_w, A, 1]
        ignore_mask = tf.expand_dims(ignore_mask, -1)

        # Centers expressed in grid units relative to the cell's top-left corner.
        # ratio is [h, w], so it is reversed to line up with (x, y).
        # [N, grid_h, grid_w, A, 2]
        true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset
        pred_xy = pred_box_xy      / ratio[::-1] - x_y_offset

        # Sizes relative to the anchors.
        # [N, grid_h, grid_w, A, 2]
        true_tw_th = y_true[..., 2:4] / anchors
        pred_tw_th = pred_box_wh      / anchors
        # For numerical stability: replace exact zeros by one so the log below is 0.
        true_tw_th = tf.where(condition=tf.equal(true_tw_th, 0),
                              x=tf.ones_like(true_tw_th), y=true_tw_th)
        pred_tw_th = tf.where(condition=tf.equal(pred_tw_th, 0),
                              x=tf.ones_like(pred_tw_th), y=pred_tw_th)

        # Clamp to [1e-9, 1e9] before taking the log.
        true_tw_th = tf.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9))
        pred_tw_th = tf.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9))

        # box size punishment:
        # box with smaller area has bigger weight. This is taken from the yolo darknet C source code.
        # img_size is [h, w]: width is normalised by img_size[1], height by img_size[0].
        # shape: [N, grid_h, grid_w, A, 1]
        box_loss_scale = 2. - (y_true[..., 2:3] / tf.cast(self.img_size[1], tf.float32)) * (y_true[..., 3:4] / tf.cast(self.img_size[0], tf.float32))

        # Squared-error box losses, counted only where an object is present and
        # weighted by box_loss_scale.
        xy_loss = tf.reduce_sum(tf.square(true_xy    - pred_xy) * object_mask * box_loss_scale) / N
        wh_loss = tf.reduce_sum(tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale) / N

        # Confidence loss: cross-entropy against the object mask. Positives are
        # the cells holding an object; negatives are cells without an object
        # that are not excluded by the ignore mask.
        # shape: [N, grid_h, grid_w, A, 1]
        conf_pos_mask = object_mask
        conf_neg_mask = (1 - object_mask) * ignore_mask
        # Computed once and shared by both terms (the original built the same op twice).
        conf_cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, logits=pred_conf_logits)
        conf_loss_pos = conf_pos_mask * conf_cross_entropy
        conf_loss_neg = conf_neg_mask * conf_cross_entropy
        conf_loss = tf.reduce_sum(conf_loss_pos + conf_loss_neg) / N

        # Class loss: per-class sigmoid cross-entropy, counted only where an object is present.
        class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true[..., 5:], logits=pred_prob_logits)
        class_loss = tf.reduce_sum(class_loss) / N

        return xy_loss, wh_loss, conf_loss, class_loss

    def _broadcast_iou(self, true_box_xy, true_box_wh, pred_box_xy, pred_box_wh):
        '''
        Compute the IoU matrix between every predicted box and every
        ground-truth box using broadcasting.

        Boxes are given as centers (xy) and sizes (wh).

        :param true_box_xy: ground-truth centers, [V, 2].
        :param true_box_wh: ground-truth sizes, [V, 2].
        :param pred_box_xy: predicted centers, e.g. [N, 13, 13, 3, 2].
        :param pred_box_wh: predicted sizes, e.g. [N, 13, 13, 3, 2].
        :return: IoU tensor, e.g. [N, 13, 13, 3, V].
        '''
        # [N, 13, 13, 3, 2] -> [N, 13, 13, 3, 1, 2]: new axis to broadcast over V.
        pred_xy = tf.expand_dims(pred_box_xy, -2)
        pred_wh = tf.expand_dims(pred_box_wh, -2)

        # [V, 2] -> [1, V, 2]
        true_xy = tf.expand_dims(true_box_xy, 0)
        true_wh = tf.expand_dims(true_box_wh, 0)

        # Corners of the overlap rectangle:
        # [N, 13, 13, 3, 1, 2] & [1, V, 2] ==> [N, 13, 13, 3, V, 2]
        overlap_min = tf.maximum(pred_xy - pred_wh / 2.,
                                 true_xy - true_wh / 2.)
        overlap_max = tf.minimum(pred_xy + pred_wh / 2.,
                                 true_xy + true_wh / 2.)
        # Negative extents mean no overlap; clamp them to zero.
        overlap_wh = tf.maximum(overlap_max - overlap_min, 0.)

        # [N, 13, 13, 3, V]
        overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]
        # [N, 13, 13, 3, 1]
        pred_area = pred_wh[..., 0] * pred_wh[..., 1]
        # [1, V]
        true_area = true_wh[..., 0] * true_wh[..., 1]

        # [N, 13, 13, 3, V]
        union_area = pred_area + true_area - overlap_area
        return overlap_area / union_area

如有错误欢迎指正,谢谢!


你可能感兴趣的:(yolov3网络架构以及代码解析二)