YOLO-V3代码解析系列(三) —— 数据处理(dataset.py)

代码结构

Dataset类代码结构:

  • 初始化数据类,设置训练相关参数
  • parse_annotation():将图像和标签处理到特定尺度,以及相应的预处理
  • preprocess_true_boxes():进一步处理标签数据,具体见代码和注释

代码解析

import os
import cv2
import random
import numpy as np
import tensorflow as tf
import core.utils as utils
from core.config import cfg


class Dataset(object):
    """implement Dataset here"""

    def __init__(self, dataset_type):
        """Read the dataset settings from ``cfg`` and load the annotation list.

        Args:
            dataset_type: ``'train'`` selects the ``cfg.TRAIN`` section; any
                other value selects ``cfg.TEST``.
        """
        split_cfg = cfg.TRAIN if dataset_type == 'train' else cfg.TEST
        self.annot_path = split_cfg.ANNOT_PATH
        self.input_sizes = split_cfg.INPUT_SIZE
        self.batch_size = split_cfg.BATCH_SIZE
        self.data_aug = split_cfg.DATA_AUG

        # Multi-scale training: one of these preset input sizes is drawn at
        # random for every batch. Per the original author's note each size must
        # be a multiple of 32, e.g.
        # [320, 352, 384, 416, 448, 480, 512, 544, 576, 608].
        # NOTE: this is read from cfg.TRAIN regardless of dataset_type.
        self.train_input_sizes = cfg.TRAIN.INPUT_SIZE

        # Output strides of the network. Per the original author's note YOLOv3
        # has three output scales, output_stride=[8, 16, 32], aimed at small,
        # medium and large objects respectively.
        self.strides = np.array(cfg.YOLO.STRIDES)

        self.classes = utils.read_class_names(cfg.YOLO.CLASSES)
        self.num_classes = len(self.classes)

        # Anchors generated ahead of time.
        self.anchors = np.array(utils.get_anchors(cfg.YOLO.ANCHORS))
        self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE
        self.max_bbox_per_scale = 150

        # Load every usable line of the annotation file up front.
        self.annotations = self.load_annotations()
        self.num_samples = len(self.annotations)
        batches_needed = np.ceil(self.num_samples / self.batch_size)
        self.num_batchs = int(batches_needed)
        self.batch_count = 0

    def load_annotations(self):
        # 训练集路径以及目标框标注, 读取制作的train.txt
        # train.txt中每一条数据内容:image_path+image_name xmin,ymin,xmax,ymax,class_id
        with open(self.annot_path, 'r') as f:
            txt = f.readlines()
            annotations = [line.strip() for line in txt if len(line.strip().split()[1:]) != 0]
        np.random.shuffle(annotations)
        return annotations

    # 表明该数据类是可迭代类
    def __iter__(self):
        return self

    # 可迭代对象具体的迭代算法
    def __next__(self):
        """Build and return the next batch, or end the current epoch.

        A training input size is drawn at random on every call and stored in
        ``self.train_input_size`` / ``self.train_output_sizes``; these are read
        by ``parse_annotation`` and ``preprocess_true_boxes``.

        Returns:
            A 7-tuple ``(batch_image, batch_label_sbbox, batch_label_mbbox,
            batch_label_lbbox, batch_sbboxes, batch_mbboxes, batch_lbboxes)``:

            * ``batch_image``: ``(batch_size, input_size, input_size, 3)``.
            * ``batch_label_*``: ``(batch_size, output_size, output_size,
              anchor_per_scale, 5 + num_classes)`` for the first, second and
              third entry of ``self.strides``.
            * ``batch_*bboxes``: ``(batch_size, max_bbox_per_scale, 4)``.

        Raises:
            StopIteration: once ``self.num_batchs`` batches have been served.
                The batch counter is reset and the annotations are reshuffled
                first, so the object can be iterated again.
        """
        with tf.device('/cpu:0'):
            # Multi-scale training: pick one input size for this batch. This is
            # done before the end-of-epoch check so the sequence of values
            # drawn from ``random`` is the same as it has always been.
            self.train_input_size = random.choice(self.train_input_sizes)
            # Grid size of each network output for the chosen input size.
            self.train_output_sizes = self.train_input_size // self.strides

            if self.batch_count >= self.num_batchs:
                # Epoch finished: reset and reshuffle for the next pass.
                self.batch_count = 0
                np.random.shuffle(self.annotations)
                raise StopIteration

            # Buffers are allocated only when a batch is actually produced.
            batch_image = np.zeros((self.batch_size, self.train_input_size, self.train_input_size, 3))

            # One label tensor per output scale.
            batch_label_sbbox, batch_label_mbbox, batch_label_lbbox = [
                np.zeros((self.batch_size,
                          output_size,
                          output_size,
                          self.anchor_per_scale,
                          5 + self.num_classes))
                for output_size in self.train_output_sizes[:3]
            ]

            # Ground-truth boxes (x, y, w, h) kept per scale.
            batch_sbboxes, batch_mbboxes, batch_lbboxes = [
                np.zeros((self.batch_size, self.max_bbox_per_scale, 4))
                for _ in range(3)
            ]

            # NOTE (original author): data is read one batch at a time right
            # before it is fed to the network, so the GPU is idle most of the
            # time and training is slow.
            first_index = self.batch_count * self.batch_size
            for num in range(self.batch_size):
                # Wrap around with modulo so the last batch is padded with
                # samples from the start of the list. A single subtraction is
                # not enough when batch_size exceeds twice the sample count.
                index = (first_index + num) % self.num_samples
                annotation = self.annotations[index]

                # Image and boxes brought to the input size chosen above.
                image, bboxes = self.parse_annotation(annotation)

                # Per-scale label tensors and box lists for the loss.
                (label_sbbox, label_mbbox, label_lbbox,
                 sbboxes, mbboxes, lbboxes) = self.preprocess_true_boxes(bboxes)

                batch_image[num, :, :, :] = image
                batch_label_sbbox[num, :, :, :, :] = label_sbbox
                batch_label_mbbox[num, :, :, :, :] = label_mbbox
                batch_label_lbbox[num, :, :, :, :] = label_lbbox
                batch_sbboxes[num, :, :] = sbboxes
                batch_mbboxes[num, :, :] = mbboxes
                batch_lbboxes[num, :, :] = lbboxes

            self.batch_count += 1
            return (batch_image,
                    batch_label_sbbox,
                    batch_label_mbbox,
                    batch_label_lbbox,
                    batch_sbboxes,
                    batch_mbboxes,
                    batch_lbboxes)

image, bboxes = self.parse_annotation(annotation),该函数主要是将原始尺度下的图像和标签(边界框)处理到目标尺度下,并且进行数据增强操作,增强数据的多样性,进而提升网络的泛化能力。

    def parse_annotation(self, annotation):
        # 读取图片和标签
        line = annotation.split()
        image_path = line[0]

        if not os.path.exists(image_path):
            raise KeyError("%s does not exist ... " % image_path)

        image = np.array(cv2.imread(image_path))
        bboxes = np.array([list(map(lambda x: int(float(x)), box.split(','))) for box in line[1:]])

        # 数据增强处理
        if self.data_aug:
            image, bboxes = self.random_horizontal_flip(np.copy(image), np.copy(bboxes))
            image, bboxes = self.random_crop(np.copy(image), np.copy(bboxes))
            image, bboxes = self.random_translate(np.copy(image), np.copy(bboxes))

        # 颜色转换(BGR->RGB), 将图片和标签处理到随机选择的尺度,并且填充为正方形
        image, bboxes = utils.image_preporcess(np.copy(image),
                                               [self.train_input_size, self.train_input_size],
                                               np.copy(bboxes))
        return image, bboxes

label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = self.preprocess_true_boxes(bboxes):顾名思义,是对真实框进行进一步预处理,便于后续计算损失函数。为了更好地理解这个函数,有必要先说明 bboxes 和 label 分别表示什么,以及具体是怎么操作的。

bboxes 是用来存放真实框的中心坐标以及宽高(x,y,w,h),其shape为(3,150,4),3表示3种网格尺寸,150表示每种网格尺寸允许存放的最大真实框数量,4就是(x,y,w,h)。
label 是用来存放3种网格尺寸下每一个网格、每一个先验框对应的中心坐标、宽高、置信度以及类别概率(经过标签平滑的one-hot向量),即(x, y, w, h, conf, class_probs)。每种尺度下label的shape为(train_output_size, train_output_size, anchor_per_scale, 5+num_classes),3种尺度合起来可记为(3, train_output_size, train_output_size, anchor_per_scale, 5+num_classes)。label初始化为0矩阵,即每个网格的信息都填充为0。每个尺度下,分别计算3个先验框和相应尺度下真实框的iou值,筛选iou值>0.3的先验框并标记索引,然后将真实框的(x, y, w, h)和类别概率填充到真实框中心所在的网格中(对应标记索引),并将置信度设为1。如果3种尺度下所有先验框的iou值都不大于0.3,则选取iou值最大的那个先验框作为正样本,按同样方式填充。

    def preprocess_true_boxes(self, bboxes):
        label = [np.zeros((self.train_output_sizes[i],
                           self.train_output_sizes[i],
                           self.anchor_per_scale,
                           5 + self.num_classes)) for i in range(3)]

        # print(label[0].shape)
        # print(label[1].shape)
        # print(label[2].shape)

        # [(150, 4), (150, 4), (150, 4)], 每种stride下最多产生150个真实框
        bboxes_xywh = [np.zeros((self.max_bbox_per_scale, 4)) for _ in range(3)]
        bbox_count = np.zeros((3,))

        for bbox in bboxes:
            # 获取坐标和类别值
            bbox_coor = bbox[:4]
            bbox_class_ind = bbox[4]

            # 标签平滑处理
            onehot = np.zeros(self.num_classes, dtype=np.float)
            onehot[bbox_class_ind] = 1.0
            uniform_distribution = np.full(self.num_classes, 1.0 / self.num_classes)
            deta = 0.01
            smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

            # 计算中心点坐标(x,y) = ((x_max, y_max) + (x_min, y_min)) * 0.5
            # 计算宽高(w,h) = (x_max, y_max) - (x_min, y_min), 拼接成一个数组(x, y, w, h).
            bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5, bbox_coor[2:] - bbox_coor[:2]], axis=-1)
            print('bbox_xywh: ', bbox_xywh)
            # bbox_xywh: [431.5 282.5 135.  153. ]

            # 8, 16, 32倍对中心点以及宽高缩放, shape=(3, 4)
            bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / self.strides[:, np.newaxis]
            print(self.strides[:, np.newaxis])
            print('bbox_xywh_scaled:\n', bbox_xywh_scaled)
            # self.strides[:, np.newaxis] 输出如下:
            # [[ 8]
            #  [16]
            #  [32]]
            # ==========================================
            # bbox_xywh_scaled:[x y w h]
            #   [[53.9375   35.3125   16.875    19.125   ]
            #    [26.96875  17.65625   8.4375    9.5625  ]
            #    [13.484375  8.828125  4.21875   4.78125 ]]

            iou = []
            exist_positive = False
            for i in range(3):  # 3种stride尺度[8,16,32]
                # 用于存储每种网格尺寸[8,16,32]下3个anchor框的中心位置和宽高
                anchors_xywh = np.zeros((self.anchor_per_scale, 4))
                # 将3种尺度下的anchor分别移到三种尺度下真实框的中心,重合方便计算IOU
                # anchor只有宽高,没有中心
                anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
                anchors_xywh[:, 2:4] = self.anchors[i]
                print("anchor[i]:\n", self.anchors[i])
                print("anchor_xywh:\n", anchors_xywh)
                # anchor[0]: stride=8下的anchor,每个stride下有三个先验anchor
                #    [[ 16.75   19.   ]
                #     [ 38.75   42.125]
                #     [ 95.625 110.125]]
                # anchor_xywh: 移至到真实框的中心
                #    [[ 53.5    35.5    16.75   19.   ]
                #     [ 53.5    35.5    38.75   42.125]
                #     [ 53.5    35.5    95.625 110.125]]

                # 计算1个尺度下真实框分别与3个anchor的iou值
                iou_scale = self.bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
                iou.append(iou_scale)
                iou_mask = iou_scale > 0.3
                print('iou_mask: ', iou_mask)
                # iou_mask = [True,True,False]

                # iou > 0.3 的anchor, 执行此处处理
                if np.any(iou_mask):
                    # 获取真实框的中心,也即是在网络输出上的网格左上角坐标
                    # np.floor:返回的是不大于当前值的最大整数,也就是左上角坐标
                    xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32)
                    print("yind, xind: ", xind, yind)

                    # label[i].shape: (output_size, output_size, anchor_per_scale, 5+num_class)
                    # 初始化全用零填充
                    label[i][yind, xind, iou_mask, :] = 0
                    # print(label[i][yind, xind])

                    # 网格(yind,xind)处有目标, 填充真实框的中心和宽高
                    label[i][yind, xind, iou_mask, 0:4] = bbox_xywh
                    # 设置置信度为1.0, 表面有目标
                    label[i][yind, xind, iou_mask, 4:5] = 1.0
                    label[i][yind, xind, iou_mask, 5:] = smooth_onehot
                    print("label:\n", label[i].shape)
                    print(label[i][yind, xind])
                    # 假设iou_mask=[True,True,False],则填充如下:
                    # [[260.5 130.5 153.  147.    1.    1.]
                    #  [260.5 130.5 153.  147.    1.    1.]
                    #  [0.    0.    0.    0.      0.    0.]]
                    # exit()
                    # label[i].shape: (72, 72, 3, 6)

                    # 计算真实框的索引, 根据场景的真实标签,可以调整数值大小
                    bbox_ind = int(bbox_count[i] % self.max_bbox_per_scale)
                    bboxes_xywh[i][bbox_ind, :4] = bbox_xywh

                    # 记录该尺度下真实框的数量
                    bbox_count[i] += 1

                    exist_positive = True
            print('===============================')
            # iou < 0.3 执行此处处理,则选取最大的IOU,作为正样本.
            if not exist_positive:
                print('+++++++++++++++++++++++++++')
                best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
                best_detect = int(best_anchor_ind / self.anchor_per_scale)
                best_anchor = int(best_anchor_ind % self.anchor_per_scale)
                xind, yind = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)

                label[best_detect][yind, xind, best_anchor, :] = 0
                label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
                label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
                label[best_detect][yind, xind, best_anchor, 5:] = smooth_onehot

                bbox_ind = int(bbox_count[best_detect] % self.max_bbox_per_scale)
                bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh
                bbox_count[best_detect] += 1
        # print('label shape: ', len(label))
        # print(label[0].shape)
        # print(len(label))

        # 5+num_class:存放三种尺度下中心坐标,宽高,置信度,类别概率
        # label shape:(3, output_size, output_size, anchor_per_scale, 5+num_class)
        label_sbbox, label_mbbox, label_lbbox = label

        # bboxes: 存放真实框的中心和宽高坐标
        # shape: (3, 150, 4)
        # 3: 3种尺度;150:每种尺度下最大的真实框的数量;4:(x,y,w,h)
        sbboxes, mbboxes, lbboxes = bboxes_xywh
        return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes

你可能感兴趣的:(YOLO-V3代码解析)