简书:关于YOLOv3的一些细节
github:源代码作者YOLOV3
知乎:【YOLO】yolo v1到yolo v3
知乎:目标检测|YOLO原理与实现
知乎:YOLO v3深入理解
CSDN:yolo系列之yolo v3【深度解析】
github:tensorflow-yolov3
该部分主要作用是:读取数据集信息,并将标注数据进行一系列的预处理,最终按batch_size大小返回:图像、大中小标记框的标签及坐标信息。下面这个图只能说个大概,细节都在代码里,话不多说,解析源码吧。
在代码的train.py的第34,35行调用了Dataset()
self.trainset = Dataset('train') # Dataset built with the 'train' type
self.testset = Dataset('test') # Dataset built with the 'test' type
然后进入到dataset.py文件,进入Dataset()类,首先执行__init__函数,该函数中除了从配置文件中读取一些相关参数外,还执行了下面这行代码:
self.annotations = self.load_annotations(dataset_type) # load the list of per-image annotation lines
该代码调用load_annotations()函数,读取图像信息,其信息格式应该是这样的:
格式为:图像路径 + 空格 + 标记框信息(x1,y1,x2,y2,itype),其中itype为类别序号;一张图像有多个标记框时,各标记框之间用空格分隔。例如:./VOC2007/JPEGImages/000002.jpg 139,200,207,301,18
load_annotations()函数源代码:
# 获取图像信息列表
def load_annotations(self, dataset_type):
    """Read the annotation file and return its usable lines in random order.

    A line is kept only when, after stripping, it holds at least two
    whitespace-separated fields (an image path followed by one or more
    boxes). Lines with fewer fields, including blank lines, are skipped.

    Args:
        dataset_type: Not used by this method; the file that is read is
            always ``self.annot_path``.

    Returns:
        list[str]: The kept, stripped lines, shuffled with
        ``np.random.shuffle``.
    """
    annotations = []
    with open(self.annot_path, 'r') as annot_file:
        for raw_line in annot_file.readlines():
            entry = raw_line.strip()
            # A lone path (or an empty line) carries no box information.
            if len(entry.split()) > 1:
                annotations.append(entry)
    np.random.shuffle(annotations)  # randomise sample order in place
    return annotations
下面进入关键的__next__函数:
该函数最终按batch_size大小返回:图像、大中小标记框的标签及坐标信息。
def __next__(self):
    """Assemble and return the next batch, or finish the current epoch.

    Every call first draws a new input size from ``self.train_input_sizes``
    and derives the three output grid sizes from it via ``self.strides``.

    Returns:
        tuple: ``(batch_image, batch_label_sbbox, batch_label_mbbox,
        batch_label_lbbox, batch_sbboxes, batch_mbboxes, batch_lbboxes)``,
        all NumPy arrays whose first axis has length ``self.batch_size``.

    Raises:
        StopIteration: Once ``self.batch_count`` has reached
            ``self.num_batchs``. Before raising, the counter is reset to 0
            and ``self.annotations`` is reshuffled.
    """
    with tf.device('/cpu:0'):
        # Pick this batch's input resolution and the matching grid sizes.
        self.train_input_size = random.choice(self.train_input_sizes)
        self.train_output_sizes = self.train_input_size // self.strides

        def _zero_labels(grid_size):
            # (batch, grid, grid, anchors per scale, 5 + num_classes)
            return np.zeros((self.batch_size, grid_size, grid_size,
                             self.anchor_per_scale, 5 + self.num_classes))

        def _zero_boxes():
            # (batch, max boxes per scale, 4)
            return np.zeros((self.batch_size, self.max_bbox_per_scale, 4))

        batch_image = np.zeros((self.batch_size, self.train_input_size, self.train_input_size, 3))
        batch_label_sbbox = _zero_labels(self.train_output_sizes[0])
        batch_label_mbbox = _zero_labels(self.train_output_sizes[1])
        batch_label_lbbox = _zero_labels(self.train_output_sizes[2])
        batch_sbboxes = _zero_boxes()
        batch_mbboxes = _zero_boxes()
        batch_lbboxes = _zero_boxes()

        # Epoch exhausted: reset state for the next pass and stop iterating.
        if self.batch_count >= self.num_batchs:
            self.batch_count = 0
            np.random.shuffle(self.annotations)
            raise StopIteration

        for num in range(self.batch_size):
            index = self.batch_count * self.batch_size + num
            # Wrap around so the final batch is topped up from the beginning.
            if index >= self.num_samples:
                index -= self.num_samples
            annotation = self.annotations[index]
            image, bboxes = self.parse_annotation(annotation)
            (label_sbbox, label_mbbox, label_lbbox,
             sbboxes, mbboxes, lbboxes) = self.preprocess_true_boxes(bboxes)

            batch_image[num, :, :, :] = image
            batch_label_sbbox[num, :, :, :, :] = label_sbbox
            batch_label_mbbox[num, :, :, :, :] = label_mbbox
            batch_label_lbbox[num, :, :, :, :] = label_lbbox
            batch_sbboxes[num, :, :] = sbboxes
            batch_mbboxes[num, :, :] = mbboxes
            batch_lbboxes[num, :, :] = lbboxes

        self.batch_count += 1
        return batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox, \
            batch_sbboxes, batch_mbboxes, batch_lbboxes
中间调用了 parse_annotation(annotation)函数,主要是 解析annotation,获取图像数据和标记框(做一些预处理);以及调用了preprocess_true_boxes(bboxes)函数,主要是对标记框做一些预处理。
parse_annotation()函数主要是根据列表信息读取图像以及获取该图像标记框信息,并根据需要对图像做一些处理,如:随机水平翻转、随机裁剪图像、随机移动图像,这3个操作各有1个函数,代码见下文。
'''
根据列表信息读取图像以及获取该图像标记框信息
annotation="./VOC2007/JPEGImages/000002.jpg 139,200,207,301,18"
'''
def parse_annotation(self, annotation):
    """Load the image and boxes described by one annotation line.

    Args:
        annotation: One line of the annotation file: an image path followed
            by whitespace-separated boxes, each a comma-separated list of
            integers, e.g.
            ``"./VOC2007/JPEGImages/000002.jpg 139,200,207,301,18"``.

    Returns:
        tuple: ``(image, bboxes)`` as returned by ``utils.image_preporcess``
        for a square target of side ``self.train_input_size``.

    Raises:
        KeyError: If the image path does not exist on disk.
    """
    fields = annotation.split()
    image_path = fields[0]
    if not os.path.exists(image_path):
        raise KeyError("%s does not exist ... " %image_path)

    image = np.array(cv2.imread(image_path))
    # One row of integers per box.
    bboxes = np.array([[int(value) for value in box.split(',')] for box in fields[1:]])

    if self.data_aug:
        # Each augmentation receives fresh copies of the image and boxes.
        augmentations = (self.random_horizontal_flip, self.random_crop, self.random_translate)
        for augment in augmentations:
            image, bboxes = augment(np.copy(image), np.copy(bboxes))

    # Resize to the input size chosen for this batch. Per the original
    # author's note the empty area is padded with gray value 128; that
    # happens inside utils.image_preporcess and is not visible here.
    target_size = [self.train_input_size, self.train_input_size]
    image, bboxes = utils.image_preporcess(np.copy(image), target_size, np.copy(bboxes))
    return image, bboxes
1.随机水平翻转
'''
随机水平翻转
'''
def random_horizontal_flip(self, image, bboxes):
    """Mirror the image left-to-right with probability 0.5.

    Args:
        image: Array indexed as ``(row, column, channel)``.
        bboxes: 2-D array whose columns 0 and 2 hold ``x1`` and ``x2``;
            it is modified in place when the flip is applied.

    Returns:
        tuple: ``(image, bboxes)``. The inputs are returned unchanged when
        the flip is skipped.
    """
    if random.random() >= 0.5:
        return image, bboxes

    _, width, _ = image.shape
    image = np.flip(image, axis=1)
    # Mirroring swaps the roles of x1 and x2: new_x1 = w - x2 - 1, new_x2 = w - x1 - 1.
    mirrored_x = width - bboxes[:, [2, 0]] - 1
    bboxes[:, [0, 2]] = mirrored_x
    return image, bboxes
2.随机裁剪图像
'''
随机裁剪图像
'''
def random_crop(self, image, bboxes):
    """Randomly crop the image with probability 0.5, keeping every box intact.

    The crop window always contains the smallest rectangle enclosing all
    boxes; each of its four edges is pushed outward from that rectangle by
    a random amount and then limited to the image bounds.

    Args:
        image: Array indexed as ``(row, column, channel)``.
        bboxes: 2-D array whose first four columns are ``x1, y1, x2, y2``;
            it is modified in place when the crop is applied.

    Returns:
        tuple: ``(image, bboxes)`` with box coordinates expressed relative
        to the cropped image. The inputs are returned unchanged when the
        crop is skipped.
    """
    if random.random() >= 0.5:
        return image, bboxes

    h, w, _ = image.shape
    # Smallest rectangle containing all boxes: [xmin, ymin, xmax, ymax].
    max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

    # Free margin between that rectangle and each image border.
    max_l_trans = max_bbox[0]
    max_u_trans = max_bbox[1]
    max_r_trans = w - max_bbox[2]
    max_d_trans = h - max_bbox[3]

    crop_xmin = max(0, int(max_bbox[0] - random.uniform(0, max_l_trans)))
    crop_ymin = max(0, int(max_bbox[1] - random.uniform(0, max_u_trans)))
    # The far edges must be limited with min(): using max() here would pin
    # them to the image border, so the right and bottom sides would never
    # actually be cropped.
    crop_xmax = min(w, int(max_bbox[2] + random.uniform(0, max_r_trans)))
    crop_ymax = min(h, int(max_bbox[3] + random.uniform(0, max_d_trans)))

    image = image[crop_ymin : crop_ymax, crop_xmin : crop_xmax]

    # Shift the boxes into the cropped image's coordinate frame.
    bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin
    bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin
    return image, bboxes
3.随机移动图像
'''
随机移动图像
'''
def random_translate(self, image, bboxes):
    """Shift the image by a random offset with probability 0.5.

    The offset range is derived from the free margin between the rectangle
    enclosing all boxes and the image borders.

    Args:
        image: Array indexed as ``(row, column, channel)``.
        bboxes: 2-D array whose first four columns are ``x1, y1, x2, y2``;
            it is modified in place when the shift is applied.

    Returns:
        tuple: ``(image, bboxes)``. The inputs are returned unchanged when
        the shift is skipped.
    """
    if random.random() >= 0.5:
        return image, bboxes

    h, w, _ = image.shape
    # Extents of the rectangle that encloses every box.
    left, top = np.min(bboxes[:, 0:2], axis=0)
    right, bottom = np.max(bboxes[:, 2:4], axis=0)
    room_right = w - right
    room_down = h - bottom

    tx = random.uniform(-(left - 1), (room_right - 1))
    ty = random.uniform(-(top - 1), (room_down - 1))

    # 2x3 affine matrix for a pure translation by (tx, ty).
    shift = np.array([[1, 0, tx], [0, 1, ty]])
    image = cv2.warpAffine(image, shift, (w, h))

    bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + tx
    bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + ty
    return image, bboxes
preprocess_true_boxes()函数主要是对标记框进行预处理:根据有无物体,以及anchor和GT的IOU,初始化置信度和分类概率,细节还需要看代码理解。
'''
标记框预处理
'''
def preprocess_true_boxes(self, bboxes):
    """Encode ground-truth boxes into label grids for the three scales.

    For every box, the anchors of each scale are centred on the grid cell
    containing the box centre and compared with the box via
    ``self.bbox_iou``. Every anchor whose IoU exceeds 0.3 is marked as
    responsible for the box. If no anchor on any scale qualifies, the
    single anchor with the highest IoU is used instead.

    Args:
        bboxes: Iterable of rows ``[x1, y1, x2, y2, class_index]``.

    Returns:
        tuple: ``(label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes,
        lbboxes)``. Each ``label_*`` array has shape
        ``(grid, grid, anchor_per_scale, 5 + num_classes)`` and stores
        ``[x, y, w, h, confidence, smoothed class probabilities]``. Each
        ``*bboxes`` array has shape ``(max_bbox_per_scale, 4)`` and lists
        the ``xywh`` of the boxes assigned to that scale.
    """
    label = [np.zeros((self.train_output_sizes[i], self.train_output_sizes[i], self.anchor_per_scale,
                       5 + self.num_classes)) for i in range(3)]
    bboxes_xywh = [np.zeros((self.max_bbox_per_scale, 4)) for _ in range(3)]
    bbox_count = np.zeros((3,))

    # Label-smoothing constants do not depend on the box, so build them once.
    deta = 0.01
    uniform_distribution = np.full(self.num_classes, 1.0 / self.num_classes)

    def _assign(scale, anchor_sel, cell_xy, bbox_xywh, smooth_onehot):
        # Write one box into label[scale] for the selected anchor(s) and
        # record its xywh in the per-scale box list.
        xind, yind = np.floor(cell_xy).astype(np.int32)
        # Keep the cell index inside the grid even for out-of-range
        # coordinates (negative values are mirrored, large ones clamped).
        xind, yind = abs(xind), abs(yind)
        grid_rows, grid_cols = label[scale].shape[0:2]
        yind = min(yind, grid_rows - 1)
        xind = min(xind, grid_cols - 1)

        label[scale][yind, xind, anchor_sel, :] = 0
        label[scale][yind, xind, anchor_sel, 0:4] = bbox_xywh
        label[scale][yind, xind, anchor_sel, 4:5] = 1.0
        label[scale][yind, xind, anchor_sel, 5:] = smooth_onehot

        # The box list is a ring buffer: once full, old entries are overwritten.
        bbox_ind = int(bbox_count[scale] % self.max_bbox_per_scale)
        bboxes_xywh[scale][bbox_ind, :4] = bbox_xywh
        bbox_count[scale] += 1

    for bbox in bboxes:
        bbox_coor = bbox[:4]
        bbox_class_ind = bbox[4]

        # Builtin float replaces the np.float alias, which NumPy 1.24 removed.
        onehot = np.zeros(self.num_classes, dtype=float)
        onehot[bbox_class_ind] = 1.0
        smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

        # Corner form (x1, y1, x2, y2) -> centre form (x, y, w, h) in input-image pixels.
        bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5, bbox_coor[2:] - bbox_coor[:2]], axis=-1)
        # One row per scale: the same box expressed in that scale's grid units.
        bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / self.strides[:, np.newaxis]

        iou = []
        exist_positive = False
        for i in range(3):
            # Anchors are centred on the cell that contains the box centre.
            anchors_xywh = np.zeros((self.anchor_per_scale, 4))
            anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
            anchors_xywh[:, 2:4] = self.anchors[i]

            iou_scale = self.bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
            iou.append(iou_scale)
            iou_mask = iou_scale > 0.3

            if np.any(iou_mask):
                _assign(i, iou_mask, bbox_xywh_scaled[i, 0:2], bbox_xywh, smooth_onehot)
                exist_positive = True

        if not exist_positive:
            # No anchor passed the threshold: fall back to the best one overall.
            best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
            best_detect, best_anchor = divmod(int(best_anchor_ind), self.anchor_per_scale)
            _assign(best_detect, best_anchor, bbox_xywh_scaled[best_detect, 0:2], bbox_xywh, smooth_onehot)

    label_sbbox, label_mbbox, label_lbbox = label
    sbboxes, mbboxes, lbboxes = bboxes_xywh
    return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
以上是数据预处理部分的代码,并没有贴全部,只是按主要框架贴了出来。