All of the code has been uploaded to my GitHub repository: https://github.com/zgcr/pytorch-ImageNet-CIFAR-COCO-VOC-training
If you find it useful, please give it a star!
All of the code below has been tested under PyTorch 1.4 and confirmed to run correctly.
In the earlier RetinaNet/FCOS implementations, the loss was computed by first calculating the classification/regression loss of each image separately and then averaging over the images. This treats every image as having the same weight. However, what a detector actually learns from are the annotated boxes, and the number of annotated boxes differs from image to image, so the images should not carry equal weight. I therefore changed every RetinaNet/FCOS loss to be computed over the whole batch at once and then averaged, where the denominator is the total number of positive anchors/points in the batch, a quantity that is linearly correlated with the number of annotated boxes. After this change, the loss computation is more parallel, the resulting gradients are more reasonable, the time for one training epoch drops by about 40%, and the trained models perform better.
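To make the change concrete, here is a minimal sketch (not the repository's exact implementation) of a batch-averaged focal loss: the loss is summed over all anchors of all images in the batch and divided once by the total number of positive anchors, rather than being averaged per image and then across images. The tensor shapes and the num_positive_anchors argument are illustrative assumptions.

import torch

def batch_averaged_focal_loss(cls_preds, cls_targets, num_positive_anchors,
                              alpha=0.25, gamma=2.0):
    # cls_preds:   (B, num_anchors, num_classes) sigmoid probabilities for the whole batch
    # cls_targets: (B, num_anchors, num_classes) one-hot targets for the whole batch
    # num_positive_anchors: total number of positive anchors in the whole batch (a Python int)
    cls_preds = torch.clamp(cls_preds, min=1e-4, max=1. - 1e-4)
    alpha_factor = torch.ones_like(cls_targets) * alpha
    alpha_factor = torch.where(cls_targets.eq(1.), alpha_factor,
                               1. - alpha_factor)
    pt = torch.where(cls_targets.eq(1.), cls_preds, 1. - cls_preds)
    focal_weight = alpha_factor * torch.pow(1. - pt, gamma)
    bce = -(cls_targets * torch.log(cls_preds) +
            (1. - cls_targets) * torch.log(1. - cls_preds))
    # one sum and one division for the whole batch, not one average per image
    return (focal_weight * bce).sum() / max(num_positive_anchors, 1)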
The training results of each model are as follows:
Network | resize | batch size | GPUs | apex | syncbn | epochs | mAP,mAR,loss | training time (hours) |
---|---|---|---|---|---|---|---|---|
ResNet50-RetinaNet | 667 | 20 | 2 RTX2080Ti | yes | no | 12 | 0.305,0.421,0.56 | 17.43 |
ResNet101-RetinaNet | 667 | 16 | 2 RTX2080Ti | yes | no | 12 | 0.306,0.420,0.55 | 22.06 |
ResNet50-RetinaNet | 1000 | 16 | 4 RTX2080Ti | yes | no | 12 | 0.332,0.458,0.57 | 26.25 |
ResNet50-FCOS | 667 | 24 | 2 RTX2080Ti | yes | no | 12 | 0.318,0.452,1.09 | 14.17 |
ResNet101-FCOS | 667 | 20 | 2 RTX2080Ti | yes | no | 12 | 0.342,0.475,1.07 | 19.20 |
ResNet50-FCOS | 1000 | 20 | 4 RTX2080Ti | yes | no | 12 | 0.361,0.502,1.10 | 18.92 |
ResNet50-FCOS | 1333 | 12 | 4 RTX2080Ti | yes | no | 24 | 0.381,0.534,1.03 | 37.73 |
During training, all resizing here uses the YOLOv3-style resize. Matching models by FLOPs, my resize=667 is equivalent to resize=400 in the RetinaNet paper, resize=1000 is equivalent to resize=600, and resize=1333 is equivalent to resize=800. As the table shows, FCOS at resize=1333 already reaches an mAP of 0.381, which is very close to the 0.386 reported in the original paper with all improvements combined.
Multi-scale training is used very widely in object detection. Here I adopt the multi-scale method from YOLOv3/YOLOv5. First, determine a range from the minimum to the maximum size: for example, if resize=416 without multi-scale, you can choose a factor range of [0.5, 1.5]; multiplying it by 416 gives the minimum and maximum sizes. Then choose a stride (32 in YOLOv3/YOLOv5) and collect all sizes between the minimum and maximum that are divisible by the stride. Finally, randomly pick one of these sizes, resize all images in the batch to that size, pad them into squares, and feed them to the network for training.
The multi-scale code is implemented as follows (note that when multi-scale is enabled, the resize operation is no longer applied in the transform):
import random

import cv2
import torch


class MultiScaleCollater():
    def __init__(self,
                 resize=512,
                 multi_scale_range=[0.5, 1.5],
                 stride=32,
                 use_multi_scale=False):
        self.resize = resize
        self.multi_scale_range = multi_scale_range
        self.stride = stride
        self.use_multi_scale = use_multi_scale

    def next(self, data):
        if self.use_multi_scale:
            # pick a random training size that is divisible by the stride
            min_resize = int(
                ((self.resize + self.stride) * self.multi_scale_range[0]) //
                self.stride * self.stride)
            max_resize = int(
                ((self.resize + self.stride) * self.multi_scale_range[1]) //
                self.stride * self.stride)
            final_resize = random.choice(
                range(min_resize, max_resize, self.stride))
        else:
            final_resize = self.resize

        imgs = [s['img'] for s in data]
        annots = [s['annot'] for s in data]
        scales = [s['scale'] for s in data]

        # resize every image so that its longer side equals final_resize,
        # then pad it into a final_resize x final_resize square
        padded_img = torch.zeros((len(imgs), final_resize, final_resize, 3))
        for i, image in enumerate(imgs):
            height, width, _ = image.shape
            max_image_size = max(height, width)
            resize_factor = final_resize / max_image_size
            resize_height, resize_width = int(height * resize_factor), int(
                width * resize_factor)

            image = cv2.resize(image, (resize_width, resize_height))
            padded_img[i, 0:resize_height,
                       0:resize_width] = torch.from_numpy(image)

            annots[i][:, :4] *= resize_factor
            scales[i] = scales[i] * resize_factor

        # pad the annotations of all images to the same length with -1
        max_num_annots = max(annot.shape[0] for annot in annots)
        if max_num_annots > 0:
            annot_padded = torch.ones((len(annots), max_num_annots, 5)) * (-1)
            for idx, annot in enumerate(annots):
                if annot.shape[0] > 0:
                    annot_padded[
                        idx, :annot.shape[0], :] = torch.from_numpy(annot)
        else:
            annot_padded = torch.ones((len(annots), 1, 5)) * (-1)

        # convert from NHWC to NCHW
        padded_img = padded_img.permute(0, 3, 1, 2)

        return {'img': padded_img, 'annot': annot_padded, 'scale': scales}
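A hypothetical usage sketch follows: the collater's next method is passed as the DataLoader's collate_fn. The dataset construction and the batch size below are placeholders, not the exact training script.

from torch.utils.data import DataLoader

# train_dataset is assumed to return samples of the form
# {'img': ..., 'annot': ..., 'scale': ...}, e.g. the CocoDetection dataset
# shown later in this article
collater = MultiScaleCollater(resize=416,
                              multi_scale_range=[0.5, 1.5],
                              stride=32,
                              use_multi_scale=True)
train_loader = DataLoader(train_dataset,
                          batch_size=16,
                          shuffle=True,
                          num_workers=4,
                          collate_fn=collater.next)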
In an earlier article I wrote a version of mosaic data augmentation based on my own understanding, but while recently reading the YOLOv3/YOLOv5 code I found that the official mosaic augmentation works somewhat differently. Suppose resize=416 without mosaic, so the images fed to the network are 416x416; with mosaic augmentation the images fed to the network become 832x832 (both sides doubled). First, 4 images are randomly taken from the dataset and each is resized to 416. Then a new 832x832 image is created and filled with the value 114, and a center point is randomly picked in the range 416x0.5 to 416x1.5, splitting the new image into four parts: top-left, top-right, bottom-right, and bottom-left. The 4 resized images are aligned to the center point by their bottom-right, bottom-left, top-left, and top-right corners respectively, cropped to the size of the corresponding part of the new image (any region that would extend beyond that part is cut off), and pasted into that part. As for the annotation boxes, any box that ends up outside the kept region is discarded.
The mosaic data augmentation is implemented as follows:
import os
import cv2
import torch
import numpy as np
import random
import math
from torch.utils.data import Dataset
from pycocotools.coco import COCO
import torch.nn.functional as F
class CocoDetection(Dataset):
    def __init__(self,
                 image_root_dir,
                 annotation_root_dir,
                 set='train2017',
                 resize=416,
                 use_mosaic=False,
                 mosaic_center_range=[0.5, 1.5],
                 transform=None):
        self.image_root_dir = image_root_dir
        self.annotation_root_dir = annotation_root_dir
        self.set_name = set
        self.resize = resize
        self.use_mosaic = use_mosaic
        self.mosaic_center_range = mosaic_center_range
        self.transform = transform

        self.coco = COCO(
            os.path.join(self.annotation_root_dir,
                         'instances_' + self.set_name + '.json'))

        self.load_classes()

    def load_classes(self):
        self.image_ids = self.coco.getImgIds()
        self.cat_ids = self.coco.getCatIds()
        self.categories = self.coco.loadCats(self.cat_ids)
        self.categories.sort(key=lambda x: x['id'])

        # category_id is an original id, coco_id is set from 0 to 79
        self.category_id_to_coco_label = {
            category['id']: i
            for i, category in enumerate(self.categories)
        }
        self.coco_label_to_category_id = {
            v: k
            for k, v in self.category_id_to_coco_label.items()
        }

    def __len__(self):
        return len(self.image_ids)
    def __getitem__(self, idx):
        if self.use_mosaic:
            # mosaic center x, y
            x_ctr, y_ctr = [
                int(
                    random.uniform(self.resize * self.mosaic_center_range[0],
                                   self.resize * self.mosaic_center_range[1]))
                for _ in range(2)
            ]
            # all 4 image indices: the current image plus 3 random ones
            imgs_indices = [idx] + [
                random.randint(0,
                               len(self.image_ids) - 1) for _ in range(3)
            ]

            final_annots = []
            # combined image built from 4 images; the fill value 114 / 255.
            # matches the [0, 1] float images returned by load_image
            combined_img = np.full((self.resize * 2, self.resize * 2, 3),
                                   114 / 255.,
                                   dtype=np.float32)

            for i, index in enumerate(imgs_indices):
                img = self.load_image(index)
                annot = self.load_annotations(index)

                origin_height, origin_width, _ = img.shape
                resize_factor = self.resize / max(origin_height, origin_width)
                resize_height, resize_width = int(
                    origin_height * resize_factor), int(origin_width *
                                                        resize_factor)

                img = cv2.resize(img, (resize_width, resize_height))
                annot[:, :4] *= resize_factor

                # top left img
                if i == 0:
                    # combined image coordinates
                    x1a, y1a, x2a, y2a = max(x_ctr - resize_width,
                                             0), max(y_ctr - resize_height,
                                                     0), x_ctr, y_ctr
                    # single img chosen area
                    x1b, y1b, x2b, y2b = resize_width - (
                        x2a - x1a), resize_height - (
                            y2a - y1a), resize_width, resize_height
                # top right img
                elif i == 1:
                    x1a, y1a, x2a, y2a = x_ctr, max(y_ctr - resize_height,
                                                    0), min(
                                                        x_ctr + resize_width,
                                                        self.resize * 2), y_ctr
                    x1b, y1b, x2b, y2b = 0, resize_height - (y2a - y1a), min(
                        resize_width, x2a - x1a), resize_height
                # bottom left img
                elif i == 2:
                    x1a, y1a, x2a, y2a = max(x_ctr - resize_width,
                                             0), y_ctr, x_ctr, min(
                                                 self.resize * 2,
                                                 y_ctr + resize_height)
                    x1b, y1b, x2b, y2b = resize_width - (x2a - x1a), 0, max(
                        x_ctr, resize_width), min(y2a - y1a, resize_height)
                # bottom right img
                elif i == 3:
                    x1a, y1a, x2a, y2a = x_ctr, y_ctr, min(
                        x_ctr + resize_width,
                        self.resize * 2), min(self.resize * 2,
                                              y_ctr + resize_height)
                    x1b, y1b, x2b, y2b = 0, 0, min(
                        resize_width, x2a - x1a), min(y2a - y1a,
                                                      resize_height)

                # paste the cropped area into the combined image
                # combined_img[ymin:ymax, xmin:xmax]
                combined_img[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]

                # shift the annotation boxes into combined image coordinates
                padw, padh = x1a - x1b, y1a - y1b
                if annot.shape[0] > 0:
                    annot[:, 0] = annot[:, 0] + padw
                    annot[:, 1] = annot[:, 1] + padh
                    annot[:, 2] = annot[:, 2] + padw
                    annot[:, 3] = annot[:, 3] + padh

                final_annots.append(annot)

            final_annots = np.concatenate(final_annots, axis=0)
            final_annots[:, 0:4] = np.clip(final_annots[:, 0:4], 0,
                                           self.resize * 2)
            # drop boxes whose remaining width or height is no more than 1 pixel
            final_annots = final_annots[final_annots[:, 2] -
                                        final_annots[:, 0] > 1]
            final_annots = final_annots[final_annots[:, 3] -
                                        final_annots[:, 1] > 1]

            sample = {'img': combined_img, 'annot': final_annots, 'scale': 1.}
        else:
            img = self.load_image(idx)
            annot = self.load_annotations(idx)
            scale = 1.

            origin_height, origin_width, _ = img.shape
            resize_factor = self.resize / max(origin_height, origin_width)
            resize_height, resize_width = int(
                origin_height * resize_factor), int(origin_width *
                                                    resize_factor)

            img = cv2.resize(img, (resize_width, resize_height))
            annot[:, :4] *= resize_factor
            scale *= resize_factor

            sample = {'img': img, 'annot': annot, 'scale': scale}

        if self.transform:
            sample = self.transform(sample)

        return sample
    def load_image(self, image_index):
        image_info = self.coco.loadImgs(self.image_ids[image_index])[0]
        path = os.path.join(self.image_root_dir, image_info['file_name'])
        img = cv2.imread(path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        return img.astype(np.float32) / 255.

    def load_annotations(self, image_index):
        # get ground truth annotations
        annotations_ids = self.coco.getAnnIds(
            imgIds=self.image_ids[image_index], iscrowd=False)
        annotations = np.zeros((0, 5))

        # some images appear to miss annotations
        if len(annotations_ids) == 0:
            return annotations

        # parse annotations
        coco_annotations = self.coco.loadAnns(annotations_ids)
        for _, a in enumerate(coco_annotations):
            # some annotations have basically no width / height, skip them
            if a['bbox'][2] < 1 or a['bbox'][3] < 1:
                continue

            annotation = np.zeros((1, 5))
            if a['bbox'][2] > 0 and a['bbox'][3] > 0:
                annotation[0, :4] = a['bbox']
                annotation[0, 4] = self.find_coco_label_from_category_id(
                    a['category_id'])

            annotations = np.append(annotations, annotation, axis=0)

        # transform from [x_min, y_min, w, h] to [x_min, y_min, x_max, y_max]
        annotations[:, 2] = annotations[:, 0] + annotations[:, 2]
        annotations[:, 3] = annotations[:, 1] + annotations[:, 3]

        return annotations

    def find_coco_label_from_category_id(self, category_id):
        return self.category_id_to_coco_label[category_id]

    def find_category_id_from_coco_label(self, coco_label):
        return self.coco_label_to_category_id[coco_label]

    def num_classes(self):
        return 80

    def image_aspect_ratio(self, image_index):
        image = self.coco.loadImgs(self.image_ids[image_index])[0]

        return float(image['width']) / float(image['height'])
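As a usage sketch, the dataset with mosaic enabled might be constructed as below; the directory paths are placeholders. With use_mosaic=True, each returned image has shape (2 * resize, 2 * resize, 3).

if __name__ == '__main__':
    train_dataset = CocoDetection(
        image_root_dir='/path/to/COCO2017/images/train2017',
        annotation_root_dir='/path/to/COCO2017/annotations',
        set='train2017',
        resize=416,
        use_mosaic=True,
        transform=None)

    sample = train_dataset[0]
    # with resize=416 and mosaic enabled, the image is 832x832
    print(sample['img'].shape, sample['annot'].shape)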