yolox的整个模型配置从model、dataset和训练策略的配置如下
_base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py']
# model settings
model = dict(
type='YOLOX',
backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5),
neck=dict(
type='YOLOXPAFPN',
in_channels=[128, 256, 512],
out_channels=128,
num_csp_blocks=1),
bbox_head=dict(
type='YOLOXHead', num_classes=80, in_channels=128, feat_channels=128),
train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
# In order to align the source code, the threshold of the val phase is
# 0.01, and the threshold of the test phase is 0.001.
test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
# dataset settings
data_root = 'data/coco/'
dataset_type = 'CocoDataset'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (640, 640)
train_pipeline = [
dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
dict(
type='RandomAffine',
scaling_ratio_range=(0.1, 2),
border=(-img_scale[0] // 2, -img_scale[1] // 2)),
dict(
type='MixUp',
img_scale=img_scale,
ratio_range=(0.8, 1.6),
pad_val=114.0),
dict(
type='PhotoMetricDistortion',
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Resize', keep_ratio=True),
dict(type='Pad', pad_to_square=True, pad_val=114.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
train_dataset = dict(
type='MultiImageMixDataset',
dataset=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
pipeline=[
dict(type='LoadImageFromFile', to_float32=True),
dict(type='LoadAnnotations', with_bbox=True)
],
filter_empty_gt=False,
),
pipeline=train_pipeline,
dynamic_scale=img_scale)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=img_scale,
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Pad', size=img_scale, pad_val=114.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=2,
train=train_dataset,
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
pipeline=test_pipeline))
# optimizer
# default 8 gpu
optimizer = dict(
type='SGD',
lr=0.01,
momentum=0.9,
weight_decay=5e-4,
nesterov=True,
paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
_delete_=True,
policy='YOLOX',
warmup='exp',
by_epoch=False,
warmup_by_epoch=True,
warmup_ratio=1,
warmup_iters=5, # 5 epoch
num_last_epochs=15,
min_lr_ratio=0.05)
runner = dict(type='EpochBasedRunner', max_epochs=300)
resume_from = None
interval = 10
custom_hooks = [
dict(type='YOLOXModeSwitchHook', num_last_epochs=15, priority=48),
dict(
type='SyncRandomSizeHook',
ratio_range=(14, 26),
img_scale=img_scale,
priority=48),
dict(
type='SyncNormHook',
num_last_epochs=15,
interval=interval,
priority=48),
dict(type='ExpMomentumEMAHook', resume_from=resume_from, priority=49)
]
checkpoint_config = dict(interval=interval)
evaluation = dict(interval=interval, metric='bbox')
log_config = dict(interval=50)
数据集的建立build_dataset函数是入口
def build_dataset(cfg, default_args=None):
from .dataset_wrappers import (ConcatDataset, RepeatDataset,
ClassBalancedDataset, MultiImageMixDataset)
if isinstance(cfg, (list, tuple)):
dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg])
elif cfg['type'] == 'ConcatDataset':
dataset = ConcatDataset(
[build_dataset(c, default_args) for c in cfg['datasets']],
cfg.get('separate_eval', True))
elif cfg['type'] == 'RepeatDataset':
dataset = RepeatDataset(
build_dataset(cfg['dataset'], default_args), cfg['times'])
elif cfg['type'] == 'ClassBalancedDataset':
dataset = ClassBalancedDataset(
build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
elif cfg['type'] == 'MultiImageMixDataset':
cp_cfg = copy.deepcopy(cfg)
cp_cfg['dataset'] = build_dataset(cp_cfg['dataset'])
cp_cfg.pop('type')
dataset = MultiImageMixDataset(**cp_cfg)
elif isinstance(cfg.get('ann_file'), (list, tuple)):
dataset = _concat_dataset(cfg, default_args)
else:
dataset = build_from_cfg(cfg, DATASETS, default_args)
return dataset
可以看到为实现mosaic和mixup新增了MultiImageMixDataset类其定义,在MultiImageMixDataset分支下循环调用了build_dataset,其调用的是class CocoDataset(CustomDataset)返回的是一个pytorch的dataset类型,将其输入到MultiImageMixDataset中。
@DATASETS.register_module()
class MultiImageMixDataset:
"""A wrapper of multiple images mixed dataset.
Suitable for training on multiple images mixed data augmentation like
mosaic and mixup. For the augmentation pipeline of mixed image data,
the `get_indexes` method needs to be provided to obtain the image
indexes, and you can set `skip_flags` to change the pipeline running
process. At the same time, we provide the `dynamic_scale` parameter
to dynamically change the output image size.
Args:
dataset (:obj:`CustomDataset`): The dataset to be mixed.
pipeline (Sequence[dict]): Sequence of transform object or
config dict to be composed.
dynamic_scale (tuple[int], optional): The image scale can be changed
dynamically. Default to None.
skip_type_keys (list[str], optional): Sequence of type string to
be skip pipeline. Default to None.
"""
def __init__(self,
dataset,
pipeline,
dynamic_scale=None,
skip_type_keys=None):
assert isinstance(pipeline, collections.abc.Sequence)
if skip_type_keys is not None:
assert all([
isinstance(skip_type_key, str)
for skip_type_key in skip_type_keys
])
self._skip_type_keys = skip_type_keys
self.pipeline = []
self.pipeline_types = []
for transform in pipeline:
if isinstance(transform, dict):
self.pipeline_types.append(transform['type'])
transform = build_from_cfg(transform, PIPELINES)
self.pipeline.append(transform)
else:
raise TypeError('pipeline must be a dict')
self.dataset = dataset
self.CLASSES = dataset.CLASSES
if hasattr(self.dataset, 'flag'):
self.flag = dataset.flag
self.num_samples = len(dataset)
if dynamic_scale is not None:
assert isinstance(dynamic_scale, tuple)
self._dynamic_scale = dynamic_scale
def __len__(self):
return self.num_samples
def __getitem__(self, idx):
results = copy.deepcopy(self.dataset[idx])
for (transform, transform_type) in zip(self.pipeline,
self.pipeline_types):
if self._skip_type_keys is not None and \
transform_type in self._skip_type_keys:
continue
if hasattr(transform, 'get_indexes'):
indexes = transform.get_indexes(self.dataset)
if not isinstance(indexes, collections.abc.Sequence):
indexes = [indexes]
mix_results = [
copy.deepcopy(self.dataset[index]) for index in indexes
]
results['mix_results'] = mix_results
if self._dynamic_scale is not None:
# Used for subsequent pipeline to automatically change
# the output image size. E.g MixUp, Resize.
results['scale'] = self._dynamic_scale
results = transform(results)
if 'mix_results' in results:
results.pop('mix_results')
return results
def update_skip_type_keys(self, skip_type_keys):
"""Update skip_type_keys. It is called by an external hook.
Args:
skip_type_keys (list[str], optional): Sequence of type
string to be skip pipeline.
"""
assert all([
isinstance(skip_type_key, str) for skip_type_key in skip_type_keys
])
self._skip_type_keys = skip_type_keys
def update_dynamic_scale(self, dynamic_scale):
"""Update dynamic_scale. It is called by an external hook.
Args:
dynamic_scale (tuple[int]): The image scale can be
changed dynamically.
"""
assert isinstance(dynamic_scale, tuple)
self._dynamic_scale = dynamic_scale
注意的get_indexes函数他主要增添了results[‘mix_results’],然后通过用 results = transform(results)函数调用了新增的mosaic,miuup
@PIPELINES.register_module()
class Mosaic:
"""Mosaic augmentation.
Given 4 images, mosaic transform combines them into
one output image. The output image is composed of the parts from each sub-
image.
.. code:: text
mosaic transform
center_x
+------------------------------+
| pad | pad |
| +-----------+ |
| | | |
| | image1 |--------+ |
| | | | |
| | | image2 | |
center_y |----+-------------+-----------|
| | cropped | |
|pad | image3 | image4 |
| | | |
+----|-------------+-----------+
| |
+-------------+
The mosaic transform steps are as follows:
1. Choose the mosaic center as the intersections of 4 images
2. Get the left top image according to the index, and randomly
sample another 3 images from the custom dataset.
3. Sub image will be cropped if image is larger than mosaic patch
Args:
img_scale (Sequence[int]): Image size after mosaic pipeline of single
image. Default to (640, 640).
center_ratio_range (Sequence[float]): Center ratio range of mosaic
output. Default to (0.5, 1.5).
min_bbox_size (int | float): The minimum pixel for filtering
invalid bboxes after the mosaic pipeline. Default to 0.
pad_val (int): Pad value. Default to 114.
"""
def __init__(self,
img_scale=(640, 640),
center_ratio_range=(0.5, 1.5),
min_bbox_size=0,
pad_val=114):
assert isinstance(img_scale, tuple)
self.img_scale = img_scale
self.center_ratio_range = center_ratio_range
self.min_bbox_size = min_bbox_size
self.pad_val = pad_val
def __call__(self, results):
"""Call function to make a mosaic of image.
Args:
results (dict): Result dict.
Returns:
dict: Result dict with mosaic transformed.
"""
results = self._mosaic_transform(results)
return results
def get_indexes(self, dataset):
"""Call function to collect indexes.
Args:
dataset (:obj:`MultiImageMixDataset`): The dataset.
Returns:
list: indexes.
"""
indexes = [random.randint(0, len(dataset)) for _ in range(3)]
return indexes
def _mosaic_transform(self, results):
"""Mosaic transform function.
Args:
results (dict): Result dict.
Returns:
dict: Updated result dict.
"""
assert 'mix_results' in results
mosaic_labels = []
mosaic_bboxes = []
if len(results['img'].shape) == 3:
mosaic_img = np.full(
(int(self.img_scale[0] * 2), int(self.img_scale[1] * 2), 3),
self.pad_val,
dtype=results['img'].dtype)
else:
mosaic_img = np.full(
(int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)),
self.pad_val,
dtype=results['img'].dtype)
# mosaic center x, y
center_x = int(
random.uniform(*self.center_ratio_range) * self.img_scale[1])
center_y = int(
random.uniform(*self.center_ratio_range) * self.img_scale[0])
center_position = (center_x, center_y)
loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
for i, loc in enumerate(loc_strs):
if loc == 'top_left':
results_patch = copy.deepcopy(results)
else:
results_patch = copy.deepcopy(results['mix_results'][i - 1])
img_i = results_patch['img']
h_i, w_i = img_i.shape[:2]
# keep_ratio resize
scale_ratio_i = min(self.img_scale[0] / h_i,
self.img_scale[1] / w_i)
img_i = mmcv.imresize(
img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
# compute the combine parameters
paste_coord, crop_coord = self._mosaic_combine(
loc, center_position, img_i.shape[:2][::-1])
x1_p, y1_p, x2_p, y2_p = paste_coord
x1_c, y1_c, x2_c, y2_c = crop_coord
# crop and paste image
mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
# adjust coordinate
gt_bboxes_i = results_patch['gt_bboxes']
gt_labels_i = results_patch['gt_labels']
if gt_bboxes_i.shape[0] > 0:
padw = x1_p - x1_c
padh = y1_p - y1_c
gt_bboxes_i[:, 0::2] = \
scale_ratio_i * gt_bboxes_i[:, 0::2] + padw
gt_bboxes_i[:, 1::2] = \
scale_ratio_i * gt_bboxes_i[:, 1::2] + padh
mosaic_bboxes.append(gt_bboxes_i)
mosaic_labels.append(gt_labels_i)
if len(mosaic_labels) > 0:
mosaic_bboxes = np.concatenate(mosaic_bboxes, 0)
mosaic_bboxes[:, 0::2] = np.clip(mosaic_bboxes[:, 0::2], 0,
2 * self.img_scale[1])
mosaic_bboxes[:, 1::2] = np.clip(mosaic_bboxes[:, 1::2], 0,
2 * self.img_scale[0])
mosaic_labels = np.concatenate(mosaic_labels, 0)
mosaic_bboxes, mosaic_labels = \
self._filter_box_candidates(mosaic_bboxes, mosaic_labels)
results['img'] = mosaic_img
results['img_shape'] = mosaic_img.shape
results['ori_shape'] = mosaic_img.shape
results['gt_bboxes'] = mosaic_bboxes
results['gt_labels'] = mosaic_labels
return results
def _mosaic_combine(self, loc, center_position_xy, img_shape_wh):
"""Calculate global coordinate of mosaic image and local coordinate of
cropped sub-image.
Args:
loc (str): Index for the sub-image, loc in ('top_left',
'top_right', 'bottom_left', 'bottom_right').
center_position_xy (Sequence[float]): Mixing center for 4 images,
(x, y).
img_shape_wh (Sequence[int]): Width and height of sub-image
Returns:
tuple[tuple[float]]: Corresponding coordinate of pasting and
cropping
- paste_coord (tuple): paste corner coordinate in mosaic image.
- crop_coord (tuple): crop corner coordinate in mosaic image.
"""
assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')
if loc == 'top_left':
# index0 to top left part of image
x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
max(center_position_xy[1] - img_shape_wh[1], 0), \
center_position_xy[0], \
center_position_xy[1]
crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - (
y2 - y1), img_shape_wh[0], img_shape_wh[1]
elif loc == 'top_right':
# index1 to top right part of image
x1, y1, x2, y2 = center_position_xy[0], \
max(center_position_xy[1] - img_shape_wh[1], 0), \
min(center_position_xy[0] + img_shape_wh[0],
self.img_scale[1] * 2), \
center_position_xy[1]
crop_coord = 0, img_shape_wh[1] - (y2 - y1), min(
img_shape_wh[0], x2 - x1), img_shape_wh[1]
elif loc == 'bottom_left':
# index2 to bottom left part of image
x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
center_position_xy[1], \
center_position_xy[0], \
min(self.img_scale[0] * 2, center_position_xy[1] +
img_shape_wh[1])
crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min(
y2 - y1, img_shape_wh[1])
else:
# index3 to bottom right part of image
x1, y1, x2, y2 = center_position_xy[0], \
center_position_xy[1], \
min(center_position_xy[0] + img_shape_wh[0],
self.img_scale[1] * 2), \
min(self.img_scale[0] * 2, center_position_xy[1] +
img_shape_wh[1])
crop_coord = 0, 0, min(img_shape_wh[0],
x2 - x1), min(y2 - y1, img_shape_wh[1])
paste_coord = x1, y1, x2, y2
return paste_coord, crop_coord
def _filter_box_candidates(self, bboxes, labels):
"""Filter out bboxes too small after Mosaic."""
bbox_w = bboxes[:, 2] - bboxes[:, 0]
bbox_h = bboxes[:, 3] - bboxes[:, 1]
valid_inds = (bbox_w > self.min_bbox_size) & \
(bbox_h > self.min_bbox_size)
valid_inds = np.nonzero(valid_inds)[0]
return bboxes[valid_inds], labels[valid_inds]
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'img_scale={self.img_scale}, '
repr_str += f'center_ratio_range={self.center_ratio_range})'
repr_str += f'pad_val={self.pad_val})'
return repr_str
可以看到其主要是对输入的mix_results内容进行更改。
yolox使用了多个hook,下边主要将ema hook和yoloxmodeswitchhook
class BaseEMAHook(Hook):
"""Exponential Moving Average Hook.
Use Exponential Moving Average on all parameters of model in training
process. All parameters have a ema backup, which update by the formula
as below. EMAHook takes priority over EvalHook and CheckpointHook. Note,
the original model parameters are actually saved in ema field after train.
Args:
momentum (float): The momentum used for updating ema parameter.
Ema's parameter are updated with the formula:
`ema_param = (1-momentum) * ema_param + momentum * cur_param`.
Defaults to 0.0002.
skip_buffers (bool): Whether to skip the model buffers, such as
batchnorm running stats (running_mean, running_var), it does not
perform the ema operation. Default to False.
interval (int): Update ema parameter every interval iteration.
Defaults to 1.
resume_from (str, optional): The checkpoint path. Defaults to None.
momentum_fun (func, optional): The function to change momentum
during early iteration (also warmup) to help early training.
It uses `momentum` as a constant. Defaults to None.
"""
def __init__(self,
momentum=0.0002,
interval=1,
skip_buffers=False,
resume_from=None,
momentum_fun=None):
assert 0 < momentum < 1
self.momentum = momentum
self.skip_buffers = skip_buffers
self.interval = interval
self.checkpoint = resume_from
self.momentum_fun = momentum_fun
def before_run(self, runner):
"""To resume model with it's ema parameters more friendly.
Register ema parameter as ``named_buffer`` to model.
"""
model = runner.model
if is_module_wrapper(model):
model = model.module
self.param_ema_buffer = {}
if self.skip_buffers:
self.model_parameters = dict(model.named_parameters())
else:
self.model_parameters = model.state_dict()
for name, value in self.model_parameters.items():
# "." is not allowed in module's buffer name
buffer_name = f"ema_{name.replace('.', '_')}"
self.param_ema_buffer[name] = buffer_name
model.register_buffer(buffer_name, value.data.clone())
self.model_buffers = dict(model.named_buffers())
if self.checkpoint is not None:
runner.resume(self.checkpoint)
def get_momentum(self, runner):
return self.momentum_fun(runner.iter) if self.momentum_fun else \
self.momentum
def after_train_iter(self, runner):
"""Update ema parameter every self.interval iterations."""
if (runner.iter + 1) % self.interval != 0:
return
momentum = self.get_momentum(runner)
for name, parameter in self.model_parameters.items():
# exclude num_tracking
if parameter.dtype.is_floating_point:
buffer_name = self.param_ema_buffer[name]
buffer_parameter = self.model_buffers[buffer_name]
buffer_parameter.mul_(1 - momentum).add_(
parameter.data, alpha=momentum)
def after_train_epoch(self, runner):
"""We load parameter values from ema backup to model before the
EvalHook."""
self._swap_ema_parameters()
def before_train_epoch(self, runner):
"""We recover model's parameter from ema backup after last epoch's
EvalHook."""
self._swap_ema_parameters()
def _swap_ema_parameters(self):
"""Swap the parameter of model with parameter in ema_buffer."""
for name, value in self.model_parameters.items():
temp = value.data.clone()
ema_buffer = self.model_buffers[self.param_ema_buffer[name]]
value.data.copy_(ema_buffer.data)
ema_buffer.data.copy_(temp)
@HOOKS.register_module()
class ExpMomentumEMAHook(BaseEMAHook):
"""EMAHook using exponential momentum strategy.
Args:
total_iter (int): The total number of iterations of EMA momentum.
Defaults to 2000.
"""
def __init__(self, total_iter=2000, **kwargs):
super(ExpMomentumEMAHook, self).__init__(**kwargs)
self.momentum_fun = lambda x: (1 - self.momentum) * math.exp(-(
1 + x) / total_iter) + self.momentum
ema hook的主要是实现hook类中的before_train_epoch,after_train_epoch等函数,具体实现哪几个看你实现的功能要求。整体代码比较明朗,主要是提供一个hook实现的思路。
@HOOKS.register_module()
class YOLOXModeSwitchHook(Hook):
"""Switch the mode of YOLOX during training.
This hook turns off the mosaic and mixup data augmentation and switches
to use L1 loss in bbox_head.
Args:
num_last_epochs (int): The number of latter epochs in the end of the
training to close the data augmentation and switch to L1 loss.
Default: 15.
skip_type_keys (list[str], optional): Sequence of type string to be
skip pipeline. Default: ('Mosaic', 'RandomAffine', 'MixUp')
"""
def __init__(self,
num_last_epochs=15,
skip_type_keys=('Mosaic', 'RandomAffine', 'MixUp')):
self.num_last_epochs = num_last_epochs
self.skip_type_keys = skip_type_keys
def before_train_epoch(self, runner):
"""Close mosaic and mixup augmentation and switches to use L1 loss."""
epoch = runner.epoch
train_loader = runner.data_loader
model = runner.model
if is_module_wrapper(model):
model = model.module
if (epoch + 1) == runner.max_epochs - self.num_last_epochs:
runner.logger.info('No mosaic and mixup aug now!')
train_loader.dataset.update_skip_type_keys(self.skip_type_keys)
runner.logger.info('Add additional L1 loss now!')
model.bbox_head.use_l1 = True
这里主要调用了train_loader.dataset.update_skip_type_keys来实现关闭后几个epoch的mosaic。
要注意的是在MultiImageMixDataset中要实现此方法。
整个model的register更简单一些只需要注意其输入的输出即可。
# Copyright (c) OpenMMLab. All rights reserved.
import math
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
from mmcv.runner import BaseModule
from torch.nn.modules.batchnorm import _BatchNorm
from ..builder import BACKBONES
from ..utils import CSPLayer
class Focus(nn.Module):
"""Focus width and height information into channel space.
Args:
in_channels (int): The input channels of this Module.
out_channels (int): The output channels of this Module.
kernel_size (int): The kernel size of the convolution. Default: 1
stride (int): The stride of the convolution. Default: 1
conv_cfg (dict): Config dict for convolution layer. Default: None,
which means using conv2d.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='BN', momentum=0.03, eps=0.001).
act_cfg (dict): Config dict for activation layer.
Default: dict(type='Swish').
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
conv_cfg=None,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish')):
super().__init__()
self.conv = ConvModule(
in_channels * 4,
out_channels,
kernel_size,
stride,
padding=(kernel_size - 1) // 2,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
def forward(self, x):
# shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
patch_top_left = x[..., ::2, ::2]
patch_top_right = x[..., ::2, 1::2]
patch_bot_left = x[..., 1::2, ::2]
patch_bot_right = x[..., 1::2, 1::2]
x = torch.cat(
(
patch_top_left,
patch_bot_left,
patch_top_right,
patch_bot_right,
),
dim=1,
)
return self.conv(x)
class SPPBottleneck(BaseModule):
"""Spatial pyramid pooling layer used in YOLOv3-SPP.
Args:
in_channels (int): The input channels of this Module.
out_channels (int): The output channels of this Module.
kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling
layers. Default: (5, 9, 13).
conv_cfg (dict): Config dict for convolution layer. Default: None,
which means using conv2d.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='BN').
act_cfg (dict): Config dict for activation layer.
Default: dict(type='Swish').
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
"""
def __init__(self,
in_channels,
out_channels,
kernel_sizes=(5, 9, 13),
conv_cfg=None,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
init_cfg=None):
super().__init__(init_cfg)
mid_channels = in_channels // 2
self.conv1 = ConvModule(
in_channels,
mid_channels,
1,
stride=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
self.poolings = nn.ModuleList([
nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
for ks in kernel_sizes
])
conv2_channels = mid_channels * (len(kernel_sizes) + 1)
self.conv2 = ConvModule(
conv2_channels,
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
def forward(self, x):
x = self.conv1(x)
x = torch.cat([x] + [pooling(x) for pooling in self.poolings], dim=1)
x = self.conv2(x)
return x
@BACKBONES.register_module()
class CSPDarknet(BaseModule):
"""CSP-Darknet backbone used in YOLOv5 and YOLOX.
Args:
arch (str): Architecture of CSP-Darknet, from {P5, P6}.
Default: P5.
deepen_factor (float): Depth multiplier, multiply number of
channels in each layer by this amount. Default: 1.0.
widen_factor (float): Width multiplier, multiply number of
blocks in CSP layer by this amount. Default: 1.0.
out_indices (Sequence[int]): Output from which stages.
Default: (2, 3, 4).
frozen_stages (int): Stages to be frozen (stop grad and set eval
mode). -1 means not freezing any parameters. Default: -1.
use_depthwise (bool): Whether to use depthwise separable convolution.
Default: False.
arch_ovewrite(list): Overwrite default arch settings. Default: None.
spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP
layers. Default: (5, 9, 13).
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True).
act_cfg (dict): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
norm_eval (bool): Whether to set norm layers to eval mode, namely,
freeze running stats (mean and var). Note: Effect on Batch Norm
and its variants only.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
Example:
>>> from mmdet.models import CSPDarknet
>>> import torch
>>> self = CSPDarknet(depth=53)
>>> self.eval()
>>> inputs = torch.rand(1, 3, 416, 416)
>>> level_outputs = self.forward(inputs)
>>> for level_out in level_outputs:
... print(tuple(level_out.shape))
...
(1, 256, 52, 52)
(1, 512, 26, 26)
(1, 1024, 13, 13)
"""
# From left to right:
# in_channels, out_channels, num_blocks, add_identity, use_spp
arch_settings = {
'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False],
[256, 512, 9, True, False], [512, 1024, 3, False, True]],
'P6': [[64, 128, 3, True, False], [128, 256, 9, True, False],
[256, 512, 9, True, False], [512, 768, 3, True, False],
[768, 1024, 3, False, True]]
}
def __init__(self,
arch='P5',
deepen_factor=1.0,
widen_factor=1.0,
out_indices=(2, 3, 4),
frozen_stages=-1,
use_depthwise=False,
arch_ovewrite=None,
spp_kernal_sizes=(5, 9, 13),
conv_cfg=None,
norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
act_cfg=dict(type='Swish'),
norm_eval=False,
init_cfg=dict(
type='Kaiming',
layer='Conv2d',
a=math.sqrt(5),
distribution='uniform',
mode='fan_in',
nonlinearity='leaky_relu')):
super().__init__(init_cfg)
arch_setting = self.arch_settings[arch]
if arch_ovewrite:
arch_setting = arch_ovewrite
assert set(out_indices).issubset(
i for i in range(len(arch_setting) + 1))
if frozen_stages not in range(-1, len(arch_setting) + 1):
raise ValueError('frozen_stages must be in range(-1, '
'len(arch_setting) + 1). But received '
f'{frozen_stages}')
self.out_indices = out_indices
self.frozen_stages = frozen_stages
self.use_depthwise = use_depthwise
self.norm_eval = norm_eval
conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
self.stem = Focus(
3,
int(arch_setting[0][0] * widen_factor),
kernel_size=3,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
self.layers = ['stem']
for i, (in_channels, out_channels, num_blocks, add_identity,
use_spp) in enumerate(arch_setting):
in_channels = int(in_channels * widen_factor)
out_channels = int(out_channels * widen_factor)
num_blocks = max(round(num_blocks * deepen_factor), 1)
stage = []
conv_layer = conv(
in_channels,
out_channels,
3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
stage.append(conv_layer)
if use_spp:
spp = SPPBottleneck(
out_channels,
out_channels,
kernel_sizes=spp_kernal_sizes,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
stage.append(spp)
csp_layer = CSPLayer(
out_channels,
out_channels,
num_blocks=num_blocks,
add_identity=add_identity,
use_depthwise=use_depthwise,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
stage.append(csp_layer)
self.add_module(f'stage{i + 1}', nn.Sequential(*stage))
self.layers.append(f'stage{i + 1}')
def _freeze_stages(self):
if self.frozen_stages >= 0:
for i in range(self.frozen_stages + 1):
m = getattr(self, self.layers[i])
m.eval()
for param in m.parameters():
param.requires_grad = False
def train(self, mode=True):
super(CSPDarknet, self).train(mode)
self._freeze_stages()
if mode and self.norm_eval:
for m in self.modules():
if isinstance(m, _BatchNorm):
m.eval()
def forward(self, x):
outs = []
for i, layer_name in enumerate(self.layers):
layer = getattr(self, layer_name)
x = layer(x)
if i in self.out_indices:
outs.append(x)
return tuple(outs)