mmdetection supports multi-GPU training in two modes: distributed and non-distributed. The official documentation recommends the distributed mode.
Let's start with the distributed mode, which mmdetection runs through tools/dist_train.sh. Usage is as follows:
./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
# CONFIG_FILE is the model config file, e.g. ./configs/faster_rcnn_r50_fpn_1x.py
# GPU_NUM is the number of GPUs to use
# optional arguments include "--validate", which evaluates on the val dataset during training
With --validate, the default is to run validation once per epoch. To change the interval, add the following to the model config file:
# e.g. in ./configs/faster_rcnn_r50_fpn_1x.py
evaluation = dict(interval=1)
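For example, to validate only every 2 epochs (the value 2 is purely illustrative):
evaluation = dict(interval=2)  # run validation once every 2 epochs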
If you open dist_train.sh, you can see that it still ends up calling tools/train.py.
However, dist_train.sh always hangs on my machine and I have not figured out why, so I tried the non-distributed mode instead.
The non-distributed mode simply calls tools/train.py directly, in the following form:
python tools/train.py ${CONFIG_FILE}
# CONFIG_FILE is the model config file, e.g. ./configs/faster_rcnn_r50_fpn_1x.py
The walkthrough below follows the call chain tools/train.py → mmdet/apis/train.py → mmcv/runner/runner.py.
To understand the training process, we first need to take a careful look at what train.py does:
# tools/train.py
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        cfg.optimizer['lr'] = cfg.optimizer['lr'] * cfg.gpus / 8

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    # build the model; cfg.model holds all model parameters loaded from CONFIG_FILE
    model = build_detector(
        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)

    # build the training dataset
    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        datasets.append(build_dataset(cfg.data.val))
    if cfg.checkpoint_config is not None:
        # save mmdet version, config file content and class names in
        # checkpoints as meta data
        cfg.checkpoint_config.meta = dict(
            mmdet_version=__version__,
            config=cfg.text,
            CLASSES=datasets[0].CLASSES)
    # add an attribute for visualization convenience
    model.CLASSES = datasets[0].CLASSES
    # start training
    train_detector(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=args.validate,
        logger=logger)
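One branch worth highlighting above is --autoscale_lr: it applies the linear scaling rule, i.e. the learning rate in the config is assumed to have been tuned for 8 GPUs and is rescaled in proportion to the number of GPUs actually used. A quick sketch of the arithmetic (0.02 is the base lr in the default faster_rcnn_r50_fpn_1x.py config; the GPU count is just an example):
base_lr = 0.02               # cfg.optimizer['lr'], tuned for 8 GPUs
gpus = 4                     # cfg.gpus, GPUs actually used (example value)
scaled_lr = base_lr * gpus / 8
print(scaled_lr)             # 0.01 -> half the GPUs, half the learning rate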
So tools/train.py builds the model, loads the dataset(s), and then hands everything to train_detector for training. Next, let's look inside train_detector.
# mmdet/apis/train.py
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    if distributed:
        _dist_train(model, dataset, cfg, validate=validate)
    else:
        _non_dist_train(model, dataset, cfg, validate=validate)
As you can see, train_detector only separates distributed training from non-distributed training. Since we chose the non-distributed mode, let's go straight to _non_dist_train.
# mmdet/apis/train.py
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    # start training
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
So _non_dist_train builds the data loaders, the optimizer, and the Runner; the Runner is what actually trains the network, via runner.run. Before moving on to runner.run (which lives in the mmcv dependency), one thing worth spelling out is the effective batch size in this mode; see the sketch below.
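A rough sketch, assuming build_dataloader with dist=False batches imgs_per_gpu samples for each of cfg.gpus devices, and MMDataParallel then splits that batch across the GPUs:
imgs_per_gpu = 2                       # cfg.data.imgs_per_gpu (example value)
gpus = 4                               # cfg.gpus (example value)
effective_batch_size = imgs_per_gpu * gpus
print(effective_batch_size)            # 8 images per optimizer step
With that noted, let's look at runner.run itself: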
# mmcv/runner/runner.py
class Runner(object):

    def run(self, data_loaders, workflow, max_epochs, **kwargs):
        """Start running.

        Args:
            data_loaders (list[:obj:`DataLoader`]): Dataloaders for training
                and validation.
            workflow (list[tuple]): A list of (phase, epochs) to specify the
                running order and epochs. E.g, [('train', 2), ('val', 1)] means
                running 2 epochs for training and 1 epoch for validation,
                iteratively.
            max_epochs (int): Total training epochs.
        """
        ...  # some code omitted
        while self.epoch < max_epochs:
            for i, flow in enumerate(workflow):
                mode, epochs = flow
                if isinstance(mode, str):  # self.train()
                    if not hasattr(self, mode):
                        raise ValueError(
                            'runner has no method named "{}" to run an epoch'.
                            format(mode))
                    # mode is either 'train' or 'val'; point epoch_runner
                    # at the corresponding method
                    epoch_runner = getattr(self, mode)
                elif callable(mode):  # custom train()
                    epoch_runner = mode
                else:
                    raise TypeError('mode in workflow must be a str or '
                                    'callable function, not {}'.format(
                                        type(mode)))
                for _ in range(epochs):
                    if mode == 'train' and self.epoch >= max_epochs:
                        return
                    epoch_runner(data_loaders[i], **kwargs)

        time.sleep(1)  # wait for some hooks like loggers to finish
        self.call_hook('after_run')

    def train(self, data_loader, **kwargs):
        self.model.train()
        self.mode = 'train'
        self.data_loader = data_loader
        self._max_iters = self._max_epochs * len(data_loader)
        self.call_hook('before_train_epoch')
        for i, data_batch in enumerate(data_loader):
            self._inner_iter = i
            self.call_hook('before_train_iter')
            # call batch_processor to run the forward pass
            outputs = self.batch_processor(
                self.model, data_batch, train_mode=True, **kwargs)
            if not isinstance(outputs, dict):
                raise TypeError('batch_processor() must return a dict')
            if 'log_vars' in outputs:
                self.log_buffer.update(outputs['log_vars'],
                                       outputs['num_samples'])
            self.outputs = outputs
            self.call_hook('after_train_iter')
            self._iter += 1

        self.call_hook('after_train_epoch')
        self._epoch += 1
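All the self.call_hook(...) calls above are how the Runner triggers registered hooks (LR scheduling, checkpointing, logging, and so on) at fixed points in the loop; register_training_hooks in _non_dist_train is what installed them. As a rough mental model, call_hook looks up the method named after the current stage on each registered hook and calls it with the runner itself. A toy sketch of the idea (simplified names, not mmcv's actual code):
# toy sketch only; the real hooks are mmcv classes such as LrUpdaterHook
class ToyHook:
    def before_train_iter(self, runner):
        pass  # e.g. an LR hook would adjust the learning rate here

    def after_train_iter(self, runner):
        print('finished iter', runner._iter)  # e.g. a logger hook would log here


class ToyRunner:
    def __init__(self):
        self._hooks = []
        self._iter = 0

    def call_hook(self, fn_name):
        # invoke the stage-specific method of every registered hook,
        # passing the runner so hooks can read or modify its state
        for hook in self._hooks:
            getattr(hook, fn_name)(self)


runner = ToyRunner()
runner._hooks.append(ToyHook())
runner.call_hook('before_train_iter')
runner.call_hook('after_train_iter')  # prints: finished iter 0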
Besides the hooks, two questions come up after reading the Runner code: first, what exactly is batch_processor? Second, where does workflow come from?
Let's settle the first question.
batch_processor is passed in as an argument when the Runner is constructed, so we go back to mmdet/apis/train.py, where the Runner is built; it turns out batch_processor is simply a function defined in mmdet/apis/train.py:
def batch_processor(model, data, train_mode):
    losses = model(**data)
    loss, log_vars = parse_losses(losses)
    outputs = dict(
        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
    return outputs
As you can see, this function feeds data into the model's forward pass via model(**data); anyone familiar with PyTorch will find this pattern familiar.
TODO: dig into the code behind model(**data).
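Here losses is the dict of named losses returned by the detector's forward pass in train mode (keys such as loss_cls and loss_bbox, each a tensor or a list of tensors), and parse_losses reduces it to a single scalar plus the values to log. Roughly, it does something along these lines (a simplified sketch of the idea, not the exact mmdet code):
from collections import OrderedDict

import torch


def parse_losses_sketch(losses):
    """Simplified stand-in for mmdet's parse_losses."""
    log_vars = OrderedDict()
    for name, value in losses.items():
        if isinstance(value, torch.Tensor):
            log_vars[name] = value.mean()
        elif isinstance(value, list):
            log_vars[name] = sum(v.mean() for v in value)
    # the total loss is the sum of every entry whose name contains 'loss'
    loss = sum(v for name, v in log_vars.items() if 'loss' in name)
    log_vars['loss'] = loss
    return loss, log_vars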
Now for the second question: the workflow is defined in the CONFIG_FILE, see the line:
workflow = [('train', 1)]
This means each cycle consists of 1 train epoch. It can be changed, for example:
workflow = [('train', 2), ('val', 1)]  # each cycle is 2 train epochs followed by 1 val epoch
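To make the interaction with Runner.run concrete, here is a tiny stand-alone simulation of the loop for workflow = [('train', 2), ('val', 1)] (illustrative only; it just records the order in which phases run, and mirrors the fact that only 'train' epochs count towards max_epochs):
workflow = [('train', 2), ('val', 1)]
max_epochs = 4
epoch = 0
order = []
while epoch < max_epochs:
    for mode, epochs in workflow:
        for _ in range(epochs):
            if mode == 'train' and epoch >= max_epochs:
                break
            order.append(mode)
            if mode == 'train':
                epoch += 1    # only train epochs advance self.epoch

print(order)  # ['train', 'train', 'val', 'train', 'train', 'val']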
Testing involves less code, so I'll cover it right here.
Testing also comes in two modes, distributed and non-distributed, and works much like training, except that one extra argument is needed: the checkpoint of network weights to load. The two invocations are as follows:
# single-gpu testing
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show]
# multi-gpu testing
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}]
The official documentation has plenty of examples to look at.
Next, let's look at what tools/test.py contains:
# tools/test.py
def single_gpu_test(model, data_loader, show=False):
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=not show, **data)
        results.append(result)

        if show:
            model.module.show_result(data, result)

        batch_size = data['img'][0].size(0)
        for _ in range(batch_size):
            prog_bar.update()
    return results


def main():
    ...  # argument-parsing and config-loading code omitted

    # build the dataloader
    # TODO: support multiple images per gpu (only minor changes are needed)
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        imgs_per_gpu=1,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # build the model and load checkpoint
    model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
    # old versions did not save class info in checkpoints, this workaround is
    # for backward compatibility
    if 'CLASSES' in checkpoint['meta']:
        model.CLASSES = checkpoint['meta']['CLASSES']
    else:
        model.CLASSES = dataset.CLASSES

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader, args.show)
    else:
        model = MMDistributedDataParallel(model.cuda())
        outputs = multi_gpu_test(model, data_loader, args.tmpdir)

    ...  # code for saving outputs and evaluation omitted
As you can see, main uses single_gpu_test for the forward computation, and inside single_gpu_test the model is called with model(return_loss=False, rescale=not show, **data), the same pattern as batch_processor in mmdet/apis/train.py, only with return_loss=False for inference.
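As a side note, if you only want to run a trained model on a few individual images rather than a whole test set, mmdetection also provides a higher-level interface in mmdet.apis; a minimal sketch (the config and checkpoint paths below are placeholders):
from mmdet.apis import init_detector, inference_detector

config_file = 'configs/faster_rcnn_r50_fpn_1x.py'                 # placeholder path
checkpoint_file = 'work_dirs/faster_rcnn_r50_fpn_1x/latest.pth'   # placeholder path

model = init_detector(config_file, checkpoint_file, device='cuda:0')
result = inference_detector(model, 'demo.jpg')
# for a plain detector, result is typically a list with one (n, 5) array of
# [x1, y1, x2, y2, score] rows per class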
One thing worth noting: during testing, the current code only supports forwarding one image per GPU at a time. (This puzzles me a little, since training supports multiple images per GPU; it does not seem hard to implement here as well.)
TODO: see whether testing with multiple images per GPU can be supported.