def main():
    args = parse_args()

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    # test mode
    cfg.data.test.test_mode = True

    dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True))
    if args.gpus == 1:
        model = build_detector(
            cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
        load_checkpoint(model, args.checkpoint)
        model = MMDataParallel(model, device_ids=[0])

        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            num_gpus=1,
            dist=False,
            shuffle=False)
        outputs = single_test(model, data_loader, args.show)
    else:
        model_args = cfg.model.copy()
        model_args.update(train_cfg=None, test_cfg=cfg.test_cfg)
        model_type = getattr(detectors, model_args.pop('type'))
        outputs = parallel_test(
            model_type,
            model_args,
            args.checkpoint,
            dataset,
            _data_func,
            range(args.gpus),
            workers_per_gpu=args.proc_per_gpu)

    if args.out:
        print('writing results to {}'.format(args.out))
        mmcv.dump(outputs, args.out)
        eval_types = args.eval
        if eval_types:
            print('Starting evaluate {}'.format(' and '.join(eval_types)))
            if eval_types == ['proposal_fast']:
                result_file = args.out
                coco_eval(result_file, eval_types, dataset.coco)
            else:
                if not isinstance(outputs[0], dict):
                    result_file = args.out + '.json'
                    results2json(dataset, outputs, result_file)
                    coco_eval(result_file, eval_types, dataset.coco)
                else:
                    for name in outputs[0]:
                        print('\nEvaluating {}'.format(name))
                        outputs_ = [out[name] for out in outputs]
                        result_file = args.out + '.{}.json'.format(name)
                        results2json(dataset, outputs_, result_file)
                        coco_eval(result_file, eval_types, dataset.coco)
Let's walk through the obj_from_dict method.
def obj_from_dict(info, parrent=None, default_args=None):
    """Initialize an object from dict.

    The dict must contain the key "type", which indicates the object type, it
    can be either a string or type, such as "list" or ``list``. Remaining
    fields are treated as the arguments for constructing the object.

    Args:
        info (dict): Object types and arguments.
        parrent (:class:`module`): Module which may contain expected object
            classes.
        default_args (dict, optional): Default arguments for initializing the
            object.

    Returns:
        any type: Object built from the dict.
    """
    assert isinstance(info, dict) and 'type' in info
    assert isinstance(default_args, dict) or default_args is None
    args = info.copy()
    obj_type = args.pop('type')
    if mmcv.is_str(obj_type):
        if parrent is not None:
            obj_type = getattr(parrent, obj_type)
        else:
            obj_type = sys.modules[obj_type]
    elif not isinstance(obj_type, type):
        raise TypeError('type must be a str or valid type, but got {}'.format(
            type(obj_type)))
    if default_args is not None:
        for name, value in default_args.items():
            args.setdefault(name, value)
    return obj_type(**args)
The input parameter info is a dict taken from the cfg — here, the test data config. The second parameter is the module that contains the expected object classes, and the third supplies default arguments for initializing the object.

First, the function asserts that info is a dict containing the key "type", and that default_args is either a dict or None. It then pops "type" out of a copy of the dict to get obj_type. If obj_type is a string and parrent is not None, the class is looked up with obj_type = getattr(parrent, obj_type). At test time, parrent is the datasets package and obj_type is the "type" field of the test dict in the cfg, so this effectively loads the dataset class of that name from the datasets package. If obj_type is neither a string nor a valid Python type, a TypeError is raised.

Next, if default_args is not None, each of its key/value pairs is merged into args via setdefault, i.e., added to the remaining fields of the test dict without overriding anything already there. Finally, obj_type(**args) is returned: obj_type is the dataset class and args is the dict of constructor arguments.

By the same token, given a cfg dict, a module, and default arguments, this function can instantiate any object; loading the test dataset is just one example.
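As a quick illustration, here is a toy example using Python's built-in datetime module rather than mmdetection code (note the import path of obj_from_dict may differ across mmcv versions):

import datetime
from mmcv.runner import obj_from_dict  # import path may vary by mmcv version

# 'type' names the class to look up inside the given module; the remaining
# keys become keyword arguments for the constructor.
d = obj_from_dict(dict(type='date', year=2019, month=1, day=1), datetime)
assert d == datetime.date(2019, 1, 1)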
Moving on: if args.gpus is 1, we take the single-GPU path and call build_detector to construct a model:
def build_detector(cfg, train_cfg=None, test_cfg=None):
    from . import detectors
    return build(cfg, detectors, dict(train_cfg=train_cfg, test_cfg=test_cfg))
As you can see, this is a thin wrapper dedicated to building detectors.
def build(cfg, parrent=None, default_args=None):
    if isinstance(cfg, list):
        modules = [_build_module(cfg_, parrent, default_args) for cfg_ in cfg]
        return nn.Sequential(*modules)
    else:
        return _build_module(cfg, parrent, default_args)
The build function can handle a list of module configs, wrapping the built modules in an nn.Sequential, and delegates each single config to _build_module:
def _build_module(cfg, parrent=None, default_args=None):
    return cfg if isinstance(cfg, nn.Module) else obj_from_dict(
        cfg, parrent, default_args)
This is again the obj_from_dict function used earlier, which returns a detector module object according to the cfg. The only difference is the arguments: cfg=cfg.model, parrent=detectors, and default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg).
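To make the list behavior of build concrete, here is a toy sketch assembling torch.nn built-ins (not actual mmdetection modules) into an nn.Sequential:

import torch.nn as nn
from mmcv.runner import obj_from_dict  # import path may vary by mmcv version

# Each config dict names a class in torch.nn plus its constructor arguments.
cfgs = [
    dict(type='Conv2d', in_channels=3, out_channels=8, kernel_size=3),
    dict(type='ReLU'),
]
modules = [obj_from_dict(c, nn) for c in cfgs]
seq = nn.Sequential(*modules)   # what build() returns for a list cfg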
Next, load_checkpoint loads the checkpoint weights, MMDataParallel wraps the model and puts it on the GPU, and build_dataloader constructs the dataloader object. Let's look at that function:
def build_dataloader(dataset,
                     imgs_per_gpu,
                     workers_per_gpu,
                     num_gpus=1,
                     dist=True,
                     **kwargs):
The inputs are the dataset object, the number of images per GPU, the number of worker processes per GPU, the number of GPUs, and whether to run in distributed mode.
    if dist:
        rank, world_size = get_dist_info()
        sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size,
                                          rank)
        batch_size = imgs_per_gpu
        num_workers = workers_per_gpu
In the distributed branch, we fetch the rank and world size, then use DistributedGroupSampler to partition the dataset and obtain the sampler.
    else:
        if not kwargs.get('shuffle', True):
            sampler = None
        else:
            sampler = GroupSampler(dataset, imgs_per_gpu)
        batch_size = num_gpus * imgs_per_gpu
        num_workers = num_gpus * workers_per_gpu
In the non-distributed branch, GroupSampler produces the sampler (or sampler stays None when shuffling is disabled).
A note on kwargs.get(): per the Python docs, get(key[, default]) returns the value for key if key is in the dictionary, else default. If default is not given, it defaults to None, so this method never raises a KeyError.
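A two-line demonstration:

kwargs = dict(shuffle=False)
print(kwargs.get('shuffle', True))   # False: key present, default ignored
print(kwargs.get('missing', True))   # True: key absent, default returned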
Then torch.utils.data.DataLoader is called to produce the dataloader object.
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
        collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
        pin_memory=False,
        **kwargs)

    return data_loader
From the DataLoader docs: sampler (Sampler, optional) – defines the strategy to draw samples from the dataset. If specified, shuffle must be False.

So a custom sampler and shuffle=True are mutually exclusive: shuffling becomes the sampler's job (GroupSampler shuffles within its groups). That is why the branch above creates a GroupSampler only when shuffling is wanted, and otherwise leaves sampler as None — test.py passes shuffle=False, so it takes the sampler = None path and the flag is forwarded to the DataLoader unchanged.
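PyTorch enforces this at construction time; a minimal demonstration with a toy dataset (not mmdetection code):

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

ds = TensorDataset(torch.arange(10))
# Passing both a sampler and shuffle=True raises
# "ValueError: sampler option is mutually exclusive with shuffle".
loader = DataLoader(ds, sampler=SequentialSampler(ds), shuffle=True)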
From the docs: collate_fn (callable, optional) – merges a list of samples to form a mini-batch.

Let's look at the collate function:
def collate(batch, samples_per_gpu=1):
    """Puts each data field into a tensor/DataContainer with outer dimension
    batch size.

    Extend default_collate to add support for
    :type:`~mmcv.parallel.DataContainer`. There are 3 cases.

    1. cpu_only = True, e.g., meta data
    2. cpu_only = False, stack = True, e.g., images tensors
    3. cpu_only = False, stack = False, e.g., gt bboxes
    """
    if not isinstance(batch, collections.Sequence):
        raise TypeError("{} is not supported.".format(batch.dtype))

    if isinstance(batch[0], DataContainer):
        assert len(batch) % samples_per_gpu == 0
        stacked = []
        # case 1: cpu_only data, e.g. meta data
        if batch[0].cpu_only:
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
            return DataContainer(
                stacked, batch[0].stack, batch[0].padding_value, cpu_only=True)
        # case 2: stacked tensors, e.g. image tensors
        elif batch[0].stack:
            for i in range(0, len(batch), samples_per_gpu):
                assert isinstance(batch[i].data, torch.Tensor)
                # TODO: handle tensors other than 3d
                assert batch[i].dim() == 3
                c, h, w = batch[0].size()
                for sample in batch[i:i + samples_per_gpu]:
                    assert c == sample.size(0)
                    h = max(h, sample.size(1))
                    w = max(w, sample.size(2))
                padded_samples = [
                    F.pad(
                        sample.data,
                        (0, w - sample.size(2), 0, h - sample.size(1)),
                        value=sample.padding_value)
                    for sample in batch[i:i + samples_per_gpu]
                ]
                stacked.append(default_collate(padded_samples))
        # case 3: neither, e.g. gt bboxes
        else:
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
        return DataContainer(stacked, batch[0].stack, batch[0].padding_value)
    elif isinstance(batch[0], collections.Sequence):
        transposed = zip(*batch)
        return [collate(samples, samples_per_gpu) for samples in transposed]
    elif isinstance(batch[0], collections.Mapping):
        return {
            key: collate([d[key] for d in batch], samples_per_gpu)
            for key in batch[0]
        }
    else:
        return default_collate(batch)

# from torch.utils.data.dataloader import default_collate
The collate function handles four cases: DataContainer, collections.Sequence, collections.Mapping, and everything else, which falls through to torch's default_collate.
We will only look at the key DataContainer case, since that is a custom type defined by mmcv (it gets a dedicated explanation in the data-loading section).
Within the DataContainer branch there are three sub-cases:

1. cpu_only=True (e.g. meta data): the raw .data of each sample is grouped per GPU into lists, and the function returns DataContainer(stacked, batch[0].stack, batch[0].padding_value, cpu_only=True).
2. stack=True (e.g. image tensors): take the (c, h, w) size of batch[0]; for every slice [i:i + samples_per_gpu], find the maximum h and w within the slice, pad each sample up to that size with F.pad, then call torch's default_collate to turn the mini-batch list into stacked image tensors.
3. Neither (e.g. gt bboxes): the same grouping as case 1, but returned with cpu_only left as False: return DataContainer(stacked, batch[0].stack, batch[0].padding_value).
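To make the case-2 padding concrete, here is a standalone sketch with made-up tensor sizes (plain PyTorch, not the mmcv code itself):

import torch
import torch.nn.functional as F

# Two 3-channel "images" of different spatial sizes.
a = torch.zeros(3, 4, 6)
b = torch.zeros(3, 5, 5)
# Pad both up to the per-slice maximum (h=5, w=6) on the right/bottom,
# exactly the (0, w - W, 0, h - H) padding used in collate, then stack.
h = max(a.size(1), b.size(1))
w = max(a.size(2), b.size(2))
padded = [
    F.pad(t, (0, w - t.size(2), 0, h - t.size(1)), value=0)
    for t in (a, b)
]
batch = torch.stack(padded)
assert batch.shape == (2, 3, 5, 6)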
From the DataLoader docs: pin_memory (bool, optional) – If True, the data loader will copy tensors into CUDA pinned memory before returning them.
In the CUDA architecture, host memory comes in two kinds: pageable memory and page-locked (pinned) memory. Pageable memory is allocated on the host by the OS API malloc(), while page-locked memory is allocated in host memory by the CUDA function cudaHostAlloc(). The key property of page-locked memory is that the host OS will never page or swap it out, so it is guaranteed to stay resident in physical memory. Because the GPU knows the physical address of page-locked memory, it can copy data between host and device via Direct Memory Access (DMA), which is faster. The trade-off is that every page-locked allocation consumes physical memory that cannot be swapped to disk, so pinned memory is more expensive than ordinary pageable memory allocated with malloc().
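In PyTorch this corresponds to Tensor.pin_memory() plus a non-blocking copy; a quick illustration, assuming a CUDA device is available:

import torch

t = torch.randn(3, 224, 224)
t_pinned = t.pin_memory()                  # page-locked host copy
t_gpu = t_pinned.cuda(non_blocking=True)   # async host-to-device copy via DMA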
Everything above handles the single-GPU case: we obtain the model and data_loader, then call the single_test function to get outputs.
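single_test is defined in test.py itself; roughly, it is a plain evaluation loop like the following sketch (a reconstruction of the behavior, not the verbatim source):

def single_test(model, data_loader, show=False):
    model.eval()
    results = []
    prog_bar = mmcv.ProgressBar(len(data_loader.dataset))
    for data in data_loader:
        with torch.no_grad():
            # test-mode forward: no loss, boxes rescaled back to the
            # original image size unless we are visualizing
            result = model(return_loss=False, rescale=not show, **data)
        results.append(result)
        prog_bar.update()
    return results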
OK, back to test.py; what follows is the case where the GPU count is greater than 1.
First, the cfg.model parameters are copied into model_args, which is then updated with train_cfg=None and the test_cfg config.
model_type = getattr(detectors, model_args.pop('type'))
This step fetches the corresponding detector class.
In models/detectors/__init__.py, the following detectors are defined:

__all__ = [
    'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN',
    'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'RetinaNet'
]
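The lookup by name is then just getattr on that module; for instance (assuming the 0.x-era layout where the classes are exposed as mmdet.models.detectors):

from mmdet.models import detectors

model_cls = getattr(detectors, 'FasterRCNN')   # the class named in cfg.model.type
# parallel_test later instantiates it as model_cls(**model_kwargs)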
Then it calls:

outputs = parallel_test(
    model_type,
    model_args,
    args.checkpoint,
    dataset,
    _data_func,
    range(args.gpus),
    workers_per_gpu=args.proc_per_gpu)

Let's look at the parallel_test function.
For multiprocessing basics, see any Python multiprocessing primer; the walkthrough here is given directly as comments in the code.
def parallel_test(model_cls,
                  model_kwargs,
                  checkpoint,
                  dataset,
                  data_func,
                  gpus,
                  workers_per_gpu=1):
    """Parallel testing on multiple GPUs.

    Args:
        model_cls (type): Model class type.
        model_kwargs (dict): Arguments to init the model.
        checkpoint (str): Checkpoint filepath.
        dataset (:obj:`Dataset`): The dataset to be tested.
        data_func (callable): The function that generates model inputs.
        gpus (list[int]): GPU ids to be used.
        workers_per_gpu (int): Number of processes on each GPU. It is possible
            to run multiple workers on each GPU.

    Returns:
        list: Test results.
    """
    # start a multiprocessing context that spawns fresh interpreter processes
    ctx = multiprocessing.get_context('spawn')
    # two inter-process queues: one for sample indices, one for results
    idx_queue = ctx.Queue()
    result_queue = ctx.Queue()
    # total number of worker processes across all GPUs
    num_workers = len(gpus) * workers_per_gpu
    # create num_workers processes running worker_func;
    # args must be a tuple of arguments for the target
    workers = [
        ctx.Process(
            target=worker_func,
            args=(model_cls, model_kwargs, checkpoint, dataset, data_func,
                  gpus[i % len(gpus)], idx_queue, result_queue))
        for i in range(num_workers)
    ]
    # start the processes
    for w in workers:
        # daemonic children are killed automatically when the main process
        # exits, so stray workers cannot outlive the script
        w.daemon = True
        w.start()
    # feed every sample index into the index queue
    for i in range(len(dataset)):
        idx_queue.put(i)
    # result list, initialized to [None, None, None, ...]
    results = [None for _ in range(len(dataset))]
    # progress bar
    prog_bar = mmcv.ProgressBar(task_num=len(dataset))
    for _ in range(len(dataset)):
        # fetch an (index, result) pair from the result queue
        idx, res = result_queue.get()
        # store each result at its original index
        results[idx] = res
        # update the progress bar
        prog_bar.update()
    print('\n')
    # terminate every worker process
    for worker in workers:
        worker.terminate()
    return results
OK, the key piece of the function above is worker_func, the target each process runs.
def worker_func(model_cls, model_kwargs, checkpoint, dataset, data_func,
                gpu_id, idx_queue, result_queue):
    # instantiate the model, i.e. the detector class from before
    model = model_cls(**model_kwargs)
    # load the checkpoint weights
    load_checkpoint(model, checkpoint, map_location='cpu')
    # bind this process to its assigned GPU
    torch.cuda.set_device(gpu_id)
    model.cuda()
    model.eval()
    # context manager that disables gradient computation
    with torch.no_grad():
        # loop forever: get() blocks when the index queue is empty, and the
        # parent terminates the workers once all results are collected
        while True:
            # fetch an index from the index queue
            idx = idx_queue.get()
            # fetch the idx-th sample
            data = dataset[idx]
            # run the model on this sample
            result = model(**data_func(data, gpu_id))
            # push (idx, result) onto the result queue
            result_queue.put((idx, result))
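The whole index-queue/result-queue pattern boils down to a self-contained toy like this (squaring numbers stands in for model inference):

import multiprocessing

def _square_worker(idx_queue, result_queue):
    while True:
        idx = idx_queue.get()          # blocks until an index is available
        result_queue.put((idx, idx * idx))

if __name__ == '__main__':
    ctx = multiprocessing.get_context('spawn')
    idx_q, res_q = ctx.Queue(), ctx.Queue()
    workers = [ctx.Process(target=_square_worker, args=(idx_q, res_q))
               for _ in range(2)]
    for w in workers:
        w.daemon = True
        w.start()
    n = 8
    for i in range(n):
        idx_q.put(i)
    results = [None] * n
    for _ in range(n):
        idx, res = res_q.get()
        results[idx] = res
    for w in workers:
        w.terminate()
    print(results)   # [0, 1, 4, 9, 16, 25, 36, 49]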
What does data_func look like?
def _data_func(data, device_id):
    data = scatter(collate([data], samples_per_gpu=1), [device_id])[0]
    return dict(return_loss=False, rescale=True, **data)
_data_func collates a single sample into a batch of one and scatters it onto the given GPU (device_id), then adds the test-time flags return_loss=False and rescale=True.
Let's look at the scatter function:
def scatter(input, devices, streams=None):
    """Scatters tensor across multiple GPUs."""
    if streams is None:
        streams = [None] * len(devices)
    if isinstance(input, list):
        chunk_size = (len(input) - 1) // len(devices) + 1
        # if the input is a list, scatter each item recursively
        outputs = [
            scatter(input[i], [devices[i // chunk_size]],
                    [streams[i // chunk_size]]) for i in range(len(input))
        ]
        return outputs
    elif isinstance(input, torch.Tensor):
        output = input.contiguous()
        # TODO: copy to a pinned buffer first (if copying from CPU)
        stream = streams[0] if output.numel() > 0 else None
        with torch.cuda.device(devices[0]), torch.cuda.stream(stream):
            output = output.cuda(devices[0], non_blocking=True)
        return output
    else:
        raise Exception('Unknown type {}.'.format(type(input)))
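Minimal use of this low-level scatter on a single tensor (assumes a CUDA device):

import torch

x = torch.randn(2, 3)
y = scatter(x, [0])      # contiguous copy onto GPU 0
assert y.is_cuda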
The last part writes the results to the out file:
    if args.out:
        print('writing results to {}'.format(args.out))
        mmcv.dump(outputs, args.out)
        eval_types = args.eval
        if eval_types:
            print('Starting evaluate {}'.format(' and '.join(eval_types)))
            if eval_types == ['proposal_fast']:
                result_file = args.out
                coco_eval(result_file, eval_types, dataset.coco)
            else:
                if not isinstance(outputs[0], dict):
                    result_file = args.out + '.json'
                    results2json(dataset, outputs, result_file)
                    coco_eval(result_file, eval_types, dataset.coco)
                else:
                    for name in outputs[0]:
                        print('\nEvaluating {}'.format(name))
                        outputs_ = [out[name] for out in outputs]
                        result_file = args.out + '.{}.json'.format(name)
                        results2json(dataset, outputs_, result_file)
                        coco_eval(result_file, eval_types, dataset.coco)
The outputs obtained earlier are written to the output file via mmcv's dump function (mmcv/io.py). Most of what follows is evaluation: it calls the COCO API (results2json and coco_eval), which we won't go into in detail for now.