def parse_args():
    parser = argparse.ArgumentParser(description='Train keypoints network')
    # general
    parser.add_argument('--cfg',
                        help='experiment configure file name',
                        required=True,
                        type=str)
    parser.add_argument('opts',
                        help="Modify config options using the command-line",
                        default=None,
                        nargs=argparse.REMAINDER)
    # distributed training
    parser.add_argument('--gpu',
                        help='gpu id for multiprocessing training',
                        type=str)
    parser.add_argument('--world-size',
                        default=1,
                        type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--dist-url',
                        default='tcp://127.0.0.1:23456',
                        type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--rank',
                        default=0,
                        type=int,
                        help='node rank for distributed training')

    args = parser.parse_args()

    return args
Here world_size means the number of nodes; for a single server it is just 1. This is not the same world_size used further down: there it means the total number of processes, and since each GPU runs one process, the final world_size equals the number of GPUs taking part. rank is the position of this node among all nodes; with a single server it is 0.
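To make the two meanings concrete, here is a minimal sketch of the arithmetic, assuming a hypothetical single node with 4 GPUs (all variable names here are illustrative, not from the repo):

# Hypothetical setup: 1 node (server) with 4 GPUs.
nodes = 1                # value of --world-size as parsed: number of nodes
node_rank = 0            # value of --rank as parsed: index of this node
ngpus_per_node = 4       # what torch.cuda.device_count() would return here

# After main() rescales it, world_size counts processes (one per GPU):
world_size = ngpus_per_node * nodes            # -> 4

# Inside each spawned process, the global rank is derived from the local GPU id:
for local_gpu in range(ngpus_per_node):
    global_rank = node_rank * ngpus_per_node + local_gpu
    print(world_size, local_gpu, global_rank)  # 4 0 0 / 4 1 1 / 4 2 2 / 4 3 3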
def main():
    args = parse_args()
    update_config(cfg, args)

    cfg.defrost()
    cfg.RANK = args.rank
    cfg.freeze()

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train'
    )

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    if args.gpu is not None:  # a specific GPU was requested, so training falls back to single-GPU mode
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or cfg.MULTIPROCESSING_DISTRIBUTED
    # Decide whether this is a distributed run: distributed training is used if
    # world_size is greater than 1 or the multiprocessing-distributed flag in
    # the config is true.

    ngpus_per_node = torch.cuda.device_count()
    # Number of GPUs on this node. With a single server there is one node, so
    # this is simply the number of GPUs in that server, e.g. 4 on a 4-GPU machine.

    if cfg.MULTIPROCESSING_DISTRIBUTED:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # From here on, world_size means the total number of processes.

        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(
            main_worker,
            nprocs=ngpus_per_node,
            args=(ngpus_per_node, args, final_output_dir, tb_log_dir)
        )
        # mp.spawn launches every process of this node, and each process runs
        # the training loop through main_worker.
    else:
        # Simply call main_worker function
        main_worker(
            ','.join([str(i) for i in cfg.GPUS]),
            ngpus_per_node,
            args,
            final_output_dir,
            tb_log_dir
        )
The main work happens in main_worker, which sets up what each GPU actually does.
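Before going through it, here is a minimal, self-contained sketch of the spawn pattern used in main() above; demo_worker and its arguments are illustrative stand-ins, only torch.multiprocessing is real:

# Minimal sketch of the launch pattern: mp.spawn starts ngpus_per_node
# processes on this node and passes each one its local index (0, 1, ...)
# as the first positional argument -- exactly how main_worker receives `gpu`.
import torch
import torch.multiprocessing as mp


def demo_worker(gpu, ngpus_per_node, message):
    # `gpu` is supplied by mp.spawn; the rest comes from the `args=` tuple.
    print('local process {}/{}: {}'.format(gpu, ngpus_per_node, message))


if __name__ == '__main__':
    # Fall back to 1 so the demo also runs on a CPU-only machine.
    ngpus_per_node = torch.cuda.device_count() or 1
    mp.spawn(
        demo_worker,
        nprocs=ngpus_per_node,
        args=(ngpus_per_node, 'hello from spawn')
    )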
def main_worker(
        gpu, ngpus_per_node, args, final_output_dir, tb_log_dir
):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    if cfg.FP16.ENABLED:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if cfg.FP16.STATIC_LOSS_SCALE != 1.0:
        if not cfg.FP16.ENABLED:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")

    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])

        if cfg.MULTIPROCESSING_DISTRIBUTED:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
            # This computes the index (global rank) of the current process
            # among all processes.

        print('Init process group: dist_url: {}, world_size: {}, rank: {}'.
              format(args.dist_url, args.world_size, args.rank))
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )
        # dist.init_process_group initializes the distributed environment.
        # By this point world_size (rescaled in main()) is the total number of
        # processes, and args.rank is the global index of the process bound to
        # this GPU. (See the sketch after this function.)
    update_config(cfg, args)

    # setup logger
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')(
        cfg, is_train=True
    )

    # copy model file
    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED
            and args.rank % ngpus_per_node == 0
    ):
        this_dir = os.path.dirname(__file__)
        shutil.copy2(
            os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
            final_output_dir
        )

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED
            and args.rank % ngpus_per_node == 0
    ):
        dump_input = torch.rand(
            (1, 3, cfg.DATASET.INPUT_SIZE, cfg.DATASET.INPUT_SIZE)
        )
        # writer_dict['writer'].add_graph(model, (dump_input, ))
        # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))
    if cfg.FP16.ENABLED:
        model = network_to_half(model)

    if cfg.MODEL.SYNC_BN and not args.distributed:
        print('Warning: Sync BatchNorm is only supported in distributed training.')

    if args.distributed:
        if cfg.MODEL.SYNC_BN:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu]
            )
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    loss_factory = MultiLossFactory(cfg).cuda()

    # Data loading code
    train_loader = make_dataloader(
        cfg, is_train=True, distributed=args.distributed
    )
    logger.info(train_loader.dataset)

    best_perf = -1
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)

    if cfg.FP16.ENABLED:
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE,
            dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE
        )

    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(
        final_output_dir, 'checkpoint.pth.tar')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    if cfg.FP16.ENABLED:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch
        )
    else:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch
        )
    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        # train one epoch
        do_train(cfg, model, train_loader, loss_factory, optimizer, epoch,
                 final_output_dir, tb_log_dir, writer_dict, fp16=cfg.FP16.ENABLED)

        # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`.
        lr_scheduler.step()

        perf_indicator = epoch
        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        if not cfg.MULTIPROCESSING_DISTRIBUTED or (
                cfg.MULTIPROCESSING_DISTRIBUTED
                and args.rank == 0
        ):
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(
        final_output_dir, 'final_state{}.pth.tar'.format(gpu)
    )

    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
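To summarize the per-process setup that main_worker performs, here is a stripped-down sketch. It assumes an NCCL backend and the default dist-url (in the real code these come from cfg.DIST_BACKEND and --dist-url), and the tiny nn.Linear is only a stand-in for get_pose_net:

# Stripped-down sketch of the per-process setup: derive the global rank,
# join the process group, pin the process to one GPU, then wrap the model
# in DistributedDataParallel so gradients are synchronized across processes.
import torch
import torch.distributed as dist
import torch.nn as nn


def setup_worker(gpu, ngpus_per_node, node_rank, world_size):
    rank = node_rank * ngpus_per_node + gpu      # global rank of this process
    dist.init_process_group(
        backend='nccl',                          # assumed backend (cfg.DIST_BACKEND in the repo)
        init_method='tcp://127.0.0.1:23456',
        world_size=world_size,                   # total number of processes
        rank=rank
    )
    torch.cuda.set_device(gpu)                   # single-device scope per process
    model = nn.Linear(10, 2).cuda(gpu)           # stand-in for get_pose_net(cfg, is_train=True)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    return model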
For a more detailed walkthrough, see
https://zhuanlan.zhihu.com/p/113694038