参考文献:https://www.cnblogs.com/jfdwd/p/11196439.html
NVIDIA Collective Communication Library (NCCL):NVIDIA的分布式通信库,负责加速集合通信原语(collective communication primitives),本身不是一个并行编程框架。利用NCCL可以优化多GPU训练中的通信。
下载安装链接:https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html
Pytorch中的torch.distributed
包可以调用NCCL作为后端,下面展示单机多GPU的情况。
首先看全局的运行方法:
python -m torch.distributed.launch --nproc_per_node=GPU数量 train.py --arg1 --arg2 --arg3
注意:如果要同时运行另外一个分布式程序,则需要为它指定不同的主机地址和端口,否则会与之前程序使用的默认地址和端口冲突:
python -m torch.distributed.launch --nproc_per_node=GPU数量 --master_addr 127.0.0.2 --master_port 29501 train.py --arg1 --arg2 --arg3
上述命令为使用torch.distributed.launch来运行我们包含有torch.distributed
的训练脚本,其会传递一些环境变量,下面来看训练脚本:
import torch.distributed as dist

parser = argparse.ArgumentParser(description='Distributed Training')
# Passed automatically by torch.distributed.launch: the index of the GPU
# (and process) this script instance should drive.
parser.add_argument('--local_rank', type=int, default=0)
# FIX: the original snippet used `args` below without ever defining it.
args = parser.parse_args()

# torch.distributed.launch exports WORLD_SIZE; fall back to single-GPU mode.
num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
is_distributed = num_gpus > 1
if is_distributed:
    # Bind this process to its GPU before creating the NCCL communicator.
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    # Wait until every process has finished initialization.
    dist.barrier()
# train
# NOTE(review): `model` is assumed to have been built earlier in the real script.
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
torch.distributed.launch 会为我们启动 n 个 train.py 进程(每个进程一个PID),n 就是将要使用的GPU数量。这 n 份 train.py 并行运行,因此需要在脚本中调用 torch.cuda.set_device(args.local_rank) 为每个进程设定默认GPU。
如下提供一个训练CIFAR10的文件:
执行代码:python -m torch.distributed.launch --nproc_per_node=2 main.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import time
import os
import argparse
import numpy as np
import models
import torchvision
import torchvision.transforms as transforms
import torch.distributed as dist
parser = argparse.ArgumentParser(description='Distributed Training')
# Supplied automatically by torch.distributed.launch; the GPU index this
# process should use.
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()

# WORLD_SIZE is exported by torch.distributed.launch; absent -> single process.
num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
is_distributed = num_gpus > 1
if is_distributed:
    # Select this process's GPU before creating the NCCL communicator.
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    # Synchronize all processes before building the model/data pipeline.
    dist.barrier()
# CIFAR-10 channel means / stds used for input normalization.
_normalize = transforms.Normalize([0.49139968, 0.48215827, 0.44653124],
                                  [0.24703233, 0.24348505, 0.26158768])
trainset = torchvision.datasets.CIFAR10(
    root='data', train=True, download=True,
    transform=transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        _normalize,
    ]))
testset = torchvision.datasets.CIFAR10(
    root='data', train=False, download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        _normalize,
    ]))

# FIX: without a DistributedSampler every process iterates the *entire*
# training set, so multi-GPU training duplicates work instead of sharding it.
# The sampler gives each process a disjoint shard; shuffling is then delegated
# to the sampler (DataLoader requires shuffle=False when a sampler is given).
train_sampler = (torch.utils.data.distributed.DistributedSampler(trainset)
                 if is_distributed else None)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=(train_sampler is None),
                                          sampler=train_sampler,
                                          pin_memory=torch.cuda.is_available())
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False,
                                         pin_memory=torch.cuda.is_available())
def reduce_tensor(tensor):
    """Average `tensor` across all processes in the default process group.

    Returns a new tensor; the input is left untouched. May only be called
    after `init_process_group` — `all_reduce` needs an initialized group.
    """
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    # FIX: the original divided by a hard-coded 2 (the GPU count from the
    # blog's example command). Divide by the actual group size so the result
    # is a true mean for any number of processes.
    rt /= dist.get_world_size()
    return rt
# Standard cross-entropy loss for 10-class CIFAR classification.
criterion = nn.CrossEntropyLoss()
# Look the architecture up by name in the project-local `models` module and
# move it onto this process's GPU (selected via set_device above).
model = getattr(models, 'LeNet')().cuda()
# Wrap for multi-GPU gradient synchronization; one process per GPU.
# NOTE(review): DDP construction requires an initialized process group even
# in the num_gpus == 1 case — presumably this script is always run via
# torch.distributed.launch; verify before running standalone.
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4, nesterov=True)
def train(epoch):
    """Run one training epoch over the module-level `trainloader`.

    `epoch` is accepted for interface parity with the caller but is not
    used inside the loop. Each process does its own forward/backward pass;
    DDP all-reduces gradients during backward(), so every GPU then applies
    an identical parameter update independently.
    """
    print(args.local_rank)
    model.train()
    # (removed unused `start_time` and `batch_idx` from the original)
    for inputs, targets in trainloader:
        inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Gradients are synchronized inside backward(); each GPU updates
        # its own replica with the same averaged gradients.
        loss.backward()
        optimizer.step()
        if is_distributed:
            # Average the per-GPU losses across processes, for reporting only.
            reduced_loss = reduce_tensor(loss)
            print('GPU: %s, loss: %.3f, batch_size: %d, reduced_loss: %.3f' % (args.local_rank,
                loss.item(), targets.size()[0], reduced_loss.item()))
# Train for a fixed 100 epochs; the epoch index is informational only.
for epoch in range(100):
    train(epoch)
apex是一个Pytorch的扩展,用于辅助NVIDIA显卡的混合精度和分布式训练。
参考文档:https://nvidia.github.io/apex/
github地址:https://github.com/NVIDIA/apex
这里展示使用apex来实现分布式训练和混合精度训练,同时还能实现多卡的同步批标准化。其中,我们仍然使用torch.distributed
库,但是并行方法采用apex的DistributedDataParallel
对象。其中FP16
和普通精度训练都被包含在代码中。
参考链接:https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import time
import os
import argparse
import numpy as np
import models
import torchvision
import torchvision.transforms as transforms
import torch.distributed as dist
import apex
import apex.amp as amp
from apex.parallel import DistributedDataParallel as DDP
parser = argparse.ArgumentParser(description='Distributed Training')
# Supplied automatically by torch.distributed.launch; the GPU index this
# process should use.
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument('--opt_level', type=str, default='O0',
                    help='O0:FP32,O1:Mixed Precision,O2:Almost FP16 Mixed Precision, O3:fp16 training')
parser.add_argument('--sync_bn', action='store_true',
                    help='enabling apex sync BN.')
# Forwarded verbatim to amp.initialize; None lets apex pick the
# opt_level-specific defaults.
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
parser.add_argument('--loss-scale', type=str, default=None)
args = parser.parse_args()

# WORLD_SIZE is exported by torch.distributed.launch; absent -> single process.
num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
is_distributed = num_gpus > 1
if is_distributed:
    # Select this process's GPU before creating the NCCL communicator.
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
# CIFAR-10 channel means / stds used for input normalization.
_normalize = transforms.Normalize([0.49139968, 0.48215827, 0.44653124],
                                  [0.24703233, 0.24348505, 0.26158768])
trainset = torchvision.datasets.CIFAR10(
    root='data', train=True, download=True,
    transform=transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        _normalize,
    ]))
testset = torchvision.datasets.CIFAR10(
    root='data', train=False, download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        _normalize,
    ]))

# FIX: without a DistributedSampler every process iterates the *entire*
# training set, so multi-GPU training duplicates work instead of sharding it.
# The sampler gives each process a disjoint shard; shuffling is then delegated
# to the sampler (DataLoader requires shuffle=False when a sampler is given).
train_sampler = (torch.utils.data.distributed.DistributedSampler(trainset)
                 if is_distributed else None)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=(train_sampler is None),
                                          sampler=train_sampler,
                                          pin_memory=torch.cuda.is_available())
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False,
                                         pin_memory=torch.cuda.is_available())
def reduce_tensor(tensor):
    """Average `tensor` across all processes in the default process group.

    Returns a new tensor; the input is left untouched. May only be called
    after `init_process_group` — `all_reduce` needs an initialized group.
    """
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    # FIX: the original divided by a hard-coded 2 (the GPU count from the
    # blog's example command). Divide by the actual group size so the result
    # is a true mean for any number of processes.
    rt /= dist.get_world_size()
    return rt
# Standard cross-entropy loss for 10-class CIFAR classification.
criterion = nn.CrossEntropyLoss().cuda()
# Look the architecture up by name in the project-local `models` module.
model = getattr(models, 'LeNet')()
if args.sync_bn:
    # Replace every BatchNorm with apex's synchronized version so batch
    # statistics are computed across all GPUs instead of per-process.
    model = apex.parallel.convert_syncbn_model(model)
model = model.cuda()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4, nesterov=True)
# amp.initialize runs after the model is on the GPU and before DDP wrapping,
# as this script does; it patches model and optimizer for the chosen opt_level.
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level=args.opt_level,
                                  keep_batchnorm_fp32=args.keep_batchnorm_fp32,
                                  loss_scale=args.loss_scale)
if is_distributed:
    # apex's DDP; delay_allreduce defers gradient reduction until the whole
    # backward pass has finished.
    model = DDP(model, delay_allreduce=True)
def train(epoch):
    """Run one apex-AMP training epoch over the module-level `trainloader`.

    `epoch` is accepted for interface parity with the caller but is not
    used inside the loop.
    """
    model.train()
    # (removed unused `start_time` and `batch_idx` from the original)
    for inputs, targets in trainloader:
        inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Scale the loss so FP16 gradients don't underflow; apex unscales
        # the gradients again before optimizer.step().
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
        if is_distributed:
            # Average the per-GPU losses across processes, for reporting only.
            reduced_loss = reduce_tensor(loss)
            print('GPU: %s, loss: %.3f, batch_size: %d, reduced_loss: %.3f' % (args.local_rank,
                loss.item(), targets.size()[0], reduced_loss.item()))
# Train for a fixed 100 epochs; the epoch index is informational only.
for epoch in range(100):
    train(epoch)