With the basic concepts of distributed training covered, let's move on to the hands-on part.
My PyTorch version is 1.9.0 and my torchvision version is 0.10.0.
import argparse
import torch
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
When launching distributed training from the command line, use the torch.distributed.launch launcher; it passes the index of the current process (which, with one process per GPU, is also the GPU index) to the Python script through the local_rank argument.
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()
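A side note: the newer torchrun launcher (introduced further below) passes the local rank through the LOCAL_RANK environment variable instead of a --local_rank argument. A minimal sketch of a fallback that works with either launcher (this fallback is my addition, not part of the original script):
import os

# torchrun exports LOCAL_RANK rather than passing --local_rank; prefer the environment
# variable when it is set so the same script works with both launchers.
local_rank = int(os.environ.get("LOCAL_RANK", args.local_rank))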
Set the backend for inter-process communication to nccl. The available options are described in the official documentation: https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group
dist.init_process_group(backend='nccl')
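The nccl backend requires GPUs. If you also want to run the same script on a CPU-only machine for debugging (my assumption, not something the tutorial itself needs), gloo is the usual fallback:
# nccl is GPU-only; fall back to gloo when no GPU is available (e.g. for local CPU debugging).
backend = "nccl" if torch.cuda.is_available() else "gloo"
dist.init_process_group(backend=backend)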
Synchronize all processes:
dist.barrier()
Get the number of processes in the current process group:
world_size = dist.get_world_size()
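As a quick sanity check you can also query the global rank of the current process and print it; this print is just an illustration and not part of the original script:
# Each process reports its global rank, the world size and its local (per-machine) rank.
rank = dist.get_rank()
print("rank {}/{}, local_rank {}".format(rank, world_size, args.local_rank))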
Partition the dataset across the processes so that no data is duplicated between them:
train_sampler = DistributedSampler(train_dataset)
valid_sampler = DistributedSampler(valid_dataset)
train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=256, pin_memory=False, prefetch_factor=2, num_workers=4)
valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=256, pin_memory=False, prefetch_factor=2, num_workers=4)
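Note that batch_size here is the per-process batch size, so the effective global batch size is 256 times the number of processes. To illustrate how DistributedSampler shards the data, with two processes each sampler yields a disjoint part of the same shuffled index permutation. The snippet below spells out num_replicas and rank only for illustration (normally they are inferred from the process group), and the disjointness assumes the dataset size divides evenly, since otherwise the sampler pads with repeated indices:
# Illustration only: two ranks shard the same shuffled permutation, so their index sets
# do not overlap (assuming len(train_dataset) is divisible by the number of replicas).
demo_sampler_rank0 = DistributedSampler(train_dataset, num_replicas=2, rank=0)
demo_sampler_rank1 = DistributedSampler(train_dataset, num_replicas=2, rank=1)
assert set(demo_sampler_rank0).isdisjoint(set(demo_sampler_rank1))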
Assuming the model has already been loaded, simply call .to(device) to move it onto the CUDA device assigned to this process:
device = torch.device("cuda", args.local_rank)
model = model.to(device)
Use SyncBatchNorm so that batch-norm statistics are synchronized across the data seen by the individual processes:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
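convert_sync_batchnorm simply walks the module tree and swaps every BatchNorm layer for a SyncBatchNorm layer; a small sketch using a plain torchvision ResNet-18 purely as an example:
import torchvision

# After conversion, e.g. the first BatchNorm2d of a ResNet-18 has become a SyncBatchNorm.
m = torchvision.models.resnet18()
m_sync = torch.nn.SyncBatchNorm.convert_sync_batchnorm(m)
assert isinstance(m_sync.bn1, torch.nn.SyncBatchNorm)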
Wrap the model with DistributedDataParallel:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
Before every epoch, call set_epoch on the samplers to set the random seed, so that the data is reshuffled each epoch and each GPU receives a different subset:
train_sampler.set_epoch(epoch)
valid_sampler.set_epoch(epoch)
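To see why this matters, set_epoch changes the seed used for shuffling, so the same sampler produces a different ordering in different epochs. The snippet below is only an illustration, not part of the training loop:
# Without set_epoch the shuffling seed is identical every epoch; with it, the order changes.
demo_sampler = DistributedSampler(train_dataset, num_replicas=2, rank=0)
demo_sampler.set_epoch(0)
order_epoch0 = list(demo_sampler)
demo_sampler.set_epoch(1)
order_epoch1 = list(demo_sampler)
assert order_epoch0 != order_epoch1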
There are two ways to launch the script: torch.distributed.launch and torchrun.
Argument descriptions:
nproc_per_node: the number of processes to launch on each machine
nnodes: the total number of machines used
node_rank: the rank of the machine the current process belongs to
master_addr: the network address of the master process
master_port: the port used by the master process; first make sure the port is not already occupied by another process
See the official documentation for the launch utility: https://pytorch.org/docs/stable/distributed.html#launch-utility
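From these arguments the launcher derives a unique global rank for every process. Conceptually the relationship looks like the sketch below (my own illustration, not the launcher's actual code):
# With nnodes machines and nproc_per_node processes per machine, the global rank of a
# process is node_rank * nproc_per_node + local_rank.
def global_rank(node_rank: int, nproc_per_node: int, local_rank: int) -> int:
    return node_rank * nproc_per_node + local_rank

# e.g. 2 machines x 2 GPUs each: machine 0 hosts ranks 0 and 1, machine 1 hosts ranks 2 and 3.
assert [global_rank(n, 2, l) for n in range(2) for l in range(2)] == [0, 1, 2, 3]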
# Single machine, two GPUs
$ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_example.py
# First machine
$ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" --master_port=1234 ddp_example.py
# Second machine
$ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 --nnodes=2 --node_rank=1 --master_addr="192.168.1.1" --master_port=1234 ddp_example.py
# To run a second, independent job on the same machine, use a different master_port to avoid port conflicts
$ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_example.py
$ CUDA_VISIBLE_DEVICES=2,3 python -m torch.distributed.launch --nproc_per_node=2 --master_port 9999 ddp_example.py
PyTorch provides a new command, torchrun, that replaces torch.distributed.launch; its arguments are documented at: https://pytorch.org/docs/stable/elastic/run.html#launcher-api
$ CUDA_VISIBLE_DEVICES=0,1 torchrun --standalone --nnodes=1 --nproc_per_node=2 ddp_example.py
For multi-node (and elastic) jobs, torchrun takes the rendezvous arguments --rdzv_id, --rdzv_backend and --rdzv_endpoint, which replace the original master_addr and master_port:
$ torchrun --nnodes=1:4 --nproc_per_node=$NUM_TRAINERS --rdzv_id=$JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$HOST_NODE_ADDR ddp_example.py
The complete code is available at: https://github.com/chingi071/Pytorch_note/blob/master/ddp_example.py
import os
import argparse
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
def train_epoch(train_loader, optimizer, criterion, lr_scheduler, model, world_size):
    # `device` and `train_dataset` are defined at module level in the __main__ block below.
    model.train()
    train_running_loss = 0.0
    train_running_acc = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        output = model(data)
        preds = torch.max(output, 1)[1]
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_running_loss += loss.item()
        train_running_acc += torch.eq(preds, target).sum().item()

    lr_scheduler.step()

    # Per-process averages: each process only sees len(train_dataset) / world_size samples.
    train_loss_value = train_running_loss / (len(train_dataset) / world_size)
    train_acc_value = train_running_acc / (len(train_dataset) / world_size)
    return train_loss_value, train_acc_value
def valid_epoch(valid_loader, criterion, model, world_size):
    model.eval()
    valid_running_loss = 0.0
    valid_running_acc = 0.0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(valid_loader):
            data = data.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            outputs = model(data)
            preds = torch.max(outputs, 1)[1]
            loss = criterion(outputs, target)

            valid_running_loss += loss.item()
            valid_running_acc += torch.eq(preds, target).sum().item()

    valid_loss_value = valid_running_loss / (len(valid_dataset) / world_size)
    valid_acc_value = valid_running_acc / (len(valid_dataset) / world_size)
    return valid_loss_value, valid_acc_value
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", default=0, type=int)
    args = parser.parse_args()

    dist.init_process_group(backend='nccl')
    dist.barrier()

    # rank = dist.get_rank()
    world_size = dist.get_world_size()

    train_transform = transforms.Compose([transforms.RandomResizedCrop(224),
                                          transforms.RandomHorizontalFlip(),
                                          transforms.ToTensor(),
                                          transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                               std=[0.229, 0.224, 0.225])])

    valid_transform = transforms.Compose([transforms.Resize(256),
                                          transforms.CenterCrop(224),
                                          transforms.ToTensor(),
                                          transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                               std=[0.229, 0.224, 0.225])])

    train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
    valid_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=valid_transform)

    train_sampler = DistributedSampler(train_dataset)
    valid_sampler = DistributedSampler(valid_dataset)

    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=256,
                              pin_memory=False, prefetch_factor=2, num_workers=4)
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=256,
                              pin_memory=False, prefetch_factor=2, num_workers=4)

    if torch.cuda.is_available():
        device = torch.device("cuda", args.local_rank)
    else:
        device = torch.device("cpu")

    model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18')
    model.fc = nn.Sequential(nn.Linear(in_features=512, out_features=128), nn.LeakyReLU(),
                             nn.Dropout(0.5), nn.Linear(128, 10))
    model = model.to(device)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                      output_device=args.local_rank)

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.05)
    lr_scheduler_values = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    criterion = nn.CrossEntropyLoss().to(device)

    num_epochs = 100
    for epoch in range(num_epochs):
        train_sampler.set_epoch(epoch)
        valid_sampler.set_epoch(epoch)

        train_loss_value, train_acc_value = train_epoch(train_loader, optimizer, criterion, lr_scheduler_values, model, world_size)
        valid_loss_value, valid_acc_value = valid_epoch(valid_loader, criterion, model, world_size)

        print("Train_local_rank: {} Train_Epoch: {}/{} Training_Loss: {} Training_acc: {:.2f}".format(
            args.local_rank, epoch, num_epochs - 1, train_loss_value, train_acc_value))
        print("Valid_local_rank: {} Valid_Epoch: {}/{} Valid_Loss: {} Valid_acc: {:.2f}".format(
            args.local_rank, epoch, num_epochs - 1, valid_loss_value, valid_acc_value))
        print('--------------------------------')

    print("finished.")
Run the following command to start training:
$ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_example.py
Finally, to summarize, the basic PyTorch distributed-training workflow is: initialize the process group with dist.init_process_group, shard the data with DistributedSampler, move the model to the GPU assigned to the process, convert its BatchNorm layers with SyncBatchNorm and wrap it in DistributedDataParallel, call set_epoch on the samplers at the start of every epoch, and launch the script with torch.distributed.launch or torchrun.
Reference: https://medium.com/ching-i/pytorch-%E5%88%86%E6%95%A3%E5%BC%8F%E8%A8%93%E7%B7%B4-distributeddataparallel-%E5%AF%A6%E4%BD%9C%E7%AF%87-35c762cb7e08