DataParallel (DP): Parameter Server mode, with one GPU acting as the reducer. It is also extremely simple to use: a single line of code. Its unacceptable drawback is that DataParallel is based on the Parameter Server algorithm, so all the losses are computed on the main GPU and the load imbalance can be severe. With a large model (e.g. bert-large), the main GPU may be completely full while the other GPUs are less than half utilized, which is an enormous waste of resources.
Note that both the model and the data must be loaded onto the GPU before DataParallel's module can process them; otherwise an error is raised.
Example code:
# coding=utf-8
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        return output


input_size = 5
output_size = 2
batch_size = 30
data_size = 30

dataset = RandomDataset(input_size, data_size)
rand_loader = DataLoader(dataset=dataset,
                         batch_size=batch_size, shuffle=True)

model = Model(input_size, output_size)
if torch.cuda.is_available():
    model.cuda()                      # the model must be on GPU before wrapping
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)    # the key line: replicate the model across GPUs

for data in rand_loader:
    if torch.cuda.is_available():
        input_var = data.cuda()       # the data must be on GPU as well
    else:
        input_var = data
    output = model(input_var)
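To see the imbalance described above more concretely, a quick and purely illustrative trick is to print tensor sizes inside forward: each replica only sees its slice of the batch, while the outputs are gathered back onto the main GPU. With 2 GPUs and batch_size=30, each replica typically receives 15 samples and the gathered output has the full 30 rows. A sketch built on the Model class above:

class VerboseModel(Model):
    # Same model, but prints the per-replica input/output sizes in forward.
    def forward(self, input):
        output = super().forward(input)
        print("inside forward: input", input.size(), "output", output.size())
        return output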
Yes, you read that right: DistributedDataParallel was designed for distributed training. Yet even on a single machine with multiple GPUs, the official recommendation is to use this newer DistributedDataParallel, which is based on the all-reduce algorithm.
(1) Initialize the backend
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
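With init_method='env://', init_process_group reads the rendezvous configuration from environment variables that torch.distributed.launch exports into every process it starts. A quick way to inspect them (illustrative only; the exact values depend on your launch command):

import os
# Set by torch.distributed.launch for each worker process:
#   MASTER_ADDR / MASTER_PORT - address and port of the rank-0 process
#   WORLD_SIZE / RANK         - total number of processes and this process's rank
print(os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'],
      os.environ['WORLD_SIZE'], os.environ['RANK'])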
(2) Parallelize the model. This step is also very simple; just wrap the model with DistributedDataParallel:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
(3) Parallelize the data. Note that when a sampler is specified, shuffle must be set to False; also, the DataLoader's num_workers is configured per process, i.e. each GPU gets its own set of workers.
dataset = RandomDataset(input_size, data_size)
sampler = torch.utils.data.distributed.DistributedSampler(dataset)
rand_loader = DataLoader(dataset=dataset,
batch_size=batch_size, shuffle=False, sampler=sampler)
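One detail the snippet above does not show: DistributedSampler shuffles internally (which is why shuffle=False is passed to the DataLoader), but it only reshuffles across epochs if set_epoch is called at the beginning of every epoch. A minimal sketch, with num_epochs as a placeholder:

for epoch in range(num_epochs):
    sampler.set_epoch(epoch)   # reseed so each epoch sees a different shuffle
    for batch in rand_loader:
        ...                    # the usual training step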
(4) Launch script
python -m torch.distributed.launch --nproc_per_node=8 train_face_troch.py
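The command above starts 8 processes on one machine, one per GPU. For reference, a multi-node run would look roughly like this (the address and port are placeholders; run the same command on every node, changing only --node_rank):

# node 0 (the master); node 1 would use --node_rank=1
python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 \
    --master_addr="192.168.1.1" --master_port=23456 train_face_troch.py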
Full example code:
# coding=utf-8
import argparse

import torch
import torch.distributed
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)
        # Turn the mean into a binary class label so CrossEntropyLoss
        # receives integer targets (a raw float mean would make it fail).
        self.label = (torch.mean(self.data, dim=-1) > 0).long()

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return self.len


class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        return output


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', default=0, type=int)
    args = parser.parse_args()
    return args


input_size = 5
output_size = 2
batch_size = 30
data_size = 30

args = parse_args()
local_rank = args.local_rank
torch.cuda.set_device(local_rank)  # set the default CUDA device; different for every rank
torch.distributed.init_process_group(backend='nccl', init_method='env://')

dataset = RandomDataset(input_size, data_size)
sampler = torch.utils.data.distributed.DistributedSampler(dataset)
rand_loader = DataLoader(dataset=dataset,
                         batch_size=batch_size, shuffle=False, sampler=sampler)

model = Model(input_size, output_size)
if torch.cuda.is_available():
    model.cuda()
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()
# if torch.cuda.device_count() > 1:
#     model = nn.DataParallel(model)

for data, label in rand_loader:
    data = data.cuda()
    label = label.cuda()
    output = model(data)
    loss = criterion(output, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
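If you add checkpointing to this script, a common pattern (a sketch, not part of the example above; the file name is just a placeholder) is to save from a single process only, usually local_rank 0, and to save the underlying model stored in model.module:

if local_rank == 0:
    # model is the DistributedDataParallel wrapper; model.module is the original Model
    torch.save(model.module.state_dict(), 'checkpoint.pt')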
Using torch.multiprocessing instead of the launcher
If you are familiar with torch.multiprocessing, you can also use it to manage the processes by hand, sidestepping some of the small annoyances of letting torch.distributed.launch start and terminate the processes automatically. All you need to do is call torch.multiprocessing.spawn, and torch.multiprocessing creates the processes for you. In the code below, spawn starts nprocs=4 processes, each running main_worker with local_rank (the index of the current process) and args (that is, 4 and myargs) passed in as arguments:
import torch.multiprocessing as mp
mp.spawn(main_worker, nprocs=4, args=(4, myargs))
Here we simply move the work that torch.distributed.launch used to manage into the main_worker function, where proc corresponds to local_rank (the index of the current process), ngpus_per_node corresponds to 4, and args corresponds to myargs:
def main_worker(proc, ngpus_per_node, args):
    # proc is the process index supplied by mp.spawn; use it as the rank and device id
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:23456', world_size=4, rank=proc)
    torch.cuda.set_device(proc)

    train_dataset = ...
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

    model = ...
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[proc])
    optimizer = optim.SGD(model.parameters(), lr=...)

    for epoch in range(100):
        for batch_idx, (images, target) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            ...
            output = model(images)
            loss = criterion(output, target)
            ...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
Note that in the code above, because the default environment variables that torch.distributed.launch would otherwise provide are not available, we have to pass the configuration to init_process_group explicitly:
dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:23456', world_size=4, rank=proc)
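On a single machine, the process index that mp.spawn passes in can be used directly as the global rank, as above. If this pattern were extended to several machines, the usual convention (a sketch only; node_rank, num_nodes, ngpus_per_node, and the master address are assumed to come from your own configuration) is:

# Global rank = node index * GPUs per node + local process index (all assumed variables)
rank = node_rank * ngpus_per_node + proc
dist.init_process_group(backend='nccl', init_method='tcp://192.168.1.1:23456',
                        world_size=ngpus_per_node * num_nodes, rank=rank)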
To sum up, after switching to multiprocessing, the parallel-training part of the code mainly involves the following snippet:
# main.py
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def main_worker(proc, ngpus_per_node, args):
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:23456', world_size=4, rank=proc)
    torch.cuda.set_device(proc)

    train_dataset = ...
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

    model = ...
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[proc])
    optimizer = optim.SGD(model.parameters(), lr=...)

    for epoch in range(100):
        for batch_idx, (images, target) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            ...
            output = model(images)
            loss = criterion(output, target)
            ...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


if __name__ == '__main__':
    # main_worker must be defined before spawn is called; spawn passes the
    # process index as the first argument automatically.
    mp.spawn(main_worker, nprocs=4, args=(4, myargs))
To run it, just invoke python directly:
python main.py