Pytorch并行分布式训练四 (DDP)

Getting Started with Distributed Data Parallel:https://pytorch.org/tutorials/intermediate/ddp_tutorial.html

DDP是目前PyTorch推荐的多GPU训练方法,它支持单机多卡、多机多卡等情况。目前PyTorch中DDP进程的启动方式分为两种:launch脚本启动和mp模块(torch.multiprocessing)启动。就目前而言,PyTorch更推荐采用mp的方法,但launch的方法使用得也很广,所以下面一并做一些介绍。

1.DataParallel 和 DistributedDataParallel之间的比较

  1. 首先,DP是单进程多线程,只可以在单机中工作。DDP是多进程的,在单机或多机情况下都可以工作。
  2. 即使在单机工作的情况下,DP通常也比DDP更慢。这是由于DP受到了GIL锁、每次前向传播都要复制模型、以及额外的输入分发(scatter)和输出收集(gather)等因素的限制。
  3. 从之前的文章可以知道,当你的模型太大以至于单卡无法加载时,你必须借助“模型并行”来解决。DDP支持结合“模型并行”,但DP却不支持。注意:当DDP结合“模型并行”时,每一个DDP进程内部使用“模型并行”,所有的进程间总体使用“数据并行”。

2.一个Launch示例

此方法借助于torch.distributed.launch模块,使用得较广泛,但现在PyTorch官方更推荐mp方法(见下节)。

import os

import argparse

import torch

import torch.distributed as dist

import torchvision

import torch.nn as nn

import torch.optim as optim

from torch.nn.parallel import DistributedDataParallel as DDP

   

class ToyModel(nn.Module):
    """Small two-layer network: Linear(10, 10) -> ReLU -> Linear(10, 5)."""

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the order they are applied in forward().
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        """Run ``x`` through net1, the ReLU and net2, returning net2's output."""
        hidden = self.net1(x)
        activated = self.relu(hidden)
        return self.net2(activated)

   

def setup(rank, world_size):
    """Select GPU ``rank`` as the default device and join the process group.

    Sets MASTER_ADDR / MASTER_PORT to localhost:12355 in the environment and
    initializes the default process group with the "nccl" backend.

    Args:
        rank: Rank of this process; also passed to ``torch.cuda.set_device``.
        world_size: Number of processes in the group.
    """
    # Pick the default CUDA device for this process. If you hit
    # "CUDA error: an illegal memory access was encountered", check that this
    # call is present.
    torch.cuda.set_device(rank)

    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'})

    # The backend is chosen here; per the original author's note, nccl is the
    # recommended backend for single-machine multi-GPU and is much faster
    # than gloo.
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

def cleanup():
    """Tear down the process group by calling ``dist.destroy_process_group()``."""

    dist.destroy_process_group()

def demo_basic(rank, bs=32):
    """Train ``ToyModel`` under DDP on this rank, then save a checkpoint on rank 0.

    Args:
        rank: Rank of this process; also used as the device index for the
            model and for every batch.
        bs: Per-process batch size handed to the ``DataLoader``. The original
            snippet referenced an undefined name ``bs``; the default of 32 is
            an arbitrary placeholder value.
    """
    print(f"Running basic DDP example on rank {rank}.")

    # Placeholder kept from the original example: replace ``...`` with the
    # real CIFAR10 arguments (root, train, transform, download, ...).
    trainset = torchvision.datasets.CIFAR10(...)
    # The dataset has to go through a DistributedSampler so that the data is
    # split across the processes.
    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset)
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=bs,
        num_workers=2,
        sampler=train_sampler,
    )

    # Move the model onto this rank's device first, then wrap it with DDP.
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank], output_device=rank)
    loss_fn = nn.MSELoss()
    # Build the optimizer only after the model has been wrapped by DDP.
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    # The loader yields (inputs, labels) pairs, not objects with attributes.
    for inputs, labels in trainloader:
        inputs, labels = inputs.to(rank), labels.to(rank)
        # Reset gradients on every step; doing it only once before the loop
        # would accumulate gradients across all batches.
        optimizer.zero_grad()
        outputs = ddp_model(inputs)
        loss_fn(outputs, labels).backward()
        optimizer.step()

    if rank == 0:
        # Only rank 0 writes the checkpoint. Save ddp_model.module.state_dict():
        # ddp_model is just a wrapper, the wrapped module holds the real
        # model parameters.
        torch.save(ddp_model.module.state_dict(), "xxx.ckpt")

   



if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # The launch module passes the local rank to every worker automatically,
    # so the option must be declared here or argument parsing fails. It has to
    # be spelled "local_rank" (the original declared "--locak_rank" but then
    # read opts.local_rank). Newer launchers pass the dashed "--local-rank",
    # so both spellings are accepted.
    parser.add_argument("--local_rank", "--local-rank", dest="local_rank",
                        default=0, type=int)
    opts = parser.parse_args()

    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    world_size = n_gpus

    setup(opts.local_rank, world_size)
    demo_basic(opts.local_rank)
    cleanup()

####

##CMD:

#CUDA_VISIBLE_DEVICES="a,b,c" python -m torch.distributed.launch --nproc_per_node n main.py  

###

一些注意事项:

A.大部分说明在代码注释里

B.--nproc_per_node指定了每个节点上启动的进程数,通常与需要用到的GPU数量一致(每个进程对应一张卡)。

C.此处设置的batch_size是每张卡的batch_size,每个iter所训练的数据量实际为:batch_size x n_gpu。(这和DP不同)

D.torch.cuda.set_device(rank)设置了默认使用的GPU设备号,以防止illegal memory错误。在程序的一些其他地方就可以直接用aa.to('cuda')来使用GPU啦(其实不推荐,pytorch推荐的用法是aa.to(rank),如果rank参数能够传递到对应位置,还是使用官方推荐比较好)。

一些参考资料:

[原创][深度][PyTorch] DDP系列第一篇:入门教程 - 知乎 (zhihu.com)

examples/README.md at master · pytorch/examples (github.com)

3.一个mp示例(pytorch推荐方法)

在使用DDP时,首先要做的就是进行一些“进程组”的设置。

import os

import sys

import tempfile

import torch

import torch.distributed as dist

import torch.nn as nn

import torch.optim as optim

import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP

# On Windows platform, the torch.distributed package only

# supports Gloo backend, FileStore and TcpStore.

# For FileStore, set init_method parameter in init_process_group

# to a local file. Example as follow:

# init_method="file:///f:/libtmp/some_file"

# dist.init_process_group(

#    "gloo",

#    rank=rank,

#    init_method=init_method,

#    world_size=world_size)

# For TcpStore, same way as on Linux.

def setup(rank, world_size):
    """Select GPU ``rank`` as the default device and join the process group.

    Sets MASTER_ADDR / MASTER_PORT to localhost:12355 in the environment and
    initializes the default process group with the "gloo" backend.

    Args:
        rank: Rank of this process; also passed to ``torch.cuda.set_device``.
        world_size: Number of processes in the group.
    """
    # Optional: add this call depending on your situation.
    torch.cuda.set_device(rank)

    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'})

    # Initialize the process group.
    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)

def cleanup():
    """Tear down the process group by calling ``dist.destroy_process_group()``."""

    dist.destroy_process_group()

做完上述初始化后,需要对模型和数据做一些包装。需要注意的是,DDP在构建时会由0号进程将模型状态广播到其他进程,所以你不用担心各个进程中的模型初始值不同。

class ToyModel(nn.Module):
    """Toy network used by the demos: Linear(10, 10) -> ReLU -> Linear(10, 5)."""

    def __init__(self):
        super().__init__()
        # Creation order matches the order of use in forward().
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        """Return ``net2(relu(net1(x)))``."""
        out = self.net1(x)
        out = self.relu(out)
        out = self.net2(out)
        return out



def demo_basic(rank, world_size):
    """Run a single DDP training step on random data.

    Calls ``setup`` first and ``cleanup`` last.

    Args:
        rank: Rank of this process; also used as the device for the model
            and the labels.
        world_size: Number of processes, forwarded to ``setup``.
    """
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # Create the model, move it to the GPU whose id is ``rank``, wrap in DDP.
    net = ToyModel().to(rank)
    wrapped = DDP(net, device_ids=[rank])

    criterion = nn.MSELoss()
    sgd = optim.SGD(wrapped.parameters(), lr=0.001)

    # One forward / backward / update step on a random batch of 20 samples.
    sgd.zero_grad()
    batch = torch.randn(20, 10)
    predictions = wrapped(batch)
    targets = torch.randn(20, 5).to(rank)
    loss = criterion(predictions, targets)
    loss.backward()
    sgd.step()

    cleanup()



def run_demo(demo_fn, world_size):
    """Launch ``world_size`` processes running ``demo_fn`` via ``mp.spawn``.

    Args:
        demo_fn: Function to run in each process; ``world_size`` is supplied
            to it through ``args``.
        world_size: Number of processes to start.
    """
    spawn_options = {
        "args": (world_size,),
        "nprocs": world_size,
        "join": True,
    }
    mp.spawn(demo_fn, **spawn_options)

 

if __name__ == "__main__":
    # One process per visible GPU; the demo needs at least two of them.
    world_size = n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    run_demo(demo_fn=demo_basic, world_size=world_size)

 

####

##CMD:

#CUDA_VISIBLE_DEVICES="a,b,c" python main.py  

###

  以上就是一个简单的DDP使用示例。正如你所见,DDP拥有很清晰的API接口,从而你无需关心梯度的同步通信等问题。

一些注意事项:

A.这个示例与Getting Started with Distributed Data Parallel 保持一致,没有处理dataset, 具体处理方法可见上一节。

B. torch.cuda.set_device(rank)不被官方推荐,但根据个人情况添加。

C.mp方法存在初始化较慢的现象,但是后面的运行速度不会慢,如果你发现运行速度慢于launch方法,可以参考Why using mp.spawn is slower than using torch.distributed.launch when using multi-GPU training · Issue #47587 · pytorch/pytorch (github.com)

D.mp方法是pytorch官方的推荐方法,可以尝试使用。mp方法具有更灵活的进程控制(如果会用),运行时也不需要调用其他模块。

4.保存和载入Checkpoints

当使用DDP时,模型应该只在一个进程中被保存,然后再被载入到所有进程中。这样能避免重复保存并且保证各个进程的初始参数相同。你还需注意在保存完成之后再进行载入操作(例如使用dist.barrier()进行同步)。此外,在加载模块时,需要提供适当的map_location参数,以防止进程访问到其他进程的设备。如果未设置map_location,torch.load会首先将模块加载到CPU,然后将每个参数复制到它被保存时所在的设备,这将导致同一台机器上的所有进程使用相同的设备集。

def demo_checkpoint(rank, world_size):
    """Save a DDP checkpoint on rank 0, load it on every rank, run one step.

    Calls ``setup`` first and ``cleanup`` last; rank 0 deletes the checkpoint
    file at the end.

    Args:
        rank: Rank of this process; also used as its device index.
        world_size: Number of processes, forwarded to ``setup``.
    """
    print(f"Running DDP checkpoint example on rank {rank}.")
    setup(rank, world_size)

    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    # Created once and reused for the training step below (the original
    # constructed an identical second instance before the backward pass).
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    # os.path.join instead of string concatenation, so the separator always
    # matches the platform's temp directory.
    checkpoint_path = os.path.join(tempfile.gettempdir(), "model.checkpoint")
    if rank == 0:
        # All processes should see same parameters as they all start from same
        # random parameters and gradients are synchronized in backward passes.
        # Therefore, saving it in one process is sufficient.
        torch.save(ddp_model.state_dict(), checkpoint_path)

    # Use a barrier() to make sure that the other processes load the model
    # only after process 0 has saved it.
    dist.barrier()

    # Remap tensors saved from cuda:0 onto this rank's own device.
    map_location = {'cuda:0': f'cuda:{rank}'}
    ddp_model.load_state_dict(
        torch.load(checkpoint_path, map_location=map_location))

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    # Not necessary to use a dist.barrier() to guard the file deletion below
    # as the AllReduce ops in the backward pass of DDP already served as
    # a synchronization.
    if rank == 0:
        os.remove(checkpoint_path)

    cleanup()

5.DDP与“模型并行”相结合

class ToyMpModel(nn.Module):
    """Toy model split over two devices: net1 lives on dev0, net2 on dev1."""

    def __init__(self, dev0, dev1):
        """Create the layers and place net1 on ``dev0`` and net2 on ``dev1``."""
        super().__init__()
        self.dev0, self.dev1 = dev0, dev1
        self.net1 = torch.nn.Linear(10, 10).to(dev0)
        self.relu = torch.nn.ReLU()
        self.net2 = torch.nn.Linear(10, 5).to(dev1)

    def forward(self, x):
        """Move ``x`` to dev0 for net1 + ReLU, then to dev1 for net2."""
        on_first = x.to(self.dev0)
        hidden = self.relu(self.net1(on_first))
        on_second = hidden.to(self.dev1)
        return self.net2(on_second)

当DDP结合“模型并行”时,不能设置device_ids和output_device。输入和输出数据需要由应用程序或模型的forward()方法放置到合适的设备上(如上面ToyMpModel.forward中的x.to(...))。

def demo_model_parallel(rank, world_size):
    """Run one DDP training step with a model split across two devices.

    Calls ``setup`` first and ``cleanup`` last.

    Args:
        rank: Rank of this process; determines which two devices it uses.
        world_size: Number of processes; device indices wrap around it.
    """
    print(f"Running DDP with model parallel example on rank {rank}.")
    setup(rank, world_size)

    # Pick the two devices for this process, wrapping around world_size.
    first = rank * 2
    second = rank * 2 + 1
    dev0, dev1 = first % world_size, second % world_size

    split_model = ToyMpModel(dev0, dev1)
    # device_ids / output_device are left unset for the multi-device model.
    wrapped = DDP(split_model)

    criterion = nn.MSELoss()
    sgd = optim.SGD(wrapped.parameters(), lr=0.001)

    sgd.zero_grad()
    # Outputs will be on dev1, so the labels are placed there as well.
    predictions = wrapped(torch.randn(20, 10))
    targets = torch.randn(20, 5).to(dev1)
    loss = criterion(predictions, targets)
    loss.backward()
    sgd.step()

    cleanup()



if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    world_size = n_gpus

    # Run the three demos one after another with one process per GPU.
    for demo_fn in (demo_basic, demo_checkpoint, demo_model_parallel):
        run_demo(demo_fn, world_size)

 

你可能感兴趣的:(Pytorch,pytorch,人工智能)