PyTorch Distributed Training: DDP

PyTorch supports multi-GPU training on a single machine and even across multiple machines. With most models now climbing toward 6 GB or even tens of gigabytes, a single card can barely keep up: last time, just running ChatGLM3-6B inference already consumed 8 GB, so full-parameter training on one card is probably out of reach. That makes multi-GPU distributed training worth studying now, with a ChatGLM fine-tune planned as a follow-up. The rough idea behind DDP: the model is replicated onto each GPU, each replica runs forward and backward on its own shard of the data, the resulting gradients are averaged across replicas, and every replica then applies the same parameter update. The forward pass is sketched in the figure below; the backward pass is more involved and is not described here, so interested readers can look it up themselves. That's it for the intro; the code follows.

[Figure 1: DDP forward pass across model replicas]
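
To make the gradient-averaging idea concrete, here is a minimal conceptual sketch of what DDP effectively does at the end of each backward pass. This is only an illustration under assumptions, not DDP's real implementation (which overlaps communication with the backward pass using bucketed all-reduce hooks), and the helper name average_gradients is made up for this sketch.

import torch
import torch.distributed as dist

def average_gradients(model: torch.nn.Module) -> None:
    """Conceptual sketch: after loss.backward(), sum each parameter's gradient
    across all processes and divide by the world size, so every replica steps
    its optimizer with the same averaged gradient."""
    world_size = dist.get_world_size()
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
            param.grad /= world_size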

Quick start

1. Import the required packages

import os
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

# Packages needed for multi-GPU training
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

2. Set up DDP

def ddp_setup(rank, world_size):
    """
    Initialize the process group; run once in every GPU process.
    """
    os.environ["MASTER_ADDR"] = "localhost"   # address of the rank-0 host
    os.environ["MASTER_PORT"] = "12355"       # any free port on that host
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
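
The setup above assumes we launch the worker processes ourselves with mp.spawn (see step 5). As an aside not covered in this post: if you launch with torchrun --nproc_per_node=N script.py instead, the launcher already exports MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE and LOCAL_RANK, so the setup can shrink to something like the sketch below.

# Sketch only: setup when the script is launched via torchrun instead of mp.spawn
def ddp_setup_torchrun():
    local_rank = int(os.environ["LOCAL_RANK"])   # set by torchrun for each process
    init_process_group(backend="nccl")           # rank/world_size are read from env vars
    torch.cuda.set_device(local_rank)
    return local_rank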

3. Build the dataset

class MyTrainDataset(Dataset):
    """
    Toy training dataset: random 20-dim inputs paired with random 1-dim targets.
    """
    def __init__(self, size):
        self.size = size
        self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.size
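
As a quick usage example that is not in the original post, the dataset can be sanity-checked on the CPU, and passing num_replicas and rank to DistributedSampler explicitly shows how the data will later be sharded across processes, without needing an initialized process group.

ds = MyTrainDataset(8)
x, y = ds[0]
print(x.shape, y.shape)                      # torch.Size([20]) torch.Size([1])

sampler = DistributedSampler(ds, num_replicas=2, rank=0, shuffle=True)
print(list(iter(sampler)))                   # the 4 indices assigned to rank 0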

4. Build the trainer

class Trainer:
    def __init__(self, model, train_dataloader, optimizer, gpu_id):
        self.gpu_id = gpu_id
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        # Move the model to this process's GPU, then wrap it in DDP so gradients
        # are synchronized across processes during backward()
        self.model = DDP(model.to(gpu_id), device_ids=[gpu_id])

    def _run_batch(self, xs, ys):
        self.optimizer.zero_grad()
        output = self.model(xs)
        # the toy targets are continuous values, so use a regression loss
        loss = F.mse_loss(output, ys)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        batch_size = len(next(iter(self.train_dataloader))[0])
        print(f"|GPU:{self.gpu_id}| Epoch:{epoch} | batchsize:{batch_size} | steps:{len(self.train_dataloader)}")
        # Re-seed the DistributedSampler so each epoch uses a different shuffle
        self.train_dataloader.sampler.set_epoch(epoch)
        for xs, ys in self.train_dataloader:
            xs = xs.to(self.gpu_id)
            ys = ys.to(self.gpu_id)
            self._run_batch(xs, ys)

    def train(self, max_epoch: int):
        for epoch in range(max_epoch):
            self._run_epoch(epoch)
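
One thing the trainer above does not do is save checkpoints. A common pattern under DDP, shown here as a hypothetical helper that is not part of the original post, is to write the checkpoint from rank 0 only and to unwrap the DDP wrapper via .module so the weights can later be loaded without DDP.

def save_checkpoint(trainer, epoch, path="checkpoint.pt"):
    # Hypothetical helper: after each optimizer step every rank holds identical
    # weights, so writing the file once (from rank 0) is sufficient
    if trainer.gpu_id == 0:
        torch.save(trainer.model.module.state_dict(), path)
        print(f"Epoch {epoch} | checkpoint saved to {path}")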

5. The main function

def main(rank, world_size, max_epochs, batch_size):
    ddp_setup(rank, world_size)

    train_dataset = MyTrainDataset(2048)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=False,   # shuffling is delegated to the DistributedSampler
                                  sampler=DistributedSampler(train_dataset))

    model = torch.nn.Linear(20, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    trainer = Trainer(model=model, gpu_id=rank, optimizer=optimizer, train_dataloader=train_dataloader)
    trainer.train(max_epochs)
    destroy_process_group()
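
To complete the quick start: main is launched once per GPU via mp.spawn, which injects the process rank as the implicit first argument (the same launcher appears again at the bottom of the full listing).

if __name__ == "__main__":
    max_epochs = 5
    batch_size = 32
    world_size = torch.cuda.device_count()   # one worker process per visible GPU
    mp.spawn(main, args=(world_size, max_epochs, batch_size), nprocs=world_size)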

Summary

The way I see it, DDP simply takes the familiar single-machine training loop and runs it on multiple GPUs or machines: the core training procedure stays the same, it just gains a few extra steps, and those steps can be wrapped up so they stay out of sight and the migration is painless. The complete code is below:

"""
pytorch分布式训练结构
"""
import os
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

# Packages needed for multi-GPU training
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group


def ddp_setup(rank, world_size):
    """
    Initialize the process group; run once in every GPU process.
    """
    os.environ["MASTER_ADDR"] = "localhost"   # address of the rank-0 host
    os.environ["MASTER_PORT"] = "12355"       # any free port on that host
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)


class MyTrainDataset(Dataset):
    """
    Toy training dataset: random 20-dim inputs paired with random 1-dim targets.
    """
    def __init__(self, size):
        self.size = size
        self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.size


class Trainer:
    def __init__(self, model, train_dataloader, optimizer, gpu_id):
        self.gpu_id = gpu_id
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        # Move the model to this process's GPU, then wrap it in DDP so gradients
        # are synchronized across processes during backward()
        self.model = DDP(model.to(gpu_id), device_ids=[gpu_id])

    def _run_batch(self, xs, ys):
        self.optimizer.zero_grad()
        output = self.model(xs)
        # the toy targets are continuous values, so use a regression loss
        loss = F.mse_loss(output, ys)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        batch_size = len(next(iter(self.train_dataloader))[0])
        print(f"|GPU:{self.gpu_id}| Epoch:{epoch} | batchsize:{batch_size} | steps:{len(self.train_dataloader)}")
        # Re-seed the DistributedSampler so each epoch uses a different shuffle
        self.train_dataloader.sampler.set_epoch(epoch)
        for xs, ys in self.train_dataloader:
            xs = xs.to(self.gpu_id)
            ys = ys.to(self.gpu_id)
            self._run_batch(xs, ys)

    def train(self, max_epoch: int):
        for epoch in range(max_epoch):
            self._run_epoch(epoch)


def main(rank, world_size, max_epochs, batch_size):
    ddp_setup(rank, world_size)

    train_dataset = MyTrainDataset(2048)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=False,   # shuffling is delegated to the DistributedSampler
                                  sampler=DistributedSampler(train_dataset))

    model = torch.nn.Linear(20, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    trainer = Trainer(model=model, gpu_id=rank, optimizer=optimizer, train_dataloader=train_dataloader)
    trainer.train(max_epochs)
    destroy_process_group()


if __name__ == "__main__":
    max_epochs = 5
    batch_size = 32
    world_size = torch.cuda.device_count()
    mp.spawn(main, args=(world_size, max_epochs, batch_size), nprocs=world_size)
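
To run the complete listing, save it to a file (for example ddp_demo.py, a name chosen just for illustration) and start it with plain python; mp.spawn then forks one worker per visible GPU. With two GPUs, the DistributedSampler splits the 2048 samples into 1024 per rank, so at batch_size=32 the print in _run_epoch reports 32 steps per rank per epoch, e.g. |GPU:0| Epoch:0 | batchsize:32 | steps:32.
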
Wrap-up

After a bit more preparation, the next post will use this setup for an MNIST demo. Stay tuned!
