PyTorch supports multi-GPU training on a single machine and even across multiple machines. Models these days are growing to 6 GB or even tens of GB, which is getting hard to handle with a single card. Last time, plain inference with ChatGLM3-6B already consumed 8 GB, so full-parameter training on one card is probably hopeless. So this post looks into multi-GPU distributed training, with a ChatGLM fine-tune to follow later. The rough idea of DDP: the model is replicated onto each GPU, each replica gets its own slice of the input data and computes gradients, the gradients from all replicas are averaged, and that averaged gradient is used for a synchronized parameter update. The forward pass is roughly as shown in the figure below; the backward pass is too involved to describe here, so interested readers can look it up themselves. That's it for the intro, now on to the code.
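To make the "average the gradients, then update" idea concrete, here is a minimal sketch (my own illustration, not part of the training script below) of what DDP effectively does for you during the backward pass, assuming a process group has already been initialized:

import torch
import torch.distributed as dist

def average_gradients(model: torch.nn.Module):
    """Roughly what DDP's backward hooks do automatically: sum each
    parameter's gradient over all replicas, then divide by world size."""
    world_size = dist.get_world_size()
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)  # sum across ranks
            param.grad /= world_size                           # mean gradient

With DDP you never call anything like this yourself: wrapping the model is enough, and the communication overlaps with the backward pass.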
1. Import the required packages
import os
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
# packages needed for multi-GPU training
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
2. Set up DDP
def ddp_setup(rank, world_size):
"""
    Initialize the distributed process group for each GPU (one process per rank).
"""
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
init_process_group(backend="nccl", rank=rank, world_size=world_size)
torch.cuda.set_device(rank)
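As a side note, the hard-coded MASTER_ADDR/MASTER_PORT and the explicit rank/world_size are only needed because we spawn the processes ourselves. If you launch the script with torchrun instead, the launcher exports MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE and LOCAL_RANK, so the setup can simply read the environment. A sketch of that variant (ddp_setup_torchrun is my own name, it is not used elsewhere in this article):

import os
import torch
from torch.distributed import init_process_group

def ddp_setup_torchrun():
    # with the default env:// init method, rank and world_size are
    # read from the environment variables that torchrun sets
    init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])  # which GPU on this machine
    torch.cuda.set_device(local_rank)
    return local_rank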
3. Build the dataset
class MyTrainDataset(Dataset):
"""
    Toy training dataset of random (feature, target) pairs.
"""
def __init__(self, size):
self.size = size
self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]
def __getitem__(self, index):
return self.data[index]
def __len__(self):
return self.size
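Before moving on, it may help to see how DistributedSampler splits this dataset across ranks. The quick single-process check below is my own illustration and is not part of the training script: num_replicas and rank are passed explicitly here, whereas inside the real script the sampler reads them from the initialized process group.

from torch.utils.data.distributed import DistributedSampler

dataset = MyTrainDataset(2048)
for rank in range(2):  # pretend there are 2 GPUs
    sampler = DistributedSampler(dataset, num_replicas=2, rank=rank, shuffle=False)
    indices = list(iter(sampler))
    print(f"rank {rank}: {len(indices)} samples, e.g. indices {indices[:4]}")
# each rank iterates over a disjoint half of the 2048 samples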
4. Build the trainer
class Trainer:
    def __init__(self, model, train_dataloader, optimizer, gpu_id):
        self.gpu_id = gpu_id
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        # move the model to this rank's GPU, then wrap it with DDP
        self.model = DDP(model.to(gpu_id), device_ids=[gpu_id])
def _run_batch(self, xs, ys):
self.optimizer.zero_grad()
output = self.model(xs)
        # random float targets make this a regression task, so use MSE
        # (cross_entropy expects class labels and is the wrong loss here)
        loss = F.mse_loss(output, ys)
loss.backward()
self.optimizer.step()
def _run_epoch(self, epoch):
batch_size = len(next(iter(self.train_dataloader))[0])
print(f"|GPU:{self.gpu_id}| Epoch:{epoch} | batchsize:{batch_size} | steps:{len(self.train_dataloader)}")
        # tell the DistributedSampler which epoch this is so the shuffle differs every epoch
self.train_dataloader.sampler.set_epoch(epoch)
for xs, ys in self.train_dataloader:
xs = xs.to(self.gpu_id)
ys = ys.to(self.gpu_id)
self._run_batch(xs, ys)
def train(self, max_epoch: int):
for epoch in range(max_epoch):
self._run_epoch(epoch)
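One thing this Trainer leaves out is checkpointing. Under DDP the original model sits behind the wrapper and is reached via self.model.module, and only rank 0 should write the file so that every process does not save the same checkpoint. A minimal sketch of such a helper, assuming the Trainer above (the function name and path are mine, not part of the article's code):

import torch

def save_checkpoint(trainer, epoch, path="checkpoint.pt"):
    """Save the underlying model's weights from rank 0 only."""
    if trainer.gpu_id == 0:                        # a single rank writes the file
        state = trainer.model.module.state_dict()  # .module unwraps the DDP wrapper
        torch.save(state, path)
        print(f"Epoch {epoch} | checkpoint saved to {path}")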
5. The main function
def main(rank, world_size, max_epochs, batch_size):
ddp_setup(rank, world_size)
    train_dataset = MyTrainDataset(2048)
train_dataloader = DataLoader(train_dataset,
batch_size=batch_size,
shuffle=False,
sampler=DistributedSampler(train_dataset))
model = torch.nn.Linear(20, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    trainer = Trainer(model=model, gpu_id=rank, optimizer=optimizer, train_dataloader=train_dataloader)
trainer.train(max_epochs)
destroy_process_group()
In my view, DDP simply takes the familiar single-device training loop and runs it on multiple devices: the core training process stays the same, there are just a few extra steps, and those steps can be wrapped up so they don't leak into the rest of the code, making the migration painless. The complete code is as follows:
"""
PyTorch distributed (DDP) training skeleton
"""
import os
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
# packages needed for multi-GPU training
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
def ddp_setup(rank, world_size):
"""
    Initialize the distributed process group for each GPU (one process per rank).
"""
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
init_process_group(backend="nccl", rank=rank, world_size=world_size)
torch.cuda.set_device(rank)
class MyTrainDataset(Dataset):
"""
    Toy training dataset of random (feature, target) pairs.
"""
def __init__(self, size):
self.size = size
self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]
def __getitem__(self, index):
return self.data[index]
def __len__(self):
return self.size
class Trainer:
    def __init__(self, model, train_dataloader, optimizer, gpu_id):
        self.gpu_id = gpu_id
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        # move the model to this rank's GPU, then wrap it with DDP
        self.model = DDP(model.to(gpu_id), device_ids=[gpu_id])
def _run_batch(self, xs, ys):
self.optimizer.zero_grad()
output = self.model(xs)
        # random float targets make this a regression task, so use MSE
        # (cross_entropy expects class labels and is the wrong loss here)
        loss = F.mse_loss(output, ys)
loss.backward()
self.optimizer.step()
def _run_epoch(self, epoch):
batch_size = len(next(iter(self.train_dataloader))[0])
print(f"|GPU:{self.gpu_id}| Epoch:{epoch} | batchsize:{batch_size} | steps:{len(self.train_dataloader)}")
        # tell the DistributedSampler which epoch this is so the shuffle differs every epoch
self.train_dataloader.sampler.set_epoch(epoch)
for xs, ys in self.train_dataloader:
xs = xs.to(self.gpu_id)
ys = ys.to(self.gpu_id)
self._run_batch(xs, ys)
def train(self, max_epoch: int):
for epoch in range(max_epoch):
self._run_epoch(epoch)
def main(rank, world_size, max_epochs, batch_size):
ddp_setup(rank, world_size)
    train_dataset = MyTrainDataset(2048)
train_dataloader = DataLoader(train_dataset,
batch_size=batch_size,
shuffle=False,
sampler=DistributedSampler(train_dataset))
model = torch.nn.Linear(20, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    trainer = Trainer(model=model, gpu_id=rank, optimizer=optimizer, train_dataloader=train_dataloader)
trainer.train(max_epochs)
destroy_process_group()
if __name__ == "__main__":
max_epochs = 5
batch_size = 32
world_size = torch.cuda.device_count()
mp.spawn(main, args=(world_size, max_epochs, batch_size), nprocs=world_size)
A bit more preparation, and then I'll put together an MNIST demo with this setup. Stay tuned!
If you found this article useful, please like and follow ->> your likes really help me out.
More technical discussion in the group:
130856474 <-- right here