DistributedDataParallel (DDP) is multi-process (typically one process per GPU), supports both data parallelism and model parallelism, and works on a single machine with multiple GPUs as well as across multiple machines. The processes only synchronize gradients with each other during the backward pass, so it runs more efficiently than DataParallel.
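Before the full example, here is a minimal sketch of the core DDP pattern (initialize the process group, wrap the model, shard the data with DistributedSampler). It assumes at least two GPUs with the NCCL backend; the linear model, random data, and port 29500 are placeholders for illustration only.

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data import DataLoader, TensorDataset, DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP

def worker(rank, world_size):
    # One process per GPU: bind this process to its GPU and join the process group
    torch.cuda.set_device(rank)
    dist.init_process_group("nccl", init_method="tcp://localhost:29500",
                            rank=rank, world_size=world_size)
    model = DDP(nn.Linear(10, 2).to(rank), device_ids=[rank])      # toy model
    dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))  # random placeholder data
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    loader = DataLoader(dataset, batch_size=8, sampler=sampler)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss_fn = nn.CrossEntropyLoss()
    for x, y in loader:
        loss = loss_fn(model(x.to(rank)), y.to(rank))
        loss.backward()          # gradients are all-reduced across processes here
        optimizer.step()
        optimizer.zero_grad()
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    mp.spawn(worker, args=(world_size,), nprocs=world_size, join=True)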
Below is a complete text classification example.
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
from datasets import Dataset
from torch.utils.data import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import BertTokenizer, BertForSequenceClassification

# Rendezvous address and port for the process group
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '5678'
# Load and tokenize the data
def create_dataset(data_file, tokenizer):
    print('create dataset')
    with open(data_file, 'r', encoding='utf-8') as f:
        data = [_.strip().split('\t') for _ in f.readlines()]
    x = [_[0] for _ in data]
    y = [int(_[1]) for _ in data]
    data_dict = {'text': x, 'label': y}
    dataset = Dataset.from_dict(data_dict)

    def preprocess_function(examples):
        text_token = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)
        return text_token

    dataset = dataset.map(preprocess_function, batched=True)
    # Convert these columns of the dataset to tensors
    dataset.set_format(type='torch', columns=['input_ids', 'label', 'attention_mask'])
    return dataset
# Evaluation
def evaluate(model, data_loader, rank):
    print('eval:')
    model.eval()
    true_labels = []
    pred_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(rank)
            attention_mask = batch['attention_mask'].to(rank)
            label = batch['label'].to(rank)
            outputs = model(input_ids, attention_mask)
            pred_label = torch.argmax(outputs.logits, dim=1)
            true_labels.append(label)
            pred_labels.append(pred_label)
    true_labels = torch.cat(true_labels)
    pred_labels = torch.cat(pred_labels)
    acc_count = torch.sum(true_labels == pred_labels)
    result = torch.tensor([acc_count.item(), true_labels.size(0)], dtype=torch.float32).to(rank)
    dist.barrier()
    # Merge the results from all processes
    dist.all_reduce(result, op=dist.ReduceOp.SUM)
    if rank == 0:
        acc = result[0] / result[1]
        print("Total test samples: %s, correct predictions: %s, accuracy: %s"
              % (int(result[1].item()), int(result[0].item()), round(acc.item(), 4)))
    model.train()
# Training (runs in each spawned process)
def example(rank, world_size, model, train_dataset, dev_dataset):
    # Bind this process to its GPU
    torch.cuda.set_device(rank)
    # Initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # Wrap the model for distributed training
    ddp_model = DDP(model.to(rank), device_ids=[rank])
    # Optimizer
    optimizer = optim.Adam(ddp_model.parameters(), lr=5e-5)
    # Shard the data across the processes
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
    dev_sampler = DistributedSampler(dev_dataset, num_replicas=world_size, rank=rank)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, pin_memory=True, sampler=train_sampler)
    dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=8, pin_memory=True, sampler=dev_sampler)
    for epoch in range(5):
        print('epoch', epoch)
        # Reshuffle the shards each epoch
        train_sampler.set_epoch(epoch)
        n = 0
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(rank)
            attention_mask = batch['attention_mask'].to(rank)
            label = batch['label'].to(rank)
            outputs = ddp_model(input_ids, attention_mask, labels=label)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            n += 1
            if n % 10 == 0:
                torch.cuda.empty_cache()  # reduce GPU memory usage
        # Evaluate on the dev set after each epoch
        evaluate(ddp_model, dev_dataloader, rank)
        torch.cuda.empty_cache()
    dist.destroy_process_group()
def main():
    train_file = './data/train.txt'
    dev_file = './data/dev.txt'
    model_name = '../model_hub/bert-base-chinese/'
    # Initialize the tokenizer and the model
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # Load the data
    train_dataset = create_dataset(train_file, tokenizer)
    dev_dataset = create_dataset(dev_file, tokenizer)
    world_size = 2
    # Spawn one training process per GPU
    mp.spawn(example,
             args=(world_size, model, train_dataset, dev_dataset),
             nprocs=world_size,
             join=True)

if __name__ == "__main__":
    main()
Run the script with:
CUDA_VISIBLE_DEVICES='6,7' python bert_ddp.py
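CUDA_VISIBLE_DEVICES='6,7' exposes only physical GPUs 6 and 7 to the script, so ranks 0 and 1 map onto them and match world_size = 2. To train on a different pair of GPUs, only this variable needs to change, for example:
CUDA_VISIBLE_DEVICES='0,1' python bert_ddp.py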