PyTorch Multi-GPU Distributed Data Parallel (DistributedDataParallel)

DistributedDataParallel (DDP) is multi-process: it implements data parallelism (and can be combined with model parallelism), and works on a single machine with multiple GPUs as well as across multiple machines. During training only gradients are communicated between processes, so it runs more efficiently than DataParallel.
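For comparison, here is a minimal sketch of the two APIs (illustrative only: a plain nn.Linear stands in for a real model, and setup_ddp would be called once inside every worker process):

import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# DataParallel: a single process drives all GPUs; the model is replicated and the
# input batch is scattered on every forward pass, outputs are gathered on GPU 0.
dp_model = nn.DataParallel(nn.Linear(10, 2).cuda())

# DistributedDataParallel: one process per GPU; each process keeps its own replica
# and only gradients are all-reduced during the backward pass.
def setup_ddp(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
    return DDP(nn.Linear(10, 2).to(rank), device_ids=[rank])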
Below is a complete text-classification example.

import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
from datasets import Dataset
from torch.utils.data import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import BertTokenizer, BertForSequenceClassification
 
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '5678'
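# The two variables above are read by init_process_group (default env:// rendezvous) to locate the rank-0 process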

# Load the data
def create_dataset(data_file, tokenizer):
    print('create dataset')
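    # Each line of data_file is expected to contain "text<TAB>label", where label is an integer class id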
    with open(data_file, 'r', encoding='utf-8') as f:
        data = [line.strip().split('\t') for line in f]
    x = [item[0] for item in data]
    y = [int(item[1]) for item in data]
    data_dict = {'text': x, 'label': y}
    dataset = Dataset.from_dict(data_dict)
    def preprocess_function(examples):
        text_token = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)
        return text_token
    dataset = dataset.map(preprocess_function, batched=True)
    # Convert these columns of the dataset to tensors
    dataset.set_format(type='torch', columns=['input_ids', 'label', 'attention_mask'])
    return dataset

# Evaluation
def evaluate(model, data_loader, rank):
    print('eval:')
    model.eval()
    true_labels = []
    pred_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(rank)
            attention_mask = batch['attention_mask'].to(rank)
            label = batch['label'].to(rank)
            outputs = model(input_ids, attention_mask)
            pred_label = torch.argmax(outputs.logits, dim=1)
            true_labels.append(label)
            pred_labels.append(pred_label)
    true_labels = torch.cat(true_labels)
    pred_labels = torch.cat(pred_labels)
    acc_count = torch.sum(true_labels == pred_labels)
    # Per-process result: [number of correct predictions, number of samples seen]
    result = torch.tensor([acc_count.item(), true_labels.size(0)], dtype=torch.float32).to(rank)
    dist.barrier() 
    # Merge the per-process results across all processes
    dist.all_reduce(result, op=dist.ReduceOp.SUM)
    if rank == 0:
        acc = result[0]/result[1]
        print("测试数据总量:%s, 预测正确数量:%s, 准确率:%s" % (int(result[1].item()), int(result[0].item()), round(acc.item(), 4)))
    model.train()

# Training
def example(rank, world_size, model, train_dataset, dev_dataset):
    # Bind this process to its own GPU
    torch.cuda.set_device(rank)
    # Initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # Wrap the model with DDP
    ddp_model = DDP(model.to(rank), device_ids=[rank])
    # Optimizer
    optimizer = optim.Adam(ddp_model.parameters(), lr=5e-5)
    # Shard the data across the processes
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
    dev_sampler = DistributedSampler(dev_dataset, num_replicas=world_size, rank=rank)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, pin_memory=True, sampler=train_sampler)
    dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=8, pin_memory=True, sampler=dev_sampler)
    
    
    for epoch in range(5):
        print('epoch', epoch)
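        # set_epoch makes DistributedSampler reshuffle with a different seed on every epoch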
        train_sampler.set_epoch(epoch)
        n = 0
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(rank)
            attention_mask = batch['attention_mask'].to(rank)
            label = batch['label'].to(rank)
            outputs = ddp_model(input_ids, attention_mask, labels=label)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            n += 1
            if n % 10 == 0:
                torch.cuda.empty_cache()  # free cached GPU memory before evaluation
                evaluate(ddp_model, dev_dataloader, rank)
                torch.cuda.empty_cache()
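    # Release the process group once training is finished
    dist.destroy_process_group()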

def main():
    train_file = './data/train.txt'
    dev_file = './data/dev.txt'
    model_name = '../model_hub/bert-base-chinese/'
    # Initialize the tokenizer and the model
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # Load the data
    train_dataset = create_dataset(train_file, tokenizer)
    dev_dataset = create_dataset(dev_file, tokenizer)
    world_size = 2
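    # mp.spawn launches world_size processes; the process index (used as the rank) is passed as the first argument of example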
    mp.spawn(example,
        args=(world_size, model, train_dataset, dev_dataset),
        nprocs=world_size,
        join=True)

if __name__=="__main__":
    main()

Run command:

CUDA_VISIBLE_DEVICES=6,7 python bert_ddp.py
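As an alternative to mp.spawn, the same training code can also be launched with torchrun, which sets MASTER_ADDR, MASTER_PORT, RANK, LOCAL_RANK and WORLD_SIZE itself; the worker then only needs to read LOCAL_RANK from the environment. A hypothetical sketch (not part of the script above; model construction and the training loop are elided):

# bert_ddp_torchrun.py -- hypothetical torchrun-based variant
import os
import torch
import torch.distributed as dist

def main():
    # torchrun provides rank / world size / master address via environment variables,
    # so init_process_group needs no explicit rank or world_size here.
    dist.init_process_group("nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    # ... build the model, wrap it with DDP(model.to(local_rank), device_ids=[local_rank]),
    # and run the same training loop as in example() above ...
    dist.destroy_process_group()

if __name__ == "__main__":
    main()

Launched with:

CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 bert_ddp_torchrun.py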
