- https://pytorch.org/tutorials/beginner/dist_overview.html
- https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
class DataParallel(Module):
"""
Args:
module (Module): module to be parallelized
device_ids (list of int or torch.device): CUDA devices (default: all devices)
output_device (int or torch.device): device location of output (default: device_ids[0])
Attributes:
module (Module): the module to be parallelized
Example::
>>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
>>> output = net(input_var) # input_var can be on any device, including CPU
"""
def __init__(self, module, device_ids=None, output_device=None, dim=0):
super(DataParallel, self).__init__()
torch._C._log_api_usage_once("torch.nn.parallel.DataParallel")
device_type = _get_available_device_type()
if device_type is None:
self.module = module
self.device_ids = []
return
以上是从DataParallel类定义的代码中截取的部分:
device_ids代表并行的GPU的编号
output_device代表进行loss求取与梯度聚合的GPU编号(默认是GPU0,即device_ids[0],与上图一致)
module不用多讲,便是需要数据并行的模型
1).在文件导入模块部分进行以下设置
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,3"
这段代码设置了训练时可见的GPU:上面的设置表示使用物理编号为0和3的两块GPU(GPU0与GPU3)。在程序内部它们会被重新编号,对应的device_id分别是0和1,这一点在下面的步骤中非常有用。
2).接下来将定义好的模型放入DataParallel中:
# Wrap the model in DataParallel only when more than one CUDA device is visible.
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # device_ids index the visible devices (see CUDA_VISIBLE_DEVICES above).
    model = nn.DataParallel(model,device_ids = [0,1])
# model.to(device) moves the model onto the device that `device` refers to.
model.to(device)
3)将数据放入device(即cuda:0,device_id为0的GPU)中:
# Move each batch onto `device` before it is used.
for X, y in dataloader:
    X, y = X.to(device), y.to(device)
本次搭建了一个简单的全连接网络来实现分类任务:数据集为FashionMNIST,损失函数使用交叉熵损失,优化器使用SGD。以下是全部代码;如果要复现,建议先将数据集下载到本地。
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import os
import matplotlib.pyplot as plt
import time
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,3"
# Download training data from open datasets.
# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, two hidden ReLU layers, linear head.

    Every dimension of the input except the first (batch) one is flattened,
    then passed through ``Linear -> ReLU -> Linear -> ReLU -> Linear``. The
    output holds one raw score (logit) per class; no softmax is applied.

    Args:
        in_features: Number of values per sample after flattening.
            Defaults to ``28 * 28``.
        hidden_features: Width of both hidden layers. Defaults to 512.
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # Attribute names are kept as-is so state_dict keys stay compatible.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return the logits for batch ``x`` with shape ``(batch, num_classes)``."""
        return self.linear_relu_stack(self.flatten(x))
def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one optimisation pass over ``dataloader``.

    For every batch the inputs are moved to ``device``, the loss is computed,
    gradients are back-propagated and the optimizer takes one step. The
    current loss is printed on every 100th batch.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing a ``dataset``
            attribute with a length.
        model: Module to train; it is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used, so the function no longer
            depends on a module-level ``device`` variable.
    """
    if device is None:
        # Follow the model: its first parameter tells us where inputs must go.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        if device is not None:
            X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # Separate name so the loss tensor is not shadowed by a float.
            loss_value, current = loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f} [{current:>5d}/{size:>5d}]")
def test(dataloader, model, loss_fn, device=None):
    """Evaluate ``model`` on ``dataloader`` and report accuracy and mean loss.

    The model is switched to evaluation mode and run without gradient
    tracking. The loss is averaged over batches, the accuracy over samples.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing a ``dataset``
            attribute with a length.
        model: Module to evaluate.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used, so the function no longer
            depends on a module-level ``device`` variable.

    Returns:
        ``(accuracy, avg_loss)`` where ``accuracy`` is a fraction in
        ``[0, 1]``. A value is ``nan`` when there is nothing to average over
        (empty dataset or empty loader) instead of raising
        ``ZeroDivisionError``.
    """
    if device is None:
        # Follow the model: its first parameter tells us where inputs must go.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            if device is not None:
                X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    # Guard the averages so an empty evaluation set does not crash.
    test_loss = test_loss / num_batches if num_batches else float("nan")
    correct = correct / size if size else float("nan")
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct, test_loss


# The name starts with "test"; mark it so pytest does not collect this helper
# as a test case when the module's names are imported into a test file.
test.__test__ = False
if __name__ == '__main__':
    # Module-level name on purpose: helper functions that look up a global
    # `device` keep working.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # FashionMNIST is downloaded into ./data on first use.
    training_data = datasets.FashionMNIST(
        root='data',
        train=True,
        download=True,
        transform=ToTensor(),
    )
    test_data = datasets.FashionMNIST(
        root='data',
        train=False,
        download=True,
        transform=ToTensor(),
    )

    batch_size = 1024
    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    model = NeuralNetwork()
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # Each batch is split along dim 0 across the listed devices,
        # e.g. [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs.
        # The ids index the devices left visible by CUDA_VISIBLE_DEVICES.
        model = nn.DataParallel(model, device_ids=[0, 1])
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=2e-3)

    epochs = 5
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer)
        test(test_dataloader, model, loss_fn)
    elapsed_ms = (time.perf_counter() - started) * 1000
    print(f"time is {elapsed_ms} ms")
    print("Done!")