Several ways to use model parallelism
If you run on the server, you need to request the number of GPUs, e.g. gpu:2:
#!/bin/bash
#SBATCH -J LIYIHAO
#SBATCH --output=cataract_RL_%j.out
#SBATCH --error=cataract_RL_%j.txt
#SBATCH --gres=gpu:2
#SBATCH --nodelist=ns3185995
#SBATCH -c 9
srun singularity exec --nv /home/yihao/GOALS/ra_new.sif python3 Basic_Usage.py
import torch
import torch.nn as nn
import torch.optim as optim

print('cuda', torch.cuda.is_available())
print('gpu number', torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(i))

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = torch.nn.Linear(10, 10).to('cuda:0')  # first half on GPU 0
        self.relu = torch.nn.ReLU()
        self.net2 = torch.nn.Linear(10, 5).to('cuda:1')   # second half on GPU 1

    def forward(self, x):
        x = self.relu(self.net1(x.to('cuda:0')))
        return self.net2(x.to('cuda:1'))                  # move activations to GPU 1

model = ToyModel()
for name, param in model.named_parameters():
    print(name, param.device)

loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

optimizer.zero_grad()
outputs = model(torch.randn(20, 10))
labels = torch.randn(20, 5).to('cuda:1')                  # labels must live on the output GPU
loss_fn(outputs, labels).backward()
optimizer.step()
print('finish')
Output:
cuda True
gpu number 2
Tesla V100S-PCIE-32GB
Tesla V100S-PCIE-32GB
net1.weight cuda:0
net1.bias cuda:0
net2.weight cuda:1
net2.bias cuda:1
finish
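One drawback of this naive split is that only one GPU is busy at any given time. A common refinement is pipelining: split each batch into micro-batches so that cuda:1 can process one micro-batch while cuda:0 already works on the next. Below is a minimal sketch built on the same ToyModel; the class name PipelineToyModel and the split_size value are illustrative, not part of the original code.

class PipelineToyModel(ToyModel):
    def __init__(self, split_size=10):
        super(PipelineToyModel, self).__init__()
        self.split_size = split_size

    def forward(self, x):
        # iterate over micro-batches so the two GPUs overlap their work
        splits = iter(x.split(self.split_size, dim=0))
        s_prev = self.relu(self.net1(next(splits).to('cuda:0'))).to('cuda:1')
        outputs = []
        for s_next in splits:
            outputs.append(self.net2(s_prev))                                 # runs on cuda:1
            s_prev = self.relu(self.net1(s_next.to('cuda:0'))).to('cuda:1')   # runs on cuda:0
        outputs.append(self.net2(s_prev))
        return torch.cat(outputs)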
import torch.nn as nn
from torchvision.models.resnet import ResNet, Bottleneck

num_classes = 1000

class ModelParallelResNet50(ResNet):
    def __init__(self, *args, **kwargs):
        super(ModelParallelResNet50, self).__init__(
            Bottleneck, [3, 4, 6, 3], num_classes=num_classes, *args, **kwargs)

        self.seq1 = nn.Sequential(
            self.conv1,
            self.bn1,
            self.relu,
            self.maxpool,
            self.layer1,
            self.layer2
        ).to('cuda:0')  # placed on the first GPU

        self.seq2 = nn.Sequential(
            self.layer3,
            self.layer4,
            self.avgpool,
        ).to('cuda:1')  # placed on the second GPU

        self.fc.to('cuda:1')

    def forward(self, x):
        x = self.seq2(self.seq1(x).to('cuda:1'))
        return self.fc(x.view(x.size(0), -1))

model = ModelParallelResNet50()
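A minimal training-step sketch for this model (batch size and image size are made up for illustration). Since forward() does not move the input itself, the input has to be on cuda:0 already, while the labels belong on cuda:1 where the fc layer lives:

import torch
import torch.nn as nn
import torch.optim as optim

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

inputs = torch.randn(8, 3, 224, 224)                        # dummy data, sizes are illustrative
labels = torch.randint(0, num_classes, (8,)).to('cuda:1')   # targets on the output GPU

optimizer.zero_grad()
outputs = model(inputs.to('cuda:0'))   # seq1 expects its input on cuda:0
loss = loss_fn(outputs, labels)
loss.backward()                        # autograd handles the cross-GPU copies automatically
optimizer.step()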
The approach above inherits from an existing network and combines its layers into new blocks, each placed on a specified GPU. The same idea also works with a pretrained model from timm:
import torch
import torch.nn as nn
import torch.optim as optim
import timm

Res_timm2 = timm.create_model('resnet34', pretrained=True)

class ModelParallelResNet34(nn.Module):
    def __init__(self, Res_timm2):
        super(ModelParallelResNet34, self).__init__()
        self.seq1 = nn.Sequential(
            Res_timm2.conv1,
            Res_timm2.bn1,
            Res_timm2.act1,
            Res_timm2.maxpool,
            Res_timm2.layer1,
            Res_timm2.layer2
        ).to('cuda:0')  # placed on the first GPU

        self.seq2 = nn.Sequential(
            Res_timm2.layer3,
            Res_timm2.layer4,
            Res_timm2.global_pool,
        ).to('cuda:1')  # placed on the second GPU

        self.fc = Res_timm2.fc.to('cuda:1')

    def forward(self, x):
        x = self.seq2(self.seq1(x).to('cuda:1'))
        return self.fc(x)

model2 = ModelParallelResNet34(Res_timm2)

print("----method 2 Resnet34 = ")
for name, param in model2.named_parameters():
    print(name, param.device)
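As a quick sanity check (the input size is illustrative), a dummy batch fed to the wrapped model should come back on cuda:1, the device of its fc layer; timm's global_pool already flattens the pooled features for a Linear classifier, so no extra view is needed before fc:

x = torch.randn(4, 3, 224, 224).to('cuda:0')   # seq1 expects its input on cuda:0
out = model2(x)
print(out.shape, out.device)                   # expected: torch.Size([4, 1000]) cuda:1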