unet pytorch

1. 单机多卡版本:代码中的 DistributedDataParallel(DDP)部分对应单机多卡的分布式训练方式。

import os

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from torchvision.transforms import (
    Compose,
    RandomHorizontalFlip,
    RandomResizedCrop,
    RandomRotation,
    RandomVerticalFlip,
    ToTensor,
)

# 定义ResNet块
class ResNetBlock(nn.Module):
    """Residual block computing ``relu(conv2(relu(conv1(x))) + shortcut(x))``.

    Both convolutions are 3x3 with padding 1, so the spatial size of the
    input is preserved.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)
        # The input can only be added to the conv output when the channel
        # counts match; otherwise project it with a 1x1 convolution.
        # nn.Identity holds no parameters, so the equal-channel case keeps
        # the same state_dict keys as a block without a shortcut module.
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor with ``out_channels`` channels."""
        residual = self.shortcut(x)
        out = self.relu(self.conv1(x))
        out = self.conv2(out)
        out = out + residual
        return self.relu(out)

# 定义UNet模型
class UNet(nn.Module):
    """Small UNet with residual encoder blocks and three skip connections.

    The encoder widens the features 64 -> 128 -> 256 -> 512 and halves the
    resolution with a 2x2 max-pool between stages. The decoder upsamples
    with stride-2 transposed convolutions and concatenates the encoder
    feature map of matching resolution at every level.

    The input height and width must be divisible by 8: the three pooling
    steps floor odd sizes, after which the upsampled maps would no longer
    match the skip connections in ``torch.cat``.

    Args:
        in_channels: Number of channels of the input image.
        out_channels: Number of channels (e.g. classes) of the output map.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # Encoder.
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, padding=1)
        self.block1 = ResNetBlock(64, 64)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.block2 = ResNetBlock(128, 128)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.block3 = ResNetBlock(256, 256)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.block4 = ResNetBlock(512, 512)

        # Decoder. After each concatenation the channel count doubles, so a
        # 3x3 convolution fuses the skip features back down to the width the
        # next transposed convolution expects.
        self.upconv3 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.dec3 = nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )
        self.upconv2 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.dec2 = nn.Sequential(
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )
        self.upconv1 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        # Final 1x1 projection of the 64 + 64 concatenated channels.
        self.conv5 = nn.Conv2d(128, out_channels, kernel_size=1)

    def forward(self, x):
        """Return a map with ``out_channels`` channels at the input resolution."""
        # Encoder path; x1..x3 are kept for the skip connections.
        x1 = self.block1(self.conv1(x))
        x2 = self.block2(self.conv2(self.pool1(x1)))
        x3 = self.block3(self.conv3(self.pool2(x2)))
        x4 = self.block4(self.conv4(self.pool3(x3)))

        # Decoder path: upsample, concatenate the skip, fuse.
        x = self.upconv3(x4)                # 512 -> 256 channels
        x = torch.cat((x, x3), dim=1)       # 256 + 256
        x = self.dec3(x)                    # 512 -> 256
        x = self.upconv2(x)                 # 256 -> 128
        x = torch.cat((x, x2), dim=1)       # 128 + 128
        x = self.dec2(x)                    # 256 -> 128
        x = self.upconv1(x)                 # 128 -> 64
        x = torch.cat((x, x1), dim=1)       # 64 + 64
        return self.conv5(x)                # 128 -> out_channels


# 定义数据集类
class CustomDataset(Dataset):
    """Dataset of ``(image, mask)`` pairs for segmentation training.

    Args:
        data_dir: Location of the dataset; forwarded to :meth:`_load_data`.
        transform: Optional callable applied to both the image and the mask.
    """

    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.data = self._load_data(data_dir)
        self.transform = transform

    def _load_data(self, data_dir):
        """Return an indexable collection of ``(image, mask)`` pairs.

        Loading is dataset-specific and deliberately left to the user:
        override this method (or replace its body) with code that reads the
        samples found under ``data_dir``.

        Raises:
            NotImplementedError: Always, until a loader is provided.
        """
        raise NotImplementedError(
            "CustomDataset._load_data must be implemented to load "
            f"(image, mask) pairs from {data_dir!r}"
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(image, mask)`` pair at ``index``, transformed if configured."""
        image, mask = self.data[index]
        if self.transform is not None:
            # A random augmentation must hit the image and its mask with the
            # same parameters, otherwise the two no longer line up. Replaying
            # the RNG state makes the second call draw the same numbers.
            # Assumes the transform takes its randomness from torch's global
            # CPU RNG and that image and mask have the same spatial size.
            # NOTE(review): interpolating transforms may blend label values
            # in the mask -- confirm the mask format and interpolation mode.
            rng_state = torch.get_rng_state()
            image = self.transform(image)
            torch.set_rng_state(rng_state)
            mask = self.transform(mask)
        return image, mask

# Training configuration.
#
# To use several GPUs on one machine, launch one process per GPU with
#     torchrun --nproc_per_node=<num_gpus> <this_script>.py
# torchrun exports RANK / LOCAL_RANK / WORLD_SIZE and the rendezvous address.
# When those variables are absent the script falls back to plain
# single-process training.
num_epochs = 10
batch_size = 4  # per process

distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ
use_cuda = torch.cuda.is_available()
local_rank = int(os.environ.get("LOCAL_RANK", 0))
if use_cuda:
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
else:
    device = torch.device("cpu")
if distributed:
    # DDP refuses to wrap a model before the default process group exists.
    dist.init_process_group(backend="nccl" if use_cuda else "gloo")

# Build the UNet model and its optimizer.
model = UNet(in_channels=3, out_channels=2).to(device)
if distributed:
    model = DDP(model, device_ids=[local_rank] if use_cuda else None)

optimizer = optim.Adam(model.parameters(), lr=0.001)
# Built once instead of on every training step.
# NOTE(review): with class-index targets CrossEntropyLoss expects integer
# masks of shape (N, H, W); confirm what the dataset and ToTensor yield.
criterion = nn.CrossEntropyLoss()

# Data augmentation pipeline.
transform = Compose([
    RandomHorizontalFlip(),
    RandomVerticalFlip(),
    RandomRotation(15),
    RandomResizedCrop(256, scale=(0.8, 1.0)),
    ToTensor(),
])

# Load the dataset with augmentation. In distributed mode the sampler gives
# every process its own shard and takes over shuffling.
dataset = CustomDataset(data_dir="path_to_dataset", transform=transform)
sampler = DistributedSampler(dataset) if distributed else None
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=(sampler is None),
    sampler=sampler,
    num_workers=4,
)

# Training loop.
for epoch in range(num_epochs):
    if sampler is not None:
        # Without this every epoch would reuse the same shuffling order.
        sampler.set_epoch(epoch)
    model.train()
    total_loss = 0.0
    for images, masks in dataloader:
        images = images.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, masks)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Average over the batches seen by this process.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/max(len(dataloader), 1)}")

if distributed:
    dist.destroy_process_group()

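2. 多机多卡版本:使用 torch.utils.data.distributed.DistributedSampler 和 torch.distributed.init_process_group 来实现多机多卡的分布式训练,确保每个进程拿到不同的数据划分,并且进程之间能够正常通信。注意:下面的示例通过 torch.multiprocessing.spawn 按本机 GPU 数量启动进程;真正跨多台机器运行时,还需要在每台机器上设置 MASTER_ADDR 和 MASTER_PORT,并以全局 rank 和总进程数(world_size)初始化进程组。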

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel
from torchvision.transforms import transforms
from torchvision.datasets import YourDataset
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist

# 定义ResNet块
class ResNetBlock(nn.Module):
    """Residual block computing ``relu(conv2(relu(conv1(x))) + shortcut(x))``.

    Both convolutions are 3x3 with padding 1, so the spatial size of the
    input is preserved.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)
        # The input can only be added to the conv output when the channel
        # counts match; otherwise project it with a 1x1 convolution.
        # nn.Identity holds no parameters, so the equal-channel case keeps
        # the same state_dict keys as a block without a shortcut module.
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor with ``out_channels`` channels."""
        residual = self.shortcut(x)
        out = self.relu(self.conv1(x))
        out = self.conv2(out)
        out = out + residual
        return self.relu(out)

# 定义UNet模型
class UNet(nn.Module):
    """Small UNet with residual encoder blocks and three skip connections.

    The encoder widens the features 64 -> 128 -> 256 -> 512 and halves the
    resolution with a 2x2 max-pool between stages. The decoder upsamples
    with stride-2 transposed convolutions and concatenates the encoder
    feature map of matching resolution at every level.

    The input height and width must be divisible by 8: the three pooling
    steps floor odd sizes, after which the upsampled maps would no longer
    match the skip connections in ``torch.cat``.

    Args:
        in_channels: Number of channels of the input image.
        out_channels: Number of channels (e.g. classes) of the output map.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # Encoder.
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, padding=1)
        self.block1 = ResNetBlock(64, 64)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.block2 = ResNetBlock(128, 128)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.block3 = ResNetBlock(256, 256)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.block4 = ResNetBlock(512, 512)

        # Decoder. After each concatenation the channel count doubles, so a
        # 3x3 convolution fuses the skip features back down to the width the
        # next transposed convolution expects.
        self.upconv3 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.dec3 = nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )
        self.upconv2 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.dec2 = nn.Sequential(
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )
        self.upconv1 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        # Final 1x1 projection of the 64 + 64 concatenated channels.
        self.conv5 = nn.Conv2d(128, out_channels, kernel_size=1)

    def forward(self, x):
        """Return a map with ``out_channels`` channels at the input resolution."""
        # Encoder path; x1..x3 are kept for the skip connections.
        x1 = self.block1(self.conv1(x))
        x2 = self.block2(self.conv2(self.pool1(x1)))
        x3 = self.block3(self.conv3(self.pool2(x2)))
        x4 = self.block4(self.conv4(self.pool3(x3)))

        # Decoder path: upsample, concatenate the skip, fuse.
        x = self.upconv3(x4)                # 512 -> 256 channels
        x = torch.cat((x, x3), dim=1)       # 256 + 256
        x = self.dec3(x)                    # 512 -> 256
        x = self.upconv2(x)                 # 256 -> 128
        x = torch.cat((x, x2), dim=1)       # 128 + 128
        x = self.dec2(x)                    # 256 -> 128
        x = self.upconv1(x)                 # 128 -> 64
        x = torch.cat((x, x1), dim=1)       # 64 + 64
        return self.conv5(x)                # 128 -> out_channels

# 定义数据集类
class CustomDataset(Dataset):
    """Dataset of ``(image, mask)`` pairs for segmentation training.

    Args:
        data_dir: Location of the dataset; forwarded to :meth:`_load_data`.
        transform: Optional callable applied to both the image and the mask.
    """

    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.data = self._load_data(data_dir)
        self.transform = transform

    def _load_data(self, data_dir):
        """Return an indexable collection of ``(image, mask)`` pairs.

        Loading is dataset-specific and deliberately left to the user:
        override this method (or replace its body) with code that reads the
        samples found under ``data_dir``.

        Raises:
            NotImplementedError: Always, until a loader is provided.
        """
        raise NotImplementedError(
            "CustomDataset._load_data must be implemented to load "
            f"(image, mask) pairs from {data_dir!r}"
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(image, mask)`` pair at ``index``, transformed if configured."""
        image, mask = self.data[index]
        if self.transform is not None:
            # A random augmentation must hit the image and its mask with the
            # same parameters, otherwise the two no longer line up. Replaying
            # the RNG state makes the second call draw the same numbers.
            # Assumes the transform takes its randomness from torch's global
            # CPU RNG and that image and mask have the same spatial size.
            # NOTE(review): interpolating transforms may blend label values
            # in the mask -- confirm the mask format and interpolation mode.
            rng_state = torch.get_rng_state()
            image = self.transform(image)
            torch.set_rng_state(rng_state)
            mask = self.transform(mask)
        return image, mask

def main(rank, world_size):
    """Train the UNet inside one worker process of a distributed job.

    Args:
        rank: Index of this process in the job. It is also used as the CUDA
            device index, which is only valid when all processes run on the
            same machine.
        world_size: Total number of participating processes.
    """
    import os

    # 'env://' reads the rendezvous address from the environment. Processes
    # started through torch.multiprocessing.spawn do not get it set for
    # them, so default to a rendezvous on the local machine.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")

    # Distributed setup; fall back to CPU + gloo when no GPU is present.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
    else:
        device = torch.device("cpu")
    torch.distributed.init_process_group(
        backend='nccl' if use_cuda else 'gloo',
        init_method='env://',
        world_size=world_size,
        rank=rank,
    )

    try:
        # Training parameters.
        num_epochs = 10
        batch_size_per_gpu = 4

        # Build the UNet model and its optimizer.
        in_channels = 3
        model = UNet(in_channels=in_channels, out_channels=2).to(device)
        model = DistributedDataParallel(
            model, device_ids=[rank] if use_cuda else None
        )
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Data augmentation pipeline. ``scale`` is the fraction of the source
        # area to crop, so its upper bound is capped at 1.0.
        transform = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(30),
            transforms.RandomResizedCrop(256, scale=(0.8, 1.0)),
            transforms.ToTensor()
        ])

        # Load the training set; the sampler hands each rank its own shard.
        train_dataset = CustomDataset(data_dir="path_to_dataset", transform=transform)
        train_sampler = DistributedSampler(
            train_dataset, num_replicas=world_size, rank=rank
        )
        train_loader = DataLoader(
            train_dataset, batch_size=batch_size_per_gpu, sampler=train_sampler
        )

        # Training loop.
        for epoch in range(num_epochs):
            # Without this every epoch would reuse the same shuffling order.
            train_sampler.set_epoch(epoch)
            model.train()
            total_loss = 0.0

            for images, masks in train_loader:
                images = images.to(device)
                masks = masks.to(device)

                # Forward and backward pass.
                optimizer.zero_grad()
                outputs = model(images)
                # NOTE(review): this loss needs float targets with the same
                # shape as ``outputs`` (2 channels here); confirm the mask
                # format produced by the dataset.
                loss = F.binary_cross_entropy_with_logits(outputs, masks)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            # all_reduce only works on tensors. Sum both the accumulated loss
            # and the batch count over all ranks to get the global per-batch
            # mean (each loss.item() is already a per-batch average).
            stats = torch.tensor(
                [total_loss, float(len(train_loader))], device=device
            )
            torch.distributed.all_reduce(stats)
            mean_loss = (stats[0] / stats[1].clamp(min=1)).item()
            if rank == 0:
                print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss:.4f}")
    finally:
        torch.distributed.destroy_process_group()

def main_multi_gpu():
    """Start training with one worker process per visible CUDA device.

    With zero or one device, ``main`` runs directly in the current process
    as rank 0 of a single-process job.
    """
    gpu_count = torch.cuda.device_count()
    if gpu_count <= 1:
        main(0, 1)
        return
    # spawn passes the process index as the first argument to ``main``.
    torch.multiprocessing.spawn(
        main,
        args=(gpu_count,),
        nprocs=gpu_count,
        join=True,
    )


if __name__ == '__main__':
    main_multi_gpu()

你可能感兴趣的:(pytorch,深度学习,人工智能)