task04 Pytorch进阶训练技巧

task04 Pytorch进阶训练技巧

2022/6/22 雾切凉宫

继续以上一次的UNet网络为例,用它完成Carvana数据集的车辆分割任务。

Carvana数据集地址:Carvana Image Masking Challenge | Kaggle

1. 数据准备

首先还是先导入包:

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
import matplotlib.pyplot as plt
import PIL
from sklearn.model_selection import train_test_split
import os
import numpy as np
import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

读取数据集:

先从前面的Kaggle官网下载train.zip和train_masks.zip并解压,然后读取数据集,并划分为训练集和验证集。

class CarvanaDataset(Dataset):
    """Carvana photos paired with their segmentation masks.

    Files are read from ``<base_dir>/train`` (photos) and
    ``<base_dir>/train_masks`` (masks).  The mask of a photo is looked up by
    dropping the photo's last four characters (its extension) and appending
    ``"_mask.gif"``.

    Args:
        base_dir: Folder that contains the ``train`` and ``train_masks``
            sub-folders.  A trailing separator is optional.
        idx_list: Positions (into the sorted photo listing) that belong to
            this split.
        mode: ``"train"`` yields ``(image, mask)`` pairs; any other value
            yields the image only.
        transform: Optional callable applied to the image and, in train mode,
            to the mask as well.
    """

    def __init__(self, base_dir, idx_list, mode="train", transform=None):
        self.base_dir = base_dir
        self.idx_list = idx_list
        # Sort the listings: os.listdir returns entries in arbitrary order, and
        # the train/val splits are only disjoint if every dataset instance maps
        # an index to the same file.
        self.images = sorted(os.listdir(os.path.join(base_dir, "train")))
        self.masks = sorted(os.listdir(os.path.join(base_dir, "train_masks")))
        self.mode = mode
        self.transform = transform

    def __len__(self):
        """Return the number of samples in this split."""
        return len(self.idx_list)

    def __getitem__(self, index):
        """Load the sample at position ``index`` of this split.

        Returns:
            In train mode ``(image, mask)``; otherwise just ``image``.  When a
            transform is set, the transformed mask is binarised (every
            non-zero value becomes 1.0) and returned as float.  Without a
            transform the raw PIL objects are returned unchanged.
        """
        image_file = self.images[self.idx_list[index]]
        # Use the instance's own base_dir, not a module-level global.
        image = PIL.Image.open(os.path.join(self.base_dir, "train", image_file))

        if self.mode != "train":
            return image if self.transform is None else self.transform(image)

        mask_file = image_file[:-4] + "_mask.gif"
        mask = PIL.Image.open(
            os.path.join(self.base_dir, "train_masks", mask_file)
        )
        if self.transform is None:
            return image, mask

        image = self.transform(image)
        mask = self.transform(mask)
        # Resizing can produce intermediate values; collapse them to {0, 1}.
        mask[mask != 0] = 1.0
        return image, mask.float()
# Folder holding the unpacked `train/` and `train_masks/` directories.
base_dir = "./"
# Every image and mask is resized to 256x256 and converted to a tensor.
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])
# Split sample positions 70% / 30% into training and validation subsets.
train_idxs, val_idxs = train_test_split(
    range(len(os.listdir(os.path.join(base_dir, "train_masks")))),
    test_size=0.3,
)
train_data = CarvanaDataset(base_dir, train_idxs, transform=transform)
val_data = CarvanaDataset(base_dir, val_idxs, transform=transform)
train_loader = DataLoader(train_data, batch_size=6, shuffle=True)
# The validation loader must wrap val_data (it previously wrapped train_data,
# so validation metrics were measured on the training set).  Shuffling is
# unnecessary for evaluation.
val_loader = DataLoader(val_data, batch_size=6, shuffle=False)

数据集读取好了,输出几个示例图片看一下

# Fetch one batch from the training loader.
image, mask = next(iter(train_loader))

# Left panel: channel 0 of the first image in the batch.
plt.subplot(1, 2, 1)
plt.imshow(image[0][0])

# Right panel: channel 0 of the matching mask, drawn in grayscale.
plt.subplot(1, 2, 2)
plt.imshow(mask[0][0], cmap="gray")

task04 Pytorch进阶训练技巧_第1张图片

2. 模型训练

  • 定义交叉熵损失函数

  • 定义优化器,规定学习率

  • 实例化模型至显存

# Binary cross-entropy that takes raw logits (the sigmoid is applied inside).
criterion = nn.BCEWithLogitsLoss()
# Wrap the model and move it to the GPU *before* building the optimizer: the
# torch.optim documentation advises constructing optimizers only after the
# model has been moved with .cuda().
# `unet` is not defined in this file; it is presumably the UNet instance built
# in the previous tutorial.
unet = nn.DataParallel(unet).cuda()
optimizer = optim.Adam(unet.parameters(), lr=1e-3, weight_decay=1e-8)
  • Dice系数:图像分割任务中常用的评价指标,用来衡量预测结果与真实mask的重叠程度(也可以据此构造损失函数,见第3节)

  • 定义训练函数

  • 定义评估函数


def dice_coeff(pred, target):
    """Return the Dice coefficient between two batches as a 0-dim tensor.

    Both tensors are flattened per sample (first dimension kept), then the
    overlap is summed over the *whole batch*:
    ``(2 * sum(pred * target) + eps) / (sum(pred) + sum(target) + eps)``.

    Args:
        pred: Predicted scores, first dimension is the batch size.
        target: Ground-truth mask with the same number of elements per sample.
    """
    eps = 0.0001  # keeps the ratio defined when both inputs are all zeros
    num = pred.size(0)
    # reshape (unlike view) also accepts non-contiguous tensors, e.g. the
    # result of a permute/transpose or a strided slice.
    m1 = pred.reshape(num, -1)
    m2 = target.reshape(num, -1)
    intersection = (m1 * m2).sum()
    return (2.0 * intersection + eps) / (m1.sum() + m2.sum() + eps)
def train(epoch):
    """Run one optimisation pass over `train_loader` and print the mean loss.

    Relies on the module-level `unet`, `optimizer`, `criterion` and
    `train_loader`.  The printed loss is the per-sample average: each batch
    loss is weighted by its batch size and divided by the dataset length.

    Args:
        epoch: Epoch number, used only in the printed message.
    """
    unet.train()
    running_loss = 0
    for images, masks in train_loader:
        images = images.cuda()
        masks = masks.cuda()

        optimizer.zero_grad()
        batch_loss = criterion(unet(images), masks)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item() * images.size(0)

    train_loss = running_loss / len(train_loader.dataset)
    print('Epoch:{} \tTraining Loss: {: .6f}'.format(epoch, train_loss))
def val(epoch):
    """Evaluate `unet` on `val_loader`; print the learning rate, loss and Dice.

    Relies on the module-level `unet`, `optimizer`, `criterion`, `val_loader`
    and `dice_coeff`.  Loss and Dice are averaged per sample (each batch value
    is weighted by its batch size).  Dice is computed on the sigmoid
    probabilities, without thresholding.

    Args:
        epoch: Epoch number, used only in the printed message.
    """
    current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
    print("current learning rate:", current_lr)

    unet.eval()
    loss_total = 0
    dice_total = 0
    with torch.no_grad():
        for images, masks in val_loader:
            images = images.cuda()
            masks = masks.cuda()
            batch_size = images.size(0)

            logits = unet(images)
            loss_total += criterion(logits, masks).item() * batch_size

            probs = torch.sigmoid(logits).cpu()
            dice_total += dice_coeff(probs, masks.cpu()) * batch_size

    n_samples = len(val_loader.dataset)
    val_loss = loss_total / n_samples
    dice_score = dice_total / n_samples
    print('Epoch: {} \t Validation Loss: {:.6f}, Dice score: {: .6f}'.format(epoch, val_loss, dice_score))


开始训练

# Train for a fixed number of epochs, validating after every one.
epochs = 100
for epoch in range(1, 1 + epochs):
    # Ask PyTorch to release cached GPU memory before each epoch; the guard
    # skips the call on builds that do not provide empty_cache.
    if hasattr(torch.cuda, 'empty_cache'):
        torch.cuda.empty_cache()

    train(epoch)
    val(epoch)

下面这条命令可以用于查看gpu占用情况

!nvidia-smi
Wed Jun 22 20:11:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.77       Driver Version: 512.77       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   55C    P3    23W /  N/A |    242MiB /  6144MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A      3860    C+G   ...txyewy\MiniSearchHost.exe    N/A      |
|    0   N/A  N/A      5688    C+G   ...artMenuExperienceHost.exe    N/A      |
|    0   N/A  N/A      7464    C+G   ...d\runtime\WeChatAppEx.exe    N/A      |
|    0   N/A  N/A      8448    C+G   ...tracted\WechatBrowser.exe    N/A      |
|    0   N/A  N/A     16996    C+G   ...cw5n1h2txyewy\LockApp.exe    N/A      |
|    0   N/A  N/A     19540    C+G   ...erver\YourPhoneServer.exe    N/A      |
|    0   N/A  N/A     19660    C+G   ...2txyewy\TextInputHost.exe    N/A      |
|    0   N/A  N/A     21580    C+G   ...n1h2txyewy\SearchHost.exe    N/A      |
|    0   N/A  N/A     26356      C   ...vs\pytorch_enc\python.exe    N/A      |
+-----------------------------------------------------------------------------+

3. 自定义损失函数

如果我们不想使用交叉熵函数,而是想针对分割模型常用的Dice系数设计专门的loss,即DiceLoss, 这时就需要我们自定义PyTorch的损失函数

推荐还是使用类的方法来书写。不过不同于模型定义,并不需要初始化很多属性,只要重写forward函数即可

class DiceLoss(nn.Module):
    """Soft Dice loss on raw logits: ``1 - Dice(sigmoid(inputs), targets)``.

    The Dice ratio is computed over all elements of the batch at once.

    Args:
        weight: Accepted for signature compatibility; not used.
        size_average: Accepted for signature compatibility; not used.
    """

    def __init__(self, weight=None, size_average=True):
        super().__init__()

    def forward(self, inputs, targets, smooth=1):
        """Compute the loss.

        Args:
            inputs: Raw model outputs (logits); a sigmoid is applied here.
            targets: Ground-truth mask with the same number of elements.
            smooth: Added to numerator and denominator so the ratio stays
                defined when both prediction and target are empty.

        Returns:
            A 0-dim tensor holding ``1 - dice``.
        """
        # reshape (unlike view) also works for non-contiguous tensors.
        probs = torch.sigmoid(inputs).reshape(-1)
        flat_targets = targets.reshape(-1)
        intersection = (probs * flat_targets).sum()
        dice = (2.0 * intersection + smooth) / (
            probs.sum() + flat_targets.sum() + smooth
        )
        return 1 - dice

从评估集中取一个batch的图片,并使用新定义的损失函数进行评估

# Instantiate the custom loss defined above.
newcriterion = DiceLoss()

# Put the model in evaluation mode, then score one validation batch.
unet.eval()
image, mask = next(iter(val_loader))
# NOTE(review): the forward pass is not wrapped in torch.no_grad(), so the
# resulting loss tensor still carries autograd history.
out_unet = unet(image.cuda())
loss = newcriterion(out_unet, mask.cuda())
print(loss)

tensor(0.6993, device='cuda:0', grad_fn=<RsubBackward1>)

4. 动态调整学习率

随着优化的进行,固定的学习率可能无法满足优化的需求,这时需要调整学习率,降低优化的速度
这里演示使用PyTorch自带的StepLR scheduler动态调整学习率的效果,文字版教程中给出了自定义scheduler的方式

4.1 定义动态学习率

其中step_size为调整学习率的频率,gamma为每次调整的倍率

# Multiply the learning rate by `gamma` every `step_size` calls to
# scheduler.step(); here that is a 20% reduction on each call.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=1,
    gamma=0.8,
)

4.2 使用动态学习率进行训练

# Same train/validate loop as before, now followed by a scheduler step so the
# learning rate is adjusted once per epoch.
epochs = 100
for epoch in range(1, 1 + epochs):
    train(epoch)
    val(epoch)

    # Step the scheduler after the epoch's optimizer updates.
    scheduler.step()
    

5. 模型微调

设置unet最后输出层的conv不进行参数梯度更新

# Freeze the whole output convolution.  Iterating over parameters() covers
# the bias as well as the weight (freezing only `.weight` would leave the bias
# trainable).
for param in unet.module.outc.conv.parameters():
    param.requires_grad = False
    # Drop any gradient left over from earlier training: an optimizer that
    # still sees a stale (zeroed) .grad may keep updating the parameter
    # through its momentum / weight-decay terms.
    param.grad = None

# Show which parameters are still trainable.
for layer, param in unet.named_parameters():
    print(layer, "\t", param.requires_grad)
    

6. 半精度训练

可以把原来32位浮点数压缩成16位浮点数。可以让显存压力小很多QAQ

from torch.cuda.amp import autocast

对模型的修改只需要在forward函数前添加 @autocast() 装饰器即可

class UNet_half(nn.Module):
    """UNet variant whose forward pass is decorated with ``@autocast()``.

    The layout is one input block, four `Down` stages, four `Up` stages and an
    output block.  Each `Up` stage receives the previous result together with
    the output of the mirrored earlier stage.

    Args:
        n_channels: Passed to the first block as its first argument
            (presumably the number of input image channels).
        n_classes: Passed to the output block as its second argument
            (presumably the number of output channels).
        bilinear: Forwarded to every `Up` stage.  When true, the widths handed
            to `down4` and `up1`..`up3` are halved.
    """

    def __init__(self, n_channels, n_classes, bilinear=True):
        super().__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.bilinear = bilinear

        # Width divisor applied to the deepest stage and the first three
        # `Up` stages.
        if bilinear:
            factor = 2
        else:
            factor = 1

        # Submodules are created in the same order as before so parameter
        # registration order is unchanged.
        self.inc = DoubleConv(n_channels, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        self.down4 = Down(512, 1024 // factor)
        self.up1 = Up(1024, 512 // factor, bilinear)
        self.up2 = Up(512, 256 // factor, bilinear)
        self.up3 = Up(256, 128 // factor, bilinear)
        self.up4 = Up(128, 64, bilinear)
        self.outc = OutConv(64, n_classes)

    @autocast()
    def forward(self, x):
        """Return the output block's result for input ``x`` (raw logits)."""
        skip1 = self.inc(x)
        skip2 = self.down1(skip1)
        skip3 = self.down2(skip2)
        skip4 = self.down3(skip3)
        deepest = self.down4(skip4)

        # Walk back up, pairing every `Up` stage with its mirrored skip output.
        merged = self.up1(deepest, skip4)
        for up_stage, skip in (
            (self.up2, skip3),
            (self.up3, skip2),
            (self.up4, skip1),
        ):
            merged = up_stage(merged, skip)

        return self.outc(merged)
# Build the autocast-decorated UNet (n_channels=3, n_classes=1), wrap it in
# DataParallel and move it to the GPU.
unet_half = nn.DataParallel(UNet_half(n_channels=3, n_classes=1)).cuda()

训练函数和评估函数也需要修改:在for循环中加入 with autocast(): 语句

def dice_coeff(pred, target):
    """Return the Dice coefficient between two batches as a 0-dim tensor.

    Both tensors are flattened per sample (first dimension kept), then the
    overlap is summed over the *whole batch*:
    ``(2 * sum(pred * target) + eps) / (sum(pred) + sum(target) + eps)``.

    Args:
        pred: Predicted scores, first dimension is the batch size.
        target: Ground-truth mask with the same number of elements per sample.
    """
    eps = 0.0001  # keeps the ratio defined when both inputs are all zeros
    num = pred.size(0)
    # reshape (unlike view) also accepts non-contiguous tensors, e.g. the
    # result of a permute/transpose or a strided slice.
    m1 = pred.reshape(num, -1)
    m2 = target.reshape(num, -1)
    intersection = (m1 * m2).sum()
    return (2.0 * intersection + eps) / (m1.sum() + m2.sum() + eps)
def train_half(epoch, scaler=None):
    """Run one mixed-precision training epoch and print the mean loss.

    Only the forward pass and the loss computation run inside ``autocast``.
    The backward pass and the optimizer step are executed outside of it, as
    the PyTorch AMP documentation recommends.

    Relies on the module-level `unet`, `optimizer`, `criterion` and
    `train_loader`.
    NOTE(review): this trains `unet` (the model the module-level optimizer was
    built from), not `unet_half` -- confirm that is intended.

    Args:
        epoch: Epoch number, used only in the printed message.
        scaler: Optional ``torch.cuda.amp.GradScaler``.  When given, the loss
            is scaled before ``backward`` to protect small float16 gradients
            from underflow.  Create it once and pass the same instance every
            epoch.  With the default ``None`` no scaling is applied.
    """
    unet.train()
    running_loss = 0
    for data, mask in train_loader:
        data, mask = data.cuda(), mask.cuda()
        optimizer.zero_grad()

        # Mixed precision applies to the forward pass and loss only.
        with autocast():
            output = unet(data)
            loss = criterion(output, mask)

        if scaler is None:
            loss.backward()
            optimizer.step()
        else:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        running_loss += loss.item() * data.size(0)

    train_loss = running_loss / len(train_loader.dataset)
    print('Epoch:{} \tTraining Loss: {: .6f}'.format(epoch, train_loss))
def val_half(epoch):
    """Evaluate with a mixed-precision forward pass; print loss and Dice.

    Relies on the module-level `unet`, `optimizer`, `criterion`, `val_loader`
    and `dice_coeff`.  Loss and Dice are averaged per sample (each batch value
    is weighted by its batch size).
    NOTE(review): this evaluates `unet`, not `unet_half` -- confirm that is
    intended.

    Args:
        epoch: Epoch number, used only in the printed message.
    """
    print("current learning rate:", optimizer.state_dict()["param_groups"][0]["lr"])
    unet.eval()
    val_loss = 0
    dice_score = 0
    with torch.no_grad():
        for data, mask in val_loader:
            data, mask = data.cuda(), mask.cuda()

            # Mixed precision applies to the forward pass and loss only.
            with autocast():
                output = unet(data)
                loss = criterion(output, mask)

            val_loss += loss.item() * data.size(0)

            # The autocast forward pass can return float16 logits.  Summing
            # float16 probabilities over a whole batch can exceed the float16
            # range and corrupt the Dice score, so compute the metric in
            # float32.
            probs = torch.sigmoid(output.float())
            dice_score += dice_coeff(probs.cpu(), mask.cpu()) * data.size(0)

    val_loss = val_loss / len(val_loader.dataset)
    dice_score = dice_score / len(val_loader.dataset)
    print('Epoch: {} \t Validation Loss: {:.6f}, Dice score: {: .6f}'.format(epoch, val_loss, dice_score))


你可能感兴趣的:(深入浅出pytorch,pytorch,深度学习,python)