使用PyTorch训练cifar10

训练cifar10

cifar10数据集相对较大,比mnist更适合测试不同算法下的性能。这里没有使用原始的cifar10的python数据,因为原始数据为了方便存储,采用的是序列化后的文件,而在实际中我们训练模型时通常都是直接获取图像,没有必要先pickle之后再unpickle。此例子用来展示一个简单的分类任务实现,网络部分没有自行设计,主要是从产品的角度实现分类这一功能。同时还有一个目的是用来测试PyTorch环境是否正常。

这段时间一直使用ubuntu20,无奈ubuntu20问题太多了:休眠后音频无输出,必须重启;alsa重启只对非网易云的软件有用;训练的时候视频、音频输出卡得和ppt一样,也就是说训练的时候你只能等它训练,听歌看电影都不行。因为我的训练机器就是我正在使用的服务器,所以图形化界面也是很重要的一部分;经常卡死,目前尚不确定是不是由图形化界面导致。经过n次卡死之后,我决定自此以后不再使用ubuntu,所有的工作全部在manjaro上完成。后续文章中会包含caffe、darknet相关的训练。此前在ubuntu上非常顺利地编译成功了,manjaro上目前还存在一些问题,后续有时间我会将caffe和darknet的编译配置更新在此系列文章中。

本地环境:

  • 软件环境
    • 操作系统:5.9.16-1-MANJARO
    • nvidia-driver:Driver Version: 460.39
    • CUDA:cuda_11.1
    • cudnn:8.0.5
    • PyTorch:1.8.0a0+963f762(源码编译)。之前尝试使用1.7,对cuda11的支持有些小问题,目前不知道是否已解决,所以使用的是源码编译版本。截至文章发布当日,PyTorch1.8正式版已经发布,但我没有使用官方预编译版;后续换用后如果没什么使用问题,再来更新本文。
  • 硬件:
    • RTX3090:为啥全都使用最新软件栈是因为RTX3090的驱动太新了,驱动和老CUDA配合不起来,所以没有采用cuda10的任何软件。全都基于最新版本编译。
    • CPU:i7-10700KF,内存32G

说明硬件和软件参数,是希望在较新软件环境下训练模型的同学可以参考我的环境,排查是否是自己的软件安装有问题;在我的软硬件条件下是可以正常训练的。tensorflow官方源基本都是基于cuda10编译的,而较新显卡的驱动太新,和cuda10配合时经常出问题,最好做到driver对应的cuda版本和本机安装的cuda版本一致。这里我的driver对应的cuda版本为11.2,安装的cuda版本为11.1,目前没有出现问题。PyTorch编译相对简单,这里不说编译流程了;tensorflow新版如果要支持cuda11,需要自己手动编译,tf编译稍微复杂一些,基于我本地环境编译的tensorflow可以从我的百度网盘下载。详情请查看我的另一篇文章。

Tensorflow源码编译相关

训练数据

Datasets/cifar10
├── airplane
├── automobile
├── bird
├── cat
├── deer
├── dog
├── frog
├── horse
├── ship
└── truck

cifar图像每个类别5000张图像,每张图像的size为32x32x3.转换好的数据:链接 提取码: euct 。

创建模型训练数据

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, Lambda
from torchvision import transforms
import os
from os.path import basename, dirname, join, exists, isdir
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import glob
from os.path import join
import shutil
from PIL import Image
from torch.utils.tensorboard import SummaryWriter
import glob

class PetDataset(Dataset):
    """Image-classification dataset read from a directory of class folders.

    ``image_dir`` must contain one sub-directory per class; every ``*.png``
    file directly inside a class directory becomes one sample. Only the file
    paths are collected up front; images are opened lazily in ``__getitem__``.

    Attributes:
        image_dir: Root directory that was scanned.
        img_label: Class names in sorted order; a sample's integer label is
            the index of its class name in this list.
        datasets: List of ``[image_filename, label]`` pairs.
        transform: Callable applied to each PIL image, or ``None``.
    """

    def __init__(self, image_dir, tansform):
        """Scan ``image_dir`` and remember the transform to apply per sample.

        Args:
            image_dir: Directory holding one sub-directory per class.
            tansform: Callable applied to the loaded RGB PIL image, or
                ``None`` to return the PIL image unchanged. The misspelled
                name is kept because callers pass it by keyword.
        """
        self.image_dir = image_dir
        self.img_label = self.get_classes()
        self.datasets = self.get_image_and_label()
        self.transform = tansform

    def get_image_and_label(self):
        """Return ``[image_filename, label]`` pairs for every PNG of every class."""
        datasets = []
        # enumerate() yields the same index that img_label.index() would,
        # without a linear search for every single image.
        for label, class_name in enumerate(self.img_label):
            class_image_path = join(self.image_dir, class_name)
            # glob returns files in arbitrary order; sort for reproducibility.
            image_filenames = sorted(glob.glob("{}/*.png".format(class_image_path)))
            for image_filename in image_filenames:
                datasets.append([image_filename, label])
        return datasets

    def get_classes(self):
        """Return the sorted names of the sub-directories of ``image_dir``.

        Sorting matters: ``os.listdir`` order is arbitrary, so without it two
        datasets built from different roots (e.g. train and val) could assign
        different integer labels to the same class name.
        """
        return sorted(
            class_name
            for class_name in os.listdir(self.image_dir)
            if isdir(join(self.image_dir, class_name))
        )

    def __len__(self):
        """Return the number of samples found."""
        return len(self.datasets)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is converted to RGB and passed through ``self.transform``
        when one was given.
        """
        image_filename, label = self.datasets[idx]
        # convert() returns an independent, fully loaded image, so the
        # underlying file can be closed right away instead of waiting for GC.
        with Image.open(image_filename) as source:
            image = source.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label

def train_model(model, criterion, optimizer, scheduler, dataloaders, device, dataset_sizes, num_epochs=25, log_dir='/tmp/cifar_event'):
    """Train ``model`` and return it loaded with the best-validation weights.

    Every epoch runs a ``'train'`` phase followed by a ``'val'`` phase. The
    weights with the highest validation accuracy seen so far are kept and
    restored into ``model`` before returning.

    Args:
        model: Network to optimize; called as ``model(inputs)``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per epoch, after the train phase.
        dataloaders: Mapping with ``'train'`` and ``'val'`` keys, each an
            iterable of ``(inputs, labels)`` batches.
        device: Device the batches are moved to before the forward pass.
        dataset_sizes: Mapping with ``'train'`` and ``'val'`` keys giving the
            number of samples per phase (used to average loss and accuracy).
        num_epochs: Number of epochs to run.
        log_dir: Directory for TensorBoard event files, or ``None`` to
            disable TensorBoard logging.

    Returns:
        The same ``model`` object, holding the best validation weights.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    writer = SummaryWriter(log_dir) if log_dir is not None else None

    try:
        for epoch in range(num_epochs):
            # Collected per phase and joined with spaces so the train and
            # val summaries do not run into each other on the output line.
            message_parts = ['Epoch {}/{}'.format(epoch, num_epochs - 1)]

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                is_train = phase == 'train'
                if is_train:
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # Track autograd history only while training.
                    with torch.set_grad_enabled(is_train):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if is_train:
                            loss.backward()
                            optimizer.step()

                    # loss is a batch mean; weight it by the batch size so the
                    # epoch average is correct with a smaller final batch.
                    running_loss += loss.item() * inputs.size(0)
                    # Accumulate a plain int so the statistics below also work
                    # when a loader yields no batches at all.
                    running_corrects += torch.sum(preds == labels).item()

                if is_train:
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects / dataset_sizes[phase]

                message_parts.append('{} Loss: {:.4f} Acc: {:.4f}'.format(
                    phase, epoch_loss, epoch_acc))
                if writer is not None:
                    writer.add_scalar('Loss/{}'.format(phase), epoch_loss, epoch)
                    writer.add_scalar('Accuracy/{}'.format(phase), epoch_acc, epoch)

                # deep copy the model
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
            print(' '.join(message_parts))
    finally:
        # Flush and release the event file even if training is interrupted.
        if writer is not None:
            writer.close()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60), 'Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


if __name__ == "__main__":
    # Fine-tune a pretrained ResNet-18 on the image folders under
    # dataset_path/train and dataset_path/val.
    batch_size = 1024
    dataset_path = '/tmp/cifar10'
    phases = ['train', 'val']

    # Both phases currently share the same preprocessing; the per-phase dict
    # is kept so that augmentation can be added to 'train' only.
    # NOTE(review): the mean/std values look like the normalization constants
    # torchvision documents for its ImageNet-pretrained models - confirm.
    data_transforms = {
        'train': transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }
    image_datasets = {
        phase: PetDataset(join(dataset_path, phase), tansform=data_transforms[phase])
        for phase in phases
    }

    class_names = image_datasets['train'].img_label
    # Labels are indices into each dataset's own class list, so the two lists
    # must be identical (same names, same order) or validation accuracy would
    # be computed against the wrong label ids.
    if image_datasets['val'].img_label != class_names:
        raise ValueError(
            'train and val class lists differ: {} vs {}'.format(
                class_names, image_datasets['val'].img_label))

    # Only the training data needs shuffling; evaluation order is irrelevant.
    dataloader = {
        phase: DataLoader(
            dataset=image_datasets[phase],
            batch_size=batch_size,
            shuffle=(phase == 'train'),
        )
        for phase in phases
    }
    dataset_sizes = {phase: len(image_datasets[phase]) for phase in phases}

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Replace the final fully connected layer so the output size matches the
    # number of classes found on disk.
    model_ft = models.resnet18(pretrained=True)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, len(class_names))

    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(
        optimizer_ft, step_size=7, gamma=0.1)
    model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, dataloader, device, dataset_sizes, num_epochs=25)
   

batch_size为128时,训练时间大概是:2分钟
batch_size为256时,训练时间大概是:1分27秒

训练输出:
使用PyTorch训练cifar10_第1张图片

导出onnx模型

# Export the fine-tuned network to ONNX. Run this after train_model() has
# returned model_ft.
# NOTE(review): model_outputs was not defined anywhere in the visible script;
# this default location is an assumption - point it at the real output directory.
model_outputs = '/tmp/cifar10_models'
os.makedirs(model_outputs, exist_ok=True)
# Build the dummy input on the device the model lives on, so the export also
# works when training fell back to the CPU.
export_device = next(model_ft.parameters()).device
dummy_input = torch.randn(1, 3, 32, 32, device=export_device)
torch.onnx.export(model_ft, dummy_input, join(model_outputs, 'cifar10_resnet.onnx'), opset_version=11, do_constant_folding=True, verbose=True)

你可能感兴趣的:(计算机视觉,PyTorch)