cifar10数据集相对较大,比MNIST更适合测试不同算法下的性能。这里没有使用原始的cifar10的python数据,因为原始数据为了方便存储采用的是序列化后的文件;在实际中我们训练的模型通常都是直接获取图像,没有必要先pickle之后再unpickle。此例子用来展示一个简单的分类任务实现,网络部分没有自行设计,主要是从产品的角度实现分类这一功能。同时还有一个目的是用来测试PyTorch环境是否正常。
这段时间一直使用ubuntu20,无奈ubuntu20问题太多了:休眠后音频无输出必须重启,alsa重启只对非网易云的软件有用;训练的时候视频音频输出卡得和ppt一样,也就是说训练的时候你只能等它训练,听歌看电影都不行,因为我的训练机器是我正在使用的服务器。所以图形化界面也是很重要的一部分,经常卡死,目前尚不确定是不是由图形化界面导致。经过n次卡死之后我决定至此以后不再使用ubuntu,所有的工作全部在manjaro上完成。后续文章中会包含caffe、darknet相关的训练。此前在ubuntu上非常顺利地编译成功了,manjaro目前存在一些问题,后续有时间我会将caffe和darknet的编译配置更新在此系列文章中。
本地环境:
5.9.16-1-MANJARO
Driver Version: 460.39
cuda_11.1
1.8.0a0+963f762
,源码编译。之前尝试使用1.7,对cuda11支持有些小问题,目前不知道是否解决,所以使用的是源码编译版本。截至文章发布当日,PyTorch1.8版本已经更新,我没有使用公共编译版,后续更新后没啥使用问题再来更新。说明硬件和软件参数是希望有在较新软件下训练模型的同学可以参考我的环境,排查是否是自己软件安装有问题,我的软硬件条件下是可以正常训练的。tensorflow官方源基本都是基于cuda10编译的,较新的显卡驱动和cuda10配合的时候经常有问题,最好做到driver支持的cuda版本和装机的cuda版本一致:这里我的driver支持的cuda版本为11.2,装机的cuda版本为11.1,目前没有出现问题。PyTorch编译相对简单,这里不说编译流程了;tensorflow新版需要手动支持cuda11的话需要自己编译,tf编译稍微复杂一些,基于我本地环境编译的tensorflow可以从我的百度网盘下载,详情请查看我的另一篇文章。
Tensorflow源码编译相关
Datasets/cifar10
├── airplane
├── automobile
├── bird
├── cat
├── deer
├── dog
├── frog
├── horse
├── ship
└── truck
cifar图像每个类别5000张图像,每张图像的size为32x32x3。转换好的数据:链接 提取码: euct 。
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, Lambda
from torchvision import transforms
import os
from os.path import basename, dirname, join, exists, isdir
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import glob
from os.path import join
import shutil
from PIL import Image
from torch.utils.tensorboard import SummaryWriter
import glob
class PetDataset(Dataset):
    """Image-folder dataset: each subdirectory of ``image_dir`` is one class,
    and every ``*.png`` file inside it is a sample of that class.

    Args:
        image_dir: root directory containing one subdirectory per class
            (e.g. ``cifar10/train`` with ``airplane/``, ``cat/``, ...).
        tansform: torchvision transform applied to each decoded PIL image,
            or ``None`` to return raw PIL images. (Parameter name keeps the
            original misspelling for backward compatibility with callers
            that pass it by keyword.)
    """
    def __init__(self, image_dir, tansform):
        self.image_dir = image_dir
        self.img_label = self.get_classes()
        self.datasets = self.get_image_and_label()
        self.transform = tansform
    def get_image_and_label(self):
        """Collect ``[filename, label_index]`` pairs for every png of every class."""
        # Precompute name -> index once instead of list.index() per file.
        label_of = {name: i for i, name in enumerate(self.img_label)}
        datasets = []
        for class_name in self.img_label:
            pattern = join(self.image_dir, class_name, "*.png")
            for image_filename in glob.glob(pattern):
                datasets.append([image_filename, label_of[class_name]])
        return datasets
    def get_classes(self):
        """Return the class subdirectory names, sorted.

        Sorting makes label indices deterministic across runs and machines;
        plain ``os.listdir`` order is filesystem-dependent, which would make
        a saved model's class mapping unreproducible.
        """
        return sorted(
            name for name in os.listdir(self.image_dir)
            if isdir(join(self.image_dir, name))
        )
    def __len__(self):
        return len(self.datasets)
    def __getitem__(self, idx):
        image_filename, label = self.datasets[idx]
        # Decode lazily here (not in __init__) so memory stays bounded;
        # convert to RGB to guarantee 3 channels.
        image = Image.open(image_filename).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label
def train_model(model, criterion, optimizer, scheduler, dataloaders, device,
                dataset_sizes, num_epochs=25, log_dir='/tmp/cifar_event'):
    """Standard train/val loop that keeps the weights with the best val accuracy.

    Args:
        model: network to optimize (already moved to ``device``).
        criterion: loss function, e.g. ``nn.CrossEntropyLoss``.
        optimizer: optimizer stepping ``model``'s parameters.
        scheduler: LR scheduler, stepped once per epoch after the train phase.
        dataloaders: dict with ``'train'`` and ``'val'`` DataLoaders.
        device: device that inputs/labels are moved to per batch.
        dataset_sizes: dict with the sample count per phase, used to turn
            running sums into per-sample averages.
        num_epochs: number of epochs to run.
        log_dir: TensorBoard event directory (optional; defaults to the
            previously hard-coded path, so existing callers are unaffected).

    Returns:
        The model, reloaded with the best-validation-accuracy weights.
    """
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    writer = SummaryWriter(log_dir)
    try:
        for epoch in range(num_epochs):
            message = 'Epoch {}/{} '.format(epoch, num_epochs - 1)
            # Each epoch has a training and a validation phase.
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # enable dropout / batch-norm updates
                else:
                    model.eval()   # evaluation mode
                running_loss = 0.0
                running_corrects = 0
                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    # zero the parameter gradients
                    optimizer.zero_grad()
                    # forward; track gradient history only in the train phase
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)
                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()
                    # statistics: loss.item() is the batch mean, so re-weight
                    # by the batch size before averaging over the dataset.
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                if phase == 'train':
                    scheduler.step()
                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]
                # Trailing space separates the train and val segments (the
                # original format string ran them together, e.g. "0.9val").
                message += '{} Loss: {:.4f} Acc: {:.4f} '.format(
                    phase, epoch_loss, epoch_acc)
                writer.add_scalar('Loss/{}'.format(phase), epoch_loss, epoch)
                writer.add_scalar('Accuracy/{}'.format(phase), epoch_acc, epoch)
                # deep copy the model whenever val accuracy improves
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
            print(message)
    finally:
        # Flush pending events even if training is interrupted; the original
        # never closed the writer.
        writer.close()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60),
        'Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
if __name__ == "__main__":
    batch_size = 1024
    dataset_path = '/tmp/cifar10'
    # ImageNet normalization stats — the backbone below is ImageNet-pretrained,
    # so train and val share the same normalization.
    data_transforms = {
        'train': transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }
    pd = {phase: PetDataset(join(dataset_path, phase), tansform=data_transforms[phase])
          for phase in ['train', 'val']}
    dataloader = {phase: DataLoader(dataset=pd[phase], batch_size=batch_size, shuffle=True)
                  for phase in ['train', 'val']}
    dataset_sizes = {x: len(pd[x]) for x in ['train', 'val']}
    # (The original assigned class_names twice; once is enough.)
    class_names = pd['train'].img_label
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Fine-tune an ImageNet-pretrained ResNet-18: replace the 1000-way head
    # with a len(class_names)-way classifier.
    model_ft = models.resnet18(pretrained=True)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, len(class_names))
    model_ft = model_ft.to(device)
    criterion = nn.CrossEntropyLoss()
    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(
        optimizer_ft, step_size=7, gamma=0.1)
    model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                           dataloader, device, dataset_sizes, num_epochs=25)
128 batch的每个epoch训练时间大概是:2分钟
256 batch的每个epoch训练时间大概是:1分27秒
# Export the fine-tuned model to ONNX for deployment.
# Fixes: ``model_outputs`` was previously undefined (NameError); the dummy
# input was hard-coded to 'cuda', which fails on CPU-only machines even
# though the script itself falls back to CPU — use the model's device.
model_outputs = '/tmp'
dummy_input = torch.randn(1, 3, 32, 32, device=device)
torch.onnx.export(model_ft, dummy_input, join(model_outputs, 'cifar10_resnet.onnx'),
                  opset_version=11, do_constant_folding=True, verbose=True)