All of the code has been uploaded to my GitHub repository: https://github.com/zgcr/pytorch-ImageNet-CIFAR-COCO-VOC-training
If you find it useful, please give it a star!
All of the code below has been tested under PyTorch 1.4 and confirmed to run correctly.
CIFAR100 images are only 32x32, while the original ResNet begins with a large 7x7 convolution (stride 2) followed by max pooling; this stem downsamples such small inputs too aggressively and performs poorly on CIFAR100. Following https://github.com/weiaicunzai/pytorch-cifar100, we replace the first convolution with a kernel_size=3, stride=1, padding=1 convolution and remove the subsequent max-pooling layer.
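To make the change concrete, here is a minimal sketch contrasting the two stems (the variable names are illustrative; the real definition appears in the full model code below):

```python
import torch.nn as nn

# Original ImageNet-style ResNet stem: a 7x7/stride-2 convolution followed by
# 3x3/stride-2 max pooling, which shrinks a 32x32 CIFAR image to 8x8 before
# the first residual stage.
imagenet_stem = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
    nn.BatchNorm2d(64), nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

# CIFAR-style stem: a 3x3/stride-1 convolution with no max pooling, so the
# first residual stage still sees the full 32x32 resolution.
cifar_stem = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False),
    nn.BatchNorm2d(64), nn.ReLU(inplace=True))
```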
The full modified ResNet is as follows:
"""
https://github.com/weiaicunzai/pytorch-cifar100/blob/master/models/resnet.py
"""
import os
import sys
BASE_DIR = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.dirname(
os.path.abspath(__file__)))))
sys.path.append(BASE_DIR)
from public.path import pretrained_models_path
import torch
import torch.nn as nn
__all__ = [
'resnet18',
'resnet34',
'resnet50',
'resnet101',
'resnet152',
]
model_urls = {
'resnet18':
'{}/resnetforcifar/resnet18-cifar-acc78.41.pth'.format(
pretrained_models_path),
'resnet34':
'{}/resnetforcifar/resnet34-cifar-acc78.84.pth'.format(
pretrained_models_path),
'resnet50':
'{}/resnetforcifar/resnet50-cifar-acc77.88.pth'.format(
pretrained_models_path),
'resnet101':
'{}/resnetforcifar/resnet101-cifar-acc80.16.pth'.format(
pretrained_models_path),
'resnet152':
'{}/resnetforcifar/resnet152-cifar-acc80.99.pth'.format(
pretrained_models_path),
}
class BasicBlock(nn.Module):
    """Basic Block for resnet 18 and resnet 34"""
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super(BasicBlock, self).__init__()
        self.residual_branch = nn.Sequential(
            nn.Conv2d(in_channels,
                      out_channels,
                      kernel_size=3,
                      stride=stride,
                      padding=1,
                      bias=False), nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels,
                      out_channels * BasicBlock.expansion,
                      kernel_size=3,
                      padding=1,
                      bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion))

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels,
                          out_channels * BasicBlock.expansion,
                          kernel_size=1,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion))

    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual_branch(x) +
                                     self.shortcut(x))
class BottleNeck(nn.Module):
    """Residual block for resnet over 50 layers"""
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1):
        super(BottleNeck, self).__init__()
        self.residual_branch = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels,
                      out_channels,
                      stride=stride,
                      kernel_size=3,
                      padding=1,
                      bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels,
                      out_channels * BottleNeck.expansion,
                      kernel_size=1,
                      bias=False),
            nn.BatchNorm2d(out_channels * BottleNeck.expansion),
        )

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels,
                          out_channels * BottleNeck.expansion,
                          stride=stride,
                          kernel_size=1,
                          bias=False),
                nn.BatchNorm2d(out_channels * BottleNeck.expansion))

    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual_branch(x) +
                                     self.shortcut(x))
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=100, inter_layer=False):
        super(ResNet, self).__init__()
        self.inter_layer = inter_layer
        self.in_channels = 64
        # CIFAR stem: 3x3/stride-1 conv, no max pooling
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(64), nn.ReLU(inplace=True))
        self.stage2 = self._make_layer(block, 64, layers[0], 1)
        self.stage3 = self._make_layer(block, 128, layers[1], 2)
        self.stage4 = self._make_layer(block, 256, layers[2], 2)
        self.stage5 = self._make_layer(block, 512, layers[3], 2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """Make a resnet stage. (Here a 'layer' is not a single network
        layer such as a conv layer; one stage may contain more than one
        residual block.)
        Args:
            block: block type, BasicBlock or BottleNeck
            out_channels: output channel number of this stage
            num_blocks: how many blocks per stage
            stride: the stride of the first block of this stage
        Return:
            a resnet stage as an nn.Sequential
        """
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        if self.inter_layer:
            x1 = self.stage2(x)
            x2 = self.stage3(x1)
            x3 = self.stage4(x2)
            x4 = self.stage5(x3)
            x = self.avg_pool(x4)
            x = x.view(x.size(0), -1)
            x = self.fc(x)

            return [x1, x2, x3, x4, x]
        else:
            x = self.stage2(x)
            x = self.stage3(x)
            x = self.stage4(x)
            x = self.stage5(x)
            x = self.avg_pool(x)
            x = x.view(x.size(0), -1)
            x = self.fc(x)

            return x
def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    model = ResNet(block, layers, **kwargs)
    # only load state_dict()
    if pretrained:
        model.load_state_dict(
            torch.load(model_urls[arch], map_location=torch.device('cpu')))

    return model


def resnet18(pretrained=False, progress=True, **kwargs):
    return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
                   **kwargs)


def resnet34(pretrained=False, progress=True, **kwargs):
    return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
                   **kwargs)


def resnet50(pretrained=False, progress=True, **kwargs):
    return _resnet('resnet50', BottleNeck, [3, 4, 6, 3], pretrained, progress,
                   **kwargs)


def resnet101(pretrained=False, progress=True, **kwargs):
    return _resnet('resnet101', BottleNeck, [3, 4, 23, 3], pretrained,
                   progress, **kwargs)


def resnet152(pretrained=False, progress=True, **kwargs):
    return _resnet('resnet152', BottleNeck, [3, 8, 36, 3], pretrained,
                   progress, **kwargs)
```
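As a quick sanity check, the modified network accepts a CIFAR-sized batch directly. A minimal sketch (assuming the file above is saved as resnetforcifar.py, with the repository's public package on sys.path so its imports resolve):

```python
import torch

from resnetforcifar import resnet18  # hypothetical module name for the file above

model = resnet18(num_classes=100)
x = torch.randn(2, 3, 32, 32)  # a dummy batch of CIFAR100-sized images
y = model(x)
print(y.shape)  # expected: torch.Size([2, 100])
```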
We use commonly used CIFAR100 training hyperparameters; the config.py file is as follows:
```python
import os
import sys

BASE_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)

from public.path import CIFAR100_path

import numpy as np
import torchvision.transforms as transforms
import torchvision.datasets as datasets


class Config:
    log = "./log"  # Path to save log
    checkpoint_path = "./checkpoints"  # Path to store model
    resume = "./checkpoints/latest.pth"
    evaluate = None  # evaluate-only mode: set this to a trained model path
    train_dataset_path = CIFAR100_path
    val_dataset_path = CIFAR100_path
    # download CIFAR100 from here: https://www.cs.toronto.edu/~kriz/cifar.html

    pretrained = False
    seed = 0
    num_classes = 100

    milestones = [60, 120, 160]
    epochs = 200
    batch_size = 128
    accumulation_steps = 1
    lr = 0.1
    gamma = 0.2
    momentum = 0.9
    weight_decay = 5e-4
    num_workers = 4
    print_interval = 30
    apex = False

    train_transform = transforms.Compose([
        transforms.Pad(4, padding_mode='reflect'),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32),
        transforms.ToTensor(),
        transforms.Normalize(
            np.array([125.3, 123.0, 113.9]) / 255.0,
            np.array([63.0, 62.1, 66.7]) / 255.0),
    ])
    val_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            np.array([125.3, 123.0, 113.9]) / 255.0,
            np.array([63.0, 62.1, 66.7]) / 255.0),
    ])
    train_dataset_init = {
        "root": train_dataset_path,
        "train": True,
        "download": True,
        "transform": train_transform
    }
    val_dataset_init = {
        "root": val_dataset_path,
        "train": False,
        "download": True,
        "transform": val_transform
    }
```
The train.py file is as follows:
```python
import sys
import os
import argparse
import random
import time
import warnings

BASE_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)
warnings.filterwarnings('ignore')

from apex import amp
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
from thop import profile
from thop import clever_format
from torch.utils.data import DataLoader
from config import Config
from public.distillation.models.resnetforcifar import resnet50
from torchvision.datasets import CIFAR100
from public.imagenet.utils import DataPrefetcher, get_logger, AverageMeter, accuracy


def parse_args():
    parser = argparse.ArgumentParser(description='PyTorch CIFAR100 Training')
    parser.add_argument('--lr',
                        type=float,
                        default=Config.lr,
                        help='learning rate')
    parser.add_argument('--momentum',
                        type=float,
                        default=Config.momentum,
                        help='momentum')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=Config.weight_decay,
                        help='weight decay')
    parser.add_argument('--gamma',
                        type=float,
                        default=Config.gamma,
                        help='gamma')
    # type=list doesn't parse CLI input correctly; use nargs='+' instead
    parser.add_argument('--milestones',
                        type=int,
                        nargs='+',
                        default=Config.milestones,
                        help='optimizer milestones')
    parser.add_argument('--epochs',
                        type=int,
                        default=Config.epochs,
                        help='num of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=Config.batch_size,
                        help='batch size')
    parser.add_argument('--accumulation_steps',
                        type=int,
                        default=Config.accumulation_steps,
                        help='gradient accumulation steps')
    # type=bool is a trap (bool('False') is True); use a store_true flag
    parser.add_argument('--pretrained',
                        action='store_true',
                        default=Config.pretrained,
                        help='load pretrained model params or not')
    parser.add_argument('--num_classes',
                        type=int,
                        default=Config.num_classes,
                        help='model classification num')
    parser.add_argument('--num_workers',
                        type=int,
                        default=Config.num_workers,
                        help='number of workers to load data')
    parser.add_argument('--resume',
                        type=str,
                        default=Config.resume,
                        help='put the path to resuming file if needed')
    parser.add_argument('--checkpoints',
                        type=str,
                        default=Config.checkpoint_path,
                        help='path for saving trained models')
    parser.add_argument('--log',
                        type=str,
                        default=Config.log,
                        help='path to save log')
    parser.add_argument('--evaluate',
                        type=str,
                        default=Config.evaluate,
                        help='path for evaluate model')
    parser.add_argument('--seed', type=int, default=Config.seed, help='seed')
    parser.add_argument('--print_interval',
                        type=int,
                        default=Config.print_interval,
                        help='print interval')
    parser.add_argument('--apex',
                        action='store_true',
                        default=Config.apex,
                        help='use apex or not')

    return parser.parse_args()
def train(train_loader, model, criterion, optimizer, scheduler, epoch, logger,
          args):
    top1 = AverageMeter()
    top5 = AverageMeter()
    losses = AverageMeter()

    # switch to train mode
    model.train()

    iters = len(train_loader.dataset) // args.batch_size
    prefetcher = DataPrefetcher(train_loader)
    inputs, labels = prefetcher.next()
    iter_index = 1

    while inputs is not None:
        inputs, labels = inputs.cuda(), labels.cuda()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss = loss / args.accumulation_steps

        if args.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if iter_index % args.accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        # measure accuracy and record loss
        acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
        top1.update(acc1.item(), inputs.size(0))
        top5.update(acc5.item(), inputs.size(0))
        losses.update(loss.item(), inputs.size(0))

        inputs, labels = prefetcher.next()

        if iter_index % args.print_interval == 0:
            logger.info(
                f"train: epoch {epoch:0>3d}, iter [{iter_index:0>4d}, {iters:0>4d}], lr: {scheduler.get_lr()[0]:.6f}, top1 acc: {acc1.item():.2f}%, top5 acc: {acc5.item():.2f}%, loss_total: {loss.item():.2f}"
            )

        iter_index += 1

    scheduler.step()

    return top1.avg, top5.avg, losses.avg
def validate(val_loader, model, args):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for inputs, labels in val_loader:
            data_time.update(time.time() - end)
            inputs, labels = inputs.cuda(), labels.cuda()
            outputs = model(inputs)
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
            top1.update(acc1.item(), inputs.size(0))
            top5.update(acc5.item(), inputs.size(0))
            batch_time.update(time.time() - end)
            end = time.time()

    throughput = 1.0 / (batch_time.avg / inputs.size(0))

    return top1.avg, top5.avg, throughput
def main(logger, args):
    if not torch.cuda.is_available():
        raise Exception("need gpu to train network!")

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True

    gpus = torch.cuda.device_count()
    logger.info(f'use {gpus} gpus')
    logger.info(f"args: {args}")

    cudnn.benchmark = True
    cudnn.enabled = True
    start_time = time.time()

    # dataset and dataloader
    logger.info('start loading data')
    train_dataset = CIFAR100(**Config.train_dataset_init)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              pin_memory=True)
    val_dataset = CIFAR100(**Config.val_dataset_init)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            num_workers=args.num_workers,
                            pin_memory=True)
    logger.info('finish loading data')

    logger.info(f"creating model")
    model = resnet50(**{
        "pretrained": args.pretrained,
        "num_classes": args.num_classes,
    })

    for name, param in model.named_parameters():
        logger.info(f"{name},{param.requires_grad}")

    model = model.cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.milestones, gamma=args.gamma)

    if args.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    model = nn.DataParallel(model)
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            raise Exception(
                f"{args.evaluate} is not a file, please check it again")
        logger.info('start only evaluating')
        logger.info(f"start resuming model from {args.evaluate}")
        checkpoint = torch.load(args.evaluate,
                                map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['model_state_dict'])
        acc1, acc5, throughput = validate(val_loader, model, args)
        logger.info(
            f"epoch {checkpoint['epoch']:0>3d}, top1 acc: {acc1:.2f}%, top5 acc: {acc5:.2f}%, throughput: {throughput:.2f}sample/s"
        )

        return
    best_acc = 0.0
    start_epoch = 1
    # resume training
    if os.path.exists(args.resume):
        logger.info(f"start resuming model from {args.resume}")
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        start_epoch += checkpoint['epoch']
        best_acc = checkpoint['best_acc']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        logger.info(
            f"finish resuming model from {args.resume}, epoch {checkpoint['epoch']}, "
            f"loss: {checkpoint['loss']:3f}, best_acc: {checkpoint['best_acc']:.2f}%, lr: {checkpoint['lr']:.6f}, "
            f"top1_acc: {checkpoint['acc1']}%")

    if not os.path.exists(args.checkpoints):
        os.makedirs(args.checkpoints)

    logger.info('start training')
    for epoch in range(start_epoch, args.epochs + 1):
        acc1, acc5, losses = train(train_loader, model, criterion, optimizer,
                                   scheduler, epoch, logger, args)
        logger.info(
            f"train: epoch {epoch:0>3d}, top1 acc: {acc1:.2f}%, top5 acc: {acc5:.2f}%, losses: {losses:.2f}"
        )

        acc1, acc5, throughput = validate(val_loader, model, args)
        logger.info(
            f"val: epoch {epoch:0>3d}, top1 acc: {acc1:.2f}%, top5 acc: {acc5:.2f}%, throughput: {throughput:.2f}sample/s"
        )

        if acc1 > best_acc:
            torch.save(model.module.state_dict(),
                       os.path.join(args.checkpoints, "best.pth"))
            best_acc = acc1

        # remember best prec@1 and save checkpoint
        torch.save(
            {
                'epoch': epoch,
                'best_acc': best_acc,
                'acc1': acc1,
                'loss': losses,
                'lr': scheduler.get_lr()[0],
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
            }, os.path.join(args.checkpoints, 'latest.pth'))

    logger.info(f"finish training, best acc: {best_acc:.2f}%")
    training_time = (time.time() - start_time) / 3600
    logger.info(
        f"finish training, total training time: {training_time:.2f} hours")


if __name__ == '__main__':
    args = parse_args()
    logger = get_logger(__name__, args.log)
    main(logger, args)
```
Because model performance fluctuates considerably from epoch to epoch when training on CIFAR100, we save a separate best model and report its results below; it can then be loaded directly for evaluation, as sketched after the table.
| Network | Top-1 error (%) |
| --- | --- |
| ResNet-18 | 21.59 |
| ResNet-34 | 21.16 |
| ResNet-50 | 22.12 |
| ResNet-101 | 19.84 |
| ResNet-152 | 19.01 |
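A minimal sketch of loading the saved best model for evaluation (it assumes the checkpoint path used by train.py above):

```python
import torch

from public.distillation.models.resnetforcifar import resnet50

model = resnet50(num_classes=100)
# best.pth was saved via model.module.state_dict(), so its keys carry no
# 'module.' prefix and load straight into a bare (non-DataParallel) model.
model.load_state_dict(
    torch.load('./checkpoints/best.pth', map_location=torch.device('cpu')))
model.eval()
```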