没有计算资源,只能简单测试一下 ResNet18
训练代码(带验证):
'''
TODO:
- 采用更深的网络(简单,但是需要计算资源)
- top3 accuracy,可参考https://github.com/pytorch/examples/blob/master/imagenet/main.py
'''
#pkill -9 python
#nvidia-smi
# Standard library
import copy
import json
import os
import time

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from PIL import Image
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
#plt.ion() # interactive mode
%matplotlib inline
with open('../ai_challenger_scene_train_20170904/scene_train_annotations_20170904.json', 'r') as f: #label文件
label_raw_train = json.load(f)
with open('../ai_challenger_scene_validation_20170908/scene_validation_annotations_20170908.json', 'r') as f: #label文件
label_raw_val = json.load(f)
label_raw_train[0]['label_id']
len(label_raw_train)
class SceneDataset(Dataset):
def __init__(self, json_labels, root_dir, transform=None):
"""
Args:
json_labesl (list):read from official json file.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.label_raw = json_labels
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.label_raw)
def __getitem__(self, idx):
img_name = os.path.join(self.root_dir, self.label_raw[idx]['image_id'])
image = Image.open(img_name)
label = int(self.label_raw[idx]['label_id'])
if self.transform:
image = self.transform(image)
return image, label
data_transforms = {
'train': transforms.Compose([
transforms.RandomSizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]),
'val': transforms.Compose([
transforms.Scale(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]),
}
transformed_dataset_train = SceneDataset(json_labels=label_raw_train,
root_dir='../ai_challenger_scene_train_20170904/scene_train_images_20170904',
transform=data_transforms['train']
)
transformed_dataset_val = SceneDataset(json_labels=label_raw_val,
root_dir='../ai_challenger_scene_validation_20170908/scene_validation_images_20170908',
transform=data_transforms['val']
)
batch_size = 64
dataloader = {'train':DataLoader(transformed_dataset_train, batch_size=batch_size,shuffle=True, num_workers=8),
'val':DataLoader(transformed_dataset_val, batch_size=batch_size,shuffle=True, num_workers=8)
}
dataset_sizes = {'train': len(label_raw_train), 'val':len(label_raw_val)}
use_gpu = torch.cuda.is_available()
#use_gpu = False
def imshow(inp, title=None):
"""Imshow for Tensor."""
inp = inp.numpy().transpose((1, 2, 0))
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
inp = std * inp + mean
plt.imshow(inp)
if title is not None:
plt.title(title)
plt.pause(0.001) # pause a bit so that plots are updated
# Get a batch of training data
inputs, classes = next(iter(dataloader['train']))
# Make a grid from batch
out = torchvision.utils.make_grid(inputs)
imshow(out)
######################################################################
# Training the model
# ------------------
#
# Now, let's write a general function to train a model. Here, we will
# illustrate:
#
# - Scheduling the learning rate
# - Saving the best model
#
# In the following, parameter ``scheduler`` is an LR scheduler object from
# ``torch.optim.lr_scheduler``.
def train_model (model, criterion, optimizer, scheduler, num_epochs, total_steps):
since = time.time()
print('total_steps is %d' % total_steps)
mystep = 0
best_model_wts = model.state_dict()
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
if (epoch%10 == 0):
torch.save(best_model_wts, ('resnet18_model_wts_%d.pth')% epoch)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
scheduler.step()
model.train(True) # Set model to training mode
else:
model.train(False) # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for data in dataloader[phase]:
# get the inputs
mystep = mystep + 1
if(mystep%100 ==0):
duration = time.time() - since
print('progress %d vs %d in %.0f' % (mystep, total_steps, duration))
inputs, labels = data
# wrap them in Variable
if use_gpu:
inputs = Variable(inputs.cuda())
labels = Variable(labels.cuda())
else:
inputs, labels = Variable(inputs), Variable(labels)
# zero the parameter gradients
optimizer.zero_grad()
# forward
outputs = model(inputs)
_, preds = torch.max(outputs.data, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.data[0]
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = model.state_dict()
print()
#if (epoch%10 == 0):
# torch.save(best_model_wts, ('models/best_model_wts_%d.pth')% epoch)
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
######################################################################
# ConvNet as fixed feature extractor
# ----------------------------------
#
# Here, we need to freeze all the network except the final layer. We need
# to set ``requires_grad == False`` to freeze the parameters so that the
# gradients are not computed in ``backward()``.
#
# You can read more about this in the documentation
# `here `__.
#
model_conv = torchvision.models.resnet18(pretrained=True)
for param in model_conv.parameters():
param.requires_grad = False
# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, 80)
if use_gpu:
model_conv = model_conv.cuda()
criterion = nn.CrossEntropyLoss()
# Observe that only parameters of final layer are being optimized as
# opoosed to before.
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)
######################################################################
# Train and evaluate
num_epochs = 90
total_steps = 1.0 * num_epochs * (len(label_raw_train) + len(label_raw_val)) / batch_size
print(total_steps)
model_conv = train_model(model_conv, criterion, optimizer_conv,
exp_lr_scheduler, num_epochs, total_steps)
torch.save(model_conv.state_dict(), 'resnet18_best_model_wts_final.pth')
运行结果:
total_steps is 85779
Epoch 0/89
----------
progress 100 vs 85779 in 83
progress 200 vs 85779 in 168
progress 300 vs 85779 in 254
progress 400 vs 85779 in 344
progress 500 vs 85779 in 435
progress 600 vs 85779 in 526
progress 700 vs 85779 in 616
progress 800 vs 85779 in 706
train Loss: 0.0501 Acc: 0.2997
progress 900 vs 85779 in 798
val Loss: 0.0346 Acc: 0.4992
Epoch 1/89
----------
progress 1000 vs 85779 in 888
progress 1100 vs 85779 in 977
progress 1200 vs 85779 in 1065
progress 1300 vs 85779 in 1155
progress 1400 vs 85779 in 1244
progress 1500 vs 85779 in 1335
progress 1600 vs 85779 in 1424
progress 1700 vs 85779 in 1513
train Loss: 0.0353 Acc: 0.4689
progress 1800 vs 85779 in 1610
progress 1900 vs 85779 in 1696
val Loss: 0.0274 Acc: 0.5663
Epoch 2/89
----------
progress 2000 vs 85779 in 1789
progress 2100 vs 85779 in 1879
progress 2200 vs 85779 in 1968
progress 2300 vs 85779 in 2058
progress 2400 vs 85779 in 2151
progress 2500 vs 85779 in 2241
progress 2600 vs 85779 in 2334
progress 2700 vs 85779 in 2426
train Loss: 0.0312 Acc: 0.5042
progress 2800 vs 85779 in 2519
val Loss: 0.0246 Acc: 0.5916
Epoch 3/89
----------
progress 2900 vs 85779 in 2614
progress 3000 vs 85779 in 2706
progress 3100 vs 85779 in 2795
progress 3200 vs 85779 in 2887
progress 3300 vs 85779 in 2984
progress 3400 vs 85779 in 3077
progress 3500 vs 85779 in 3176
progress 3600 vs 85779 in 3269
progress 3700 vs 85779 in 3361
train Loss: 0.0294 Acc: 0.5191
progress 3800 vs 85779 in 3458
val Loss: 0.0230 Acc: 0.6154
Epoch 4/89
----------
progress 3900 vs 85779 in 3552
progress 4000 vs 85779 in 3641
progress 4100 vs 85779 in 3730
progress 4200 vs 85779 in 3819
progress 4300 vs 85779 in 3909
progress 4400 vs 85779 in 4000
progress 4500 vs 85779 in 4093
progress 4600 vs 85779 in 4189
train Loss: 0.0282 Acc: 0.5344
progress 4700 vs 85779 in 4286
val Loss: 0.0220 Acc: 0.6251
Epoch 5/89
----------
progress 4800 vs 85779 in 4377
progress 4900 vs 85779 in 4466
progress 5000 vs 85779 in 4556
progress 5100 vs 85779 in 4646
progress 5200 vs 85779 in 4740
progress 5300 vs 85779 in 4832
progress 5400 vs 85779 in 4924
progress 5500 vs 85779 in 5017
progress 5600 vs 85779 in 5109
train Loss: 0.0274 Acc: 0.5428
progress 5700 vs 85779 in 5205
val Loss: 0.0211 Acc: 0.6371
Epoch 6/89
----------
progress 5800 vs 85779 in 5305
progress 5900 vs 85779 in 5397
progress 6000 vs 85779 in 5490
progress 6100 vs 85779 in 5582
progress 6200 vs 85779 in 5675
progress 6300 vs 85779 in 5765
progress 6400 vs 85779 in 5855
progress 6500 vs 85779 in 5945
train Loss: 0.0269 Acc: 0.5472
progress 6600 vs 85779 in 6042
val Loss: 0.0206 Acc: 0.6427
Epoch 7/89
----------
progress 6700 vs 85779 in 6135
progress 6800 vs 85779 in 6225
progress 6900 vs 85779 in 6318
progress 7000 vs 85779 in 6412
progress 7100 vs 85779 in 6504
progress 7200 vs 85779 in 6596
progress 7300 vs 85779 in 6688
progress 7400 vs 85779 in 6777
progress 7500 vs 85779 in 6867
train Loss: 0.0264 Acc: 0.5555
progress 7600 vs 85779 in 6959
val Loss: 0.0206 Acc: 0.6468
Epoch 8/89
----------
progress 7700 vs 85779 in 7051
progress 7800 vs 85779 in 7140
progress 7900 vs 85779 in 7230
progress 8000 vs 85779 in 7320
progress 8100 vs 85779 in 7409
progress 8200 vs 85779 in 7499
progress 8300 vs 85779 in 7589
progress 8400 vs 85779 in 7678
train Loss: 0.0264 Acc: 0.5574
progress 8500 vs 85779 in 7772
val Loss: 0.0205 Acc: 0.6492
Epoch 9/89
----------
progress 8600 vs 85779 in 7863
progress 8700 vs 85779 in 7952
progress 8800 vs 85779 in 8042
progress 8900 vs 85779 in 8132
progress 9000 vs 85779 in 8221
progress 9100 vs 85779 in 8311
progress 9200 vs 85779 in 8401
progress 9300 vs 85779 in 8490
progress 9400 vs 85779 in 8580
train Loss: 0.0264 Acc: 0.5589
progress 9500 vs 85779 in 8672
val Loss: 0.0204 Acc: 0.6469
Epoch 10/89
----------
progress 9600 vs 85779 in 8764
progress 9700 vs 85779 in 8853
progress 9800 vs 85779 in 8943
progress 9900 vs 85779 in 9033
progress 10000 vs 85779 in 9122
progress 10100 vs 85779 in 9212
progress 10200 vs 85779 in 9302
progress 10300 vs 85779 in 9391
train Loss: 0.0264 Acc: 0.5575
progress 10400 vs 85779 in 9486
val Loss: 0.0205 Acc: 0.6466
Epoch 11/89
----------
progress 10500 vs 85779 in 9576
progress 10600 vs 85779 in 9665
progress 10700 vs 85779 in 9755
progress 10800 vs 85779 in 9845
progress 10900 vs 85779 in 9934
progress 11000 vs 85779 in 10024
progress 11100 vs 85779 in 10114
progress 11200 vs 85779 in 10204
progress 11300 vs 85779 in 10293
train Loss: 0.0262 Acc: 0.5595
OSErrorTraceback (most recent call last)
8-d3668bd74124> in <module>()
45 print(total_steps)
46 model_conv = train_model(model_conv, criterion, optimizer_conv,
---> 47 exp_lr_scheduler, num_epochs, total_steps)
48 torch.save(model_conv.state_dict(), 'resnet18_best_model_wts_final.pth')
7-7cfe8ab63cef> in train_model(model, criterion, optimizer, scheduler, num_epochs, total_steps)
42
43 # Iterate over data.
---> 44 for data in dataloader[phase]:
45 # get the inputs
46 mystep = mystep + 1
/home/wayne/anaconda3/lib/python3.5/site-packages/torch/utils/data/dataloader.py in __iter__(self)
299
300 def __iter__(self):
--> 301 return DataLoaderIter(self)
302
303 def __len__(self):
/home/wayne/anaconda3/lib/python3.5/site-packages/torch/utils/data/dataloader.py in __init__(self, loader)
156 for w in self.workers:
157 w.daemon = True # ensure that the worker exits on process exit
--> 158 w.start()
159
160 if self.pin_memory:
/home/wayne/anaconda3/lib/python3.5/multiprocessing/process.py in start(self)
103 'daemonic processes are not allowed to have children'
104 _cleanup()
--> 105 self._popen = self._Popen(self)
106 self._sentinel = self._popen.sentinel
107 _children.add(self)
/home/wayne/anaconda3/lib/python3.5/multiprocessing/context.py in _Popen(process_obj)
210 @staticmethod
211 def _Popen(process_obj):
--> 212 return _default_context.get_context().Process._Popen(process_obj)
213
214 class DefaultContext(BaseContext):
/home/wayne/anaconda3/lib/python3.5/multiprocessing/context.py in _Popen(process_obj)
265 def _Popen(process_obj):
266 from .popen_fork import Popen
--> 267 return Popen(process_obj)
268
269 class SpawnProcess(process.BaseProcess):
/home/wayne/anaconda3/lib/python3.5/multiprocessing/popen_fork.py in __init__(self, process_obj)
18 sys.stderr.flush()
19 self.returncode = None
---> 20 self._launch(process_obj)
21
22 def duplicate_for_child(self, fd):
/home/wayne/anaconda3/lib/python3.5/multiprocessing/popen_fork.py in _launch(self, process_obj)
65 code = 1
66 parent_r, child_w = os.pipe()
---> 67 self.pid = os.fork()
68 if self.pid == 0:
69 try:
OSError: [Errno 12] Cannot allocate memory