AI Challenger Scene Classification: PyTorch Transfer Learning with resnet18

With no real compute budget, I could only run a quick test with resnet18.

Training code (with validation):

'''
TODO:
- use a deeper network (straightforward, but needs compute)
- top-3 accuracy; see https://github.com/pytorch/examples/blob/master/imagenet/main.py
'''
#pkill -9 python
#nvidia-smi
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import pandas as pd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms, utils
import time
import json
#plt.ion()   # interactive mode
%matplotlib inline

with open('../ai_challenger_scene_train_20170904/scene_train_annotations_20170904.json', 'r') as f:  # label file
    label_raw_train = json.load(f)
with open('../ai_challenger_scene_validation_20170908/scene_validation_annotations_20170908.json', 'r') as f:  # label file
    label_raw_val = json.load(f)

label_raw_train[0]['label_id']
len(label_raw_train)
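
# For reference, each entry in these annotation lists is a dict with (at
# least) the two fields used below; the values here are illustrative only:
#   {'image_id': 'xxxxxxxx.jpg', 'label_id': '3'}
# label_id comes out of the json as a string, hence the int() cast in
# SceneDataset.__getitem__.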

class SceneDataset(Dataset):

    def __init__(self, json_labels, root_dir, transform=None):
        """
        Args:
            json_labesl (list):read from official json file.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.label_raw = json_labels
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.label_raw)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir, self.label_raw[idx]['image_id'])
        # convert('RGB') guards against the occasional grayscale image,
        # which would otherwise break the 3-channel Normalize below
        image = Image.open(img_name).convert('RGB')
        label = int(self.label_raw[idx]['label_id'])

        if self.transform:
            image = self.transform(image)

        return image, label

# Note: RandomSizedCrop and Scale are the old (since-deprecated) torchvision
# names for RandomResizedCrop and Resize; this code targets the 2017-era API.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomSizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Scale(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}
transformed_dataset_train = SceneDataset(json_labels=label_raw_train,
                                         root_dir='../ai_challenger_scene_train_20170904/scene_train_images_20170904',
                                         transform=data_transforms['train'])
transformed_dataset_val = SceneDataset(json_labels=label_raw_val,
                                       root_dir='../ai_challenger_scene_validation_20170908/scene_validation_images_20170908',
                                       transform=data_transforms['val'])
batch_size = 64
dataloader = {'train': DataLoader(transformed_dataset_train, batch_size=batch_size, shuffle=True, num_workers=8),
              'val': DataLoader(transformed_dataset_val, batch_size=batch_size, shuffle=True, num_workers=8)
              }
dataset_sizes = {'train': len(label_raw_train), 'val':len(label_raw_val)}
use_gpu = torch.cuda.is_available()
#use_gpu = False

def imshow(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))  # CHW -> HWC for matplotlib
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean                  # undo the Normalize transform
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated


# Get a batch of training data
inputs, classes = next(iter(dataloader['train']))

# Make a grid from batch
out = torchvision.utils.make_grid(inputs)

imshow(out)


######################################################################
# Training the model
# ------------------
#
# Now, let's write a general function to train a model. Here, we will
# illustrate:
#
# -  Scheduling the learning rate
# -  Saving the best model
#
# In the following, parameter ``scheduler`` is an LR scheduler object from
# ``torch.optim.lr_scheduler``.


def train_model(model, criterion, optimizer, scheduler, num_epochs, total_steps):
    since = time.time()

    print('total_steps is %d' % total_steps)
    mystep = 0

    best_model_wts = model.state_dict()
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)


        # periodically checkpoint the best weights seen so far
        if epoch % 10 == 0:
            torch.save(best_model_wts, 'resnet18_model_wts_%d.pth' % epoch)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for data in dataloader[phase]:
                # get the inputs
                mystep = mystep + 1
                if mystep % 100 == 0:
                    duration = time.time() - since
                    print('progress %d vs %d in %.0f' % (mystep, total_steps, duration))

                inputs, labels = data

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics (loss.data[0] is the pre-0.4 way to read a scalar loss)
                running_loss += loss.data[0]
                running_corrects += torch.sum(preds == labels.data)

            # running_loss sums per-batch *mean* losses, so dividing by the
            # dataset size (rather than the batch count) yields the small
            # loss values seen in the log below
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = model.state_dict()

        print()

        #if (epoch%10 == 0):
           # torch.save(best_model_wts, ('models/best_model_wts_%d.pth')% epoch)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

######################################################################
# ConvNet as fixed feature extractor
# ----------------------------------
#
# Here, we need to freeze all the network except the final layer. We need
# to set ``requires_grad == False`` to freeze the parameters so that the
# gradients are not computed in ``backward()``.
#
# You can read more about this in the autograd notes of the PyTorch
# documentation.
#

model_conv = torchvision.models.resnet18(pretrained=True)
for param in model_conv.parameters():
    param.requires_grad = False

# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, 80)

if use_gpu:
    model_conv = model_conv.cuda()

criterion = nn.CrossEntropyLoss()

# Observe that only parameters of final layer are being optimized as
# opposed to before.
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)


######################################################################
# Train and evaluate

num_epochs = 90
# rough estimate of the total number of batches across both phases
total_steps = 1.0 * num_epochs * (len(label_raw_train) + len(label_raw_val)) / batch_size
print(total_steps)
model_conv = train_model(model_conv, criterion, optimizer_conv,
                         exp_lr_scheduler, num_epochs, total_steps)
torch.save(model_conv.state_dict(), 'resnet18_best_model_wts_final.pth')
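
The TODO at the top asks for top-3 accuracy, which is the competition metric. A minimal sketch of such a helper, adapted in spirit from the accuracy function in the linked imagenet example and written against the pre-0.4 API used here (topk_correct is a name introduced for illustration):

def topk_correct(output, target, k=3):
    """Count samples whose true label is among the k largest logits.

    output: (N, num_classes) score tensor; target: (N,) label tensor.
    """
    _, pred = output.topk(k, 1, True, True)             # (N, k) class indices
    hits = pred.eq(target.view(-1, 1).expand_as(pred))  # (N, k) byte mask
    return float(hits.float().sum())

Inside the epoch loop this would accumulate alongside running_corrects, e.g. running_top3 += topk_correct(outputs.data, labels.data), and be divided by dataset_sizes[phase] at the end of each phase.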

Output:

total_steps is 85779
Epoch 0/89
----------
progress 100 vs 85779 in 83
progress 200 vs 85779 in 168
progress 300 vs 85779 in 254
progress 400 vs 85779 in 344
progress 500 vs 85779 in 435
progress 600 vs 85779 in 526
progress 700 vs 85779 in 616
progress 800 vs 85779 in 706
train Loss: 0.0501 Acc: 0.2997
progress 900 vs 85779 in 798
val Loss: 0.0346 Acc: 0.4992

Epoch 1/89
----------
progress 1000 vs 85779 in 888
progress 1100 vs 85779 in 977
progress 1200 vs 85779 in 1065
progress 1300 vs 85779 in 1155
progress 1400 vs 85779 in 1244
progress 1500 vs 85779 in 1335
progress 1600 vs 85779 in 1424
progress 1700 vs 85779 in 1513
train Loss: 0.0353 Acc: 0.4689
progress 1800 vs 85779 in 1610
progress 1900 vs 85779 in 1696
val Loss: 0.0274 Acc: 0.5663

Epoch 2/89
----------
progress 2000 vs 85779 in 1789
progress 2100 vs 85779 in 1879
progress 2200 vs 85779 in 1968
progress 2300 vs 85779 in 2058
progress 2400 vs 85779 in 2151
progress 2500 vs 85779 in 2241
progress 2600 vs 85779 in 2334
progress 2700 vs 85779 in 2426
train Loss: 0.0312 Acc: 0.5042
progress 2800 vs 85779 in 2519
val Loss: 0.0246 Acc: 0.5916

Epoch 3/89
----------
progress 2900 vs 85779 in 2614
progress 3000 vs 85779 in 2706
progress 3100 vs 85779 in 2795
progress 3200 vs 85779 in 2887
progress 3300 vs 85779 in 2984
progress 3400 vs 85779 in 3077
progress 3500 vs 85779 in 3176
progress 3600 vs 85779 in 3269
progress 3700 vs 85779 in 3361
train Loss: 0.0294 Acc: 0.5191
progress 3800 vs 85779 in 3458
val Loss: 0.0230 Acc: 0.6154

Epoch 4/89
----------
progress 3900 vs 85779 in 3552
progress 4000 vs 85779 in 3641
progress 4100 vs 85779 in 3730
progress 4200 vs 85779 in 3819
progress 4300 vs 85779 in 3909
progress 4400 vs 85779 in 4000
progress 4500 vs 85779 in 4093
progress 4600 vs 85779 in 4189
train Loss: 0.0282 Acc: 0.5344
progress 4700 vs 85779 in 4286
val Loss: 0.0220 Acc: 0.6251

Epoch 5/89
----------
progress 4800 vs 85779 in 4377
progress 4900 vs 85779 in 4466
progress 5000 vs 85779 in 4556
progress 5100 vs 85779 in 4646
progress 5200 vs 85779 in 4740
progress 5300 vs 85779 in 4832
progress 5400 vs 85779 in 4924
progress 5500 vs 85779 in 5017
progress 5600 vs 85779 in 5109
train Loss: 0.0274 Acc: 0.5428
progress 5700 vs 85779 in 5205
val Loss: 0.0211 Acc: 0.6371

Epoch 6/89
----------
progress 5800 vs 85779 in 5305
progress 5900 vs 85779 in 5397
progress 6000 vs 85779 in 5490
progress 6100 vs 85779 in 5582
progress 6200 vs 85779 in 5675
progress 6300 vs 85779 in 5765
progress 6400 vs 85779 in 5855
progress 6500 vs 85779 in 5945
train Loss: 0.0269 Acc: 0.5472
progress 6600 vs 85779 in 6042
val Loss: 0.0206 Acc: 0.6427

Epoch 7/89
----------
progress 6700 vs 85779 in 6135
progress 6800 vs 85779 in 6225
progress 6900 vs 85779 in 6318
progress 7000 vs 85779 in 6412
progress 7100 vs 85779 in 6504
progress 7200 vs 85779 in 6596
progress 7300 vs 85779 in 6688
progress 7400 vs 85779 in 6777
progress 7500 vs 85779 in 6867
train Loss: 0.0264 Acc: 0.5555
progress 7600 vs 85779 in 6959
val Loss: 0.0206 Acc: 0.6468

Epoch 8/89
----------
progress 7700 vs 85779 in 7051
progress 7800 vs 85779 in 7140
progress 7900 vs 85779 in 7230
progress 8000 vs 85779 in 7320
progress 8100 vs 85779 in 7409
progress 8200 vs 85779 in 7499
progress 8300 vs 85779 in 7589
progress 8400 vs 85779 in 7678
train Loss: 0.0264 Acc: 0.5574
progress 8500 vs 85779 in 7772
val Loss: 0.0205 Acc: 0.6492

Epoch 9/89
----------
progress 8600 vs 85779 in 7863
progress 8700 vs 85779 in 7952
progress 8800 vs 85779 in 8042
progress 8900 vs 85779 in 8132
progress 9000 vs 85779 in 8221
progress 9100 vs 85779 in 8311
progress 9200 vs 85779 in 8401
progress 9300 vs 85779 in 8490
progress 9400 vs 85779 in 8580
train Loss: 0.0264 Acc: 0.5589
progress 9500 vs 85779 in 8672
val Loss: 0.0204 Acc: 0.6469

Epoch 10/89
----------
progress 9600 vs 85779 in 8764
progress 9700 vs 85779 in 8853
progress 9800 vs 85779 in 8943
progress 9900 vs 85779 in 9033
progress 10000 vs 85779 in 9122
progress 10100 vs 85779 in 9212
progress 10200 vs 85779 in 9302
progress 10300 vs 85779 in 9391
train Loss: 0.0264 Acc: 0.5575
progress 10400 vs 85779 in 9486
val Loss: 0.0205 Acc: 0.6466

Epoch 11/89
----------
progress 10500 vs 85779 in 9576
progress 10600 vs 85779 in 9665
progress 10700 vs 85779 in 9755
progress 10800 vs 85779 in 9845
progress 10900 vs 85779 in 9934
progress 11000 vs 85779 in 10024
progress 11100 vs 85779 in 10114
progress 11200 vs 85779 in 10204
progress 11300 vs 85779 in 10293
train Loss: 0.0262 Acc: 0.5595

OSError                                   Traceback (most recent call last)
<ipython-input-8-d3668bd74124> in <module>()
     45 print(total_steps)
     46 model_conv = train_model(model_conv, criterion, optimizer_conv,
---> 47                          exp_lr_scheduler, num_epochs, total_steps)
     48 torch.save(model_conv.state_dict(), 'resnet18_best_model_wts_final.pth')

<ipython-input-7-7cfe8ab63cef> in train_model(model, criterion, optimizer, scheduler, num_epochs, total_steps)
     42 
     43             # Iterate over data.
---> 44             for data in dataloader[phase]:
     45                 # get the inputs
     46                 mystep = mystep + 1

/home/wayne/anaconda3/lib/python3.5/site-packages/torch/utils/data/dataloader.py in __iter__(self)
    299 
    300     def __iter__(self):
--> 301         return DataLoaderIter(self)
    302 
    303     def __len__(self):

/home/wayne/anaconda3/lib/python3.5/site-packages/torch/utils/data/dataloader.py in __init__(self, loader)
    156             for w in self.workers:
    157                 w.daemon = True  # ensure that the worker exits on process exit
--> 158                 w.start()
    159 
    160             if self.pin_memory:

/home/wayne/anaconda3/lib/python3.5/multiprocessing/process.py in start(self)
    103                'daemonic processes are not allowed to have children'
    104         _cleanup()
--> 105         self._popen = self._Popen(self)
    106         self._sentinel = self._popen.sentinel
    107         _children.add(self)

/home/wayne/anaconda3/lib/python3.5/multiprocessing/context.py in _Popen(process_obj)
    210     @staticmethod
    211     def _Popen(process_obj):
--> 212         return _default_context.get_context().Process._Popen(process_obj)
    213 
    214 class DefaultContext(BaseContext):

/home/wayne/anaconda3/lib/python3.5/multiprocessing/context.py in _Popen(process_obj)
    265         def _Popen(process_obj):
    266             from .popen_fork import Popen
--> 267             return Popen(process_obj)
    268 
    269     class SpawnProcess(process.BaseProcess):

/home/wayne/anaconda3/lib/python3.5/multiprocessing/popen_fork.py in __init__(self, process_obj)
     18         sys.stderr.flush()
     19         self.returncode = None
---> 20         self._launch(process_obj)
     21 
     22     def duplicate_for_child(self, fd):

/home/wayne/anaconda3/lib/python3.5/multiprocessing/popen_fork.py in _launch(self, process_obj)
     65         code = 1
     66         parent_r, child_w = os.pipe()
---> 67         self.pid = os.fork()
     68         if self.pid == 0:
     69             try:

OSError: [Errno 12] Cannot allocate memory
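
The run died in os.fork(): as the traceback shows, each pass over a DataLoader spawns num_workers=8 fresh worker processes, and with the parent process holding the model and CUDA context, the fork could no longer be satisfied once memory ran low. The usual workaround is to lower num_workers (and/or batch_size), or to add swap. A minimal sketch, with num_workers=2 as an assumed value to be tuned to the machine:

dataloader = {'train': DataLoader(transformed_dataset_train, batch_size=batch_size, shuffle=True, num_workers=2),
              'val': DataLoader(transformed_dataset_val, batch_size=batch_size, shuffle=True, num_workers=2)
              }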
