PyTorch YOLOv3 多 GPU 训练

PyTorch 多 GPU 训练——模型定义代码:

# -*- coding:utf-8 -*-
from __future__ import division

import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from PIL import Image

from utils.parse_config import *
from utils.utils import build_targets
from collections import defaultdict

def create_modules(module_defs):
    """
    Build the network as an nn.ModuleList from parsed cfg blocks.

    Args:
        module_defs: list of dict-like cfg blocks. The first entry holds the
            network hyper-parameters ('channels' is always read, 'height' is
            read when a 'yolo' block is present). It is popped from the list,
            so the caller's list is mutated and afterwards contains exactly
            one entry per created module. A 'yolo' block must provide
            'anchors' as an indexable sequence of (width, height) pairs.

    Returns:
        (hyperparams, module_list): the popped hyper-parameter block and an
        nn.ModuleList holding one nn.Sequential per remaining block.
    """
    hyperparams = module_defs.pop(0)
    # output_filters[0] is the input channel count; output_filters[k + 1] is
    # the number of channels produced by layer k.
    output_filters = [int(hyperparams['channels'])]
    module_list = nn.ModuleList()
    for i, module_def in enumerate(module_defs):
        modules = nn.Sequential()
        layer_type = module_def['type']
        # Layers that do not change the channel count (upsample, yolo, ...)
        # inherit it from the previous layer.
        filters = output_filters[-1]
        # Channel counts indexed exactly like layer outputs in the forward
        # pass: entry k belongs to layer k, negative indices are relative to
        # the current layer. Indexing output_filters directly would be off
        # by one for non-negative indices because of the leading input entry.
        layer_filters = output_filters[1:]

        if layer_type == 'convolutional':
            bn = int(module_def['batch_normalize'])
            filters = int(module_def['filters'])
            kernel_size = int(module_def['size'])
            pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0
            modules.add_module('conv_%d' % i, nn.Conv2d(in_channels=output_filters[-1],
                                                        out_channels=filters,
                                                        kernel_size=kernel_size,
                                                        stride=int(module_def['stride']),
                                                        padding=pad,
                                                        # Batch norm has its own shift term
                                                        bias=not bn))
            if bn:
                modules.add_module('batch_norm_%d' % i, nn.BatchNorm2d(filters))
            if module_def['activation'] == 'leaky':
                modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1))

        elif layer_type == 'upsample':
            upsample = nn.Upsample(scale_factor=int(module_def['stride']),
                                   mode='nearest')
            modules.add_module('upsample_%d' % i, upsample)

        elif layer_type == 'route':
            # Concatenation along channels: output width is the sum of inputs.
            layers = [int(x) for x in module_def["layers"].split(',')]
            filters = sum(layer_filters[layer_i] for layer_i in layers)
            modules.add_module('route_%d' % i, EmptyLayer())

        elif layer_type == 'shortcut':
            filters = layer_filters[int(module_def['from'])]
            modules.add_module("shortcut_%d" % i, EmptyLayer())

        elif layer_type == "yolo":
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            # Keep only the anchors selected by this layer's mask
            all_anchors = module_def["anchors"]
            anchors = [all_anchors[idx] for idx in anchor_idxs]
            num_classes = int(module_def['classes'])
            img_height = int(hyperparams['height'])
            yolo_layer = YOLOLayer(anchors, num_classes, img_height)
            modules.add_module('yolo_%d' % i, yolo_layer)

        # Register the block and remember how many channels it outputs
        module_list.append(modules)
        output_filters.append(filters)

    return hyperparams, module_list

class EmptyLayer(nn.Module):
    """Parameter-free stand-in registered for 'route' and 'shortcut' blocks.

    It only occupies a slot in the module list; it defines no forward logic
    and holds no parameters or sub-modules of its own.
    """

    def __init__(self):
        super().__init__()

class YOLOLayer(nn.Module):
    """YOLO detection head: decodes raw feature maps into boxes, or a loss.

    Args:
        anchors: sequence of (width, height) anchor pairs used by this head.
        num_classes: number of object classes.
        image_dim: input image size used to derive the grid stride.
    """

    def __init__(self, anchors, num_classes, image_dim):
        super().__init__()
        self.anchors = anchors
        self.scaled_anchors = None
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        # 4 box coordinates + 1 objectness score + one score per class
        self.bbox_attrs = 5 + num_classes
        self.image_dim = image_dim
        self.ignore_thres = 0.5
        self.coord_scale = 1
        self.noobject_scale = 1
        self.object_scale = 5
        self.class_scale = 1
        self.seen = 0

        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()
        # self.bce_logits_loss = nn.BCEWithLogitsLoss()

    def forward(self, x, targets=None):
        """Decode the feature map and either compute the loss or detections.

        Args:
            x: tensor of shape (batch, num_anchors * bbox_attrs, g, g).
            targets: ground-truth tensor forwarded to build_targets, or None
                for inference.

        Returns:
            With targets: a tuple (loss, loss_x, loss_y, loss_w, loss_h,
            loss_conf, loss_cls, recall) where loss is a tensor and the rest
            are Python numbers.
            Without targets: a detached tensor of shape
            (batch, num_anchors * g * g, 5 + num_classes) whose box
            coordinates are multiplied by the grid stride.
        """
        batch = x.size(0)
        grid = x.size(2)
        stride = self.image_dim / grid
        on_gpu = x.is_cuda
        # Legacy tensor types so helper tensors land next to the input
        FloatTensor = torch.cuda.FloatTensor if on_gpu else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if on_gpu else torch.LongTensor

        # (batch, anchors, g, g, attrs): one attribute vector per anchor/cell
        prediction = x.view(batch, self.num_anchors, self.bbox_attrs, grid, grid)
        prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()

        center_x = torch.sigmoid(prediction[..., 0])
        center_y = torch.sigmoid(prediction[..., 1])
        w = prediction[..., 2]
        h = prediction[..., 3]
        conf = torch.sigmoid(prediction[..., 4])
        pred_cls = torch.sigmoid(prediction[..., 5:])

        # Per-cell column / row offsets, broadcast to every anchor and image
        cell_index = torch.linspace(0, grid - 1, grid).repeat(grid, 1)
        copies = batch * self.num_anchors
        grid_x = cell_index.repeat(copies, 1, 1).view(center_x.shape).type(FloatTensor)
        grid_y = cell_index.t().repeat(copies, 1, 1).view(center_y.shape).type(FloatTensor)

        # Anchors expressed in grid cells instead of pixels
        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
        anchor_table = FloatTensor(scaled_anchors)
        cells = grid * grid
        anchor_w = anchor_table.index_select(1, LongTensor([0]))
        anchor_h = anchor_table.index_select(1, LongTensor([1]))
        anchor_w = anchor_w.repeat(batch, 1).repeat(1, 1, cells).view(w.shape)
        anchor_h = anchor_h.repeat(batch, 1).repeat(1, 1, cells).view(h.shape)

        # Decoded boxes are built from .data, i.e. outside the autograd graph
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = center_x.data + grid_x
        pred_boxes[..., 1] = center_y.data + grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * anchor_h

        self.seen += prediction.size(0)

        if targets is None:
            # Inference: boxes in pixels followed by objectness and class scores
            detections = torch.cat(
                (
                    pred_boxes.view(batch, -1, 4) * stride,
                    conf.view(batch, -1, 1),
                    pred_cls.view(batch, -1, self.num_classes),
                ),
                -1,
            )
            return detections.data

        if on_gpu:
            self.mse_loss = self.mse_loss.cuda()
            self.bce_loss = self.bce_loss.cuda()

        built = build_targets(pred_boxes.cpu().data,
                              targets.cpu().data,
                              scaled_anchors,
                              self.num_anchors,
                              self.num_classes,
                              grid,
                              self.ignore_thres)
        nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = built

        # nProposals = int((conf > 0.25).sum().item())
        recall = float(nCorrect / nGT) if nGT else 1

        class_sel = cls_mask == 1
        tx = Variable(tx.type(FloatTensor), requires_grad=False)
        ty = Variable(ty.type(FloatTensor), requires_grad=False)
        tw = Variable(tw.type(FloatTensor), requires_grad=False)
        th = Variable(th.type(FloatTensor), requires_grad=False)
        tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
        tcls = Variable(tcls[class_sel].type(FloatTensor), requires_grad=False)
        coord_mask = Variable(coord_mask.type(FloatTensor), requires_grad=False)
        conf_mask = Variable(conf_mask.type(FloatTensor), requires_grad=False)

        coord_sel = coord_mask == 1
        conf_sel = conf_mask == 1

        loss_x = self.coord_scale * self.mse_loss(center_x[coord_sel], tx[coord_sel]) / 2
        loss_y = self.coord_scale * self.mse_loss(center_y[coord_sel], ty[coord_sel]) / 2
        loss_w = self.coord_scale * self.mse_loss(w[coord_sel], tw[coord_sel]) / 2
        loss_h = self.coord_scale * self.mse_loss(h[coord_sel], th[coord_sel]) / 2
        loss_conf = self.bce_loss(conf[conf_sel], tconf[conf_sel])
        loss_cls = self.class_scale * self.bce_loss(pred_cls[class_sel], tcls)
        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

        return (loss, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(),
                loss_conf.item(), loss_cls.item(), recall)


class Darknet(nn.Module):
    """YOLOv3 object detection model assembled from parsed cfg blocks."""

    def __init__(self, module_defs, img_size=416):
        """
        Args:
            module_defs: list of cfg blocks; the first entry holds the
                network hyper-parameters.
            img_size: stored as self.img_size.
        """
        super(Darknet, self).__init__()
        self.module_defs = module_defs
        # create_modules pops the hyper-parameter block off the list it is
        # given, so afterwards self.module_defs and self.module_list line up
        # one-to-one.
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        self.img_size = img_size
        self.loss_names = ['x', 'y', 'w', 'h', 'conf', 'cls', 'recall']
        self.losses = defaultdict(float)
        # Defaults so save_weights also works when load_weights was never
        # called. The five int32 header values mirror what load_weights
        # reads; index 3 is the number of images seen.
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None):
        """Run the network.

        Args:
            x: input image batch.
            targets: ground truth passed to every yolo layer, or None for
                inference.

        Returns:
            Training (targets given): the summed loss of all yolo layers as
            a flattened tensor; the per-component values are accumulated in
            self.losses.
            Inference: the detections of all yolo layers concatenated along
            dimension 1.
        """
        is_training = targets is not None
        output = []
        for name in self.loss_names:
            self.losses[name] = 0
        layer_outputs = []
        num_yolo_layers = 0
        for module_def, module in zip(self.module_defs, self.module_list):
            layer_type = module_def['type']
            if layer_type in ('convolutional', 'upsample'):
                x = module(x)
            elif layer_type == 'route':
                # Concatenate earlier feature maps along the channel axis
                indices = [int(v) for v in module_def['layers'].split(',')]
                x = torch.cat([layer_outputs[idx] for idx in indices], 1)
            elif layer_type == 'shortcut':
                # Residual connection with an earlier layer
                x = layer_outputs[-1] + layer_outputs[int(module_def['from'])]
            elif layer_type == 'yolo':
                num_yolo_layers += 1
                if is_training:
                    # Call the YOLOLayer directly: nn.Sequential would not
                    # forward the extra targets argument.
                    x, *losses = module[0](x, targets)
                    for name, loss in zip(self.loss_names, losses):
                        self.losses[name] += loss
                else:
                    x = module(x)
                output.append(x)
            layer_outputs.append(x)

        # Average the recall over the detection layers actually present
        # instead of assuming there are exactly three of them.
        if num_yolo_layers:
            self.losses['recall'] /= num_yolo_layers

        if is_training:
            return sum(output).view(-1)
        return torch.cat(output, 1)

    @staticmethod
    def _copy_from_flat(weights, ptr, target):
        """Copy target.numel() values from weights[ptr:] into target.

        Returns the read position advanced past the consumed values.
        """
        count = target.numel()
        chunk = torch.from_numpy(weights[ptr:ptr + count]).view_as(target)
        target.data.copy_(chunk)
        return ptr + count

    def load_weights(self, weights_path, is_training=False):
        """Parse the weights file at weights_path and load it into the model.

        The file starts with five int32 header values followed by float32
        weights. For every convolutional block the values are consumed in
        this order: batch-norm bias, weight, running mean and running
        variance (or the convolution bias when there is no batch norm),
        then the convolution weights.

        Args:
            weights_path: path of the weights file.
            is_training: accepted for backward compatibility; not used.
        """
        with open(weights_path, "rb") as fp:
            header = np.fromfile(fp, dtype=np.int32, count=5)
            weights = np.fromfile(fp, dtype=np.float32)

        # Kept so save_weights can write the same header back
        self.header_info = header
        self.seen = header[3]

        ptr = 0
        for module_def, module in zip(self.module_defs, self.module_list):
            if module_def['type'] != 'convolutional':
                continue
            conv_layer = module[0]
            # int() so that the cfg string '0' counts as "no batch norm",
            # consistent with create_modules.
            if int(module_def['batch_normalize']):
                bn_layer = module[1]
                for target in (bn_layer.bias, bn_layer.weight,
                               bn_layer.running_mean, bn_layer.running_var):
                    ptr = self._copy_from_flat(weights, ptr, target)
            else:
                ptr = self._copy_from_flat(weights, ptr, conv_layer.bias)
            ptr = self._copy_from_flat(weights, ptr, conv_layer.weight)

    def save_weights(self, path, cutoff=-1):
        """Write the model weights to path in the layout load_weights reads.

        Args:
            path: path of the new weights file.
            cutoff: save layers between 0 and cutoff; -1 saves all layers.
                Any other value is used as a plain slice end.
        """
        # A literal [:-1] slice would silently drop the last layer.
        stop = None if cutoff == -1 else cutoff
        self.header_info[3] = self.seen

        with open(path, 'wb') as fp:
            self.header_info.tofile(fp)
            for module_def, module in zip(self.module_defs[:stop], self.module_list[:stop]):
                if module_def['type'] != 'convolutional':
                    continue
                conv_layer = module[0]
                if int(module_def['batch_normalize']):
                    # Batch-norm parameters come first
                    bn_layer = module[1]
                    bn_layer.bias.data.cpu().numpy().tofile(fp)
                    bn_layer.weight.data.cpu().numpy().tofile(fp)
                    bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                    bn_layer.running_var.data.cpu().numpy().tofile(fp)
                else:
                    conv_layer.bias.data.cpu().numpy().tofile(fp)
                conv_layer.weight.data.cpu().numpy().tofile(fp)


训练代码中与多 GPU 相关的关键语句:

optimizer.module.zero_grad()

model.module.save_weights

loss = model(imgs, targets)

torch.sum(loss).backward()

optimizer.module.step()


# Training loop excerpt. It relies on names created elsewhere in the training
# script (opt, dataloader, model, optimizer, Tensor, total_loss,
# last_total_loss, adjust_learning_rate).
for epoch in range(opt.epochs):
    for batch_i, (_, imgs, targets) in enumerate(dataloader):
        imgs = Variable(imgs.type(Tensor))
        targets = Variable(targets.type(Tensor), requires_grad=False)
        # NOTE(review): the optimizer is reached through `.module`, i.e. it is
        # expected to be wrapped in an object exposing the real optimizer as
        # `.module`; a plain torch.optim optimizer offers zero_grad()/step()
        # directly.
        optimizer.module.zero_grad()
        loss = model(imgs, targets)
        # loss.backward()
        # optimizer.step()
        # The loss is reduced with torch.sum before backpropagation, so it
        # may hold more than one element (presumably one per GPU replica --
        # confirm against the model wrapper).
        torch.sum(loss).backward()
        optimizer.module.step()
        now = datetime.datetime.now()
        strftime = now.strftime("%H:%M:%S")
        print(strftime, epoch, opt.epochs, batch_i, len(dataloader), loss)
        # Every 40th batch: compare the loss accumulated since the last reset
        # with the previously recorded sum; decay the learning rate when it
        # grew by more than 1%, otherwise record it as the new reference.
        if batch_i % 40 == 39:
            if last_total_loss > 0 and total_loss > last_total_loss * 1.01:
                print("total_loss", total_loss)
                adjust_learning_rate(optimizer)
            else:
                print("total_loss", total_loss, last_total_loss)
                last_total_loss = total_loss
            # Start a new accumulation window with the current batch
            total_loss = torch.sum(loss)
        elif batch_i == 0:
            total_loss = torch.sum(loss)
        else:
            # NOTE(review): this accumulates tensors, not Python floats
            total_loss += torch.sum(loss)

        # if epoch > 0 and batch_i == 0:
        #     if torch.sum(loss) > mean_loss / batch_size :
        #         print("mean_loss", mean_loss)
        #         adjust_learning_rate(optimizer)
        #     mean_loss = torch.sum(loss)
        # else:
        #     mean_loss += torch.sum(loss)
        # info = {'loss': loss.item(), 'cls': model.losses['cls'], 'conf': model.losses['conf']}

        # for tag, value in info.items():
        #     logger.scalar_summary(tag, value, epoch)

        # Per-batch progress line with the loss components stored on the
        # wrapped model (model.module.losses).
        print('%s [Epoch %d/%d, Batch %d/%d Losse s: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f, recall: %.5f]' %
              (strftime, epoch, opt.epochs, batch_i, len(dataloader),
               model.module.losses['x'], model.module.losses['y'], model.module.losses['w'],
               model.module.losses['h'], model.module.losses['conf'], model.module.losses['cls'],
               torch.sum(loss), model.module.losses['recall']))

    # Save a checkpoint every opt.checkpoint_interval epochs (epoch 0 included)
    if epoch % opt.checkpoint_interval == 0:
        model.module.save_weights('%s/%d.weights' % (opt.checkpoint_dir, epoch))


# -*- coding:utf-8 -*-
from __future__ import division

from models import *
from utils.utils import *
from utils.datasets import *
from utils.parse_config import *
from logger import Logger
import os
import sys
import time
import datetime
import argparse

import torch
from torch.utils.data import DataLoader

from torch.autograd import Variable
import torch.optim as optim

# Command-line options as (flag, value type, default, help text). They are
# registered in this order, which is also the order shown by --help.
_CLI_OPTIONS = (
    ('--epochs', int, 2001, 'number of epochs'),
    ('--image_folder', str, 'data/samples', 'path to dataset'),
    ('--batch_size', int, 4, 'size of each image batch'),
    ('--learning_rate', float, 0.01, 'learning_rate'),
    ('--train_dir', str, r'E:\team-CV\dataset\tiny_data\VOC2007/', 'train_dir'),
    ('--model_config_path', str, 'config/yolov3_2cls.cfg', 'path to model config file'),
    ('--data_config_path', str, 'config/coco.data', 'path to data config file'),
    # To continue from a saved checkpoint pass e.g.
    # --weights_path checkpoints/40.weights
    ('--weights_path', str, 'weights/yolov3.weights', 'path to weights file'),
    ('--class_path', str, 'data/coco_2cls.names', 'path to class label file'),
    ('--conf_thres', float, 0.8, 'object confidence threshold'),
    ('--nms_thres', float, 0.4, 'iou thresshold for non-maximum suppression'),
    ('--n_cpu', int, 0, 'number of cpu threads to use during batch generation'),
    ('--img_size', int, 416, 'size of each image dimension'),
    ('--checkpoint_interval', int, 4, 'interval between saving model weights'),
    ('--checkpoint_dir', str, 'checkpoints', 'directory where model checkpoints are saved'),
)

parser = argparse.ArgumentParser()
for _flag, _value_type, _default, _help_text in _CLI_OPTIONS:
    parser.add_argument(_flag, type=_value_type, default=_default, help=_help_text)
opt = parser.parse_args()
print(opt)

os.makedirs('output', exist_ok=True)
# Create the directory checkpoints are actually written to (it defaults to
# 'checkpoints').
os.makedirs(opt.checkpoint_dir, exist_ok=True)


def adjust_learning_rate(optimizer, decay_rate=0.5):
    """Multiply the learning rate of every parameter group by decay_rate.

    Groups whose learning rate is already at or below 1e-8 are left alone.

    Args:
        optimizer: a torch optimizer, or a wrapper exposing the optimizer as
            its `.module` attribute (accepted for backward compatibility).
        decay_rate: factor applied to each learning rate.
    """
    target = getattr(optimizer, 'module', optimizer)
    for param_group in target.param_groups:
        if param_group['lr'] > 1e-8:
            param_group['lr'] = param_group['lr'] * decay_rate
    print(target)


# is_available must be *called*; the bare function object is always truthy.
cuda = torch.cuda.is_available()

classes = load_classes(opt.class_path)

module_defs = parse_model_config(opt.model_config_path)
hyperparams = module_defs[0]
# The hyper-parameter block carries the anchors as a flat comma-separated
# list "w0,h0,w1,h1,..."; turn it into (w, h) pairs.
anchors = [int(x) for x in hyperparams["anchors"].split(",")]
anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
# Hand the shared anchor list to every yolo block instead of relying on the
# fixed positions 83 / 95 / 107.
for module_def in module_defs:
    if module_def.get('type') == 'yolo':
        module_def["anchors"] = anchors
batch_size      = opt.batch_size  # int(hyperparams['batch'])
subdivisions    = int(hyperparams['subdivisions'])
sub_batch       = batch_size // subdivisions
learning_rate   = opt.learning_rate
momentum        = float(hyperparams['momentum'])
decay           = float(hyperparams['decay'])
burn_in         = int(hyperparams['burn_in'])
hyperparams['height'] = hyperparams['width'] = opt.img_size

if __name__ == '__main__':
    dataloader = torch.utils.data.DataLoader(
        ListDataset(opt.train_dir, img_size=opt.img_size, is_training=1, data_size=10000),
        batch_size=batch_size, shuffle=True, num_workers=opt.n_cpu)

    model = Darknet(module_defs, img_size=opt.img_size)
    model.load_weights(opt.weights_path, is_training=True)
    # model.apply(weights_init_normal)

    # Use however many GPUs are really present; fall back to the CPU.
    ngpus = torch.cuda.device_count() if cuda else 0
    device = torch.device("cuda" if ngpus >= 1 else "cpu")
    if ngpus > 1:
        # DataParallel splits each batch across the GPUs and gathers the
        # per-replica outputs, so the returned loss has one entry per GPU.
        model = torch.nn.DataParallel(model)
    model = model.to(device)
    # The underlying Darknet instance, with or without the DataParallel
    # wrapper (needed for .losses and .save_weights).
    net = model.module if isinstance(model, torch.nn.DataParallel) else model

    model.train()
    Tensor = torch.cuda.FloatTensor if ngpus >= 1 else torch.FloatTensor
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate/batch_size, momentum=momentum, dampening=0, weight_decay=decay*batch_size)
    # The optimizer is used directly: it is not an nn.Module and must not be
    # wrapped in DataParallel.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate / batch_size,
                           weight_decay=decay * batch_size)
    print("subdivisions", subdivisions)
    logger = Logger('./logs')
    total_loss = 0
    last_total_loss = 0
    for epoch in range(opt.epochs):
        for batch_i, (_, imgs, targets) in enumerate(dataloader):
            imgs = Variable(imgs.type(Tensor))
            targets = Variable(targets.type(Tensor), requires_grad=False)
            optimizer.zero_grad()
            loss = model(imgs, targets)
            # Fold the per-GPU losses into one scalar before backpropagation
            batch_loss = torch.sum(loss)
            batch_loss.backward()
            optimizer.step()
            # Track plain floats so the running total does not keep the
            # autograd graphs of earlier batches alive.
            batch_loss_value = batch_loss.item()
            strftime = datetime.datetime.now().strftime("%H:%M:%S")

            # Every 40th batch: decay the learning rate when the accumulated
            # loss grew by more than 1% over the last recorded window,
            # otherwise record it as the new reference.
            if batch_i % 40 == 39:
                if last_total_loss > 0 and total_loss > last_total_loss * 1.01:
                    print("total_loss", total_loss)
                    adjust_learning_rate(optimizer)
                else:
                    last_total_loss = total_loss
                total_loss = batch_loss_value
            elif batch_i == 0:
                total_loss = batch_loss_value
            else:
                total_loss += batch_loss_value

            print('%s [Epoch %d/%d, Batch %d/%d Losses: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f, recall: %.5f]' %
                  (strftime, epoch, opt.epochs, batch_i, len(dataloader),
                   net.losses['x'], net.losses['y'], net.losses['w'],
                   net.losses['h'], net.losses['conf'], net.losses['cls'],
                   batch_loss_value, net.losses['recall']))

        if epoch % opt.checkpoint_interval == 0:
            net.save_weights('%s/%d.weights' % (opt.checkpoint_dir, epoch))

你可能感兴趣的:(torch,yolo)