Faster RCNN PyTorch Code Walkthrough

Faster RCNN original paper

Source of the code discussed in this post

Part 1: Data Processing

This part corresponds to the files under the data directory.

File1: dataset.py

from __future__ import  absolute_import
from __future__ import  division
import torch as t
from data.voc_dataset import VOCBboxDataset
from skimage import transform as sktsf
from torchvision import transforms as tvtsf
from data import util
import numpy as np
from utils.config import opt

1. def inverse_normalize(img)

def inverse_normalize(img):
    if opt.caffe_pretrain:
        img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1))
        return img[::-1, :, :]
    # approximate un-normalize for visualize
    return (img * 0.225 + 0.45).clip(min=0, max=1) * 255

Purpose: undo the normalization of an already-normalized image so that it can be visualized.
Implementation: this is the inverse of the normalization step. opt.caffe_pretrain indicates whether a Caffe-pretrained model is used; if so, the inverse of caffe_normalize is applied, otherwise the inverse of pytorch_normalize is applied.

2. def pytorch_normalze(img)

def pytorch_normalze(img):
    normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225])
    img = normalize(t.from_numpy(img))
    return img.numpy()

Purpose: PyTorch-style normalization.
Implementation: applies (x - mean) / std, using the per-channel mean and standard deviation of the ImageNet images (each pixel's RGB values in img have already been divided by 255).
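A minimal standalone sketch (dummy data, not from the repo) of the same normalization, for reference:

import numpy as np
import torch as t
from torchvision import transforms as tvtsf

img = np.random.rand(3, 4, 4).astype(np.float32)   # CHW image with values already in [0, 1]
normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
out = normalize(t.from_numpy(img)).numpy()
print(out.shape)   # (3, 4, 4), roughly zero-mean per channel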

3. def caffe_normalize(img)

def caffe_normalize(img):
    """
    return appr -125-125 BGR
    """
    img = img[[2, 1, 0], :, :]  # RGB-BGR
    img = img * 255
    mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
    img = (img - mean).astype(np.float32, copy=True)
    return img

Purpose: Caffe-style normalization.
Implementation: Caffe models expect BGR images, so the channel order is first swapped from RGB to BGR; the image is scaled back to the 0-255 range, and then the per-channel mean is subtracted from each channel (zero-centering).

4. def preprocess(img, min_size=600, max_size=1000)

def preprocess(img, min_size=600, max_size=1000):
    C, H, W = img.shape
    scale1 = min_size / min(H, W)
    scale2 = max_size / max(H, W)
    scale = min(scale1, scale2)
    img = img / 255.
    img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect',anti_aliasing=False)
    if opt.caffe_pretrain:
        normalize = caffe_normalize
    else:
        normalize = pytorch_normalze
    return normalize(img)

Purpose: preprocess an image before feature extraction. The main goal is to rescale the image so that both its height and width fall within [min_size, max_size], and then normalize it.
Implementation: the input img is a numpy.ndarray in CHW order with RGB channels. scale1 is the factor that would bring the shorter side to min_size, and scale2 the factor that would bring the longer side to max_size; the smaller of the two is used as the scale factor. After resizing, either the shorter side equals min_size and the longer side lies within [min_size, max_size], or the longer side equals max_size and neither side exceeds max_size.
Proof for the first case:
Assume H < W and scale1 < scale2, so scale = scale1 = min_size / H.
Clearly H becomes min_size after scaling.
For the other side, W * scale = W * (min_size / H) > H * scale = min_size, and since scale1 <= scale2, i.e. min_size / H <= max_size / W, we also have W * scale <= max_size.
Hence min_size < W * scale <= max_size.
Finally the image is normalized, calling either pytorch_normalze or caffe_normalize depending on opt.caffe_pretrain.
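A small numeric sketch (made-up image size) of how the scale factor is chosen:

min_size, max_size = 600, 1000
H, W = 375, 500
scale1 = min_size / min(H, W)   # 600 / 375 = 1.6
scale2 = max_size / max(H, W)   # 1000 / 500 = 2.0
scale = min(scale1, scale2)     # 1.6: the shorter side becomes exactly 600
print(H * scale, W * scale)     # 600.0 800.0 -> both sides end up within [600, 1000]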

5. class Transform(object)

class Transform(object):
    # 可调用对象
    def __init__(self, min_size=600, max_size=1000):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, in_data):
        img, bbox, label = in_data
        _, H, W = img.shape
        img = preprocess(img, self.min_size, self.max_size)
        _, o_H, o_W = img.shape
        scale = o_H / H
        bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))

        # horizontally flip
        img, params = util.random_flip(
            img, x_random=True, return_param=True)
        bbox = util.flip_bbox(
            bbox, (o_H, o_W), x_flip=params['x_flip'])

        return img, bbox, label, scale

Purpose: a callable object that bundles together all the transformations applied to one training sample and the related parameters.
Implementation: the input is an (img, bbox, label) tuple. img is preprocessed with the preprocess function described above, bbox is rescaled by the same factor as the image, and then the image is randomly flipped horizontally with bbox flipped accordingly.
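A minimal usage sketch (dummy data; it assumes the repo's data package is importable):

import numpy as np

tsf = Transform(min_size=600, max_size=1000)
img = (np.random.rand(3, 375, 500) * 255).astype(np.float32)   # CHW, RGB, values in 0-255
bbox = np.array([[50., 60., 200., 300.]], dtype=np.float32)    # one box, (ymin, xmin, ymax, xmax)
label = np.array([11], dtype=np.int32)                         # e.g. class 'dog'
img_t, bbox_t, label_t, scale = tsf((img, bbox, label))
print(img_t.shape, scale)   # image resized by about 1.6, bbox scaled (and possibly flipped) accordingly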

6. class Dataset

class Dataset:
    def __init__(self, opt):
        self.opt = opt
        self.db = VOCBboxDataset(opt.voc_data_dir)
        self.tsf = Transform(opt.min_size, opt.max_size)

    def __getitem__(self, idx):
        ori_img, bbox, label, difficult = self.db.get_example(idx)

        img, bbox, label, scale = self.tsf((ori_img, bbox, label))
        # TODO: check whose stride is negative to fix this instead copy all
        # some of the strides of a given numpy array are negative.
        return img.copy(), bbox.copy(), label.copy(), scale

    def __len__(self):
        return len(self.db)

Purpose: the training dataset.
Implementation: a further wrapper around the VOCBboxDataset dataset (explained in detail later) used as the training dataset. It takes one sample from self.db (the VOCBboxDataset), transforms it with self.tsf (the Transform described above), and returns the result.

7. class TestDataset

class TestDataset:
    def __init__(self, opt, split='test', use_difficult=True):
        self.opt = opt
        self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult)

    def __getitem__(self, idx):
        ori_img, bbox, label, difficult = self.db.get_example(idx)
        img = preprocess(ori_img)
        return img, ori_img.shape[1:], bbox, label, difficult

    def __len__(self):
        return len(self.db)

Purpose: the test dataset.
Implementation: similar to the training dataset, except that the test set only runs preprocess on the image (rescaling and normalization) and skips augmentations such as flipping. In addition, the original image size is returned as well, presumably so that the detections can be visualized afterwards.

2019/11/05

File2: voc_dataset.py

Before reading this part you should have a basic understanding of the PASCAL VOC dataset; see this blog post for reference.

import os
import xml.etree.ElementTree as ET
import numpy as np
from .util import read_image

1. class VOCBboxDataset

class VOCBboxDataset:
    def __init__(self, data_dir, split='trainval',
                 use_difficult=False, return_difficult=False,
                 ):

        # if split not in ['train', 'trainval', 'val']:
        #     if not (split == 'test' and year == '2007'):
        #         warnings.warn(
        #             'please pick split from \'train\', \'trainval\', \'val\''
        #             'for 2012 dataset. For 2007 dataset, you can pick \'test\''
        #             ' in addition to the above mentioned splits.'
        #         )
        id_list_file = os.path.join(
            data_dir, 'ImageSets/Main/{0}.txt'.format(split))

        self.ids = [id_.strip() for id_ in open(id_list_file)]
        self.data_dir = data_dir
        self.use_difficult = use_difficult
        self.return_difficult = return_difficult
        self.label_names = VOC_BBOX_LABEL_NAMES

    def __len__(self):
        return len(self.ids)

    def get_example(self, i):
        id_ = self.ids[i]
        anno = ET.parse(
            os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
        bbox = list()
        label = list()
        difficult = list()
        for obj in anno.findall('object'):
            # when not using the difficult split and the object is
            # difficult, skip it.
            if not self.use_difficult and int(obj.find('difficult').text) == 1:
                continue

            difficult.append(int(obj.find('difficult').text))
            bndbox_anno = obj.find('bndbox')
            # subtract 1 to make pixel indexes 0-based
            bbox.append([
                int(bndbox_anno.find(tag).text) - 1
                for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
            name = obj.find('name').text.lower().strip()
            label.append(VOC_BBOX_LABEL_NAMES.index(name))
        bbox = np.stack(bbox).astype(np.float32)
        label = np.stack(label).astype(np.int32)
        # When `use_difficult==False`, all elements in `difficult` are False.
        difficult = np.array(difficult, dtype=np.bool).astype(np.uint8)  # PyTorch don't support np.bool

        # Load a image
        img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
        img = read_image(img_file, color=True)

        # if self.return_difficult:
        #     return img, bbox, label, difficult
        return img, bbox, label, difficult

    __getitem__ = get_example

Purpose: construct the VOC bounding-box dataset.
Implementation: implements the constructor, __len__ and __getitem__ (get_example).

1.1 def __init__(self, data_dir, split='trainval', use_difficult=False, return_difficult=False)

Purpose: initialize a VOCBboxDataset object.
Implementation:
Parameters:
data_dir (string): path to the root directory of the VOC data, e.g. "/data/image/voc/VOCdevkit/VOC2007/"
split ({'train', 'val', 'trainval', 'test'}): which split of the data to use (the test split is only available for the 2007 dataset)
year ({'2007', '2012'}): which year's VOC dataset to use
use_difficult (bool): if True, images whose annotations are marked as difficult are also used
return_difficult (bool): if True, __getitem__ returns, in addition to the image, its bboxes and their labels, a boolean array with one element per bbox indicating whether that bbox is marked as difficult
Details:
Only one point needs explaining: the directory data_dir + 'ImageSets/Main' stores the index files of the splits; the ids of the train split are stored in train.txt, those of the trainval split in trainval.txt, and those of the val split in val.txt.

1.2 def __len__(self)

The number of ids read from the txt file is the length of the dataset.

1.3 def get_example(self, i) (__getitem__)

Purpose: return the i-th sample.
Implementation:
Return values:
img: the array for one image, img.dtype = numpy.float32, CHW, RGB
bbox: all object boxes in img, bbox.dtype = numpy.float32, bbox.shape = (R, 4), where R is the number of boxes in the image; effectively a list of R (y_min, x_min, y_max, x_max) tuples
label: the label of each box, label.dtype = numpy.int32, values in 0-19 corresponding to the 20 classes, label.shape = (R,)
difficult: whether each box is marked as difficult, difficult.dtype = numpy.bool
Details:
The overall idea: the index i is mapped to the id id_; the image is read from JPEGImages using this id, and the corresponding xml file in Annotations is parsed to obtain bbox, label and difficult.

2. VOC_BBOX_LABEL_NAMES

VOC_BBOX_LABEL_NAMES = (
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor')

The class names corresponding to labels 0-19.

File3: util.py

import numpy as np
from PIL import Image
import random

1. def read_image(path, dtype=np.float32, color=True)

def read_image(path, dtype=np.float32, color=True):
    f = Image.open(path)
    try:
        if color:
            img = f.convert('RGB')
        else:
            img = f.convert('P')
        img = np.asarray(img, dtype=dtype)
    finally:
        if hasattr(f, 'close'):
            f.close()

    if img.ndim == 2:
        # reshape (H, W) -> (1, H, W)
        return img[np.newaxis]
    else:
        # transpose (H, W, C) -> (C, H, W)
        return img.transpose((2, 0, 1))

Purpose: read an image from disk and convert it to CHW format.

2. def resize_bbox(bbox, in_size, out_size)

def resize_bbox(bbox, in_size, out_size):
    bbox = bbox.copy()
    y_scale = float(out_size[0]) / in_size[0]
    x_scale = float(out_size[1]) / in_size[1]
    bbox[:, 0] = y_scale * bbox[:, 0]
    bbox[:, 2] = y_scale * bbox[:, 2]
    bbox[:, 1] = x_scale * bbox[:, 1]
    bbox[:, 3] = x_scale * bbox[:, 3]
    return bbox

Purpose: resize bboxes consistently with an image resize; in_size and out_size are the (H, W) of the image before and after resizing.
Implementation: note the bbox format (y_min, x_min, y_max, x_max); the y coordinates are multiplied by the y scale and the x coordinates by the x scale.
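A small sketch (made-up numbers, assuming resize_bbox from this file is importable): resizing one box when the image goes from (375, 500) to (600, 800):

import numpy as np

bbox = np.array([[50., 60., 200., 300.]], dtype=np.float32)   # (ymin, xmin, ymax, xmax)
out = resize_bbox(bbox, in_size=(375, 500), out_size=(600, 800))
print(out)   # [[ 80.  96. 320. 480.]] -- both y and x are scaled by 1.6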

3. def flip_bbox(bbox, size, y_flip=False, x_flip=False)

def flip_bbox(bbox, size, y_flip=False, x_flip=False):
    H, W = size
    bbox = bbox.copy()
    if y_flip:
        y_max = H - bbox[:, 0]
        y_min = H - bbox[:, 2]
        bbox[:, 0] = y_min
        bbox[:, 2] = y_max
    if x_flip:
        x_max = W - bbox[:, 1]
        x_min = W - bbox[:, 3]
        bbox[:, 1] = x_min
        bbox[:, 3] = x_max
    return bbox

Purpose: flip the bboxes consistently with a flipped image of size (H, W); for a horizontal flip, the new x_min is W - x_max and the new x_max is W - x_min (and analogously for a vertical flip).

4. def crop_bbox(bbox, y_slice=None, x_slice=None, allow_outside_center=True, return_param=False)

def crop_bbox(
        bbox, y_slice=None, x_slice=None,
        allow_outside_center=True, return_param=False):
    t, b = _slice_to_bounds(y_slice)
    l, r = _slice_to_bounds(x_slice)
    crop_bb = np.array((t, l, b, r))

    if allow_outside_center:
        mask = np.ones(bbox.shape[0], dtype=bool)
    else:
        center = (bbox[:, :2] + bbox[:, 2:]) / 2.0
        mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \
            .all(axis=1)

    bbox = bbox.copy()
    bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2])
    bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:])
    bbox[:, :2] -= crop_bb[:2]
    bbox[:, 2:] -= crop_bb[:2]

    mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1))
    bbox = bbox[mask]

    if return_param:
        return bbox, {'index': np.flatnonzero(mask)}
    else:
        return bbox

Purpose: crop bboxes consistently with a crop of the image. A bbox completely inside the cropped area is kept; one completely outside is discarded; one partially outside is truncated at the crop boundary.
Implementation:
Parameters:
bbox: (R, 4)
y_slice: slice along the y axis giving where the cropped image lies in the original image
x_slice: slice along the x axis giving where the cropped image lies in the original image
allow_outside_center (bool): if False, bboxes whose centers fall outside the cropped area are also discarded
return_param (bool): if True, additionally return the indices of the bboxes that were kept
Details:
Trick: discarding a bbox does not need an actual del; a boolean mask of length R is used, initialized to all True, and set to False for any bbox to discard; bbox = bbox[mask] then performs the discarding.
Overall flow (see the sketch after this paragraph): first y_slice and x_slice are converted into the full crop boundary (t, l, b, r). If allow_outside_center is False, bboxes whose center does not satisfy (t, l) <= center < (b, r) are discarded. Then each bbox is clipped: its (ymin, xmin) becomes the maximum of itself and the crop's (t, l), its (ymax, xmax) the minimum of itself and the crop's (b, r); then (t, l) is subtracted from all coordinates, giving coordinates in a frame whose origin is the top-left corner of the cropped area. Finally, bboxes that do not satisfy (ymin, xmin) < (ymax, xmax), i.e. those lying entirely outside the cropped area, are removed.
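A small sketch (made-up boxes, assuming crop_bbox from this file is importable): cropping to the region y in [100, 400), x in [100, 400):

import numpy as np

bbox = np.array([[ 50.,  50., 150., 150.],    # partially inside  -> truncated
                 [200., 200., 300., 300.],    # fully inside      -> kept and shifted
                 [450., 450., 500., 500.]],   # fully outside     -> dropped
                dtype=np.float32)
out, param = crop_bbox(bbox, y_slice=slice(100, 400), x_slice=slice(100, 400),
                       return_param=True)
print(out)             # [[  0.   0.  50.  50.] [100. 100. 200. 200.]]
print(param['index'])  # [0 1]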

5. def _slice_to_bounds(slice_)

def _slice_to_bounds(slice_):
    if slice_ is None:
        return 0, np.inf
    if slice_.start is None:
        l = 0
    else:
        l = slice_.start
    if slice_.stop is None:
        u = np.inf
    else:
        u = slice_.stop
    return l, u

Purpose: convert a slice object into its lower and upper bounds (None maps to 0 and np.inf respectively).

6. def translate_bbox(bbox, y_offset=0, x_offset=0)

def translate_bbox(bbox, y_offset=0, x_offset=0):
    out_bbox = bbox.copy()
    out_bbox[:, :2] += (y_offset, x_offset)
    out_bbox[:, 2:] += (y_offset, x_offset)
    return out_bbox

Purpose: translate bboxes by a fixed offset, mainly used together with image padding.

7. def random_flip(img, y_random=False, x_random=False,return_param=False, copy=False)

def random_flip(img, y_random=False, x_random=False,
                return_param=False, copy=False):
    y_flip, x_flip = False, False
    if y_random:
        y_flip = random.choice([True, False])
    if x_random:
        x_flip = random.choice([True, False])

    if y_flip:
        img = img[:, ::-1, :]
    if x_flip:
        img = img[:, :, ::-1]

    if copy:
        img = img.copy()

    if return_param:
        return img, {'y_flip': y_flip, 'x_flip': x_flip}
    else:
        return img

Purpose: randomly flip an image vertically and/or horizontally, optionally returning which flips were applied.

2019/11/07

Part 2: Model Preparation

This part corresponds to the files under the model/utils directory.

File1: bbox_tools.py

import numpy as np
import numpy as xp
import six
from six import __init__

Before looking at the next two functions, recall this set of formulas:

$$\hat{G}_x = P_w t_x + P_x \qquad \hat{G}_y = P_h t_y + P_y \qquad \hat{G}_w = P_w \exp(t_w) \qquad \hat{G}_h = P_h \exp(t_h)$$

1. def loc2bbox(src_bbox, loc)

def loc2bbox(src_bbox, loc):
    if src_bbox.shape[0] == 0:
        return xp.zeros((0, 4), dtype=loc.dtype)

    src_bbox = src_bbox.astype(src_bbox.dtype, copy=False)

    src_height = src_bbox[:, 2] - src_bbox[:, 0]
    src_width = src_bbox[:, 3] - src_bbox[:, 1]
    src_ctr_y = src_bbox[:, 0] + 0.5 * src_height
    src_ctr_x = src_bbox[:, 1] + 0.5 * src_width

    dy = loc[:, 0::4]
    dx = loc[:, 1::4]
    dh = loc[:, 2::4]
    dw = loc[:, 3::4]

    ctr_y = dy * src_height[:, xp.newaxis] + src_ctr_y[:, xp.newaxis]
    ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis]
    h = xp.exp(dh) * src_height[:, xp.newaxis]
    w = xp.exp(dw) * src_width[:, xp.newaxis]

    dst_bbox = xp.zeros(loc.shape, dtype=loc.dtype)
    dst_bbox[:, 0::4] = ctr_y - 0.5 * h
    dst_bbox[:, 1::4] = ctr_x - 0.5 * w
    dst_bbox[:, 2::4] = ctr_y + 0.5 * h
    dst_bbox[:, 3::4] = ctr_x + 0.5 * w

    return dst_bbox

Purpose: given source boxes $(P_x, P_y, P_w, P_h)$ (actually supplied in the $(P_{ymin}, P_{xmin}, P_{ymax}, P_{xmax})$ form) and offsets $(t_y, t_x, t_h, t_w)$, compute the shifted boxes $(\hat{G}_x, \hat{G}_y, \hat{G}_w, \hat{G}_h)$ (converted back to the $(G_{ymin}, G_{xmin}, G_{ymax}, G_{xmax})$ form before returning).
Implementation: first convert $(P_{ymin}, P_{xmin}, P_{ymax}, P_{xmax})$ to $(P_x, P_y, P_w, P_h)$, then apply the formulas above, and finally convert the resulting boxes back to the $(G_{ymin}, G_{xmin}, G_{ymax}, G_{xmax})$ form.

2. def bbox2loc(src_bbox, dst_bbox)

def bbox2loc(src_bbox, dst_bbox):
    height = src_bbox[:, 2] - src_bbox[:, 0]
    width = src_bbox[:, 3] - src_bbox[:, 1]
    ctr_y = src_bbox[:, 0] + 0.5 * height
    ctr_x = src_bbox[:, 1] + 0.5 * width

    base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
    base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
    base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
    base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width

    eps = xp.finfo(height.dtype).eps
    height = xp.maximum(height, eps)
    width = xp.maximum(width, eps)

    dy = (base_ctr_y - ctr_y) / height
    dx = (base_ctr_x - ctr_x) / width
    dh = xp.log(base_height / height)
    dw = xp.log(base_width / width)

    loc = xp.vstack((dy, dx, dh, dw)).transpose()
    return loc

Purpose: given source boxes $(P_{ymin}, P_{xmin}, P_{ymax}, P_{xmax})$ and target boxes $(G_{ymin}, G_{xmin}, G_{ymax}, G_{xmax})$, compute the offsets $(t_y, t_x, t_h, t_w)$.
Note that both functions operate in batch: the inputs are (R, 4) arrays and so are the outputs.
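For reference, bbox2loc computes the inverse of the formulas recalled above (the standard R-CNN parameterization), which is exactly what the dy, dx, dh, dw lines in the code do:

$$t_y = \frac{G_y - P_y}{P_h} \qquad t_x = \frac{G_x - P_x}{P_w} \qquad t_h = \log\frac{G_h}{P_h} \qquad t_w = \log\frac{G_w}{P_w}$$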

3. def bbox_iou(bbox_a, bbox_b)

def bbox_iou(bbox_a, bbox_b):
    if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4:
        raise IndexError

    # top left
    tl = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
    # bottom right
    br = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])

    area_i = xp.prod(br - tl, axis=2) * (tl < br).all(axis=2)
    area_a = xp.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
    area_b = xp.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
    return area_i / (area_a[:, None] + area_b - area_i)

Purpose: compute the IoU between two sets of bboxes.
Implementation: bbox_a.shape = (N, 4) and bbox_b.shape = (K, 4); every box in a is paired with every box in b, so N*K IoU values are computed and the return value has shape (N, K). bbox_a[:, None, :2].shape = (N, 1, 2) and bbox_b[:, :2].shape = (K, 2), so by broadcasting tl.shape = (N, K, 2), and likewise br.shape = (N, K, 2). (br - tl).shape = (N, K, 2); along axis 2 the first element is the height and the second the width of the intersection, so the product along axis 2 gives the intersection area area_i of shape (N, K) (the factor (tl < br).all(axis=2) zeroes out pairs that do not overlap). Similarly, area_a and area_b are the areas of the boxes in a and b respectively.
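A small sketch (made-up boxes, assuming bbox_iou from this file is importable):

import numpy as np

bbox_a = np.array([[0., 0., 10., 10.]], dtype=np.float32)    # (1, 4)
bbox_b = np.array([[0., 0., 10., 10.],
                   [5., 5., 15., 15.]], dtype=np.float32)    # (2, 4)
print(bbox_iou(bbox_a, bbox_b))   # [[1.        0.1428...]] -> shape (1, 2): 25 / (100 + 100 - 25)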

4. def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],anchor_scales=[8, 16, 32])

def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
                         anchor_scales=[8, 16, 32]):
    py = base_size / 2.
    px = base_size / 2.

    anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
                           dtype=np.float32)
    for i in six.moves.range(len(ratios)):
        for j in six.moves.range(len(anchor_scales)):
            h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
            w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])

            index = i * len(anchor_scales) + j
            anchor_base[index, 0] = py - h / 2.
            anchor_base[index, 1] = px - w / 2.
            anchor_base[index, 2] = py + h / 2.
            anchor_base[index, 3] = px + w / 2.
    return anchor_base                       

Purpose: generate the base anchors centered at (base_size/2, base_size/2).
Implementation: this shows how an anchor is generated: given the center, base_size, a ratio and an anchor_scale, specifying h and w fully determines the anchor. The scaled side length is base_size * anchor_scale; keeping h*w constant while requiring h/w = ratio gives h = base_size * anchor_scale * sqrt(ratio) and w = base_size * anchor_scale * sqrt(1/ratio). Iterating over all (ratio, anchor_scale) combinations yields len(ratios) * len(anchor_scales) anchors, so the return value is a (len(ratios) * len(anchor_scales), 4) array.
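A small sketch of the default output (assuming generate_anchor_base from this file is importable):

anchor_base = generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
                                   anchor_scales=[8, 16, 32])
print(anchor_base.shape)   # (9, 4)
print(anchor_base[4])      # the ratio=1, scale=16 anchor: [-120. -120.  136.  136.]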

File2: creator_tool.py

1. class ProposalCreator

class ProposalCreator:
    def __init__(self,
                 parent_model,
                 nms_thresh=0.7,
                 n_train_pre_nms=12000,
                 n_train_post_nms=2000,
                 n_test_pre_nms=6000,
                 n_test_post_nms=300,
                 min_size=16
                 ):
        self.parent_model = parent_model
        self.nms_thresh = nms_thresh
        self.n_train_pre_nms = n_train_pre_nms
        self.n_train_post_nms = n_train_post_nms
        self.n_test_pre_nms = n_test_pre_nms
        self.n_test_post_nms = n_test_post_nms
        self.min_size = min_size

    def __call__(self, loc, score,
                 anchor, img_size, scale=1.):
        # NOTE: when test, remember
        # faster_rcnn.eval()
        # to set self.traing = False
        if self.parent_model.training:
            n_pre_nms = self.n_train_pre_nms
            n_post_nms = self.n_train_post_nms
        else:
            n_pre_nms = self.n_test_pre_nms
            n_post_nms = self.n_test_post_nms

        # Convert anchors into proposal via bbox transformations.
        # roi = loc2bbox(anchor, loc)
        roi = loc2bbox(anchor, loc)

        # Clip predicted boxes to image.
        # a proposal (anchor + loc) may extend beyond the image border, so clip it back inside
        roi[:, slice(0, 4, 2)] = np.clip(
            roi[:, slice(0, 4, 2)], 0, img_size[0])
        # indices 0 and 2 are ymin and ymax
        roi[:, slice(1, 4, 2)] = np.clip(
            roi[:, slice(1, 4, 2)], 0, img_size[1])
        # indices 1 and 3 are xmin and xmax

        # Remove predicted boxes with either height or width < threshold.
        # proposals whose height or width mapped back to the original image is smaller than min_size are removed,
        # i.e. in the scaled image, proposals smaller than min_size * scale are removed
        min_size = self.min_size * scale
        hs = roi[:, 2] - roi[:, 0]
        ws = roi[:, 3] - roi[:, 1]
        keep = np.where((hs >= min_size) & (ws >= min_size))[0]
        roi = roi[keep, :]
        score = score[keep]

        # Sort all (proposal, score) pairs by score from highest to lowest.
        # Take top pre_nms_topN (e.g. 6000).
        order = score.ravel().argsort()[::-1]
        # n_pre_nms <= 0 means keep all of them
        if n_pre_nms > 0:
            order = order[:n_pre_nms]
        roi = roi[order, :]

        # Apply nms (e.g. threshold = 0.7).
        # Take after_nms_topN (e.g. 300).

        # unNOTE: somthing is wrong here!
        # TODO: remove cuda.to_gpu
        keep = non_maximum_suppression(
            cp.ascontiguousarray(cp.asarray(roi)),
            thresh=self.nms_thresh)
        if n_post_nms > 0:
            keep = keep[:n_post_nms]
        roi = roi[keep]
        return roi

Purpose: generate proposal regions.

1.1 def __init__(self, parent_model, nms_thresh=0.7, n_train_pre_nms=12000, n_train_post_nms=2000, n_test_pre_nms=6000, n_test_post_nms=300, min_size=16)

Purpose: instantiate a ProposalCreator object and store its parameters.
Implementation:
Parameters:
nms_thresh (float): threshold for non-maximum suppression (default 0.7)
n_train_pre_nms (int): in training mode, the n_train_pre_nms bboxes with the highest foreground probability are kept before NMS (default 12000)
n_train_post_nms (int): in training mode, the number of bboxes kept after NMS (default 2000)
n_test_pre_nms (int): in test mode, the n_test_pre_nms bboxes with the highest foreground probability are kept before NMS (default 6000)
n_test_post_nms (int): in test mode, the number of bboxes kept after NMS (default 300)
min_size (int): a bbox is discarded when its height or width, mapped back to the original image, is smaller than min_size

1.2 def __call__(self, loc, score, anchor, img_size, scale=1.)

Purpose: obtain bboxes from the anchors and the predicted offsets, clip them to the image boundary, keep the ones with the highest foreground probability, drop those whose height or width is too small, run NMS, and keep the top bboxes after NMS as the proposal regions.
Implementation:
Parameters:
loc (array): predicted offsets of the anchors, loc.shape = (R, 4)
score (array): probability of each anchor being foreground, score.shape = (R,)
anchor (array): coordinates of the anchors, anchor.shape = (R, 4)
img_size (tuple of ints): (height, width), the size of the image after scaling
scale (float): the scale factor used to scale the image
Return value:
An array of proposal box coordinates with shape (S, 4). In training mode S is at most n_train_post_nms; in test mode S is at most n_test_post_nms. S depends on the sizes of the predicted bboxes (anchors plus offsets) and on how many bboxes NMS discards.

import numpy as np
import cupy as cp
from model.utils.bbox_tools import bbox2loc, bbox_iou, loc2bbox
from model.utils.nms import non_maximum_suppression

2. class ProposalTargetCreator(object)

class ProposalTargetCreator(object):
    def __init__(self,
                 n_sample=128,
                 pos_ratio=0.25, pos_iou_thresh=0.5,
                 neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
                 ):
        self.n_sample = n_sample
        self.pos_ratio = pos_ratio
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh_hi = neg_iou_thresh_hi
        self.neg_iou_thresh_lo = neg_iou_thresh_lo  # NOTE:default 0.1 in py-faster-rcnn

    def __call__(self, roi, bbox, label,
                 loc_normalize_mean=(0., 0., 0., 0.),
                 loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
        n_bbox, _ = bbox.shape
        # n_bbox is the number of ground truth bboxes

        roi = np.concatenate((roi, bbox), axis=0)
        # merge the proposals and the ground truth boxes into one large roi array

        pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
        # pos_roi_per_image is the number of positives to produce, derived from pos_ratio
        iou = bbox_iou(roi, bbox)
        # iou.shape = (len(roi), len(bbox)): the IoU of each roi (row) with each ground truth bbox
        gt_assignment = iou.argmax(axis=1)
        # gt_assignment.shape = (len(roi),): for each roi, the index of the ground truth bbox with the highest IoU
        max_iou = iou.max(axis=1)
        # max_iou.shape = (len(roi),): for each roi, its highest IoU with any ground truth bbox
        # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
        # The label with value 0 is the background.
        gt_roi_label = label[gt_assignment] + 1
        # the label of the ground truth bbox each roi is attached to (shifted from 0-19 to 1-20)

        # Select foreground RoIs as those with >= pos_iou_thresh IoU.
        pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
        # np.where returns a tuple; even though it only holds one element here, it must be unpacked with [0]
        # pos_index holds the indices of all rois that reach the positive threshold
        pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
        # the final number of positives is the smaller of pos_roi_per_image and the number of rois above the threshold
        if pos_index.size > 0:
            pos_index = np.random.choice(
                pos_index, size=pos_roi_per_this_image, replace=False)
        # randomly draw pos_roi_per_this_image indices from pos_index as the final positives
        # pos_index.shape = (pos_roi_per_this_image,)

        # Select background RoIs as those within
        # [neg_iou_thresh_lo, neg_iou_thresh_hi).
        neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                             (max_iou >= self.neg_iou_thresh_lo))[0]
        neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
        neg_roi_per_this_image = int(min(neg_roi_per_this_image,
                                         neg_index.size))
        if neg_index.size > 0:
            neg_index = np.random.choice(
                neg_index, size=neg_roi_per_this_image, replace=False)
        # sampling the negatives works exactly like sampling the positives
        # note that a negative must have an IoU below the threshold with every ground truth bbox,
        # so it is enough that its maximum IoU is below the threshold
        # neg_index.shape = (neg_roi_per_this_image,)
        # pos_roi_per_this_image + neg_roi_per_this_image = n_sample

        # The indices that we're selecting (both positive and negative).
        keep_index = np.append(pos_index, neg_index)
        # concatenate the positive and negative indices
        # note that these indices refer to the merged roi array (proposals plus ground truth boxes)
        gt_roi_label = gt_roi_label[keep_index]
        # first set each roi's label to the label of the ground truth box it has the highest IoU with
        gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
        # then overwrite the labels of the negatives with 0
        sample_roi = roi[keep_index]

        # Compute offsets and scales to match sampled RoIs to the GTs.
        gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
        # compute the offsets between the sampled rois and the ground truth bboxes they are attached to
        # gt_assignment (defined above) stores, for each roi, the index of the ground truth bbox with the highest IoU
        gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
                       ) / np.array(loc_normalize_std, np.float32))

        return sample_roi, gt_roi_loc, gt_roi_label

Purpose: select n_sample rois from the proposal regions produced by ProposalCreator plus the ground truth bboxes, and assign each a label (0-20).

2.1 def __init__(self, n_sample=128, pos_ratio=0.25, pos_iou_thresh=0.5, neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0)

Purpose: initialize a ProposalTargetCreator object and set its parameters.
Implementation:
Parameters:
n_sample (int): the total number of bboxes to sample
pos_ratio (float): the fraction of positive samples
pos_iou_thresh (float): the minimum IoU with some ground truth box required for a bbox to be considered a positive
neg_iou_thresh_lo (float), neg_iou_thresh_hi (float): a bbox is considered a negative if its IoU with every ground truth box lies within [neg_iou_thresh_lo, neg_iou_thresh_hi)

2.2 def __call__(self, roi, bbox, label, loc_normalize_mean=(0., 0., 0., 0.), loc_normalize_std=(0.1, 0.1, 0.2, 0.2))

Purpose:
Randomly select n_sample rois from the proposals (roi) and the ground truth bboxes (bbox), with a fixed proportion of positives; when there are not enough positives, the remainder is filled with negatives.
Implementation:
Parameters:
roi (array): bboxes produced by ProposalCreator, roi.shape = (R, 4)
bbox (array): ground truth bounding boxes, bbox.shape = (R', 4)
label (array): ground truth bounding box labels, label.shape = (R',), with values in 0-19
loc_normalize_mean (tuple of four floats): mean of the offsets between the sampled rois and the ground truth bboxes they are attached to
loc_normalize_std (tuple of four floats): standard deviation of those offsets
Return values:
sample_roi: the sampled rois, sample_roi.shape = (self.n_sample, 4)
gt_roi_loc: the offsets between the sampled rois and the ground truth bboxes they are attached to, gt_roi_loc.shape = (self.n_sample, 4)
gt_roi_label: the labels of the sampled rois, gt_roi_label.shape = (self.n_sample,), with values in 0-20 where 0 means negative (background)
Details: the step-by-step walk-through is in the code comments above.

3. class AnchorTargetCreator(object)

class AnchorTargetCreator(object):
    def __init__(self,
                 n_sample=256,
                 pos_iou_thresh=0.7, neg_iou_thresh=0.3,
                 pos_ratio=0.5):
        self.n_sample = n_sample
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh = neg_iou_thresh
        self.pos_ratio = pos_ratio

    def __call__(self, bbox, anchor, img_size):
        img_H, img_W = img_size

        n_anchor = len(anchor)
        inside_index = _get_inside_index(anchor, img_H, img_W)
        # _get_inside_index returns the indices of the anchors that lie completely inside the image
        anchor = anchor[inside_index]
        # anchors not completely inside the image are removed for now and restored later with _unmap
        argmax_ious, label = self._create_label(
            inside_index, anchor, bbox)

        # compute bounding box regression targets
        loc = bbox2loc(anchor, bbox[argmax_ious])
        # the offset from each anchor to the gt bbox it has the highest IoU with, shape (a, 4)

        # map up to original set of anchors
        label = _unmap(label, n_anchor, inside_index, fill=-1)
        # after _unmap, label.shape is (R,) again; the labels of the previously removed anchors are set to -1
        loc = _unmap(loc, n_anchor, inside_index, fill=0)
        # after _unmap, loc.shape is (R, 4) again; the loc of the removed anchors is filled with `fill`
        # (the value does not matter since those entries are never used)

        return loc, label

    def _create_label(self, inside_index, anchor, bbox):
        # label: 1 is positive, 0 is negative, -1 is dont care
        label = np.empty((len(inside_index),), dtype=np.int32)
        label.fill(-1)

        argmax_ious, max_ious, gt_argmax_ious = \
            self._calc_ious(anchor, bbox, inside_index)

        # assign negative labels first so that positive labels can clobber them
        label[max_ious < self.neg_iou_thresh] = 0

        # positive label: for each gt, anchor with highest iou
        label[gt_argmax_ious] = 1

        # positive label: above threshold IOU
        label[max_ious >= self.pos_iou_thresh] = 1

        # subsample positive labels if we have too many
        n_pos = int(self.pos_ratio * self.n_sample)
        pos_index = np.where(label == 1)[0]
        if len(pos_index) > n_pos:
            disable_index = np.random.choice(
                pos_index, size=(len(pos_index) - n_pos), replace=False)
            label[disable_index] = -1

        # subsample negative labels if we have too many
        n_neg = self.n_sample - np.sum(label == 1)
        neg_index = np.where(label == 0)[0]
        if len(neg_index) > n_neg:
            disable_index = np.random.choice(
                neg_index, size=(len(neg_index) - n_neg), replace=False)
            label[disable_index] = -1

        return argmax_ious, label
        # return values:
        # argmax_ious: for each anchor, the index of the gt bbox with the highest IoU, shape (a,)
        # label: the label of each anchor, shape (a,)

    def _calc_ious(self, anchor, bbox, inside_index):
        # ious between the anchors and the gt boxes
        ious = bbox_iou(anchor, bbox)
        # ious.shape = (a, b)
        argmax_ious = ious.argmax(axis=1)
        # argmax_ious.shape = (a,): for each anchor, the index of the gt bbox with the highest IoU
        max_ious = ious[np.arange(len(inside_index)), argmax_ious]
        # equivalent to max_ious = ious.max(axis=1)
        gt_argmax_ious = ious.argmax(axis=0)
        # gt_argmax_ious.shape = (b,): for each gt bbox, the index of the anchor with the highest IoU
        gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
        # equivalent to gt_max_ious = ious.max(axis=0)
        gt_argmax_ious = np.where(ious == gt_max_ious)[0]
        # ious.shape = (a, b), gt_max_ious.shape = (b,), gt_argmax_ious.shape = (>=b,)
        # the steps above first find, for each gt bbox, the index of the anchor with the highest IoU,
        # then look up each gt bbox's maximum IoU via those indices
        # (this could also be done directly with .max),
        # and finally collect the indices of all anchors whose IoU equals some gt bbox's maximum IoU
        # (there may be more than one such anchor per gt bbox)
        return argmax_ious, max_ious, gt_argmax_ious
        # return values:
        # argmax_ious: for each anchor, the index of the gt bbox with the highest IoU, shape (a,)
        # max_ious: for each anchor, its maximum IoU, shape (a,)
        # gt_argmax_ious: the indices of anchors that attain the maximum IoU for some gt bbox, shape (>=b,)

Purpose: generate labeled training data for the RPN.

3.1 def __init__(self, n_sample=256, pos_iou_thresh=0.7, neg_iou_thresh=0.3, pos_ratio=0.5)

Purpose: initialize an AnchorTargetCreator object and set its parameters.
Implementation:
Parameters:
n_sample (int): the total number of anchors to sample
pos_ratio (float): the fraction of positive samples
pos_iou_thresh (float): the minimum IoU with some ground truth box required for an anchor to be considered a positive
neg_iou_thresh: an anchor is considered a negative if its IoU with every ground truth box is below neg_iou_thresh

3.2 def __call__(self, bbox, anchor, img_size)

Purpose: compute the offsets from the anchors to the gt bboxes and assign each anchor a label, then select n_sample anchors with a fraction pos_ratio of positives as one batch for training the RPN.
Implementation:
Parameters:
bbox (array): coordinates of the ground truth bboxes, bbox.shape = (R, 4)
anchor (array): coordinates of the anchors, anchor.shape = (S, 4)
img_size (tuple of ints): the image size (H, W)
Return values:
loc (array): offsets between the anchors and the ground truth bboxes, loc.shape = (S, 4)
label (array): whether each anchor is positive or negative, 0 = negative, 1 = positive, -1 = ignore, label.shape = (S,)

In addition, there are two helper functions that support AnchorTargetCreator; they were already mentioned while reading through the AnchorTargetCreator code, so they are not explained again here.

def _unmap(data, count, index, fill=0):
    # Unmap a subset of item (data) back to the original set of items (of
    # size count)

    if len(data.shape) == 1:
        ret = np.empty((count,), dtype=data.dtype)
        ret.fill(fill)
        ret[index] = data
    else:
        ret = np.empty((count,) + data.shape[1:], dtype=data.dtype)
        ret.fill(fill)
        ret[index, :] = data
    return ret

def _get_inside_index(anchor, H, W):
    # Calc indices of anchors which are located completely inside of the image
    # whose size is specified.
    index_inside = np.where(
        (anchor[:, 0] >= 0) &
        (anchor[:, 1] >= 0) &
        (anchor[:, 2] <= H) &
        (anchor[:, 3] <= W)
    )[0]
    return index_inside

2019/11/08 & 2019/11/10

Part 3: Model Construction

File1: region_proposal_network.py

import numpy as np
from torch.nn import functional as F
import torch as t
from torch import nn

from model.utils.bbox_tools import generate_anchor_base
from model.utils.creator_tool import ProposalCreator

1. class RegionProposalNetwork(nn.Module)

class RegionProposalNetwork(nn.Module):
    def __init__(
            self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2],
            anchor_scales=[8, 16, 32], feat_stride=16,
            proposal_creator_params=dict(),
    ):
        super(RegionProposalNetwork, self).__init__()
        self.anchor_base = generate_anchor_base(
            anchor_scales=anchor_scales, ratios=ratios)
        self.feat_stride = feat_stride
        self.proposal_layer = ProposalCreator(self, **proposal_creator_params)
        n_anchor = self.anchor_base.shape[0]
        self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
        self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)
        self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
        normal_init(self.conv1, 0, 0.01)
        normal_init(self.score, 0, 0.01)
        normal_init(self.loc, 0, 0.01)

    def forward(self, x, img_size, scale=1.):
        n, _, hh, ww = x.shape
        anchor = _enumerate_shifted_anchor(
            np.array(self.anchor_base),
            self.feat_stride, hh, ww)
        # anchor.shape = (A*hh*ww,4)

        n_anchor = anchor.shape[0] // (hh * ww)
        h = F.relu(self.conv1(x))
        # h.shape = (n,mid_channels,hh,ww)

        rpn_locs = self.loc(h)
        # rpn_locs.shape = (n,n_anchor * 4,hh,ww)
        # UNNOTE: check whether need contiguous
        # A: Yes
        rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)
        # after permute: (n, hh, ww, n_anchor * 4); after view: (n, hh*ww*n_anchor, 4)
        rpn_scores = self.score(h)
        rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous()
        #  rpn_scores.shape = (n,hh,ww,n_anchor * 2)
        rpn_softmax_scores = F.softmax(rpn_scores.view(n, hh, ww, n_anchor, 2), dim=4)
        rpn_fg_scores = rpn_softmax_scores[:, :, :, :, 1].contiguous()
        # rpn_fg_scores holds the foreground (fg) scores
        rpn_fg_scores = rpn_fg_scores.view(n, -1)
        #rpn_fg_scores.shape = (n,hh*ww*n_anchors)
        rpn_scores = rpn_scores.view(n, -1, 2)
        # rpn_scores.shape = (n,hh*ww*n_anchors,2)

        # ProposalCreator turns the anchors and the predicted offsets into bboxes, clips them to the image boundary,
        # keeps the ones with the highest foreground probability, drops those whose height or width is too small,
        # runs NMS, and keeps the top bboxes after NMS as the Proposal Regions
        rois = list()
        roi_indices = list()
        # generate rois for each of the n images separately, then concatenate them into one big rois array
        # to record which image each roi belongs to, every roi gets an index, i.e. a roi_indices array of the same length as rois is built
        for i in range(n):
            roi = self.proposal_layer(
                rpn_locs[i].cpu().data.numpy(),
                rpn_fg_scores[i].cpu().data.numpy(),
                anchor, img_size,
                scale=scale)
            batch_index = i * np.ones((len(roi),), dtype=np.int32)
            rois.append(roi)
            roi_indices.append(batch_index)

        rois = np.concatenate(rois, axis=0)
        roi_indices = np.concatenate(roi_indices, axis=0)
        return rpn_locs, rpn_scores, rois, roi_indices, anchor

Purpose: the implementation of the RPN (Region Proposal Network).

1.1 def __init__(self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32], feat_stride=16, proposal_creator_params=dict())

Purpose:
Store the parameters, define the building blocks of the RPN, and initialize the weights.
Implementation:
Parameters:
in_channels (int): number of input channels
mid_channels (int): number of channels after the convolution
ratios (list of floats): ratios of anchor height to width
anchor_scales (list of numbers): scales of the anchors (the scaled side length is base_size * anchor_scale, so the area is (base_size * anchor_scale)**2)
feat_stride (int): the distance in the original image between two adjacent feature points of the tensor fed into the RPN; in this code the feature extractor performs four stride-2 poolings, so feat_stride should be 16
initialW (callable): initial weights; when None, a Gaussian with standard deviation 0.1 is used
proposal_creator_params (dict): keyword arguments for model.utils.creator_tools.ProposalCreator
Main components (nn.Conv2d arguments: in_channels, out_channels, kernel, stride, padding):
conv1: convolution layer: in_channels, mid_channels, 3, 1, 1
score: fully convolutional layer: mid_channels, n_anchor * 2, 1, 1, 0
loc: fully convolutional layer: mid_channels, n_anchor * 4, 1, 1, 0
proposal_layer: a ProposalCreator

1.2 def forward(self, x, img_size, scale=1.)

Implementation:
Parameters:
x (~torch.autograd.Variable): features extracted from the images, x.shape = (N, C, H, W)
img_size (tuple of ints): the (height, width) of the images after scaling
scale: the scale factor
Return values:
rpn_locs: the predicted offsets of all anchors, rpn_locs.shape = (N, H*W*A, 4)
rpn_scores: the predicted probabilities of each anchor being foreground or background, rpn_scores.shape = (N, H*W*A, 2)
rois: the rois generated for the N images of this batch, rois.shape = (S, 4)
roi_indices: indices recording which image each roi in rois belongs to, roi_indices.shape = (S,), with values in [0, N)
anchor: the anchors, anchor.shape = (A*H*W, 4)
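A minimal usage sketch (dummy tensors, not from the repo; it assumes a CUDA-capable setup because the repo's NMS inside ProposalCreator runs on the GPU via cupy):

import torch as t

rpn = RegionProposalNetwork(in_channels=512, mid_channels=512, feat_stride=16)
features = t.randn(1, 512, 37, 50)   # (N, C, H, W): a VGG16 feature map for a ~600x800 input
rpn_locs, rpn_scores, rois, roi_indices, anchor = rpn(features, img_size=(600, 800), scale=1.)
print(rpn_locs.shape, rpn_scores.shape)   # (1, 37*50*9, 4), (1, 37*50*9, 2)
print(anchor.shape)                       # (37*50*9, 4); rois has at most 2000 rows in train mode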

2. def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width)

def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
    import numpy as xp
    shift_y = xp.arange(0, height * feat_stride, feat_stride)
    shift_x = xp.arange(0, width * feat_stride, feat_stride)
    shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
    shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
                      shift_y.ravel(), shift_x.ravel()), axis=1)

    A = anchor_base.shape[0]
    K = shift.shape[0]
    anchor = anchor_base.reshape((1, A, 4)) + \
             shift.reshape((1, K, 4)).transpose((1, 0, 2))
    anchor = anchor.reshape((K * A, 4)).astype(np.float32)
    return anchor

Purpose: generate all the anchors of an image by shifting the base anchors, A*K anchors in total, where A = len(ratios) * len(anchor_scales) and K = feature map height * feature map width.
A detailed walk-through of the implementation is given in a separate file.
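A small sketch of the broadcasting trick used above (assuming this module is importable): a (1, A, 4) array of base anchors plus a (K, 1, 4) array of shifts broadcasts to (K, A, 4), i.e. every base anchor is translated to every feature-map location:

anchor_base = generate_anchor_base()                        # (9, 4)
anchor = _enumerate_shifted_anchor(anchor_base, 16, 2, 3)   # feature map of height 2, width 3
print(anchor.shape)                    # (2 * 3 * 9, 4) = (54, 4)
print(anchor[9][:2] - anchor[0][:2])   # [ 0. 16.]: the next cell is shifted by feat_stride along x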

3. def _enumerate_shifted_anchor_torch(anchor_base, feat_stride, height, width)

def _enumerate_shifted_anchor_torch(anchor_base, feat_stride, height, width):
    # xp = cuda.get_array_module(anchor_base)
    import torch as t
    shift_y = t.arange(0, height * feat_stride, feat_stride)
    shift_x = t.arange(0, width * feat_stride, feat_stride)
    shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
    shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
                      shift_y.ravel(), shift_x.ravel()), axis=1)

    A = anchor_base.shape[0]
    K = shift.shape[0]
    anchor = anchor_base.reshape((1, A, 4)) + \
             shift.reshape((1, K, 4)).transpose((1, 0, 2))
    anchor = anchor.reshape((K * A, 4)).astype(np.float32)
    return anchor

Purpose: the same as the previous function, except that it uses torch instead of numpy (note that as listed here it still calls xp.meshgrid and xp.stack, and this variant does not appear to be used elsewhere in the code).

4. def normal_init(m, mean, stddev, truncated=False)

def normal_init(m, mean, stddev, truncated=False):
    """
    weight initalizer: truncated normal and random normal.
    """
    # x is a parameter
    if truncated:
        m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
    else:
        m.weight.data.normal_(mean, stddev)
        m.bias.data.zero_()

Purpose: initialize the weights of a layer m from a normal (or approximately truncated normal) distribution.

2019/11/11

File2: roi_module.py

This file implements the RoI Pooling module. What this module does is explained very clearly in this blog post: https://www.cnblogs.com/wangyong/p/8523814.html. Below, only the RoI class and RoIPooling2D are briefly introduced.

1. class RoI(Function)

This class inherits from Function and implements the forward and backward functions; it can be thought of as a variant of F.max_pool2d.

2. class RoIPooling2D(t.nn.Module)

This class inherits from Module, so it only needs to implement forward; it can be thought of as a variant of nn.MaxPool2d. Given the input features (N, C, H, W) and their corresponding rois, it produces a fixed-size feature (e.g. 7*7) for each roi. Roughly: each roi is first divided by 16 (the downscaling factor from the original image to the feature map; remember that roi coordinates refer to the original image), which gives the region of the feature map the roi corresponds to; this region is divided into a 7*7 grid and each cell is max-pooled, giving a 7*7 feature. The final output has shape (len(rois), channel, 7, 7).
(However, rois here should be the collection of rois of all images in the batch; how they are told apart without an index needs another look.)
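A minimal sketch using torchvision.ops.roi_pool as a stand-in for the custom RoIPooling2D (this is not the repo's CUDA implementation; note that torchvision expects boxes as (x_min, y_min, x_max, y_max), whereas this repo stores boxes as (y_min, x_min, y_max, x_max), so the columns would have to be swapped):

import torch as t
from torchvision.ops import roi_pool

features = t.randn(1, 512, 37, 50)            # (N, C, H, W) feature map
rois = t.tensor([[0., 0., 0., 320., 240.]])   # (batch_index, x1, y1, x2, y2) in image coordinates
pooled = roi_pool(features, rois, output_size=(7, 7), spatial_scale=1. / 16)
print(pooled.shape)                           # (1, 512, 7, 7)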

File3: faster_rcnn.py

from __future__ import  absolute_import
from __future__ import division
import torch as t
import numpy as np
import cupy as cp
from utils import array_tool as at
from model.utils.bbox_tools import loc2bbox
from model.utils.nms import non_maximum_suppression
from torch import nn
from data.dataset import preprocess
from torch.nn import functional as F
from utils.config import opt

1. def nograd(f)

def nograd(f):
    def new_f(*args,**kwargs):
        with t.no_grad():
           return f(*args,**kwargs)
    return new_f

2. class FasterRCNN(nn.Module):

Purpose: the base class for Faster RCNN. Faster RCNN consists of three parts: feature extraction, the RPN, and the localization and classification head.

2.1 def __init__(self, extractor, rpn, head, loc_normalize_mean=(0., 0., 0., 0.), loc_normalize_std=(0.1, 0.1, 0.2, 0.2))

    def __init__(self, extractor, rpn, head,
                loc_normalize_mean = (0., 0., 0., 0.),
                loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
    ):
        super(FasterRCNN, self).__init__()
        self.extractor = extractor
        self.rpn = rpn
        self.head = head

        # mean and std
        self.loc_normalize_mean = loc_normalize_mean
        self.loc_normalize_std = loc_normalize_std
        self.use_preset('evaluate')

Purpose: Faster RCNN consists of three parts: feature extraction, the RPN, and the localization and classification heads; they are implemented by the three nn.Module objects extractor, rpn and head respectively.

2.2 def n_class(self)

    @property
    def n_class(self):
        # Total number of classes including the background.
        return self.head.n_class

2.3 def forward(self, x, scale=1.)

    def forward(self, x, scale=1.):
        img_size = x.shape[2:]

        h = self.extractor(x)
        # extract features
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.rpn(h, img_size, scale)
        # generate proposal regions (rois)
        roi_cls_locs, roi_scores = self.head(
            h, rois, roi_indices)
        # localize and classify every roi
        # each roi gets 21 sets of localization values (84 numbers), one per class: the offset and scaling to apply if the roi belongs to that class
        return roi_cls_locs, roi_scores, rois, roi_indices

2.4 def use_preset(self, preset)

def use_preset(self, preset):
    if preset == 'visualize':
        self.nms_thresh = 0.3
        self.score_thresh = 0.7
        # when visualizing, a roi is only kept for class a if its probability for
        # class a reaches score_thresh = 0.7
    elif preset == 'evaluate':
        self.nms_thresh = 0.3
        self.score_thresh = 0.05
    else:
        raise ValueError('preset must be visualize or evaluate')

2.5 def _suppress(self, raw_cls_bbox, raw_prob)

    def _suppress(self, raw_cls_bbox, raw_prob):
        # read predict() below first, then come back to this function
        # raw_cls_bbox holds all the bboxes obtained by applying the 21 per-class corrections to the rois of one image
        # raw_cls_bbox.shape = (len(rois), 84)
        # raw_prob.shape = (len(rois), 21)
        bbox = list()
        label = list()
        score = list()
        # skip cls_id = 0 because it is the background class
        for l in range(1, self.n_class):
            cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
            # from the bbox reshaped to (len(rois), 21, 4), take the ones corrected as class l
            # cls_bbox_l.shape = (len(rois), 4)
            # i.e. the bboxes obtained by assuming every roi belongs to class l and applying the class-l correction
            prob_l = raw_prob[:, l]
            # prob_l.shape = (len(rois),): the probability of each roi belonging to class l
            mask = prob_l > self.score_thresh
            cls_bbox_l = cls_bbox_l[mask]
            prob_l = prob_l[mask]
            # keep only the boxes whose probability exceeds the threshold
            keep = non_maximum_suppression(
                cp.array(cls_bbox_l), self.nms_thresh, prob_l)
            # run non-maximum suppression
            keep = cp.asnumpy(keep)
            bbox.append(cls_bbox_l[keep])
            # The labels are in [0, self.n_class - 2].
            label.append((l - 1) * np.ones((len(keep),)))
            score.append(prob_l[keep])
        bbox = np.concatenate(bbox, axis=0).astype(np.float32)
        label = np.concatenate(label, axis=0).astype(np.int32)
        score = np.concatenate(score, axis=0).astype(np.float32)
        return bbox, label, score

Purpose: for the corrected bboxes, for each class select those whose probability of belonging to that class reaches the threshold, then apply non-maximum suppression; the surviving bboxes are the final detections.

2.6 def predict(self, imgs,sizes=None,visualize=False)

    @nograd
    def predict(self, imgs,sizes=None,visualize=False):
        self.eval()
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        
        # at this point prepared_imgs holds the preprocessed images, i.e. scaled and normalized
        # sizes holds the original sizes of these images
        for img, size in zip(prepared_imgs, sizes):
            img = at.totensor(img[None]).float()
            # add a leading dimension to make this a batch of batch_size = 1
            scale = img.shape[3] / size[1]
            # scale = scaled W / original W
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            # run the forward pass for this single image
            # the full return value would be roi_cls_locs, roi_scores, rois, roi_indices
            # here all rois belong to the same image, so roi_indices is not needed
            # roi_cls_loc.shape = (len(rois),84)
            # roi_score.shape = (len(rois),21)
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale
            # convert the rois of the scaled image back to rois of the original image

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            # roi_cls_loc.shape = (len(rois),21,4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            # roi.shape changes from (len(rois), 4) to (len(rois), 1, 4)
            # and is then expanded to (len(rois), 21, 4)
            # i.e. every roi is shifted once for each of the 21 possible classes
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            # cls_bbox.shape = (len(rois) * 21, 4)
            # i.e. the corrected box of every roi under the assumption that it belongs to each of the 21 classes
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # cls_bbox.shape = (len(rois),84)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))
            # prob.shape = (len(rois),21)
            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores

Part 4: Training

File1: trainer.py

from __future__ import  absolute_import
import os
from collections import namedtuple
import time
from torch.nn import functional as F
from model.utils.creator_tool import AnchorTargetCreator, ProposalTargetCreator

from torch import nn
import torch as t
from utils import array_tool as at
from utils.vis_tool import Visualizer

from utils.config import opt
from torchnet.meter import ConfusionMeter, AverageValueMeter

1. LossTuple

LossTuple = namedtuple('LossTuple',
                       ['rpn_loc_loss',
                        'rpn_cls_loss',
                        'roi_loc_loss',
                        'roi_cls_loss',
                        'total_loss'
                        ])

2. class FasterRCNNTrainer(nn.Module)

Purpose: a wrapper around FasterRCNN for convenient training; its forward pass returns the losses.

2.1 def __init__(self, faster_rcnn)

    def __init__(self, faster_rcnn):
        super(FasterRCNNTrainer, self).__init__()

        self.faster_rcnn = faster_rcnn
        self.rpn_sigma = opt.rpn_sigma
        self.roi_sigma = opt.roi_sigma

        # target creator create gt_bbox gt_label etc as training targets. 
        self.anchor_target_creator = AnchorTargetCreator()
        self.proposal_target_creator = ProposalTargetCreator()

        self.loc_normalize_mean = faster_rcnn.loc_normalize_mean
        self.loc_normalize_std = faster_rcnn.loc_normalize_std

        self.optimizer = self.faster_rcnn.get_optimizer()
        # visdom wrapper
        self.vis = Visualizer(env=opt.env)

        # indicators for training status
        self.rpn_cm = ConfusionMeter(2)
        self.roi_cm = ConfusionMeter(21)
        self.meters = {k: AverageValueMeter() for k in LossTuple._fields}  # average loss

2.2 def forward(self, imgs, bboxes, labels, scale)

    def forward(self, imgs, bboxes, labels, scale):
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)
        # extract features

        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)
        # generate proposal regions

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois, 
        # consider them as constant input
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)
        # assign these rois labels (0-20) and sample a subset of them as one batch of training data
        # sample_roi, gt_roi_loc, gt_roi_label are the sampled rois, their offsets to the ground truth,
        # and the labels of the ground truth boxes they are attached to
        # NOTE it's all zero because now it only support for batch=1 now
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        # assign every anchor a label and compute its offset to the gt bbox it is attached to
        # (only anchors labeled 0 or 1 are actually used)
        # loc (array): offsets between the anchors and the ground truth bboxes, loc.shape = (S, 4)
        # label (array): whether each anchor is positive or negative, 0 = negative, 1 = positive, -1 = ignore, label.shape = (S,)
        gt_rpn_label = at.totensor(gt_rpn_label).long()
        gt_rpn_loc = at.totensor(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,            # the offsets to the gt bboxes predicted by the RPN for all anchors
            gt_rpn_loc,         # the true offsets of all anchors to their gt bboxes
            gt_rpn_label.data,  # the labels of all anchors (0, 1, -1)
            # although all anchors are passed in, only the positives actually contribute to the loss
            self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        # roi_cls_loc .shape = (n_sample, 21, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
                              at.totensor(gt_roi_label).long()]
        # for each roi, pick the predicted offset corresponding to gt_roi_label; roi_loc.shape = (n_sample, 4)
        gt_roi_label = at.totensor(gt_roi_label).long()
        gt_roi_loc = at.totensor(gt_roi_loc)
        # the true offset of each roi to its gt bbox, gt_roi_loc.shape = (n_sample, 4)

        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,  # the loc loss is computed only for rois with label > 0
            self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)

Implementation:
Parameters:
imgs (~torch.autograd.Variable): a batch of images, (N, C, H, W)
bboxes (~torch.autograd.Variable): the ground truth bboxes of the images, (N, R, 4)
labels (~torch.autograd.Variable): the labels of the ground truth bboxes, (N, R), with values in 0-19
scale (float): the scale factor of the original images; arguably this should also be an array, but since the training batch size is always 1 it does not matter

2.3 def train_step(self, imgs, bboxes, labels, scale)

    def train_step(self, imgs, bboxes, labels, scale):
        self.optimizer.zero_grad()
        losses = self.forward(imgs, bboxes, labels, scale)
        losses.total_loss.backward()
        self.optimizer.step()
        self.update_meters(losses)
        return losses

2.4 def save(self, save_optimizer=False, save_path=None, **kwargs)

    def save(self, save_optimizer=False, save_path=None, **kwargs):
        save_dict = dict()

        save_dict['model'] = self.faster_rcnn.state_dict()
        save_dict['config'] = opt._state_dict()
        save_dict['other_info'] = kwargs
        save_dict['vis_info'] = self.vis.state_dict()

        if save_optimizer:
            save_dict['optimizer'] = self.optimizer.state_dict()

        if save_path is None:
            timestr = time.strftime('%m%d%H%M')
            save_path = 'checkpoints/fasterrcnn_%s' % timestr
            for k_, v_ in kwargs.items():
                save_path += '_%s' % v_

        save_dir = os.path.dirname(save_path)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        t.save(save_dict, save_path)
        self.vis.save([self.vis.env])
        return save_path

2.5 def load(self, path, load_optimizer=True, parse_opt=False, )

    def load(self, path, load_optimizer=True, parse_opt=False, ):
        state_dict = t.load(path)
        if 'model' in state_dict:
            self.faster_rcnn.load_state_dict(state_dict['model'])
        else:  # legacy way, for backward compatibility
            self.faster_rcnn.load_state_dict(state_dict)
            return self
        if parse_opt:
            opt._parse(state_dict['config'])
        if 'optimizer' in state_dict and load_optimizer:
            self.optimizer.load_state_dict(state_dict['optimizer'])
        return self

2.6 Several helper functions for operating on the meters

    def update_meters(self, losses):
        loss_d = {k: at.scalar(v) for k, v in losses._asdict().items()}
        for key, meter in self.meters.items():
            meter.add(loss_d[key])

    def reset_meters(self):
        for key, meter in self.meters.items():
            meter.reset()
        self.roi_cm.reset()
        self.rpn_cm.reset()

    def get_meter_data(self):
        return {k: v.value()[0] for k, v in self.meters.items()}

The implementation of the loc loss

def _smooth_l1_loss(x, t, in_weight, sigma):
    sigma2 = sigma ** 2
    diff = in_weight * (x - t)
    abs_diff = diff.abs()
    flag = (abs_diff.data < (1. / sigma2)).float()
    y = (flag * (sigma2 / 2.) * (diff ** 2) +
         (1 - flag) * (abs_diff - 0.5 / sigma2))
    return y.sum()


def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    in_weight = t.zeros(gt_loc.shape).cuda()
    # Localization loss is calculated only for positive rois.
    # NOTE:  unlike origin implementation, 
    # we don't need inside_weight and outside_weight, they can calculate by gt_label
    in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)
    # Normalize by total number of negtive and positive rois.
    loc_loss /= ((gt_label >= 0).sum().float()) # ignore gt_label==-1 for rpn_loss
    return loc_loss
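For reference, _smooth_l1_loss above implements the smooth L1 loss from Fast R-CNN, with sigma controlling where the quadratic part switches to the linear part (x is the element-wise difference between the predicted and the target offsets):

$$\mathrm{smooth}_{L1}(x)=\begin{cases}\frac{\sigma^{2}}{2}x^{2} & \text{if } |x| < \frac{1}{\sigma^{2}}\\ |x| - \frac{0.5}{\sigma^{2}} & \text{otherwise}\end{cases}$$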

2019/11/12

File2: train.py

1. def eval(dataloader, faster_rcnn, test_num=10000)

def eval(dataloader, faster_rcnn, test_num=10000):
    pred_bboxes, pred_labels, pred_scores = list(), list(), list()
    gt_bboxes, gt_labels, gt_difficults = list(), list(), list()
    for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):
        sizes = [sizes[0][0].item(), sizes[1][0].item()]
        pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
        gt_bboxes += list(gt_bboxes_.numpy())
        gt_labels += list(gt_labels_.numpy())
        gt_difficults += list(gt_difficults_.numpy())
        pred_bboxes += pred_bboxes_
        pred_labels += pred_labels_
        pred_scores += pred_scores_
        if ii == test_num: break

    result = eval_detection_voc(
        pred_bboxes, pred_labels, pred_scores,
        gt_bboxes, gt_labels, gt_difficults,
        use_07_metric=True)
    return result

2. def train(**kwargs)

def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset, \
                                  batch_size=1, \
                                  shuffle=True, \
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False, \
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot groud truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicti bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        if epoch == 13: 
            break
