Faster R-CNN original paper
Source of the code annotated in this post
This part corresponds to the code under the data directory
from __future__ import absolute_import
from __future__ import division
import torch as t
from data.voc_dataset import VOCBboxDataset
from skimage import transform as sktsf
from torchvision import transforms as tvtsf
from data import util
import numpy as np
from utils.config import opt
def inverse_normalize(img):
if opt.caffe_pretrain:
img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1))
return img[::-1, :, :]
# approximate un-normalize for visualize
return (img * 0.225 + 0.45).clip(min=0, max=1) * 255
Purpose: de-normalize an already-normalized image so that it can be visualized.
Implementation: the inverse of normalization. First check opt.caffe_pretrain to see whether a Caffe-pretrained model is used; if so, invert caffe_normalize, otherwise invert pytorch_normalize.
def pytorch_normalze(img):
normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
img = normalize(t.from_numpy(img))
return img.numpy()
Purpose: PyTorch-style normalization.
Implementation: the formula is (x - mean) / std, using the per-channel mean and standard deviation of ImageNet (every pixel's RGB values in img have already been divided by 255).
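A minimal sketch (my own toy check, not part of the repo) showing that inverse_normalize only approximately undoes pytorch_normalze, because it uses a single mean 0.45 and std 0.225 for all three channels:

import numpy as np
import torch as t
from torchvision import transforms as tvtsf

img = np.random.rand(3, 4, 4).astype(np.float32)   # toy CHW image with values in [0, 1]
normed = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])(t.from_numpy(img)).numpy()
restored = (normed * 0.225 + 0.45).clip(min=0, max=1) * 255  # what inverse_normalize does
print(np.abs(restored / 255 - img).max())           # small, but not exactly 0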
def caffe_normalize(img):
"""
return appr -125-125 BGR
"""
img = img[[2, 1, 0], :, :] # RGB-BGR
img = img * 255
mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
img = (img - mean).astype(np.float32, copy=True)
return img
Purpose: Caffe-style normalization.
Implementation: images in Caffe are BGR, so first swap the channel order to convert the RGB image into BGR; then subtract each channel's mean from that channel (zero-centering).
def preprocess(img, min_size=600, max_size=1000):
C, H, W = img.shape
scale1 = min_size / min(H, W)
scale2 = max_size / max(H, W)
scale = min(scale1, scale2)
img = img / 255.
img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect',anti_aliasing=False)
if opt.caffe_pretrain:
normalize = caffe_normalize
else:
normalize = pytorch_normalze
return normalize(img)
Purpose: preprocess an image before feature extraction: rescale it so that the longer side does not exceed max_size and (whenever the aspect ratio allows) both height and width fall inside [min_size, max_size], then normalize it.
Implementation: the input img is a numpy.ndarray in CHW order with RGB channels. scale1 is the factor that would bring the shorter side to min_size, scale2 the factor that would bring the longer side to max_size; the smaller of the two is taken as the scale factor. After scaling, either the shorter side equals min_size and the longer side lands in [min_size, max_size], or the longer side equals max_size and the shorter side ends up no larger than min_size (this second case happens for very elongated images).
Proof of the first case: assume H ≤ W and scale = scale1 = min_size / H ≤ scale2 = max_size / W.
Then H * scale = min_size.
And W * scale = W * (min_size / H) ≤ W * (max_size / W) = max_size.
Since W ≥ H, W * scale ≥ H * scale = min_size,
so W * scale lies in [min_size, max_size].
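A quick numeric check of the scale selection (the numbers are just an example):

# example: H = 500, W = 750, min_size = 600, max_size = 1000
H, W = 500, 750
scale1 = 600 / min(H, W)     # 1.2
scale2 = 1000 / max(H, W)    # 1.333...
scale = min(scale1, scale2)  # 1.2 -> the shorter side becomes exactly 600
print(H * scale, W * scale)  # 600.0 900.0, both inside [600, 1000]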
class Transform(object):
# a callable object
def __init__(self, min_size=600, max_size=1000):
self.min_size = min_size
self.max_size = max_size
def __call__(self, in_data):
img, bbox, label = in_data
_, H, W = img.shape
img = preprocess(img, self.min_size, self.max_size)
_, o_H, o_W = img.shape
scale = o_H / H
bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))
# horizontally flip
img, params = util.random_flip(
img, x_random=True, return_param=True)
bbox = util.flip_bbox(
bbox, (o_H, o_W), x_flip=params['x_flip'])
return img, bbox, label, scale
Purpose: a callable object that bundles together all the transformations applied to the input data and their parameters.
Implementation: the input is an (img, bbox, label) tuple; img is preprocessed with preprocess described above, bbox is rescaled with the same scale factor as the image, then the image is randomly flipped horizontally and bbox is flipped accordingly.
class Dataset:
def __init__(self, opt):
self.opt = opt
self.db = VOCBboxDataset(opt.voc_data_dir)
self.tsf = Transform(opt.min_size, opt.max_size)
def __getitem__(self, idx):
ori_img, bbox, label, difficult = self.db.get_example(idx)
img, bbox, label, scale = self.tsf((ori_img, bbox, label))
# TODO: check whose stride is negative to fix this instead copy all
# some of the strides of a given numpy array are negative.
return img.copy(), bbox.copy(), label.copy(), scale
def __len__(self):
return len(self.db)
Purpose: the training dataset.
Implementation: a thin wrapper around VOCBboxDataset (analyzed in detail below) used for training: fetch one example from self.db (the VOCBboxDataset), transform it with self.tsf (the Transform above), and return it.
class TestDataset:
def __init__(self, opt, split='test', use_difficult=True):
self.opt = opt
self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult)
def __getitem__(self, idx):
ori_img, bbox, label, difficult = self.db.get_example(idx)
img = preprocess(ori_img)
return img, ori_img.shape[1:], bbox, label, difficult
def __len__(self):
return len(self.db)
Purpose: the test dataset.
Implementation: similar to the training set, except that test images only go through preprocess (rescaling and normalization), with no flipping or other augmentation. The original image size is also returned, presumably so that detections can be visualized on the original image afterwards.
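A minimal usage sketch (assuming opt is configured with voc_data_dir, min_size, max_size and num_workers as in utils/config.py, and that the VOC data is in place):

from torch.utils import data as data_
from utils.config import opt
from data.dataset import Dataset, TestDataset

dataset = Dataset(opt)      # training set: preprocess + random flip via Transform
testset = TestDataset(opt)  # test set: preprocess only
dataloader = data_.DataLoader(dataset, batch_size=1, shuffle=True,
                              num_workers=opt.num_workers)
img, bbox, label, scale = next(iter(dataloader))
print(img.shape, bbox.shape, label.shape)  # e.g. (1, 3, 600, 800) (1, R, 4) (1, R)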
2019/11/05
Before reading this part you should have a basic understanding of the PASCAL VOC dataset; see this blog post for reference.
import os
import xml.etree.ElementTree as ET
import numpy as np
from .util import read_image
class VOCBboxDataset:
def __init__(self, data_dir, split='trainval',
use_difficult=False, return_difficult=False,
):
# if split not in ['train', 'trainval', 'val']:
# if not (split == 'test' and year == '2007'):
# warnings.warn(
# 'please pick split from \'train\', \'trainval\', \'val\''
# 'for 2012 dataset. For 2007 dataset, you can pick \'test\''
# ' in addition to the above mentioned splits.'
# )
id_list_file = os.path.join(
data_dir, 'ImageSets/Main/{0}.txt'.format(split))
self.ids = [id_.strip() for id_ in open(id_list_file)]
self.data_dir = data_dir
self.use_difficult = use_difficult
self.return_difficult = return_difficult
self.label_names = VOC_BBOX_LABEL_NAMES
def __len__(self):
return len(self.ids)
def get_example(self, i):
id_ = self.ids[i]
anno = ET.parse(
os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
bbox = list()
label = list()
difficult = list()
for obj in anno.findall('object'):
# when not using the difficult split and the object is
# difficult, skip it.
if not self.use_difficult and int(obj.find('difficult').text) == 1:
continue
difficult.append(int(obj.find('difficult').text))
bndbox_anno = obj.find('bndbox')
# subtract 1 to make pixel indexes 0-based
bbox.append([
int(bndbox_anno.find(tag).text) - 1
for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
name = obj.find('name').text.lower().strip()
label.append(VOC_BBOX_LABEL_NAMES.index(name))
bbox = np.stack(bbox).astype(np.float32)
label = np.stack(label).astype(np.int32)
# When `use_difficult==False`, all elements in `difficult` are False.
difficult = np.array(difficult, dtype=np.bool).astype(np.uint8) # PyTorch don't support np.bool
# Load a image
img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
img = read_image(img_file, color=True)
# if self.return_difficult:
# return img, bbox, label, difficult
return img, bbox, label, difficult
__getitem__ = get_example
Purpose: the VOCBbox dataset class.
Implementation: provides the constructor, __len__ and __getitem__.
Purpose: initialize a VOCBboxDataset object.
Implementation:
1. Parameters:
data_dir (string): path to the root of the VOC data, i.e. "/data/image/voc/VOCdevkit/VOC2007/"
split ({'train', 'val', 'trainval', 'test'}): which subset of the data to use (the test split is only available for 2007)
year ({'2007', '2012'}): which year's VOC dataset to use
use_difficult (bool): if True, also use images whose annotations are marked difficult
return_difficult (bool): if True, __getitem__ additionally returns a bool array with one element per bbox indicating whether that bbox is marked difficult, besides the image, all bboxes in it and their labels
2. Details:
Only one point needs mentioning: the directory data_dir + 'ImageSets/Main' stores the index of each split; the indices of the train split are saved in train.txt, those of trainval in trainval.txt, and those of val in val.txt.
The number of ids read from that txt file is the length of the dataset.
Purpose: return the i-th example.
Implementation:
1. Return values:
img: the image as an array, img.dtype = numpy.float32, CHW, RGB
bbox: all object boxes in img, bbox.dtype = numpy.float32, bbox.shape = (R, 4) where R is the number of boxes in the image; effectively a list of R (y_min, x_min, y_max, x_max) tuples
label: the label of each box, label.dtype = numpy.int32, values 0-19 corresponding to the 20 classes, label.shape = (R,)
difficult: whether each box is marked difficult, difficult.dtype = numpy.bool
2. Details:
The overall flow: use the index i to get the id id_, read the image from JPEGImages by that id, and parse the corresponding xml file in Annotations to obtain bbox, label and difficult.
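For reference, a simplified (made-up) annotation in the VOC XML format and the way get_example reads it:

import xml.etree.ElementTree as ET

# simplified example of an Annotations/xxx.xml file (contents are illustrative)
xml_str = """
<annotation>
  <object>
    <name>dog</name>
    <difficult>0</difficult>
    <bndbox><ymin>49</ymin><xmin>100</xmin><ymax>330</ymax><xmax>350</xmax></bndbox>
  </object>
</annotation>
"""
anno = ET.fromstring(xml_str)
for obj in anno.findall('object'):
    name = obj.find('name').text.lower().strip()
    box = [int(obj.find('bndbox').find(tag).text) - 1    # 0-based pixel indices
           for tag in ('ymin', 'xmin', 'ymax', 'xmax')]
    print(name, box)   # dog [48, 99, 329, 349]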
VOC_BBOX_LABEL_NAMES = (
'aeroplane',
'bicycle',
'bird',
'boat',
'bottle',
'bus',
'car',
'cat',
'chair',
'cow',
'diningtable',
'dog',
'horse',
'motorbike',
'person',
'pottedplant',
'sheep',
'sofa',
'train',
'tvmonitor')
Class names corresponding to labels 0-19.
import numpy as np
from PIL import Image
import random
def read_image(path, dtype=np.float32, color=True):
f = Image.open(path)
try:
if color:
img = f.convert('RGB')
else:
img = f.convert('P')
img = np.asarray(img, dtype=dtype)
finally:
if hasattr(f, 'close'):
f.close()
if img.ndim == 2:
# reshape (H, W) -> (1, H, W)
return img[np.newaxis]
else:
# transpose (H, W, C) -> (C, H, W)
return img.transpose((2, 0, 1))
Purpose: read an image from disk and convert it to the required format (float32, CHW).
def resize_bbox(bbox, in_size, out_size):
bbox = bbox.copy()
y_scale = float(out_size[0]) / in_size[0]
x_scale = float(out_size[1]) / in_size[1]
bbox[:, 0] = y_scale * bbox[:, 0]
bbox[:, 2] = y_scale * bbox[:, 2]
bbox[:, 1] = x_scale * bbox[:, 1]
bbox[:, 3] = x_scale * bbox[:, 3]
return bbox
Purpose: resize bboxes to match an image resize; in_size and out_size are the (H, W) of the image before and after resizing.
Implementation: note the bbox format is (y_min, x_min, y_max, x_max).
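A tiny example of resize_bbox (values made up): halving the image halves every coordinate:

import numpy as np
from data.util import resize_bbox

bbox = np.array([[100., 200., 300., 400.]])   # (ymin, xmin, ymax, xmax)
print(resize_bbox(bbox, in_size=(600, 800), out_size=(300, 400)))
# [[ 50. 100. 150. 200.]]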
def flip_bbox(bbox, size, y_flip=False, x_flip=False):
H, W = size
bbox = bbox.copy()
if y_flip:
y_max = H - bbox[:, 0]
y_min = H - bbox[:, 2]
bbox[:, 0] = y_min
bbox[:, 2] = y_max
if x_flip:
x_max = W - bbox[:, 1]
x_min = W - bbox[:, 3]
bbox[:, 1] = x_min
bbox[:, 3] = x_max
return bbox
Purpose: flip bboxes to match a vertically and/or horizontally flipped image.
def crop_bbox(
bbox, y_slice=None, x_slice=None,
allow_outside_center=True, return_param=False):
t, b = _slice_to_bounds(y_slice)
l, r = _slice_to_bounds(x_slice)
crop_bb = np.array((t, l, b, r))
if allow_outside_center:
mask = np.ones(bbox.shape[0], dtype=bool)
else:
center = (bbox[:, :2] + bbox[:, 2:]) / 2.0
mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \
.all(axis=1)
bbox = bbox.copy()
bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2])
bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:])
bbox[:, :2] -= crop_bb[:2]
bbox[:, 2:] -= crop_bb[:2]
mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1))
bbox = bbox[mask]
if return_param:
return bbox, {'index': np.flatnonzero(mask)}
else:
return bbox
Purpose: crop bboxes to match an image crop. A bbox entirely inside the cropped area is kept; one entirely outside is discarded; one partially outside is truncated at the crop boundary.
Implementation:
1. Parameters:
bbox: (R, 4)
y_slice: the slice along the y axis of the cropped image within the original image
x_slice: the slice along the x axis of the cropped image within the original image
allow_outside_center (bool): if False, bboxes whose centers fall outside the cropped area are also discarded
return_param (bool): if True, also return the indices of the bboxes that were kept
2. Details:
Trick: discarding bboxes does not require an actual del; keep a boolean mask of length R, initially all True, set the entry of any discarded bbox to False, and finally apply bbox = bbox[mask].
Overall flow: obtain the full crop boundary (t, l, b, r) from y_slice and x_slice; if allow_outside_center is False, drop the bboxes whose centers do not satisfy (t, l) <= center < (b, r); then truncate each bbox, taking the larger of its own (t, l) and the crop's, and the smaller of its own (b, r) and the crop's; subtract the crop's (t, l) from each bbox so that its coordinates refer to a system whose origin is the crop's top-left corner; finally drop the bboxes whose (t, l) is not strictly smaller than their (b, r) (i.e. those that ended up outside the cropped area).
def _slice_to_bounds(slice_):
if slice_ is None:
return 0, np.inf
if slice_.start is None:
l = 0
else:
l = slice_.start
if slice_.stop is None:
u = np.inf
else:
u = slice_.stop
return l, u
Purpose: obtain the lower and upper bounds from a slice.
def translate_bbox(bbox, y_offset=0, x_offset=0):
out_bbox = bbox.copy()
out_bbox[:, :2] += (y_offset, x_offset)
out_bbox[:, 2:] += (y_offset, x_offset)
return out_bbox
Purpose: translate bboxes, mainly used together with image padding.
def random_flip(img, y_random=False, x_random=False,
return_param=False, copy=False):
y_flip, x_flip = False, False
if y_random:
y_flip = random.choice([True, False])
if x_random:
x_flip = random.choice([True, False])
if y_flip:
img = img[:, ::-1, :]
if x_flip:
img = img[:, :, ::-1]
if copy:
img = img.copy()
if return_param:
return img, {'y_flip': y_flip, 'x_flip': x_flip}
else:
return img
Purpose: randomly flip an image.
2019/11/07
This part corresponds to the code under the model/utils directory.
import numpy as np
import numpy as xp
import six
from six import __init__
Before trying to understand the two functions below, first recall this set of formulas:
$\hat{G}_x = P_w t_x + P_x \qquad \hat{G}_y = P_h t_y + P_y \qquad \hat{G}_w = P_w \exp(t_w) \qquad \hat{G}_h = P_h \exp(t_h)$
def loc2bbox(src_bbox, loc):
if src_bbox.shape[0] == 0:
return xp.zeros((0, 4), dtype=loc.dtype)
src_bbox = src_bbox.astype(src_bbox.dtype, copy=False)
src_height = src_bbox[:, 2] - src_bbox[:, 0]
src_width = src_bbox[:, 3] - src_bbox[:, 1]
src_ctr_y = src_bbox[:, 0] + 0.5 * src_height
src_ctr_x = src_bbox[:, 1] + 0.5 * src_width
dy = loc[:, 0::4]
dx = loc[:, 1::4]
dh = loc[:, 2::4]
dw = loc[:, 3::4]
ctr_y = dy * src_height[:, xp.newaxis] + src_ctr_y[:, xp.newaxis]
ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis]
h = xp.exp(dh) * src_height[:, xp.newaxis]
w = xp.exp(dw) * src_width[:, xp.newaxis]
dst_bbox = xp.zeros(loc.shape, dtype=loc.dtype)
dst_bbox[:, 0::4] = ctr_y - 0.5 * h
dst_bbox[:, 1::4] = ctr_x - 0.5 * w
dst_bbox[:, 2::4] = ctr_y + 0.5 * h
dst_bbox[:, 3::4] = ctr_x + 0.5 * w
return dst_bbox
Purpose: given a source box $(P_x, P_y, P_w, P_h)$ (actually supplied in the form $(P_{ymin}, P_{xmin}, P_{ymax}, P_{xmax})$) and offsets $(t_y, t_x, t_h, t_w)$, compute the shifted box $(\hat{G}_x, \hat{G}_y, \hat{G}_w, \hat{G}_h)$ (converted back into the form $(G_{ymin}, G_{xmin}, G_{ymax}, G_{xmax})$).
Implementation: first convert $(P_{ymin}, P_{xmin}, P_{ymax}, P_{xmax})$ into $(P_x, P_y, P_w, P_h)$, then apply the formulas above, and finally convert the resulting box back into the $(G_{ymin}, G_{xmin}, G_{ymax}, G_{xmax})$ form.
def bbox2loc(src_bbox, dst_bbox):
height = src_bbox[:, 2] - src_bbox[:, 0]
width = src_bbox[:, 3] - src_bbox[:, 1]
ctr_y = src_bbox[:, 0] + 0.5 * height
ctr_x = src_bbox[:, 1] + 0.5 * width
base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width
eps = xp.finfo(height.dtype).eps
height = xp.maximum(height, eps)
width = xp.maximum(width, eps)
dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = xp.log(base_height / height)
dw = xp.log(base_width / width)
loc = xp.vstack((dy, dx, dh, dw)).transpose()
return loc
Purpose: given a source box $(P_{ymin}, P_{xmin}, P_{ymax}, P_{xmax})$ and the shifted box $(G_{ymin}, G_{xmin}, G_{ymax}, G_{xmax})$, compute the offsets $(t_y, t_x, t_h, t_w)$ (the inverse of loc2bbox).
Note that both functions operate in batch: the inputs are (R, 4) arrays and so are the outputs.
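A quick round-trip check of the two functions on toy boxes:

import numpy as np
from model.utils.bbox_tools import bbox2loc, loc2bbox

src = np.array([[0., 0., 100., 200.]], dtype=np.float32)    # (ymin, xmin, ymax, xmax)
dst = np.array([[10., 20., 110., 180.]], dtype=np.float32)
loc = bbox2loc(src, dst)     # the (dy, dx, dh, dw) that maps src onto dst
print(loc2bbox(src, loc))    # recovers dst (up to float error): [[ 10.  20. 110. 180.]]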
def bbox_iou(bbox_a, bbox_b):
if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4:
raise IndexError
# top left
tl = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
# bottom right
br = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])
area_i = xp.prod(br - tl, axis=2) * (tl < br).all(axis=2)
area_a = xp.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
area_b = xp.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
return area_i / (area_a[:, None] + area_b - area_i)
Purpose: compute the IoUs between two sets of bboxes.
Implementation: bbox_a.shape = (N, 4) and bbox_b.shape = (K, 4); every box in a is paired with every box in b, so N*K IoUs are needed, and the return value has shape (N, K). bbox_a[:, None, :2].shape = (N, 1, 2) and bbox_b[:, :2].shape = (K, 2), so broadcasting gives tl.shape = (N, K, 2), and likewise br.shape = (N, K, 2). (br - tl).shape = (N, K, 2); along axis 2 the first entry is h and the second is w, so taking the product along axis 2 gives the intersection area (zeroed out when the boxes do not overlap), area_i.shape = (N, K). area_a and area_b are computed analogously. area_i is the intersection area, area_a the areas of the boxes in a, area_b the areas of the boxes in b.
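A small example of the pairwise IoU (toy boxes):

import numpy as np
from model.utils.bbox_tools import bbox_iou

a = np.array([[0., 0., 10., 10.]])                  # area 100
b = np.array([[0., 0., 10., 10.],
              [5., 5., 15., 15.]])                  # the second overlaps a on a 5x5 square
print(bbox_iou(a, b))   # [[1.         0.14285714]]  (25 / (100 + 100 - 25))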
def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
anchor_scales=[8, 16, 32]):
py = base_size / 2.
px = base_size / 2.
anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
dtype=np.float32)
for i in six.moves.range(len(ratios)):
for j in six.moves.range(len(anchor_scales)):
h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])
index = i * len(anchor_scales) + j
anchor_base[index, 0] = py - h / 2.
anchor_base[index, 1] = px - w / 2.
anchor_base[index, 2] = py + h / 2.
anchor_base[index, 3] = px + w / 2.
return anchor_base
Purpose: generate the base anchors centered at (base_size/2, base_size/2).
Implementation, i.e. how an anchor is produced: given the center, base_size, a ratio and an anchor_scale, choosing h and w determines the anchor. The anchor's area is (base_size * anchor_scale)**2; keeping h*w fixed while requiring h/w = ratio gives h = base_size * anchor_scale * sqrt(ratio) and w = base_size * anchor_scale * sqrt(1/ratio). Iterating over all (ratio, anchor_scale) combinations yields len(ratios) * len(anchor_scales) anchors, so the return value is a (len(ratios) * len(anchor_scales), 4) array.
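A quick check of the base anchors produced with the default arguments:

import numpy as np
from model.utils.bbox_tools import generate_anchor_base

anchor_base = generate_anchor_base()   # base_size=16, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32]
h = anchor_base[:, 2] - anchor_base[:, 0]
w = anchor_base[:, 3] - anchor_base[:, 1]
print(anchor_base.shape)     # (9, 4)
print(np.round(h / w, 2))    # [0.5 0.5 0.5 1.  1.  1.  2.  2.  2. ]  -> the ratios
print(np.sqrt(h * w))        # ≈ [128. 256. 512. 128. 256. 512. ...] = base_size * anchor_scale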
class ProposalCreator:
def __init__(self,
parent_model,
nms_thresh=0.7,
n_train_pre_nms=12000,
n_train_post_nms=2000,
n_test_pre_nms=6000,
n_test_post_nms=300,
min_size=16
):
self.parent_model = parent_model
self.nms_thresh = nms_thresh
self.n_train_pre_nms = n_train_pre_nms
self.n_train_post_nms = n_train_post_nms
self.n_test_pre_nms = n_test_pre_nms
self.n_test_post_nms = n_test_post_nms
self.min_size = min_size
def __call__(self, loc, score,
anchor, img_size, scale=1.):
# NOTE: when test, remember
# faster_rcnn.eval()
# to set self.training = False
if self.parent_model.training:
n_pre_nms = self.n_train_pre_nms
n_post_nms = self.n_train_post_nms
else:
n_pre_nms = self.n_test_pre_nms
n_post_nms = self.n_test_post_nms
# Convert anchors into proposal via bbox transformations.
# roi = loc2bbox(anchor, loc)
roi = loc2bbox(anchor, loc)
# Clip predicted boxes to image.
# proposals (i.e. anchor + loc) may go beyond the image boundary and need to be clipped into the image
roi[:, slice(0, 4, 2)] = np.clip(
roi[:, slice(0, 4, 2)], 0, img_size[0])
# indices 0, 2 are ymin, ymax
roi[:, slice(1, 4, 2)] = np.clip(
roi[:, slice(1, 4, 2)], 0, img_size[1])
# indices 1, 3 are xmin, xmax
# Remove predicted boxes with either height or width < threshold.
# remove proposals whose height or width, mapped back to the original image, is smaller than min_size,
# i.e. smaller than min_size * scale in the scaled image
min_size = self.min_size * scale
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep, :]
score = score[keep]
# Sort all (proposal, score) pairs by score from highest to lowest.
# Take top pre_nms_topN (e.g. 6000).
order = score.ravel().argsort()[::-1]
# n_pre_nms <= 0 means keep them all
if n_pre_nms > 0:
order = order[:n_pre_nms]
roi = roi[order, :]
# Apply nms (e.g. threshold = 0.7).
# Take after_nms_topN (e.g. 300).
# unNOTE: somthing is wrong here!
# TODO: remove cuda.to_gpu
keep = non_maximum_suppression(
cp.ascontiguousarray(cp.asarray(roi)),
thresh=self.nms_thresh)
if n_post_nms > 0:
keep = keep[:n_post_nms]
roi = roi[keep]
return roi
Purpose: generate proposal regions.
Purpose: instantiate a ProposalCreator object and store its parameters.
Implementation:
1. Parameters:
nms_thresh (float): threshold for non-maximum suppression (default 0.7)
n_train_pre_nms (int): in training mode, the number of bboxes with the highest foreground probability to keep before NMS (default 12000)
n_train_post_nms (int): in training mode, the number of bboxes to keep after NMS (default 2000)
n_test_pre_nms (int): in test mode, the number of bboxes with the highest foreground probability to keep before NMS (default 6000)
n_test_post_nms (int): in test mode, the number of bboxes to keep after NMS (default 300)
min_size (int): proposals whose height or width, mapped back to the original image, is smaller than min_size are discarded
Purpose: obtain bboxes from the anchors and the predicted offsets, clip them to the image boundary, keep the ones with the highest foreground probability, drop those that are too small, run NMS, and keep the top boxes after NMS as the proposal regions.
Implementation:
1. Parameters:
loc (array): predicted offsets of the anchors, loc.shape = (R, 4)
score (array): foreground probability of each anchor, score.shape = (R,)
anchor (array): anchor coordinates, anchor.shape = (R, 4)
img_size (tuple of ints): (height, width) of the image after scaling
scale (float): the scale factor applied to the image
2. Return value:
An array of proposal box coordinates of shape (S, 4). In training mode S is at most n_train_post_nms, in test mode at most n_test_post_nms; S depends on the sizes of the predicted bboxes (anchor plus offset) and on how many boxes NMS discards.
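The non_maximum_suppression used above is the repo's CuPy/CUDA implementation; the idea behind it can be sketched in plain NumPy as follows (a simplified illustration, not the repo's code; boxes are (ymin, xmin, ymax, xmax) and assumed already sorted by score in descending order, as they are at this point of __call__):

import numpy as np

def nms_sketch(bbox, thresh):
    """bbox: (R, 4) boxes sorted by score (descending); returns indices of the kept boxes."""
    keep = []
    area = np.prod(bbox[:, 2:] - bbox[:, :2], axis=1)
    for i in range(len(bbox)):
        kept = np.array(keep, dtype=np.int64)
        tl = np.maximum(bbox[i, :2], bbox[kept, :2])   # top-left corners of the intersections
        br = np.minimum(bbox[i, 2:], bbox[kept, 2:])   # bottom-right corners of the intersections
        inter = np.prod(br - tl, axis=1) * (tl < br).all(axis=1)
        iou = inter / (area[i] + area[kept] - inter)
        if (iou < thresh).all():   # keep box i only if it overlaps every kept box by less than thresh
            keep.append(i)
    return np.array(keep, dtype=np.int32)

# usage sketch: keep = nms_sketch(roi, 0.7); roi = roi[keep][:n_post_nms]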
import numpy as np
import cupy as cp
from model.utils.bbox_tools import bbox2loc, bbox_iou, loc2bbox
from model.utils.nms import non_maximum_suppression
class ProposalTargetCreator(object):
def __init__(self,
n_sample=128,
pos_ratio=0.25, pos_iou_thresh=0.5,
neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
):
self.n_sample = n_sample
self.pos_ratio = pos_ratio
self.pos_iou_thresh = pos_iou_thresh
self.neg_iou_thresh_hi = neg_iou_thresh_hi
self.neg_iou_thresh_lo = neg_iou_thresh_lo # NOTE:default 0.1 in py-faster-rcnn
def __call__(self, roi, bbox, label,
loc_normalize_mean=(0., 0., 0., 0.),
loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
n_bbox, _ = bbox.shape
# n_bbox is the number of ground-truth bboxes
roi = np.concatenate((roi, bbox), axis=0)
# concatenate the proposals and the ground-truth boxes into one big roi array
pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
# pos_roi_per_image: the number of positives to produce, computed from pos_ratio
iou = bbox_iou(roi, bbox)
# iou.shape = (len(roi), len(bbox)): the IoU of each roi (row) with each ground-truth bbox
gt_assignment = iou.argmax(axis=1)
# gt_assignment.shape = (len(roi),): for each roi, the index of the ground-truth bbox with the highest IoU
max_iou = iou.max(axis=1)
# max_iou.shape = (len(roi),): the highest IoU of each roi with any ground-truth bbox
# Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
# The label with value 0 is the background.
gt_roi_label = label[gt_assignment] + 1
# the label of the ground-truth bbox each roi is assigned to (shifted from 0-19 to 1-20)
# Select foreground RoIs as those with >= pos_iou_thresh IoU.
pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
# np.where returns a tuple; even though the tuple has only one element here, it still has to be taken out with [0]
# pos_index holds the indices of all rois that reach the positive threshold
pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
# the final number of positives is the smaller of pos_roi_per_image and the number of rois reaching the positive threshold
if pos_index.size > 0:
pos_index = np.random.choice(
pos_index, size=pos_roi_per_this_image, replace=False)
# randomly pick pos_roi_per_this_image indices from pos_index as the final positives
# pos_index.shape = (pos_roi_per_this_image,)
# Select background RoIs as those within
# [neg_iou_thresh_lo, neg_iou_thresh_hi).
neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
(max_iou >= self.neg_iou_thresh_lo))[0]
neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
neg_roi_per_this_image = int(min(neg_roi_per_this_image,
neg_index.size))
if neg_index.size > 0:
neg_index = np.random.choice(
neg_index, size=neg_roi_per_this_image, replace=False)
# select the negatives, completely analogous to selecting the positives
# note: a negative must have IoU below the threshold with every ground-truth bbox; it suffices that its maximum IoU is below the threshold
# neg_index.shape = (neg_roi_per_this_image,)
# pos_roi_per_this_image + neg_roi_per_this_image = self.n_sample (128 by default)
# The indices that we're selecting (both positive and negative).
keep_index = np.append(pos_index, neg_index)
# concatenate the positive and negative indices
# note: these indices are positions in the combined roi array (proposals plus ground-truth boxes)
gt_roi_label = gt_roi_label[keep_index]
# first set each sampled roi's label to that of the ground-truth box with the highest IoU with it
gt_roi_label[pos_roi_per_this_image:] = 0 # negative labels --> 0
# then set the labels of the negatives to 0
sample_roi = roi[keep_index]
# Compute offsets and scales to match sampled RoIs to the GTs.
gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
# compute the offsets between the sampled rois and the ground-truth bboxes they are assigned to
# gt_assignment, defined above, holds for each roi the index of the ground-truth bbox with the highest IoU
gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
) / np.array(loc_normalize_std, np.float32))
return sample_roi, gt_roi_loc, gt_roi_label
Purpose: from the proposal regions produced by ProposalCreator plus the ground-truth bboxes, select n_sample rois and assign them labels (0-20).
Purpose: initialize a ProposalTargetCreator object and store its parameters.
Implementation:
1. Parameters:
n_sample (int): total number of bboxes to sample
pos_ratio (float): fraction of positives
pos_iou_thresh (float): minimum IoU with some ground truth for a bbox to count as a positive
neg_iou_thresh_lo (float), neg_iou_thresh_hi (float): a bbox counts as a negative if its IoU with every ground truth lies in [neg_iou_thresh_lo, neg_iou_thresh_hi)
Purpose:
randomly select n_sample rois from the proposals (roi) and the ground-truth boxes (bbox), with a fixed fraction of positives; when there are not enough positives, fill up with negatives.
Implementation:
1. Parameters:
roi (array): bboxes produced by ProposalCreator, roi.shape = (R, 4)
bbox (array): ground-truth bounding boxes, bbox.shape = (R', 4)
label (array): ground-truth bounding-box labels, label.shape = (R',), values in 0-19
loc_normalize_mean (tuple of four floats): mean used to normalize the offsets between the sampled rois and the ground-truth bboxes they are assigned to
loc_normalize_std (tuple of four floats): standard deviation used to normalize those offsets
2. Return values:
sample_roi: the sampled rois, sample_roi.shape = (self.n_sample, 4)
gt_roi_loc: the offsets between the sampled rois and their assigned ground-truth bboxes, gt_roi_loc.shape = (self.n_sample, 4)
gt_roi_label: the labels of the sampled rois, gt_roi_label.shape = (self.n_sample,), values 0-20 where 0 means negative (background)
3. The step-by-step details are covered in the code comments above.
class AnchorTargetCreator(object):
def __init__(self,
n_sample=256,
pos_iou_thresh=0.7, neg_iou_thresh=0.3,
pos_ratio=0.5):
self.n_sample = n_sample
self.pos_iou_thresh = pos_iou_thresh
self.neg_iou_thresh = neg_iou_thresh
self.pos_ratio = pos_ratio
def __call__(self, bbox, anchor, img_size):
img_H, img_W = img_size
n_anchor = len(anchor)
inside_index = _get_inside_index(anchor, img_H, img_W)
# _get_inside_index returns the indices of the anchors that lie completely inside the image
anchor = anchor[inside_index]
# drop the anchors that are not completely inside the image for now; they are restored later by _unmap
argmax_ious, label = self._create_label(
inside_index, anchor, bbox)
# compute bounding box regression targets
loc = bbox2loc(anchor, bbox[argmax_ious])
# offset from each anchor to the gt bbox with the highest IoU with it, shape (a, 4)
# map up to original set of anchors
label = _unmap(label, n_anchor, inside_index, fill=-1)
# after _unmap, label.shape is back to (R,); the labels of the previously dropped anchors are set to -1
loc = _unmap(loc, n_anchor, inside_index, fill=0)
# after _unmap, loc.shape is back to (R, 4); the loc of the previously dropped anchors is set to fill (the value does not matter, it is never used)
return loc, label
def _create_label(self, inside_index, anchor, bbox):
# label: 1 is positive, 0 is negative, -1 is dont care
label = np.empty((len(inside_index),), dtype=np.int32)
label.fill(-1)
argmax_ious, max_ious, gt_argmax_ious = \
self._calc_ious(anchor, bbox, inside_index)
# assign negative labels first so that positive labels can clobber them
label[max_ious < self.neg_iou_thresh] = 0
# positive label: for each gt, anchor with highest iou
label[gt_argmax_ious] = 1
# positive label: above threshold IOU
label[max_ious >= self.pos_iou_thresh] = 1
# subsample positive labels if we have too many
n_pos = int(self.pos_ratio * self.n_sample)
pos_index = np.where(label == 1)[0]
if len(pos_index) > n_pos:
disable_index = np.random.choice(
pos_index, size=(len(pos_index) - n_pos), replace=False)
label[disable_index] = -1
# subsample negative labels if we have too many
n_neg = self.n_sample - np.sum(label == 1)
neg_index = np.where(label == 0)[0]
if len(neg_index) > n_neg:
disable_index = np.random.choice(
neg_index, size=(len(neg_index) - n_neg), replace=False)
label[disable_index] = -1
return argmax_ious, label
# return value argmax_ious: for each anchor, the index of the gt bbox with the highest IoU, shape (a,)
# label: the label of each anchor, shape (a,)
def _calc_ious(self, anchor, bbox, inside_index):
# ious between the anchors and the gt boxes
ious = bbox_iou(anchor, bbox)
# ious.shape=(a,b)
argmax_ious = ious.argmax(axis=1)
# argmax_ious.shape = (a,): for each anchor, the index of the gt bbox with the highest IoU
max_ious = ious[np.arange(len(inside_index)), argmax_ious]
# equivalent to max_ious = ious.max(axis=1)
gt_argmax_ious = ious.argmax(axis=0)
# gt_argmax_ious.shape = (b,): for each gt bbox, the index of the anchor with the highest IoU
gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
# equivalent to gt_max_ious = ious.max(axis=0)
gt_argmax_ious = np.where(ious == gt_max_ious)[0]
# ious.shape = (a, b)
# gt_max_ious.shape = (b,)
# gt_argmax_ious.shape = (>=b,)
# the steps above first find, for each gt bbox, the index of the anchor with the highest IoU,
# then use that index to look up each gt bbox's maximum IoU
# (why not just use .max directly?),
# and finally collect the indices of all anchors whose IoU equals some gt bbox's maximum IoU
# (there may be more than one such anchor per gt bbox)
return argmax_ious, max_ious, gt_argmax_ious
# return values:
# argmax_ious: for each anchor, the index of the gt bbox with the highest IoU, shape (a,)
# max_ious: for each anchor, its highest IoU value, shape (a,)
# gt_argmax_ious: indices of the anchors that attain some gt bbox's maximum IoU, shape (>=b,)
Purpose: produce labeled training data for the RPN.
Purpose: initialize an AnchorTargetCreator object and store its parameters.
Implementation:
1. Parameters:
n_sample (int): total number of anchors to sample
pos_ratio (float): fraction of positives
pos_iou_thresh (float): minimum IoU with some ground truth for an anchor to count as a positive
neg_iou_thresh: an anchor counts as a negative if its IoU with every ground truth is below neg_iou_thresh
Purpose: compute each anchor's offset to its gt bbox and assign it a label; sample n_sample anchors, with a fraction pos_ratio of positives, as one batch for training the RPN.
Implementation:
1. Parameters:
bbox (array): ground-truth bbox coordinates, bbox.shape = (R, 4)
anchor (array): anchor coordinates, anchor.shape = (S, 4)
img_size (tuple of ints): (H, W) image size
2. Return values:
loc (array): offsets between the anchors and the ground-truth bboxes, loc.shape = (S, 4)
label (array): whether each anchor is a positive or a negative: 0 negative, 1 positive, -1 ignored; label.shape = (S,)
In addition, two helper functions support AnchorTargetCreator; they were already mentioned while reading its code, so they are not repeated here.
def _unmap(data, count, index, fill=0):
# Unmap a subset of item (data) back to the original set of items (of
# size count)
if len(data.shape) == 1:
ret = np.empty((count,), dtype=data.dtype)
ret.fill(fill)
ret[index] = data
else:
ret = np.empty((count,) + data.shape[1:], dtype=data.dtype)
ret.fill(fill)
ret[index, :] = data
return ret
def _get_inside_index(anchor, H, W):
# Calc indices of anchors which are located completely inside of the image
# whose size is specified.
index_inside = np.where(
(anchor[:, 0] >= 0) &
(anchor[:, 1] >= 0) &
(anchor[:, 2] <= H) &
(anchor[:, 3] <= W)
)[0]
return index_inside
2019/11/08 & 2019/11/10
import numpy as np
from torch.nn import functional as F
import torch as t
from torch import nn
from model.utils.bbox_tools import generate_anchor_base
from model.utils.creator_tool import ProposalCreator
def __init__(
self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2],
anchor_scales=[8, 16, 32], feat_stride=16,
proposal_creator_params=dict(),
):
super(RegionProposalNetwork, self).__init__()
self.anchor_base = generate_anchor_base(
anchor_scales=anchor_scales, ratios=ratios)
self.feat_stride = feat_stride
self.proposal_layer = ProposalCreator(self, **proposal_creator_params)
n_anchor = self.anchor_base.shape[0]
self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)
self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
normal_init(self.conv1, 0, 0.01)
normal_init(self.score, 0, 0.01)
normal_init(self.loc, 0, 0.01)
def forward(self, x, img_size, scale=1.):
n, _, hh, ww = x.shape
anchor = _enumerate_shifted_anchor(
np.array(self.anchor_base),
self.feat_stride, hh, ww)
# anchor.shape = (A*hh*ww,4)
n_anchor = anchor.shape[0] // (hh * ww)
h = F.relu(self.conv1(x))
# h.shape = (n,mid_channels,hh,ww)
rpn_locs = self.loc(h)
# rpn_locs.shape = (n,n_anchor * 4,hh,ww)
# UNNOTE: check whether need contiguous
# A: Yes
rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)
# rpn_locs.shape = (n, hh*ww*n_anchor, 4)
rpn_scores = self.score(h)
rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous()
# rpn_scores.shape = (n,hh,ww,n_anchor * 2)
rpn_softmax_scores = F.softmax(rpn_scores.view(n, hh, ww, n_anchor, 2), dim=4)
rpn_fg_scores = rpn_softmax_scores[:, :, :, :, 1].contiguous()
# fg = foreground scores
rpn_fg_scores = rpn_fg_scores.view(n, -1)
#rpn_fg_scores.shape = (n,hh*ww*n_anchors)
rpn_scores = rpn_scores.view(n, -1, 2)
# rpn_scores.shape = (n,hh*ww*n_anchors,2)
# use ProposalCreator: obtain bboxes from the anchors and the predicted offsets, clip them to the image boundary,
# keep the part with the highest foreground probabilities, drop those whose height or width is too small,
# then run NMS and keep the top boxes after NMS as the proposal regions
rois = list()
roi_indices = list()
# generate rois for each of the n images separately, then concatenate them into one big rois array
# to record which image each roi in the concatenated array belongs to, each roi gets an index, i.e. roi_indices with the same length as rois
for i in range(n):
roi = self.proposal_layer(
rpn_locs[i].cpu().data.numpy(),
rpn_fg_scores[i].cpu().data.numpy(),
anchor, img_size,
scale=scale)
batch_index = i * np.ones((len(roi),), dtype=np.int32)
rois.append(roi)
roi_indices.append(batch_index)
rois = np.concatenate(rois, axis=0)
roi_indices = np.concatenate(roi_indices, axis=0)
return rpn_locs, rpn_scores, rois, roi_indices, anchor
Purpose: the implementation of the RPN.
Purpose:
store the parameters, define the building blocks of the RPN, and initialize their weights.
Implementation:
1. Parameters:
in_channels (int): number of input channels
mid_channels (int): number of channels after the 3x3 convolution
ratios (list of floats): height-to-width ratios of the anchors
anchor_scales (list of numbers): anchor scales; an anchor's area equals (base_size * anchor_scale)**2
feat_stride (int): distance on the original image between two adjacent feature-map positions fed into the RPN; the feature extractor applies four stride-2 poolings, so feat_stride is 16 here
initialW (callable): initial weights; when None, a Gaussian with standard deviation 0.1 is used
proposal_creator_params (dict): keyword arguments for model.utils.creator_tools.ProposalCreator
2. Main components (nn.Conv2d arguments: in, out, kernel, stride, padding):
conv1: convolution, in_channels, mid_channels, 3, 1, 1
score: 1x1 convolution, mid_channels, n_anchor*2, 1, 1, 0
loc: 1x1 convolution, mid_channels, n_anchor*4, 1, 1, 0
proposal_layer: a ProposalCreator
Implementation (forward):
1. Parameters:
x (~torch.autograd.Variable): the features extracted from the images, x.shape = (N, C, H, W)
img_size (tuple of ints): image size after scaling, (height, width)
scale: the scale factor
2. Return values:
rpn_locs: predicted offsets for all anchors, rpn_locs.shape = (N, H*W*A, 4)
rpn_scores: predicted foreground/background scores of the anchors, rpn_scores.shape = (N, H*W*A, 2)
rois: the rois generated for the N images of this batch, rois.shape = (S, 4)
roi_indices: index recording which image each roi belongs to, roi_indices.shape = (S,), values in [0, N)
anchor: (A*H*W, 4)
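A shape walk-through with made-up sizes (a sketch; it assumes the class lives in model/region_proposal_network.py as in the repo, and that cupy is available because ProposalCreator's NMS runs on it):

import torch as t
from model.region_proposal_network import RegionProposalNetwork

rpn = RegionProposalNetwork(512, 512).eval()   # eval(): ProposalCreator keeps at most 300 rois
x = t.randn(1, 512, 37, 50)                    # e.g. a 600x800 image after VGG16 (feat_stride=16)
rpn_locs, rpn_scores, rois, roi_indices, anchor = rpn(x, img_size=(600, 800), scale=1.)
print(rpn_locs.shape, rpn_scores.shape)        # (1, 16650, 4) (1, 16650, 2), 16650 = 9*37*50
print(anchor.shape, rois.shape)                # (16650, 4) (<=300, 4)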
def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
import numpy as xp
shift_y = xp.arange(0, height * feat_stride, feat_stride)
shift_x = xp.arange(0, width * feat_stride, feat_stride)
shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
shift_y.ravel(), shift_x.ravel()), axis=1)
A = anchor_base.shape[0]
K = shift.shape[0]
anchor = anchor_base.reshape((1, A, 4)) + \
shift.reshape((1, K, 4)).transpose((1, 0, 2))
anchor = anchor.reshape((K * A, 4)).astype(np.float32)
return anchor
Purpose: produce all anchors of one image by shifting the base anchors, A*K anchors in total, where A = len(ratios) * len(anchor_scales) and K = (feature-map height) * (feature-map width).
A detailed walkthrough of the implementation is attached in a separate note; the broadcasting trick is sketched on a toy example below.
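A toy version of the enumeration (a 2x2 feature map and 2 made-up base anchors), just to show the (1, A, 4) + (K, 1, 4) broadcast:

import numpy as np

anchor_base = np.array([[-8., -8., 8., 8.],
                        [-16., -16., 16., 16.]], dtype=np.float32)   # A = 2 base anchors
shift_y, shift_x = np.meshgrid(np.arange(0, 2 * 16, 16),
                               np.arange(0, 2 * 16, 16), indexing='ij')
shift = np.stack((shift_y.ravel(), shift_x.ravel(),
                  shift_y.ravel(), shift_x.ravel()), axis=1)          # (K, 4), K = 4 positions

# (1, A, 4) + (K, 1, 4) -> (K, A, 4): every base anchor is copied to every position
anchor = (anchor_base[None] + shift[:, None]).reshape(-1, 4)
print(anchor.shape)   # (8, 4) = (K*A, 4)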
def _enumerate_shifted_anchor_torch(anchor_base, feat_stride, height, width):
# xp = cuda.get_array_module(anchor_base)
import torch as t
shift_y = t.arange(0, height * feat_stride, feat_stride)
shift_x = t.arange(0, width * feat_stride, feat_stride)
shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
shift_y.ravel(), shift_x.ravel()), axis=1)
A = anchor_base.shape[0]
K = shift.shape[0]
anchor = anchor_base.reshape((1, A, 4)) + \
shift.reshape((1, K, 4)).transpose((1, 0, 2))
anchor = anchor.reshape((K * A, 4)).astype(np.float32)
return anchor
Purpose: the same as the previous function, except that the previous one uses numpy while this one uses torch.
def normal_init(m, mean, stddev, truncated=False):
"""
weight initalizer: truncated normal and random normal.
"""
# x is a parameter
if truncated:
m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation
else:
m.weight.data.normal_(mean, stddev)
m.bias.data.zero_()
Purpose: normal_init, weight initialization with a (truncated) normal distribution.
2019/11/11
This file implements the RoI Pooling module; its role is explained well in this blog post (https://www.cnblogs.com/wangyong/p/8523814.html). Below is only a brief look at the RoI class and RoIPooling2D.
The RoI class inherits from Function and implements the forward and backward functions; think of it as a variant of F.max_pool2d.
RoIPooling2D inherits from Module and only needs to implement forward; think of it as a variant of nn.MaxPool2d. Given the feature map features (N, C, H, W) and the corresponding rois, it produces a fixed-size (e.g. 7*7) feature for each roi. Roughly: divide each roi by 16 (the downscaling factor from the original image to the feature map; remember that roi coordinates refer to the original image) to get the roi's region on the feature map, partition that region into a 7*7 grid, and max-pool each cell, giving a 7*7 feature; the final output should be (len(rois), channel, 7, 7).
(But rois should be the collection of rois of all images in a batch; how are they told apart here without the index? To be revisited.)
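For comparison only: recent torchvision versions ship the same operation as torchvision.ops.roi_pool. A minimal sketch (my own example, not the repo's code): torchvision expects boxes as (batch_index, x1, y1, x2, y2) in original-image coordinates, and that batch-index column plays the role of roi_indices here, which is what answers the question above about telling images apart.

import torch as t
from torchvision.ops import roi_pool

features = t.randn(1, 512, 37, 50)                   # (N, C, H, W) feature map
rois_xyxy = t.tensor([[0., 64., 32., 384., 256.]])   # one roi: (batch_index, x1, y1, x2, y2)
pooled = roi_pool(features, rois_xyxy, output_size=(7, 7), spatial_scale=1. / 16)
print(pooled.shape)                                  # torch.Size([1, 512, 7, 7])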
from __future__ import absolute_import
from __future__ import division
import torch as t
import numpy as np
import cupy as cp
from utils import array_tool as at
from model.utils.bbox_tools import loc2bbox
from model.utils.nms import non_maximum_suppression
from torch import nn
from data.dataset import preprocess
from torch.nn import functional as F
from utils.config import opt
def nograd(f):
def new_f(*args,**kwargs):
with t.no_grad():
return f(*args,**kwargs)
return new_f
Purpose: the base class of Faster R-CNN. Faster R-CNN consists of three parts: feature extraction; the RPN; and the localization and classification head.
def __init__(self, extractor, rpn, head,
loc_normalize_mean = (0., 0., 0., 0.),
loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
):
super(FasterRCNN, self).__init__()
self.extractor = extractor
self.rpn = rpn
self.head = head
# mean and std
self.loc_normalize_mean = loc_normalize_mean
self.loc_normalize_std = loc_normalize_std
self.use_preset('evaluate')
Purpose: Faster R-CNN consists of three parts: feature extraction; RPN; localization and classification heads; the three parts are implemented by the nn.Module objects extractor, rpn and head respectively.
@property
def n_class(self):
# Total number of classes including the background.
return self.head.n_class
def forward(self, x, scale=1.):
img_size = x.shape[2:]
h = self.extractor(x)
# extract features
rpn_locs, rpn_scores, rois, roi_indices, anchor = \
self.rpn(h, img_size, scale)
# generate proposal regions (rois)
roi_cls_locs, roi_scores = self.head(
h, rois, roi_indices)
# localize and classify each roi
# each roi gets 21 localizations (84 values), one offset/scale correction for each possible class
return roi_cls_locs, roi_scores, rois, roi_indices
def use_preset(self, preset):
if preset == 'visualize':
self.nms_thresh = 0.3
self.score_thresh = 0.7
# when visualizing, for a roi to be assigned class a it is not enough that class a scores highest,
# the score must also reach score_thresh = 0.7
elif preset == 'evaluate':
self.nms_thresh = 0.3
self.score_thresh = 0.05
else:
raise ValueError('preset must be visualize or evaluate')
def _suppress(self, raw_cls_bbox, raw_prob):
# read predict first, then come back to this
# raw_cls_bbox holds all bboxes obtained by applying the 21 per-class corrections to one image's rois
# raw_cls_bbox.shape = (len(rois), 84)
# raw_prob.shape = (len(rois), 21)
bbox = list()
label = list()
score = list()
# skip cls_id = 0 because it is the background class
for l in range(1, self.n_class):
cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
# from bbox reshaped to (len(rois), 21, 4), take the boxes corrected under the class-l hypothesis
# cls_bbox_l.shape = (len(rois), 4)
# i.e. the bboxes obtained by assuming every roi belongs to class l and applying class l's correction
prob_l = raw_prob[:, l]
# prob_l.shape = (len(rois),): the probability of each roi belonging to class l
mask = prob_l > self.score_thresh
cls_bbox_l = cls_bbox_l[mask]
prob_l = prob_l[mask]
# keep the ones whose probability exceeds the threshold
keep = non_maximum_suppression(
cp.array(cls_bbox_l), self.nms_thresh, prob_l)
# run NMS
keep = cp.asnumpy(keep)
bbox.append(cls_bbox_l[keep])
# The labels are in [0, self.n_class - 2].
label.append((l - 1) * np.ones((len(keep),)))
score.append(prob_l[keep])
bbox = np.concatenate(bbox, axis=0).astype(np.float32)
label = np.concatenate(label, axis=0).astype(np.int32)
score = np.concatenate(score, axis=0).astype(np.float32)
return bbox, label, score
Purpose: for the corrected bboxes, per class, keep the ones whose probability of belonging to that class reaches the threshold, then run non-maximum suppression; the surviving bboxes are the final detections.
@nograd
def predict(self, imgs,sizes=None,visualize=False):
self.eval()
if visualize:
self.use_preset('visualize')
prepared_imgs = list()
sizes = list()
for img in imgs:
size = img.shape[1:]
img = preprocess(at.tonumpy(img))
prepared_imgs.append(img)
sizes.append(size)
else:
prepared_imgs = imgs
bboxes = list()
labels = list()
scores = list()
# at this point prepared_imgs holds the preprocessed images (scaled and normalized)
# and sizes holds their original sizes
for img, size in zip(prepared_imgs, sizes):
img = at.totensor(img[None]).float()
# add a dimension to make a batch of size 1
scale = img.shape[3] / size[1]
# scaled W / original W gives the scale factor
roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
# We are assuming that batch size is 1.
# run the forward pass on this single image
# the full return value is roi_cls_locs, roi_scores, rois, roi_indices
# here all rois belong to the same image, so roi_indices is not needed
# roi_cls_loc.shape = (len(rois), 84)
# roi_score.shape = (len(rois), 21)
roi_score = roi_scores.data
roi_cls_loc = roi_cls_loc.data
roi = at.totensor(rois) / scale
# convert the rois from the scaled image back to the original image
# Convert predictions to bounding boxes in image coordinates.
# Bounding boxes are scaled to the scale of the input images.
mean = t.Tensor(self.loc_normalize_mean).cuda(). \
repeat(self.n_class)[None]
std = t.Tensor(self.loc_normalize_std).cuda(). \
repeat(self.n_class)[None]
roi_cls_loc = (roi_cls_loc * std + mean)
roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
# roi_cls_loc.shape = (len(rois),21,4)
roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
# roi.shape = (len(rois), 4) becomes roi.shape = (len(rois), 1, 4)
# and is then expanded to (len(rois), 21, 4)
# i.e. each roi will be shifted once for each of the 21 possible classes
cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
at.tonumpy(roi_cls_loc).reshape((-1, 4)))
# cls_bbox.shape = (len(rois)*21, 4)
# the corrected box of every roi under each of the 21 class hypotheses
cls_bbox = at.totensor(cls_bbox)
cls_bbox = cls_bbox.view(-1, self.n_class * 4)
# cls_bbox.shape = (len(rois),84)
# clip bounding box
cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])
prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))
# prob.shape = (len(rois),21)
raw_cls_bbox = at.tonumpy(cls_bbox)
raw_prob = at.tonumpy(prob)
bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
bboxes.append(bbox)
labels.append(label)
scores.append(score)
self.use_preset('evaluate')
self.train()
return bboxes, labels, scores
from __future__ import absolute_import
import os
from collections import namedtuple
import time
from torch.nn import functional as F
from model.utils.creator_tool import AnchorTargetCreator, ProposalTargetCreator
from torch import nn
import torch as t
from utils import array_tool as at
from utils.vis_tool import Visualizer
from utils.config import opt
from torchnet.meter import ConfusionMeter, AverageValueMeter
LossTuple = namedtuple('LossTuple',
['rpn_loc_loss',
'rpn_cls_loss',
'roi_loc_loss',
'roi_cls_loss',
'total_loss'
])
Purpose: a wrapper around the model for convenient training; returns the losses.
def __init__(self, faster_rcnn):
super(FasterRCNNTrainer, self).__init__()
self.faster_rcnn = faster_rcnn
self.rpn_sigma = opt.rpn_sigma
self.roi_sigma = opt.roi_sigma
# target creator create gt_bbox gt_label etc as training targets.
self.anchor_target_creator = AnchorTargetCreator()
self.proposal_target_creator = ProposalTargetCreator()
self.loc_normalize_mean = faster_rcnn.loc_normalize_mean
self.loc_normalize_std = faster_rcnn.loc_normalize_std
self.optimizer = self.faster_rcnn.get_optimizer()
# visdom wrapper
self.vis = Visualizer(env=opt.env)
# indicators for training status
self.rpn_cm = ConfusionMeter(2)
self.roi_cm = ConfusionMeter(21)
self.meters = {k: AverageValueMeter() for k in LossTuple._fields} # average loss
def forward(self, imgs, bboxes, labels, scale):
n = bboxes.shape[0]
if n != 1:
raise ValueError('Currently only batch size 1 is supported.')
_, _, H, W = imgs.shape
img_size = (H, W)
features = self.faster_rcnn.extractor(imgs)
# extract features
rpn_locs, rpn_scores, rois, roi_indices, anchor = \
self.faster_rcnn.rpn(features, img_size, scale)
# generate proposal regions
# Since batch size is one, convert variables to singular form
bbox = bboxes[0]
label = labels[0]
rpn_score = rpn_scores[0]
rpn_loc = rpn_locs[0]
roi = rois
# Sample RoIs and forward
# it's fine to break the computation graph of rois,
# consider them as constant input
sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
roi,
at.tonumpy(bbox),
at.tonumpy(label),
self.loc_normalize_mean,
self.loc_normalize_std)
# label these rois (0-20) and sample a subset of them as one batch of training data
# sample_roi, gt_roi_loc, gt_roi_label are the sampled rois, their offsets to the ground truth, and the labels of the ground truth they are assigned to
# NOTE it's all zero because now it only support for batch=1 now
sample_roi_index = t.zeros(len(sample_roi))
roi_cls_loc, roi_score = self.faster_rcnn.head(
features,
sample_roi,
sample_roi_index)
# ------------------ RPN losses -------------------#
gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
at.tonumpy(bbox),
anchor,
img_size)
# label all anchors and compute their offsets to the gt bboxes they are assigned to (only anchors labeled 0 or 1 are actually used)
# loc (array): offsets between the anchors and the ground-truth bboxes, loc.shape = (S, 4)
# label (array): whether each anchor is a positive or a negative: 0 negative, 1 positive, -1 ignored; label.shape = (S,)
gt_rpn_label = at.totensor(gt_rpn_label).long()
gt_rpn_loc = at.totensor(gt_rpn_loc)
rpn_loc_loss = _fast_rcnn_loc_loss(
rpn_loc,  # offsets to the gt bboxes predicted by the RPN for all anchors
gt_rpn_loc,  # true offsets between all anchors and their gt bboxes
gt_rpn_label.data,  # labels of all anchors (0, 1, -1)
# although all anchors are passed in, only the positives actually contribute to the loc loss
self.rpn_sigma)
# NOTE: default value of ignore_index is -100 ...
rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
_gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
_rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())
# ------------------ ROI losses (fast rcnn loss) -------------------#
n_sample = roi_cls_loc.shape[0]
roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
# roi_cls_loc .shape = (n_sample, 21, 4)
roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
at.totensor(gt_roi_label).long()]
# for each roi, pick the predicted offset corresponding to its gt_roi_label; roi_loc.shape = (n_sample, 4)
gt_roi_label = at.totensor(gt_roi_label).long()
gt_roi_loc = at.totensor(gt_roi_loc)
# the true offset of each roi to its gt bbox; gt_roi_loc.shape = (n_sample, 4)
roi_loc_loss = _fast_rcnn_loc_loss(
roi_loc.contiguous(),
gt_roi_loc,
gt_roi_label.data,  # the loc loss is computed only for rois with label > 0
self.roi_sigma)
roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())
losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
losses = losses + [sum(losses)]
return LossTuple(*losses)
Implementation:
1. Parameters:
imgs (~torch.autograd.Variable): a batch of images, (N, C, H, W)
bboxes (~torch.autograd.Variable): the ground-truth bboxes of the images, (N, R, 4)
labels (~torch.autograd.Variable): the labels of the ground-truth bboxes, (N, R), values 0-19
scale (float): the scale factor of the original images; arguably this should also be an array, but since the training batch size is 1 it does not matter
def train_step(self, imgs, bboxes, labels, scale):
self.optimizer.zero_grad()
losses = self.forward(imgs, bboxes, labels, scale)
losses.total_loss.backward()
self.optimizer.step()
self.update_meters(losses)
return losses
def save(self, save_optimizer=False, save_path=None, **kwargs):
save_dict = dict()
save_dict['model'] = self.faster_rcnn.state_dict()
save_dict['config'] = opt._state_dict()
save_dict['other_info'] = kwargs
save_dict['vis_info'] = self.vis.state_dict()
if save_optimizer:
save_dict['optimizer'] = self.optimizer.state_dict()
if save_path is None:
timestr = time.strftime('%m%d%H%M')
save_path = 'checkpoints/fasterrcnn_%s' % timestr
for k_, v_ in kwargs.items():
save_path += '_%s' % v_
save_dir = os.path.dirname(save_path)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
t.save(save_dict, save_path)
self.vis.save([self.vis.env])
return save_path
def load(self, path, load_optimizer=True, parse_opt=False, ):
state_dict = t.load(path)
if 'model' in state_dict:
self.faster_rcnn.load_state_dict(state_dict['model'])
else: # legacy way, for backward compatibility
self.faster_rcnn.load_state_dict(state_dict)
return self
if parse_opt:
opt._parse(state_dict['config'])
if 'optimizer' in state_dict and load_optimizer:
self.optimizer.load_state_dict(state_dict['optimizer'])
return self
def update_meters(self, losses):
loss_d = {k: at.scalar(v) for k, v in losses._asdict().items()}
for key, meter in self.meters.items():
meter.add(loss_d[key])
def reset_meters(self):
for key, meter in self.meters.items():
meter.reset()
self.roi_cm.reset()
self.rpn_cm.reset()
def get_meter_data(self):
return {k: v.value()[0] for k, v in self.meters.items()}
def _smooth_l1_loss(x, t, in_weight, sigma):
sigma2 = sigma ** 2
diff = in_weight * (x - t)
abs_diff = diff.abs()
flag = (abs_diff.data < (1. / sigma2)).float()
y = (flag * (sigma2 / 2.) * (diff ** 2) +
(1 - flag) * (abs_diff - 0.5 / sigma2))
return y.sum()
def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
in_weight = t.zeros(gt_loc.shape).cuda()
# Localization loss is calculated only for positive rois.
# NOTE: unlike origin implementation,
# we don't need inside_weight and outside_weight, they can calculate by gt_label
in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1
loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)
# Normalize by total number of negative and positive rois.
loc_loss /= ((gt_label >= 0).sum().float()) # ignore gt_label==-1 for rpn_loss
return loc_loss
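For reference, _smooth_l1_loss computes the standard smooth L1 loss with a sigma parameter, applied elementwise to x = in_weight * (pred - gt) and then summed:

$\mathrm{smooth}_{L_1}(x)=\begin{cases}\dfrac{\sigma^{2}}{2}\,x^{2}, & |x|<\dfrac{1}{\sigma^{2}}\\[4pt]|x|-\dfrac{0.5}{\sigma^{2}}, & \text{otherwise}\end{cases}$

_fast_rcnn_loc_loss then divides the sum by the number of anchors/rois whose label is >= 0.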
2019/11/12
def eval(dataloader, faster_rcnn, test_num=10000):
pred_bboxes, pred_labels, pred_scores = list(), list(), list()
gt_bboxes, gt_labels, gt_difficults = list(), list(), list()
for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):
sizes = [sizes[0][0].item(), sizes[1][0].item()]
pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
gt_bboxes += list(gt_bboxes_.numpy())
gt_labels += list(gt_labels_.numpy())
gt_difficults += list(gt_difficults_.numpy())
pred_bboxes += pred_bboxes_
pred_labels += pred_labels_
pred_scores += pred_scores_
if ii == test_num: break
result = eval_detection_voc(
pred_bboxes, pred_labels, pred_scores,
gt_bboxes, gt_labels, gt_difficults,
use_07_metric=True)
return result
def train(**kwargs):
opt._parse(kwargs)
dataset = Dataset(opt)
print('load data')
dataloader = data_.DataLoader(dataset, \
batch_size=1, \
shuffle=True, \
# pin_memory=True,
num_workers=opt.num_workers)
testset = TestDataset(opt)
test_dataloader = data_.DataLoader(testset,
batch_size=1,
num_workers=opt.test_num_workers,
shuffle=False, \
pin_memory=True
)
faster_rcnn = FasterRCNNVGG16()
print('model construct completed')
trainer = FasterRCNNTrainer(faster_rcnn).cuda()
if opt.load_path:
trainer.load(opt.load_path)
print('load pretrained model from %s' % opt.load_path)
trainer.vis.text(dataset.db.label_names, win='labels')
best_map = 0
lr_ = opt.lr
for epoch in range(opt.epoch):
trainer.reset_meters()
for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
scale = at.scalar(scale)
img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
trainer.train_step(img, bbox, label, scale)
if (ii + 1) % opt.plot_every == 0:
if os.path.exists(opt.debug_file):
ipdb.set_trace()
# plot loss
trainer.vis.plot_many(trainer.get_meter_data())
# plot ground truth bboxes
ori_img_ = inverse_normalize(at.tonumpy(img[0]))
gt_img = visdom_bbox(ori_img_,
at.tonumpy(bbox_[0]),
at.tonumpy(label_[0]))
trainer.vis.img('gt_img', gt_img)
# plot predicted bboxes
_bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
pred_img = visdom_bbox(ori_img_,
at.tonumpy(_bboxes[0]),
at.tonumpy(_labels[0]).reshape(-1),
at.tonumpy(_scores[0]))
trainer.vis.img('pred_img', pred_img)
# rpn confusion matrix(meter)
trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
# roi confusion matrix
trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
trainer.vis.plot('test_map', eval_result['map'])
lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
str(eval_result['map']),
str(trainer.get_meter_data()))
trainer.vis.log(log_info)
if eval_result['map'] > best_map:
best_map = eval_result['map']
best_path = trainer.save(best_map=best_map)
if epoch == 9:
trainer.load(best_path)
trainer.faster_rcnn.scale_lr(opt.lr_decay)
lr_ = lr_ * opt.lr_decay
if epoch == 13:
break