The walkthrough starts from the main function of the training script ./tools/train_net.py.
The data-loading layer: RoIDataLayer
First,
imdb, roidb = combined_roidb(args.imdb_name) # the argument imdb_name defaults to voc_2007_trainval (the dataset name)
print '{:d} roidb entries'.format(len(roidb))
Next, the function combined_roidb:
def combined_roidb(imdb_names):
def get_roidb(imdb_name):
        imdb = get_imdb(imdb_name) # defined in factory.py; returns a pascal_voc dataset object
        # by default get_imdb returns pascal_voc('trainval', '2007')
        # this only sets imdb attributes such as the image path and the image-name index; no real image data is read yet
print 'Loaded dataset `{:s}` for training'.format(imdb.name)
imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
roidb = get_training_roidb(imdb)
return roidb
roidbs = [get_roidb(s) for s in imdb_names.split('+')]
    # imdb_names defaults to voc_2007_trainval, so split('+') yields a single name
    # the inner function get_roidb is called once per dataset name
roidb = roidbs[0]
if len(roidbs) > 1:
for r in roidbs[1:]:
roidb.extend(r)
imdb = datasets.imdb.imdb(imdb_names)
else:
imdb = get_imdb(imdb_names)
return imdb, roidb
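To make the merge behaviour concrete, here is a minimal sketch with toy stand-in roidbs; the second dataset name and the dict contents are purely illustrative:
# Hypothetical illustration of how '+'-joined dataset names are merged (not the actual training call).
names = 'voc_2007_trainval+voc_2012_trainval'
roidbs = [[{'image': '%s_img_%d' % (n, i)} for i in range(2)] for n in names.split('+')]
roidb = roidbs[0]
for r in roidbs[1:]:
    roidb.extend(r)      # entries from later datasets are appended to the first roidb
print len(roidb)         # -> 4; a single combined list drives training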
The class backing the pascal_voc dataset:
class pascal_voc(imdb): # 继承于 imdb 类的子类
def __init__(self, image_set, year, devkit_path=None):
imdb.__init__(self, 'voc_' + year + '_' + image_set)
self._year = year
self._image_set = image_set
self._devkit_path = '/data/VOCdevkit'
self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
self._classes = ('__background__', # always index 0
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor')
self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
self._image_ext = '.jpg'
self._image_index = self._load_image_set_index()
# Default to roidb handler
self._roidb_handler = self.selective_search_roidb
        self._salt = str(uuid.uuid4()) # random suffix for result file names, so concurrent evaluations do not clash
        self._comp_id = 'comp4' # PASCAL VOC competition id (comp4: detection with external training data)
# PASCAL specific config options
self.config = {'cleanup' : True,
'use_salt' : True,
'use_diff' : False,
'matlab_eval' : False,
'rpn_file' : None,
'min_size' : 2}
assert os.path.exists(self._devkit_path), \
'VOCdevkit path does not exist: {}'.format(self._devkit_path)
assert os.path.exists(self._data_path), \
'Path does not exist: {}'.format(self._data_path)
The parent imdb class (datasets/imdb.py):
class imdb(object):
"""Image database."""
def __init__(self, name):
self._name = name
self._num_classes = 0
self._classes = []
self._image_index = []
self._obj_proposer = 'selective_search'
self._roidb = None
self._roidb_handler = self.default_roidb
# Use this dict for storing dataset specific config options
self.config = {}
The resulting imdb = pascal_voc('trainval', '2007') records the following:
[1] - _class_to_ind, a dict mapping class name to label value (starting from 0), with (key[0], value[0]) = ('__background__', 0)
[2] - _classes, the object class names, 20 (object classes) + 1 (background) = 21 classes
[3] - _data_path, the path to the dataset
[4] - _image_ext, '.jpg', the image file extension
[5] - _image_index, the list of image name indices
[6] - _image_set, 'trainval'
[7] - _name, the dataset name, voc_2007_trainval
[8] - _num_classes, 0
[9] - _obj_proposer, selective_search
[10] - _roidb, None
[11] - classes, same as _classes
[12] - image_index, same as _image_index
[13] - name, the dataset name, same as _name
[14] - num_classes, the number of classes, 21
[15] - num_images, the number of images
[16] - config, a dict of PASCAL-specific configuration options
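As a quick sanity check, a minimal sketch that inspects a few of these attributes; it assumes the VOCdevkit2007 data is in place and that py-faster-rcnn's lib/ directory is on PYTHONPATH:
from datasets.factory import get_imdb

imdb = get_imdb('voc_2007_trainval')   # the same object as pascal_voc('trainval', '2007')
print imdb.name                        # voc_2007_trainval
print imdb.num_classes                 # 21
print imdb.classes[:3]                 # ('__background__', 'aeroplane', 'bicycle')
print len(imdb.image_index)            # 5011 trainval images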
After the imdb is created, the next step is:
imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
In config.py the default value of cfg.TRAIN.PROPOSAL_METHOD is selective_search,
while experiments/cfgs/faster_rcnn_end2end.yml overrides cfg.TRAIN.PROPOSAL_METHOD to gt.
The set_proposal_method function:
def set_proposal_method(self, method):
        method = eval('self.' + method + '_roidb') # eval turns the string into an expression; with 'gt' this resolves to self.gt_roidb, a pascal_voc method
self.roidb_handler = method
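The eval call is nothing more than string-based attribute lookup; a hypothetical equivalent written with getattr (not how the repo actually spells it) would be:
def set_proposal_method(self, method):
    # e.g. method = 'gt'  ->  self.roidb_handler = self.gt_roidb
    self.roidb_handler = getattr(self, method + '_roidb')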
def gt_roidb(self):
"""
Return the database of ground-truth regions of interest.
This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} gt roidb loaded from {}'.format(self.name, cache_file)
return roidb
gt_roidb = [self._load_pascal_annotation(index)
for index in self.image_index]
with open(cache_file, 'wb') as fid:
cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote gt roidb to {}'.format(cache_file)
return gt_roidb
def _load_pascal_annotation(self, index):
"""
Load image and bounding boxes info from XML file in the PASCAL VOC
format.
"""
filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
tree = ET.parse(filename)
objs = tree.findall('object')
if not self.config['use_diff']:
# Exclude the samples labeled as difficult
non_diff_objs = [
obj for obj in objs if int(obj.find('difficult').text) == 0]
# if len(non_diff_objs) != len(objs):
# print 'Removed {} difficult objects'.format(
# len(objs) - len(non_diff_objs))
objs = non_diff_objs
num_objs = len(objs)
boxes = np.zeros((num_objs, 4), dtype=np.uint16)
gt_classes = np.zeros((num_objs), dtype=np.int32)
overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
# "Seg" area for pascal is just the box area
seg_areas = np.zeros((num_objs), dtype=np.float32)
# Load object bounding boxes into a data frame.
for ix, obj in enumerate(objs):
bbox = obj.find('bndbox')
# Make pixel indexes 0-based
x1 = float(bbox.find('xmin').text) - 1
y1 = float(bbox.find('ymin').text) - 1
x2 = float(bbox.find('xmax').text) - 1
y2 = float(bbox.find('ymax').text) - 1
cls = self._class_to_ind[obj.find('name').text.lower().strip()]
boxes[ix, :] = [x1, y1, x2, y2]
gt_classes[ix] = cls
overlaps[ix, cls] = 1.0
seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
overlaps = scipy.sparse.csr_matrix(overlaps)
return {'boxes' : boxes,
'gt_classes': gt_classes,
'gt_overlaps' : overlaps,
'flipped' : False,
'seg_areas' : seg_areas}
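To see what _load_pascal_annotation extracts, a minimal sketch that parses a made-up annotation with ElementTree; the XML content is invented for illustration, real files live under the Annotations directory:
import xml.etree.ElementTree as ET

xml_str = ('<annotation><object><name>dog</name><difficult>0</difficult>'
           '<bndbox><xmin>48</xmin><ymin>240</ymin>'
           '<xmax>195</xmax><ymax>371</ymax></bndbox></object></annotation>')
obj = ET.fromstring(xml_str).findall('object')[0]
bbox = obj.find('bndbox')
x1, y1 = float(bbox.find('xmin').text) - 1, float(bbox.find('ymin').text) - 1
x2, y2 = float(bbox.find('xmax').text) - 1, float(bbox.find('ymax').text) - 1
print [x1, y1, x2, y2]                 # 0-based box: [47.0, 239.0, 194.0, 370.0]
print (x2 - x1 + 1) * (y2 - y1 + 1)    # seg_area is simply the box area: 19536.0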
Continuing,
roidb = get_training_roidb(imdb)
we enter the get_training_roidb function, defined in fast_rcnn/train.py:
def get_training_roidb(imdb):
"""Returns a roidb (Region of Interest database) for use in training."""
    if cfg.TRAIN.USE_FLIPPED: # whether to horizontally flip the training images
print 'Appending horizontally-flipped training examples...'
imdb.append_flipped_images()
        # horizontally mirrors every image in the imdb, so the 5011 trainval images become 10022 roidb entries (see the flip sketch after this function)
print 'done'
print 'Preparing training data...'
    rdl_roidb.prepare_roidb(imdb) # enrich the roidb with per-image metadata
print 'done'
return imdb.roidb
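append_flipped_images does not duplicate any pixels; it only appends mirrored roidb entries whose box x-coordinates are reflected about the image width (the pixel flip happens later, at load time). A minimal sketch of that box arithmetic with a toy width and box:
import numpy as np

width = 500
boxes = np.array([[48, 240, 195, 371]], dtype=np.uint16)   # x1, y1, x2, y2
flipped = boxes.copy()
oldx1, oldx2 = boxes[:, 0].copy(), boxes[:, 2].copy()
flipped[:, 0] = width - oldx2 - 1     # new x1 comes from the old x2
flipped[:, 2] = width - oldx1 - 1     # new x2 comes from the old x1
print flipped                         # [[304 240 451 371]], stored with 'flipped': True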
Entering the rdl_roidb.prepare_roidb function (rdl_roidb is roi_data_layer.roidb):
def prepare_roidb(imdb):
"""Enrich the imdb's roidb by adding some derived quantities that
are useful for training. This function precomputes the maximum
overlap, taken over ground-truth boxes, between each ROI and
each ground-truth box. The class with maximum overlap is also
recorded.
"""
sizes = [PIL.Image.open(imdb.image_path_at(i)).size
             for i in xrange(imdb.num_images)] # the width and height of every image
roidb = imdb.roidb
for i in xrange(len(imdb.image_index)):
        roidb[i]['image'] = imdb.image_path_at(i) # image file path
        roidb[i]['width'] = sizes[i][0] # image width
        roidb[i]['height'] = sizes[i][1] # image height
# need gt_overlaps as a dense array for argmax
gt_overlaps = roidb[i]['gt_overlaps'].toarray()
# max overlap with gt over classes (columns)
max_overlaps = gt_overlaps.max(axis=1)
# gt class that had the max overlap
max_classes = gt_overlaps.argmax(axis=1)
roidb[i]['max_classes'] = max_classes
roidb[i]['max_overlaps'] = max_overlaps
        # sanity checks
# max overlap of 0 => class should be zero (background)
zero_inds = np.where(max_overlaps == 0)[0]
assert all(max_classes[zero_inds] == 0)
# max overlap > 0 => class should not be zero (must be a fg class)
nonzero_inds = np.where(max_overlaps > 0)[0]
assert all(max_classes[nonzero_inds] != 0)
The resulting imdb.roidb is a list containing the information for every sample. Each element imdb.roidb[index] is a dict with the following keys:
[1] - boxes, the box coordinates, a box_num x 4 ndarray
[2] - flipped, whether the image is horizontally flipped, True or False
[3] - gt_classes, the ground-truth class labels of all boxes in the image, a box_num x 1 ndarray
[4] - gt_overlaps, the per-class scores of all boxes in the image, a box_num x classes_num sparse matrix
[5] - height, the image height
[6] - width, the image width
[7] - image, the image path
[8] - max_classes, the class with the highest score for each box, a box_num x 1 ndarray
[9] - max_overlaps, the maximum score over all classes for each box, a box_num x 1 ndarray
The call then returns to combined_roidb(imdb_names) in train_net.py:
roidb = roidbs[0]
    if len(roidbs) > 1: # skipped when only one dataset is used
for r in roidbs[1:]:
roidb.extend(r)
imdb = datasets.imdb.imdb(imdb_names)
else:
        imdb = get_imdb(imdb_names) # i.e. the imdb = pascal_voc('trainval', '2007') discussed above
The roidb conceptually belongs to the imdb.
At this point the roidb still has not read any actual image data; it only holds the bookkeeping information for each sample.
The next step is:
train_net(args.solver, roidb, output_dir, pretrained_model=args.pretrained_model, max_iters=args.max_iters)
Entering train_net:
def train_net(solver_prototxt, roidb, output_dir,
pretrained_model=None, max_iters=40000):
"""Train a Fast R-CNN network."""
roidb = filter_roidb(roidb)
sw = SolverWrapper(solver_prototxt, roidb, output_dir,
pretrained_model=pretrained_model)
print 'Solving...'
model_paths = sw.train_model(max_iters)
print 'done solving'
return model_paths
Here the roidb is first cleaned up by filter_roidb, which removes entries with no usable RoIs:
def filter_roidb(roidb):
"""Remove roidb entries that have no usable RoIs."""
def is_valid(entry):
# Valid images have:
# (1) At least one foreground RoI OR
# (2) At least one background RoI
overlaps = entry['max_overlaps']
# find boxes with sufficient overlap
fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
(overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# image is only valid if such boxes exist
valid = len(fg_inds) > 0 or len(bg_inds) > 0
return valid
num = len(roidb)
filtered_roidb = [entry for entry in roidb if is_valid(entry)]
num_after = len(filtered_roidb)
print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after,
num, num_after)
return filtered_roidb
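With the default thresholds from fast_rcnn/config.py (FG_THRESH = 0.5, BG_THRESH_HI = 0.5, BG_THRESH_LO = 0.1; worth verifying in your checkout), the validity test reduces to the sketch below, using a made-up max_overlaps vector:
import numpy as np

FG_THRESH, BG_THRESH_HI, BG_THRESH_LO = 0.5, 0.5, 0.1   # assumed defaults
max_overlaps = np.array([1.0, 0.3, 0.05])               # toy entry: one fg box, one bg box, one ignored box
fg_inds = np.where(max_overlaps >= FG_THRESH)[0]
bg_inds = np.where((max_overlaps < BG_THRESH_HI) & (max_overlaps >= BG_THRESH_LO))[0]
print len(fg_inds) > 0 or len(bg_inds) > 0              # True, so this entry is kept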
Next:
sw = SolverWrapper(solver_prototxt, roidb, output_dir,
pretrained_model=pretrained_model)
SolverWrapper is used to wrap Caffe's solver object:
class SolverWrapper(object):
"""A simple wrapper around Caffe's solver.
    This wrapper gives us control over the snapshotting process, which we
use to unnormalize the learned bounding-box regression weights.
"""
def __init__(self, solver_prototxt, roidb, output_dir,
pretrained_model=None):
"""Initialize the SolverWrapper."""
self.output_dir = output_dir
if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
# RPN can only use precomputed normalization because there are no
# fixed statistics to compute a priori
assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED
if cfg.TRAIN.BBOX_REG:
print 'Computing bounding-box regression targets...'
self.bbox_means, self.bbox_stds = \
rdl_roidb.add_bbox_regression_targets(roidb)
            # computes bbox regression targets against the ground truth and adds a 'bbox_targets' key to each roidb dict
print 'done'
        self.solver = caffe.SGDSolver(solver_prototxt) # load solver.prototxt and construct the net
        # tops of the data layer in train.prototxt:
        # top[0] - data (1, 3, 600, 1000)
        # top[1] - im_info (1, 3)
        # top[2] - gt_boxes (1, 4)
if pretrained_model is not None:
print ('Loading pretrained model '
'weights from {:s}').format(pretrained_model)
            self.solver.net.copy_from(pretrained_model) # initialize the weights from the pretrained model
self.solver_param = caffe_pb2.SolverParameter()
with open(solver_prototxt, 'rt') as f:
pb2.text_format.Merge(f.read(), self.solver_param)
        self.solver.net.layers[0].set_roidb(roidb) # key step: calls RoIDataLayer.set_roidb so the prepared roidb is handed to the net's data layer
The first function called here is add_bbox_regression_targets:
def add_bbox_regression_targets(roidb):
"""Add information needed to train bounding-box regressors."""
assert len(roidb) > 0
assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'
num_images = len(roidb)
# Infer number of classes from the number of columns in gt_overlaps
num_classes = roidb[0]['gt_overlaps'].shape[1]
for im_i in xrange(num_images):
rois = roidb[im_i]['boxes']
max_overlaps = roidb[im_i]['max_overlaps']
max_classes = roidb[im_i]['max_classes']
roidb[im_i]['bbox_targets'] = \
_compute_targets(rois, max_overlaps, max_classes)
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
# Use fixed / precomputed "means" and "stds" instead of empirical values
means = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
stds = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
else:
# Compute values needed for means and stds
# var(x) = E(x^2) - E(x)^2
class_counts = np.zeros((num_classes, 1)) + cfg.EPS
sums = np.zeros((num_classes, 4))
squared_sums = np.zeros((num_classes, 4))
for im_i in xrange(num_images):
targets = roidb[im_i]['bbox_targets']
for cls in xrange(1, num_classes):
cls_inds = np.where(targets[:, 0] == cls)[0]
if cls_inds.size > 0:
class_counts[cls] += cls_inds.size
sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
squared_sums[cls, :] += \
(targets[cls_inds, 1:] ** 2).sum(axis=0)
means = sums / class_counts
stds = np.sqrt(squared_sums / class_counts - means ** 2)
print 'bbox target means:'
print means
print means[1:, :].mean(axis=0) # ignore bg class
print 'bbox target stdevs:'
print stds
print stds[1:, :].mean(axis=0) # ignore bg class
# Normalize targets
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
print "Normalizing targets"
for im_i in xrange(num_images):
targets = roidb[im_i]['bbox_targets']
for cls in xrange(1, num_classes):
cls_inds = np.where(targets[:, 0] == cls)[0]
roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
else:
print "NOT normalizing targets"
# These values will be needed for making predictions
# (the predicts will need to be unnormalized and uncentered)
return means.ravel(), stds.ravel()
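When BBOX_NORMALIZE_TARGETS_PRECOMPUTED is set, the targets are simply shifted and scaled by fixed statistics; the config.py defaults are means (0.0, 0.0, 0.0, 0.0) and stds (0.1, 0.1, 0.2, 0.2), which is worth verifying in your checkout. A toy sketch with a hypothetical target:
import numpy as np

means = np.array([0.0, 0.0, 0.0, 0.0])   # assumed BBOX_NORMALIZE_MEANS
stds  = np.array([0.1, 0.1, 0.2, 0.2])   # assumed BBOX_NORMALIZE_STDS
target = np.array([0.1, 0.1, 0.0, 0.0])  # hypothetical (dx, dy, dw, dh)
print (target - means) / stds            # [ 1.  1.  0.  0.]; this is what the regressor is trained on
This is exactly what the SolverWrapper docstring refers to: snapshot() later un-normalizes the learned bbox_pred weights with the same means and stds before saving.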
def _compute_targets(rois, overlaps, labels):
"""Compute bounding-box regression targets for an image."""
# Indices of ground-truth ROIs
gt_inds = np.where(overlaps == 1)[0]
if len(gt_inds) == 0:
# Bail if the image has no ground-truth ROIs
return np.zeros((rois.shape[0], 5), dtype=np.float32)
# Indices of examples for which we try to make predictions
ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]
# Get IoU overlap between each ex ROI and gt ROI
ex_gt_overlaps = bbox_overlaps(
np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
# Find which gt ROI each ex ROI has max overlap with:
# this will be the ex ROI's gt target
gt_assignment = ex_gt_overlaps.argmax(axis=1)
gt_rois = rois[gt_inds[gt_assignment], :]
ex_rois = rois[ex_inds, :]
targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    targets[ex_inds, 0] = labels[ex_inds] # class label in column 0
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) # regression targets (dx, dy, dw, dh) in columns 1-4
return targets
def bbox_transform(ex_rois, gt_rois):
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = np.log(gt_widths / ex_widths)
targets_dh = np.log(gt_heights / ex_heights)
targets = np.vstack(
(targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
return targets
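A worked toy example of the four targets, with made-up boxes just to make the formulas concrete:
import numpy as np

ex = np.array([[0., 0., 99., 99.]])      # hypothetical proposal: a 100x100 box at the origin
gt = np.array([[10., 10., 109., 109.]])  # hypothetical ground truth: same size, shifted by 10 pixels

ex_w, ex_h = ex[:, 2] - ex[:, 0] + 1.0, ex[:, 3] - ex[:, 1] + 1.0
gt_w, gt_h = gt[:, 2] - gt[:, 0] + 1.0, gt[:, 3] - gt[:, 1] + 1.0
ex_cx, ex_cy = ex[:, 0] + 0.5 * ex_w, ex[:, 1] + 0.5 * ex_h
gt_cx, gt_cy = gt[:, 0] + 0.5 * gt_w, gt[:, 1] + 0.5 * gt_h
print (gt_cx - ex_cx) / ex_w, (gt_cy - ex_cy) / ex_h   # dx = dy = 0.1 (shift relative to proposal size)
print np.log(gt_w / ex_w), np.log(gt_h / ex_h)         # dw = dh = 0.0 (identical size)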
Now consider the last line of SolverWrapper.__init__, self.solver.net.layers[0].set_roidb(roidb):
def set_roidb(self, roidb):
"""Set the roidb to be used by this layer during training."""
self._roidb = roidb
        self._shuffle_roidb_inds() # randomly permute the order of the roidb
        if cfg.TRAIN.USE_PREFETCH: # disabled by default, so the prefetch branch below can be ignored
self._blob_queue = Queue(10)
self._prefetch_process = BlobFetcher(self._blob_queue,
self._roidb,
self._num_classes)
self._prefetch_process.start()
# Terminate the child process when the parent exists
def cleanup():
print 'Terminating BlobFetcher'
self._prefetch_process.terminate()
self._prefetch_process.join()
import atexit
atexit.register(cleanup)
def _shuffle_roidb_inds(self):
"""Randomly permute the training roidb."""
if cfg.TRAIN.ASPECT_GROUPING:
widths = np.array([r['width'] for r in self._roidb])
heights = np.array([r['height'] for r in self._roidb])
horz = (widths >= heights)
vert = np.logical_not(horz)
horz_inds = np.where(horz)[0]
vert_inds = np.where(vert)[0]
inds = np.hstack((
np.random.permutation(horz_inds),
np.random.permutation(vert_inds)))
inds = np.reshape(inds, (-1, 2))
row_perm = np.random.permutation(np.arange(inds.shape[0]))
inds = np.reshape(inds[row_perm, :], (-1,))
self._perm = inds
else:
self._perm = np.random.permutation(np.arange(len(self._roidb)))
self._cur = 0
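ASPECT_GROUPING pairs images of the same orientation (landscape vs. portrait) so that the two images of a minibatch can be packed into one blob with little padding. A toy sketch of the pairing, with invented image sizes and a fixed seed:
import numpy as np
np.random.seed(0)

widths  = np.array([640, 480, 500, 375, 640, 600])
heights = np.array([480, 640, 375, 500, 426, 400])
horz_inds = np.where(widths >= heights)[0]   # landscape images: indices 0, 2, 4, 5
vert_inds = np.where(widths < heights)[0]    # portrait images: indices 1, 3
inds = np.hstack((np.random.permutation(horz_inds),
                  np.random.permutation(vert_inds))).reshape(-1, 2)
row_perm = np.random.permutation(np.arange(inds.shape[0]))
print inds[row_perm, :].reshape(-1)          # consecutive index pairs share an orientation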
At this point the roidb is fully prepared, but no image data has been read yet.
The forward part of RoIDataLayer:
class RoIDataLayer(caffe.Layer):
"""Fast R-CNN data layer used for training."""
def _shuffle_roidb_inds(self):
"""Randomly permute the training roidb."""
if cfg.TRAIN.ASPECT_GROUPING:
widths = np.array([r['width'] for r in self._roidb])
heights = np.array([r['height'] for r in self._roidb])
horz = (widths >= heights)
vert = np.logical_not(horz)
horz_inds = np.where(horz)[0]
vert_inds = np.where(vert)[0]
inds = np.hstack((
np.random.permutation(horz_inds),
np.random.permutation(vert_inds)))
inds = np.reshape(inds, (-1, 2))
row_perm = np.random.permutation(np.arange(inds.shape[0]))
inds = np.reshape(inds[row_perm, :], (-1,))
self._perm = inds
else:
self._perm = np.random.permutation(np.arange(len(self._roidb)))
self._cur = 0
def _get_next_minibatch_inds(self):
"""Return the roidb indices for the next minibatch."""
if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
            self._shuffle_roidb_inds() # reshuffle the roidb order once a full pass over the data is complete
        db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] # image indices for this mini-batch
self._cur += cfg.TRAIN.IMS_PER_BATCH
return db_inds
def _get_next_minibatch(self):
"""Return the blobs to be used for the next minibatch.
If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
separate process and made available through self._blob_queue.
"""
        if cfg.TRAIN.USE_PREFETCH: # skipped (disabled by default)
return self._blob_queue.get()
else:
            db_inds = self._get_next_minibatch_inds() # indices of the next mini-batch
            minibatch_db = [self._roidb[i] for i in db_inds] # the roidb entries of this mini-batch
            return get_minibatch(minibatch_db, self._num_classes) # build the actual data blobs for the mini-batch
def setup(self, bottom, top):
"""Setup the RoIDataLayer."""
# parse the layer parameter string, which must be valid YAML
layer_params = yaml.load(self.param_str_)
self._num_classes = layer_params['num_classes']
self._name_to_top_map = {}
# data blob: holds a batch of N images, each with 3 channels
idx = 0
top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3,
max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
self._name_to_top_map['data'] = idx
idx += 1
if cfg.TRAIN.HAS_RPN:
top[idx].reshape(1, 3)
self._name_to_top_map['im_info'] = idx
idx += 1
top[idx].reshape(1, 4)
self._name_to_top_map['gt_boxes'] = idx
idx += 1
else: # not using RPN
# rois blob: holds R regions of interest, each is a 5-tuple
# (n, x1, y1, x2, y2) specifying an image batch index n and a
# rectangle (x1, y1, x2, y2)
top[idx].reshape(1, 5)
self._name_to_top_map['rois'] = idx
idx += 1
# labels blob: R categorical labels in [0, ..., K] for K foreground
# classes plus background
top[idx].reshape(1)
self._name_to_top_map['labels'] = idx
idx += 1
if cfg.TRAIN.BBOX_REG:
# bbox_targets blob: R bounding-box regression targets with 4
# targets per class
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_targets'] = idx
idx += 1
# bbox_inside_weights blob: At most 4 targets per roi are active;
                # this binary vector specifies the subset of active targets
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_inside_weights'] = idx
idx += 1
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_outside_weights'] = idx
idx += 1
print 'RoiDataLayer: name_to_top:', self._name_to_top_map
assert len(top) == len(self._name_to_top_map)
def forward(self, bottom, top):
"""Get blobs and copy them into this layer's top blob vector."""
blobs = self._get_next_minibatch()
# blobs - data, im_info, gt_boxes
for blob_name, blob in blobs.iteritems():
top_ind = self._name_to_top_map[blob_name]
# Reshape net's input blobs
top[top_ind].reshape(*(blob.shape))
# Copy data into net's input blobs
            top[top_ind].data[...] = blob.astype(np.float32, copy=False) # copy each blob into the corresponding top
def backward(self, top, propagate_down, bottom):
"""This layer does not propagate gradients."""
pass
def reshape(self, bottom, top):
"""Reshaping happens during the call to forward."""
pass
Here get_minibatch reads the image data and roidb data for one mini-batch:
def get_minibatch(roidb, num_classes):
"""Given a roidb, construct a minibatch sampled from it."""
    num_images = len(roidb) # number of images in the mini-batch
# Sample random scales to use for each image in this batch
random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
size=num_images)
assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
'num_images ({}) must divide BATCH_SIZE ({})'. \
format(num_images, cfg.TRAIN.BATCH_SIZE)
    rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images # number of RoIs sampled per image
    fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) # number of RoIs per image labelled as foreground
    # cfg.TRAIN.FG_FRACTION = 0.25, the fraction of RoIs labelled as foreground (class > 0)
# Get the input image blob, formatted for caffe
im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)
blobs = {'data': im_blob} # data blob
if cfg.TRAIN.HAS_RPN:
        assert len(im_scales) == 1, "Single batch only" # with RPN each mini-batch holds a single image at a single scale
assert len(roidb) == 1, "Single batch only"
# gt boxes: (x1, y1, x2, y2, cls)
gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
blobs['gt_boxes'] = gt_boxes
blobs['im_info'] = np.array(
[[im_blob.shape[2], im_blob.shape[3], im_scales[0]]],
dtype=np.float32)
else: # not using RPN
# Now, build the region of interest and label blobs
rois_blob = np.zeros((0, 5), dtype=np.float32)
labels_blob = np.zeros((0), dtype=np.float32)
bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
# all_overlaps = []
for im_i in xrange(num_images):
labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \
= _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
num_classes)
# Add to RoIs blob
rois = _project_im_rois(im_rois, im_scales[im_i])
batch_ind = im_i * np.ones((rois.shape[0], 1))
rois_blob_this_image = np.hstack((batch_ind, rois))
rois_blob = np.vstack((rois_blob, rois_blob_this_image))
# Add to labels, bbox targets, and bbox loss blobs
labels_blob = np.hstack((labels_blob, labels))
bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights))
# all_overlaps = np.hstack((all_overlaps, overlaps))
# For debug visualizations
# _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps)
blobs['rois'] = rois_blob
blobs['labels'] = labels_blob
if cfg.TRAIN.BBOX_REG:
blobs['bbox_targets'] = bbox_targets_blob
blobs['bbox_inside_weights'] = bbox_inside_blob
blobs['bbox_outside_weights'] = \
np.array(bbox_inside_blob > 0).astype(np.float32)
return blobs
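For the RPN path the gt boxes are just scaled into the resized image, with the class index appended as a fifth column. A toy sketch with a hypothetical box and the scale 1.6 used in the resizing example further below:
import numpy as np

im_scale = 1.6
boxes = np.array([[47., 239., 194., 370.]], dtype=np.float32)  # 0-based gt box in the original image
gt_classes = np.array([12])                                    # class index ('dog' is 12 in the class list above)
gt_boxes = np.empty((1, 5), dtype=np.float32)
gt_boxes[:, 0:4] = boxes * im_scale   # coordinates in the resized image
gt_boxes[:, 4] = gt_classes           # class index in the last column
im_info = np.array([[600., 800., 1.6]], dtype=np.float32)      # resized height, width, and scale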
def _get_image_blob(roidb, scale_inds):
"""Builds an input blob from the images in the roidb at the specified
scales.
"""
num_images = len(roidb)
processed_ims = []
im_scales = []
for i in xrange(num_images):
        im = cv2.imread(roidb[i]['image']) # the image pixels are actually read from disk only here
if roidb[i]['flipped']:
im = im[:, ::-1, :]
        target_size = cfg.TRAIN.SCALES[scale_inds[i]] # target length of the shorter image side, 600 by default
im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
cfg.TRAIN.MAX_SIZE)
im_scales.append(im_scale)
processed_ims.append(im)
# Create a blob to hold the input images
    # convert the list of processed images into the network's data blob
blob = im_list_to_blob(processed_ims)
return blob, im_scales
def prep_im_for_blob(im, pixel_means, target_size, max_size):
"""Mean subtract and scale an image for use in a blob."""
im = im.astype(np.float32, copy=False)
    im -= pixel_means # subtract the per-channel mean
im_shape = im.shape
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min) # scale so that the shorter side becomes target_size
# Prevent the biggest axis from being more than MAX_SIZE
if np.round(im_scale * im_size_max) > max_size:
im_scale = float(max_size) / float(im_size_max)
im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
interpolation=cv2.INTER_LINEAR)
return im, im_scale
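A quick numeric check of the scaling logic, using a made-up image size; SCALES = (600,) and MAX_SIZE = 1000 are the config.py defaults:
import numpy as np

target_size, max_size = 600, 1000
h, w = 375, 500                                 # hypothetical input image size
im_scale = float(target_size) / min(h, w)       # 600 / 375 = 1.6
if np.round(im_scale * max(h, w)) > max_size:   # 1.6 * 500 = 800 <= 1000, so no cap is applied
    im_scale = float(max_size) / max(h, w)
print im_scale, int(round(h * im_scale)), int(round(w * im_scale))   # 1.6 600 800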
def im_list_to_blob(ims):
"""Convert a list of images into a network input.
Assumes images are already prepared (means subtracted, BGR order, ...).
"""
max_shape = np.array([im.shape for im in ims]).max(axis=0)
num_images = len(ims)
blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
dtype=np.float32)
for i in xrange(num_images):
im = ims[i]
blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
# Move channels (axis 3) to axis 1
# Axis order will become: (batch elem, channel, height, width)
channel_swap = (0, 3, 1, 2)
blob = blob.transpose(channel_swap)
return blob
That traces the call sequence in rough order; it is fairly convoluted and easy to get lost in.
The overall flow is roughly: