Faster R-CNN 训练过程源码理解

训练脚本 ./tools/train_net.py 主函数开始.

数据读取层 RoIDataLayer


# Build the imdb/roidb pair for the dataset named on the command line.
imdb, roidb = combined_roidb(args.imdb_name) # input argument imdb_name; the default is voc_2007_trainval (the dataset name)
print '{:d} roidb entries'.format(len(roidb))

然后,函数 combined_roidb:

def combined_roidb(imdb_names):
    def get_roidb(imdb_name):
        imdb = get_imdb(imdb_name) # factory.py 中的函数,调用的是 pascal_voc 的数据集对象
        # get_imdb 默认返回的是 pascal_voc('trainval', '2007')
        # 设置imdb的一些属性,如图片路径,图片名称索引等,未读取真正的图片数据

        print 'Loaded dataset `{:s}` for training'.format(imdb.name)
        print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
        roidb = get_training_roidb(imdb)
        return roidb

    roidbs = [get_roidb(s) for s in imdb_names.split('+')] 
    # imdb_names.split('+') 默认值是 voc_2007_trainval 
    # 需要调用内部函数 get_roidb

    roidb = roidbs[0]
    if len(roidbs) > 1:
        for r in roidbs[1:]:
        imdb = datasets.imdb.imdb(imdb_names)
        imdb = get_imdb(imdb_names)
    return imdb, roidb

pascal_voc 数据集对应的类的对象:

class pascal_voc(imdb):  # subclass of imdb
    """PASCAL VOC dataset description (paths, class names, image index)."""

    def __init__(self, image_set, year, devkit_path=None):
        """Set up dataset metadata; no image data is read here.

        Args:
            image_set: image set name, e.g. 'trainval'.
            year: dataset year as a string, e.g. '2007'.
            devkit_path: VOCdevkit directory; falls back to
                '/data/VOCdevkit' when None.

        Raises:
            AssertionError: if the devkit or data directory does not exist.
        """
        imdb.__init__(self, 'voc_' + year + '_' + image_set)
        self._year = year
        self._image_set = image_set
        # Honour an explicitly supplied devkit path instead of silently
        # ignoring the parameter.
        self._devkit_path = ('/data/VOCdevkit' if devkit_path is None
                             else devkit_path)
        self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
        self._classes = ('__background__', # always index 0
                         'aeroplane', 'bicycle', 'bird', 'boat',
                         'bottle', 'bus', 'car', 'cat', 'chair',
                         'cow', 'diningtable', 'dog', 'horse',
                         'motorbike', 'person', 'pottedplant',
                         'sheep', 'sofa', 'train', 'tvmonitor')
        # class name -> integer label, background first
        self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
        self._image_ext = '.jpg'
        self._image_index = self._load_image_set_index()
        # Default to roidb handler
        self._roidb_handler = self.selective_search_roidb
        # Random per-instance string; presumably used to make output file
        # names unique (see 'use_salt') -- TODO confirm
        self._salt = str(uuid.uuid4())
        # Presumably the PASCAL competition id used in result file names
        # -- TODO confirm
        self._comp_id = 'comp4'

        # PASCAL specific config options
        self.config = {'cleanup'     : True,
                       'use_salt'    : True,
                       'use_diff'    : False,
                       'matlab_eval' : False,
                       'rpn_file'    : None,
                       'min_size'    : 2}

        assert os.path.exists(self._devkit_path), \
                'VOCdevkit path does not exist: {}'.format(self._devkit_path)
        assert os.path.exists(self._data_path), \
                'Path does not exist: {}'.format(self._data_path)

class imdb(object):
    """Image database."""

    def __init__(self, name):
        """Create an empty database called *name*.

        Subclasses are expected to fill in the class list and image index;
        the roidb itself starts out as None.
        """
        # Identity and (initially empty) dataset description
        self._name = name
        self._num_classes = 0
        self._classes = list()
        self._image_index = list()

        # Proposal source and roidb storage
        self._obj_proposer = 'selective_search'
        self._roidb = None
        self._roidb_handler = self.default_roidb

        # Dataset-specific configuration options live in this dict
        self.config = dict()

得到的 imdb = pascal_voc('trainval', '2007') 记录的内容如下:

[1] - _class_to_ind,dict 类型,key 是类别名,value 是 label 值(从 0 开始),其中 (key[0], value[0]) = [background, 0]

[2] - _classes,object 类别名,共 20(object classes) + 1(background) = 21 classes.

[3] - _data_path,数据集路径

[4] - _image_ext,图片文件扩展名,'.jpg'

[5] - _image_index,图片索引列表

[6] - _image_set,’trainval’

[7] - _name,数据集名称 voc_2007_trainval

[8] - _num_classes,0

[9] - _obj_proposer,selective_search

[10] - _roidb,None

[11] - classes,与_classes 相同

[12] - image_index,与_image_index 相同

[13] - name,数据集名称,与 _name 相同

[14] - num_classes,类别数,21

[15] - num_images,图片数

[16] - config,dict 类型,PASCAL 数据集指定的配置

读取 imdb 后,接着调用 imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD) 设置 proposal 方法,

print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)

config.py 中 cfg.TRAIN.PROPOSAL_METHOD 值为 selective_search

experiments/cfgs/faster_rcnn_end2end.yml 中 cfg.TRAIN.PROPOSAL_METHOD 值为 gt

set_proposal_method 函数,

def set_proposal_method(self, method):
    """Select the roidb handler named ``<method>_roidb`` on this object.

    Args:
        method: handler prefix, e.g. 'gt' selects ``self.gt_roidb``.

    Raises:
        AttributeError: if no ``<method>_roidb`` attribute exists.
    """
    # getattr does the same attribute lookup the former
    # eval('self.' + method + '_roidb') did, without evaluating a string
    # as code.
    self.roidb_handler = getattr(self, method + '_roidb')
    def gt_roidb(self):
        Return the database of ground-truth regions of interest.

        This function loads/saves from/to a cache file to speed up future calls.
        cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
        if os.path.exists(cache_file):
            with open(cache_file, 'rb') as fid:
                roidb = cPickle.load(fid)
            print '{} gt roidb loaded from {}'.format(self.name, cache_file)
            return roidb

        gt_roidb = [self._load_pascal_annotation(index)
                    for index in self.image_index]
        with open(cache_file, 'wb') as fid:
            cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
        print 'wrote gt roidb to {}'.format(cache_file)

        return gt_roidb

    def _load_pascal_annotation(self, index):
        Load image and bounding boxes info from XML file in the PASCAL VOC
        filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')
        if not self.config['use_diff']:
            # Exclude the samples labeled as difficult
            non_diff_objs = [
                obj for obj in objs if int(obj.find('difficult').text) == 0]
            # if len(non_diff_objs) != len(objs):
            #     print 'Removed {} difficult objects'.format(
            #         len(objs) - len(non_diff_objs))
            objs = non_diff_objs
        num_objs = len(objs)

        boxes = np.zeros((num_objs, 4), dtype=np.uint16)
        gt_classes = np.zeros((num_objs), dtype=np.int32)
        overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
        # "Seg" area for pascal is just the box area
        seg_areas = np.zeros((num_objs), dtype=np.float32)

        # Load object bounding boxes into a data frame.
        for ix, obj in enumerate(objs):
            bbox = obj.find('bndbox')
            # Make pixel indexes 0-based
            x1 = float(bbox.find('xmin').text) - 1
            y1 = float(bbox.find('ymin').text) - 1
            x2 = float(bbox.find('xmax').text) - 1
            y2 = float(bbox.find('ymax').text) - 1
            cls = self._class_to_ind[obj.find('name').text.lower().strip()]
            boxes[ix, :] = [x1, y1, x2, y2]
            gt_classes[ix] = cls
            overlaps[ix, cls] = 1.0
            seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)

        overlaps = scipy.sparse.csr_matrix(overlaps)

        return {'boxes' : boxes,
                'gt_classes': gt_classes,
                'gt_overlaps' : overlaps,
                'flipped' : False,
                'seg_areas' : seg_areas}


roidb = get_training_roidb(imdb)

进入 get_training_roidb 函数,位于 fast_rcnn 内的 train.py 中,

def get_training_roidb(imdb):
    """Returns a roidb (Region of Interest database) for use in training."""
    if cfg.TRAIN.USE_FLIPPED: # 是否进行图片翻转
        print 'Appending horizontally-flipped training examples...'
        # 对imdb中涉及到的图像做了一个水平镜像,使得trainval中的5011张图片,变成了10022张图片;
        print 'done'

    print 'Preparing training data...'
    rdl_roidb.prepare_roidb(imdb) # 准备数据
    print 'done'

    return imdb.roidb

进入 rdl_roidb.prepare_roidb 函数,

def prepare_roidb(imdb):
    """Enrich the imdb's roidb by adding some derived quantities that
    are useful for training. This function precomputes the maximum
    overlap, taken over ground-truth boxes, between each ROI and
    each ground-truth box. The class with maximum overlap is also
    recorded.
    """
    # (width, height) of every image
    sizes = [PIL.Image.open(imdb.image_path_at(i)).size
             for i in xrange(imdb.num_images)]
    roidb = imdb.roidb
    for i in xrange(len(imdb.image_index)):
        roidb[i]['image'] = imdb.image_path_at(i) # image path
        roidb[i]['width'] = sizes[i][0] # image width
        roidb[i]['height'] = sizes[i][1] # image height
        # need gt_overlaps as a dense array for argmax
        gt_overlaps = roidb[i]['gt_overlaps'].toarray()
        # max overlap with gt over classes (columns)
        max_overlaps = gt_overlaps.max(axis=1)
        # gt class that had the max overlap
        max_classes = gt_overlaps.argmax(axis=1)
        roidb[i]['max_classes'] = max_classes
        roidb[i]['max_overlaps'] = max_overlaps
        # sanity checks
        # max overlap of 0 => class should be zero (background)
        zero_inds = np.where(max_overlaps == 0)[0]
        assert all(max_classes[zero_inds] == 0)
        # max overlap > 0 => class should not be zero (must be a fg class)
        nonzero_inds = np.where(max_overlaps > 0)[0]
        assert all(max_classes[nonzero_inds] != 0)

得到的 imdb.roidb 是 list 形式,包含所有样本的信息,imdb.roidb[index] 是每个样本的数据,dict 类型,其内容信息如下:

roidb[index],dict 类型,key 与 value 分别对应数据为,

[1] - boxes,box 的位置数据,box_num×4 的 ndarray

[2] - flipped,是否图片翻转,True or False

[3] - gt_classes,图片内所有 boxes 的真实类别标注,box_num×1 的 ndarray

[4] - gt_overlaps,图片内所有 boxes 在不同类别对应的 score,box_num×classes_num 的 matrix

[5] - height,图片的高

[6] - width,图片的宽

[7] - image,图片路径

[8] - max_classes,每个 box 的最高 score 所对应的类别,box_num×1 的 ndarray

[9] - max_overlaps,每个 box 对所有类别的 score 最大值,box_num×1 的 ndarray

函数调用结束,返回 train_net.py 中的 combined_roidb(imdb_names) 函数,

    roidb = roidbs[0]
    if len(roidbs) > 1: # skipped when a single dataset is used
        for r in roidbs[1:]:
            roidb.extend(r)
        imdb = datasets.imdb.imdb(imdb_names)
    else:
        imdb = get_imdb(imdb_names) # i.e. the imdb = pascal_voc('trainval', '2007') mentioned earlier

roidb 应该是属于 imdb 的.

roidb 是没有真正的读取数据的,只是建立相关的数据信息.


train_net(args.solver, roidb, output_dir, pretrained_model=args.pretrained_model, max_iters=args.max_iters)

进入 train_net,

def train_net(solver_prototxt, roidb, output_dir,
              pretrained_model=None, max_iters=40000):
    """Train a Fast R-CNN network."""

    roidb = filter_roidb(roidb)
    sw = SolverWrapper(solver_prototxt, roidb, output_dir,

    print 'Solving...'
    model_paths = sw.train_model(max_iters)
    print 'done solving'
    return model_paths

这里对 roidb 先进行处理,即函数 filter_roidb,去除没用的 RoIs,

def filter_roidb(roidb):
    """Remove roidb entries that have no usable RoIs."""

    def is_valid(entry):
        # Valid images have:
        #   (1) At least one foreground RoI OR
        #   (2) At least one background RoI
        overlaps = entry['max_overlaps']
        # find boxes with sufficient overlap
        fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
        # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
        bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
                           (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
        # image is only valid if such boxes exist
        valid = len(fg_inds) > 0 or len(bg_inds) > 0
        return valid

    num = len(roidb)
    filtered_roidb = [entry for entry in roidb if is_valid(entry)]
    num_after = len(filtered_roidb)
    print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after,
                                                       num, num_after)
    return filtered_roidb


sw = SolverWrapper(solver_prototxt, roidb, output_dir,
                   pretrained_model=pretrained_model)

采用 SolverWrapper 来封装 solver 类的对象,

class SolverWrapper(object):
    """A simple wrapper around Caffe's solver.
    This wrapper gives us control over he snapshotting process, which we
    use to unnormalize the learned bounding-box regression weights.

    def __init__(self, solver_prototxt, roidb, output_dir,
        """Initialize the SolverWrapper."""
        self.output_dir = output_dir

        if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
            # RPN can only use precomputed normalization because there are no
            # fixed statistics to compute a priori

        if cfg.TRAIN.BBOX_REG:
            print 'Computing bounding-box regression targets...'
            self.bbox_means, self.bbox_stds = \
            # 添加 box 回归目标,gt,给roidb dict添加 bbox_targets key.
            print 'done'

        self.solver = caffe.SGDSolver(solver_prototxt) # 加载 solver.prototxt
        # train.prototxt data_top:
        # top[o] - data     (1, 3, 600, 1000)
        # top[1] - im_info  (1, 3)
        # top[2] - gt_boxes (1, 4)

        if pretrained_model is not None:
            print ('Loading pretrained model '
                   'weights from {:s}').format(pretrained_model)
            self.solver.net.copy_from(pretrained_model) # 从预训练模型加载初始化参数

        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            pb2.text_format.Merge(f.read(), self.solver_param)

        self.solver.net.layers[0].set_roidb(roidb) # 关键步骤,调用 RoIDataLayer 中的 set_roidb 函数,以将读取的 roidb 读入网络.

这里调用的第一个函数是 add_bbox_regression_targets,

def add_bbox_regression_targets(roidb):
    """Add information needed to train bounding-box regressors."""
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

    num_images = len(roidb)
    # Infer number of classes from the number of columns in gt_overlaps
    num_classes = roidb[0]['gt_overlaps'].shape[1]
    for im_i in xrange(num_images):
        rois = roidb[im_i]['boxes']
        max_overlaps = roidb[im_i]['max_overlaps']
        max_classes = roidb[im_i]['max_classes']
        roidb[im_i]['bbox_targets'] = \
                _compute_targets(rois, max_overlaps, max_classes)

        # Use fixed / precomputed "means" and "stds" instead of empirical values
        # 采用固定/预先计算的均值和方差
        means = np.tile(
                np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
        stds = np.tile(
                np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
        # Compute values needed for means and stds
        # var(x) = E(x^2) - E(x)^2
        class_counts = np.zeros((num_classes, 1)) + cfg.EPS
        sums = np.zeros((num_classes, 4))
        squared_sums = np.zeros((num_classes, 4))
        for im_i in xrange(num_images):
            targets = roidb[im_i]['bbox_targets']
            for cls in xrange(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                if cls_inds.size > 0:
                    class_counts[cls] += cls_inds.size
                    sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
                    squared_sums[cls, :] += \
                            (targets[cls_inds, 1:] ** 2).sum(axis=0)

        means = sums / class_counts
        stds = np.sqrt(squared_sums / class_counts - means ** 2)

    print 'bbox target means:'
    print means
    print means[1:, :].mean(axis=0) # ignore bg class
    print 'bbox target stdevs:'
    print stds
    print stds[1:, :].mean(axis=0) # ignore bg class

    # Normalize targets
    # 归一化处理
        print "Normalizing targets"
        for im_i in xrange(num_images):
            targets = roidb[im_i]['bbox_targets']
            for cls in xrange(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
                roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
        print "NOT normalizing targets"

    # These values will be needed for making predictions
    # (the predicts will need to be unnormalized and uncentered)
    return means.ravel(), stds.ravel()

def _compute_targets(rois, overlaps, labels):
    """Compute bounding-box regression targets for an image.

    Returns an (N, 5) float32 array with one row per RoI: column 0 holds
    the class label and columns 1-4 the regression targets. Rows of RoIs
    below cfg.TRAIN.BBOX_THRESH stay all-zero.
    """
    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        # Bail if the image has no ground-truth ROIs
        return np.zeros((rois.shape[0], 5), dtype=np.float32)
    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI.
    # np.float64 replaces the np.float alias, which newer NumPy releases
    # no longer provide; both denote double precision.
    ex_gt_overlaps = bbox_overlaps(
        np.ascontiguousarray(rois[ex_inds, :], dtype=np.float64),
        np.ascontiguousarray(rois[gt_inds, :], dtype=np.float64))

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    targets[ex_inds, 0] = labels[ex_inds] # class label
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) # box deltas
    return targets

def bbox_transform(ex_rois, gt_rois):
    """Return the (dx, dy, dw, dh) regression targets that map each row of
    ex_rois onto the matching row of gt_rois.

    Boxes are (x1, y1, x2, y2) rows with inclusive pixel coordinates; the
    result has one row of four targets per box pair.
    """
    def _size_and_center(boxes):
        # Inclusive coordinates, hence the + 1.0 on each extent.
        width = boxes[:, 2] - boxes[:, 0] + 1.0
        height = boxes[:, 3] - boxes[:, 1] + 1.0
        center_x = boxes[:, 0] + 0.5 * width
        center_y = boxes[:, 1] + 0.5 * height
        return width, height, center_x, center_y

    ex_w, ex_h, ex_cx, ex_cy = _size_and_center(ex_rois)
    gt_w, gt_h, gt_cx, gt_cy = _size_and_center(gt_rois)

    # Center offsets are relative to the example box size; size changes are
    # expressed as log ratios.
    deltas = (
        (gt_cx - ex_cx) / ex_w,
        (gt_cy - ex_cy) / ex_h,
        np.log(gt_w / ex_w),
        np.log(gt_h / ex_h),
    )
    return np.vstack(deltas).transpose()

这里对 SolverWrapper 的 __init__ 中的最后一行 self.solver.net.layers[0].set_roidb(roidb) 理解,

    def set_roidb(self, roidb):
        """Set the roidb to be used by this layer during training."""
        self._roidb = roidb
        self._shuffle_roidb_inds() # 打乱 roidb 的顺序
        if cfg.TRAIN.USE_PREFETCH: # 这些都跳过了,忽略
            self._blob_queue = Queue(10)
            self._prefetch_process = BlobFetcher(self._blob_queue,
            # Terminate the child process when the parent exists
            def cleanup():
                print 'Terminating BlobFetcher'
            import atexit

    def _shuffle_roidb_inds(self):
        """Randomly permute the training roidb."""
        if cfg.TRAIN.ASPECT_GROUPING:
            # Shuffle landscape and portrait images separately, then shuffle
            # consecutive pairs, so each pair shares an orientation.
            widths = np.array([r['width'] for r in self._roidb])
            heights = np.array([r['height'] for r in self._roidb])
            horz = (widths >= heights)
            vert = np.logical_not(horz)
            horz_inds = np.where(horz)[0]
            vert_inds = np.where(vert)[0]
            inds = np.hstack((
                np.random.permutation(horz_inds),
                np.random.permutation(vert_inds)))
            # NOTE(review): reshaping to (-1, 2) requires an even number of
            # entries -- confirm against the caller.
            inds = np.reshape(inds, (-1, 2))
            row_perm = np.random.permutation(np.arange(inds.shape[0]))
            inds = np.reshape(inds[row_perm, :], (-1,))
            self._perm = inds
        else:
            self._perm = np.random.permutation(np.arange(len(self._roidb)))
        # Restart reading from the beginning of the new permutation
        self._cur = 0

到这里,roidb 处理好,但仍没有读取图片数据.

RoIDataLayer 的 forward 部分:

class RoIDataLayer(caffe.Layer):
    """Fast R-CNN data layer used for training."""

    def _get_next_minibatch_inds(self):
        """Return the roidb indices for the next minibatch."""
        if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
            self._shuffle_roidb_inds() # 每完成一次全部样本的训练,打乱一次 roidb 的顺序

        db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] # 选取一个 mini-batch 的图片索引
        self._cur += cfg.TRAIN.IMS_PER_BATCH
        return db_inds

    def _get_next_minibatch(self):
        """Return the blobs to be used for the next minibatch.

        If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
        separate process and made available through self._blob_queue.
        if cfg.TRAIN.USE_PREFETCH: # 跳过
            return self._blob_queue.get()
            db_inds = self._get_next_minibatch_inds() # 读取 next minibatch 的 index
            minibatch_db = [self._roidb[i] for i in db_inds] # 读取一个 mini-batch 的 roidb
            return get_minibatch(minibatch_db, self._num_classes) # 读取 mini-batch 的数据

    def setup(self, bottom, top):
        """Setup the RoIDataLayer."""

        # parse the layer parameter string, which must be valid YAML
        layer_params = yaml.load(self.param_str_)

        self._num_classes = layer_params['num_classes']

        self._name_to_top_map = {}

        # data blob: holds a batch of N images, each with 3 channels
        idx = 0
        top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3,
            max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
        self._name_to_top_map['data'] = idx
        idx += 1

        if cfg.TRAIN.HAS_RPN:
            top[idx].reshape(1, 3)
            self._name_to_top_map['im_info'] = idx
            idx += 1

            top[idx].reshape(1, 4)
            self._name_to_top_map['gt_boxes'] = idx
            idx += 1
        else: # not using RPN
            # rois blob: holds R regions of interest, each is a 5-tuple
            # (n, x1, y1, x2, y2) specifying an image batch index n and a
            # rectangle (x1, y1, x2, y2)
            top[idx].reshape(1, 5)
            self._name_to_top_map['rois'] = idx
            idx += 1

            # labels blob: R categorical labels in [0, ..., K] for K foreground
            # classes plus background
            self._name_to_top_map['labels'] = idx
            idx += 1

            if cfg.TRAIN.BBOX_REG:
                # bbox_targets blob: R bounding-box regression targets with 4
                # targets per class
                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_targets'] = idx
                idx += 1

                # bbox_inside_weights blob: At most 4 targets per roi are active;
                # thisbinary vector sepcifies the subset of active targets
                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_inside_weights'] = idx
                idx += 1

                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_outside_weights'] = idx
                idx += 1

        print 'RoiDataLayer: name_to_top:', self._name_to_top_map
        assert len(top) == len(self._name_to_top_map)

    def forward(self, bottom, top):
        """Get blobs and copy them into this layer's top blob vector."""
        blobs = self._get_next_minibatch() 
        # blobs - data, im_info, gt_boxes

        for blob_name, blob in blobs.iteritems():
            top_ind = self._name_to_top_map[blob_name]
            # Reshape net's input blobs
            # Copy data into net's input blobs
            top[top_ind].data[...] = blob.astype(np.float32, copy=False) # blobs 到 top 输出

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""

这里 get_minibatch 读取 mini-batch 的图片数据和 roidb 数据,

def get_minibatch(roidb, num_classes):
    """Given a roidb, construct a minibatch sampled from it.

    Returns a dict of blobs: always 'data'; with RPN also 'gt_boxes' and
    'im_info'; without RPN 'rois', 'labels' and, when box regression is
    enabled, the three bbox_* blobs.
    """
    num_images = len(roidb) # number of samples in the mini-batch
    # Sample random scales to use for each image in this batch
    random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
                                    size=num_images)
    assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
        'num_images ({}) must divide BATCH_SIZE ({})'. \
        format(num_images, cfg.TRAIN.BATCH_SIZE)
    rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images # RoIs per image
    # Number of RoIs per image labelled foreground (class > 0);
    # the walkthrough notes cfg.TRAIN.FG_FRACTION = 0.25
    fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)

    # Get the input image blob, formatted for caffe
    im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)

    blobs = {'data': im_blob} # data blob

    if cfg.TRAIN.HAS_RPN:
        # An RPN mini-batch holds exactly one image at one scale
        assert len(im_scales) == 1, "Single batch only"
        assert len(roidb) == 1, "Single batch only"
        # gt boxes: (x1, y1, x2, y2, cls)
        gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
        gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
        gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
        gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
        blobs['gt_boxes'] = gt_boxes
        blobs['im_info'] = np.array(
            [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]],
            dtype=np.float32)
    else: # not using RPN
        # Now, build the region of interest and label blobs
        rois_blob = np.zeros((0, 5), dtype=np.float32)
        labels_blob = np.zeros((0), dtype=np.float32)
        bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
        bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
        for im_i in xrange(num_images):
            labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \
                = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
                               num_classes)

            # Add to RoIs blob
            rois = _project_im_rois(im_rois, im_scales[im_i])
            batch_ind = im_i * np.ones((rois.shape[0], 1))
            rois_blob_this_image = np.hstack((batch_ind, rois))
            rois_blob = np.vstack((rois_blob, rois_blob_this_image))

            # Add to labels, bbox targets, and bbox loss blobs
            labels_blob = np.hstack((labels_blob, labels))
            bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
            bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights))

        blobs['rois'] = rois_blob
        blobs['labels'] = labels_blob

        if cfg.TRAIN.BBOX_REG:
            blobs['bbox_targets'] = bbox_targets_blob
            blobs['bbox_inside_weights'] = bbox_inside_blob
            # Outside weights are 1 wherever an inside weight is active
            blobs['bbox_outside_weights'] = \
                np.array(bbox_inside_blob > 0).astype(np.float32)

    return blobs
def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
    scales.

    Returns (blob, im_scales): the stacked image blob and the list of
    scale factors applied to each image.
    """
    num_images = len(roidb)
    processed_ims = []
    im_scales = []
    for i in xrange(num_images):
        # This is the first point where image pixel data is actually read
        im = cv2.imread(roidb[i]['image'])
        if roidb[i]['flipped']:
            # Mirror horizontally (reverse the column axis)
            im = im[:, ::-1, :]
        # Target length of the shorter image side (600 per the walkthrough)
        target_size = cfg.TRAIN.SCALES[scale_inds[i]]
        im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
                                        cfg.TRAIN.MAX_SIZE)
        im_scales.append(im_scale)
        processed_ims.append(im)

    # Create a blob to hold the input images
    # (converts the list of images into the network's data blob)
    blob = im_list_to_blob(processed_ims)

    return blob, im_scales

def prep_im_for_blob(im, pixel_means, target_size, max_size):
    """Mean subtract and scale an image for use in a blob.

    The image is scaled so its shorter side equals target_size, unless
    that would make the longer side exceed max_size, in which case the
    longer side is scaled to max_size instead.

    Returns (im, im_scale): the resized float32 image and the scale used.
    """
    im = im.astype(np.float32, copy=False)
    im -= pixel_means # subtract the mean
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min)
    # Prevent the biggest axis from being more than MAX_SIZE
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)

    return im, im_scale

def im_list_to_blob(ims):
    """Convert a list of images into a network input.

    Assumes images are already prepared (means subtracted, BGR order, ...).

    Images are placed in the top-left corner of a zero-filled float32
    array sized to the largest height and width in the list. The result
    has axis order (batch elem, channel, height, width).
    """
    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    num_images = len(ims)
    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    for i, im in enumerate(ims):
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
    # Move channels (axis 3) to axis 1
    # Axis order will become: (batch elem, channel, height, width)
    channel_swap = (0, 3, 1, 2)
    blob = blob.transpose(channel_swap)
    return blob



  • 读取 pascal voc 提供的数据 到roidb;
  • 添加 bbox_regression_targets 到 roidb;
  • 开始训练,读取一个 mini-batch 的数据层 blobs,读取图片数据和 gt_boxes 数据.
