CenterNet(二) 源码解读之bounding box 检测

我们接着上一篇文章CenterNet(一)论文解读, 来了解一下作者具体的代码是如何实现的吧。
这里我们可以下载代码, 地址: Github CenterNet




  1. CenterNet/src/main.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import _init_paths

import os

import torch
from opts import opts
from models.model import create_model, load_model, save_model
from models.data_parallel import DataParallel
from logger import Logger
from datasets.dataset_factory import get_dataset
from trains.train_factory import train_factory

def main(opt):
  """Train a CenterNet model, or only evaluate it when ``opt.test`` is set.

  Args:
    opt: Parsed options (see ``opts().parse()``). The object is replaced by
      the result of ``update_dataset_info_and_set_heads`` and gains a
      ``device`` attribute.
  """
  # Let cuDNN auto-tune its algorithms unless disabled or running in test mode.
  torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
  Dataset = get_dataset(opt.dataset, opt.task)
  opt = opts().update_dataset_info_and_set_heads(opt, Dataset)

  logger = Logger(opt)

  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
  print('Creating model...')
  model = create_model(opt.arch, opt.heads, opt.head_conv)
  # NOTE(review): the `opt.lr` arguments below were lost when the article was
  # extracted; restored to match the upstream training script - confirm.
  optimizer = torch.optim.Adam(model.parameters(), opt.lr)
  start_epoch = 0
  if opt.load_model != '':
    model, optimizer, start_epoch = load_model(
      model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

  Trainer = train_factory[opt.task]
  trainer = Trainer(opt, model, optimizer)
  trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

  print('Setting up data...')
  # NOTE(review): the DataLoader construction (everything except the Dataset
  # argument) was lost in extraction; the keyword values follow the upstream
  # script - confirm before relying on them.
  val_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'val'),
      batch_size=1,
      shuffle=False,
      num_workers=1,
      pin_memory=True
  )

  if opt.test:
    _, preds = trainer.val(0, val_loader)
    val_loader.dataset.run_eval(preds, opt.save_dir)
    # Test mode only evaluates; do not fall through into training.
    return

  train_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'train'),
      batch_size=opt.batch_size,
      shuffle=True,
      num_workers=opt.num_workers,
      pin_memory=True,
      drop_last=True
  )

  print('Starting training...')
  best = 1e10
  for epoch in range(start_epoch + 1, opt.num_epochs + 1):
    # With save_all every validated epoch gets its own checkpoint file,
    # otherwise 'model_last.pth' is overwritten.
    mark = epoch if opt.save_all else 'last'
    log_dict_train, _ = trainer.train(epoch, train_loader)
    logger.write('epoch: {} |'.format(epoch))
    for k, v in log_dict_train.items():
      logger.scalar_summary('train_{}'.format(k), v, epoch)
      logger.write('{} {:8f} | '.format(k, v))
    if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                 epoch, model, optimizer)
      with torch.no_grad():
        log_dict_val, preds = trainer.val(epoch, val_loader)
      for k, v in log_dict_val.items():
        logger.scalar_summary('val_{}'.format(k), v, epoch)
        logger.write('{} {:8f} | '.format(k, v))
      # Lower metric is better; the best checkpoint is saved without optimizer.
      if log_dict_val[opt.metric] < best:
        best = log_dict_val[opt.metric]
        save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                   epoch, model)
    else:
      # NOTE(review): this `else:` was lost in extraction; without it no
      # checkpoint is written on epochs that skip validation - confirm.
      save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                 epoch, model, optimizer)
    # Terminate this epoch's log line.
    logger.write('\n')
    if epoch in opt.lr_step:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                 epoch, model, optimizer)
      # Step decay: 10x smaller for every milestone reached so far.
      lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
      print('Drop LR to', lr)
      for param_group in optimizer.param_groups:
          param_group['lr'] = lr

if __name__ == '__main__':
  # Example arguments quoted by the article:
  #   ctdet --exp_id coco_dla --batch_size 64 --master_batch 32 --lr 1.25e-4 --gpu 1,2,3 --num_workers 32
  opt = opts().parse()
  main(opt)

a. torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test benchmark = True 自动寻找最适合当前配置的高效算法,来达到优化运行效率的问题
b. Dataset = get_dataset(opt.dataset, opt.task)获取训练特定任务模型需要数据,
c. opt = opts().update_dataset_info_and_set_heads(opt, Dataset)更新数据等配置,设置模型输出heads。比如我们需要bounding box识别任务, 我们就需要设置三个输出hm, wh, reg.

    elif opt.task == 'ctdet':
      # assert opt.dataset in ['pascal', 'coco']
      opt.heads = {'hm': opt.num_classes,
                   'wh': 2 if not opt.cat_spec_wh else 2 * opt.num_classes}
      if opt.reg_offset:
        opt.heads.update({'reg': 2})

这里opt.cat_spec_wh作者仅仅是实验中使用, 并发现效果没有提升。

作者这样说的 We never used cat_spec_wh is the experiments.
I have tried once this on Pascal VOC but it doesn't give improvement. Feel free to try it on COCO yourself.

这里面最主要的部分莫过于Dataset = get_dataset(opt.dataset, opt.task)以及model = create_model(opt.arch, opt.heads, opt.head_conv)。下面我们来好好分析一下。

  1. CenterNet/src/lib/datasets/dataset_factory.py以及 CenterNet/src/lib/datasets/dataset/coco.py、CenterNet/src/lib/datasets/sample/ctdet.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from .sample.ddd import DddDataset
from .sample.exdet import EXDetDataset
from .sample.ctdet import CTDetDataset
from .sample.multi_pose import MultiPoseDataset

from .dataset.coco import COCO
from .dataset.pascal import PascalVOC
from .dataset.kitti import KITTI
from .dataset.coco_hp import COCOHP
from import Clothing

# Dataset definitions, looked up by dataset name.
dataset_factory = {
  'coco': COCO,
  'pascal': PascalVOC,
  'kitti': KITTI,
  'coco_hp': COCOHP,
  'clothing': Clothing,
}

# Task-specific sample classes, looked up by task name.
_sample_factory = {
  'exdet': EXDetDataset,
  'ctdet': CTDetDataset,
  'ddd': DddDataset,
  'multi_pose': MultiPoseDataset,
}

def get_dataset(dataset, task):
  """Return a class that combines a dataset definition with a task sampler.

  Args:
    dataset: Key into ``dataset_factory`` (e.g. ``'coco'``).
    task: Key into ``_sample_factory`` (e.g. ``'ctdet'``).

  Returns:
    A new class inheriting from both looked-up classes, dataset class first.

  Raises:
    KeyError: If either key is not registered.
  """
  class Dataset(dataset_factory[dataset], _sample_factory[task]):
    pass
  return Dataset



from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pycocotools.coco as coco
from pycocotools.cocoeval import COCOeval
import numpy as np
import json
import os

import torch.utils.data as data

class COCO(data.Dataset):
  """COCO 2017 detection dataset: metadata, annotation index and evaluation.

  This class defines no ``__getitem__``; it only holds the dataset constants,
  loads the annotation index and converts/evaluates detection results.
  """
  num_classes = 80
  default_resolution = [512, 512]
  # Per-channel normalisation statistics, shaped for broadcasting over H x W x 3.
  mean = np.array([0.40789654, 0.44719302, 0.47026115],
                   dtype=np.float32).reshape(1, 1, 3)
  std  = np.array([0.28863828, 0.27408164, 0.27809835],
                   dtype=np.float32).reshape(1, 1, 3)

  def __init__(self, opt, split):
    """Load the annotation index for one split.

    Args:
      opt: Option object; ``data_dir`` and ``task`` are read here and the
        whole object is kept as ``self.opt``.
      split: Split name, e.g. ``'train'``, ``'val'`` or ``'test'``.
    """
    super(COCO, self).__init__()
    self.data_dir = os.path.join(opt.data_dir, 'coco')  # dataset root
    self.img_dir = os.path.join(self.data_dir, '{}2017'.format(split))  # images of this split
    # NOTE(review): the annotation file names and the `else:` lines were lost
    # when the article was extracted; they are restored from the upstream
    # CenterNet repository - confirm against your checkout.
    if split == 'test':
      self.annot_path = os.path.join(
          self.data_dir, 'annotations',
          'image_info_test-dev2017.json').format(split)
    else:
      if opt.task == 'exdet':
        self.annot_path = os.path.join(
          self.data_dir, 'annotations',
          'instances_extreme_{}2017.json').format(split)
      else:
        self.annot_path = os.path.join(
          self.data_dir, 'annotations',
          'instances_{}2017.json').format(split)
    self.max_objs = 100
    self.class_name = [
      '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
      'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
      'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
      'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
      'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
      'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
      'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass',
      'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
      'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
      'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
      'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
      'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
      'scissors', 'teddy bear', 'hair drier', 'toothbrush']
    # Category ids used by the annotations; the list is not contiguous.
    self._valid_ids = [
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13,
      14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
      24, 25, 27, 28, 31, 32, 33, 34, 35, 36,
      37, 38, 39, 40, 41, 42, 43, 44, 46, 47,
      48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
      58, 59, 60, 61, 62, 63, 64, 65, 67, 70,
      72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
      82, 84, 85, 86, 87, 88, 89, 90]
    # Map each category id to its 0-based position in _valid_ids.
    self.cat_ids = {v: i for i, v in enumerate(self._valid_ids)}
    self.voc_color = [(v // 32 * 64 + 64, (v // 8) % 4 * 64, v % 8 * 32) \
                      for v in range(1, self.num_classes + 1)]
    self._data_rng = np.random.RandomState(123)
    # Eigenvalues / eigenvectors used by the colour augmentation applied
    # later during image augmentation.
    self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571],
                             dtype=np.float32)
    self._eig_vec = np.array([
        [-0.58752847, -0.69563484, 0.41340352],
        [-0.5832747, 0.00994535, -0.81221408],
        [-0.56089297, 0.71832671, 0.41158938]
    ], dtype=np.float32)

    self.split = split
    self.opt = opt

    print('==> initializing coco 2017 {} data.'.format(split))
    self.coco = coco.COCO(self.annot_path)
    self.images = self.coco.getImgIds()
    self.num_samples = len(self.images)

    print('Loaded {} {} samples'.format(split, self.num_samples))

  def _to_float(self, x):
    """Round ``x`` to two decimals and return it as a Python float."""
    return float("{:.2f}".format(x))

  def convert_eval_format(self, all_bboxes):
    """Flatten per-image, per-class boxes into a list of result dicts.

    Args:
      all_bboxes: ``{image_id: {cls_ind: boxes}}`` where ``cls_ind`` is
        1-based and each box is ``[x1, y1, x2, y2, score, ...]``; entries
        5..12, when present, are emitted as ``extreme_points``.

    Returns:
      A list of dicts with ``image_id``, ``category_id``, ``bbox`` (as
      ``[x, y, width, height]``) and ``score``.
    """
    detections = []
    for image_id in all_bboxes:
      for cls_ind in all_bboxes[image_id]:
        category_id = self._valid_ids[cls_ind - 1]
        for bbox in all_bboxes[image_id][cls_ind]:
          # Convert corners to x/y/width/height without modifying the
          # caller's data, so the method can safely be called twice.
          x1, y1, x2, y2 = bbox[0:4]
          score = bbox[4]
          bbox_out = list(map(self._to_float, (x1, y1, x2 - x1, y2 - y1)))

          detection = {
              "image_id": int(image_id),
              "category_id": int(category_id),
              "bbox": bbox_out,
              "score": float("{:.2f}".format(score))
          }
          if len(bbox) > 5:
              extreme_points = list(map(self._to_float, bbox[5:13]))
              detection["extreme_points"] = extreme_points
          detections.append(detection)
    return detections

  def __len__(self):
    """Return the number of images in this split."""
    return self.num_samples

  def save_results(self, results, save_dir):
    """Write the converted detections to ``<save_dir>/results.json``."""
    with open('{}/results.json'.format(save_dir), 'w') as f:
      json.dump(self.convert_eval_format(results), f)

  def run_eval(self, results, save_dir):
    """Save the detections and run the COCO bounding-box evaluation on them."""
    self.save_results(results, save_dir)
    coco_dets = self.coco.loadRes('{}/results.json'.format(save_dir))
    coco_eval = COCOeval(self.coco, coco_dets, "bbox")
    # NOTE(review): these three calls were lost in extraction; they are the
    # standard pycocotools evaluation sequence - confirm.
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()


from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import torch.utils.data as data
import numpy as np
import torch
import json
import cv2
import os
from utils.image import flip, color_aug
from utils.image import get_affine_transform, affine_transform
from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian
from utils.image import draw_dense_reg
import math

class CTDetDataset(data.Dataset):
  """Builds CenterNet bounding-box samples: network input plus training targets.

  The class reads ``images``, ``coco``, ``img_dir``, ``max_objs``,
  ``num_classes``, ``cat_ids``, ``mean``, ``std``, ``split``, ``opt``,
  ``_data_rng``, ``_eig_val`` and ``_eig_vec`` from ``self``; none of them is
  defined here, so they must be supplied by the class it is combined with.
  """

  def _coco_box_to_bbox(self, box):
    """Convert a ``[x, y, w, h]`` box to a float32 ``[x1, y1, x2, y2]`` array."""
    bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
                    dtype=np.float32)
    return bbox

  def _get_border(self, border, size):
    """Return ``border`` halved until twice its value is smaller than ``size``.

    For example with ``border=128`` an image side above 256 keeps 128, while
    a side of 256 or less yields 64 (or smaller for very small sides).
    """
    i = 1
    while size - border // i <= border // i:
        i *= 2
    return border // i

  def __getitem__(self, index):
    """Load one image and build its targets.

    Returns:
      A dict with ``input`` (normalised C x H x W image), ``hm`` (per-class
      centre heatmap), ``reg_mask``, ``ind`` and ``wh``. Depending on the
      options, ``wh`` is replaced by ``dense_wh``/``dense_wh_mask`` or
      ``cat_spec_wh``/``cat_spec_mask``, ``reg`` is added, and ``meta`` is
      added outside training or when debugging.
    """
    img_id = self.images[index]
    file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
    img_path = os.path.join(self.img_dir, file_name)
    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
    anns = self.coco.loadAnns(ids=ann_ids)
    # Number of annotated objects used, capped at max_objs.
    num_objs = min(len(anns), self.max_objs)
    img = cv2.imread(img_path)

    height, width = img.shape[0], img.shape[1]
    # Image centre (x, y).
    c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
    if self.opt.keep_res:
      # presumably opt.pad is 2**k - 1, so this rounds each side up to a
      # multiple of pad + 1 - TODO confirm
      input_h = (height | self.opt.pad) + 1
      input_w = (width | self.opt.pad) + 1
      s = np.array([input_w, input_h], dtype=np.float32)
    else:
      # Scale is the longer image side; the input size comes from the options
      # (512 x 512 in the article's setup).
      s = max(img.shape[0], img.shape[1]) * 1.0
      input_h, input_w = self.opt.input_h, self.opt.input_w

    flipped = False
    if self.split == 'train':
      if not self.opt.not_rand_crop:
        # Random scale and a random crop centre kept away from the borders.
        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
        w_border = self._get_border(128, img.shape[1])
        h_border = self._get_border(128, img.shape[0])
        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
      else:
        # Random shift of the centre and random rescale, both clipped.
        sf = self.opt.scale
        cf = self.opt.shift
        c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)

      if np.random.random() < self.opt.flip:
        # Horizontal flip: mirror the image and the crop centre.
        flipped = True
        img = img[:, ::-1, :]
        c[0] =  width - c[0] - 1

    # Affine warp of the image into the network input frame.
    trans_input = get_affine_transform(
      c, s, 0, [input_w, input_h])
    inp = cv2.warpAffine(img, trans_input,
                         (input_w, input_h),
                         flags=cv2.INTER_LINEAR)
    inp = (inp.astype(np.float32) / 255.)
    if self.split == 'train' and not self.opt.no_color_aug:
      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
    # Normalise, then reorder H x W x C -> C x H x W.
    inp = (inp - self.mean) / self.std
    inp = inp.transpose(2, 0, 1)

    # Targets live on the down-sampled output grid (e.g. 512 // 4 = 128).
    output_h = input_h // self.opt.down_ratio
    output_w = input_w // self.opt.down_ratio
    num_classes = self.num_classes
    trans_output = get_affine_transform(c, s, 0, [output_w, output_h])

    # Per-class centre heatmap.
    hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)
    # Box width/height per object slot.
    wh = np.zeros((self.max_objs, 2), dtype=np.float32)
    dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)
    # Sub-pixel centre offset lost by rounding to the output grid.
    reg = np.zeros((self.max_objs, 2), dtype=np.float32)
    # Flattened output-grid index of each object's centre.
    ind = np.zeros((self.max_objs), dtype=np.int64)
    # 1 for object slots that hold a real object.
    reg_mask = np.zeros((self.max_objs), dtype=np.uint8)
    cat_spec_wh = np.zeros((self.max_objs, num_classes * 2), dtype=np.float32)
    cat_spec_mask = np.zeros((self.max_objs, num_classes * 2), dtype=np.uint8)

    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
                    draw_umich_gaussian

    gt_det = []
    for k in range(num_objs):
      ann = anns[k]
      bbox = self._coco_box_to_bbox(ann['bbox'])
      cls_id = int(self.cat_ids[ann['category_id']])
      if flipped:
        bbox[[0, 2]] = width - bbox[[2, 0]] - 1
      # Map both corners onto the output grid and clip to its bounds.
      bbox[:2] = affine_transform(bbox[:2], trans_output)
      bbox[2:] = affine_transform(bbox[2:], trans_output)
      bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
      bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
      h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
      if h > 0 and w > 0:
        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
        radius = max(0, int(radius))
        radius = self.opt.hm_gauss if self.opt.mse_loss else radius
        ct = np.array(
          [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
        ct_int = ct.astype(np.int32)
        draw_gaussian(hm[cls_id], ct_int, radius)
        wh[k] = 1. * w, 1. * h
        ind[k] = ct_int[1] * output_w + ct_int[0]
        # e.g. [98.97667, 2.3566666] - [98, 2] = [0.97667, 0.3566666]
        reg[k] = ct - ct_int
        reg_mask[k] = 1
        cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
        cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
        if self.opt.dense_wh:
          draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
        gt_det.append([ct[0] - w / 2, ct[1] - h / 2,
                       ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])

    ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh}
    if self.opt.dense_wh:
      hm_a = hm.max(axis=0, keepdims=True)
      dense_wh_mask = np.concatenate([hm_a, hm_a], axis=0)
      ret.update({'dense_wh': dense_wh, 'dense_wh_mask': dense_wh_mask})
      del ret['wh']
    elif self.opt.cat_spec_wh:
      ret.update({'cat_spec_wh': cat_spec_wh, 'cat_spec_mask': cat_spec_mask})
      del ret['wh']
    if self.opt.reg_offset:
      ret.update({'reg': reg})
    if self.opt.debug > 0 or not self.split == 'train':
      gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \
               np.zeros((1, 6), dtype=np.float32)
      meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_id}
      ret['meta'] = meta
    return ret

我们还是从def __getitem__(self, index)函数为入口。这里我们可以得到我们的输出参数, 分别是input, hm, reg_mask, ind, wh(以及可选的reg)。

    img_id = self.images[index]
    file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
    img_path = os.path.join(self.img_dir, file_name)
    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
    anns = self.coco.loadAnns(ids=ann_ids)
    num_objs = min(len(anns), self.max_objs)      # 目标个数,这里为100
    img = cv2.imread(img_path)

分别获取图片的img_id, 并根据img_id获取图片名称及地址; 再根据img_id获取标注的ann_ids, 并借此获取对应的标签。这里需要强调的是, num_objs为一张图片中实际使用的标注目标个数, 即min(len(anns), self.max_objs): 标注数超过self.max_objs时只取前max_objs个。max_objs可以理解为超参数, 这里默认设置为100。
同时我们也获取图片中心点的坐标c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)


    if self.opt.keep_res: # False
      input_h = (height | self.opt.pad) + 1
      input_w = (width | self.opt.pad) + 1
      s = np.array([input_w, input_h], dtype=np.float32)
    else: # True
      s = max(img.shape[0], img.shape[1]) * 1.0 # s最长的边长
      input_h, input_w = self.opt.input_h, self.opt.input_w # 512, 512


flipped = False
    if self.split == 'train':
      if not self.opt.not_rand_crop:
        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1)) # 随机尺度
        w_border = self._get_border(128, img.shape[1])
        h_border = self._get_border(128, img.shape[0])
        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
        sf = self.opt.scale
        cf = self.opt.shift
        c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
      if np.random.random() < self.opt.flip:
        flipped = True
        img = img[:, ::-1, :]
        c[0] =  width - c[0] - 1 # 随机裁剪

    trans_input = get_affine_transform(
      c, s, 0, [input_w, input_h])
    inp = cv2.warpAffine(img, trans_input, 
                         (input_w, input_h),
                         flags=cv2.INTER_LINEAR)# 放射变换
    inp = (inp.astype(np.float32) / 255.)
    if self.split == 'train' and not self.opt.no_color_aug:
      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
    # 归一化
    inp = (inp - self.mean) / self.std
    inp = inp.transpose(2, 0, 1)


    output_h = input_h // self.opt.down_ratio # 输出512//4=128
    output_w = input_w // self.opt.down_ratio
    num_classes = self.num_classes # num_classes=80
    trans_output = get_affine_transform(c, s, 0, [output_w, output_h])

    hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32) # heatmap(80,128,128)
    wh = np.zeros((self.max_objs, 2), dtype=np.float32) # 中心点宽高(32*2)
    dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)# 返回2*128*128
    reg = np.zeros((self.max_objs, 2), dtype=np.float32) # 记录下采样带来的误差,返回32*2的小数
    ind = np.zeros((self.max_objs), dtype=np.int64) # 返回32个ind
    reg_mask = np.zeros((self.max_objs), dtype=np.uint8)# 返回100个 回归mask
    cat_spec_wh = np.zeros((self.max_objs, num_classes * 2), dtype=np.float32) # 32*80*2
    cat_spec_mask = np.zeros((self.max_objs, num_classes * 2), dtype=np.uint8) # 32*80*2
    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
                    draw_umich_gaussian

这里mse_loss为False, 所以我们只需要关注draw_umich_gaussian函数即可。

    gt_det = []
    for k in range(num_objs):
      ann = anns[k]
      bbox = self._coco_box_to_bbox(ann['bbox'])
      cls_id = int(self.cat_ids[ann['category_id']])
      if flipped:
        bbox[[0, 2]] = width - bbox[[2, 0]] - 1
      bbox[:2] = affine_transform(bbox[:2], trans_output)
      bbox[2:] = affine_transform(bbox[2:], trans_output)
      bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
      bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
      h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
      if h > 0 and w > 0:
        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
        radius = max(0, int(radius))
        radius = self.opt.hm_gauss if self.opt.mse_loss else radius
        ct = np.array(
          [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
        ct_int = ct.astype(np.int32)
        draw_gaussian(hm[cls_id], ct_int, radius)
        #cv2.imwrite("/data/humaocheng/CenterNet-master/single_heatmap.jpg", hm[0]*255)
        wh[k] = 1. * w, 1. * h # 目标矩形框的宽高——目标尺寸损失
        ind[k] = ct_int[1] * output_w + ct_int[0] # 目标中心点在128×128特征图中的索引
        reg[k] = ct - ct_int # off Loss, # ct 即 center point reg是偏置回归数组,存放每个中心店的偏置值 k是当前图中第k个目标
        # 实际例子为
        # [98.97667 2.3566666] - [98  2] = [0.97667, 0.3566666]
        reg_mask[k] = 1 # 有目标的位置的mask
        cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
        cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
        if self.opt.dense_wh:
          draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
        gt_det.append([ct[0] - w / 2, ct[1] - h / 2, 
                       ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])

我们来看下这个代码是如何画出heatmap的。这里的关键点在于如何求出高斯半径。下面展示求解高斯半径的函数。这个函数也是照搬CornerNet的方式求解的, 这里的理解可以参考CornerNet半径求解。CornerNet里面是这么解释的: 半径的设置需要满足半径内的点组成的box与gt box之间的IoU达到某个阈值(论文中为0.3; 注意下面代码中min_overlap的默认值为0.7)。

We determine the radius by the size of an object by ensuring that a pair of points within the radius would generate a bounding box with at least t IoU with the ground-truth annotation (we set to 0.3 in all experiments)

def gaussian_radius(det_size, min_overlap=0.7):
  """Return the heatmap Gaussian radius for a box of the given size.

  Three quadratic expressions in the radius are evaluated and the smallest
  result is returned.

  Args:
    det_size: ``(height, width)`` of the box.
    min_overlap: Overlap threshold the three expressions are built from.

  Returns:
    The smallest of the three candidate radii.
  """
  height, width = det_size
  side_sum = height + width

  def _candidate(a, b, c):
    # NOTE(review): the divisor is 2, not 2 * a, even where a != 1; this is
    # kept exactly as in the original - confirm whether it is intended.
    return (b + np.sqrt(b ** 2 - 4 * a * c)) / 2

  candidates = (
    _candidate(1,
               side_sum,
               width * height * (1 - min_overlap) / (1 + min_overlap)),
    _candidate(4,
               2 * side_sum,
               (1 - min_overlap) * width * height),
    _candidate(4 * min_overlap,
               -2 * min_overlap * side_sum,
               (min_overlap - 1) * width * height),
  )
  return min(candidates)

这样的话我们通过def draw_umich_gaussian画出我们想要的heatmap.

def draw_umich_gaussian(heatmap, center, radius, k=1):
  """Stamp a Gaussian peak onto ``heatmap`` in place, keeping the maximum.

  Args:
    heatmap: 2-D array to draw on; modified in place.
    center: ``(x, y)`` position of the peak.
    radius: Half-size of the stamped window, in cells.
    k: Multiplier applied to the Gaussian before merging.

  Returns:
    The same ``heatmap`` object.
  """
  size = 2 * radius + 1
  kernel = gaussian2D((size, size), sigma=size / 6)
  cx, cy = int(center[0]), int(center[1])

  rows, cols = heatmap.shape[0:2]
  # How far the window may extend in each direction before leaving the map.
  reach_left = min(cx, radius)
  reach_right = min(cols - cx, radius + 1)
  reach_up = min(cy, radius)
  reach_down = min(rows - cy, radius + 1)

  target = heatmap[cy - reach_up:cy + reach_down,
                   cx - reach_left:cx + reach_right]
  patch = kernel[radius - reach_up:radius + reach_down,
                 radius - reach_left:radius + reach_right]
  # Skip degenerate (empty) windows, e.g. a centre outside the map.
  if min(patch.shape) > 0 and min(target.shape) > 0:
    # `target` is a view, so writing into it updates `heatmap`.
    np.maximum(target, patch * k, out=target)
  return heatmap

这里的center其实是bounding box的中心点。np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)相当于不断地在heatmap基础上添加关键点的高斯, 即同一种类型的框会在heatmap对应的类别通道上不断添加。最终通过函数总体的for循环, 相当于不断将目标画到heatmap上面, 生成我们第二个输出参数hm。

上面的代码已经介绍了我们的输出参数input以及hm, 还有wh、ind以及reg这三个参数, 我们可以看下代码。

        wh[k] = 1. * w, 1. * h # 目标矩形框的宽高——目标尺寸损失
        ind[k] = ct_int[1] * output_w + ct_int[0] # 目标中心点在128×128特征图中的索引
        reg[k] = ct - ct_int # off Loss, # ct 即 center point reg是偏置回归数组,存放每个中心店的偏置值 k是当前图中第k个目标
        # 实际例子为
        # [98.97667 2.3566666] - [98  2] = [0.97667, 0.3566666]
        reg_mask[k] = 1 # 有目标的位置的mask
        cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
        cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
        if self.opt.dense_wh:
          draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
        gt_det.append([ct[0] - w / 2, ct[1] - h / 2, 
                       ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])



这样基本上我们就把ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh}这个参数介绍完成。
最后进行ret更新ret.update({'reg': reg}), 即把reg这个item添加到ret字典中去。

  1. 下面我们来看下模型的创建函数create_model(位于models/model.py)
from .networks.msra_resnet import get_pose_net
from .networks.dlav0 import get_pose_net as get_dlav0
from .networks.pose_dla_dcn import get_pose_net as get_dla_dcn
from .networks.resnet_dcn import get_pose_net as get_pose_net_dcn
from .networks.large_hourglass import get_large_hourglass_net

# Backbone constructors, looked up by the architecture name (the part of
# `arch` before the first underscore).
_model_factory = {
  'res': get_pose_net, # default Resnet with deconv
  'dlav0': get_dlav0, # default DLAup
  'dla': get_dla_dcn,
  'resdcn': get_pose_net_dcn,
  'hourglass': get_large_hourglass_net,
}

def create_model(arch, heads, head_conv):
  """Instantiate the network named by ``arch``.

  Args:
    arch: ``'<name>'`` or ``'<name>_<layers>'``, e.g. ``'dla_34'``; without
      an underscore the layer count passed on is 0.
    heads: Passed through to the constructor as ``heads``.
    head_conv: Passed through to the constructor as ``head_conv``.

  Returns:
    Whatever the constructor registered under ``<name>`` returns.
  """
  # Split on the first underscore only: name before it, layer count after.
  family, separator, depth = arch.partition('_')
  num_layers = int(depth) if separator else 0
  build = _model_factory[family]
  return build(num_layers=num_layers, heads=heads, head_conv=head_conv)

从代码可以看出我们是获取了pose_net。这里需要我们去看一下CenterNet\src\lib\models\networks\pose_dla_dcn.py文件。在这里就不做过多的解释特征提取模型了,提供一个链接可以帮助我们理解这个模型cvpr2018 Deep Layer Aggregation(DLANet)。主要结构如下:

DLA structure


for head in self.heads:
    classes = self.heads[head]
    if head_conv > 0:
        fc = nn.Sequential(
            nn.Conv2d(channels[self.first_level], head_conv,
                      kernel_size=3, padding=1, bias=True),
            nn.Conv2d(head_conv, classes,
                      kernel_size=final_kernel, stride=1,
                      padding=final_kernel // 2, bias=True)

这里的heads = {'hm':3, 'wh':2, 'reg':2}。通过dla模型的输出的通道为5, 先经过$3\times3$的卷积, 输出通道为256, 再经过激活函数(ReLU), 最后再进行$1\times1$卷积输出: 如果是hm则为3通道, wh则为2通道, reg则为2通道。

  1. 最后我们看下loss函数的定义。可以参考CenterNet\src\lib\trains\
  def forward(self, outputs, batch):
    """Compute the weighted detection loss over all network stacks.

    Args:
      outputs: Sequence of per-stack output dicts with ``'hm'``, ``'wh'``
        and, when offsets are regressed, ``'reg'``.
      batch: Target dict; reads ``'hm'``, ``'reg_mask'``, ``'ind'`` and,
        depending on the options, ``'wh'``, ``'dense_wh'``,
        ``'dense_wh_mask'``, ``'cat_spec_wh'``, ``'cat_spec_mask'``, ``'reg'``.

    Returns:
      ``(loss, loss_stats)`` where ``loss_stats`` holds the total and the
      heatmap, size and offset components.
    """
    opt = self.opt
    hm_loss, wh_loss, off_loss = 0, 0, 0
    # Each stack contributes an equal share (num_stacks is 1 in the
    # article's setup).
    for s in range(opt.num_stacks):
      output = outputs[s]
      if not opt.mse_loss:
        output['hm'] = _sigmoid(output['hm'])

      # Oracle switches replace predictions with ground truth for analysis.
      if opt.eval_oracle_hm:
        output['hm'] = batch['hm']
      if opt.eval_oracle_wh:
        output['wh'] = torch.from_numpy(gen_oracle_map(
          output['wh'].shape[3], output['wh'].shape[2])).to(opt.device)
      if opt.eval_oracle_offset:
        output['reg'] = torch.from_numpy(gen_oracle_map(
          output['reg'].shape[3], output['reg'].shape[2])).to(opt.device)

      hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
      if opt.wh_weight > 0:
        if opt.dense_wh:
          mask_weight = batch['dense_wh_mask'].sum() + 1e-4
          wh_loss += (
            self.crit_wh(output['wh'] * batch['dense_wh_mask'],
            batch['dense_wh'] * batch['dense_wh_mask']) /
            mask_weight) / opt.num_stacks
        elif opt.cat_spec_wh:
          wh_loss += self.crit_wh(
            output['wh'], batch['cat_spec_mask'],
            batch['ind'], batch['cat_spec_wh']) / opt.num_stacks
        else:
          # NOTE(review): this `else:` was lost when the article was
          # extracted; without it the default configuration gets no size
          # loss and the cat_spec_wh path gets two - confirm.
          wh_loss += self.crit_reg(
            output['wh'], batch['reg_mask'],
            batch['ind'], batch['wh']) / opt.num_stacks
      if opt.reg_offset and opt.off_weight > 0:
        off_loss += self.crit_reg(output['reg'], batch['reg_mask'],
                             batch['ind'], batch['reg']) / opt.num_stacks
    loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + \
           opt.off_weight * off_loss
    loss_stats = {'loss': loss, 'hm_loss': hm_loss,
                  'wh_loss': wh_loss, 'off_loss': off_loss}
    return loss, loss_stats

 output['hm'] = _sigmoid(output['hm']) 
 hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks

这里使用sigmoid函数, 个人理解认为这是把heatmap的输出归一化到(0, 1)区间, 再进行heatmap loss的计算, 以加快收敛。

elif opt.cat_spec_wh:
    wh_loss += self.crit_wh(
        output['wh'], batch['cat_spec_mask'],
        batch['ind'], batch['cat_spec_wh']) / opt.num_stacks
  def forward(self, output, mask, ind, target):
    """Masked L1 loss between values gathered at ``ind`` and ``target``.

    Args:
      output: Network output map; values are gathered at ``ind``.
      mask: Per-object mask; expanded to the gathered prediction's shape.
      ind: Flattened spatial indices of the objects.
      target: Ground-truth values, same shape as the gathered prediction.

    Returns:
      Sum of absolute errors over unmasked entries divided by the number of
      unmasked entries (plus 1e-4 to avoid division by zero).
    """
    pred = _tranpose_and_gather_feat(output, ind)
    mask = mask.unsqueeze(2).expand_as(pred).float()
    # reduction='sum' is the non-deprecated spelling of size_average=False.
    loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
    loss = loss / (mask.sum() + 1e-4)
    return loss
def _gather_feat(feat, ind, mask=None):
    """Pick rows of ``feat`` along dimension 1 at the positions in ``ind``.

    Args:
        feat: Tensor of shape ``(A, B, C)``.
        ind: Integer tensor of shape ``(A, D)`` indexing dimension 1.
        mask: Optional boolean tensor of shape ``(A, D)``; when given, only
            the selected rows are kept and flattened to ``(-1, C)``.

    Returns:
        A tensor of shape ``(A, D, C)``, or ``(N, C)`` when ``mask`` is given.
    """
    channels = feat.size(2)
    batch, picks = ind.size(0), ind.size(1)
    # Repeat every index across the channel axis so gather returns whole rows.
    row_index = ind.unsqueeze(2).expand(batch, picks, channels)
    gathered = feat.gather(1, row_index)
    if mask is None:
        return gathered
    keep = mask.unsqueeze(2).expand_as(gathered)
    return gathered[keep].view(-1, channels)

def _tranpose_and_gather_feat(feat, ind):
    """Gather per-location feature vectors from a 4-D map.

    Dimension 1 of ``feat`` is moved last, the two middle dimensions are
    flattened into one, and ``_gather_feat`` selects the entries at ``ind``.

    Args:
        feat: 4-D tensor whose dimension 1 holds the values to gather.
        ind: Integer tensor of flattened location indices, one row per sample.

    Returns:
        The result of ``_gather_feat`` on the flattened map.
    """
    channels_last = feat.permute(0, 2, 3, 1).contiguous()
    flattened = channels_last.view(
        channels_last.size(0), -1, channels_last.size(3))
    return _gather_feat(flattened, ind)

我们相当于在ind中记录了目标在heatmap上的地址索引,通过_tranpose_and_gather_feat以及def _gather_feat(feat, ind, mask=None):函数得出我们预测的宽高。 _gather_feat根据ind取出feat中对应的元素
_gather_feat起到的作用是消除各个channel之间的区别, 最终得到的inds是对于所有channel而言的。输入: feat(topk_inds): batch * (cat x K) * 1 (假设输入的是topk_inds和topk_ind)
ind(topk_ind):batch * K
首先将ind扩展一个指标,变为 batch * K * 1, 之后使用gather,将ind对应的值取出来。返回的是index:
feat: batch * K * 1 取值:[0, cat x K - 1]
更一般的情况如下: feat : A * B * C, ind:A * D
首先将ind扩展一个指标,并且expand为dim的大小,变为 A * D * C,其中对于任意的i, j, 数组ind[i, j, :]中所有的元素均相同,等于原来A * D shape的ind[i, j]。 之后使用gather,将ind对应的值取出来。 得到的feat: A * D * C

off_loss += self.crit_reg(output['reg'], batch['reg_mask'],batch['ind'], batch['reg']) / opt.num_stacks

原理和计算wh_loss一样

loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + opt.off_weight * off_loss



from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import _init_paths

import os
import cv2

from opts import opts
from detectors.detector_factory import detector_factory

image_ext = ['jpg', 'jpeg', 'png', 'webp']
video_ext = ['mp4', 'mov', 'avi', 'mkv']
time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']

def demo(opt):
  """Run the detector on a webcam, a video file, an image or an image folder.

  Args:
    opt: Option object. ``opt.demo`` is ``'webcam'``, a path with a video
      extension, a directory of images, or a single image path;
      ``opt.debug`` is raised to at least 1.
  """
  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.debug = max(opt.debug, 1)
  Detector = detector_factory[opt.task]
  detector = Detector(opt)

  # NOTE(review): `cam.read()`, `detector.run(...)`, the `print` calls and
  # the two `else:` lines were lost when the article was extracted; they are
  # restored from the upstream demo script - confirm.
  if opt.demo == 'webcam' or \
    opt.demo[opt.demo.rfind('.') + 1:].lower() in video_ext:
    cam = cv2.VideoCapture(0 if opt.demo == 'webcam' else opt.demo)
    detector.pause = False
    while True:
        ok, img = cam.read()
        if not ok:
            break  # end of stream or camera failure
        cv2.imshow('input', img)
        ret = detector.run(img)
        time_str = ''
        for stat in time_stats:
          time_str = time_str + '{} {:.3f}s |'.format(stat, ret[stat])
        print(time_str)
        if cv2.waitKey(1) == 27:
            break  # esc to quit
    cam.release()
  else:
    if os.path.isdir(opt.demo):
      # Collect every file in the folder with a known image extension.
      image_names = []
      ls = os.listdir(opt.demo)
      for file_name in sorted(ls):
          ext = file_name[file_name.rfind('.') + 1:].lower()
          if ext in image_ext:
              image_names.append(os.path.join(opt.demo, file_name))
    else:
      image_names = [opt.demo]

    for (image_name) in image_names:
      ret = detector.run(image_name)
      time_str = ''
      for stat in time_stats:
        time_str = time_str + '{} {:.3f}s |'.format(stat, ret[stat])
      print(time_str)
if __name__ == '__main__':
  opt = opts().init()
  demo(opt)


Detector = detector_factory[opt.task]
detector = Detector(opt)


from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import cv2
import numpy as np
from progress.bar import Bar
import time
import torch

# soft_nms is a compiled extension; keep the module importable without it.
try:
  from external.nms import soft_nms
except ImportError:
  print('NMS not imported! If you need it,'
        ' do \n cd $CenterNet_ROOT/src/lib/external \n make')
from models.decode import ctdet_decode
from models.utils import flip_tensor
from utils.image import get_affine_transform
from utils.post_process import ctdet_post_process
from utils.debugger import Debugger

from .base_detector import BaseDetector

class CtdetDetector(BaseDetector):
  """Bounding-box detector built on top of ``BaseDetector``.

  Provides the network forward + decode step (``process``), the per-scale
  post-processing (``post_process``), the merge of all scales
  (``merge_outputs``) and two visualisation hooks (``debug`` and
  ``show_results``).
  """

  def __init__(self, opt):
    super(CtdetDetector, self).__init__(opt)

  def process(self, images, return_time=False):
    """Run the model on ``images`` and decode the detections.

    Args:
      images: input batch passed straight to ``self.model``.
      return_time: when true, also return the timestamp taken right after
        the forward pass (before decoding).

    Returns:
      ``(output, dets, forward_time)`` if ``return_time`` is true,
      otherwise ``(output, dets)``.
    """
    with torch.no_grad():
      # Only the last element of the model's return value is used.
      output = self.model(images)[-1]
      hm = output['hm'].sigmoid_()  # in-place sigmoid on the heatmap head
      wh = output['wh']
      reg = output['reg'] if self.opt.reg_offset else None
      if self.opt.flip_test:
        # Average entry 0 with the flipped-back entry 1.
        # NOTE(review): assumes the batch holds the image and its flipped
        # copy at indices 0 and 1 -- confirm against pre_process.
        hm = (hm[0:1] + flip_tensor(hm[1:2])) / 2
        wh = (wh[0:1] + flip_tensor(wh[1:2])) / 2
        # The offset head is taken from entry 0 only (not averaged).
        reg = reg[0:1] if reg is not None else None
      forward_time = time.time()
      dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)

    if return_time:
      return output, dets, forward_time
    return output, dets

  def post_process(self, dets, meta, scale=1):
    """Convert decoded detections of one scale into per-class numpy arrays.

    Args:
      dets: tensor of decoded detections (moved to CPU / numpy here).
      meta: dict providing ``'c'``, ``'s'``, ``'out_height'`` and
        ``'out_width'`` for ``ctdet_post_process``.
      scale: test scale; the first four columns of every row are divided
        by it.

    Returns:
      Mapping ``class id (1..num_classes) -> float32 array of shape (N, 5)``.
    """
    dets = dets.detach().cpu().numpy()
    dets = dets.reshape(1, -1, dets.shape[2])
    dets = ctdet_post_process(
        dets.copy(), [meta['c']], [meta['s']],
        meta['out_height'], meta['out_width'], self.opt.num_classes)
    for j in range(1, self.num_classes + 1):
      dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 5)
      dets[0][j][:, :4] /= scale
    return dets[0]

  def merge_outputs(self, detections):
    """Merge per-scale detections and cap the total number of boxes.

    Args:
      detections: list of per-scale mappings as returned by
        ``post_process``.

    Returns:
      Mapping ``class id -> float32 array``; column 4 is used as the score.
      When more than ``self.max_per_image`` rows exist over all classes,
      only rows whose score reaches the cut-off are kept.
    """
    results = {}
    for j in range(1, self.num_classes + 1):
      results[j] = np.concatenate(
        [detection[j] for detection in detections], axis=0).astype(np.float32)
      if len(self.scales) > 1 or self.opt.nms:
        # Return value is ignored.
        # NOTE(review): presumably soft_nms updates the array in place -- confirm.
        soft_nms(results[j], Nt=0.5, method=2)
    scores = np.hstack(
      [results[j][:, 4] for j in range(1, self.num_classes + 1)])
    if len(scores) > self.max_per_image:
      # The kth smallest score is the lowest score that still survives.
      kth = len(scores) - self.max_per_image
      thresh = np.partition(scores, kth)[kth]
      for j in range(1, self.num_classes + 1):
        keep_inds = (results[j][:, 4] >= thresh)
        results[j] = results[j][keep_inds]
    return results

  def debug(self, debugger, images, dets, output, scale=1):
    """Add the heatmap blend and the raw boxes of the first image to ``debugger``.

    Only index 0 of the batch is visualised.  Boxes whose column-4 value
    does not exceed ``opt.center_thresh`` are skipped.
    """
    detection = dets.detach().cpu().numpy().copy()
    # Scale the first four columns by opt.down_ratio before drawing.
    detection[:, :, :4] *= self.opt.down_ratio
    for i in range(1):
      img = images[i].detach().cpu().numpy().transpose(1, 2, 0)
      # Undo the mean/std normalisation and convert to 8-bit.
      img = ((img * self.std + self.mean) * 255).astype(np.uint8)
      pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
      debugger.add_blend_img(img, pred, 'pred_hm_{:.1f}'.format(scale))
      debugger.add_img(img, img_id='out_pred_{:.1f}'.format(scale))
      for k in range(len(dets[i])):
        if detection[i, k, 4] > self.opt.center_thresh:
          # Draw onto the image registered just above under the same id.
          debugger.add_coco_bbox(detection[i, k, :4], detection[i, k, -1],
                                 detection[i, k, 4],
                                 img_id='out_pred_{:.1f}'.format(scale))

  def show_results(self, debugger, image, results):
    """Draw every box whose score exceeds ``opt.vis_thresh`` on ``image``."""
    debugger.add_img(image, img_id='ctdet')
    for j in range(1, self.num_classes + 1):
      for bbox in results[j]:
        if bbox[4] > self.opt.vis_thresh:
          # Class ids are 1-based in `results`; the debugger gets j - 1.
          debugger.add_coco_bbox(bbox[:4], j - 1, bbox[4], img_id='ctdet')

我们先从def process(self, images, return_time=False)说起。

  def process(self, images, return_time=False):
    # Excerpt: forward pass plus decoding. The return statements are not
    # part of this excerpt.
    with torch.no_grad():
      # Only the last element of the model's return value is used.
      output = self.model(images)[-1]
      # In-place sigmoid on the heatmap head.
      hm = output['hm'].sigmoid_()
      wh = output['wh']
      # The 'reg' head is read only when opt.reg_offset is set.
      reg = output['reg'] if self.opt.reg_offset else None
      if self.opt.flip_test: # False (blog author's annotation; presumably the value in their run)
        # Average entry 0 with the flipped-back entry 1; 'reg' keeps entry 0 only.
        hm = (hm[0:1] + flip_tensor(hm[1:2])) / 2
        wh = (wh[0:1] + flip_tensor(wh[1:2])) / 2
        reg = reg[0:1] if reg is not None else None
      # Timestamp taken after the forward pass and before decoding.
      forward_time = time.time()
      dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)

首先获取模型预测出来的热图(heatmap)hm。因为训练时对hm做了sigmoid,所以预测时也要对它做sigmoid。之后进入dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)


  • 首先进入_nms函数
def _nms(heat, kernel=3):
    """Keep only the local maxima of a heatmap.

    A value survives when it equals the maximum of its ``kernel x kernel``
    neighbourhood (stride 1, padded so the spatial size is unchanged);
    every other value is multiplied by zero.
    """
    half = (kernel - 1) // 2
    local_max = nn.functional.max_pool2d(
        heat, (kernel, kernel), stride=1, padding=half)
    is_peak = local_max == heat
    return heat * is_peak.float()



def _topk(scores, K=40):
    """Select the K best-scoring heatmap positions per batch element.

    Works in two stages: first the top K of every class map (flattened over
    height * width), then the top K among those ``cat * K`` candidates.

    Returns:
        ``(topk_score, topk_inds, topk_clses, topk_ys, topk_xs)``, each of
        shape ``batch x K``: score, flat spatial index, class index, row and
        column of every selected position.
    """
    batch, cat, height, width = scores.size()

    # Stage 1: best K positions inside each class map.
    class_scores, class_inds = torch.topk(scores.view(batch, cat, -1), K)
    class_inds = class_inds % (height * width)
    class_ys = (class_inds / width).int().float()
    class_xs = (class_inds % width).int().float()

    # Stage 2: best K among all cat * K candidates; because candidates are
    # laid out class after class, index / K recovers the class.
    topk_score, topk_ind = torch.topk(class_scores.view(batch, -1), K)
    topk_clses = (topk_ind / K).int()

    def _pick(candidates):
        # Pull the stage-2 winners out of a batch x cat x K tensor.
        flat = candidates.view(batch, -1, 1)
        return _gather_feat(flat, topk_ind).view(batch, K)

    topk_inds = _pick(class_inds)
    topk_ys = _pick(class_ys)
    topk_xs = _pick(class_xs)
    return topk_score, topk_inds, topk_clses, topk_ys, topk_xs

topk_scores: batch * cat * K, batch代表batchsize,cat代表类别数,K代表K个最大值。topk_inds:batch * cat * K, index取值:[0, W x H - 1]。 topk_scores和topk_inds分别为每个batch每张heatmap(每个类别)中前K个最大的score和id。之后对topk_inds分别使用除法和取余得到纵、横坐标topk_ys、topk_xs。然后在每个batch中取所有heatmap的前K个最大score以及id:topk_score:batch * K;topk_ind:batch * K,index取值:[0, cat x K - 1]。之后对topk_inds(view后)和topk_ind调用了_gather_feat函数,该函数在utils文件中。

  • _gather_feat
def _gather_feat(feat, ind, mask=None):
    """Pick rows of ``feat`` along dimension 1 according to ``ind``.

    ``feat`` is ``A x B x C`` and ``ind`` is ``A x D``; the index is
    broadcast over the last dimension so the result is ``A x D x C``.
    When ``mask`` (``A x D``) is given, only the selected entries are kept
    and the result is flattened to ``N x C``.
    """
    channels = feat.size(2)
    index = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), channels)
    gathered = feat.gather(1, index)
    if mask is None:
        return gathered
    keep = mask.unsqueeze(2).expand_as(gathered)
    return gathered[keep].view(-1, channels)

feat(topk_inds): batch * (cat x K) * 1 (假设输入的是topk_inds和topk_ind);ind(topk_ind):batch * K,取值:[0, cat x K - 1]。首先将ind扩展一个维度,变为 batch * K * 1,之后使用gather,将ind对应位置的值从feat中取出来。返回的是取出的index:
feat: batch * K * 1,取值:[0, W x H - 1](即topk_inds中的值)。更一般的情况如下:
feat : A * B * C
ind:A * D
首先将ind扩展一个指标,并且expand为dim的大小,变为 A * D * C,其中对于任意的i, j, 数组ind[i, j, :]中所有的元素均相同,等于原来A * D shape的ind[i, j]。之后使用gather,将ind对应的值取出来。得到的feat: A * D * C

  • scores, inds, clses, ys, xs = _topk(heat, K=K)
    最后返回有五个:topk_score, topk_inds, topk_clses, topk_ys, topk_xs
    topk_score:batch * K,每张图片中最大的K个值。topk_inds:batch * K,每张图片中最大的K个值对应的index,这个index在[0, W x H - 1]之间。后三个类似。

    1. reg = _tranpose_and_gather_feat(reg, inds)
  1. wh = _tranpose_and_gather_feat(wh, inds)
  • 之后会进行尺度变换等后处理,再做soft_nms(用soft-nms来移除冗余的bbox,根据得分情况,最后选择top-100个bbox用于检测),之后对scores进行一次筛选:当检测框总数超过最大检测框数量(100)时,将得分低于阈值的框剔除掉。
  def post_process(self, dets, meta, scale=1):
    """Turn the decoded detections of one scale into per-class numpy arrays.

    The tensor is moved to numpy, reshaped to a single batch entry and
    passed to ``ctdet_post_process`` together with ``meta['c']``,
    ``meta['s']`` and the output size.  Every class entry (ids
    ``1..num_classes``) becomes a float32 ``(N, 5)`` array whose first four
    columns are divided by ``scale``.
    """
    raw = dets.detach().cpu().numpy()
    raw = raw.reshape(1, -1, raw.shape[2])
    processed = ctdet_post_process(
        raw.copy(), [meta['c']], [meta['s']],
        meta['out_height'], meta['out_width'], self.opt.num_classes)
    per_class = processed[0]
    for cls_id in range(1, self.num_classes + 1):
      per_class[cls_id] = np.array(
          per_class[cls_id], dtype=np.float32).reshape(-1, 5)
      per_class[cls_id][:, :4] /= scale
    return per_class
  def merge_outputs(self, detections):
    """Combine the per-scale detections and limit the number of boxes.

    For every class id ``1..num_classes`` the arrays of all scales are
    concatenated (float32).  ``soft_nms`` is applied when several scales
    are used or ``opt.nms`` is set.  If more than ``max_per_image`` rows
    remain over all classes, only rows whose score (column 4) reaches the
    cut-off score are kept.
    """
    class_ids = range(1, self.num_classes + 1)
    merged = {}
    for cls_id in class_ids:
      per_scale = [det[cls_id] for det in detections]
      merged[cls_id] = np.concatenate(per_scale, axis=0).astype(np.float32)
      if len(self.scales) > 1 or self.opt.nms:
         soft_nms(merged[cls_id], Nt=0.5, method=2)

    all_scores = np.hstack([merged[cls_id][:, 4] for cls_id in class_ids])
    surplus = len(all_scores) - self.max_per_image
    if surplus > 0:
      # The surplus-th smallest score is the lowest one that survives.
      cutoff = np.partition(all_scores, surplus)[surplus]
      for cls_id in class_ids:
        boxes = merged[cls_id]
        merged[cls_id] = boxes[boxes[:, 4] >= cutoff]
    return merged
  def run(self, image_or_path_or_tensor, meta=None):
    # Full detection pipeline for one input: load, then for every scale
    # pre-process / forward / decode / post-process, then merge all scales.
    # Returns a dict with the merged 'results' and the accumulated time
    # spent in each stage ('tot', 'load', 'pre', 'net', 'dec', 'post',
    # 'merge').
    # NOTE(review): this excerpt is incomplete -- several statements are
    # truncated or missing (marked below), so it does not run as shown.
    load_time, pre_time, net_time, dec_time, post_time = 0, 0, 0, 0, 0
    merge_time, tot_time = 0, 0
    # NOTE(review): the end of this call (closing parenthesis and any
    # further arguments) is missing from the excerpt.
    debugger = Debugger(dataset=self.opt.dataset, ipynb=(self.opt.debug==3),
    start_time = time.time()
    pre_processed = False
    # Input is either an already decoded numpy image or a path string that
    # is read with cv2.imread.
    if isinstance(image_or_path_or_tensor, np.ndarray):
      image = image_or_path_or_tensor
    elif type(image_or_path_or_tensor) == type (''): 
      image = cv2.imread(image_or_path_or_tensor)
      # NOTE(review): the next three lines read the input as a dict of
      # pre-processed data; presumably they belonged to an `else:` branch
      # that was lost -- confirm.
      image = image_or_path_or_tensor['image'][0].numpy()
      pre_processed_images = image_or_path_or_tensor
      pre_processed = True
    loaded_time = time.time()
    load_time += (loaded_time - start_time)
    detections = []
    for scale in self.scales:
      scale_start_time = time.time()
      if not pre_processed:
        images, meta = self.pre_process(image, scale, meta)
        # import pdb; pdb.set_trace()
        # NOTE(review): the next three lines use the pre-processed input;
        # presumably they belonged to an `else:` branch that was lost.
        images = pre_processed_images['images'][scale][0]
        meta = pre_processed_images['meta'][scale]
        meta = {k: v.numpy()[0] for k, v in meta.items()}
      # NOTE(review): right-hand side missing in the excerpt.
      images =
      pre_process_time = time.time()
      pre_time += pre_process_time - scale_start_time
      # forward_time is the timestamp taken by process() after the forward
      # pass, so 'net' and 'dec' can be separated below.
      output, dets, forward_time = self.process(images, return_time=True)

      net_time += forward_time - pre_process_time
      decode_time = time.time()
      dec_time += decode_time - forward_time
      if self.opt.debug >= 2:
        self.debug(debugger, images, dets, output, scale)
      dets = self.post_process(dets, meta, scale)
      post_process_time = time.time()
      post_time += post_process_time - decode_time
      # NOTE(review): `detections` is never appended to in this excerpt,
      # although it is merged below -- presumably a line was lost.

    results = self.merge_outputs(detections)
    end_time = time.time()
    merge_time += end_time - post_process_time
    tot_time += end_time - start_time

    # Visualise the merged results when the debug level is at least 1.
    if self.opt.debug >= 1:
      self.show_results(debugger, image, results)
    return {'results': results, 'tot': tot_time, 'load': load_time,
            'pre': pre_time, 'net': net_time, 'dec': dec_time,
            'post': post_time, 'merge': merge_time}

关于softNMS可以参考这篇文章论文阅读: Soft-NMS。



1、根据候选框的类别分类概率做排序,假如有4个 BBox ,其置信度A>B>C>D。








现,则将导致对该物体的检测失败并降低了算法的平均检测率(average precision, AP)。




你可能感兴趣的:(CenterNet(二) 源码解读之bouding box 检测)