CenterNet (Part 2): Source Code Walkthrough of Bounding Box Detection

Following the previous article, CenterNet (Part 1): Paper Walkthrough, let's look at how the author's code actually implements the method.
The code can be downloaded here: Github CenterNet.
First, a rough look at the directory layout:

(Figure: directory structure of the repository)

I. Training Code

Let's start with the main function used to train the model.

  1. CenterNet/src/main.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import _init_paths

import os

import torch
import torch.utils.data
from opts import opts
from models.model import create_model, load_model, save_model
from models.data_parallel import DataParallel
from logger import Logger
from datasets.dataset_factory import get_dataset
from trains.train_factory import train_factory


def main(opt):
  torch.manual_seed(opt.seed)
  torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
  Dataset = get_dataset(opt.dataset, opt.task)
  opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
  print(opt)

  logger = Logger(opt)

  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
  
  print('Creating model...')
  model = create_model(opt.arch, opt.heads, opt.head_conv)
  optimizer = torch.optim.Adam(model.parameters(), opt.lr)
  start_epoch = 0
  if opt.load_model != '':
    model, optimizer, start_epoch = load_model(
      model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

  Trainer = train_factory[opt.task]
  trainer = Trainer(opt, model, optimizer)
  trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

  print('Setting up data...')
  val_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'val'), 
      batch_size=1, 
      shuffle=False,
      num_workers=1,
      pin_memory=True
  )

  if opt.test:
    _, preds = trainer.val(0, val_loader)
    val_loader.dataset.run_eval(preds, opt.save_dir)
    return

  train_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'train'), 
      batch_size=opt.batch_size, 
      shuffle=True,
      num_workers=opt.num_workers,
      pin_memory=True,
      drop_last=True
  )

  print('Starting training...')
  best = 1e10
  for epoch in range(start_epoch + 1, opt.num_epochs + 1):
    mark = epoch if opt.save_all else 'last'
    log_dict_train, _ = trainer.train(epoch, train_loader)
    logger.write('epoch: {} |'.format(epoch))
    for k, v in log_dict_train.items():
      logger.scalar_summary('train_{}'.format(k), v, epoch)
      logger.write('{} {:8f} | '.format(k, v))
    if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)), 
                 epoch, model, optimizer)
      with torch.no_grad():
        log_dict_val, preds = trainer.val(epoch, val_loader)
      for k, v in log_dict_val.items():
        logger.scalar_summary('val_{}'.format(k), v, epoch)
        logger.write('{} {:8f} | '.format(k, v))
      if log_dict_val[opt.metric] < best:
        best = log_dict_val[opt.metric]
        save_model(os.path.join(opt.save_dir, 'model_best.pth'), 
                   epoch, model)
    else:
      save_model(os.path.join(opt.save_dir, 'model_last.pth'), 
                 epoch, model, optimizer)
    logger.write('\n')
    if epoch in opt.lr_step:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), 
                 epoch, model, optimizer)
      lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
      print('Drop LR to', lr)
      for param_group in optimizer.param_groups:
          param_group['lr'] = lr
  logger.close()

if __name__ == '__main__':
  opt = opts().parse()
  """
  ctdet --exp_id coco_dla --batch_size 64 --master_batch 32 --lr 1.25e-4 --gpus 1,2,3 --num_workers 32
  """
  main(opt)

Let's walk through the main function.
a. torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test: with benchmark = True, cuDNN automatically searches for the most efficient algorithms for the current configuration, improving runtime performance.
b. Dataset = get_dataset(opt.dataset, opt.task): obtains the dataset class needed to train the model for the specified task.
c. opt = opts().update_dataset_info_and_set_heads(opt, Dataset): updates the dataset-related configuration and sets the model output heads. For the bounding box detection task, for example, three outputs are needed: hm, wh and reg.

    elif opt.task == 'ctdet':
      # assert opt.dataset in ['pascal', 'coco']
      opt.heads = {'hm': opt.num_classes,
                   'wh': 2 if not opt.cat_spec_wh else 2 * opt.num_classes}
      if opt.reg_offset:
        opt.heads.update({'reg': 2})

Here opt.cat_spec_wh was only tried by the author in experiments and did not improve results.

The author puts it this way: "We never used cat_spec_wh in the experiments. I have tried this once on Pascal VOC but it doesn't give improvement. Feel free to try it on COCO yourself."

With the default options, the heads for a COCO ctdet run therefore end up as shown below.
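A sketch of the resulting value (assuming the COCO ctdet defaults: 80 classes, reg_offset on, cat_spec_wh off):

heads = {'hm': 80,   # one heatmap channel per class
         'wh': 2,    # box width and height
         'reg': 2}   # sub-pixel offset of the center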




The most important parts here are Dataset = get_dataset(opt.dataset, opt.task) and model = create_model(opt.arch, opt.heads, opt.head_conv). Let's analyze them in detail.

  2. CenterNet/src/lib/datasets/dataset_factory.py and CenterNet/src/lib/datasets/dataset/coco.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from .sample.ddd import DddDataset
from .sample.exdet import EXDetDataset
from .sample.ctdet import CTDetDataset
from .sample.multi_pose import MultiPoseDataset

from .dataset.coco import COCO
from .dataset.pascal import PascalVOC
from .dataset.kitti import KITTI
from .dataset.coco_hp import COCOHP
from .dataset.clothing import Clothing

dataset_factory = {
  'coco': COCO,
  'pascal': PascalVOC,
  'kitti': KITTI,
  'coco_hp': COCOHP,
  'clothing': Clothing
}

_sample_factory = {
  'exdet': EXDetDataset,
  'ctdet': CTDetDataset,
  'ddd': DddDataset,
  'multi_pose': MultiPoseDataset
}


def get_dataset(dataset, task):
  class Dataset(dataset_factory[dataset], _sample_factory[task]):
    pass
  return Dataset

As you can see, Dataset inherits from both dataset_factory[dataset] and _sample_factory[task]. Here we take the COCO data as an example (i.e. dataset='coco'). Based on the code above, we start from CenterNet/src/lib/datasets/dataset/coco.py and CenterNet/src/lib/datasets/sample/ctdet.py. A toy sketch of this mixin pattern follows.
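To make the class composition concrete, here is a minimal illustration of the mixin pattern that get_dataset uses (hypothetical toy classes, not the real ones): the dataset class contributes annotation loading, while the sample class contributes __getitem__.

class CocoLike:                           # stands in for datasets/dataset/coco.py
    def load(self):
        return "annotations"

class CTDetSampleLike:                    # stands in for datasets/sample/ctdet.py
    def __getitem__(self, index):
        return 'sample {} built from {}'.format(index, self.load())

class Dataset(CocoLike, CTDetSampleLike): # same trick as get_dataset
    pass

print(Dataset()[0])   # sample 0 built from annotations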

First, let's look at coco.py.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pycocotools.coco as coco
from pycocotools.cocoeval import COCOeval
import numpy as np
import json
import os

import torch.utils.data as data

class COCO(data.Dataset):
  num_classes = 80
  default_resolution = [512, 512]
  mean = np.array([0.40789654, 0.44719302, 0.47026115],
                   dtype=np.float32).reshape(1, 1, 3)
  std  = np.array([0.28863828, 0.27408164, 0.27809835],
                   dtype=np.float32).reshape(1, 1, 3)

  def __init__(self, opt, split):
    super(COCO, self).__init__()
    self.data_dir = os.path.join(opt.data_dir, 'coco') # dataset root directory
    self.img_dir = os.path.join(self.data_dir, '{}2017'.format(split)) # image directory for this split
    if split == 'test':
      self.annot_path = os.path.join(
          self.data_dir, 'annotations', 
          'image_info_test-dev2017.json').format(split)
    else:
      if opt.task == 'exdet':
        self.annot_path = os.path.join(
          self.data_dir, 'annotations', 
          'instances_extreme_{}2017.json').format(split)
      else:
        self.annot_path = os.path.join(
          self.data_dir, 'annotations', 
          'instances_{}2017.json').format(split)
    self.max_objs = 100
    self.class_name = [
      '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
      'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
      'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
      'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
      'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
      'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
      'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass',
      'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
      'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
      'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
      'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
      'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
      'scissors', 'teddy bear', 'hair drier', 'toothbrush']
    self._valid_ids = [
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 
      14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 
      24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 
      37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 
      48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 
      58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 
      72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 
      82, 84, 85, 86, 87, 88, 89, 90]
    self.cat_ids = {v: i for i, v in enumerate(self._valid_ids)} # map COCO category ids to contiguous class indices
    self.voc_color = [(v // 32 * 64 + 64, (v // 8) % 4 * 64, v % 8 * 32) \
                      for v in range(1, self.num_classes + 1)]
    self._data_rng = np.random.RandomState(123)
    self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571],
                             dtype=np.float32)
    # parameters for the color augmentation applied later during data augmentation
    self._eig_vec = np.array([
        [-0.58752847, -0.69563484, 0.41340352],
        [-0.5832747, 0.00994535, -0.81221408],
        [-0.56089297, 0.71832671, 0.41158938]
    ], dtype=np.float32)
    # self.mean = np.array([0.485, 0.456, 0.406], np.float32).reshape(1, 1, 3)
    # self.std = np.array([0.229, 0.224, 0.225], np.float32).reshape(1, 1, 3)

    self.split = split
    self.opt = opt

    print('==> initializing coco 2017 {} data.'.format(split))
    self.coco = coco.COCO(self.annot_path)
    self.images = self.coco.getImgIds()
    self.num_samples = len(self.images)

    print('Loaded {} {} samples'.format(split, self.num_samples))

  def _to_float(self, x):
    return float("{:.2f}".format(x))
 
 # iterate over all predicted boxes and convert them to the COCO detection format (used when exporting results)
  def convert_eval_format(self, all_bboxes):
    # import pdb; pdb.set_trace()
    detections = []
    for image_id in all_bboxes:
      for cls_ind in all_bboxes[image_id]:
        category_id = self._valid_ids[cls_ind - 1]
        for bbox in all_bboxes[image_id][cls_ind]:
          bbox[2] -= bbox[0]
          bbox[3] -= bbox[1]
          score = bbox[4]
          bbox_out  = list(map(self._to_float, bbox[0:4]))

          detection = {
              "image_id": int(image_id),
              "category_id": int(category_id),
              "bbox": bbox_out,
              "score": float("{:.2f}".format(score))
          }
          if len(bbox) > 5:
              extreme_points = list(map(self._to_float, bbox[5:13]))
              detection["extreme_points"] = extreme_points
          detections.append(detection)
    return detections

  def __len__(self):
    return self.num_samples

  def save_results(self, results, save_dir):
    json.dump(self.convert_eval_format(results), 
                open('{}/results.json'.format(save_dir), 'w'))
  
  def run_eval(self, results, save_dir):
    # result_json = os.path.join(save_dir, "results.json")
    # detections  = self.convert_eval_format(results)
    # json.dump(detections, open(result_json, "w"))
    self.save_results(results, save_dir)
    coco_dets = self.coco.loadRes('{}/results.json'.format(save_dir))
    coco_eval = COCOeval(self.coco, coco_dets, "bbox")
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

Next, let's look at ctdet.py.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import torch.utils.data as data
import numpy as np
import torch
import json
import cv2
import os
from utils.image import flip, color_aug
from utils.image import get_affine_transform, affine_transform
from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian
from utils.image import draw_dense_reg
import math

class CTDetDataset(data.Dataset):
  def _coco_box_to_bbox(self, box):
    bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
                    dtype=np.float32)
    return bbox

  def _get_border(self, border, size):
    # border: 128; size: image width or height
    i = 1
    while size - border // i <= border // i:
      # if the image side is not larger than 2 * (border // i), keep doubling i
      # normally returns 128; for images smaller than 256 it returns 64, and so on
        i *= 2
    return border // i

  def __getitem__(self, index):
    img_id = self.images[index]
    file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
    img_path = os.path.join(self.img_dir, file_name)
    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
    anns = self.coco.loadAnns(ids=ann_ids)
    num_objs = min(len(anns), self.max_objs)      # number of objects, capped at max_objs = 100
    img = cv2.imread(img_path)
    try:
      height, width = img.shape[0], img.shape[1]
    except:
      print(img_path)
    c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32) # image center
    if self.opt.keep_res: # False by default
      input_h = (height | self.opt.pad) + 1
      input_w = (width | self.opt.pad) + 1
      s = np.array([input_w, input_h], dtype=np.float32)
    else: # this branch is taken by default
      s = max(img.shape[0], img.shape[1]) * 1.0 # s = the longer image side
      input_h, input_w = self.opt.input_h, self.opt.input_w # 512, 512
    
    flipped = False
    if self.split == 'train':
      if not self.opt.not_rand_crop:
        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1)) # random scale
        w_border = self._get_border(128, img.shape[1])
        h_border = self._get_border(128, img.shape[0])
        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
      else:
        sf = self.opt.scale
        cf = self.opt.shift
        c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
      
      if np.random.random() < self.opt.flip:
        flipped = True
        img = img[:, ::-1, :]
        c[0] =  width - c[0] - 1 # horizontal flip: mirror the center x
        

    trans_input = get_affine_transform(
      c, s, 0, [input_w, input_h])
    inp = cv2.warpAffine(img, trans_input, 
                         (input_w, input_h),
                         flags=cv2.INTER_LINEAR) # affine warp
    inp = (inp.astype(np.float32) / 255.)
    if self.split == 'train' and not self.opt.no_color_aug:
      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
    # normalize to zero mean and unit variance
    inp = (inp - self.mean) / self.std
    inp = inp.transpose(2, 0, 1)

    output_h = input_h // self.opt.down_ratio # output size: 512 // 4 = 128
    output_w = input_w // self.opt.down_ratio
    num_classes = self.num_classes
    trans_output = get_affine_transform(c, s, 0, [output_w, output_h])

    hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32) # class heatmaps (80, 128, 128)
    wh = np.zeros((self.max_objs, 2), dtype=np.float32) # (w, h) of each object box, (100, 2)
    dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32) # dense size map, (2, 128, 128)
    reg = np.zeros((self.max_objs, 2), dtype=np.float32) # sub-pixel offsets caused by downsampling, (100, 2) floats
    ind = np.zeros((self.max_objs), dtype=np.int64) # flat index of each center, (100,)
    reg_mask = np.zeros((self.max_objs), dtype=np.uint8) # mask of valid object slots, (100,)
    cat_spec_wh = np.zeros((self.max_objs, num_classes * 2), dtype=np.float32) # (100, 160)
    cat_spec_mask = np.zeros((self.max_objs, num_classes * 2), dtype=np.uint8) # (100, 160)
    
    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
                    draw_umich_gaussian


    gt_det = []
    for k in range(num_objs):
      ann = anns[k]
      bbox = self._coco_box_to_bbox(ann['bbox'])
      cls_id = int(self.cat_ids[ann['category_id']])
      if flipped:
        bbox[[0, 2]] = width - bbox[[2, 0]] - 1
      bbox[:2] = affine_transform(bbox[:2], trans_output)
      bbox[2:] = affine_transform(bbox[2:], trans_output)
      bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
      bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
      h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
      if h > 0 and w > 0:
        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
        radius = max(0, int(radius))
        radius = self.opt.hm_gauss if self.opt.mse_loss else radius
        ct = np.array(
          [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
        ct_int = ct.astype(np.int32)
        draw_gaussian(hm[cls_id], ct_int, radius)
        #cv2.imwrite("/data/humaocheng/CenterNet-master/single_heatmap.jpg", hm[0]*255)
        wh[k] = 1. * w, 1. * h # width and height of the object box (size regression target)
        ind[k] = ct_int[1] * output_w + ct_int[0] # flat index of the object center on the 128x128 feature map
        reg[k] = ct - ct_int # offset target: ct is the float center; reg stores the sub-pixel offset of the k-th object
        # for example:
        # [98.97667, 2.3566666] - [98, 2] = [0.97667, 0.3566666]
        reg_mask[k] = 1 # mark this slot as containing a real object
        cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
        cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
        if self.opt.dense_wh:
          draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
        gt_det.append([ct[0] - w / 2, ct[1] - h / 2, 
                       ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])

    # cv2.imwrite("/data/humaocheng/CenterNet-master/heatmap.jpg",hm[0]*255)
    
    ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh}
    if self.opt.dense_wh:
      hm_a = hm.max(axis=0, keepdims=True)
      dense_wh_mask = np.concatenate([hm_a, hm_a], axis=0)
      ret.update({'dense_wh': dense_wh, 'dense_wh_mask': dense_wh_mask})
      del ret['wh']
    elif self.opt.cat_spec_wh:
      ret.update({'cat_spec_wh': cat_spec_wh, 'cat_spec_mask': cat_spec_mask})
      del ret['wh']
    if self.opt.reg_offset:
      ret.update({'reg': reg})
    if self.opt.debug > 0 or not self.split == 'train':
      gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \
               np.zeros((1, 6), dtype=np.float32)
      meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_id}
      ret['meta'] = meta
    return ret

Let's again take def __getitem__(self, index) as the entry point. It produces the output items collected in ret; we go through them one by one, starting with the following:

    img_id = self.images[index]
    file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
    img_path = os.path.join(self.img_dir, file_name)
    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
    anns = self.coco.loadAnns(ids=ann_ids)
    num_objs = min(len(anns), self.max_objs)      # 目标个数,这里为100
    img = cv2.imread(img_path)

We first get the image img_id, use it to look up the image file name and path, then get the annotation ids ann_ids for that image and load the corresponding labels. Note that num_objs is the number of top center points kept for one image (playing a role somewhat like NMS); it can be regarded as a hyperparameter and defaults to 100 here.
We also compute the image center c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32).

Next we take the longer image side as the scale s and the input resolution (512, 512):

    if self.opt.keep_res: # False by default
      input_h = (height | self.opt.pad) + 1
      input_w = (width | self.opt.pad) + 1
      s = np.array([input_w, input_h], dtype=np.float32)
    else: # this branch is taken by default
      s = max(img.shape[0], img.shape[1]) * 1.0 # s = the longer image side
      input_h, input_w = self.opt.input_h, self.opt.input_w # 512, 512

The remaining steps apply a series of augmentations to keep the data generalizable. The final result is the first required output, the input image inp.

    flipped = False
    if self.split == 'train':
      if not self.opt.not_rand_crop:
        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1)) # random scale
        w_border = self._get_border(128, img.shape[1])
        h_border = self._get_border(128, img.shape[0])
        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
      else:
        sf = self.opt.scale
        cf = self.opt.shift
        c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
      
      if np.random.random() < self.opt.flip:
        flipped = True
        img = img[:, ::-1, :]
        c[0] =  width - c[0] - 1 # horizontal flip: mirror the center x
        

    trans_input = get_affine_transform(
      c, s, 0, [input_w, input_h])
    inp = cv2.warpAffine(img, trans_input, 
                         (input_w, input_h),
                         flags=cv2.INTER_LINEAR) # affine warp
    inp = (inp.astype(np.float32) / 255.)
    if self.split == 'train' and not self.opt.no_color_aug:
      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
    # normalize to zero mean and unit variance
    inp = (inp - self.mean) / self.std
    inp = inp.transpose(2, 0, 1)

Next we need to generate the heatmap and the other regression targets.

    output_h = input_h // self.opt.down_ratio # output size: 512 // 4 = 128
    output_w = input_w // self.opt.down_ratio
    num_classes = self.num_classes # num_classes=80
    trans_output = get_affine_transform(c, s, 0, [output_w, output_h])

    hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32) # class heatmaps (80, 128, 128)
    wh = np.zeros((self.max_objs, 2), dtype=np.float32) # (w, h) of each object box, (100, 2)
    dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32) # dense size map, (2, 128, 128)
    reg = np.zeros((self.max_objs, 2), dtype=np.float32) # sub-pixel offsets caused by downsampling, (100, 2) floats
    ind = np.zeros((self.max_objs), dtype=np.int64) # flat index of each center, (100,)
    reg_mask = np.zeros((self.max_objs), dtype=np.uint8) # mask of valid object slots, (100,)
    cat_spec_wh = np.zeros((self.max_objs, num_classes * 2), dtype=np.float32) # (100, 160)
    cat_spec_mask = np.zeros((self.max_objs, num_classes * 2), dtype=np.uint8) # (100, 160)
    
    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
                    draw_umich_gaussian

Here mse_loss is False, so we only need to look at the draw_umich_gaussian function.

    gt_det = []
    for k in range(num_objs):
      ann = anns[k]
      bbox = self._coco_box_to_bbox(ann['bbox'])
      cls_id = int(self.cat_ids[ann['category_id']])
      if flipped:
        bbox[[0, 2]] = width - bbox[[2, 0]] - 1
      bbox[:2] = affine_transform(bbox[:2], trans_output)
      bbox[2:] = affine_transform(bbox[2:], trans_output)
      bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
      bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
      h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
      if h > 0 and w > 0:
        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
        radius = max(0, int(radius))
        radius = self.opt.hm_gauss if self.opt.mse_loss else radius
        ct = np.array(
          [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
        ct_int = ct.astype(np.int32)
        draw_gaussian(hm[cls_id], ct_int, radius)
        #cv2.imwrite("/data/humaocheng/CenterNet-master/single_heatmap.jpg", hm[0]*255)
        wh[k] = 1. * w, 1. * h # width and height of the object box (size regression target)
        ind[k] = ct_int[1] * output_w + ct_int[0] # flat index of the object center on the 128x128 feature map
        reg[k] = ct - ct_int # offset target: ct is the float center; reg stores the sub-pixel offset of the k-th object
        # for example:
        # [98.97667, 2.3566666] - [98, 2] = [0.97667, 0.3566666]
        reg_mask[k] = 1 # mark this slot as containing a real object
        cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
        cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
        if self.opt.dense_wh:
          draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
        gt_det.append([ct[0] - w / 2, ct[1] - h / 2, 
                       ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])

Let's see how this code draws the heatmap. The key point is how the Gaussian radius is computed. The function below solves for the Gaussian radius and is taken directly from CornerNet; see the CornerNet radius derivation for the reasoning. CornerNet explains it as follows: the radius must be chosen so that a pair of points inside the radius would still produce a box whose IoU with the ground-truth box reaches some threshold t (0.3 in CornerNet; the CenterNet code passes min_overlap=0.7).

We determine the radius by the size of an object by ensuring that a pair of points within the radius would generate a bounding box with at least t IoU with the ground-truth annotation (we set t to 0.3 in all experiments)

def gaussian_radius(det_size, min_overlap=0.7):
 height, width = det_size

 a1  = 1
 b1  = (height + width)
 c1  = width * height * (1 - min_overlap) / (1 + min_overlap)
 sq1 = np.sqrt(b1 ** 2 - 4 * a1 * c1)
 r1  = (b1 + sq1) / 2

 a2  = 4
 b2  = 2 * (height + width)
 c2  = (1 - min_overlap) * width * height
 sq2 = np.sqrt(b2 ** 2 - 4 * a2 * c2)
 r2  = (b2 + sq2) / 2

 a3  = 4 * min_overlap
 b3  = -2 * min_overlap * (height + width)
 c3  = (min_overlap - 1) * width * height
 sq3 = np.sqrt(b3 ** 2 - 4 * a3 * c3)
 r3  = (b3 + sq3) / 2
 return min(r1, r2, r3)
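As a quick sanity check with illustrative numbers (min_overlap = 0.7 as above): for a roughly 24 x 32 pixel box on the output map, min(r1, r2, r3) comes out to about 7.5, which the int cast in the caller turns into a radius of 7.

radius = gaussian_radius((24, 32))   # about 7.5 with min_overlap = 0.7
radius = max(0, int(radius))         # -> 7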

With the radius computed, draw_umich_gaussian draws the heatmap we want.

def draw_umich_gaussian(heatmap, center, radius, k=1):
  diameter = 2 * radius + 1
  gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)
  
  x, y = int(center[0]), int(center[1])

  height, width = heatmap.shape[0:2]
    
  left, right = min(x, radius), min(width - x, radius + 1)
  top, bottom = min(y, radius), min(height - y, radius + 1)

  masked_heatmap  = heatmap[y - top:y + bottom, x - left:x + right]
  masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right]
  if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: # TODO debug
    np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
  return heatmap

Here center is actually the center of the bounding box. np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) keeps adding each keypoint's Gaussian onto the heatmap, i.e. boxes of the same class keep accumulating on that class's channel of the heatmap. Combined with the outer for loop, every object is drawn onto the heatmap, producing our second output, hm.
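For reference, gaussian2D (called at the top of draw_umich_gaussian) builds the unnormalized 2D Gaussian patch. A minimal sketch consistent with the call above (the repo's utils/image.py provides an equivalent function):

import numpy as np

def gaussian2D(shape, sigma=1):
    # shape = (diameter, diameter); the peak value is 1 at the center
    m, n = [(ss - 1.) / 2. for ss in shape]
    y, x = np.ogrid[-m:m + 1, -n:n + 1]
    h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
    h[h < np.finfo(h.dtype).eps * h.max()] = 0   # zero out numerically negligible values
    return h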

The code above has covered the output items inp and hm; the remaining targets wh, ind and reg (together with reg_mask) are produced by the lines below.

        wh[k] = 1. * w, 1. * h # width and height of the object box (size regression target)
        ind[k] = ct_int[1] * output_w + ct_int[0] # flat index of the object center on the 128x128 feature map
        reg[k] = ct - ct_int # offset target: ct is the float center; reg stores the sub-pixel offset of the k-th object
        # for example:
        # [98.97667, 2.3566666] - [98, 2] = [0.97667, 0.3566666]
        reg_mask[k] = 1 # mark this slot as containing a real object
        cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
        cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
        if self.opt.dense_wh:
          draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
        gt_det.append([ct[0] - w / 2, ct[1] - h / 2, 
                       ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])

Here w and h are the width and height of the object box, ind is the flat index of the object center on the output map, and reg[k] is the offset: the error introduced by casting the float center to int. The figure below illustrates this offset loss.

(Figure: the offset between the float center point and its integer grid location)

reg_mask covers the (at most) 100 object slots and records which of them actually contain an object: the corresponding index is set to 1, the rest stay 0.
With that we have covered all the entries of ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh}.
Finally ret is updated with ret.update({'reg': reg}), which adds the reg item to the ret dict. A small worked example of ind and reg follows.
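To make ind and reg concrete, here is a small worked example using the numbers from the inline comment above (illustrative values only):

import numpy as np

output_w = 128
ct = np.array([98.97667, 2.3566666], dtype=np.float32)  # float center on the 128x128 map
ct_int = ct.astype(np.int32)             # [98, 2]
ind = ct_int[1] * output_w + ct_int[0]   # 2 * 128 + 98 = 354, flat index of the center
reg = ct - ct_int                        # [0.97667, 0.3566666], the quantization offset

# At decode time the x, y position is recovered from ind and refined by reg:
x, y = ind % output_w, ind // output_w   # (98, 2)
print(ind, np.array([x, y]) + reg)       # 354 [98.97667  2.3566666]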

  3. Now let's look at model.py
    We have covered the data preprocessing; next comes the model code.
from .networks.msra_resnet import get_pose_net
from .networks.dlav0 import get_pose_net as get_dlav0
from .networks.pose_dla_dcn import get_pose_net as get_dla_dcn
from .networks.resnet_dcn import get_pose_net as get_pose_net_dcn
from .networks.large_hourglass import get_large_hourglass_net

_model_factory = {
  'res': get_pose_net, # default Resnet with deconv
  'dlav0': get_dlav0, # default DLAup
  'dla': get_dla_dcn,
  'resdcn': get_pose_net_dcn,
  'hourglass': get_large_hourglass_net,
}

def create_model(arch, heads, head_conv):
  num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0
  arch = arch[:arch.find('_')] if '_' in arch else arch
  get_model = _model_factory[arch]
  model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv)
  return model

Here arch defaults to dla_34, where 34 is the number of layers of the DLA feature extractor.
As the code shows, we obtain a pose net; the relevant file is CenterNet/src/lib/models/networks/pose_dla_dcn.py. We will not explain the feature-extraction backbone in detail here; this link helps in understanding the model: cvpr2018 Deep Layer Aggregation (DLANet). Its main structure is as follows:

(Figure: DLA structure)

Let's look at what the output end of the DLA model looks like.

for head in self.heads:
    classes = self.heads[head]
    if head_conv > 0:
        fc = nn.Sequential(
            nn.Conv2d(channels[self.first_level], head_conv,
                      kernel_size=3, padding=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(head_conv, classes,
                      kernel_size=final_kernel, stride=1,
                      padding=final_kernel // 2, bias=True))
        # (excerpt truncated: the branch for head_conv <= 0 and the head registration follow in the original file)

Here heads = {'hm': 3, 'wh': 2, 'reg': 2} (3 classes in this example; for COCO hm would have 80 channels). For each head, the DLA output feature map first passes through a 3x3 convolution with 256 output channels (head_conv) and a ReLU, and then a final 1x1 convolution whose output has 3 channels for hm, 2 for wh and 2 for reg. The resulting tensor shapes are sketched below.
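A rough sketch of one head, assuming a 64-channel DLA feature map at stride 4 for a 512 x 512 input, head_conv = 256 and final_kernel = 1 (numbers are illustrative; here the hm head uses 80 channels as for COCO):

import torch
import torch.nn as nn

feat = torch.randn(1, 64, 128, 128)    # backbone feature map (assumed shape)
hm_head = nn.Sequential(
    nn.Conv2d(64, 256, kernel_size=3, padding=1, bias=True),
    nn.ReLU(inplace=True),
    nn.Conv2d(256, 80, kernel_size=1, stride=1, padding=0, bias=True))
print(hm_head(feat).shape)             # torch.Size([1, 80, 128, 128])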

  4. Finally, let's look at how the loss is defined; see CenterNet/src/lib/trains/ctdet.py.
  def forward(self, outputs, batch):
    opt = self.opt
    hm_loss, wh_loss, off_loss = 0, 0, 0
    for s in range(opt.num_stacks): # num_stacks = 1
      output = outputs[s]
      if not opt.mse_loss:
        output['hm'] = _sigmoid(output['hm']) 

      if opt.eval_oracle_hm:
        output['hm'] = batch['hm']
      if opt.eval_oracle_wh:
        output['wh'] = torch.from_numpy(gen_oracle_map(
          batch['wh'].detach().cpu().numpy(), 
          batch['ind'].detach().cpu().numpy(), 
          output['wh'].shape[3], output['wh'].shape[2])).to(opt.device)
      if opt.eval_oracle_offset:
        output['reg'] = torch.from_numpy(gen_oracle_map(
          batch['reg'].detach().cpu().numpy(), 
          batch['ind'].detach().cpu().numpy(), 
          output['reg'].shape[3], output['reg'].shape[2])).to(opt.device)

      hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
      if opt.wh_weight > 0:
        if opt.dense_wh:
          mask_weight = batch['dense_wh_mask'].sum() + 1e-4
          wh_loss += (
            self.crit_wh(output['wh'] * batch['dense_wh_mask'],
            batch['dense_wh'] * batch['dense_wh_mask']) / 
            mask_weight) / opt.num_stacks
        elif opt.cat_spec_wh:
          wh_loss += self.crit_wh(
            output['wh'], batch['cat_spec_mask'],
            batch['ind'], batch['cat_spec_wh']) / opt.num_stacks
        else:
          wh_loss += self.crit_reg(
            output['wh'], batch['reg_mask'],
            batch['ind'], batch['wh']) / opt.num_stacks
      
      if opt.reg_offset and opt.off_weight > 0:
        off_loss += self.crit_reg(output['reg'], batch['reg_mask'],
                             batch['ind'], batch['reg']) / opt.num_stacks
        
    loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + \
           opt.off_weight * off_loss
    loss_stats = {'loss': loss, 'hm_loss': hm_loss,
                  'wh_loss': wh_loss, 'off_loss': off_loss}
    return loss, loss_stats

 output['hm'] = _sigmoid(output['hm']) 
 hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks

My own understanding is that the sigmoid here normalizes the heatmap predictions to (0, 1) before the heatmap loss is computed, which speeds up convergence. A sketch of the focal loss behind self.crit follows.
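When mse_loss is off, self.crit is the penalty-reduced pixel-wise focal loss from the paper. A sketch of that loss following the paper's formula (alpha = 2, beta = 4, normalized by the number of positive points; the repo's models/losses.py implements the equivalent):

import torch

def neg_loss(pred, gt):
  # pred: sigmoid heatmap predictions, gt: Gaussian-splatted ground-truth heatmap
  pos_inds = gt.eq(1).float()            # exact center points
  neg_inds = gt.lt(1).float()
  neg_weights = torch.pow(1 - gt, 4)     # penalty reduction near the centers (beta = 4)

  pos_loss = torch.log(pred) * torch.pow(1 - pred, 2) * pos_inds
  neg_loss = torch.log(1 - pred) * torch.pow(pred, 2) * neg_weights * neg_inds

  num_pos = pos_inds.sum()
  if num_pos == 0:
    return -neg_loss.sum()
  return -(pos_loss.sum() + neg_loss.sum()) / num_pos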

elif opt.cat_spec_wh:
    wh_loss += self.crit_wh(
        output['wh'], batch['cat_spec_mask'],
        batch['ind'], batch['cat_spec_wh']) / opt.num_stacks
  def forward(self, output, mask, ind, target):
    pred = _tranpose_and_gather_feat(output, ind)
    mask = mask.unsqueeze(2).expand_as(pred).float()
    # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean')
    loss = F.l1_loss(pred * mask, target * mask, size_average=False)
    loss = loss / (mask.sum() + 1e-4)
    return loss
def _gather_feat(feat, ind, mask=None):
    dim  = feat.size(2)
    ind  = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
    feat = feat.gather(1, ind)
    if mask is not None:
        mask = mask.unsqueeze(2).expand_as(feat)
        feat = feat[mask]
        feat = feat.view(-1, dim)
    return feat

def _tranpose_and_gather_feat(feat, ind):
    feat = feat.permute(0, 2, 3, 1).contiguous()
    feat = feat.view(feat.size(0), -1, feat.size(3))
    feat = _gather_feat(feat, ind)
    return feat

We stored the flat index of each object on the heatmap in ind; _tranpose_and_gather_feat together with _gather_feat(feat, ind, mask=None) then picks out the predicted width/height at those positions. _gather_feat extracts the elements of feat addressed by ind.
_gather_feat also removes the per-channel distinction: the resulting indices refer to all channels at once. Input: feat (topk_inds): batch * (cat x K) * 1 (assuming the inputs are topk_inds and topk_ind); ind (topk_ind): batch * K.
First ind is expanded with an extra axis to batch * K * 1, then gather picks out the values addressed by ind. The return is an index tensor:
feat: batch * K * 1, with values in [0, cat x K - 1].
More generally: feat: A * B * C, ind: A * D.
First ind is expanded with an extra axis and broadcast to the size dim, becoming A * D * C, where for any i, j all elements of ind[i, j, :] are identical and equal to ind[i, j] of the original A * D tensor. Then gather takes the corresponding values, giving feat: A * D * C. A quick shape check follows.
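A small shape check of the two functions above (hypothetical shapes: batch = 2, a 2-channel head on a 128 x 128 output map, 100 objects):

import torch

feat = torch.randn(2, 2, 128, 128)            # e.g. the wh head output
ind = torch.randint(0, 128 * 128, (2, 100))   # flat indices of the object centers

x = feat.permute(0, 2, 3, 1).contiguous()     # (2, 128, 128, 2)
x = x.view(x.size(0), -1, x.size(3))          # (2, 16384, 2)
idx = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), x.size(2))  # (2, 100, 2)
print(x.gather(1, idx).shape)                 # torch.Size([2, 100, 2]): the (w, h) prediction at each center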

off_loss += self.crit_reg(output['reg'], batch['reg_mask'],batch['ind'], batch['reg']) / opt.num_stacks

The principle and computation are the same as for the wh loss above.

loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + opt.off_weight * off_loss

II. Testing Code

Here we can start from the demo.py entry point.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import _init_paths

import os
import cv2

from opts import opts
from detectors.detector_factory import detector_factory

image_ext = ['jpg', 'jpeg', 'png', 'webp']
video_ext = ['mp4', 'mov', 'avi', 'mkv']
time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']

def demo(opt):
  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.debug = max(opt.debug, 1)
  Detector = detector_factory[opt.task]
  detector = Detector(opt)

  if opt.demo == 'webcam' or \
    opt.demo[opt.demo.rfind('.') + 1:].lower() in video_ext:
    cam = cv2.VideoCapture(0 if opt.demo == 'webcam' else opt.demo)
    detector.pause = False
    while True:
        _, img = cam.read()
        cv2.imshow('input', img)
        ret = detector.run(img)
        time_str = ''
        for stat in time_stats:
          time_str = time_str + '{} {:.3f}s |'.format(stat, ret[stat])
        print(time_str)
        if cv2.waitKey(1) == 27:
            return  # esc to quit
  else:
    if os.path.isdir(opt.demo):
      image_names = []
      ls = os.listdir(opt.demo)
      for file_name in sorted(ls):
          ext = file_name[file_name.rfind('.') + 1:].lower()
          if ext in image_ext:
              image_names.append(os.path.join(opt.demo, file_name))
    else:
      image_names = [opt.demo]
    
    for (image_name) in image_names:
      ret = detector.run(image_name)
      time_str = ''
      for stat in time_stats:
        time_str = time_str + '{} {:.3f}s |'.format(stat, ret[stat])
      print(time_str)
if __name__ == '__main__':
  opt = opts().init()
  demo(opt)

Based on:

Detector = detector_factory[opt.task]
detector = Detector(opt)

the detector class for the task is selected; here we mainly study the file CenterNet/src/lib/detectors/ctdet.py.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import cv2
import numpy as np
from progress.bar import Bar
import time
import torch

try:
  from external.nms import soft_nms
except:
  print('NMS not imported! If you need it,'
        ' do \n cd $CenterNet_ROOT/src/lib/external \n make')
from models.decode import ctdet_decode
from models.utils import flip_tensor
from utils.image import get_affine_transform
from utils.post_process import ctdet_post_process
from utils.debugger import Debugger

from .base_detector import BaseDetector

class CtdetDetector(BaseDetector):
  def __init__(self, opt):
    super(CtdetDetector, self).__init__(opt)
  
  def process(self, images, return_time=False):
    with torch.no_grad():
      output = self.model(images)[-1]
      hm = output['hm'].sigmoid_()
      wh = output['wh']
      reg = output['reg'] if self.opt.reg_offset else None
      if self.opt.flip_test:
        hm = (hm[0:1] + flip_tensor(hm[1:2])) / 2
        wh = (wh[0:1] + flip_tensor(wh[1:2])) / 2
        reg = reg[0:1] if reg is not None else None
      torch.cuda.synchronize()
      forward_time = time.time()
      dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)
      
    if return_time:
      return output, dets, forward_time
    else:
      return output, dets

  def post_process(self, dets, meta, scale=1):
    dets = dets.detach().cpu().numpy()
    dets = dets.reshape(1, -1, dets.shape[2])
    dets = ctdet_post_process(
        dets.copy(), [meta['c']], [meta['s']],
        meta['out_height'], meta['out_width'], self.opt.num_classes)
    for j in range(1, self.num_classes + 1):
      dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 5)
      dets[0][j][:, :4] /= scale
    return dets[0]

  def merge_outputs(self, detections):
    results = {}
    for j in range(1, self.num_classes + 1):
      results[j] = np.concatenate(
        [detection[j] for detection in detections], axis=0).astype(np.float32)
      if len(self.scales) > 1 or self.opt.nms:
         soft_nms(results[j], Nt=0.5, method=2)
    scores = np.hstack(
      [results[j][:, 4] for j in range(1, self.num_classes + 1)])
    if len(scores) > self.max_per_image:
      kth = len(scores) - self.max_per_image
      thresh = np.partition(scores, kth)[kth]
      for j in range(1, self.num_classes + 1):
        keep_inds = (results[j][:, 4] >= thresh)
        results[j] = results[j][keep_inds]
    return results

  def debug(self, debugger, images, dets, output, scale=1):
    detection = dets.detach().cpu().numpy().copy()
    detection[:, :, :4] *= self.opt.down_ratio
    for i in range(1):
      img = images[i].detach().cpu().numpy().transpose(1, 2, 0)
      img = ((img * self.std + self.mean) * 255).astype(np.uint8)
      pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
      debugger.add_blend_img(img, pred, 'pred_hm_{:.1f}'.format(scale))
      debugger.add_img(img, img_id='out_pred_{:.1f}'.format(scale))
      for k in range(len(dets[i])):
        if detection[i, k, 4] > self.opt.center_thresh:
          debugger.add_coco_bbox(detection[i, k, :4], detection[i, k, -1],
                                 detection[i, k, 4], 
                                 img_id='out_pred_{:.1f}'.format(scale))

  def show_results(self, debugger, image, results):
    debugger.add_img(image, img_id='ctdet')
    for j in range(1, self.num_classes + 1):
      for bbox in results[j]:
        if bbox[4] > self.opt.vis_thresh:
          debugger.add_coco_bbox(bbox[:4], j - 1, bbox[4], img_id='ctdet')
    debugger.show_all_imgs(pause=self.pause)

Let's start with def process(self, images, return_time=False).

  def process(self, images, return_time=False):
    with torch.no_grad():
      output = self.model(images)[-1]
      hm = output['hm'].sigmoid_()
      wh = output['wh']
      reg = output['reg'] if self.opt.reg_offset else None
      if self.opt.flip_test: #False
        hm = (hm[0:1] + flip_tensor(hm[1:2])) / 2
        wh = (wh[0:1] + flip_tensor(wh[1:2])) / 2
        reg = reg[0:1] if reg is not None else None
      torch.cuda.synchronize()
      forward_time = time.time()
      dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)

First we take the predicted heatmap hm; since a sigmoid was applied during training, we also apply the sigmoid at inference time. Then we call dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K).

The main job of this function is to convert the heatmap into bounding boxes; an outline sketch follows, and its pieces are examined one by one below.
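In outline, ctdet_decode does roughly the following (simplified here from models/decode.py; the cat_spec_wh branch and some reshaping details are omitted, and it relies on the helpers examined next):

def ctdet_decode_sketch(heat, wh, reg=None, K=100):
    batch, cat, height, width = heat.size()
    heat = _nms(heat)                                   # keep only local peaks
    scores, inds, clses, ys, xs = _topk(heat, K=K)      # top-K peaks over all classes
    if reg is not None:                                 # refine centers with the offset head
      reg = _tranpose_and_gather_feat(reg, inds).view(batch, K, 2)
      xs = xs.view(batch, K, 1) + reg[:, :, 0:1]
      ys = ys.view(batch, K, 1) + reg[:, :, 1:2]
    else:
      xs = xs.view(batch, K, 1) + 0.5
      ys = ys.view(batch, K, 1) + 0.5
    wh = _tranpose_and_gather_feat(wh, inds).view(batch, K, 2)
    bboxes = torch.cat([xs - wh[..., 0:1] / 2, ys - wh[..., 1:2] / 2,
                        xs + wh[..., 0:1] / 2, ys + wh[..., 1:2] / 2], dim=2)
    return torch.cat([bboxes, scores.view(batch, K, 1),
                      clses.view(batch, K, 1).float()], dim=2)   # (batch, K, 6)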

  • First, the _nms function:
def _nms(heat, kernel=3):
    pad = (kernel - 1) // 2
 
    hmax = nn.functional.max_pool2d(
        heat, (kernel, kernel), stride=1, padding=pad)
    keep = (hmax == heat).float()
    return heat * keep

hmax finds the maxima over each 8-neighborhood; keep marks the positions where heat is a local maximum, and heat * keep is returned, so the peaks keep their original values and everything else becomes 0.
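A tiny toy example of this max-pooling trick (made-up numbers):

import torch
import torch.nn as nn

heat = torch.tensor([[[[0.1, 0.6, 0.2],
                       [0.3, 0.9, 0.4],
                       [0.2, 0.5, 0.1]]]])      # 1 x 1 x 3 x 3 heatmap
hmax = nn.functional.max_pool2d(heat, (3, 3), stride=1, padding=1)
keep = (hmax == heat).float()
print(heat * keep)   # only the local peak 0.9 survives; all other entries become 0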

  • Next, the _topk function:


def _topk(scores, K=40):
    batch, cat, height, width = scores.size()
      
    topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)
 
    topk_inds = topk_inds % (height * width)
    topk_ys   = (topk_inds / width).int().float()
    topk_xs   = (topk_inds % width).int().float()
      
    topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
    topk_clses = (topk_ind / K).int()
    topk_inds = _gather_feat(
        topk_inds.view(batch, -1, 1), topk_ind).view(batch, K)
    topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K)
    topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K)
 
    return topk_score, topk_inds, topk_clses, topk_ys, topk_xs

topk_scores: batch * cat * K, where batch is the batch size, cat the number of classes and K the number of largest values. topk_inds: batch * cat * K, with index values in [0, W x H - 1]. topk_scores and topk_inds are the top-K scores and their ids within each heatmap (each class) of each batch element. Taking the remainder and the integer division of topk_inds then gives the coordinates topk_ys and topk_xs. Next, within each batch element, the top K scores and ids over all heatmaps are taken: topk_score: batch * K, topk_ind: batch * K, with index values in [0, cat x K - 1]. Finally _gather_feat (defined in the utils file) is called on topk_inds (after the view) and topk_ind.

  • _gather_feat
def _gather_feat(feat, ind, mask=None):
    dim  = feat.size(2)
    ind  = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
    feat = feat.gather(1, ind)
    if mask is not None:
        mask = mask.unsqueeze(2).expand_as(feat)
        feat = feat[mask]
        feat = feat.view(-1, dim)
    return feat

Input:
feat (topk_inds): batch * (cat x K) * 1 (assuming the inputs are topk_inds and topk_ind), ind (topk_ind): batch * K. First ind is expanded with an extra axis to batch * K * 1, then gather picks out the values addressed by ind. The return is an index tensor:
feat: batch * K * 1, with values in [0, cat x K - 1]. More generally:
feat: A * B * C
ind: A * D
First ind is expanded with an extra axis and broadcast to the size dim, becoming A * D * C, where for any i, j all elements of ind[i, j, :] are identical and equal to ind[i, j] of the original A * D tensor. Then gather takes the corresponding values, giving feat: A * D * C.

  • scores, inds, clses, ys, xs = _topk(heat, K=K)
    _topk returns five tensors: topk_score, topk_inds, topk_clses, topk_ys, topk_xs.
    topk_score: batch * K, the K largest values in each image; topk_inds: batch * K, the indices of those K values, each in [0, W x H - 1]. The last two outputs are analogous.

    1. reg = _tranpose_and_gather_feat(reg, inds)
    2. wh = _tranpose_and_gather_feat(wh, inds)
    Both use inds to gather the corresponding reg and wh values.
  • Afterwards come post-processing steps such as rescaling, followed by soft_nms (soft-NMS removes redundant bboxes; based on the scores, the top 100 bboxes are finally kept for detection). The scores are then filtered so that detections beyond the maximum number of boxes per image (100) are discarded.
  def post_process(self, dets, meta, scale=1):
    dets = dets.detach().cpu().numpy()
    dets = dets.reshape(1, -1, dets.shape[2])
    dets = ctdet_post_process(
        dets.copy(), [meta['c']], [meta['s']],
        meta['out_height'], meta['out_width'], self.opt.num_classes)
    for j in range(1, self.num_classes + 1):
      dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 5)
      dets[0][j][:, :4] /= scale
    return dets[0]
  def merge_outputs(self, detections):
    results = {}
    for j in range(1, self.num_classes + 1):
      results[j] = np.concatenate(
        [detection[j] for detection in detections], axis=0).astype(np.float32)
      if len(self.scales) > 1 or self.opt.nms:
         soft_nms(results[j], Nt=0.5, method=2)
    scores = np.hstack(
      [results[j][:, 4] for j in range(1, self.num_classes + 1)])
    if len(scores) > self.max_per_image:
      kth = len(scores) - self.max_per_image
      thresh = np.partition(scores, kth)[kth]
      for j in range(1, self.num_classes + 1):
        keep_inds = (results[j][:, 4] >= thresh)
        results[j] = results[j][keep_inds]
    return results
  def run(self, image_or_path_or_tensor, meta=None):
    load_time, pre_time, net_time, dec_time, post_time = 0, 0, 0, 0, 0
    merge_time, tot_time = 0, 0
    debugger = Debugger(dataset=self.opt.dataset, ipynb=(self.opt.debug==3),
                        theme=self.opt.debugger_theme)
    start_time = time.time()
    pre_processed = False
    if isinstance(image_or_path_or_tensor, np.ndarray):
      image = image_or_path_or_tensor
    elif type(image_or_path_or_tensor) == type (''): 
      image = cv2.imread(image_or_path_or_tensor)
    else:
      image = image_or_path_or_tensor['image'][0].numpy()
      pre_processed_images = image_or_path_or_tensor
      pre_processed = True
    
    loaded_time = time.time()
    load_time += (loaded_time - start_time)
    
    detections = []
    for scale in self.scales:
      scale_start_time = time.time()
      if not pre_processed:
        images, meta = self.pre_process(image, scale, meta)
      else:
        # import pdb; pdb.set_trace()
        images = pre_processed_images['images'][scale][0]
        meta = pre_processed_images['meta'][scale]
        meta = {k: v.numpy()[0] for k, v in meta.items()}
      images = images.to(self.opt.device)
      torch.cuda.synchronize()
      pre_process_time = time.time()
      pre_time += pre_process_time - scale_start_time
      
      output, dets, forward_time = self.process(images, return_time=True)

      torch.cuda.synchronize()
      net_time += forward_time - pre_process_time
      decode_time = time.time()
      dec_time += decode_time - forward_time
      
      if self.opt.debug >= 2:
        self.debug(debugger, images, dets, output, scale)
      
      dets = self.post_process(dets, meta, scale)
      torch.cuda.synchronize()
      post_process_time = time.time()
      post_time += post_process_time - decode_time

      detections.append(dets)
    
    results = self.merge_outputs(detections)
    torch.cuda.synchronize()
    end_time = time.time()
    merge_time += end_time - post_process_time
    tot_time += end_time - start_time

    if self.opt.debug >= 1:
      self.show_results(debugger, image, results)
    
    return {'results': results, 'tot': tot_time, 'load': load_time,
            'pre': pre_time, 'net': net_time, 'dec': dec_time,
            'post': post_time, 'merge': merge_time}

For more on soft-NMS you can refer to the article 论文阅读: Soft-NMS.


The principle of traditional NMS:

1. Sort the candidate boxes by their class confidence; suppose there are 4 bboxes with confidences A > B > C > D.

2. Mark the highest-confidence box A as a box the algorithm keeps.

3. Starting from A, check whether the IoU (intersection over union) of B, C and D with A exceeds a preset threshold (e.g. 0.5); if, say, D's overlap with A exceeds the threshold, D is discarded.

4. From the remaining boxes B and C, pick the highest-confidence one, B, mark it as kept, then check C's overlap with B and drop it if the overlap exceeds the threshold.

5. Repeat until every box to be kept has been marked.

Drawbacks of traditional NMS:

1. The biggest problem with NMS is that it forces the scores of neighboring detection boxes to zero (i.e. any box whose overlap exceeds the threshold Nt is removed). If a real object actually appears in that overlap region, its detection fails, which lowers the average precision (AP).

2. The NMS threshold is also hard to choose: set it too low and correct boxes are deleted by mistake, set it too high and false detections increase.

3. NMS is generally computed on the CPU and cannot make use of the GPU.
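By contrast, soft-NMS decays the scores of overlapping boxes instead of removing them. A simplified sketch of the Gaussian decay used by soft_nms(..., method=2) above (this version keeps a fixed processing order and does not prune low scores, unlike the real implementation):

import numpy as np

def soft_nms_scores(boxes, scores, sigma=0.5):
    # boxes: (N, 4) as [x1, y1, x2, y2]; returns the decayed scores
    order = scores.argsort()[::-1]
    boxes, scores = boxes[order].copy(), scores[order].copy()
    for i in range(len(boxes) - 1):
        x1 = np.maximum(boxes[i, 0], boxes[i + 1:, 0])
        y1 = np.maximum(boxes[i, 1], boxes[i + 1:, 1])
        x2 = np.minimum(boxes[i, 2], boxes[i + 1:, 2])
        y2 = np.minimum(boxes[i, 3], boxes[i + 1:, 3])
        inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        area_rest = (boxes[i + 1:, 2] - boxes[i + 1:, 0]) * (boxes[i + 1:, 3] - boxes[i + 1:, 1])
        iou = inter / (area_i + area_rest - inter)
        scores[i + 1:] *= np.exp(-(iou ** 2) / sigma)   # Gaussian penalty instead of hard removal
    return scores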

This post is meant as a set of notes on understanding the model code, for my own later review; if there are any problems, readers are welcome to point them out.
References:

  1. cvpr2018 Deep Layer Aggregation(DLANet)
  2. 【代码】CenterNet代码解析
  3. 论文阅读: Soft-NMS
