如何使用nuScenes数据集格式的单帧数据推理(以DETR3D为例)

【请尊重原创!转载和引用文章内容务必注明出处!未经许可上传到某文库或其他收费阅读/下载网站赚钱的必追究责任!】

无论是mmdetection3D还是OpenPCDet都只有使用数据集(使用哪个数据集由配置文件里指定)训练和测试的代码,没有使用某种数据集格式的单帧数据进行推理的代码(也就是一些2D目标检测框里里提供的推理demo代码),而OpenPCDet尤其是mmdetection里面为了支持多种不同的数据集和多种不同模型的训练和测试,把很多实现步骤高度配置化,这是好事也是坏事,如果你只是使用这些框架(尤其是使用它们已支持的公开数据集)进行训练和测试,那么相对简单,如果是要使用这些框架对单帧数据(可以是他们支持的公开数据集里的数据或者遵循这些数据集的格式自己制作的数据集的数据)进行推理,就比较复杂了,框架没有提供工具代码只能自己去写,写之前当然得把框架的相关代码翻看几遍把支持某个数据集和模型的配置文件和分散在多处的代码的逻辑理顺连贯起来,不然没法下手,因为框架高度配置化的话相关的代码就会非常分散在连贯性可读性方面很差,需要花时间先理清楚,然后把那些配置化的相关零散代码都整理并修改和串联起来,完成数据的多步预处理、调用模型推理、对推理结果的解析和后处理的全部流程的实现。

用nuScense格式数据调用detr3d模型的一个最烦的地方就是,你在调用模型推理时非得提供对应的img_metas数据,模型的这个设计我觉得不是很合理,起码不算友好,对于硬件和相关标定参数定了后,这种参数完全可以通过配置隐式提供给模型,为何每次调用都得提供这些meta参数呢!反正我写下面的推理代码之前是浪费了不少时间在厘清img_metas这个参数的数据该怎么准备上!

此处已DETR3D最初的官方实现代码GitHub - WangYueFt/detr3d基础给出了相关数据预处理和模型调用及推理结果后处理等代码,另外,这些代码是用于在ROS2节点中运行,解析出的推理结果还按项目实际要求转换成autoware的ObjectDetection数据格式后发布,所以还包含了ROS2和autoware相关的代码,这些代码不是重点仅供做代码逻辑完整性展示。当然,如果你项目中有类似需要的话,可以直接拷贝使用这些代码。

import rclpy
from rclpy.node import Node
from rclpy.executors import MultiThreadedExecutor
from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup

from cv_bridge import CvBridge
from sensor_msgs.msg import CompressedImage, Image
from std_msgs.msg import String
from autoware_auto_perception_msgs.msg import DetectedObjects
from autoware_auto_perception_msgs.msg import DetectedObject
from autoware_auto_perception_msgs.msg import ObjectClassification
from autoware_auto_perception_msgs.msg import DetectedObjectKinematics
from autoware_auto_perception_msgs.msg import Shape

import numpy as np
import mmcv
import sys, os
import torch
import warnings
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
                         wrap_fp16_model)

from mmdet3d.models import build_model
from mmdet.apis import multi_gpu_test, set_random_seed
from mmdet.datasets import replace_ImageToTensor
from mmdet3d.core.bbox.structures.box_3d_mode import (Box3DMode, CameraInstance3DBoxes,
                              DepthInstance3DBoxes, LiDARInstance3DBoxes)

from nuscenes import NuScenes
from pyquaternion import Quaternion
from geometry_msgs.msg import Point
from geometry_msgs.msg import Vector3
from trimesh.transformations import quaternion_from_euler
from geometry_msgs.msg import Quaternion as GeoQuaternion
from geometry_msgs.msg import Twist
import math
import time

class DetectionPublisher(Node):

    def __init__(self):
        super().__init__('DetectionPublisher_python')
        
        self.publisher_ = self.create_publisher(String, 'example_topic', 10)
        timer_period = 0.5  # seconds
        self.timer = self.create_timer(timer_period, self.timer_callback)
        self.i = 0
        

    def timer_callback(self):
        msg = String()
        msg.data = 'Hello World: %d' % self.i
        self.publisher_.publish(msg)
        self.get_logger().info('Publishing: "%s"' % msg.data)
        self.i += 1

class PadMultiViewImage(object):
    """Pad the multi-view image.
    There are two padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number.
    Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value, 0 by default.
    """

    def __init__(self, size=None, size_divisor=32, pad_val=0):
        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        # only one of size and size_divisor should be valid
        assert size is not None or size_divisor is not None
        assert size is None or size_divisor is None

    def _pad_img(self, results):
        """Pad images according to ``self.size``."""
        if self.size is not None:
            padded_img = [mmcv.impad(
                img, shape=self.size, pad_val=self.pad_val) for img in results['img']]
        elif self.size_divisor is not None:
            padded_img = [mmcv.impad_to_multiple(
                img, self.size_divisor, pad_val=self.pad_val) for img in results['img']]
        results['img'] = padded_img
        results['img_shape'] = [img.shape for img in padded_img]
        results['pad_shape'] = [img.shape for img in padded_img]
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def __call__(self, results):
        """Call function to pad images, masks, semantic segmentation maps.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        self._pad_img(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(size={self.size}, '
        repr_str += f'size_divisor={self.size_divisor}, '
        repr_str += f'pad_val={self.pad_val})'
        return repr_str

class NormalizeMultiviewImage(object):
    """Normalize the image.
    Added key is "img_norm_cfg".
    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=True):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_rgb = to_rgb

    def __call__(self, results):
        """Call function to normalize images.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Normalized results, 'img_norm_cfg' key is added into
                result dict.
        """
        results['img'] = [mmcv.imnormalize(
            img, self.mean, self.std, self.to_rgb) for img in results['img']]
        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
        return repr_str

class LoadSingleViewImageFromFiles(object):
    """Load multi channel images from a list of separate channel files.

    Expects results['img_filename'] to be a list of filenames.

    Args:
        to_float32 (bool): Whether to convert the img to float32.
            Defaults to False.
        color_type (str): Color type of the file. Defaults to 'unchanged'.
    """

    def __init__(self, to_float32=True, color_type='unchanged'):
        self.to_float32 = to_float32
        self.color_type = color_type

    def __call__(self, results):
        """Call function to load multi-view image from files.

        Args:
            results (dict): Result dict containing multi-view image filenames.

        Returns:
            dict: The result dict containing the multi-view image data. \
                Added keys and values are described below.

                - filename (str): Multi-view image filenames.
                - img (np.ndarray): Multi-view image arrays.
                - img_shape (tuple[int]): Shape of multi-view image arrays.
                - ori_shape (tuple[int]): Shape of original image arrays.
                - pad_shape (tuple[int]): Shape of padded image arrays.
                - scale_factor (float): Scale factor.
                - img_norm_cfg (dict): Normalization configuration of images.
        """
        results['filename'] = 'sample.jpg'
        # h,w,c   => h, w, c, nv 
        img = np.stack([results['img']], axis=-1)
        if self.to_float32:
            img = img.astype(np.float32)
        results['img'] = [img[..., i] for i in range(img.shape[-1])]
        results['img_shape'] = img.shape
        results['ori_shape'] = img.shape
        # Set initial values for default meta_keys
        results['pad_shape'] = img.shape
        results['scale_factor'] = 1.0
        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
        results['img_norm_cfg'] = dict(
            mean=np.zeros(num_channels, dtype=np.float32),
            std=np.ones(num_channels, dtype=np.float32),
            to_rgb=False)
        return results

    def __repr__(self):
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(to_float32={self.to_float32}, '
        repr_str += f"color_type='{self.color_type}')"
        return repr_str

def obtain_sensor2top(nusc,
                      sensor_token,
                      l2e_t,
                      l2e_r_mat,
                      e2g_t,
                      e2g_r_mat,
                      sensor_type='lidar'):
    """Obtain the info with RT matric from general sensor to Top LiDAR.

    Args:
        nusc (class): Dataset class in the nuScenes dataset.
        sensor_token (str): Sample data token corresponding to the
            specific sensor type.
        l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
        l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
            in shape (3, 3).
        e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
        e2g_r_mat (np.ndarray): Rotation matrix from ego to global
            in shape (3, 3).
        sensor_type (str, optional): Sensor to calibrate. Default: 'lidar'.

    Returns:
        sweep (dict): Sweep information after transformation.
    """
    sd_rec = nusc.get('sample_data', sensor_token)
    cs_record = nusc.get('calibrated_sensor',
                         sd_rec['calibrated_sensor_token'])
    pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
    data_path = str(nusc.get_sample_data_path(sd_rec['token']))
    if os.getcwd() in data_path:  # path from lyftdataset is absolute path
        data_path = data_path.split(f'{os.getcwd()}/')[-1]  # relative path
    sweep = {
        'data_path': data_path,
        'type': sensor_type,
        'sample_data_token': sd_rec['token'],
        'sensor2ego_translation': cs_record['translation'],
        'sensor2ego_rotation': cs_record['rotation'],
        'ego2global_translation': pose_record['translation'],
        'ego2global_rotation': pose_record['rotation'],
        'timestamp': sd_rec['timestamp']
    }
    l2e_r_s = sweep['sensor2ego_rotation']
    l2e_t_s = sweep['sensor2ego_translation']
    e2g_r_s = sweep['ego2global_rotation']
    e2g_t_s = sweep['ego2global_translation']

    # obtain the RT from sensor to Top LiDAR
    # sweep->ego->global->ego'->lidar
    l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
    e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
    R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
        np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
    T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
        np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
    T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
                  ) + l2e_t @ np.linalg.inv(l2e_r_mat).T
    sweep['sensor2lidar_rotation'] = R.T  # points @ R.T + T
    sweep['sensor2lidar_translation'] = T
    return sweep

def getSemanticType(class_name):
    if (class_name == "CAR" or class_name == "Car"):
       return ObjectClassification.CAR
    elif (class_name == "TRUCK" or class_name == "Medium_Truck" or class_name =="Big_Truck"):
       return ObjectClassification.TRUCK
    elif (class_name == "BUS"):
       return ObjectClassification.BUS
    elif (class_name == "TRAILER"):
       return ObjectClassification.TRAILER
    elif (class_name == "BICYCLE"):
       return ObjectClassification.BICYCLE
    elif (class_name == "MOTORBIKE"):
       return ObjectClassification.MOTORCYCLE
    elif (class_name == "PEDESTRIAN" or class_name == "Pedestrian"):
       return ObjectClassification.PEDESTRIAN
    else: 
       return ObjectClassification.UNKNOWN


class CustomBox3D(object):
  def __init__(self,nid,score,x,y,z,w,l,h,rt,vel_x,vel_y):
      self.id = nid
      self.score = score
      self.x = x
      self.y = y
      self.z = z
      self.w = w
      self.l = l
      self.h = h
      self.rt = rt
      self.vel_x = vel_x
      self.vel_y = vel_y

def isCarLikeVehicle(label):
   return label == ObjectClassification.BICYCLE or label == ObjectClassification.BUS or \
         label == ObjectClassification.CAR or label == ObjectClassification.MOTORCYCLE or \
         label == ObjectClassification.TRAILER or label == ObjectClassification.TRUCK 

def createPoint(x, y, z):
  p = Point()
  p.x = float(x)
  p.y = float(y)
  p.z = float(z)
  return p

def createQuaternionFromYaw(yaw):
  # tf2.Quaternion
  # q.setRPY(0, 0, yaw)
  q = quaternion_from_euler(0, 0, yaw)
  # geometry_msgs.msg.Quaternion
  #return tf2.toMsg(q)
  #return GeoQuaternion(*q)
  return GeoQuaternion(x=q[0],y=q[1],z=q[2],w=q[3])

def createTranslation(x, y, z):
  v = Vector3()
  v.x = float(x)
  v.y = float(y)
  v.z = float(z)
  return v

def box3DToDetectedObject(box3d, class_names, has_twist, is_sign):
  obj = DetectedObject()
  obj.existence_probability = float(box3d.score)

  classification = ObjectClassification()
  classification.probability = 1.0
  if (box3d.id >= 0 and box3d.id < len(class_names)):
    classification.label = getSemanticType(class_names[box3d.id])
  else: 
    if is_sign:
      sign_label = 255
      classification.label = sign_label
    else:
      classification.label = ObjectClassification.UNKNOWN
      print("Unexpected label: UNKNOWN is set.")
  
  if (isCarLikeVehicle(classification.label)):
    obj.kinematics.orientation_availability = DetectedObjectKinematics.SIGN_UNKNOWN

  obj.classification.append(classification)

  # pose and shape
  # mmdet3d yaw format to ros yaw format
  yaw = -box3d.rt - np.pi / 2
  obj.kinematics.pose_with_covariance.pose.position = createPoint(box3d.x, box3d.y, box3d.z)
  obj.kinematics.pose_with_covariance.pose.orientation = createQuaternionFromYaw(yaw)
  obj.shape.type = Shape.BOUNDING_BOX
  obj.shape.dimensions = createTranslation(box3d.l, box3d.w, box3d.h)
  # twist
  if (has_twist):
    vel_x = float(box3d.vel_x)
    vel_y = float(box3d.vel_y)
    twist = Twist()
    twist.linear.x = math.sqrt(pow(vel_x, 2) + pow(vel_y, 2))
    twist.angular.z = 2 * (math.atan2(vel_y, vel_x) - yaw)
    obj.kinematics.twist_with_covariance.twist = twist
    obj.kinematics.has_twist = has_twist
  return obj  


class ImageSubscriber(Node):

    def __init__(self):
        super().__init__('ImageSubscriber_python')
        cb_group = MutuallyExclusiveCallbackGroup()
        self.img_sub = self.create_subscription(
            CompressedImage,
            'pub_image/compressed',
            self.image_callback,
            10,
            callback_group=cb_group)
        self.img_sub
        self.od_pub = self.create_publisher(DetectedObjects, 'pub_detection', 10)
        self.cvBridge  = CvBridge()
        self.pad = PadMultiViewImage()
        self.norm = NormalizeMultiviewImage()
        self.file_loader = LoadSingleViewImageFromFiles() 
        config_path = "./detr3d_res101_gridmask_wst.py"
        self.cfg = Config.fromfile(config_path)
        if self.cfg.get('custom_imports', None):
           from mmcv.utils import import_modules_from_strings
           import_modules_from_strings(**self.cfg['custom_imports'])
        if hasattr(self.cfg, 'plugin'):
           if self.cfg.plugin:
             import importlib
             if hasattr(self.cfg, 'plugin_dir'):
                plugin_dir = self.cfg.plugin_dir
                _module_dir = os.path.dirname(plugin_dir)
                _module_dir = _module_dir.split('/')
                _module_path = _module_dir[0]

                for m in _module_dir[1:]:
                    _module_path = _module_path + '.' + m
                print(_module_path)
                print(sys.path)
                plg_lib = importlib.import_module(_module_path)
        if self.cfg.get('cudnn_benchmark', False):
           torch.backends.cudnn.benchmark = True
        self.cfg.model.pretrained = None
        self.cfg.model.train_cfg = None
        self.model = build_model(self.cfg.model, test_cfg=self.cfg.get('test_cfg'))
        fp16_cfg = self.cfg.get('fp16', None)
        if fp16_cfg is not None:
           wrap_fp16_model(self.model)
        checkpoint = load_checkpoint(self.model, "epoch_200.pth", map_location='cpu')
        
        if 'CLASSES' in checkpoint.get('meta', {}):
           self.model.CLASSES = checkpoint['meta']['CLASSES']
        else:
           self.model.CLASSS = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
               'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
               'barrier')
        # palette for visualization in segmentation tasks
        if 'PALETTE' in checkpoint.get('meta', {}):
           self.model.PALETTE = checkpoint['meta']['PALETTE']
        self.model.cfg  = self.cfg 

        self.model.cuda()
        self.model.eval()
        #if torch.cuda.device_count() > 1:  # for server side
        #   self.model = nn.DataParallel(self.model)
        print("model is created!")

        nusc = NuScenes(version='v1.0-mini', dataroot='nuscenes_mini', verbose=True)
        scene0 = nusc.scene[0]
        first_sample_token = scene0['first_sample_token']
        first_sample = nusc.get('sample', first_sample_token)
        sd_rec = nusc.get('sample_data', first_sample['data']['LIDAR_TOP'])
        cs_record = nusc.get('calibrated_sensor',
                             sd_rec['calibrated_sensor_token'])
        pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
        lidar_token = first_sample['data']['LIDAR_TOP']
        lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
        info = {
            'lidar_path': lidar_path,
            'token': first_sample['token'],
            'sweeps': [],
            'cams': dict(),
            'lidar2ego_translation': cs_record['translation'],
            'lidar2ego_rotation': cs_record['rotation'],
            'ego2global_translation': pose_record['translation'],
            'ego2global_rotation': pose_record['rotation'],
            'timestamp': first_sample['timestamp'],
        }

        l2e_r = info['lidar2ego_rotation']
        l2e_t = info['lidar2ego_translation']
        e2g_r = info['ego2global_rotation']
        e2g_t = info['ego2global_translation']
        l2e_r_mat = Quaternion(l2e_r).rotation_matrix
        e2g_r_mat = Quaternion(e2g_r).rotation_matrix
        camera_types = [
            'CAM_FRONT',
            'CAM_FRONT_RIGHT',
            'CAM_FRONT_LEFT',
            'CAM_BACK',
            'CAM_BACK_LEFT',
            'CAM_BACK_RIGHT',
        ]
        for cam in camera_types:
            cam_token = first_sample['data'][cam]
            cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
            cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat,
                                         e2g_t, e2g_r_mat, cam)
            cam_info.update(cam_intrinsic=cam_intrinsic)
            info['cams'].update({cam: cam_info})

        '''
        cam_front_sample_data = nusc.get('sample_data', first_sample['data']['CAM_FRONT'])
        cam_front_sample_path = os.path.join(nusc.dataroot, cam_front_sample_data['filename'])
        print("sample image file:", cam_front_sample_path)
        cam_front_calibrate = nusc.get('calibrated_sensor', cam_front_sample_data['calibrated_sensor_token'])
        sensor2lidar = translation = np.expand_dims(np.array(cam_front_calibrate['translation']), axis=-1)
        sensor2lidar_rotation = np.expand_dims(np.array(cam_front_calibrate['rotation']), axis=-1)
        camera_intrinsic = np.array(cam_front_calibrate['camera_intrinsic'])
        '''

        image_paths = []
        lidar2img_rts = []
        lidar2cam_rts = []
        cam_intrinsics = []
        for cam_type, cam_info in info['cams'].items():
            image_paths.append(cam_info['data_path'])
             # obtain lidar to image transformation matrix
            lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
            lidar2cam_t = cam_info[
                'sensor2lidar_translation'] @ lidar2cam_r.T
            lidar2cam_rt = np.eye(4)
            lidar2cam_rt[:3, :3] = lidar2cam_r.T
            lidar2cam_rt[3, :3] = -lidar2cam_t
            intrinsic = cam_info['cam_intrinsic']
            viewpad = np.eye(4)
            viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
            lidar2img_rt = (viewpad @ lidar2cam_rt.T)
            lidar2img_rts.append(lidar2img_rt)

            cam_intrinsics.append(viewpad)
            lidar2cam_rts.append(lidar2cam_rt.T)

        self.img_metas = {}
        self.img_metas.update(
                dict(
                    img_filename=image_paths,
                    lidar2img=lidar2img_rts,
                    cam_intrinsic=cam_intrinsics,
                    lidar2cam=lidar2cam_rts,
                ))
        self.class_names_ = self.cfg.class_names
        print("ImageSubscriber init done")
 
    
    def image_callback(self, msg):
        #image = self.cvBridge.imgmsg_to_cv2(msg, "bgr8")
        image = self.cvBridge.compressed_imgmsg_to_cv2(msg, "bgr8")
        #print("image received, shape:", image.shape)
        results = {'img': image}
        self.file_loader(results)
        self.norm(results)
        self.pad(results)
        image = results['img'][0]
        meta = {'filename': results['filename'], 
                'img_shape': results['img_shape'],
                'ori_shape': results['ori_shape'],
                'pad_shape': results['pad_shape'],
                'scale_factor': results['scale_factor'],
                'box_type_3d': LiDARInstance3DBoxes,     #CameraInstance3DBoxes/LiDARInstance3DBoxes
                'box_mode_3d': Box3DMode.LIDAR}
        meta.update(self.img_metas)
        #print("meta:", meta)
        img_metas = [[meta]]
        inputs = torch.tensor(image).to('cuda')
        # h,w,c => bs,nv,c,h,w
        inputs = inputs.permute(2,0,1)
        inputs = torch.unsqueeze(inputs, 0)
        inputs = torch.unsqueeze(inputs, 0)
        #print("input tensor shape:", inputs.shape)
        with torch.no_grad():
           outputs = self.model(return_loss=False, rescale=True, img=inputs, img_metas=img_metas) 
           torch.cuda.synchronize()
           pts_bbox = outputs[0]['pts_bbox']
           boxes_3d_enc = pts_bbox['boxes_3d']
           scores_3d = pts_bbox['scores_3d']
           labels_3d = pts_bbox['labels_3d']

           filter = scores_3d >= 0.5
           boxes_3d_enc.tensor = boxes_3d_enc.tensor[filter]
           boxes_3d = boxes_3d_enc.tensor.numpy() # [[cx, cy, cz, w, l, h, rot, vx, vy]]
           scores_3d = scores_3d[filter].numpy()
           labels_3d = labels_3d[filter].numpy()
           custom_boxes_3d = []
           for i, box_3d in enumerate(boxes_3d):
              box3d = CustomBox3D(labels_3d[i], scores_3d[i],
                                   box_3d[0],box_3d[1],box_3d[2],
                                   box_3d[3],box_3d[4],box_3d[5],
                                   box_3d[6],box_3d[7],box_3d[8])
              custom_boxes_3d.append(box3d)
               
           #print("boxes_3d", boxes_3d)
           #print("scores_3d", scores_3d)
           #print("labels_3d", labels_3d)
           output_msg = DetectedObjects()
           obj_num = len(boxes_3d)
           for i, box3d in enumerate(custom_boxes_3d):
              obj = box3DToDetectedObject(box3d, self.class_names_, True, False);
              output_msg.objects.append(obj)

           output_msg.header.stamp = self.get_clock().now().to_msg() #rclpy.time.Time()
           output_msg.header.frame_id = "base_link"
           self.od_pub.publish(output_msg)
           print(obj_num, "Objects published")


def main(args=None):
    rclpy.init(args=args)

    sys.path.append(os.getcwd())
    image_subscriber = ImageSubscriber()
    executor = MultiThreadedExecutor()
    executor.add_node(image_subscriber)
    executor.spin()

    image_subscriber.destroy_node()
    detection_publisher.destroy_node()
    rclpy.shutdown()


if __name__ == '__main__':
    main()

这里说的单帧数据可以是单张图片或者nuScenes格式的6张环视图片,之所以说一帧,是对应点云的概念,实际做3D目标检测尤其是雷视融合推理时一般都是以点云帧的频率为时间单位进行推理(在获取一帧点云的同时获取一张或多张图片(取决于有几个摄像头),前融合的话是将点云和图像的数据做数据融合后输入模型或直接输入模型由模型内部抽取特征做特征融合,后融合的话是将点云和图像分别输入不同的模型推理,再对模型各自的结果进行融合处理)。

上面的代码由于是以DETR3D官方代码为基础的,所以和mmdetection3D后来集成DETR3D的实现代码不同,对比两者代码可以发现,mmdetection3D将DETR3D官方代码里的一些数据预处理环节的实现代码都移走了,在模型内部单独提供了一个Det3DDataPreprocessor类来进行一些公共处理,总体的配置文件和实现思路还是比较类似,毕竟DETR3D官方是在mmdetection3D的早期版本基础上开发的,例如两者的detr3d_r101_gridmask.py配置文件里面有些配置不同,但是思路是相似的:

mmdetection3d/projects/DETR3D/configs/detr3d_r101_gridmask.py

model = dict(
    type='DETR3D',
    use_grid_mask=True,
    data_preprocessor=dict(
        type='Det3DDataPreprocessor', **img_norm_cfg, pad_size_divisor=32),
    img_backbone=dict(
        type='mmdet.ResNet',
        depth=101,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        style='caffe',
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        stage_with_dcn=(False, False, True, True)),
    img_neck=dict(
        type='mmdet.FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=1,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='DETR3DHead',
        num_query=900,
        num_classes=10,
        in_channels=256,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='Detr3DTransformer',
            decoder=dict(
                type='Detr3DTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='mmdet.DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',  # mmcv.
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='Detr3DCrossAtten',
                            pc_range=point_cloud_range,
                            num_points=1,
                            embed_dims=256)
                    ],
                    feedforward_channels=512,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='mmdet.SinePositionalEncoding',
            num_feats=128,
            normalize=True,
            offset=-0.5),
        loss_cls=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25),
        loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)),
    # model training and testing settings
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # ↓ Fake cost. This is just to get compatible with DETR head
                iou_cost=dict(type='mmdet.IoUCost', weight=0.0),
                pc_range=point_cloud_range))))

dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'

test_transforms = [
    dict(
        type='RandomResize3D',
        scale=(1600, 900),
        ratio_range=(1., 1.),
        keep_ratio=True)
]
train_transforms = [dict(type='PhotoMetricDistortion3D')] + test_transforms

backend_args = None
train_pipeline = [
    dict(
        type='LoadMultiViewImageFromFiles',
        to_float32=True,
        num_views=6,
        backend_args=backend_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='MultiViewWrapper', transforms=train_transforms),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
]

test_pipeline = [
    dict(
        type='LoadMultiViewImageFromFiles',
        to_float32=True,
        num_views=6,
        backend_args=backend_args),
    dict(type='MultiViewWrapper', transforms=test_transforms),
    dict(type='Pack3DDetInputs', keys=['img'])
]

detr3d/projects/configs/detr3d/detr3d_res101_gridmask.py

model = dict(
    type='Detr3D',
    use_grid_mask=True,
    img_backbone=dict(
        type='ResNet',
        depth=101,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        style='caffe',
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        stage_with_dcn=(False, False, True, True)),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=1,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='Detr3DHead',
        num_query=900,
        num_classes=10,
        in_channels=256,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='Detr3DTransformer',
            decoder=dict(
                type='Detr3DTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='Detr3DCrossAtten',
                            pc_range=point_cloud_range,
                            num_points=1,
                            embed_dims=256)
                    ],
                    feedforward_channels=512,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10), 
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            normalize=True,
            offset=-0.5),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    # model training and testing settings
    train_cfg=dict(pts=dict(
        grid_size=[512, 512, 1],
        voxel_size=voxel_size,
        point_cloud_range=point_cloud_range,
        out_size_factor=4,
        assigner=dict(
            type='HungarianAssigner3D',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
            pc_range=point_cloud_range))))

dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'

...

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
]
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='Collect3D', keys=['img'])
        ])
]

你可能感兴趣的:(detr3d,nuScenes,mmdetection3D,autoware,ROS2)