nuScenes dataset official website: https://www.nuscenes.org/overview
Annotation format:
The official annotations consist of 15 JSON files in total; see the official data annotation documentation. Below, only the information used for 3D detection is described.
You can also refer to the separate write-up on the nuScenes dataset annotation format.
ego_pose.json contains 4 keys:
1. token: unique identifier;
2. timestamp: Unix timestamp, presumably recorded when the table entry was written; it appears to correspond one-to-one with the suffix of the image file names, though I have not verified this in detail;
3. rotation: ego-vehicle extrinsics, as a quaternion;
4. translation: ego-vehicle extrinsics, a translation vector in meters.
The extrinsics of the ego vehicle, and of the other vehicles in the image (sample_annotation.json), are given in the world coordinate system. The world origin is defined by the lidar/radar and follows no particular convention, so to get another vehicle's coordinates in the camera frame you have to chain these three sets of extrinsics (ego, camera, sample).
calibrated_sensor.json: from this file we obtain the camera's translation, rotation and camera_intrinsic:
calibrated_sensor {
"token": <str> -- Unique record identifier.
"sensor_token": <str> -- Foreign key pointing to the sensor type.
"translation": <float> [3] -- Coordinate system origin in meters: x, y, z.
"rotation": <float> [4] -- Coordinate system orientation as quaternion: w, x, y, z.
"camera_intrinsic": <float> [3, 3] -- Intrinsic camera calibration. Empty for sensors that are not cameras.
}
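As a minimal sketch of the transform chain described above (world → ego → camera), assuming the pyquaternion package and dict records loaded from ego_pose.json and calibrated_sensor.json (the variable names here are illustrative, not from the original code):
import numpy as np
from pyquaternion import Quaternion

def world_to_camera(point_world, ego_record, cs_record):
    # world frame -> ego vehicle frame (ego_pose.json)
    p = np.array(point_world) - np.array(ego_record['translation'])
    p = Quaternion(ego_record['rotation']).inverse.rotate(p)
    # ego vehicle frame -> camera frame (calibrated_sensor.json)
    p = p - np.array(cs_record['translation'])
    return Quaternion(cs_record['rotation']).inverse.rotate(p)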
sample_data.json: from this file we obtain the original image's width, height and filename.
sample_data {
"token": <str> -- Unique record identifier.
"sample_token": <str> -- Foreign key. Sample to which this sample_data is associated.
"ego_pose_token": <str> -- Foreign key.
"calibrated_sensor_token": <str> -- Foreign key.
"filename": <str> -- Relative path to data-blob on disk.
"fileformat": <str> -- Data file format.
"width": <int> -- If the sample data is an image, this is the image width in pixels.
"height": <int> -- If the sample data is an image, this is the image height in pixels.
"timestamp": <int> -- Unix time stamp.
"is_key_frame": <bool> -- True if sample_data is part of key_frame, else False.
"next": <str> -- Foreign key. Sample data from the same sensor that follows this in time. Empty if end of scene.
"prev": <str> -- Foreign key. Sample data from the same sensor that precedes this in time. Empty if start of scene.
}
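For example, the records above can be pulled together for one camera image roughly like this (a sketch assuming a NuScenes devkit instance nusc and a sample_data token image_token):
sd_record = nusc.get('sample_data', image_token)                  # filename, width, height
cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])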
sample_annotation.json:
rotation: a quaternion, from which ret['alpha'] is computed.
translation: the bbox center.
size: the bbox w, l, h.
sample_annotation {
"translation": <float> [3] -- Bounding box location in meters as center_x, center_y, center_z.
"size": <float> [3] -- Bounding box size in meters as width, length, height.
"rotation": <float> [4] -- Bounding box orientation as quaternion: w, x, y, z.
Fields that can be read directly are not repeated here; only the quantities that need to be computed are covered.
_, boxes, camera_intrinsic = nusc.get_sample_data(
image_token, box_vis_level=BoxVisibility.ANY)
calib = np.eye(4, dtype=np.float32)
calib[:3, :3] = camera_intrinsic
calib = calib[:3]  # keep the top 3 rows: a 3x4 projection matrix
def get_sample_data(self, sample_data_token: str,
box_vis_level: BoxVisibility = BoxVisibility.ANY,
selected_anntokens: List[str] = None,
use_flat_vehicle_coordinates: bool = False) -> \
Tuple[str, List[Box], np.array]:
"""
Returns the data path as well as all annotations related to that sample_data.
Note that the boxes are transformed into the current sensor's coordinate frame.
:param sample_data_token: Sample_data token.
:param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
:param selected_anntokens: If provided only return the selected annotation.
:param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
aligned to z-plane in the world.
:return: (data_path, boxes, camera_intrinsic )
"""
# Retrieve sensor & pose records
sd_record = self.get('sample_data', sample_data_token)
# 1. calibrated_sensor
cs_record = self.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
sensor_record = self.get('sensor', cs_record['sensor_token'])
pose_record = self.get('ego_pose', sd_record['ego_pose_token'])
data_path = self.get_sample_data_path(sample_data_token)
if sensor_record['modality'] == 'camera':
# 2. Get the camera intrinsics cam_intrinsic
cam_intrinsic = np.array(cs_record['camera_intrinsic'])
imsize = (sd_record['width'], sd_record['height'])
else:
cam_intrinsic = None
imsize = None
# Retrieve all sample annotations and map to sensor coordinate system.
if selected_anntokens is not None:
boxes = list(map(self.get_box, selected_anntokens))
else:
boxes = self.get_boxes(sample_data_token)
# Make list of Box objects including coord system transforms.
box_list = []
for box in boxes:
if use_flat_vehicle_coordinates:
# Move box to ego vehicle coord system parallel to world z plane.
yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
box.translate(-np.array(pose_record['translation']))
box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
else:
# Move box to ego vehicle coord system.
box.translate(-np.array(pose_record['translation']))
box.rotate(Quaternion(pose_record['rotation']).inverse)
# Move box to sensor coord system.
box.translate(-np.array(cs_record['translation']))
box.rotate(Quaternion(cs_record['rotation']).inverse)
if sensor_record['modality'] == 'camera' and not \
box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
continue
box_list.append(box)
return data_path, box_list, cam_intrinsic
v = np.dot(box.rotation_matrix, np.array([1, 0, 0]))
yaw = -np.arctan2(v[2], v[0])
alpha = _rot_y2alpha(yaw, (bbox[0] + bbox[2]) / 2, camera_intrinsic[0, 2], camera_intrinsic[0, 0])
ann['bbox'] = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]
ann['area'] = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
ann['alpha'] = alpha
Here box is an instance of the Box class; the 3D spatial transform is done with the quaternion's rotation matrix. Its orientation (a Quaternion) is the rotation read from sample_annotation.json.
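As a quick sanity check of the yaw computed above (a sketch assuming pyquaternion; the quaternion here is a hypothetical camera-frame orientation rotated 0.3 rad around the camera Y axis):
import numpy as np
from pyquaternion import Quaternion

q = Quaternion(axis=[0, 1, 0], angle=0.3)           # hypothetical orientation in the camera frame
v = np.dot(q.rotation_matrix, np.array([1, 0, 0]))  # heading direction of the box
yaw = -np.arctan2(v[2], v[0])                       # recovers 0.3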
def _rot_y2alpha(rot_y, x, cx, fx):
"""
Convert rotation_y to alpha: alpha = rotation_y - arctan2(x - cx, fx)
alpha : Observation angle of object, ranging [-pi..pi]
x : Object center x to the camera center (x-W/2), in pixels
rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi]
"""
alpha = rot_y - np.arctan2(x - cx, fx)
if alpha > np.pi:
alpha -= 2 * np.pi
if alpha < -np.pi:
alpha += 2 * np.pi
return alpha
# Project the 3D center point (x, y, z) through the camera calibration matrix calib to get its 2D image position (x', y')
amodel_center = project_to_image(
np.array([box.center[0], box.center[1] - box.wlh[2] / 2, box.center[2]],
np.float32).reshape(1, 3), calib)[0].tolist()
def project_to_image(pts_3d, P):
# pts_3d: n x 3
# P: 3 x 4
# return: n x 2
pts_3d_homo = np.concatenate(
[pts_3d, np.ones((pts_3d.shape[0], 1), dtype=np.float32)], axis=1)
pts_2d = np.dot(P, pts_3d_homo.transpose(1, 0)).transpose(1, 0)
pts_2d = pts_2d[:, :2] / pts_2d[:, 2:]
return pts_2d
ann = {
'id': num_anns,
'image_id': num_images,
'category_id': category_id,
'dim': [box.wlh[2], box.wlh[0], box.wlh[1]],
'location': [box.center[0], box.center[1], box.center[2]],
'depth': box.center[2],
'occluded': 0,
'truncated': 0,
'rotation_y': yaw,
'amodel_center': amodel_center,
'iscrowd': 0,
'track_id': track_id,
'attributes': ATTRIBUTE_TO_ID[att],
'velocity': vel
}
bbox = KittiDB.project_kitti_box_to_image(
copy.deepcopy(box), camera_intrinsic, imsize=(1600, 900))
def project_kitti_box_to_image(box: Box, p_left: np.ndarray, imsize: Tuple[int, int]) \
-> Union[None, Tuple[int, int, int, int]]:
"""
Projects 3D box into KITTI image FOV.
:param box: 3D box in KITTI reference frame.
:param p_left: . Projection matrix.
:param imsize: (width, height). Image size.
:return: (xmin, ymin, xmax, ymax). Bounding box in image plane or None if box is not in the image.
"""
# Create a new box.
# box = box.copy()
# KITTI defines the box center as the bottom center of the object.
# We use the true center, so we need to adjust half height in negative y direction.
box.translate(np.array([0, -box.wlh[2] / 2, 0]))
# Check that some corners are inside the image.
corners = np.array([corner for corner in box.corners().T if corner[2] > 0]).T
if len(corners) == 0:
return None
# Project corners that are in front of the camera to 2d to get bbox in pixel coords.
# Project the 3D corners in the camera frame to 2D image coordinates using the camera intrinsics camera_intrinsic,
# which determines the 2D bbox.
imcorners = view_points(corners, p_left, normalize=True)[:2]
bbox = (np.min(imcorners[0]), np.min(imcorners[1]), np.max(imcorners[0]), np.max(imcorners[1]))
# Crop bbox to prevent it extending outside image.
bbox_crop = tuple(max(0, b) for b in bbox)
bbox_crop = (min(imsize[0], bbox_crop[0]),
min(imsize[0], bbox_crop[1]),
min(imsize[0], bbox_crop[2]),
min(imsize[1], bbox_crop[3]))
# Detect if a cropped box is empty.
if bbox_crop[0] >= bbox_crop[2] or bbox_crop[1] >= bbox_crop[3]:
return None
return bbox_crop
def view_points(points: np.ndarray, view: np.ndarray, normalize: bool) -> np.ndarray:
"""
This is a helper class that maps 3d points to a 2d plane. It can be used to implement both perspective and orthographic projections. It first applies the dot product between the points and the view. By convention, the view should be such that the data is projected onto the first 2 axis. It then optionally applies a normalization along the third dimension.
For a perspective projection the view should be a 3x3 camera matrix, and normalize=True
For an orthographic projection with translation the view is a 3x4 matrix and normalize=False
For an orthographic projection without translation the view is a 3x3 matrix (optionally 3x4 with last columns
all zeros) and normalize=False
:param points: Matrix of points, where each point (x, y, z) is along each column.
:param view: . Defines an arbitrary projection (n <= 4).
The projection should be such that the corners are projected onto the first 2 axis.
:param normalize: Whether to normalize the remaining coordinate (along the third axis).
:return: . Mapped point. If normalize=False, the third coordinate is the height.
"""
assert view.shape[0] <= 4
assert view.shape[1] <= 4
assert points.shape[0] == 3
viewpad = np.eye(4)
viewpad[:view.shape[0], :view.shape[1]] = view
nbr_points = points.shape[1]
# Do operation in homogenous coordinates.
points = np.concatenate((points, np.ones((1, nbr_points))))
# The key operation: matrix multiplication
points = np.dot(viewpad, points)
points = points[:3, :]
if normalize:
points = points / points[2:3, :].repeat(3, 0).reshape(3, nbr_points)
return points
def pre_process(self, image, scale, input_meta={}):
resized_image, c, s, inp_width, inp_height, height, width = \
self._transform_scale(image)
# Build the affine transform matrix from 3 point correspondences
trans_input = get_affine_transform(c, s, 0, [inp_width, inp_height])
out_height = inp_height // self.opt.down_ratio
out_width = inp_width // self.opt.down_ratio
trans_output = get_affine_transform(c, s, 0, [out_width, out_height])
# Apply the affine warp to get a fixed 800x448 input size for the network
inp_image = cv2.warpAffine(
resized_image, trans_input, (inp_width, inp_height),
flags=cv2.INTER_LINEAR)
# Normalize
inp_image = ((inp_image / 255. - self.mean) / self.std).astype(np.float32)
# Channel transpose: HWC to CHW
images = inp_image.transpose(2, 0, 1).reshape(1, 3, inp_height, inp_width)
if self.opt.flip_test:
images = np.concatenate((images, images[:, :, :, ::-1]), axis=0)
images = torch.from_numpy(images)
meta = {'calib': np.array(input_meta['calib'], dtype=np.float32) \
if 'calib' in input_meta else \
self._get_default_calib(width, height)}
meta.update({'c': c, 's': s, 'height': height, 'width': width,
'out_height': out_height, 'out_width': out_width,
'inp_height': inp_height, 'inp_width': inp_width,
'trans_input': trans_input, 'trans_output': trans_output})
if 'pre_dets' in input_meta:
meta['pre_dets'] = input_meta['pre_dets']
if 'cur_dets' in input_meta:
meta['cur_dets'] = input_meta['cur_dets']
return images, meta
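For reference, a point in the original image can be mapped into the network input / output resolution with the transforms returned in meta (a sketch assuming the repo's affine_transform helper; the pixel value is made up):
pt_img = np.array([960.0, 450.0], dtype=np.float32)        # hypothetical pixel in the original image
pt_inp = affine_transform(pt_img, meta['trans_input'])     # coordinates in the 800x448 input
pt_out = affine_transform(pt_img, meta['trans_output'])    # coordinates in the down-sampled output map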
def process(self, images, pre_images=None, pre_hms=None,
pre_inds=None, return_time=False):
with torch.no_grad():
torch.cuda.synchronize()
# model is the DLA-34 model
output = self.model(images, pre_images, pre_hms)[-1]
# Apply sigmoid to hm and dep; see the function definition below
# output keys: ['hm', 'reg', 'wh', 'dep', 'rot', 'dim', 'amodel_offset', 'pre_inds']
output = self._sigmoid_output(output)
output.update({'pre_inds': pre_inds})
if self.opt.flip_test:
output = self._flip_output(output)
torch.cuda.synchronize()
forward_time = time.time()
# Decode the output above
# Decoded keys: ['scores', 'clses', 'xs', 'ys', 'cts', 'bbox', 'dep', 'rot', 'dim', 'amodel_offset', 'pre_inds']
dets = generic_decode(output, K=self.opt.K, opt=self.opt)
torch.cuda.synchronize()
for k in dets:
dets[k] = dets[k].detach().cpu().numpy()
if return_time:
return output, dets, forward_time
else:
return output, dets
def _sigmoid_output(self, output):
if 'hm' in output:
output['hm'] = output['hm'].sigmoid_()
if 'hm_hp' in output:
output['hm_hp'] = output['hm_hp'].sigmoid_()
if 'dep' in output:
output['dep'] = 1. / (output['dep'].sigmoid() + 1e-6) - 1.
output['dep'] *= self.opt.depth_scale
return output
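A quick numeric check of the depth decoding above: dep = 1 / sigmoid(x) - 1 maps any raw network output to a positive depth (values are approximate and taken before the depth_scale multiplication):
import torch
x = torch.tensor([-4.0, 0.0, 4.0])          # raw 'dep' outputs
dep = 1. / (x.sigmoid() + 1e-6) - 1.
# dep ≈ [54.6, 1.0, 0.018]  -- more negative logits correspond to larger depths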
def generic_decode(output, K=100, opt=None):
if not ('hm' in output):
return {}
if opt.zero_tracking:
output['tracking'] *= 0
heat = output['hm']
batch, cat, height, width = heat.size()
# 'hm' goes through NMS and top-K to produce ['scores', 'clses', 'xs', 'ys', 'cts']
heat = _nms(heat)
scores, inds, clses, ys0, xs0 = _topk(heat, K=K)
clses = clses.view(batch, K)
scores = scores.view(batch, K)
bboxes = None
cts = torch.cat([xs0.unsqueeze(2), ys0.unsqueeze(2)], dim=2)
ret = {'scores': scores, 'clses': clses.float(),
'xs': xs0, 'ys': ys0, 'cts': cts}
# xs, ys together with 'reg' give the refined center
if 'reg' in output:
reg = output['reg']
reg = _tranpose_and_gather_feat(reg, inds)
reg = reg.view(batch, K, 2)
xs = xs0.view(batch, K, 1) + reg[:, :, 0:1]
ys = ys0.view(batch, K, 1) + reg[:, :, 1:2]
else:
xs = xs0.view(batch, K, 1) + 0.5
ys = ys0.view(batch, K, 1) + 0.5
# the center together with 'wh' gives the 2D bbox
if 'wh' in output:
wh = output['wh']
wh = _tranpose_and_gather_feat(wh, inds) # B x K x (F)
# wh = wh.view(batch, K, -1)
wh = wh.view(batch, K, 2)
wh[wh < 0] = 0
if wh.size(2) == 2 * cat: # cat spec
wh = wh.view(batch, K, -1, 2)
cats = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2)
wh = wh.gather(2, cats.long()).squeeze(2) # B x K x 2
else:
pass
bboxes = torch.cat([xs - wh[..., 0:1] / 2,
ys - wh[..., 1:2] / 2,
xs + wh[..., 0:1] / 2,
ys + wh[..., 1:2] / 2], dim=2)
ret['bboxes'] = bboxes
# print('ret bbox', ret['bboxes'])
if 'ltrb' in output:
ltrb = output['ltrb']
ltrb = _tranpose_and_gather_feat(ltrb, inds) # B x K x 4
ltrb = ltrb.view(batch, K, 4)
bboxes = torch.cat([xs0.view(batch, K, 1) + ltrb[..., 0:1],
ys0.view(batch, K, 1) + ltrb[..., 1:2],
xs0.view(batch, K, 1) + ltrb[..., 2:3],
ys0.view(batch, K, 1) + ltrb[..., 3:4]], dim=2)
ret['bboxes'] = bboxes
regression_heads = ['tracking', 'dep', 'rot', 'dim', 'amodel_offset',
'nuscenes_att', 'velocity']
for head in regression_heads:
if head in output:
ret[head] = _tranpose_and_gather_feat(
output[head], inds).view(batch, K, -1)
return ret
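The _nms and _topk helpers used above are not shown in this post; in CenterNet-style code, _nms is typically a 3x3 max-pooling peak filter, roughly like the sketch below (my paraphrase, not necessarily the exact repo code):
import torch
import torch.nn.functional as F

def _nms_sketch(heat, kernel=3):
    # keep only local maxima of the heatmap within a kernel x kernel window
    pad = (kernel - 1) // 2
    hmax = F.max_pool2d(heat, (kernel, kernel), stride=1, padding=pad)
    keep = (hmax == heat).float()
    return heat * keep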
The per-head losses take the following (network output, target/mask) tensors:
output['hm'], batch['hm'], batch['ind'], batch['mask'], batch['cat']
output['reg'], batch['reg_mask'], batch['ind'], batch['reg']
output['wh'], batch['wh_mask'], batch['ind'], batch['wh']
output['dep'], batch['dep_mask'], batch['ind'], batch['dep']
output['dim'], batch['dim_mask'], batch['ind'], batch['dim']
output['amodel_offset'], batch['amodel_offset_mask'], batch['ind'], batch['amodel_offset']
For the rotation head, the bin classification is trained with a softmax and the angular residuals are trained with an L1 loss:
output['rot'], batch['rot_mask'], batch['ind'], batch['rotbin'], batch['rotres']
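A simplified sketch of how such a rotation loss can be computed (my illustration of the bin + residual scheme, not the repo's exact compute_rot_loss; masking by rot_mask/ind is omitted):
import torch
import torch.nn.functional as F

def rot_loss_sketch(pred, rotbin, rotres):
    # pred: (N, 8) -- [bin1_cls0, bin1_cls1, bin1_sin, bin1_cos, bin2_cls0, bin2_cls1, bin2_sin, bin2_cos]
    # rotbin: (N, 2) 0/1 bin flags, rotres: (N, 2) angular residuals
    loss = F.cross_entropy(pred[:, 0:2], rotbin[:, 0].long()) + \
           F.cross_entropy(pred[:, 4:6], rotbin[:, 1].long())
    m1, m2 = rotbin[:, 0] > 0, rotbin[:, 1] > 0
    if m1.any():
        loss = loss + F.l1_loss(pred[m1, 2], torch.sin(rotres[m1, 0])) \
                    + F.l1_loss(pred[m1, 3], torch.cos(rotres[m1, 0]))
    if m2.any():
        loss = loss + F.l1_loss(pred[m2, 6], torch.sin(rotres[m2, 1])) \
                    + F.l1_loss(pred[m2, 7], torch.cos(rotres[m2, 1]))
    return loss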
### init samples
self._init_ret(ret, gt_det)
calib = self._get_calib(img_info, width, height)
cls_id = int(self.cat_ids[ann['category_id']])
# bbox is clipped to the output resolution (200x112); bbox_amodal keeps the two corner points without this clipping
bbox, bbox_amodal = self._get_bbox_output(
ann['bbox'], trans_output, height, width)
def _add_instance(
self, ret, gt_det, k, cls_id, bbox, bbox_amodal, ann, trans_output,
aug_s, calib, pre_cts=None, track_ids=None):
h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
if h <= 0 or w <= 0:
return
radius = gaussian_radius((math.ceil(h), math.ceil(w)))
radius = max(0, int(radius))
ct = np.array(
[(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
ct_int = ct.astype(np.int32)
ret['cat'][k] = cls_id - 1
ret['mask'][k] = 1
if 'wh' in ret:
ret['wh'][k] = 1. * w, 1. * h
ret['wh_mask'][k] = 1
ret['ind'][k] = ct_int[1] * self.opt.output_w + ct_int[0]
ret['reg'][k] = ct - ct_int
ret['reg_mask'][k] = 1
draw_umich_gaussian(ret['hm'][cls_id - 1], ct_int, radius)
gt_det['bboxes'].append(
np.array([ct[0] - w / 2, ct[1] - h / 2,
ct[0] + w / 2, ct[1] + h / 2], dtype=np.float32))
gt_det['scores'].append(1)
gt_det['clses'].append(cls_id - 1)
gt_det['cts'].append(ct)
if 'rot' in self.opt.heads:
self._add_rot(ret, ann, k, gt_det)
if 'dep' in self.opt.heads:
if 'depth' in ann:
ret['dep_mask'][k] = 1
ret['dep'][k] = ann['depth'] * aug_s
gt_det['dep'].append(ret['dep'][k])
else:
gt_det['dep'].append(2)
if 'dim' in self.opt.heads:
if 'dim' in ann:
ret['dim_mask'][k] = 1
ret['dim'][k] = ann['dim']
gt_det['dim'].append(ret['dim'][k])
else:
gt_det['dim'].append([1,1,1])
if 'amodel_offset' in self.opt.heads:
if 'amodel_center' in ann:
amodel_center = affine_transform(ann['amodel_center'], trans_output)
ret['amodel_offset_mask'][k] = 1
ret['amodel_offset'][k] = amodel_center - ct_int
gt_det['amodel_offset'].append(ret['amodel_offset'][k])
else:
gt_det['amodel_offset'].append([0, 0])
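For reference, the 'ind' written above is simply the flattened position of the integer center in the output heatmap; a tiny illustration with a made-up output width:
output_w = 200                            # hypothetical output width (800 / down_ratio 4)
ct_int = np.array([37, 80])               # integer center (x, y) on the output map
ind = ct_int[1] * output_w + ct_int[0]    # 80 * 200 + 37 = 16037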
def _add_rot(self, ret, ann, k, gt_det):
if 'alpha' in ann:
ret['rot_mask'][k] = 1
alpha = ann['alpha']
if alpha < np.pi / 6. or alpha > 5 * np.pi / 6.:
ret['rotbin'][k, 0] = 1
ret['rotres'][k, 0] = alpha - (-0.5 * np.pi)
if alpha > -np.pi / 6. or alpha < -5 * np.pi / 6.:
ret['rotbin'][k, 1] = 1
ret['rotres'][k, 1] = alpha - (0.5 * np.pi)
gt_det['rot'].append(self._alpha_to_8(ann['alpha']))
else:
gt_det['rot'].append(self._alpha_to_8(0))
def _alpha_to_8(self, alpha):
ret = [0, 0, 0, 1, 0, 0, 0, 1]
if alpha < np.pi / 6. or alpha > 5 * np.pi / 6.:
r = alpha - (-0.5 * np.pi)
ret[1] = 1
ret[2], ret[3] = np.sin(r), np.cos(r)
if alpha > -np.pi / 6. or alpha < -5 * np.pi / 6.:
r = alpha - (0.5 * np.pi)
ret[5] = 1
ret[6], ret[7] = np.sin(r), np.cos(r)
return ret
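As a worked example of the encoding above: alpha = 0 lies in the overlap of the two bins, so both bin flags are set and both residuals are filled:
# bin 1 (centered at -pi/2): r = 0 - (-pi/2) =  pi/2 -> (sin r, cos r) = ( 1, 0)
# bin 2 (centered at +pi/2): r = 0 - ( pi/2) = -pi/2 -> (sin r, cos r) = (-1, 0)
# so _alpha_to_8(0.0) == [0, 1, 1.0, 0.0, 0, 1, -1.0, 0.0]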
def post_process(self, dets, meta, scale=1):
dets = generic_post_process(
self.opt, dets, [meta['c']], [meta['s']],
meta['out_height'], meta['out_width'], self.opt.num_classes,
[meta['calib']], meta['height'], meta['width'])
self.this_calib = meta['calib']
if scale != 1:
for i in range(len(dets[0])):
for k in ['bbox', 'hps']:
if k in dets[0][i]:
dets[0][i][k] = (np.array(
dets[0][i][k], np.float32) / scale).tolist()
return dets[0]
def generic_post_process(
opt, dets, c, s, h, w, num_classes, calibs=None, height=-1, width=-1):
if not ('scores' in dets):
return [{}], [{}]
ret = []
for i in range(len(dets['scores'])):
preds = []
# Get the affine transform matrix that maps the output resolution back to the original image
trans = get_affine_transform(
c[i], s[i], 0, (w, h), inv=1).astype(np.float32)
for j in range(len(dets['scores'][i])):
if dets['scores'][i][j] < opt.out_thresh:
break
item = {}
item['score'] = dets['scores'][i][j]
item['class'] = int(dets['clses'][i][j]) + 1
# Apply the affine transform to the center point
item['ct'] = transform_preds_with_trans(
(dets['cts'][i][j]).reshape(1, 2), trans).reshape(2)
if 'tracking' in dets:
tracking = transform_preds_with_trans(
(dets['tracking'][i][j] + dets['cts'][i][j]).reshape(1, 2),
trans).reshape(2)
item['tracking'] = tracking - item['ct']
# Apply the affine transform to the bbox
if 'bboxes' in dets:
bbox = transform_preds_with_trans(
dets['bboxes'][i][j].reshape(2, 2), trans).reshape(4)
item['bbox'] = bbox
if 'dep' in dets and len(dets['dep'][i]) > j:
item['dep'] = dets['dep'][i][j]
if 'dim' in dets and len(dets['dim'][i]) > j:
# item['dim'] = dets['dim'][i][j]
item['dim'] = np.maximum(dets['dim'][i][j], 0)
# Get alpha from rot: arctan2() +/- 0.5*np.pi
if 'rot' in dets and len(dets['rot'][i]) > j:
item['alpha'] = get_alpha(dets['rot'][i][j:j+1])[0]
# Take the bbox center, add amodel_offset, then apply the affine transform to get the final center
if 'rot' in dets and 'dep' in dets and 'dim' in dets \
and len(dets['dep'][i]) > j:
if 'amodel_offset' in dets and len(dets['amodel_offset'][i]) > j:
ct_output = dets['bboxes'][i][j].reshape(2, 2).mean(axis=0)
amodel_ct_output = ct_output + dets['amodel_offset'][i][j]
ct = transform_preds_with_trans(
amodel_ct_output.reshape(1, 2), trans).reshape(2).tolist()
else:
bbox = item['bbox']
ct = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
item['ct'] = ct
# From center, alpha, dim, dep and calib, compute loc and rot_y
item['loc'], item['rot_y'] = ddd2locrot(
ct, item['alpha'], item['dim'], item['dep'], calibs[i])
preds.append(item)
# Final output keys: ['score', 'class', 'ct', 'bbox', 'dep', 'dim', 'alpha', 'loc', 'rot_y']
ret.append(preds)
return ret
def get_alpha(rot):
# output: (B, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos,
# bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos]
# return rot[:, 0]
idx = rot[:, 1] > rot[:, 5]
alpha1 = np.arctan2(rot[:, 2], rot[:, 3]) + (-0.5 * np.pi)
alpha2 = np.arctan2(rot[:, 6], rot[:, 7]) + ( 0.5 * np.pi)
return alpha1 * idx + alpha2 * (1 - idx)
def ddd2locrot(center, alpha, dim, depth, calib):
# single image
# From center, depth and calib, get loc, i.e. the 3D center point
locations = unproject_2d_to_3d(center, depth, calib)
locations[1] += dim[0] / 2
# From alpha, center and calib, get rot_y
rotation_y = alpha2rot_y(alpha, center[0], calib[0, 2], calib[0, 0])
return locations, rotation_y
unproject_2d_to_3d: from center, depth and calib, recover loc, i.e. the 3D center point.
def unproject_2d_to_3d(pt_2d, depth, P):
# pts_2d: 2
# depth: 1
# P: 3 x 4
# return: 3
z = depth - P[2, 3]
x = (pt_2d[0] * depth - P[0, 3] - P[0, 2] * z) / P[0, 0]
y = (pt_2d[1] * depth - P[1, 3] - P[1, 2] * z) / P[1, 1]
pt_3d = np.array([x, y, z], dtype=np.float32).reshape(3)
return pt_3d
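A minimal round-trip check of the unprojection (using the project_to_image helper shown earlier and a made-up 3x4 calib with zero translation column):
import numpy as np
P = np.array([[1266.4, 0.0, 816.3, 0.0],
              [0.0, 1266.4, 491.5, 0.0],
              [0.0,    0.0,   1.0, 0.0]], dtype=np.float32)   # hypothetical calib
pt_3d = np.array([2.0, 1.0, 20.0], dtype=np.float32)
pt_2d = project_to_image(pt_3d.reshape(1, 3), P)[0]
loc = unproject_2d_to_3d(pt_2d, 20.0, P)                       # ≈ [2.0, 1.0, 20.0]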
alpha2rot_y: from alpha, center and calib, recover rot_y.
def alpha2rot_y(alpha, x, cx, fx):
"""
Get rotation_y from alpha: rot_y = alpha + arctan2(x - cx, fx)
alpha : Observation angle of object, ranging [-pi..pi]
x : Object center x to the camera center (x-W/2), in pixels
rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi]
"""
rot_y = alpha + np.arctan2(x - cx, fx)
if rot_y > np.pi:
rot_y -= 2 * np.pi
if rot_y < -np.pi:
rot_y += 2 * np.pi
return rot_y
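And a quick numeric check that alpha2rot_y inverts _rot_y2alpha (with made-up intrinsics and pixel position):
fx, cx = 1266.4, 816.3            # hypothetical focal length and principal point
x, rot_y = 1000.0, 0.8            # object center x in pixels, yaw in camera coordinates
alpha = _rot_y2alpha(rot_y, x, cx, fx)
assert np.isclose(alpha2rot_y(alpha, x, cx, fx), rot_y)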
def add_3d_detection(
self, image_or_path, flipped, dets, calib, show_txt=False,
vis_thresh=0.3, img_id='det'):
if isinstance(image_or_path, np.ndarray):
self.imgs[img_id] = image_or_path.copy()
else:
self.imgs[img_id] = cv2.imread(image_or_path)
# thickness = 1
if self.opt.show_track_color:
# self.imgs[img_id] = (self.imgs[img_id] * 0.5 + \
# np.ones_like(self.imgs[img_id]) * 255 * 0.5).astype(np.uint8)
# thickness = 3
pass
if flipped:
self.imgs[img_id] = self.imgs[img_id][:, ::-1].copy()
for item in dets:
if item['score'] > vis_thresh \
and 'dim' in item and 'loc' in item and 'rot_y' in item:
cl = (self.colors[int(item['class']) - 1, 0, 0]).tolist() \
if not self.opt.show_track_color else \
self.track_color[int(item['tracking_id'])]
if self.theme == 'white' and not self.opt.show_track_color:
cl = (255 - np.array(cl)).tolist()
if self.opt.tango_color:
cl = (255 - tango_color_dark[int(item['class']) - 1, 0, 0]).tolist()
dim = item['dim']
loc = item['loc']
rot_y = item['rot_y']
if loc[2] > 1:
box_3d = compute_box_3d(dim, loc, rot_y)
box_2d = project_to_image(box_3d, calib)
self.imgs[img_id] = draw_box_3d(
self.imgs[img_id], box_2d.astype(np.int32), cl,
same_color=self.opt.show_track_color or self.opt.qualitative)
if self.opt.show_track_color or self.opt.qualitative:
bbox = [box_2d[:,0].min(), box_2d[:,1].min(),
box_2d[:,0].max(), box_2d[:,1].max()]
sc = int(item['tracking_id']) if self.opt.show_track_color else \
item['score']
self.add_coco_bbox(
bbox, item['class'] - 1, sc, no_bbox=True, img_id=img_id)
if self.opt.show_track_color:
self.add_arrow([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
item['tracking'], img_id=img_id)
def compute_box_3d(dim, location, rotation_y):
# dim: 3
# location: 3
# rotation_y: 1
# return: 8 x 3
corners_3d = comput_corners_3d(dim, rotation_y)
# location is the coordinate of the center point
corners_3d = corners_3d + np.array(location, dtype=np.float32).reshape(1, 3)
return corners_3d
def comput_corners_3d(dim, rotation_y):
# dim: 3
# location: 3
# rotation_y: 1
# return: 8 x 3
c, s = np.cos(rotation_y), np.sin(rotation_y)
R = np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]], dtype=np.float32)
l, w, h = dim[2], dim[1], dim[0]
x_corners = [l/2, l/2, -l/2, -l/2, l/2, l/2, -l/2, -l/2]
y_corners = [0,0,0,0,-h,-h,-h,-h]
z_corners = [w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2]
corners = np.array([x_corners, y_corners, z_corners], dtype=np.float32)
# R is the 3D rotation matrix for a rotation around the Y axis
corners_3d = np.dot(R, corners).transpose(1, 0)
return corners_3d
For the 3D rotation background, see my Yuque note on quaternions and spatial rotation.
project_to_image here is the same helper already shown above.
The following blog post may also be useful as a reference:
https://blog.csdn.net/fsalicealex/article/details/91955759