本文档节选翻译自这里,原文为韩语,使用谷歌翻译译为中文,纯机翻没有经过人工审核,辩证阅读。以下内容是原作者对dataset.proto文件的解释。
每个数据都包含在帧消息中。
由于帧(Frame)消息内部的各个消息之间存在依赖关系,我们将按照从基础消息到 Frame 的顺序逐一解释。需要注意的是,以下代码是面向讲解的节选,文中插入的示例值未必与字段的实际数据类型相匹配。
// A single annotated object: its 3D/2D bounding box, motion metadata,
// object category, difficulty levels and lidar point count.
message Label {
// Upright box, zero pitch and roll.
message Box {
// Box coordinates in vehicle frame.
optional double center_x = 1;
optional double center_y = 2;
optional double center_z = 3;
// Dimensions of the box. length: dim x. width: dim y. height: dim z.
// NOTE(review): the field numbers are intentionally 5/4/6 in declaration
// order; do not renumber them — field numbers identify fields on the wire.
optional double length = 5;
optional double width = 4;
optional double height = 6;
// The heading of the bounding box (in radians). The heading is the angle
// required to rotate +x to the surface normal of the box front face. It is
// normalized to [-pi, pi).
optional double heading = 7;
// Degrees of freedom encoded by this box.
enum Type {
TYPE_UNKNOWN = 0;
// 7-DOF 3D (a.k.a upright 3D box).
TYPE_3D = 1;
// 5-DOF 2D. Mostly used for laser top down representation.
TYPE_2D = 2;
// Axis aligned 2D. Mostly used for image.
TYPE_AA_2D = 3;
}
}
optional Box box = 1;
// Motion state of the object. Units are not stated here; presumably m/s
// and m/s^2 in the vehicle frame — TODO confirm against the dataset docs.
message Metadata {
optional double speed_x = 1;
optional double speed_y = 2;
optional double accel_x = 3;
optional double accel_y = 4;
}
optional Metadata metadata = 2;
// Category of the annotated object.
enum Type {
TYPE_UNKNOWN = 0;
TYPE_VEHICLE = 1;
TYPE_PEDESTRIAN = 2;
TYPE_SIGN = 3;
TYPE_CYCLIST = 4;
}
optional Type type = 3;
// Object ID.
optional string id = 4;
// The difficulty level of this label. The higher the level, the harder it is.
enum DifficultyLevel {
UNKNOWN = 0;
LEVEL_1 = 1;
LEVEL_2 = 2;
}
// Difficulty level for detection problem.
optional DifficultyLevel detection_difficulty_level = 5;
// Difficulty level for tracking problem.
optional DifficultyLevel tracking_difficulty_level = 6;
// The total number of lidar points in this box.
optional int32 num_lidar_points_in_box = 7;
}
标签是一种表示带注释数据的消息类型。这包括框的位置、框的类型和对象的类型。还有激光雷达点数、识别难度等。
// Shape (dimensions) of the row-major matrices defined below
// (MatrixFloat, MatrixInt32).
message MatrixShape {
// Dimensions for the Matrix messages defined below. Must not be empty.
//
// The order of entries in 'dims' matters, as it indicates the layout of the
// values in the tensor in-memory representation.
//
// The first entry in 'dims' is the outermost dimension used to lay out the
// values; the last entry is the innermost dimension. This matches the
// in-memory layout of row-major matrices.
repeated int32 dims = 1;
}
MatrixShape 声明维度的形状。这是因为随着维度的增加,所需的变量数量也会增加。
// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
message MatrixFloat {
// Flat row-major values; packed encoding for compactness.
repeated float data = 1 [packed = true];
// Dimensions of 'data'; see MatrixShape for layout semantics.
optional MatrixShape shape = 2;
}
MatrixFloat 以浮点数(float)形式存储按行优先(row-major)排列的矩阵数据,其维度形状由 MatrixShape 描述。
// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
// NOTE(review): this block is a byte-for-byte duplicate of the MatrixFloat
// message defined above. Two messages with the same name in one scope are
// invalid protobuf, so this is almost certainly a copy-paste error in this
// excerpt (the surrounding prose describes MatrixInt32 next). Confirm
// against the upstream dataset.proto which message was intended here.
message MatrixFloat {
repeated float data = 1 [packed = true];
optional MatrixShape shape = 2;
}
MatrixInt32 以整数(int32)形式存储按行优先排列的矩阵数据,其维度形状由 MatrixShape 描述。
// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
message MatrixInt32 {
// Flat row-major values; packed encoding for compactness.
repeated int32 data = 1 [packed = true];
// Dimensions of 'data'; see MatrixShape for layout semantics.
optional MatrixShape shape = 2;
}
CameraName 是指安装在车辆上的摄像头。
// Identifiers for the cameras mounted on the vehicle. The enum is nested
// inside a message so the value names are scoped (LaserName below reuses
// FRONT/SIDE_LEFT/... without colliding).
message CameraName {
enum Name {
UNKNOWN = 0;
FRONT = 1;
FRONT_LEFT = 2;
FRONT_RIGHT = 3;
SIDE_LEFT = 4;
SIDE_RIGHT = 5;
}
}
LaserName 是指安装在车辆上的激光雷达。
// 'Laser' is used interchangeably with 'Lidar' in this file.
// Identifiers for the lidars mounted on the vehicle; the enum is nested in
// a message to scope the value names (see CameraName above).
message LaserName {
enum Name {
UNKNOWN = 0;
TOP = 1;
FRONT = 2;
SIDE_LEFT = 3;
SIDE_RIGHT = 4;
REAR = 5;
}
}
Transform(变换)是一个变换矩阵,用于将 3D 点从一个坐标系变换到另一个坐标系。(注:Transform 消息的定义未包含在本节选中,紧接其后给出的是 Velocity 消息。)
// Linear and angular velocity (used below for the self-driving car's
// velocity in CameraImage).
message Velocity {
// Velocity in m/s.
optional float v_x = 1;
optional float v_y = 2;
optional float v_z = 3;
// Angular velocity in rad/s.
// NOTE(review): the angular components are double while the linear ones
// are float; this asymmetry is part of the published schema — keep as-is.
optional double w_x = 4;
optional double w_y = 5;
optional double w_z = 6;
}
速度是指物体的速度。
// Intrinsic and extrinsic calibration plus image geometry for one camera.
message CameraCalibration {
// Which camera this calibration belongs to.
optional CameraName.Name name = 1;
// 1d Array of [f_u, f_v, c_u, c_v, k{1, 2}, p{1, 2}, k{3}].
// Note that this intrinsic corresponds to the images after scaling.
// Camera model: pinhole camera.
// Lens distortion:
// Radial distortion coefficients: k1, k2, k3.
// Tangential distortion coefficients: p1, p2.
// k_{1, 2, 3}, p_{1, 2} follows the same definition as OpenCV.
// https://en.wikipedia.org/wiki/Distortion_(optics)
// https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
repeated double intrinsic = 2;
// Camera frame to vehicle frame.
optional Transform extrinsic = 3;
// Camera image size.
optional int32 width = 4;
optional int32 height = 5;
// Direction in which the rolling shutter reads the sensor out.
// GLOBAL_SHUTTER presumably means the camera has no rolling shutter —
// confirm against the dataset documentation.
enum RollingShutterReadOutDirection {
UNKNOWN = 0;
TOP_TO_BOTTOM = 1;
LEFT_TO_RIGHT = 2;
BOTTOM_TO_TOP = 3;
RIGHT_TO_LEFT = 4;
GLOBAL_SHUTTER = 5;
}
optional RollingShutterReadOutDirection rolling_shutter_direction = 6;
}
CameraCalibration 在校准相机时显示相机的图像尺寸和内部设置。
// Calibration data for one lidar: beam geometry and mounting extrinsic.
message LaserCalibration {
// Which lidar this calibration belongs to.
optional LaserName.Name name = 1;
// If non-empty, the beam pitch (in radians) is non-uniform. When constructing
// a range image, this mapping is used to map from beam pitch to range image
// row. If this is empty, we assume a uniform distribution.
repeated double beam_inclinations = 2;
// beam_inclination_{min,max} (in radians) are used to determine the mapping.
optional double beam_inclination_min = 3;
optional double beam_inclination_max = 4;
// Lidar frame to vehicle frame.
optional Transform extrinsic = 5;
}
LaserCalibration 显示 LiDAR 校准值。
// Per-segment metadata shared by every frame of one driving run segment:
// sensor calibrations plus summary statistics (object counts, weather,
// time of day, location).
message Context {
// A unique name that identifies the frame sequence.
optional string name = 1;
repeated CameraCalibration camera_calibrations = 2;
repeated LaserCalibration laser_calibrations = 3;
// Some stats for the run segment used.
message Stats {
message ObjectCount {
optional Label.Type type = 1;
// The number of unique objects with the type in the segment.
optional int32 count = 2;
}
repeated ObjectCount laser_object_counts = 1;
// Field number 5 is out of declaration order; presumably this field was
// added after the others — do not renumber.
repeated ObjectCount camera_object_counts = 5;
// Day, Dawn/Dusk, or Night, determined from sun elevation.
optional string time_of_day = 2;
// Human readable location (e.g. CHD, SF) of the run segment.
optional string location = 3;
// Currently either Sunny or Rain.
optional string weather = 4;
}
optional Stats stats = 4;
}
上下文表示帧的名称、当时相机和激光雷达的标定值、注释对象的数量、当时的天气和位置等。
// Range image produced by one lidar return, stored as zlib-compressed
// serialized matrices, plus per-pixel camera projections, poses and
// (optionally) scene flow.
message RangeImage {
// Zlib compressed [H, W, 4] serialized version of MatrixFloat.
// To decompress:
// string val = ZlibDecompress(range_image_compressed);
// MatrixFloat range_image;
// range_image.ParseFromString(val);
// Inner dimensions are:
// * channel 0: range
// * channel 1: intensity
// * channel 2: elongation
// * channel 3: is in any no label zone.
optional bytes range_image_compressed = 2;
// Lidar point to camera image projections. A point can be projected to
// multiple camera images. We pick the first two at the following order:
// [FRONT, FRONT_LEFT, FRONT_RIGHT, SIDE_LEFT, SIDE_RIGHT].
//
// Zlib compressed [H, W, 6] serialized version of MatrixInt32.
// To decompress:
// string val = ZlibDecompress(camera_projection_compressed);
// MatrixInt32 camera_projection;
// camera_projection.ParseFromString(val);
// Inner dimensions are:
// * channel 0: CameraName.Name of 1st projection. Set to UNKNOWN if no
// projection.
// * channel 1: x (axis along image width)
// * channel 2: y (axis along image height)
// * channel 3: CameraName.Name of 2nd projection. Set to UNKNOWN if no
// projection.
// * channel 4: x (axis along image width)
// * channel 5: y (axis along image height)
// Note: pixel 0 corresponds to the left edge of the first pixel in the image.
optional bytes camera_projection_compressed = 3;
// Zlib compressed [H, W, 6] serialized version of MatrixFloat.
// To decompress:
// string val = ZlibDecompress(range_image_pose_compressed);
// MatrixFloat range_image_pose;
// range_image_pose.ParseFromString(val);
// Inner dimensions are [roll, pitch, yaw, x, y, z] represents a transform
// from vehicle frame to global frame for every range image pixel.
// This is ONLY populated for the first return. The second return is assumed
// to have exactly the same range_image_pose_compressed.
//
// The roll, pitch and yaw are specified as 3-2-1 Euler angle rotations,
// meaning that rotating from the navigation to vehicle frame consists of a
// yaw, then pitch and finally roll rotation about the z, y and x axes
// respectively. All rotations use the right hand rule and are positive
// in the counter clockwise direction.
optional bytes range_image_pose_compressed = 4;
// Zlib compressed [H, W, 4] serialized version of MatrixFloat.
// (The shape was previously documented as [H, W, 5], which contradicts the
// four inner channels listed below.)
// To decompress:
// string val = ZlibDecompress(range_image_flow_compressed);
// MatrixFloat range_image_flow;
// range_image_flow.ParseFromString(val);
// Inner dimensions are [vx, vy, vz, pointwise class].
//
// If the point is not annotated with scene flow information, class is set
// to -1. A point is not annotated if it is in a no-label zone or if its label
// bounding box does not have a corresponding match in the previous frame,
// making it infeasible to estimate the motion of the point.
// Otherwise, (vx, vy, vz) are velocity along (x, y, z)-axis for this point
// and class is set to one of the following values:
// -1: no-flow-label, the point has no flow information.
// 0: unlabeled or "background", i.e., the point is not contained in a
// bounding box.
// 1: vehicle, i.e., the point corresponds to a vehicle label box.
// 2: pedestrian, i.e., the point corresponds to a pedestrian label box.
// 3: sign, i.e., the point corresponds to a sign label box.
// 4: cyclist, i.e., the point corresponds to a cyclist label box.
optional bytes range_image_flow_compressed = 5;
// Deprecated, do not use.
optional MatrixFloat range_image = 1 [deprecated = true];
}
RangeImage 表示激光雷达生成的距离图像(range image),其中每个通道对应一种特征(距离、强度、伸长率等);通过选取相应的通道,即可获得想要的数据。
// All timestamps in this proto are represented as seconds since Unix epoch.
// One camera frame: the JPEG image plus the vehicle pose, velocity and
// rolling-shutter timing at capture time.
message CameraImage {
// Which camera captured this image.
optional CameraName.Name name = 1;
// JPEG image.
optional bytes image = 2;
// SDC (self-driving car) pose.
optional Transform pose = 3;
// SDC velocity at 'pose_timestamp' below. The velocity value is represented
// at *global* frame.
// With this velocity, the pose can be extrapolated.
// r(t+dt) = r(t) + dr/dt * dt where dr/dt = v_{x,y,z}.
// dR(t)/dt = W*R(t) where W = SkewSymmetric(w_{x,y,z})
// This differential equation solves to: R(t) = exp(Wt)*R(0) if W is constant.
// When dt is small: R(t+dt) = (I+W*dt)R(t)
// r(t) = (x(t), y(t), z(t)) is vehicle location at t in the global frame.
// R(t) = Rotation Matrix (3x3) from the body frame to the global frame at t.
// SkewSymmetric(x,y,z) is defined as the cross-product matrix in the
// following:
// https://en.wikipedia.org/wiki/Cross_product#Conversion_to_matrix_multiplication
optional Velocity velocity = 4;
// Timestamp of the `pose` above.
optional double pose_timestamp = 5;
// Rolling shutter params.
// The following explanation assumes left->right rolling shutter.
//
// Rolling shutter cameras expose and read the image column by column, offset
// by the read out time for each column. The desired timestamp for each column
// is the middle of the exposure of that column as outlined below for an image
// with 3 columns:
// ------time------>
// |---- exposure col 1----| read |
// -------|---- exposure col 2----| read |
// --------------|---- exposure col 3----| read |
// ^trigger time ^readout end time
// ^time for row 1 (= middle of exposure of row 1)
// ^time image center (= middle of exposure of middle row)
// Shutter duration in seconds. Exposure time per column.
optional double shutter = 6;
// Time when the sensor was triggered and when last readout finished.
// The difference between trigger time and readout done time includes
// the exposure time and the actual sensor readout time.
optional double camera_trigger_time = 7;
optional double camera_readout_done_time = 8;
}
CameraImage表示照片拍摄的时间和环境、拍摄时的相机操作值等。
// The camera labels associated with a given camera image. This message
// indicates the ground truth information for the camera image
// recorded by the given camera. If there are no labeled objects in the image,
// then the labels field is empty.
message CameraLabels {
// Which camera these labels belong to.
optional CameraName.Name name = 1;
// Ground-truth labels for that camera's image; may be empty.
repeated Label labels = 2;
}
CameraLabels 指示哪些注释存在于哪些相机上。
// Data captured by one lidar: the range images of its first and second
// laser returns.
message Laser {
// Which lidar produced these range images.
optional LaserName.Name name = 1;
// Range image of the first return.
optional RangeImage ri_return1 = 2;
// Range image of the second return.
optional RangeImage ri_return2 = 3;
}
Laser 表示某个激光雷达及其对应的 RangeImage(第一回波 ri_return1 与第二回波 ri_return2)。
// One data sample: all camera images, lidar returns and labels captured at
// (approximately) the same time, together with the shared segment Context.
message Frame {
// The following field numbers are reserved for third-party extensions. Users
// may declare new fields in that range in their own .proto files without
// having to edit the original file.
extensions 1000 to max;
// This context is the same for all frames belong to the same driving run
// segment. Use context.name to identify frames belong to the same driving
// segment. We do not store all frames from one driving segment in one proto
// to avoid huge protos.
optional Context context = 1;
// Frame start time, which is the timestamp of the first top lidar spin
// within this frame. Per the field name, in microseconds since Unix epoch —
// confirm against the dataset documentation.
optional int64 timestamp_micros = 2;
// The vehicle pose.
optional Transform pose = 3;
repeated CameraImage images = 4;
repeated Laser lasers = 5;
// 3D labels annotated in the lidar data.
repeated Label laser_labels = 6;
// Lidar labels (laser_labels) projected to camera images. A projected
// label is the smallest image axis aligned rectangle that can cover all
// projected points from the 3d lidar label. The projected label is ignored if
// the projection is fully outside a camera image. The projected label is
// clamped to the camera image if it is partially outside.
repeated CameraLabels projected_lidar_labels = 9;
// NOTE: if a camera identified by CameraLabels.name has an entry in this
// field, then it has been labeled, even though it is possible that there are
// no labeled objects in the corresponding image, which is identified by a
// zero sized CameraLabels.labels.
repeated CameraLabels camera_labels = 8;
// No label zones in the *global* frame.
repeated Polygon2dProto no_label_zones = 7;
}
Frame 组合了上面的所有值并将它们显示为一帧。每个值都显示为键值。