hough_voting_gpu_op.cc
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Hough voting Op
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <vector>
#include <iostream>
#include <algorithm>
#include <functional>
// Eigen geometry module for quaternions; the Eigen headers shipped with TensorFlow
// (or a system Eigen) must be on the include path.
#include <Eigen/Geometry>
#include "opencv2/opencv.hpp"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#define VERTEX_CHANNELS 3
using namespace tensorflow;
typedef Eigen::ThreadPoolDevice CPUDevice;
REGISTER_OP("Houghvotinggpu")
.Attr("T: {float, double}")
.Attr("is_train: int")
.Attr("threshold_vote: int")
.Attr("skip_pixels: int")
.Input("bottom_label: int32")
.Input("bottom_vertex: T")
.Input("bottom_extents: T")
.Input("bottom_meta_data: T")
.Input("bottom_gt: T")
.Output("top_box: T")
.Output("top_pose: T")
.Output("top_target: T")
.Output("top_weight: T")
.Output("top_domain: int32");
REGISTER_OP("HoughvotinggpuGrad")
.Attr("T: {float, double}")
.Input("bottom_label: int32")
.Input("bottom_vertex: T")
.Input("grad: T")
.Output("output_label: T")
.Output("output_vertex: T");
int clamp(int val, int min_val, int max_val)
{
return std::max(min_val, std::min(max_val, val));
}
void getBb3Ds(const float* extents, std::vector<std::vector<cv::Point3f>>& bb3Ds, int num_classes);
inline std::vector<cv::Point3f> getBB3D(const cv::Vec<float, 3>& extent);
inline cv::Rect getBB2D(int imageWidth, int imageHeight, const std::vector<cv::Point3f>& bb3D, const cv::Mat& camMat, const cv::Mat& rvec, const cv::Mat& tvec);
inline float getIoU(const cv::Rect& bb1, const cv::Rect bb2);
inline float angle_distance(cv::Point2f x, cv::Point2f n, cv::Point2f p);
void hough_voting(const int* labelmap, const float* vertmap, std::vector<std::vector<cv::Point3f>> bb3Ds,
int batch, int height, int width, int num_classes, int is_train,
float fx, float fy, float px, float py, std::vector<cv::Vec<float, 14> >& outputs);
void compute_target_weight(int height, int width, float* target, float* weight, std::vector<std::vector<cv::Point3f>> bb3Ds,
const float* poses_gt, int num_gt, int num_classes, float fx, float fy, float px, float py, std::vector<cv::Vec<float, 14> > outputs);
inline void compute_width_height(const int* labelmap, const float* vertmap, cv::Point2f center,
std::vector<std::vector<cv::Point3f>> bb3Ds, cv::Mat camMat, float inlierThreshold,
int height, int width, int channel, int num_classes, int & bb_width, int & bb_height, float & bb_distance);
// cuda functions
void HoughVotingLaucher(OpKernelContext* context,
const int* labelmap, const float* vertmap, const float* extents, const float* meta_data, const float* gt,
const int batch_index, const int height, const int width, const int num_classes, const int num_gt,
const int is_train, const float inlierThreshold, const int labelThreshold, const int votingThreshold, const int skip_pixels,
float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, const Eigen::GpuDevice& d);
void allocate_outputs(OpKernelContext* context, Tensor* top_box_tensor, Tensor* top_pose_tensor, Tensor* top_target_tensor, Tensor* top_weight_tensor, Tensor* top_domain_tensor, Tensor* top_rois_tensor, int num_classes)
{
int num = 1024;
int dims[2];
dims[0] = num;
dims[1] = 7;
TensorShape output_shape;
TensorShapeUtils::MakeShape(dims, 2, &output_shape);
OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape, top_box_tensor));
dims[1] = 7;
TensorShape output_shape_1;
TensorShapeUtils::MakeShape(dims, 2, &output_shape_1);
OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_1, top_pose_tensor));
dims[1] = 4 * num_classes;
TensorShape output_shape_2;
TensorShapeUtils::MakeShape(dims, 2, &output_shape_2);
OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_2, top_target_tensor));
OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_2, top_weight_tensor));
TensorShape output_shape_3;
TensorShapeUtils::MakeShape(&num, 1, &output_shape_3);
OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_3, top_domain_tensor));
int len = 1;
TensorShape output_shape_4;
TensorShapeUtils::MakeShape(&len, 1, &output_shape_4);
OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_4, top_rois_tensor));
}
void reset_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, int num_classes);
void copy_num_rois(int* num_rois, int* num_rois_device);
void copy_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain,
float* top_box_final, float* top_pose_final, float* top_target_final, float* top_weight_final, int* top_domain_final, int num_classes, int num_rois);
void set_gradients(float* top_label, float* top_vertex, int batch_size, int height, int width, int num_classes);
template <typename Device, typename T>
class HoughvotinggpuOp : public OpKernel {
public:
explicit HoughvotinggpuOp(OpKernelConstruction* context) : OpKernel(context) {
// Get the is_train attribute
OP_REQUIRES_OK(context,
context->GetAttr("is_train", &is_train_));
// Check that is_train is non-negative
OP_REQUIRES(context, is_train_ >= 0,
errors::InvalidArgument("Need is_train >= 0, got ",
is_train_));
OP_REQUIRES_OK(context,
context->GetAttr("threshold_vote", &threshold_vote_));
OP_REQUIRES_OK(context,
context->GetAttr("skip_pixels", &skip_pixels_));
}
// bottom_label: (batch_size, height, width)
// bottom_vertex: (batch_size, height, width, 3 * num_classes)
// top_box: (num, 7) i.e., batch_index, cls, x1, y1, x2, y2, score
void Compute(OpKernelContext* context) override
{
// Grab the input tensor
const Tensor& bottom_label = context->input(0);
const Tensor& bottom_vertex = context->input(1);
const Tensor& bottom_extents = context->input(2);
// format of the meta_data
// intrinsic matrix: meta_data[0 ~ 8]
// inverse intrinsic matrix: meta_data[9 ~ 17]
// pose_world2live: meta_data[18 ~ 29]
// pose_live2world: meta_data[30 ~ 41]
// voxel step size: meta_data[42, 43, 44]
// voxel min value: meta_data[45, 46, 47]
const Tensor& bottom_meta_data = context->input(3);
auto meta_data = bottom_meta_data.flat<float>();
const Tensor& bottom_gt = context->input(4);
const float* gt = bottom_gt.flat<float>().data();
// check the input dimensions
OP_REQUIRES(context, bottom_label.dims() == 3,
errors::InvalidArgument("label must be 3-dimensional"));
OP_REQUIRES(context, bottom_vertex.dims() == 4,
errors::InvalidArgument("vertex must be 4-dimensional"));
// batch size
int batch_size = bottom_label.dim_size(0);
// height
int height = bottom_label.dim_size(1);
// width
int width = bottom_label.dim_size(2);
// num of classes
int num_classes = bottom_vertex.dim_size(3) / VERTEX_CHANNELS;
int num_meta_data = bottom_meta_data.dim_size(3);
int num_gt = bottom_gt.dim_size(0);
// for each image, run hough voting
std::vector<cv::Vec<float, 14> > outputs;
const float* extents = bottom_extents.flat<float>().data();
// bb3Ds
std::vector<std::vector<cv::Point3f>> bb3Ds;
getBb3Ds(extents, bb3Ds, num_classes);
int index_meta_data = 0;
float fx, fy, px, py;
for (int n = 0; n < batch_size; n++)
{
const int* labelmap = bottom_label.flat<int>().data() + n * height * width;
const float* vertmap = bottom_vertex.flat<float>().data() + n * height * width * VERTEX_CHANNELS * num_classes;
fx = meta_data(index_meta_data + 0);
fy = meta_data(index_meta_data + 4);
px = meta_data(index_meta_data + 2);
py = meta_data(index_meta_data + 5);
hough_voting(labelmap, vertmap, bb3Ds, n, height, width, num_classes, is_train_, fx, fy, px, py, outputs);
index_meta_data += num_meta_data;
}
if (outputs.size() == 0)
{
std::cout << "no detection" << std::endl;
// add a dummy detection to the output
cv::Vec<float, 14> roi;
roi(0) = 0;
roi(1) = -1;
outputs.push_back(roi);
}
// Create output tensors
// top_box
int dims[2];
dims[0] = outputs.size();
dims[1] = 7;
TensorShape output_shape;
TensorShapeUtils::MakeShape(dims, 2, &output_shape);
Tensor* top_box_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &top_box_tensor));
float* top_box = top_box_tensor->template flat<float>().data();
// top_pose
dims[1] = 7;
TensorShape output_shape_pose;
TensorShapeUtils::MakeShape(dims, 2, &output_shape_pose);
Tensor* top_pose_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(1, output_shape_pose, &top_pose_tensor));
float* top_pose = top_pose_tensor->template flat<float>().data();
// top target
dims[1] = 4 * num_classes;
TensorShape output_shape_target;
TensorShapeUtils::MakeShape(dims, 2, &output_shape_target);
Tensor* top_target_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(2, output_shape_target, &top_target_tensor));
float* top_target = top_target_tensor->template flat<float>().data();
memset(top_target, 0, outputs.size() * 4 * num_classes *sizeof(T));
// top weight
Tensor* top_weight_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(3, output_shape_target, &top_weight_tensor));
float* top_weight = top_weight_tensor->template flat<float>().data();
memset(top_weight, 0, outputs.size() * 4 * num_classes *sizeof(T));
// top domain
int num = outputs.size();
TensorShape output_shape_domain;
TensorShapeUtils::MakeShape(&num, 1, &output_shape_domain);
Tensor* top_domain_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(4, output_shape_domain, &top_domain_tensor));
int* top_domain = top_domain_tensor->template flat<int>().data();
memset(top_domain, 0, outputs.size() * sizeof(int));
for(int n = 0; n < outputs.size(); n++)
{
cv::Vec<float, 14> roi = outputs[n];
for (int i = 0; i < 7; i++)
top_box[n * 7 + i] = roi(i);
for (int i = 0; i < 7; i++)
top_pose[n * 7 + i] = roi(7 + i);
if (num_gt == 0)
top_domain[n] = 1;
else
top_domain[n] = 0;
}
if (is_train_)
compute_target_weight(height, width, top_target, top_weight, bb3Ds, gt, num_gt, num_classes, fx, fy, px, py, outputs);
}
private:
int is_train_;
int threshold_vote_;
int skip_pixels_;
};
REGISTER_KERNEL_BUILDER(Name("Houghvotinggpu").Device(DEVICE_CPU).TypeConstraint<float>("T"), HoughvotinggpuOp<CPUDevice, float>);
template <typename T>
class HoughvotinggpuOp<Eigen::GpuDevice, T> : public OpKernel {
public:
typedef Eigen::GpuDevice Device;
explicit HoughvotinggpuOp(OpKernelConstruction* context) : OpKernel(context)
{
// Get the is_train attribute
OP_REQUIRES_OK(context,
context->GetAttr("is_train", &is_train_));
// Check that is_train is non-negative
OP_REQUIRES(context, is_train_ >= 0,
errors::InvalidArgument("Need is_train >= 0, got ",
is_train_));
OP_REQUIRES_OK(context,
context->GetAttr("threshold_vote", &threshold_vote_));
OP_REQUIRES_OK(context,
context->GetAttr("skip_pixels", &skip_pixels_));
}
void Compute(OpKernelContext* context) override
{
// Grab the input tensor
const Tensor& bottom_label = context->input(0);
const Tensor& bottom_vertex = context->input(1);
// check the input dimensions
OP_REQUIRES(context, bottom_label.dims() == 3,
errors::InvalidArgument("label must be 3-dimensional"));
OP_REQUIRES(context, bottom_vertex.dims() == 4,
errors::InvalidArgument("vertex must be 4-dimensional"));
const Tensor& bottom_extents = context->input(2);
const float* extents = bottom_extents.flat<float>().data();
// format of the meta_data
// intrinsic matrix: meta_data[0 ~ 8]
// inverse intrinsic matrix: meta_data[9 ~ 17]
// pose_world2live: meta_data[18 ~ 29]
// pose_live2world: meta_data[30 ~ 41]
// voxel step size: meta_data[42, 43, 44]
// voxel min value: meta_data[45, 46, 47]
const Tensor& bottom_meta_data = context->input(3);
const Tensor& bottom_gt = context->input(4);
const float* gt = bottom_gt.flat<float>().data();
int batch_size = bottom_label.dim_size(0);
int height = bottom_label.dim_size(1);
int width = bottom_label.dim_size(2);
int num_classes = bottom_vertex.dim_size(3) / VERTEX_CHANNELS;
int num_meta_data = bottom_meta_data.dim_size(3);
int num_gt = bottom_gt.dim_size(0);
float inlierThreshold = 0.9;
int labelThreshold = 500;
Tensor top_box_tensor_tmp, top_pose_tensor_tmp, top_target_tensor_tmp, top_weight_tensor_tmp, top_domain_tensor_tmp, num_rois_tensor_tmp;
allocate_outputs(context, &top_box_tensor_tmp, &top_pose_tensor_tmp, &top_target_tensor_tmp, &top_weight_tensor_tmp,
&top_domain_tensor_tmp, &num_rois_tensor_tmp, num_classes);
float* top_box = top_box_tensor_tmp.flat<float>().data();
float* top_pose = top_pose_tensor_tmp.flat<float>().data();
float* top_target = top_target_tensor_tmp.flat<float>().data();
float* top_weight = top_weight_tensor_tmp.flat<float>().data();
int* top_domain = top_domain_tensor_tmp.flat<int>().data();
int* num_rois_device = num_rois_tensor_tmp.flat<int>().data();
reset_outputs(top_box, top_pose, top_target, top_weight, top_domain, num_rois_device, num_classes);
for (int n = 0; n < batch_size; n++)
{
const int* labelmap = bottom_label.flat<int>().data() + n * height * width;
const float* vertmap = bottom_vertex.flat<float>().data() + n * height * width * VERTEX_CHANNELS * num_classes;
const float* meta_data = bottom_meta_data.flat<float>().data() + n * num_meta_data;
HoughVotingLaucher(context, labelmap, vertmap, extents, meta_data, gt, n, height, width, num_classes, num_gt,
is_train_, inlierThreshold, labelThreshold, threshold_vote_, skip_pixels_,
top_box, top_pose, top_target, top_weight, top_domain, num_rois_device, context->eigen_device<Eigen::GpuDevice>());
}
int num_rois;
copy_num_rois(&num_rois, num_rois_device);
// dummy output
if (num_rois == 0)
num_rois = 1;
// Create output tensors
// top_box
int dims[2];
dims[0] = num_rois;
dims[1] = 7;
TensorShape output_shape;
TensorShapeUtils::MakeShape(dims, 2, &output_shape);
Tensor* top_box_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &top_box_tensor));
float* top_box_final = top_box_tensor->flat<float>().data();
// top_pose
dims[1] = 7;
TensorShape output_shape_pose;
TensorShapeUtils::MakeShape(dims, 2, &output_shape_pose);
Tensor* top_pose_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(1, output_shape_pose, &top_pose_tensor));
float* top_pose_final = top_pose_tensor->flat<float>().data();
// top target
dims[1] = 4 * num_classes;
TensorShape output_shape_target;
TensorShapeUtils::MakeShape(dims, 2, &output_shape_target);
Tensor* top_target_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(2, output_shape_target, &top_target_tensor));
float* top_target_final = top_target_tensor->flat<float>().data();
// top weight
Tensor* top_weight_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(3, output_shape_target, &top_weight_tensor));
float* top_weight_final = top_weight_tensor->flat<float>().data();
// top domain
TensorShape output_shape_domain;
TensorShapeUtils::MakeShape(&num_rois, 1, &output_shape_domain);
Tensor* top_domain_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(4, output_shape_domain, &top_domain_tensor));
int* top_domain_final = top_domain_tensor->template flat<int>().data();
copy_outputs(top_box, top_pose, top_target, top_weight, top_domain, top_box_final,
top_pose_final, top_target_final, top_weight_final, top_domain_final, num_classes, num_rois);
}
private:
int is_train_;
int threshold_vote_;
int skip_pixels_;
};
REGISTER_KERNEL_BUILDER(Name("Houghvotinggpu").Device(DEVICE_GPU).TypeConstraint<float>("T"), HoughvotinggpuOp<Eigen::GpuDevice, float>);
// compute gradient
template <typename Device, typename T>
class HoughvotinggpuGradOp : public OpKernel {
public:
explicit HoughvotinggpuGradOp(OpKernelConstruction* context) : OpKernel(context) {
}
void Compute(OpKernelContext* context) override
{
// Grab the input tensor
const Tensor& bottom_label = context->input(0);
const Tensor& bottom_vertex = context->input(1);
// check the input dimensions
OP_REQUIRES(context, bottom_label.dims() == 3,
errors::InvalidArgument("label must be 3-dimensional"));
OP_REQUIRES(context, bottom_vertex.dims() == 4,
errors::InvalidArgument("vertex must be 4-dimensional"));
// batch size
int batch_size = bottom_label.dim_size(0);
// height
int height = bottom_label.dim_size(1);
// width
int width = bottom_label.dim_size(2);
// num of classes
int num_classes = bottom_vertex.dim_size(3) / VERTEX_CHANNELS;
// construct the output shape
TensorShape output_shape = bottom_label.shape();
Tensor* top_label_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &top_label_tensor));
float* top_label = top_label_tensor->flat<float>().data();
TensorShape output_shape_1 = bottom_vertex.shape();
Tensor* top_vertex_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(1, output_shape_1, &top_vertex_tensor));
float* top_vertex = top_vertex_tensor->flat<float>().data();
set_gradients(top_label, top_vertex, batch_size, height, width, num_classes);
}
};
// REGISTER_KERNEL_BUILDER(Name("HoughvotinggpuGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"), HoughvotinggpuGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("HoughvotinggpuGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"), HoughvotinggpuGradOp<Eigen::GpuDevice, float>);
void hough_voting(const int* labelmap, const float* vertmap, std::vector<std::vector<cv::Point3f>> bb3Ds,
int batch, int height, int width, int num_classes, int is_train,
float fx, float fy, float px, float py, std::vector<cv::Vec<float, 14> >& outputs)
{
float inlierThreshold = 0.9;
int votingThreshold = 50;
// camera matrix
cv::Mat_<float> camMat = cv::Mat_<float>::zeros(3, 3);
camMat(0, 0) = fx;
camMat(1, 1) = fy;
camMat(2, 2) = 1.f;
camMat(0, 2) = px;
camMat(1, 2) = py;
// initialize hough space
int* hough_space = (int*)malloc(sizeof(int) * height * width * num_classes);
memset(hough_space, 0, sizeof(int) * height * width * num_classes);
int* flags = (int*)malloc(sizeof(int) * num_classes);
memset(flags, 0, sizeof(int) * num_classes);
// for each pixel
for (int x = 0; x < width; x++)
{
for (int y = 0; y < height; y++)
{
int c = labelmap[y * width + x];
if (c > 0)
{
flags[c] = 1;
// read the predict center direction
int offset = VERTEX_CHANNELS * c + VERTEX_CHANNELS * num_classes * (y * width + x);
float u = vertmap[offset];
float v = vertmap[offset + 1];
float norm = sqrt(u * u + v * v);
u /= norm;
v /= norm;
// voting
float delta = 1.0 / fabs(u);
float cx = x;
float cy = y;
while(1)
{
cx += delta * u;
cy += delta * v;
int center_x = int(cx);
int center_y = int(cy);
if (center_x >= 0 && center_x < width && center_y >= 0 && center_y < height)
{
offset = c + num_classes * (center_y * width + center_x);
hough_space[offset] += 1;
}
else
break;
}
}
}
}
// find the maximum in hough space
for (int c = 1; c < num_classes; c++)
{
if (flags[c])
{
int max_vote = 0;
int max_x, max_y;
for (int x = 0; x < width; x++)
{
for (int y = 0; y < height; y++)
{
int offset = c + num_classes * (y * width + x);
if (hough_space[offset] > max_vote)
{
max_vote = hough_space[offset];
max_x = x;
max_y = y;
}
}
}
if (max_vote < votingThreshold)
continue;
// center
cv::Point2f center(max_x, max_y);
int bb_width, bb_height;
float bb_distance;
compute_width_height(labelmap, vertmap, center, bb3Ds, camMat, inlierThreshold, height, width, c, num_classes, bb_width, bb_height, bb_distance);
// construct output
cv::Vec<float, 14> roi;
roi(0) = batch;
roi(1) = c;
// bounding box
float scale = 0.05;
roi(2) = center.x - bb_width * (0.5 + scale);
roi(3) = center.y - bb_height * (0.5 + scale);
roi(4) = center.x + bb_width * (0.5 + scale);
roi(5) = center.y + bb_height * (0.5 + scale);
// score
roi(6) = max_vote;
// pose
float rx = (center.x - px) / fx;
float ry = (center.y - py) / fy;
roi(7) = 1;
roi(8) = 0;
roi(9) = 0;
roi(10) = 0;
roi(11) = rx * bb_distance;
roi(12) = ry * bb_distance;
roi(13) = bb_distance;
outputs.push_back(roi);
if (is_train)
{
// add jittering rois
float x1 = roi(2);
float y1 = roi(3);
float x2 = roi(4);
float y2 = roi(5);
float ww = x2 - x1;
float hh = y2 - y1;
// (-1, -1)
roi(2) = x1 - 0.05 * ww;
roi(3) = y1 - 0.05 * hh;
roi(4) = roi(2) + ww;
roi(5) = roi(3) + hh;
outputs.push_back(roi);
// (+1, -1)
roi(2) = x1 + 0.05 * ww;
roi(3) = y1 - 0.05 * hh;
roi(4) = roi(2) + ww;
roi(5) = roi(3) + hh;
outputs.push_back(roi);
// (-1, +1)
roi(2) = x1 - 0.05 * ww;
roi(3) = y1 + 0.05 * hh;
roi(4) = roi(2) + ww;
roi(5) = roi(3) + hh;
outputs.push_back(roi);
// (+1, +1)
roi(2) = x1 + 0.05 * ww;
roi(3) = y1 + 0.05 * hh;
roi(4) = roi(2) + ww;
roi(5) = roi(3) + hh;
outputs.push_back(roi);
// (0, -1)
roi(2) = x1;
roi(3) = y1 - 0.05 * hh;
roi(4) = roi(2) + ww;
roi(5) = roi(3) + hh;
outputs.push_back(roi);
// (-1, 0)
roi(2) = x1 - 0.05 * ww;
roi(3) = y1;
roi(4) = roi(2) + ww;
roi(5) = roi(3) + hh;
outputs.push_back(roi);
// (0, +1)
roi(2) = x1;
roi(3) = y1 + 0.05 * hh;
roi(4) = roi(2) + ww;
roi(5) = roi(3) + hh;
outputs.push_back(roi);
// (+1, 0)
roi(2) = x1 + 0.05 * ww;
roi(3) = y1;
roi(4) = roi(2) + ww;
roi(5) = roi(3) + hh;
outputs.push_back(roi);
}
}
}
}
inline float angle_distance(cv::Point2f x, cv::Point2f n, cv::Point2f p)
{
return n.dot(x - p) / (cv::norm(n) * cv::norm(x - p));
}
inline void compute_width_height(const int* labelmap, const float* vertmap, cv::Point2f center,
std::vector<std::vector<cv::Point3f>> bb3Ds, cv::Mat camMat, float inlierThreshold,
int height, int width, int channel, int num_classes, int & bb_width, int & bb_height, float & bb_distance)
{
float d = 0;
int count = 0;
// for each pixel
std::vector<float> dx;
std::vector<float> dy;
for (int x = 0; x < width; x++)
{
for (int y = 0; y < height; y++)
{
if (labelmap[y * width + x] == channel)
{
cv::Point2f point(x, y);
// read out object coordinate
int offset = VERTEX_CHANNELS * channel + VERTEX_CHANNELS * num_classes * (y * width + x);
float u = vertmap[offset];
float v = vertmap[offset + 1];
float distance = exp(vertmap[offset + 2]);
float norm = sqrt(u * u + v * v);
u /= norm;
v /= norm;
cv::Point2f direction(u, v);
// inlier check
if(angle_distance(center, direction, point) > inlierThreshold)
{
dx.push_back(fabs(point.x - center.x));
dy.push_back(fabs(point.y - center.y));
d += distance;
count++;
}
}
}
}
bb_distance = d / count;
// estimate a projection
cv::Mat tvec(3, 1, CV_64F);
cv::Mat rvec(3, 1, CV_64F);
for(int i = 0; i < 3; i++)
{
tvec.at<double>(i, 0) = 0;
rvec.at<double>(i, 0) = 0;
}
tvec.at<double>(2, 0) = bb_distance;
std::vector<cv::Point2f> bb2D;
cv::projectPoints(bb3Ds[channel-1], rvec, tvec, camMat, cv::Mat(), bb2D);
// get min-max of projected vertices
int minX = 1e8;
int maxX = -1e8;
int minY = 1e8;
int maxY = -1e8;
for(int i = 0; i < bb2D.size(); i++)
{
minX = std::min((float) minX, bb2D[i].x);
minY = std::min((float) minY, bb2D[i].y);
maxX = std::max((float) maxX, bb2D[i].x);
maxY = std::max((float) maxY, bb2D[i].y);
}
cv::Rect bb = cv::Rect(0, 0, (maxX - minX + 1), (maxY - minY + 1));
std::vector<float>::iterator it;
it = std::remove_if(dx.begin(), dx.end(), std::bind2nd(std::greater<float>(), std::max(bb.width, bb.height) ));
dx.erase(it, dx.end());
it = std::remove_if(dy.begin(), dy.end(), std::bind2nd(std::greater<float>(), std::max(bb.width, bb.height) ));
dy.erase(it, dy.end());
std::sort(dx.begin(), dx.end());
std::sort(dy.begin(), dy.end());
bb_width = 2 * dx[int(dx.size() * 0.95)];
bb_height = 2 * dy[int(dy.size() * 0.95)];
}
// compute the pose target and weight
void compute_target_weight(int height, int width, float* target, float* weight, std::vector<std::vector<cv::Point3f>> bb3Ds,
const float* poses_gt, int num_gt, int num_classes, float fx, float fy, float px, float py, std::vector<cv::Vec<float, 14> > outputs)
{
int num = outputs.size();
float threshold = 0.2;
// camera matrix
cv::Mat_<float> camMat = cv::Mat_<float>::zeros(3, 3);
camMat(0, 0) = fx;
camMat(1, 1) = fy;
camMat(2, 2) = 1.f;
camMat(0, 2) = px;
camMat(1, 2) = py;
// compute the gt boxes
std::vector<cv::Rect> bb2Ds_gt(num_gt);
for (int i = 0; i < num_gt; i++)
{
Eigen::Quaternionf quaternion(poses_gt[i * 13 + 6], poses_gt[i * 13 + 7], poses_gt[i * 13 + 8], poses_gt[i * 13 + 9]);
Eigen::Matrix3f rmatrix = quaternion.toRotationMatrix();
cv::Mat rmat_trans = cv::Mat(3, 3, CV_32F, rmatrix.data());
cv::Mat rmat;
cv::transpose(rmat_trans, rmat);
cv::Mat rvec(3, 1, CV_64F);
cv::Rodrigues(rmat, rvec);
cv::Mat tvec(3, 1, CV_64F);
tvec.at<double>(0, 0) = poses_gt[i * 13 + 10];
tvec.at<double>(1, 0) = poses_gt[i * 13 + 11];
tvec.at<double>(2, 0) = poses_gt[i * 13 + 12];
int objID = int(poses_gt[i * 13 + 1]);
std::vector<cv::Point3f> bb3D = bb3Ds[objID-1];
bb2Ds_gt[i] = getBB2D(width, height, bb3D, camMat, rvec, tvec);
}
for (int i = 0; i < num; i++)
{
cv::Vec<float, 14> roi = outputs[i];
int batch_id = int(roi(0));
int class_id = int(roi(1));
// find the gt index
int gt_ind = -1;
for (int j = 0; j < num_gt; j++)
{
int gt_batch = int(poses_gt[j * 13 + 0]);
int gt_id = int(poses_gt[j * 13 + 1]);
if(class_id == gt_id && batch_id == gt_batch)
{
gt_ind = j;
break;
}
}
if (gt_ind == -1)
continue;
// compute bounding box overlap
float x1 = roi(2);
float y1 = roi(3);
float x2 = roi(4);
float y2 = roi(5);
cv::Rect bb2D(x1, y1, x2-x1, y2-y1);
float overlap = getIoU(bb2D, bb2Ds_gt[gt_ind]);
if (overlap < threshold)
continue;
target[i * 4 * num_classes + 4 * class_id + 0] = poses_gt[gt_ind * 13 + 6];
target[i * 4 * num_classes + 4 * class_id + 1] = poses_gt[gt_ind * 13 + 7];
target[i * 4 * num_classes + 4 * class_id + 2] = poses_gt[gt_ind * 13 + 8];
target[i * 4 * num_classes + 4 * class_id + 3] = poses_gt[gt_ind * 13 + 9];
weight[i * 4 * num_classes + 4 * class_id + 0] = 1;
weight[i * 4 * num_classes + 4 * class_id + 1] = 1;
weight[i * 4 * num_classes + 4 * class_id + 2] = 1;
weight[i * 4 * num_classes + 4 * class_id + 3] = 1;
}
}
// get 3D bounding boxes
void getBb3Ds(const float* extents, std::vector<std::vector<cv::Point3f>>& bb3Ds, int num_classes)
{
// for each object
for (int i = 1; i < num_classes; i++)
{
cv::Vec<float, 3> extent;
extent(0) = extents[i * 3];
extent(1) = extents[i * 3 + 1];
extent(2) = extents[i * 3 + 2];
bb3Ds.push_back(getBB3D(extent));
}
}
inline std::vector<cv::Point3f> getBB3D(const cv::Vec<float, 3>& extent)
{
std::vector<cv::Point3f> bb;
float xHalf = extent[0] * 0.5;
float yHalf = extent[1] * 0.5;
float zHalf = extent[2] * 0.5;
bb.push_back(cv::Point3f(xHalf, yHalf, zHalf));
bb.push_back(cv::Point3f(-xHalf, yHalf, zHalf));
bb.push_back(cv::Point3f(xHalf, -yHalf, zHalf));
bb.push_back(cv::Point3f(-xHalf, -yHalf, zHalf));
bb.push_back(cv::Point3f(xHalf, yHalf, -zHalf));
bb.push_back(cv::Point3f(-xHalf, yHalf, -zHalf));
bb.push_back(cv::Point3f(xHalf, -yHalf, -zHalf));
bb.push_back(cv::Point3f(-xHalf, -yHalf, -zHalf));
return bb;
}
inline cv::Rect getBB2D(int imageWidth, int imageHeight, const std::vector<cv::Point3f>& bb3D, const cv::Mat& camMat, const cv::Mat& rvec, const cv::Mat& tvec)
{
// project 3D bounding box vertices into the image
std::vector<cv::Point2f> bb2D;
cv::projectPoints(bb3D, rvec, tvec, camMat, cv::Mat(), bb2D);
// get min-max of projected vertices
int minX = imageWidth - 1;
int maxX = 0;
int minY = imageHeight - 1;
int maxY = 0;
for(unsigned j = 0; j < bb2D.size(); j++)
{
minX = std::min((float) minX, bb2D[j].x);
minY = std::min((float) minY, bb2D[j].y);
maxX = std::max((float) maxX, bb2D[j].x);
maxY = std::max((float) maxY, bb2D[j].y);
}
// clamp at image border
minX = clamp(minX, 0, imageWidth - 1);
maxX = clamp(maxX, 0, imageWidth - 1);
minY = clamp(minY, 0, imageHeight - 1);
maxY = clamp(maxY, 0, imageHeight - 1);
return cv::Rect(minX, minY, (maxX - minX + 1), (maxY - minY + 1));
}
inline float getIoU(const cv::Rect& bb1, const cv::Rect bb2)
{
cv::Rect intersection = bb1 & bb2;
return (intersection.area() / (float) (bb1.area() + bb2.area() - intersection.area()));
}
hough_voting_gpu_op.cu.cc
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thrust/extrema.h>
#include <thrust/execution_policy.h>
// Eigen geometry module for quaternions/rotation matrices in device code
#include <Eigen/Geometry>
#include "hough_voting_gpu_op.h"
#define VERTEX_CHANNELS 3
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
// namespace tensorflow {
using namespace tensorflow;
__device__ inline float point2line(int cx, int cy, int x, int y, float u, float v)
{
float n1 = -v;
float n2 = u;
return fabs(n1 * (cx - x) + n2 * (cy - y)) / sqrt(n1 * n1 + n2 * n2);
}
__device__ inline float angle_distance(int cx, int cy, int x, int y, float u, float v)
{
float dx = cx - x;
float dy = cy - y;
float n1 = sqrt(u * u + v * v);
float n2 = sqrt(dx * dx + dy * dy);
float dot = u * dx + v * dy;
float distance = dot / (n1 * n2);
return distance;
}
__device__ inline float angle_distance_label(int cx, int cy, int x, int y, float u, float v,
int cls, const int height, const int width, const int* labelmap)
{
float dx = cx - x;
float dy = cy - y;
float n1 = sqrt(u * u + v * v);
float n2 = sqrt(dx * dx + dy * dy);
float dot = u * dx + v * dy;
float distance = dot / (n1 * n2);
int num = 20;
int count = 0;
for (int i = 1; i <= num; i++)
{
float step = float(i) / float(num);
int px = int(x + step * dx);
int py = int(y + step * dy);
if (px >= 0 && px < width && py >= 0 && py < height)
{
if (labelmap[py * width + px] == cls)
count++;
}
}
if ((float)count / float(num) < 0.8)
distance = 0;
return distance;
}
__device__ inline float IoU(float* a, float* b)
{
float left = fmax(a[0], b[0]), right = fmin(a[2], b[2]);
float top = fmax(a[1], b[1]), bottom = fmin(a[3], b[3]);
float width = fmax(right - left + 1, 0.f), height = fmax(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__device__ inline void project_box(int cls, const float* extents, const float* meta_data, float distance, float* threshold)
{
float xHalf = extents[cls * 3 + 0] * 0.5;
float yHalf = extents[cls * 3 + 1] * 0.5;
float zHalf = extents[cls * 3 + 2] * 0.5;
float bb3D[24];
bb3D[0] = xHalf; bb3D[1] = yHalf; bb3D[2] = zHalf + distance;
bb3D[3] = -xHalf; bb3D[4] = yHalf; bb3D[5] = zHalf + distance;
bb3D[6] = xHalf; bb3D[7] = -yHalf; bb3D[8] = zHalf + distance;
bb3D[9] = -xHalf; bb3D[10] = -yHalf; bb3D[11] = zHalf + distance;
bb3D[12] = xHalf; bb3D[13] = yHalf; bb3D[14] = -zHalf + distance;
bb3D[15] = -xHalf; bb3D[16] = yHalf; bb3D[17] = -zHalf + distance;
bb3D[18] = xHalf; bb3D[19] = -yHalf; bb3D[20] = -zHalf + distance;
bb3D[21] = -xHalf; bb3D[22] = -yHalf; bb3D[23] = -zHalf + distance;
float fx = meta_data[0];
float fy = meta_data[4];
float px = meta_data[2];
float py = meta_data[5];
float minX = 1e8;
float maxX = -1e8;
float minY = 1e8;
float maxY = -1e8;
for (int i = 0; i < 8; i++)
{
float x = fx * (bb3D[i * 3] / bb3D[i * 3 + 2]) + px;
float y = fy * (bb3D[i * 3 + 1] / bb3D[i * 3 + 2]) + py;
minX = fmin(minX, x);
minY = fmin(minY, y);
maxX = fmax(maxX, x);
maxY = fmax(maxY, y);
}
float width = maxX - minX + 1;
float height = maxY - minY + 1;
*threshold = fmax(width, height) * 0.6;
}
__device__ inline float compute_box_overlap(int cls, const float* extents, const float* meta_data, const float* pose, float* box)
{
float xHalf = extents[cls * 3 + 0] * 0.5;
float yHalf = extents[cls * 3 + 1] * 0.5;
float zHalf = extents[cls * 3 + 2] * 0.5;
Eigen::Matrix<float, 8, 3> bb3D;
bb3D(0, 0) = xHalf; bb3D(0, 1) = yHalf; bb3D(0, 2) = zHalf;
bb3D(1, 0) = -xHalf; bb3D(1, 1) = yHalf; bb3D(1, 2) = zHalf;
bb3D(2, 0) = xHalf; bb3D(2, 1) = -yHalf; bb3D(2, 2) = zHalf;
bb3D(3, 0) = -xHalf; bb3D(3, 1) = -yHalf; bb3D(3, 2) = zHalf;
bb3D(4, 0) = xHalf; bb3D(4, 1) = yHalf; bb3D(4, 2) = -zHalf;
bb3D(5, 0) = -xHalf; bb3D(5, 1) = yHalf; bb3D(5, 2) = -zHalf;
bb3D(6, 0) = xHalf; bb3D(6, 1)= -yHalf; bb3D(6, 2) = -zHalf;
bb3D(7, 0) = -xHalf; bb3D(7, 1) = -yHalf; bb3D(7, 2) = -zHalf;
// rotation
Eigen::Quaternionf quaternion(pose[6], pose[7], pose[8], pose[9]);
Eigen::Matrix3f rmatrix = quaternion.toRotationMatrix();
Eigen::Matrix<float, 3, 8> bb3D_new = rmatrix * bb3D.transpose();
// projection
float fx = meta_data[0];
float fy = meta_data[4];
float px = meta_data[2];
float py = meta_data[5];
float x1 = 1e8;
float x2 = -1e8;
float y1 = 1e8;
float y2 = -1e8;
for (int i = 0; i < 8; i++)
{
float X = bb3D_new(0, i) + pose[10];
float Y = bb3D_new(1, i) + pose[11];
float Z = bb3D_new(2, i) + pose[12];
float x = fx * (X / Z) + px;
float y = fy * (Y / Z) + py;
x1 = fmin(x1, x);
y1 = fmin(y1, y);
x2 = fmax(x2, x);
y2 = fmax(y2, y);
}
float box_gt[4];
box_gt[0] = x1;
box_gt[1] = y1;
box_gt[2] = x2;
box_gt[3] = y2;
return IoU(box, box_gt);
}
__global__ void compute_arrays_kernel(const int nthreads, const int* labelmap,
int* arrays, int* array_size, const int height, const int width)
{
CUDA_1D_KERNEL_LOOP(index, nthreads)
{
int cls = labelmap[index];
if (cls > 0)
{
int size = atomicAdd(array_size + cls, 1);
int offset = cls * height * width + size;
arrays[offset] = index;
}
}
}
__global__ void compute_hough_kernel(const int nthreads, float* hough_space, float* hough_data, const int* labelmap,
const float* vertmap, const float* extents, const float* meta_data, int* arrays, int* array_size,
int* class_indexes, const int height, const int width, const int num_classes, const int count, const float inlierThreshold, const int skip_pixels)
{
CUDA_1D_KERNEL_LOOP(index, nthreads)
{
// (cls, cx, cy) is an element in the hough space
int ind = index / (height * width);
int cls = class_indexes[ind];
int n = index % (height * width);
int cx = n % width;
int cy = n / width;
int size = array_size[cls];
float distance = 0;
float bb_width = -1;
float bb_height = -1;
float threshold;
for (int i = 0; i < size; i += skip_pixels)
{
int offset = cls * height * width + i;
int location = arrays[offset];
int x = location % width;
int y = location / width;
// read the direction
offset = VERTEX_CHANNELS * cls + VERTEX_CHANNELS * num_classes * (y * width + x);
float u = vertmap[offset];
float v = vertmap[offset + 1];
float d = exp(vertmap[offset + 2]);
// vote
if (angle_distance(cx, cy, x, y, u, v) > inlierThreshold)
// if (point2line(cx, cy, x, y, u, v) < 1 && angle_distance_label(cx, cy, x, y, u, v, cls, height, width, labelmap) > 0)
{
project_box(cls, extents, meta_data, d, &threshold);
float dx = fabsf(x - cx);
float dy = fabsf(y - cy);
if (dx < threshold && dy < threshold)
{
hough_space[index]++;
distance += d;
}
if (dx > bb_width && dx < threshold && dy < threshold)
bb_width = dx;
if (dy > bb_height && dx < threshold && dy < threshold)
bb_height = dy;
}
}
if (hough_space[index] > 0)
{
distance /= hough_space[index];
int offset = ind * height * width * 3 + 3 * (cy * width + cx);
hough_data[offset] = distance;
hough_data[offset + 1] = 2 * bb_height;
hough_data[offset + 2] = 2 * bb_width;
}
}
}
__global__ void compute_max_indexes_kernel(const int nthreads, int* max_indexes, int* num_max, float* hough_space,
int height, int width, float threshold)
{
CUDA_1D_KERNEL_LOOP(index, nthreads)
{
// (ind, cx, cy) is an element in the hough space
int ind = index / (height * width);
int n = index % (height * width);
int cx = n % width;
int cy = n / width;
int kernel_size = 3;
if (hough_space[index] > threshold)
{
// check if the location is local maximum
int flag = 0;
for (int x = cx - kernel_size; x <= cx + kernel_size; x++)
{
for (int y = cy - kernel_size; y <= cy + kernel_size; y++)
{
if (x >= 0 && x < width && y >= 0 && y < height)
{
if (hough_space[ind * height * width + y * width + x] > hough_space[index])
{
flag = 1;
break;
}
}
}
}
if (flag == 0)
{
// add the location to max_indexes
int max_index = atomicAdd(num_max, 1);
max_indexes[max_index] = index;
}
}
}
}
__global__ void compute_rois_kernel(const int nthreads, float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain,
const float* extents, const float* meta_data, const float* gt, float* hough_space, float* hough_data, int* max_indexes, int* class_indexes,
int is_train, int batch_index, const int height, const int width, const int num_classes, const int num_gt, int* num_rois)
{
CUDA_1D_KERNEL_LOOP(index, nthreads)
{
float scale = 0.05;
int max_index = max_indexes[index];
int ind = max_index / (height * width);
int cls = class_indexes[ind];
int n = max_index % (height * width);
int x = n % width;
int y = n / width;
float fx = meta_data[0];
float fy = meta_data[4];
float px = meta_data[2];
float py = meta_data[5];
float rx = (x - px) / fx;
float ry = (y - py) / fy;
int offset = ind * height * width * 3 + 3 * (y * width + x);
float bb_distance = hough_data[offset];
float bb_height = hough_data[offset + 1];
float bb_width = hough_data[offset + 2];
if (is_train)
{
int roi_index = atomicAdd(num_rois, 9);
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x - bb_width * (0.5 + scale);
top_box[roi_index * 7 + 3] = y - bb_height * (0.5 + scale);
top_box[roi_index * 7 + 4] = x + bb_width * (0.5 + scale);
top_box[roi_index * 7 + 5] = y + bb_height * (0.5 + scale);
top_box[roi_index * 7 + 6] = hough_space[max_index];
for (int i = 0; i < 9; i++)
{
top_pose[(roi_index + i) * 7 + 0] = 1;
top_pose[(roi_index + i) * 7 + 1] = 0;
top_pose[(roi_index + i) * 7 + 2] = 0;
top_pose[(roi_index + i) * 7 + 3] = 0;
top_pose[(roi_index + i) * 7 + 4] = rx * bb_distance;
top_pose[(roi_index + i) * 7 + 5] = ry * bb_distance;
top_pose[(roi_index + i) * 7 + 6] = bb_distance;
if (num_gt == 0)
top_domain[roi_index + i] = 1;
else
top_domain[roi_index + i] = 0;
}
// find the gt index
int gt_ind = -1;
for (int i = 0; i < num_gt; i++)
{
int gt_batch = int(gt[i * 13 + 0]);
int gt_id = int(gt[i * 13 + 1]);
if(cls == gt_id && batch_index == gt_batch)
{
gt_ind = i;
break;
}
}
if (gt_ind != -1)
{
float overlap = compute_box_overlap(cls, extents, meta_data, gt + gt_ind * 13, top_box + roi_index * 7 + 2);
if (overlap > 0.2)
{
for (int i = 0; i < 9; i++)
{
top_target[(roi_index + i) * 4 * num_classes + 4 * cls + 0] = gt[gt_ind * 13 + 6];
top_target[(roi_index + i) * 4 * num_classes + 4 * cls + 1] = gt[gt_ind * 13 + 7];
top_target[(roi_index + i) * 4 * num_classes + 4 * cls + 2] = gt[gt_ind * 13 + 8];
top_target[(roi_index + i) * 4 * num_classes + 4 * cls + 3] = gt[gt_ind * 13 + 9];
top_weight[(roi_index + i) * 4 * num_classes + 4 * cls + 0] = 1;
top_weight[(roi_index + i) * 4 * num_classes + 4 * cls + 1] = 1;
top_weight[(roi_index + i) * 4 * num_classes + 4 * cls + 2] = 1;
top_weight[(roi_index + i) * 4 * num_classes + 4 * cls + 3] = 1;
}
}
// else
// printf("small overlap\n");
}
// else
// printf("no gt pose\n");
// add jittering boxes
float x1 = top_box[roi_index * 7 + 2];
float y1 = top_box[roi_index * 7 + 3];
float x2 = top_box[roi_index * 7 + 4];
float y2 = top_box[roi_index * 7 + 5];
float ww = x2 - x1;
float hh = y2 - y1;
// (-1, -1)
roi_index++;
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x1 - 0.05 * ww;
top_box[roi_index * 7 + 3] = y1 - 0.05 * hh;
top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
top_box[roi_index * 7 + 6] = hough_space[max_index];
// (+1, -1)
roi_index++;
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x1 + 0.05 * ww;
top_box[roi_index * 7 + 3] = y1 - 0.05 * hh;
top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
top_box[roi_index * 7 + 6] = hough_space[max_index];
// (-1, +1)
roi_index++;
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x1 - 0.05 * ww;
top_box[roi_index * 7 + 3] = y1 + 0.05 * hh;
top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
top_box[roi_index * 7 + 6] = hough_space[max_index];
// (+1, +1)
roi_index++;
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x1 + 0.05 * ww;
top_box[roi_index * 7 + 3] = y1 + 0.05 * hh;
top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
top_box[roi_index * 7 + 6] = hough_space[max_index];
// (0, -1)
roi_index++;
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x1;
top_box[roi_index * 7 + 3] = y1 - 0.05 * hh;
top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
top_box[roi_index * 7 + 6] = hough_space[max_index];
// (-1, 0)
roi_index++;
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x1 - 0.05 * ww;
top_box[roi_index * 7 + 3] = y1;
top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
top_box[roi_index * 7 + 6] = hough_space[max_index];
// (0, +1)
roi_index++;
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x1;
top_box[roi_index * 7 + 3] = y1 + 0.05 * hh;
top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
top_box[roi_index * 7 + 6] = hough_space[max_index];
// (+1, 0)
roi_index++;
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x1 + 0.05 * ww;
top_box[roi_index * 7 + 3] = y1;
top_box[roi_index * 7 + 4] = top_box[roi_index * 7 + 2] + ww;
top_box[roi_index * 7 + 5] = top_box[roi_index * 7 + 3] + hh;
top_box[roi_index * 7 + 6] = hough_space[max_index];
}
else
{
int roi_index = atomicAdd(num_rois, 1);
top_box[roi_index * 7 + 0] = batch_index;
top_box[roi_index * 7 + 1] = cls;
top_box[roi_index * 7 + 2] = x - bb_width * (0.5 + scale);
top_box[roi_index * 7 + 3] = y - bb_height * (0.5 + scale);
top_box[roi_index * 7 + 4] = x + bb_width * (0.5 + scale);
top_box[roi_index * 7 + 5] = y + bb_height * (0.5 + scale);
top_box[roi_index * 7 + 6] = hough_space[max_index];
top_pose[roi_index * 7 + 0] = 1;
top_pose[roi_index * 7 + 1] = 0;
top_pose[roi_index * 7 + 2] = 0;
top_pose[roi_index * 7 + 3] = 0;
top_pose[roi_index * 7 + 4] = rx * bb_distance;
top_pose[roi_index * 7 + 5] = ry * bb_distance;
top_pose[roi_index * 7 + 6] = bb_distance;
}
}
}
void reset_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, int num_classes)
{
int num = 1024;
cudaMemset(top_box, 0, num * 7 * sizeof(float));
cudaMemset(top_pose, 0, num * 7 * sizeof(float));
cudaMemset(top_target, 0, num * 4 *num_classes * sizeof(float));
cudaMemset(top_weight, 0, num * 4 * num_classes * sizeof(float));
cudaMemset(top_domain, 0, num * sizeof(int));
cudaMemset(num_rois, 0, sizeof(int));
}
void copy_num_rois(int* num_rois, int* num_rois_device)
{
cudaMemcpy(num_rois, num_rois_device, sizeof(int), cudaMemcpyDeviceToHost);
}
void copy_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain,
float* top_box_final, float* top_pose_final, float* top_target_final, float* top_weight_final, int* top_domain_final, int num_classes, int num_rois)
{
cudaMemcpy(top_box_final, top_box, num_rois * 7 * sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpy(top_pose_final, top_pose, num_rois * 7 * sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpy(top_target_final, top_target, num_rois * 4 * num_classes * sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpy(top_weight_final, top_weight, num_rois * 4 * num_classes * sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpy(top_domain_final, top_domain, num_rois * sizeof(int), cudaMemcpyDeviceToDevice);
}
void set_gradients(float* top_label, float* top_vertex, int batch_size, int height, int width, int num_classes)
{
cudaMemset(top_label, 0, batch_size * height * width * sizeof(float));
cudaMemset(top_vertex, 0, batch_size * height * width * 3 * num_classes * sizeof(float));
}
void HoughVotingLaucher(OpKernelContext* context,
const int* labelmap, const float* vertmap, const float* extents, const float* meta_data, const float* gt,
const int batch_index, const int height, const int width, const int num_classes, const int num_gt,
const int is_train, const float inlierThreshold, const int labelThreshold, const int votingThreshold, const int skip_pixels,
float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, const Eigen::GpuDevice& d)
{
const int kThreadsPerBlock = 1024;
int output_size;
cudaError_t err;
// step 1: compute a label index array for each class
int dims[2];
dims[0] = num_classes;
dims[1] = height * width;
TensorShape output_shape_arrays;
TensorShapeUtils::MakeShape(dims, 2, &output_shape_arrays);
Tensor arrays_tensor;
OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_arrays, &arrays_tensor));
int* arrays = arrays_tensor.flat<int>().data();
TensorShape output_shape_array_sizes;
TensorShapeUtils::MakeShape(&num_classes, 1, &output_shape_array_sizes);
Tensor array_sizes_tensor;
OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_array_sizes, &array_sizes_tensor));
int* array_sizes = array_sizes_tensor.flat<int>().data();
cudaMemset(array_sizes, 0, num_classes * sizeof(int));
output_size = height * width;
compute_arrays_kernel<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, labelmap, arrays, array_sizes, height, width);
cudaThreadSynchronize();
// compute class indexes
int* array_sizes_host = (int*)malloc(num_classes * sizeof(int));
int* class_indexes_host = (int*)malloc(num_classes * sizeof(int));
cudaMemcpy(array_sizes_host, array_sizes, num_classes * sizeof(int), cudaMemcpyDeviceToHost);
int count = 0;
for (int c = 1; c < num_classes; c++)
{
if (array_sizes_host[c] > labelThreshold)
{
class_indexes_host[count] = c;
count++;
}
// else
// printf("class %d with only pixels %d\n", c, array_sizes_host[c]);
}
if (count == 0)
{
free(array_sizes_host);
free(class_indexes_host);
return;
}
TensorShape output_shape_class_indexes;
TensorShapeUtils::MakeShape(&count, 1, &output_shape_class_indexes);
Tensor class_indexes_tensor;
OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_class_indexes, &class_indexes_tensor));
int* class_indexes = class_indexes_tensor.flat<int>().data();
cudaMemcpy(class_indexes, class_indexes_host, count * sizeof(int), cudaMemcpyHostToDevice);
err = cudaGetLastError();
if(cudaSuccess != err)
{
fprintf( stderr, "cudaCheckError() failed compute label index: %s\n", cudaGetErrorString( err ) );
exit( -1 );
}
// step 2: compute the hough space
int hdims[4];
hdims[0] = count;
hdims[1] = height;
hdims[2] = width;
hdims[3] = 1;
TensorShape output_shape_hough_space;
TensorShapeUtils::MakeShape(hdims, 4, &output_shape_hough_space);
Tensor hough_space_tensor;
OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_hough_space, &hough_space_tensor));
float* hough_space = hough_space_tensor.flat<float>().data();
if (cudaMemset(hough_space, 0, count * height * width * sizeof(float)) != cudaSuccess)
fprintf(stderr, "reset error\n");
hdims[3] = 3;
TensorShape output_shape_hough_data;
TensorShapeUtils::MakeShape(hdims, 4, &output_shape_hough_data);
Tensor hough_data_tensor;
OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, output_shape_hough_data, &hough_data_tensor));
float* hough_data = hough_data_tensor.flat<float>().data();
if (cudaMemset(hough_data, 0, count * height * width * 3 * sizeof(float)) != cudaSuccess)
fprintf(stderr, "reset error\n");
output_size = count * height * width;
compute_hough_kernel<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, hough_space, hough_data, labelmap, vertmap, extents, meta_data,
arrays, array_sizes, class_indexes, height, width, num_classes, count, inlierThreshold, skip_pixels);
cudaThreadSynchronize();
err = cudaGetLastError();
if(cudaSuccess != err)
{
fprintf( stderr, "cudaCheckError() failed compute hough space: %s\n", cudaGetErrorString( err ) );
exit( -1 );
}
// step 3: find the maximum in hough space
int dim = 1;
TensorShape output_shape_num_max;
TensorShapeUtils::MakeShape(&dim, 1, &output_shape_num_max);
Tensor num_max_tensor;
OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_num_max, &num_max_tensor));
int* num_max = num_max_tensor.flat<int>().data();
if (cudaMemset(num_max, 0, sizeof(int)) != cudaSuccess)
fprintf(stderr, "reset error\n");
dim = 1024;
TensorShape output_shape_max_indexes;
TensorShapeUtils::MakeShape(&dim, 1, &output_shape_max_indexes);
Tensor max_indexes_tensor;
OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, output_shape_max_indexes, &max_indexes_tensor));
int* max_indexes = max_indexes_tensor.flat<int>().data();
if (cudaMemset(max_indexes, 0, dim * sizeof(int)) != cudaSuccess)
fprintf(stderr, "reset error\n");
if (votingThreshold > 0)
{
output_size = count * height * width;
compute_max_indexes_kernel<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, max_indexes, num_max, hough_space, height, width, votingThreshold);
cudaThreadSynchronize();
}
else
{
int* max_indexes_host = (int*)malloc(count * sizeof(int));
memset(max_indexes_host, 0, count * sizeof(int));
for (int i = 0; i < count; i++)
{
float *hmax = thrust::max_element(thrust::device, hough_space + i * height * width, hough_space + (i+1) * height * width);
max_indexes_host[i] = hmax - hough_space;
}
cudaMemcpy(num_max, &count, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(max_indexes, max_indexes_host, count * sizeof(int), cudaMemcpyHostToDevice);
free(max_indexes_host);
}
err = cudaGetLastError();
if(cudaSuccess != err)
{
fprintf( stderr, "cudaCheckError() failed compute maximum: %s\n", cudaGetErrorString( err ) );
exit( -1 );
}
// step 4: compute outputs
int num_max_host;
cudaMemcpy(&num_max_host, num_max, sizeof(int), cudaMemcpyDeviceToHost);
output_size = num_max_host;
compute_rois_kernel<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, top_box, top_pose, top_target, top_weight, top_domain,
extents, meta_data, gt, hough_space, hough_data, max_indexes, class_indexes,
is_train, batch_index, height, width, num_classes, num_gt, num_rois);
cudaThreadSynchronize();
// clean up
free(array_sizes_host);
free(class_indexes_host);
err = cudaGetLastError();
if(cudaSuccess != err)
{
fprintf( stderr, "cudaCheckError() failed compute outputs: %s\n", cudaGetErrorString( err ) );
exit( -1 );
}
}
// } // namespace tensorflow
#endif // GOOGLE_CUDA
hough_voting_gpu_op.h
#if !GOOGLE_CUDA
#error This file must only be included when building with Cuda support
#endif
#ifndef TENSORFLOW_USER_OPS_HOUGHVOTING_OP_GPU_H_
#define TENSORFLOW_USER_OPS_HOUGHVOTING_OP_GPU_H_
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
void HoughVotingLaucher(OpKernelContext* context,
const int* labelmap, const float* vertmap, const float* extents, const float* meta_data, const float* gt,
const int batch_index, const int height, const int width, const int num_classes, const int num_gt,
const int is_train, const float inlierThreshold, const int labelThreshold, const int votingThreshold, const int skip_pixels,
float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, const Eigen::GpuDevice& d);
void reset_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain, int* num_rois, int num_classes);
void copy_num_rois(int* num_rois, int* num_rois_device);
void copy_outputs(float* top_box, float* top_pose, float* top_target, float* top_weight, int* top_domain,
float* top_box_final, float* top_pose_final, float* top_target_final, float* top_weight_final, int* top_domain_final, int num_classes, int num_rois);
void set_gradients(float* top_label, float* top_vertex, int batch_size, int height, int width, int num_classes);
} // namespace tensorflow
#endif  // TENSORFLOW_USER_OPS_HOUGHVOTING_OP_GPU_H_
The code is compiled with the following make.sh:
TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
TF_LIB=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')
CUDA_PATH=/usr/local/cuda
nvcc -std=c++11 -c -o hough_voting_gpu_op.cu.o hough_voting_gpu_op.cu.cc \
-I $TF_INC -I$TF_INC/external/nsync/public -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -arch=sm_61
g++ -std=c++11 -shared -D_GLIBCXX_USE_CXX11_ABI=0 -o hough_voting_gpu.so hough_voting_gpu_op.cc \
hough_voting_gpu_op.cu.o -I $TF_INC -I$TF_INC/external/nsync/public -fPIC -lcudart -lcublas -lopencv_imgproc -lopencv_calib3d -lopencv_core -L $CUDA_PATH/lib64 -L$TF_LIB -ltensorflow_framework
echo 'hough_voting_gpu_layer'
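Before wiring the op into a network, a quick sanity check is to load the shared library on its own. This is a minimal sketch, assuming hough_voting_gpu.so sits in the current directory:
import tensorflow as tf
# load_op_library raises if the .so has unresolved symbols,
# e.g. a missing OpenCV or CUDA library at link time
mod = tf.load_op_library('./hough_voting_gpu.so')
print(mod.houghvotinggpu)  # the generated op wrapper should be present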
The Python side consists of these four files:
hough_voting_gpu_op.py
import tensorflow as tf
import os.path as osp
filename = osp.join(osp.dirname(__file__), 'hough_voting_gpu.so')
_hough_voting_gpu_module = tf.load_op_library(filename)
hough_voting_gpu = _hough_voting_gpu_module.houghvotinggpu
hough_voting_gpu_grad = _hough_voting_gpu_module.houghvotinggpu_grad
hough_voting_gpu_op_grad.py
import tensorflow as tf
from tensorflow.python.framework import ops
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
import hough_voting_gpu_op
@ops.RegisterShape("Houghvotinggpu")
def _hough_voting_gpu_shape(op):
dims_vertex = op.inputs[1].get_shape().as_list()
num_classes = dims_vertex[3] // 3
output_shape_0 = tf.TensorShape([None, 7])
output_shape_1 = tf.TensorShape([None, 7])
output_shape_2 = tf.TensorShape([None, 4 * num_classes])
output_shape_3 = tf.TensorShape([None, 4 * num_classes])
output_shape_4 = tf.TensorShape([None])
return [output_shape_0, output_shape_1, output_shape_2, output_shape_3, output_shape_4]
@ops.RegisterGradient("Houghvotinggpu")
def _hough_voting_gpu_grad(op, grad, tmp, tmp1, tmp2, _):
"""The gradients for `Houghvotinggpu`.
Args:
op: The `Houghvotinggpu` `Operation` that we are differentiating, which we can use
to find the inputs and outputs of the original op.
grad: Gradient with respect to the first output of the `Houghvotinggpu` op.
Returns:
Gradients with respect to the inputs of `Houghvotinggpu`.
"""
bottom_prob = op.inputs[0]
bottom_vertex = op.inputs[1]
# compute gradient
data_grad_prob, data_grad_vertex = hough_voting_gpu_op.hough_voting_gpu_grad(bottom_prob, bottom_vertex, grad)
return [data_grad_prob, data_grad_vertex, None, None, None]  # one gradient per input; only label and vertex receive (zero) gradients
__init__.py
test.py
from IPython import embed
import tensorflow as tf
import numpy as np
import hough_voting_gpu_op
import hough_voting_gpu_op_grad
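test.py only imports the modules so far. A minimal smoke test could look like the sketch below (continuing test.py and reusing the imports above); it is an assumption-laden example, not part of the original code. It assumes a TF 1.x session API and follows the shapes documented in hough_voting_gpu_op.cc: label (N, H, W) int32, vertex (N, H, W, 3 * num_classes), extents (num_classes, 3), meta_data (N, 1, 1, 48) with the intrinsics in the first nine entries, and gt (num_gt, 13). The camera intrinsics and class count are made up.
num_classes = 3
height, width = 480, 640
# All-background label map and zero vertex predictions: the op should
# return a single dummy/zero ROI without crashing.
label = tf.constant(np.zeros((1, height, width), dtype=np.int32))
vertex = tf.constant(np.zeros((1, height, width, 3 * num_classes), dtype=np.float32))
extents = tf.constant(0.1 * np.ones((num_classes, 3), dtype=np.float32))
meta = np.zeros((1, 1, 1, 48), dtype=np.float32)
meta[0, 0, 0, 0] = 500.0          # fx
meta[0, 0, 0, 4] = 500.0          # fy
meta[0, 0, 0, 2] = width / 2.0    # px
meta[0, 0, 0, 5] = height / 2.0   # py
meta_data = tf.constant(meta)
gt = tf.constant(np.zeros((1, 13), dtype=np.float32))
top_box, top_pose, top_target, top_weight, top_domain = \
    hough_voting_gpu_op.hough_voting_gpu(label, vertex, extents, meta_data, gt,
                                         is_train=0, threshold_vote=100, skip_pixels=10)
with tf.Session() as sess:
    box, pose = sess.run([top_box, top_pose])
    print('top_box', box.shape)    # expected (num_rois, 7)
    print('top_pose', pose.shape)  # expected (num_rois, 7)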