前言:本文给出了yolov3-tiny网络TensorRT推理步骤实现的c++代码,手动计算显存与内存的大小,计算yolo层输出框的位置。
/*--------------------------------输入预处理---------------------------------*/
std::string imgPath = "imagePath";
image = cv::imread(imgPath);
float scale = min(float(DETECT_WIDTH) / image.cols, float(DETECT_HEIGHT) / image.rows);
Size scaleSize = cv::Size(image.cols * scale, image.rows * scale);
cv::Mat rgb;
cv::cvtColor(image, rgb, COLOR_BGR2RGB);
cv::Mat resized;
cv::resize(rgb, resized, scaleSize, 0, 0, INTER_NEAREST);
cv::Mat cropped(DETECT_HEIGHT, DETECT_WIDTH, CV_8UC3, 127);
Rect rect((DETECT_WIDTH - scaleSize.width) / 2, (DETECT_HEIGHT - scaleSize.height) / 2, scaleSize.width, scaleSize.height);
resized.copyTo(cropped(rect));
cv::Mat img_float;
cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0);
//HWC TO CHW
vector input_channels(3);
cv::split(img_float, input_channels);
float* net_input = new float[DETECT_HEIGHT * DETECT_WIDTH * 3];
memset(net_input, 0, DETECT_HEIGHT * DETECT_WIDTH * 3 * sizeof(float));
int channelLength = DETECT_HEIGHT * DETECT_WIDTH;
for (int i = 0; i < 3; ++i)
{
memcpy(net_input, input_channels[i].data, channelLength * sizeof(float));
net_input += channelLength;
}
float* out[2];
out[0] = new float[3042];
out[1] = new float[12168];
float* out_merge = new float[3042 + 12168];
run inference and cout time
doInference(*context, net_input, out, 1);
out_merge = merge(out[0], out[1], 3042, 12168);
auto boxes = postProcess(image, out_merge);
delete[] net_input;
for (int i = 0; i < boxes.size(); i++)
{
std::cout << "boxes_" << i << ".x: " << boxes[i].x << endl;
std::cout << "boxes_" << i << ".y: " << boxes[i].y << endl;
std::cout << "boxes_" << i << ".w: " << boxes[i].w << endl;
std::cout << "boxes_" << i << ".h: " << boxes[i].h << endl;
std::cout << "boxes_" << i << ".prob: " << boxes[i].prob << endl;
}
if (boxes.size() > 0)
{
Rect rect_ptg = Rect(boxes[0].x, boxes[0].y, boxes[0].w, boxes[0].h);
rectangle(image, rect_ptg, (0, 0, 255), 2, 8, 0);
resize(image, image, Size(image.cols / 2, image.rows / 2));
imshow("Result", image);
waitKey();
}
// Concatenates two float arrays into one newly allocated array: the first
// bsize_out1 elements come from out1, the next bsize_out2 from out2.
// The result is allocated with new[]; the caller must release it with delete[].
float* merge(float* out1, float* out2, int bsize_out1, int bsize_out2)
{
    float* const combined = new float[bsize_out1 + bsize_out2];
    // std::copy returns the position just past the last element written,
    // which is where the second array starts.
    float* const second_begin = std::copy(out1, out1 + bsize_out1, combined);
    std::copy(out2, out2 + bsize_out2, second_begin);
    return combined;
}
// Runs one inference pass on the given execution context.
//
// context   : execution context; its engine is asserted to have exactly 3 bindings.
// input     : host buffer of batchSize * INPUT_CHANNEL * DETECT_HEIGHT * DETECT_WIDTH floats.
// output    : output[0] must have room for batchSize * 12168 floats (filled from binding 1),
//             output[1] must have room for batchSize * 3042 floats (filled from binding 2).
// batchSize : number of images in the batch.
//
// Device buffers and the CUDA stream are created and released inside this call.
// CUDA calls go through the CHECK macro already used in this file for
// cudaStreamCreate (its failure behavior is defined elsewhere).
void doInference(IExecutionContext& context, float* input, float** output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    assert(engine.getNbBindings() == 3);
    (void)engine; // keeps the variable "used" when assert is compiled out

    const int inputIndex = 0;
    const int outputLocIndex = 1;
    const int outputConfIndex = 2;

    // Byte sizes are computed once so allocation and copies cannot disagree.
    const size_t inputBytes = static_cast<size_t>(batchSize) * INPUT_CHANNEL * DETECT_HEIGHT * DETECT_WIDTH * sizeof(float);
    const size_t locBytes = static_cast<size_t>(batchSize) * 12168 * sizeof(float);
    const size_t confBytes = static_cast<size_t>(batchSize) * 3042 * sizeof(float);

    // Allocate GPU memory for the input and both outputs.
    void* buffers[3] = { nullptr, nullptr, nullptr };
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputLocIndex], locBytes));
    CHECK(cudaMalloc(&buffers[outputConfIndex], confBytes));

    // Use a CUDA stream to order the copies and the execution.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Host -> device: `input` is the host data, buffers[inputIndex] is the
    // device-side storage for it.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));

    if (context.enqueue(batchSize, buffers, stream, nullptr))
    {
        // Device -> host for both outputs.
        CHECK(cudaMemcpyAsync(output[0], buffers[outputLocIndex], locBytes, cudaMemcpyDeviceToHost, stream));
        CHECK(cudaMemcpyAsync(output[1], buffers[outputConfIndex], confBytes, cudaMemcpyDeviceToHost, stream));
    }
    else
    {
        // The output buffers are left untouched in this case.
        std::cerr << "doInference: IExecutionContext::enqueue failed" << std::endl;
    }

    // The copies above are asynchronous; wait for the stream before the host
    // reads the outputs or the device buffers are freed.
    CHECK(cudaStreamSynchronize(stream));

    // Release the stream and the buffers.
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputLocIndex]));
    CHECK(cudaFree(buffers[outputConfIndex]));
}
这其中:12168与3042的计算方式如下:
图像的输入为416*416*3,在yolov3-tiny中,yolo有两次输出,一次经过了4次卷积池化,另一次经过5次卷积池化后,宽高分别变为原来的1/16和1/32,因此经过4次池化的yolo层输出前的大小为:26*26*3,经过5次池化的yolo层输出前的大小为13*13*3,如果你识别的目标有一类,那么该类目标包含位置信息(x,y,w,h)以及class和置信度信息,所以:
经过4次池化的yolo层输出前的单位大小为:26*26*3*(4+1+1)=12168
经过5次池化的yolo层输出前的单位大小为:13*13*3*(4+1+1)=3042
分别占据的内存大小为:12168*sizeof(float)以及3042*sizeof(float),这部分如果理解不了可以查看下yolov3网络的架构;
void DoNms(vector& detections, float nmsThresh)
{
auto iouCompute = [](float * lbox, float* rbox)
{
float interBox[] =
{
max(lbox[0], rbox[0]), //left
min(lbox[0] + lbox[2], rbox[0] + rbox[2]), //right
max(lbox[1], rbox[1]), //top
min(lbox[1] + lbox[3], rbox[1] + rbox[3]), //bottom
};
if (interBox[2] >= interBox[3] || interBox[0] >= interBox[1])
return 0.0f;
float interBoxS = (interBox[1] - interBox[0] + 1) * (interBox[3] - interBox[2] + 1);
return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
};
sort(detections.begin(), detections.end(), [=](const DetectionRes & left, const DetectionRes & right)
{
return left.prob > right.prob;
});
vector result;
for (unsigned int m = 0; m < detections.size(); ++m)
{
result.push_back(detections[m]);
for (unsigned int n = m + 1; n < detections.size(); ++n)
{
if (iouCompute((float *)(&detections[m]), (float *)(&detections[n])) > nmsThresh)
{
detections.erase(detections.begin() + n);
--n;
}
}
}
detections = move(result);
}
vector postProcess(cv::Mat& image, float * output)
{
vector detections;
int total_size = 0;
for (int i = 0; i < output_shape.size(); i++)
{
auto shape = output_shape[i];
int size = 1;
for (int j = 0; j < shape.size(); j++)
{
size *= shape[j];
}
total_size += size;
}
int offset = 0;
float * transposed_output = new float[total_size];
float * transposed_output_t = transposed_output;
for (int i = 0; i < output_shape.size(); i++)
{
auto shape = output_shape[i]; // nchw
int chw = shape[1] * shape[2] * shape[3];
int hw = shape[2] * shape[3];
for (int n = 0; n < shape[0]; n++)
{
int offset_n = offset + n * chw;
for (int h = 0; h < shape[2]; h++)
{
for (int w = 0; w < shape[3]; w++)
{
int h_w = h * shape[3] + w;
for (int c = 0; c < shape[1]; c++)
{
int offset_c = offset_n + hw * c + h_w;
*transposed_output_t++ = output[offset_c];
}
}
}
}
offset += shape[0] * chw;
}
vector > shapes;
for (int i = 0; i < output_shape.size(); i++)
{
auto shape = output_shape[i];
vector tmp = { shape[2], shape[3], 3, 6 };
shapes.push_back(tmp);
}
offset = 0;
for (int i = 0; i < output_shape.size(); i++)
{
auto masks = g_masks[i];
vector > anchors;
for (auto mask : masks)
anchors.push_back(g_anchors[mask]);
auto shape = shapes[i];
for (int h = 0; h < shape[0]; h++)
{
int offset_h = offset + h * shape[1] * shape[2] * shape[3];
for (int w = 0; w < shape[1]; w++)
{
int offset_w = offset_h + w * shape[2] * shape[3];
for (int c = 0; c < shape[2]; c++)
{
int offset_c = offset_w + c * shape[3];
float * ptr = transposed_output + offset_c;
ptr[4] = sigmoid(ptr[4]);
ptr[5] = sigmoid(ptr[5]);
float score = ptr[4] * ptr[5];
if (score < obj_threshold)
continue;
ptr[0] = sigmoid(ptr[0]);
ptr[1] = sigmoid(ptr[1]);
ptr[2] = exponential(ptr[2]) * anchors[c][0];
ptr[3] = exponential(ptr[3]) * anchors[c][1];
ptr[0] += w;
ptr[1] += h;
ptr[0] /= shape[0];
ptr[1] /= shape[1];
ptr[2] /= DETECT_WIDTH;
ptr[3] /= DETECT_WIDTH;
ptr[0] -= ptr[2] / 2;
ptr[1] -= ptr[3] / 2;
DetectionRes det;;
det.x = ptr[0];
det.y = ptr[1];
det.w = ptr[2];
det.h = ptr[3];
det.prob = score;
detections.push_back(det);
}
}
}
offset += shape[0] * shape[1] * shape[2] * shape[3];
}
delete[]transposed_output;
int h = DETECT_WIDTH; //net h
int w = DETECT_WIDTH; //net w
//scale bbox to img
int width = image.cols;
int height = image.rows;
float scale = min(float(w) / width, float(h) / height);
float scaleSize[] = { width * scale, height * scale };
//correct box
for (auto& bbox : detections)
{
bbox.x = (bbox.x * w - (w - scaleSize[0]) / 2.f) / scale;
bbox.y = (bbox.y * h - (h - scaleSize[1]) / 2.f) / scale;
bbox.w *= w;
bbox.h *= h;
bbox.w /= scale;
bbox.h /= scale;
}
//nms
float nmsThresh = nms_threshold;
if (nmsThresh > 0)
DoNms(detections, nmsThresh);
return detections;
}