TensorRT网络推理c++实现

前言:本文给出了yolov3-tiny网络TensorRT推理步骤实现的c++代码,手动计算显存与内存的大小,计算yolo层输出框的位置。

demo函数:

/*--------------------------------输入预处理---------------------------------*/
	std::string imgPath = "imagePath";
	image = cv::imread(imgPath);
	float scale = min(float(DETECT_WIDTH) / image.cols, float(DETECT_HEIGHT) / image.rows);
	Size scaleSize = cv::Size(image.cols * scale, image.rows * scale);
	cv::Mat rgb;
	cv::cvtColor(image, rgb, COLOR_BGR2RGB);
	cv::Mat resized;
	cv::resize(rgb, resized, scaleSize, 0, 0, INTER_NEAREST);
	cv::Mat cropped(DETECT_HEIGHT, DETECT_WIDTH, CV_8UC3, 127);
	Rect rect((DETECT_WIDTH - scaleSize.width) / 2, (DETECT_HEIGHT - scaleSize.height) / 2, scaleSize.width, scaleSize.height);
	resized.copyTo(cropped(rect));
	cv::Mat img_float;
	cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0);
	//HWC TO CHW
	vector input_channels(3);
	cv::split(img_float, input_channels);
	float* net_input = new float[DETECT_HEIGHT * DETECT_WIDTH * 3];
	memset(net_input, 0, DETECT_HEIGHT * DETECT_WIDTH * 3 * sizeof(float));
	int channelLength = DETECT_HEIGHT * DETECT_WIDTH;
	for (int i = 0; i < 3; ++i)
	{
		memcpy(net_input, input_channels[i].data, channelLength * sizeof(float));
		net_input += channelLength;
	}
    float* out[2];
	out[0] = new float[3042];
	out[1] = new float[12168];
	float* out_merge = new float[3042 + 12168];
	 run inference and cout time
	doInference(*context, net_input, out, 1);
	out_merge = merge(out[0], out[1], 3042, 12168);
	auto boxes = postProcess(image, out_merge);
	delete[] net_input;
	for (int i = 0; i < boxes.size(); i++)
	{
		std::cout << "boxes_" << i << ".x: " << boxes[i].x << endl;
		std::cout << "boxes_" << i << ".y: " << boxes[i].y << endl;
		std::cout << "boxes_" << i << ".w: " << boxes[i].w << endl;
		std::cout << "boxes_" << i << ".h: " << boxes[i].h << endl;
		std::cout << "boxes_" << i << ".prob: " << boxes[i].prob << endl;
	}
	if (boxes.size() > 0)
	{
		Rect rect_ptg = Rect(boxes[0].x, boxes[0].y, boxes[0].w, boxes[0].h);
		rectangle(image, rect_ptg, (0, 0, 255), 2, 8, 0);
		resize(image, image, Size(image.cols / 2, image.rows / 2));
		imshow("Result", image);
		waitKey();
	}

数据处理函数:

/**
 * Concatenates two float arrays into one freshly allocated array.
 *
 * @param out1        first source array, copied to the front of the result
 * @param out2        second source array, copied directly after out1's elements
 * @param bsize_out1  number of elements to take from out1
 * @param bsize_out2  number of elements to take from out2
 * @return pointer to a new[]-allocated array of bsize_out1 + bsize_out2 floats;
 *         the caller owns it and must release it with delete[]
 */
float* merge(float* out1, float* out2, int bsize_out1, int bsize_out2)
{
	float* combined = new float[bsize_out1 + bsize_out2];

	// Front part: elements of out1 at their own indices.
	int pos = 0;
	while (pos < bsize_out1)
	{
		combined[pos] = out1[pos];
		++pos;
	}

	// Back part: elements of out2 shifted by the length of out1.
	for (int k = 0; k < bsize_out2; ++k)
	{
		combined[bsize_out1 + k] = out2[k];
	}

	return combined;
}

基于x86架构的网络推理实现:

/**
 * Runs one inference pass and copies both network outputs back to the host.
 *
 * Device buffers and a CUDA stream are created on entry and released before
 * returning, so the function blocks until the results are in host memory.
 *
 * @param context    execution context whose engine must expose exactly 3 bindings
 * @param input      host buffer of batchSize * INPUT_CHANNEL * DETECT_HEIGHT * DETECT_WIDTH floats
 * @param output     output[0] must have room for batchSize * 12168 floats (binding 1),
 *                   output[1] must have room for batchSize * 3042 floats (binding 2)
 * @param batchSize  number of images in the batch
 */
void doInference(IExecutionContext& context, float* input, float** output, int batchSize)
{
	const ICudaEngine& engine = context.getEngine();
	assert(engine.getNbBindings() == 3);
	void* buffers[3];
	const int inputIndex = 0;
	const int outputLocIndex = 1;
	const int outputConfIndex = 2;

	// Byte sizes are computed once so allocation and copy cannot drift apart.
	const size_t inputBytes = size_t(batchSize) * INPUT_CHANNEL * DETECT_HEIGHT * DETECT_WIDTH * sizeof(float);
	const size_t locBytes = size_t(batchSize) * 12168 * sizeof(float);
	const size_t confBytes = size_t(batchSize) * 3042 * sizeof(float);

	// Allocate device memory for the input and the two outputs.
	CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
	CHECK(cudaMalloc(&buffers[outputLocIndex], locBytes));
	CHECK(cudaMalloc(&buffers[outputConfIndex], confBytes));

	// A dedicated stream orders the upload, the inference and the downloads.
	cudaStream_t stream;
	CHECK(cudaStreamCreate(&stream));

	// Host -> device: upload the input tensor.
	CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
	if (context.enqueue(batchSize, buffers, stream, nullptr))
	{
		// Device -> host: download both outputs.
		CHECK(cudaMemcpyAsync(output[0], buffers[outputLocIndex], locBytes, cudaMemcpyDeviceToHost, stream));
		CHECK(cudaMemcpyAsync(output[1], buffers[outputConfIndex], confBytes, cudaMemcpyDeviceToHost, stream));
	}
	else
	{
		// The outputs are left untouched in this case.
		std::cerr << "doInference: IExecutionContext::enqueue failed" << std::endl;
	}

	// The copies above are asynchronous; wait for the stream to drain before the
	// host reads the outputs and before the device buffers are freed.
	CHECK(cudaStreamSynchronize(stream));

	// Release the stream and the device buffers.
	CHECK(cudaStreamDestroy(stream));
	CHECK(cudaFree(buffers[inputIndex]));
	CHECK(cudaFree(buffers[outputLocIndex]));
	CHECK(cudaFree(buffers[outputConfIndex]));
}

这其中:12168与3042的计算方式如下:

图像的输入为416*416*3,在yolov3-tiny中,yolo有两次输出,一次经过了4次卷积池化,另一次经过5次卷积池化后,宽高分别变为原来的1/16和1/32,因此经过4次池化的yolo层输出前的大小为:26*26*3,经过5次池化的yolo层输出前的大小为13*13*3,如果你识别的目标有一类,那么该类目标包含位置信息(x,y,w,h)以及class和置信度信息,所以:

经过4次池化的yolo层输出前的单位大小为:26*26*3*(4+1+1)=12168

经过5次池化的yolo层输出前的单位大小为:13*13*3*(4+1+1)=3042

分别占据的内存大小为:12168*sizeof(float)以及3042*sizeof(float),这部分如果理解不了可以查看下yolov3网络的架构;

后处理:yolo层输出处理

void DoNms(vector& detections, float nmsThresh)
{
	auto iouCompute = [](float * lbox, float* rbox) 
	{
		float interBox[] = 
		{
			max(lbox[0], rbox[0]), //left
			min(lbox[0] + lbox[2], rbox[0] + rbox[2]), //right
			max(lbox[1], rbox[1]), //top
			min(lbox[1] + lbox[3], rbox[1] + rbox[3]), //bottom
		};
		if (interBox[2] >= interBox[3] || interBox[0] >= interBox[1])
			return 0.0f;
		float interBoxS = (interBox[1] - interBox[0] + 1) * (interBox[3] - interBox[2] + 1);
		return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
	};
	sort(detections.begin(), detections.end(), [=](const DetectionRes & left, const DetectionRes & right) 
	{
		return left.prob > right.prob;
	});
	vector result;
	for (unsigned int m = 0; m < detections.size(); ++m) 
	{
		result.push_back(detections[m]);
		for (unsigned int n = m + 1; n < detections.size(); ++n) 
		{
			if (iouCompute((float *)(&detections[m]), (float *)(&detections[n])) > nmsThresh)
			{
				detections.erase(detections.begin() + n);
				--n;
			}
		}
	}
	detections = move(result);
}

vector postProcess(cv::Mat& image, float * output)
{
	vector detections;
	int total_size = 0;
	for (int i = 0; i < output_shape.size(); i++)
	{
		auto shape = output_shape[i];
		int size = 1;
		for (int j = 0; j < shape.size(); j++) 
		{
			size *= shape[j];
		}
		total_size += size;
	}
	int offset = 0;
	float * transposed_output = new float[total_size];
	float * transposed_output_t = transposed_output;
	for (int i = 0; i < output_shape.size(); i++) 
	{
		auto shape = output_shape[i];  // nchw
		int chw = shape[1] * shape[2] * shape[3];
		int hw = shape[2] * shape[3];
		for (int n = 0; n < shape[0]; n++) 
		{
			int offset_n = offset + n * chw;
			for (int h = 0; h < shape[2]; h++) 
			{
				for (int w = 0; w < shape[3]; w++) 
				{
					int h_w = h * shape[3] + w;
					for (int c = 0; c < shape[1]; c++) 
					{
						int offset_c = offset_n + hw * c + h_w;
						*transposed_output_t++ = output[offset_c];
					}
				}
			}
		}
		offset += shape[0] * chw;
	}
	vector > shapes;
	for (int i = 0; i < output_shape.size(); i++) 
	{
		auto shape = output_shape[i];
		vector tmp = { shape[2], shape[3], 3, 6 };
		shapes.push_back(tmp);
	}

	offset = 0;
	for (int i = 0; i < output_shape.size(); i++) 
	{
		auto masks = g_masks[i];
		vector > anchors;
		for (auto mask : masks)
			anchors.push_back(g_anchors[mask]);
		auto shape = shapes[i];
		for (int h = 0; h < shape[0]; h++) 
		{
			int offset_h = offset + h * shape[1] * shape[2] * shape[3];
			for (int w = 0; w < shape[1]; w++) 
			{
				int offset_w = offset_h + w * shape[2] * shape[3];
				for (int c = 0; c < shape[2]; c++) 
				{
					int offset_c = offset_w + c * shape[3];
					float * ptr = transposed_output + offset_c;
					ptr[4] = sigmoid(ptr[4]);
					ptr[5] = sigmoid(ptr[5]);
					float score = ptr[4] * ptr[5];
					if (score < obj_threshold)
						continue;
					ptr[0] = sigmoid(ptr[0]);
					ptr[1] = sigmoid(ptr[1]);
					ptr[2] = exponential(ptr[2]) * anchors[c][0];
					ptr[3] = exponential(ptr[3]) * anchors[c][1];
					ptr[0] += w;
					ptr[1] += h;
					ptr[0] /= shape[0];
					ptr[1] /= shape[1];
					ptr[2] /= DETECT_WIDTH;
					ptr[3] /= DETECT_WIDTH;
					ptr[0] -= ptr[2] / 2;
					ptr[1] -= ptr[3] / 2;
					DetectionRes det;;
					det.x = ptr[0];
					det.y = ptr[1];
					det.w = ptr[2];
					det.h = ptr[3];
					det.prob = score;
					detections.push_back(det);
				}
			}
		}
		offset += shape[0] * shape[1] * shape[2] * shape[3];
	}
	delete[]transposed_output;
	int h = DETECT_WIDTH;   //net h
	int w = DETECT_WIDTH;   //net w
							//scale bbox to img
	int width = image.cols;
	int height = image.rows;
	float scale = min(float(w) / width, float(h) / height);
	float scaleSize[] = { width * scale, height * scale };
	//correct box
	for (auto& bbox : detections) 
	{
		bbox.x = (bbox.x * w - (w - scaleSize[0]) / 2.f) / scale;
		bbox.y = (bbox.y * h - (h - scaleSize[1]) / 2.f) / scale;
		bbox.w *= w;
		bbox.h *= h;
		bbox.w /= scale;
		bbox.h /= scale;
	}
	//nms
	float nmsThresh = nms_threshold;
	if (nmsThresh > 0)
		DoNms(detections, nmsThresh);
	return detections;
}

你可能感兴趣的:(#,TensorRT,yolov3,c++,tensorRT,后处理)