trtexec --onnx=face_generator.onnx --saveEngine=face_generator.trt --workspace=6000
Here --onnx is the input ONNX model, --saveEngine is the serialized engine file to write out, and --workspace caps the temporary GPU memory (in MiB) that TensorRT may allocate while building and selecting kernels for each layer.
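For reference, the same conversion can also be done programmatically with the TensorRT C++ builder API. The sketch below is only an illustration under the assumption of a TensorRT 7.x/8.x style API (error checking and cleanup omitted); the function name build_engine_from_onnx is made up here, and the file names are simply the ones from the trtexec command above.

#include <fstream>
#include "NvInfer.h"
#include "NvOnnxParser.h"

// Rough equivalent of the trtexec command: parse the ONNX file, cap the
// builder workspace, build the engine, and serialize it to disk.
void build_engine_from_onnx(nvinfer1::ILogger& logger)
{
    auto builder = nvinfer1::createInferBuilder(logger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(
        nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = builder->createNetworkV2(explicitBatch);
    auto parser = nvonnxparser::createParser(*network, logger);
    parser->parseFromFile("face_generator.onnx",
        static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

    auto config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(6000ULL << 20);  // same meaning as --workspace=6000 (MiB)

    auto engine = builder->buildEngineWithConfig(*network, *config);
    auto serialized = engine->serialize();       // IHostMemory* holding the engine bytes
    std::ofstream out("face_generator.trt", std::ios::binary);
    out.write(reinterpret_cast<const char*>(serialized->data()), serialized->size());
}

The complete inference code is listed below: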
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <cassert>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)
#define DEVICE 0 // GPU id
using namespace nvinfer1;
// Names of the input and output tensors; they can be specified when exporting the ONNX model
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
// A Logger instance is required in order to use TensorRT
static Logger gLogger;
// Generate a 625x100 block of Gaussian noise, packed into a flat float array
// (the layout TensorRT expects for its input buffer)
float* generate_input_blob()
{
    float* blob = new float[625 * 100];
    cv::RNG rng;
    for (int i = 0; i < 625; i++)
    {
        cv::Mat noise(1, 100, CV_32FC1);
        rng.fill(noise, cv::RNG::NORMAL, 0, 1);
        memcpy(&blob[i * 100], noise.data, 100 * sizeof(float));
    }
    return blob;
}
// Helper: tile the images in the vector into one large grid image, like the one shown above
void make_grid(const std::vector<cv::Mat>& images, cv::Mat& out, int nrow = 25, int gap = 2)
{
    int ncol = std::ceil(images.size() * 1.0 / nrow);
    int width = images[0].cols;
    int height = images[0].rows;
    int vertical_pix = nrow * height + (nrow - 1) * gap;
    int horizon_pix = ncol * width + (ncol - 1) * gap;
    out.create(vertical_pix, horizon_pix, CV_8UC3);
    out.setTo(cv::Scalar(255, 255, 255));
    for (int i = 0; i < nrow; i++)
    {
        for (int j = 0; j < ncol; j++)
        {
            int x = width * j + j * gap;
            int y = height * i + i * gap;
            cv::Rect roi(x, y, width, height);
            images[i * ncol + j].copyTo(out(roi)); // row-major index into the image list
        }
    }
}
// The TensorRT output is also a flat float array, so it has to be decoded back into images.
// The generator produces values in [-1.0, 1.0], which are rescaled to [0, 255].
void decode_output(const float* output, cv::Mat& result)
{
    std::vector<cv::Mat> images;
    for (int i = 0; i < 625; i++)
    {
        cv::Mat image(64, 68, CV_32FC3);
        // Convert from the CHW layout of the output buffer to OpenCV's HWC layout
        for (int c = 0; c < 3; c++)
            for (int h = 0; h < 64; h++)
                for (int w = 0; w < 68; w++)
                    image.ptr<float>(h)[image.channels() * w + c] =
                        output[image.channels() * 64 * 68 * i + c * 64 * 68 + 68 * h + w];
        cv::add(image, 1.0, image);          // [-1, 1] -> [0, 2]
        cv::divide(image, 2.0, image);       // [0, 2]  -> [0, 1]
        image.convertTo(image, CV_8U, 255);  // [0, 1]  -> [0, 255]
        cv::cvtColor(image, image, cv::COLOR_RGB2BGR);
        images.push_back(image);
    }
    make_grid(images, result);
}
// The main inference routine, commented in detail
void doInference(IExecutionContext& context, float* input, float* output, const int output_size, const int input_size) {
    const ICudaEngine& engine = context.getEngine(); // get the engine from the execution context
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    // Check that the number of bindings is 2: this model has exactly one input and one output.
    // Note that a model can have more than one input.
    assert(engine.getNbBindings() == 2);
    void* buffers[2]; // array holding the device pointers for input and output
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    // Look up the input's binding index by its name; it indexes into the buffers array above.
    // Do not simply assume buffers[0] is the input and buffers[1] is the output.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    // Check that the input data type is float
    assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
    // Same as above, but for the output binding
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
    // Create GPU buffers on device
    // Allocate device memory for both input and output; note the sizes are in bytes
    CHECK(cudaMalloc(&buffers[inputIndex], input_size * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], output_size * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    // Copy the input from host to device: destination (device pointer), source (host pointer),
    // byte count, copy direction, and the stream to run the copy on
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, input_size * sizeof(float), cudaMemcpyHostToDevice, stream));
    // The actual inference: TensorRT reads from buffers[inputIndex], runs the model,
    // and writes the result to buffers[outputIndex]
    context.enqueue(1, buffers, stream, nullptr);
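    // Note: for an engine built from ONNX with an explicit batch dimension, enqueueV2 is the
    // call recommended by newer TensorRT releases (assuming your TensorRT version provides it):
    //     context.enqueueV2(buffers, stream, nullptr);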
    // Copy the output from device memory back to host memory
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);
    // Load the serialized engine from disk
    char* trtModelStream{nullptr};
    size_t size{0};
    if (argc == 2) {
        // The path of the .trt engine file is passed on the command line
        const std::string engine_file_path {argv[1]};
        std::ifstream file(engine_file_path, std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
    } else {
        std::cerr << "usage: " << argv[0] << " <engine_file>" << std::endl;
        return -1;
    }
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    auto input_dims = engine->getBindingDimensions(0);
    std::cout << "input dim: ";
    int input_size = 1;
    for (int i = 0; i < input_dims.nbDims; i++) {
        std::cout << input_dims.d[i] << ",";
        input_size *= input_dims.d[i];
    }
    std::cout << std::endl;
    float* input = generate_input_blob();
    auto out_dims = engine->getBindingDimensions(1);
    auto output_size = 1;
    std::cout << "output dim: ";
    for (int j = 0; j < out_dims.nbDims; j++) {
        output_size *= out_dims.d[j];
        std::cout << out_dims.d[j] << ",";
    }
    std::cout << std::endl;
    float* output = new float[output_size];
    doInference(*context, input, output, output_size, input_size);
    cv::Mat result;
    decode_output(output, result);
    cv::imwrite("result.png", result);
    delete[] input;
    delete[] output;
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
The complete package, including the ONNX model and the CMakeLists file, will be posted once it has passed review.
Model and code