Deploying YOLOv5s with TensorRT on Windows

1. Prerequisites

GTX 1050

CUDA 11.4

cuDNN 8.9.1.23

TensorRT-8.6.1.6
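
The code in the next section assumes a static 1x3x640x640 yolov5s.onnx export and a classes.txt listing one class name per line (the 80 COCO names for the stock model). As a quick sanity check that the headers, the linked TensorRT library, and the CUDA runtime all match the versions above, a minimal standalone sketch could look like this (version queries only, not part of the deployment code):

#include "NvInfer.h"            // getInferLibVersion()
#include "NvInferVersion.h"     // NV_TENSORRT_* macros
#include <cuda_runtime_api.h>   // cudaRuntimeGetVersion()
#include <iostream>

int main()
{
    // Version actually linked at runtime, e.g. 8601 for TensorRT 8.6.1.
    std::cout << "TensorRT (linked): " << getInferLibVersion() << "\n";
    std::cout << "TensorRT (headers): " << NV_TENSORRT_MAJOR << "."
              << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << "\n";

    int cuda_runtime = 0;
    cudaRuntimeGetVersion(&cuda_runtime);  // e.g. 11040 for CUDA 11.4
    std::cout << "CUDA runtime: " << cuda_runtime << "\n";
    return 0;
}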

2. Code

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>

#include <algorithm>
#include <chrono>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using namespace cv;
using namespace nvinfer1;
using namespace nvonnxparser;

struct Detection
{
    int class_id;
    float confidence;
    cv::Rect box;
};

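// Minimal ILogger implementation: messages at or above the configured severity are printed to stderr.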
class MyLogger : public ILogger {
public:
    explicit MyLogger(Severity severity = Severity::kWARNING) : severity_(severity) {}
    void log(Severity severity, const char* msg) noexcept override 
    {
        if (severity <= severity_) 
        {
            std::cerr << msg << std::endl;
        }
    }
    Severity severity_;
};

const std::vector<cv::Scalar> colors = { cv::Scalar(255, 255, 0), cv::Scalar(0, 255, 0), cv::Scalar(0, 255, 255), cv::Scalar(255, 0, 0) };

std::vector<std::string> load_class_list(const std::string& class_path)
{
    std::vector<std::string> class_list;
    std::ifstream ifs(class_path);
    std::string line;
    while (getline(ifs, line))
    {
        class_list.push_back(line);
    }
    return class_list;
}

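// Letterbox preprocessing: scale the image to fit wid x hei while preserving the aspect
// ratio, pad the borders with gray (114, 114, 114), and convert BGR to RGB. The scale
// ratio and x/y offsets are returned so detections can be mapped back onto the original frame.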
Mat img_preprocess(Mat image, int wid, int hei, float& ratio, int& x_offset, int& y_offset) 
{
    Mat resize_image;
    ratio = std::min(wid / (image.cols * 1.0f), hei / (image.rows * 1.0f));
    int border_width = image.cols * ratio;
    int border_height = image.rows * ratio;
    x_offset = (wid - border_width) / 2;
    y_offset = (hei - border_height) / 2;
    resize(image, resize_image, Size(border_width, border_height));
    copyMakeBorder(resize_image, resize_image, y_offset, y_offset, x_offset, x_offset, 
        cv::BORDER_CONSTANT, Scalar(114, 114, 114));
    cvtColor(resize_image, resize_image, cv::COLOR_BGR2RGB);
    return resize_image;
}

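// Run a single frame through the engine: letterbox the image, fill the input blob,
// execute inference, decode the raw predictions, and apply NMS.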
void detect(cv::Mat& frame, ICudaEngine* engine, std::vector<Detection>& output,
    const std::vector<std::string>& className, float score_thr, float conf_thr, float nms_thr)
{
    float ratio;
    int x_offset, y_offset;
    Mat resize_image = img_preprocess(frame, 640, 640, ratio, x_offset, y_offset);

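    // Note: device buffers and the execution context are allocated for every frame here
    // for simplicity; a real pipeline would create them once and reuse them across frames.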
    void* buffers[2];
    Dims input_dim = engine->getBindingDimensions(0);  // 1,3,640,640
    int input_size = 1;
    for (int j = 0; j < input_dim.nbDims; ++j)
    {
        input_size *= input_dim.d[j];
    }
    size_t input_sz = input_size * sizeof(float);
    cudaMalloc(&buffers[0], input_sz);

    Dims output_dim = engine->getBindingDimensions(1);  // 1,25200,85
    int output_size = 1;
    for (int j = 0; j < output_dim.nbDims; ++j)
    {
        output_size *= output_dim.d[j];
    }
    size_t output_sz = output_size * sizeof(float);
    cudaMalloc(&buffers[1], output_sz);

    float* input_blob = new float[input_size];    // host
    float* output_blob = new float[output_size];  // host

    std::cout << "Get input blob!--------------------\n";
    const int channels = resize_image.channels();
    const int width = resize_image.cols;
    const int height = resize_image.rows;
    for (int c = 0; c < channels; c++) {
        for (int h = 0; h < height; h++) {
            for (int w = 0; w < width; w++) {
                input_blob[c * width * height + h * width + w] =
                    resize_image.at<cv::Vec3b>(h, w)[c] / 255.0f;
            }
        }
    }

    std::cout << "Performing Inference!--------------------\n";
    IExecutionContext* context = engine->createExecutionContext();
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMemcpyAsync(buffers[0], input_blob, input_sz, cudaMemcpyHostToDevice, stream);
    context->enqueueV2(buffers, stream, nullptr);
    cudaMemcpyAsync(output_blob, buffers[1], output_sz, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    delete context;
    cudaFree(buffers[0]);
    cudaFree(buffers[1]);

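    // Decode the raw output: 25200 candidates, each stored as
    // [cx, cy, w, h, objectness, class scores...]. Boxes are converted from center
    // format to top-left format and mapped back to original-image coordinates by
    // undoing the letterbox offset and scale.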
    std::vector<cv::Rect> boxes;
    std::vector<int> labels;
    std::vector<float> confs;
    float* ptr = output_blob;
    const int dim = className.size() + 5;
    for (int i = 0; i < 25200; ++i) {
        float objectness = ptr[4];
        if (objectness >= score_thr) {
            int label = std::max_element(ptr + 5, ptr + dim) - (ptr + 5);
            float confidence = ptr[5 + label] * objectness;
            if (confidence >= conf_thr) {
                float bx = ptr[0];
                float by = ptr[1];
                float bw = ptr[2];
                float bh = ptr[3];
                Rect box;
                box.x = (bx - bw * 0.5f - x_offset) / ratio;
                box.y = (by - bh * 0.5f - y_offset) / ratio;
                box.width = bw / ratio;
                box.height = bh / ratio;
                boxes.push_back(box);
                labels.push_back(label);
                confs.push_back(confidence);
            }
        }
        ptr += dim;
    }

    delete[] input_blob;
    delete[] output_blob;

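    // Class-agnostic non-maximum suppression over the remaining candidates.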
    std::vector<int> indices;
    cv::dnn::NMSBoxes(boxes, confs, conf_thr, nms_thr, indices);
    for (size_t i = 0; i < indices.size(); i++)
    {
        int idx = indices[i];
        Detection result;
        result.class_id = labels[idx];
        result.confidence = confs[idx];
        result.box = boxes[idx];
        output.push_back(result);
    }
}

int main() 
{
    // TensorRT-8.6.1.6 ---- 2023.6.2
    std::string onnx_model = "E:\\model\\yolov5s\\yolov5s.onnx";
    std::string source_path = "E:\\model\\yolov5s\\sample.mp4";
    std::string class_path = "E:\\model\\yolov5s\\classes.txt";
    float score_thr = 0.25;
    float conf_thr = 0.25;
    float nms_thr = 0.5;

    std::cout << "Load class names!--------------------" << std::endl;
    std::vector<std::string> class_list = load_class_list(class_path);

    std::cout << "Create a network!--------------------" << std::endl;
    MyLogger logger;
    IBuilder* builder = createInferBuilder(logger);
    uint32_t explicit_batch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicit_batch);

    std::cout << "Import model parser!--------------------" << std::endl;
    IParser* parser = createParser(*network, logger);
    parser->parseFromFile(onnx_model.c_str(), static_cast<int>(ILogger::Severity::kWARNING));
    for (int i = 0; i < parser->getNbErrors(); ++i)
    {
        std::cout << parser->getError(i)->desc() << std::endl;
    }

    std::cout << "Building an Engine!--------------------" << std::endl;
    IBuilderConfig* config = builder->createBuilderConfig();
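    // 1U << 26 bytes = 64 MiB of builder workspace; increase it if larger layers need more scratch memory.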
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1U << 26);
    IHostMemory* model = builder->buildSerializedNetwork(*network, *config);

    std::cout << "Deserializing a Plan!--------------------" << std::endl;
    IRuntime* runtime = createInferRuntime(logger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(model->data(), model->size());

    delete config;
    delete parser;
    delete network;
    delete builder;
    delete model;
    // Keep the runtime alive while the engine is in use; it is released at the end of main().

    cv::VideoCapture capture(source_path);
    if (!capture.isOpened()) 
    {
        std::cerr << "Error opening video file\n";
        return -1;
    }

    auto start = std::chrono::high_resolution_clock::now();
    int frame_count = 0;
    int total_frames = 0;
    float fps = -1;
    while (true) 
    {
        cv::Mat frame;
        capture.read(frame);
        if (frame.empty()) 
        {
            std::cout << "End of stream\n";
            break;
        }

        std::vector<Detection> output;
        detect(frame, engine, output, class_list, score_thr, conf_thr, nms_thr);

        frame_count++;
        total_frames++;
        size_t n_det = output.size();
        for (size_t i = 0; i < n_det; i++)
        {
            auto detection = output[i];
            auto box = detection.box;
            auto class_id = detection.class_id;
            auto conf = detection.confidence;
            const auto color = colors[class_id % colors.size()];
            cv::rectangle(frame, box, color, 3);
            cv::rectangle(frame, cv::Point(box.x, box.y - 20), cv::Point(box.x + box.width, box.y), color, cv::FILLED);
            cv::putText(frame, class_list[class_id].c_str(), cv::Point(box.x, box.y - 5), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
        }

        if (frame_count >= 30) 
        {
            auto end = std::chrono::high_resolution_clock::now();
            fps = frame_count * 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
            frame_count = 0;
            start = std::chrono::high_resolution_clock::now();
        }

        if (fps > 0) 
        {
            std::ostringstream fps_label;
            fps_label << std::fixed << std::setprecision(2);
            fps_label << "FPS: " << fps;
            std::string fps_label_str = fps_label.str();
            cv::putText(frame, fps_label_str.c_str(), cv::Point(10, 25), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 0, 255), 2);
        }

        cv::imshow("output", frame);

        if (cv::waitKey(1) != -1) 
        {
            capture.release();
            std::cout << "finished by user\n";
            break;
        }
    }

    std::cout << "Total frames: " << total_frames << "\n";

    return 0;
}
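
Building the engine from the ONNX file on every launch is slow. A common refinement, not shown in the listing above, is to write the plan returned by buildSerializedNetwork() to disk once and deserialize it on later runs; the sketch below illustrates the idea. The helper names save_engine/load_engine and the file path are made up for this example, and a cached plan is only valid for the GPU and TensorRT version it was built with.

#include "NvInfer.h"
#include <fstream>
#include <iterator>
#include <string>
#include <vector>

// Write the serialized plan produced by buildSerializedNetwork() to disk.
void save_engine(const nvinfer1::IHostMemory* model, const std::string& path)
{
    std::ofstream out(path, std::ios::binary);
    out.write(static_cast<const char*>(model->data()),
              static_cast<std::streamsize>(model->size()));
}

// Read a previously saved plan and deserialize it into an engine.
// Returns nullptr when no cached plan exists, so the caller can fall back to building.
nvinfer1::ICudaEngine* load_engine(nvinfer1::IRuntime* runtime, const std::string& path)
{
    std::ifstream in(path, std::ios::binary);
    if (!in) return nullptr;
    std::vector<char> plan((std::istreambuf_iterator<char>(in)),
                           std::istreambuf_iterator<char>());
    return runtime->deserializeCudaEngine(plan.data(), plan.size());
}

With this in place, main() would try load_engine() first and only run the builder/parser path (followed by save_engine()) when it returns nullptr.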

3. Attachments

Link: https://pan.baidu.com/s/150ZEyUhhJoGXkTXWoKiWYg?pwd=65g1
Extraction code: 65g1
