Mastering pytorch-to-onnx-tensorRT in one article: (3) Testing the custom tensorRT layer

The layer is written; how do we test it? You can jump straight to the code below.

Notes on the code:

  • The compile command is given in the comment at the top of the code. Can you use it as-is? Not quite; adjust the paths first:
    • First is the cub library, a packaged collection of CUDA primitives built on __shared__ memory. My copy lives under /home/user/package/cub-1.8.0.
    • common comes from the TensorRT samples; I copied it over unchanged. If the path is wrong, point the include flag at your own common directory.
    • common is not strictly required; you can comment it out with little impact.
  • There are two implementations: the large commented-out block in the code (marked "1. Create the layer via IPluginCreator") and the active block below it (marked "2.") are two different ways to create the layer: you can construct the plugin object directly, or look the creator up by name in the plugin registry (see the sketch after this list).
  • The purpose of this code is to test the group normalization layer. The input data was generated randomly, so the output can be compared against the result produced by PyTorch.
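
For path 1 (the registry lookup) to succeed, the plugin creator has to be registered with TensorRT's plugin registry before getPluginCreator() is called. A minimal sketch, assuming the creator class written in part (2) is named GNPluginCreator (substitute whatever name you actually used):

    // In the plugin source (e.g. GN.h / groupnorm.cu): register the creator once at load time.
    // Afterwards getPluginRegistry()->getPluginCreator("group_norm", "1") can find it.
    REGISTER_TENSORRT_PLUGIN(GNPluginCreator);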

Problems encountered

  • Problem: conversion for argument 1 from 'nvinfer1::ITensor*' to 'nvinfer1::ITensor* const*'. This happens because the input must be an array of pointers; see the official description of addPluginV2. When calling addPluginV2, the inputs have to be passed as an array or a std::vector of ITensor*.
  • Problem: baffling errors. In my case I hit vague CUDA errors that looked as if the output could not be allocated properly: 1) the CUDA memory operations on the output reported "invalid argument"; 2) a "misaligned memory" error appeared. The real cause was layer->getOutput(0)->setName(const char *): passing a std::string goes wrong; you must pass a const char*. Both pitfalls are illustrated in the sketch right after this list.
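
To make both pitfalls concrete, here is a minimal sketch reusing the names from the test code below (the const char* explanation follows the symptoms described above; treat it as a sketch, not a definitive diagnosis):

    // addPluginV2 expects an array of ITensor pointers (ITensor* const*), not a single ITensor*:
    ITensor *inputs[] = {iptdata};                 // wrap the input tensor(s) in an array
    auto gnlayer = network->addPluginV2(inputs, 1, gnPlugin);

    // setName() takes a const char*. Passing a std::string (or the c_str() of a temporary)
    // is what produced the confusing CUDA errors above; use a string literal or a pointer
    // that stays valid for the lifetime of the network:
    gnlayer->getOutput(0)->setName("prob");        // OUTPUT_BLOB_NAME in the full code below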
//nvcc -o gn test_onnx.cpp ../cuda/groupnorm.cu /usr/src/tensorrt/samples/common/logger.cpp
// -I/home/user/package/cub-1.8.0 -I/usr/src/tensorrt/samples/common/ -I./../cuda/ -L/usr/local/cuda/lib64
// -lcudart -lcuda -L/usr/local/lib/ -lnvonnxparser -L/usr/lib/x86_64-linux-gnu/
// -lnvinfer  -lnvparsers -lnvinfer_plugin
#include "GN.h"
#include "NvInfer.h"
#include "logger.h"
#include "common.h"
#include "cuda_runtime_api.h"
#include <iostream>
#include <cassert>
#include <vector>
#include <string>


#define GN_PLUGIN_NAME "group_norm"
#define GN_PLUGIN_VERSION "1"

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 2;
static const int INPUT_W = 2;
static const int INPUT_C = 8;
static const int OUTPUT_SIZE = INPUT_H * INPUT_W * INPUT_C;
//samplesCommon::Args gArgs;

const char *INPUT_BLOB_NAME = "data";
const char *OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

const std::string gSampleName = "TensorRT.TestLayer";


// Create the engine using only the API and not any parser.
ICudaEngine *createCustomEngine(unsigned int maxBatchSize, IBuilder *builder, DataType dt) {
    INetworkDefinition *network = builder->createNetwork();

    // Create input tensor of shape {INPUT_C, INPUT_H, INPUT_W} = {8, 2, 2} with name INPUT_BLOB_NAME
    ITensor *iptdata = network->addInput(INPUT_BLOB_NAME, dt, Dims3{INPUT_C, INPUT_H, INPUT_W});
    assert(iptdata);

    // Add the group normalization plugin
    std::cout << "Building accelerated plugins..." << std::endl;
    ITensor *xx[] = {iptdata};
    static const int nbWeights = 2, GROUP = 2;
    float EPS = 1e-5;
    float hostData[INPUT_C] = {0}, biasData[INPUT_C] = {0};
    for (int i = 0; i < INPUT_C; i++)hostData[i] = 1;
//----- 1. Create the layer via IPluginCreator (plugin registry lookup) -----//
//    std::vector<nvinfer1::PluginField> Attr;
//    Attr.emplace_back(nvinfer1::PluginField("count", &INPUT_C, nvinfer1::PluginFieldType::kINT32, 1));
//    Attr.emplace_back(nvinfer1::PluginField("num_groups", &GROUP, nvinfer1::PluginFieldType::kINT32, 1));
//    Attr.emplace_back(nvinfer1::PluginField("eps", &EPS, nvinfer1::PluginFieldType::kFLOAT32, 1));
//    Attr.emplace_back(nvinfer1::PluginField("w", hostData, nvinfer1::PluginFieldType::kFLOAT32, INPUT_C));
//    Attr.emplace_back(nvinfer1::PluginField("b", biasData, nvinfer1::PluginFieldType::kFLOAT32, INPUT_C));
//    nvinfer1::PluginFieldCollection mFC = {int(Attr.size()), Attr.data()};
//
//    auto creator = getPluginRegistry()->getPluginCreator(GN_PLUGIN_NAME, GN_PLUGIN_VERSION);
//    nvinfer1::IPluginV2 *gnPlugin = creator->createPlugin("", &mFC);
//    auto gnlayer = network->addPluginV2(xx, 1, *gnPlugin);
//    assert(gnlayer);
//----------------------------------------------------------------------------//

//----- 2. Create the layer by constructing the plugin object directly -----//
    Weights mKernelWeights = Weights{DataType::kFLOAT, hostData, INPUT_C},
            mBiasWeights = Weights{DataType::kFLOAT, biasData, INPUT_C};

    const nvinfer1::Weights weights[] = {mKernelWeights, mBiasWeights};
    auto gnPlugin = GNPlugin(weights, nbWeights, GROUP, EPS);
    auto gnlayer = network->addPluginV2(xx, 1, gnPlugin);
    assert(gnlayer);
//----------------------------------------------------------------------------//

//###################
//    IActivationLayer *layer = network->addActivation(*gnlayer->getOutput(0), ActivationType::kRELU);
//    assert(layer);
//###################

    auto nbOutputs = gnlayer->getNbOutputs();
    auto output = gnlayer->getOutput(0);
    std::cout << "nbOutputs: " << nbOutputs << "INPUT_C: " << INPUT_C;
    gnlayer->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*gnlayer->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);
    ICudaEngine *engine = builder->buildCudaEngine(*network);
    network->destroy();
    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream) {
    // Create builder
    IBuilder *builder = createInferBuilder(gLogger.getTRTLogger());
    assert(builder != nullptr);

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine *engine = createCustomEngine(maxBatchSize, builder, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
}

void doInference(IExecutionContext &context, float *input, float *output, int batchSize) {
    const ICudaEngine &engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void *buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float),
                          cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost,
                          stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

//!
//! \brief This function prints the help information for running this sample
//!
void printHelpInfo() {
    std::cout
            << "Usage: ./sample_mnist_api [-h or --help] [-d or --datadir=] [--useDLACore=]\n";
    std::cout << "--help          Display help information\n";
    std::cout
            << "--datadir       Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use (data/samples/mnist/, data/mnist/)"
            << std::endl;
    std::cout
            << "--useDLACore=N  Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform."
            << std::endl;
    std::cout << "--int8          Run in Int8 mode.\n";
    std::cout << "--fp16          Run in FP16 mode.\n";
}


int main(int argc, char **argv) {
    auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast<const char **>(argv));
    gLogger.reportTestStart(sampleTest);
    // create a model using the API directly and serialize it to a stream
    IHostMemory *modelStream{nullptr};

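    // 2 (batch) x 8 (C) x 2 (H) x 2 (W) = 64 input values, generated randomly once and
    // hard-coded so the exact same input can be fed to PyTorch for comparison.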
    float data[] = {6.5486, 5.9743, 2.1141, 9.3370, 1.3765, 1.5005, 0.8672, 1.6151, 2.5169,
                    4.6985, 4.7457, 3.7380, 1.8509, 0.4440, 2.9834, 7.2476, 6.2173, 8.6723,
                    2.8924, 7.9829, 5.6329, 7.9505, 7.6445, 4.0675, 6.2665, 6.3810, 3.4424,
                    1.3675, 5.4805, 0.8932, 9.6199, 5.3145, 0.4758, 7.7327, 6.0537, 0.4564,
                    1.0053, 4.2043, 0.6361, 0.9264, 6.7714, 1.7562, 6.7792, 5.6586, 5.1015,
                    1.4030, 9.1401, 9.5983, 9.4102, 3.1274, 6.6962, 1.6425, 1.1220, 9.6301,
                    8.8801, 4.8154, 1.0288, 6.0371, 3.6395, 6.8811, 8.1933, 9.6004, 3.5542,
                    5.4013};

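    // Round the runtime batch size (2) up to a multiple of 4 and use it as the engine's max batch size.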
    int batch = 2, maxbatch = (2 + 4 - 1) / 4 * 4;
    APIToModel(maxbatch, &modelStream);
    assert(modelStream != nullptr);
    IRuntime *runtime = createInferRuntime(gLogger.getTRTLogger());
    assert(runtime != nullptr);
    ICudaEngine *engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);
    assert(engine != nullptr);
    modelStream->destroy();
    IExecutionContext *context = engine->createExecutionContext();
    assert(context != nullptr);

    // Run inference
    float prob[2 * OUTPUT_SIZE];  // doInference copies back batch * OUTPUT_SIZE floats (batch = 2)
    doInference(*context, data, prob, batch);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the raw output values
    gLogInfo << "Output:\n";
    for (int i = 0; i < batch * OUTPUT_SIZE; i++) gLogInfo << prob[i] << ",";
    gLogInfo << std::endl;
}
