The previous post covered how to register a custom layer with ONNX so that the corresponding layer can be found during ONNX parsing.
Before getting to the demo, here is a problem I ran into with Upsample export, together with the workaround:
import torch.onnx.symbolic


# Override Upsample's ONNX export from old opset if required (not needed for TRT 5.1+)
@torch.onnx.symbolic.parse_args('v', 'is')
def upsample_nearest2d(g, input, output_size):
    height_scale = float(output_size[-2]) / input.type().sizes()[-2]
    width_scale = float(output_size[-1]) / input.type().sizes()[-1]
    return g.op("Upsample", input,
                scales_f=(1, 1, height_scale, width_scale),
                mode_s="nearest")


@torch.onnx.symbolic.parse_args('v', 'is', 'i')
def upsample_bilinear2d(g, input, output_size, align_corners):
    height_scale = float(output_size[-2]) / input.type().sizes()[-2]
    width_scale = float(output_size[-1]) / input.type().sizes()[-1]
    return g.op("Upsample", input,
                scales_f=(1, 1, height_scale, width_scale),
                mode_s="linear")


torch.onnx.symbolic.upsample_bilinear2d = upsample_bilinear2d
torch.onnx.symbolic.upsample_nearest2d = upsample_nearest2d
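To check that the patched symbolics are actually being picked up, exporting a toy module that contains an upsample is enough. The sketch below is only illustrative (UpsampleToy and the output file name are made up here), and it assumes an older PyTorch where torch.onnx.symbolic is still the registry consulted during export:

import torch
import torch.nn as nn

class UpsampleToy(nn.Module):
    # toy module whose forward contains a 2x nearest-neighbour upsample
    def forward(self, x):
        return nn.functional.interpolate(x, scale_factor=2, mode='nearest')

# verbose=True prints the traced graph; after the patch the Upsample node
# should carry scales (1, 1, 2, 2) and mode "nearest"
torch.onnx.export(UpsampleToy(), torch.rand(1, 2, 8, 8), 'upsample_toy.onnx', verbose=True)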
Some small bugs I ran into in later experiments are recorded below:
import torch
import torch.nn as nn


class gnSOLE(nn.Module):
    def __init__(self):
        super().__init__()
        out_channels = 2
        self.lr = nn.LeakyReLU()
        self.mp = nn.MaxPool2d((2, 2), stride=(2, 2))
        self.gn = nn.GroupNorm(num_groups=2, num_channels=out_channels, eps=1e-5)
        self.bn = nn.BatchNorm2d(out_channels)
        for m in self.modules():
            if isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x1 = self.mp(x)
        x1 = self.gn(self.lr(x1))
        # upsample x1 back to the spatial size of x, otherwise the residual add
        # below fails on mismatched shapes (stands in for the upsample_to helper)
        x1 = nn.functional.interpolate(x1, size=x.shape[2:], mode='nearest')
        p4 = x1 + x
        return p4


def GN_ONNX():
    model = gnSOLE()
    model.to(torch.device('cuda'))
    zero_input = torch.rand(1, 2, 8, 8).cuda()
    # model(zero_input)
    torch.onnx.export(model, zero_input, 'gn.onnx', verbose=True)
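Once GN_ONNX() has written gn.onnx, it is worth loading the file back and printing the graph before handing it to the TensorRT parser used in the C++ code below. A minimal sketch, assuming the onnx Python package is installed (the file name comes from the export call above):

import onnx

GN_ONNX()
proto = onnx.load('gn.onnx')
# printable_graph lists the exported nodes (MaxPool, LeakyRelu, the GroupNorm
# form, the Upsample, and the final Add), which is exactly what the parser sees
print(onnx.helper.printable_graph(proto.graph))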
// nvcc -o gn test_onnx.cpp ../cuda/groupnorm.cu /usr/src/tensorrt/samples/common/logger.cpp
//     -I/home/user/package/cub-1.8.0 -I/usr/src/tensorrt/samples/common/ -I./../cuda/ -L/usr/local/cuda/lib64
//     -lcudart -lcuda -L/usr/local/lib/ -lnvonnxparser -L/usr/lib/x86_64-linux-gnu/ -lnvinfer
//     -lnvparsers -lnvinfer_plugin
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cuda_runtime_api.h>
#include <iomanip>
#include <iostream>
#include <string>
#include "NvInfer.h"
#include <NvOnnxParser.h>
#include "logger.h"
//#include "common.h"
#include "GN.h"
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cout << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)
using namespace nvinfer1;
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
const std::string gSampleName = "TensorRT.sample_onnx_mnist";
bool onnxToTRTModel(const std::string &modelFile, // name of the onnx model
                    unsigned int maxBatchSize,    // batch size - NB must be at least as large as the batch we want to run with
                    IHostMemory *&trtModelStream) // output buffer for the TensorRT model
{
    // create the builder
    IBuilder *builder = createInferBuilder(gLogger.getTRTLogger());
    assert(builder != nullptr);
    nvinfer1::INetworkDefinition *network = builder->createNetwork();
    auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());

    // Optional - uncomment below lines to view network layer information
    //config->setPrintLayerInfo(true);
    //parser->reportParsingInfo();

    std::cout << modelFile << std::endl;
    if (!parser->parseFromFile(modelFile.c_str(),
                               static_cast<int>(gLogger.getReportableSeverity()))) {
        gLogError << "Failure while parsing ONNX file" << std::endl;
        return false;
    }

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);
    // builder->setFp16Mode(gArgs.runInFp16);
    // builder->setInt8Mode(gArgs.runInInt8);
    //
    // if (gArgs.runInInt8) {
    //     samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
    // }
    //
    // samplesCommon::enableDLA(builder, gArgs.useDLACore);

    ICudaEngine *engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we can destroy the parser
    parser->destroy();

    // serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    engine->destroy();
    network->destroy();
    builder->destroy();
    return true;
}
void doInference(IExecutionContext &context, float *input, float *output, int batchSize) {
    const ICudaEngine &engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void *buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    int inputIndex{}, outputIndex{};
    for (int b = 0; b < engine.getNbBindings(); ++b) {
        if (engine.bindingIsInput(b))
            inputIndex = b;
        else
            outputIndex = b;
    }

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float),
                          cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost,
                          stream));
    cudaStreamSynchronize(stream);

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
//!
//! \brief This function prints the help information for running this sample
//!
void printHelpInfo() {
    std::cout
        << "Usage: ./sample_onnx_mnist [-h or --help] [-d or --datadir=<path to data directory>] [--useDLACore=<int>]\n";
    std::cout << "--help Display help information\n";
    std::cout
        << "--datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use (data/samples/mnist/, data/mnist/)"
        << std::endl;
    std::cout
        << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform."
        << std::endl;
    std::cout << "--int8 Run in Int8 mode.\n";
    std::cout << "--fp16 Run in FP16 mode." << std::endl;
}
int main(int argc, char **argv) {
    auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast<const char **>(argv));
    gLogger.reportTestStart(sampleTest);

    // create a TensorRT model from the onnx model and serialize it to a stream
    IHostMemory *trtModelStream{nullptr};
    if (!onnxToTRTModel("/home/user/weight/gn.onnx", 1, trtModelStream)) {
        std::cerr << "cannot read onnx!" << std::endl;
        return 1;
    }
    assert(trtModelStream != nullptr);

    uint8_t fileData[INPUT_H * INPUT_W];
    // print an ascii representation
    gLogInfo << "Input:\n";
    // float data[INPUT_H * INPUT_W];
    // for (int i = 0; i < INPUT_H * INPUT_W; i++)
    // data[i] = 1.0 - float(fileData[i] / 255.0);
    //
    // // deserialize the engine
    // IRuntime *runtime = createInferRuntime(gLogger);
    // assert(runtime != nullptr);
    // if (gArgs.useDLACore >= 0) {
    // runtime->setDLACore(gArgs.useDLACore);
    // }
    //
    // ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
    // assert(engine != nullptr);
    // trtModelStream->destroy();
    // IExecutionContext *context = engine->createExecutionContext();
    // assert(context != nullptr);
    // // run inference
    // float prob[OUTPUT_SIZE];
    // doInference(*context, data, prob, 1);
    //
    // // destroy the engine
    // context->destroy();
    // engine->destroy();
    // runtime->destroy();

    //*********************************
    // float val{0.0f};
    // int idx{0};
    //
    // //Calculate Softmax
    // float sum{0.0f};
    // for (int i = 0; i < OUTPUT_SIZE; i++)
    // {
    // prob[i] = exp(prob[i]);
    // sum += prob[i];
    // }
    //
    // gLogInfo << "Output:\n";
    // for (int i = 0; i < OUTPUT_SIZE; i++)
    // {
    // prob[i] /= sum;
    // val = std::max(val, prob[i]);
    // if (val == prob[i])
    // idx = i;
    //
    // gLogInfo << " Prob " << i << " " << std::fixed << std::setw(5) << std::setprecision(4) << prob[i] << " "
    // << "Class " << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
    // }
    // gLogInfo << std::endl;
    //
    // bool pass{idx == num && val > 0.9f};
    //
    // return gLogger.reportTest(sampleTest, pass);
}