Exporting a swin-transformer model with OpenMMLab and running inference with onnxruntime and TensorRT

Exporting the ONNX file

Exporting a swin-transformer ONNX file through mmpretrain is straightforward. Note that opset_version=12 is required; this is the first pitfall: I initially used opset_version=11 and ran into problems later when converting to a TensorRT engine.

import torch
from mmpretrain import get_model, inference_model

model = get_model('swin-tiny_16xb64_in1k', pretrained='swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth', device='cpu')

# Dummy input for tracing; swin-tiny expects a 1x3x224x224 tensor
input = torch.zeros(1, 3, 224, 224)
out = model(input)
print(torch.argmax(out, dim=1))

# Export with opset 12 and name the I/O tensors so the C++ code below can refer to "input"/"output"
torch.onnx.export(model, input, "swin_transformer.onnx",
                  input_names=['input'], output_names=['output'], opset_version=12)
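
To sanity-check the export, the ONNX output can be compared with the PyTorch output on the same dummy input. This is a minimal sketch, assuming onnxruntime and numpy are available; model.eval() keeps the eager pass deterministic, matching the eval-mode graph that torch.onnx.export traces by default:

import numpy as np
import onnxruntime

model.eval()
with torch.no_grad():
    torch_out = model(input).numpy()

sess = onnxruntime.InferenceSession("swin_transformer.onnx", providers=['CPUExecutionProvider'])
onnx_out = sess.run(None, {sess.get_inputs()[0].name: input.numpy()})[0]

# The difference should be small (floating-point noise only)
print(np.abs(torch_out - onnx_out).max())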

If mmdeploy is installed, the conversion can also be done with the following script:

from mmdeploy.apis import torch2onnx
from mmdeploy.backend.sdk.export_info import export2SDK

img = 'demo.JPEG'
work_dir = './work_dir/onnx/swin_transformer'
save_file = './end2end.onnx'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_dynamic.py'
model_cfg = 'mmpretrain/configs/swin_transformer/swin-tiny_16xb64_in1k.py'
model_checkpoint = './checkpoints/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth'
device = 'cpu'

# 1. convert model to onnx
torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, model_checkpoint, device)

# 2. extract pipeline info for sdk use (dump-info)
export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, device=device)

Inference with onnxruntime

Python inference:

import cv2
import numpy as np
import onnxruntime

use_letterbox = True
input_shape = (224, 224)     

def letterbox(im, new_shape=(416, 416), color=(114, 114, 114)):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    
    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))    
    dw, dh = (new_shape[1] - new_unpad[0])/2, (new_shape[0] - new_unpad[1])/2  # wh padding 
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im


if __name__ == '__main__':
    img = cv2.imread('goldfish.jpg')
    if use_letterbox:
        or_img = letterbox(img, input_shape)
    else:
        or_img = cv2.resize(img, input_shape)
    img = or_img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB and HWC to CHW
    img = img.astype(dtype=np.float32)
    # Normalize with the ImageNet mean/std used by mmpretrain (RGB channel order)
    img[0, :] = (img[0, :] - 123.675) / 58.395
    img[1, :] = (img[1, :] - 116.28) / 57.12
    img[2, :] = (img[2, :] - 103.53) / 57.375
    img = np.expand_dims(img, axis=0)

    onnx_session = onnxruntime.InferenceSession("swin_transformer.onnx", providers=['CUDAExecutionProvider','CPUExecutionProvider'])

    input_name=[]
    for node in onnx_session.get_inputs():
        input_name.append(node.name)

    output_name=[]
    for node in onnx_session.get_outputs():
        output_name.append(node.name)

    input_feed={}
    for name in input_name:
        input_feed[name] = img

    pred = onnx_session.run(output_name, input_feed)[0]
    print(pred)
    print(np.argmax(pred))
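
The raw prediction here is the vector of class scores straight from the network head. If a confidence value is wanted as well (similar to what the mmdeploy Classifier below returns), a softmax can be applied. A small sketch, assuming the output is a (1, 1000) array of unnormalized logits:

def softmax(x, axis=-1):
    # numerically stable softmax
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)

probs = softmax(pred)
print(int(np.argmax(probs)), float(np.max(probs)))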

C++ inference:

#include <onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <iostream>


int main(int argc, char* argv[])
{
	Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "cls");
	Ort::SessionOptions session_options;
	session_options.SetIntraOpNumThreads(1);
	session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

	//OrtCUDAProviderOptions cuda_option;
	//cuda_option.device_id = 0;
	//cuda_option.arena_extend_strategy = 0;
	//cuda_option.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive;
	//cuda_option.gpu_mem_limit = SIZE_MAX;
	//cuda_option.do_copy_in_default_stream = 1;
	//session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
	//session_options.AppendExecutionProvider_CUDA(cuda_option);

	const wchar_t* model_path = L"swin_transformer.onnx";
	Ort::Session session(env, model_path, session_options);
	Ort::AllocatorWithDefaultOptions allocator;

	size_t num_input_nodes = session.GetInputCount();
	std::vector<const char*> input_node_names = { "input" };    // tensor names set via input_names/output_names at export time
	std::vector<const char*> output_node_names = { "output" };

	cv::Mat image = cv::imread("goldfish.jpg", 1);
	cv::resize(image, image, cv::Size(224, 224));

	// HWC BGR uint8 -> NCHW RGB float32, normalized with the ImageNet mean/std
	const size_t input_tensor_size = 1 * 3 * image.rows * image.cols;
	std::vector<float> input_tensor_values(input_tensor_size);

	for (int i = 0; i < image.rows; i++)      // row (height)
	{
		for (int j = 0; j < image.cols; j++)  // column (width)
		{
			input_tensor_values[0 * image.rows * image.cols + i * image.cols + j] = (image.ptr<uchar>(i)[j * 3 + 2] - 123.675) / 58.395;
			input_tensor_values[1 * image.rows * image.cols + i * image.cols + j] = (image.ptr<uchar>(i)[j * 3 + 1] - 116.28) / 57.12;
			input_tensor_values[2 * image.rows * image.cols + i * image.cols + j] = (image.ptr<uchar>(i)[j * 3 + 0] - 103.53) / 57.375;
		}
	}

	std::vector<int64_t> input_node_dims = { 1, 3, image.rows, image.cols };  // NCHW
	auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
	Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size());

	std::vector<Ort::Value> ort_inputs;
	ort_inputs.push_back(std::move(input_tensor));

	std::vector<Ort::Value> output_tensors = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), ort_inputs.data(), input_node_names.size(), output_node_names.data(), output_node_names.size());

	const float* rawOutput = output_tensors[0].GetTensorData<float>();
	std::vector<int64_t> outputShape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
	size_t count = output_tensors[0].GetTensorTypeAndShapeInfo().GetElementCount();
	std::vector<float> output(rawOutput, rawOutput + count);

	int predict_label = std::max_element(output.begin(), output.end()) - output.begin();
	std::cout << predict_label << std::endl;

	return 0;
}

If mmdeploy is installed, inference can also be done like this:

Python inference:

import cv2
from mmdeploy_runtime import Classifier

img = cv2.imread('mmpretrain/demo/demo.JPEG')
classifier = Classifier(model_path='work_dir/onnx/swin_transformer', device_name='cpu')
result = classifier(img)
for label_id, score in result:
    print(label_id, score)

For C++ inference, refer to: https://github.com/open-mmlab/mmdeploy/blob/main/demo/csrc/c/image_classification.cpp

Exporting the engine file

Here the ONNX file is converted with trtexec; my TensorRT version is TensorRT-8.2.1.8.
The model must first be simplified with onnxsim (this is the second pitfall), otherwise the conversion fails with an error.

import onnx
from onnxsim import simplify
onnx_model = onnx.load("swin_transformer.onnx")  # load onnx model
model_simp, check = simplify(onnx_model)
assert check, "Simplified ONNX model could not be validated"
onnx.save(model_simp, "swin_transformer_sim.onnx")

Then run the following in TensorRT's bin directory:

./trtexec.exe --onnx=swin_transformer_sim.onnx --saveEngine=swin_transformer.engine --workspace=20480

The third pitfall: if the --workspace parameter is omitted, the conversion may fail due to insufficient memory. My machine has 32 GB of RAM, so I simply set a 20 GB workspace; adjust this parameter according to how much memory you have.
At this point the engine file should export without issue.
I did not manage to export the engine with mmdeploy; I am not sure whether it was an environment configuration problem or a bug in the library on Windows.
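
For reference, the same conversion can also be scripted with the TensorRT Python API instead of trtexec. This is a minimal sketch for TensorRT 8.2 (the ONNX parser ships with the tensorrt Python package); the workspace limit mirrors the --workspace=20480 flag above:

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

# Parse the simplified ONNX file
with open("swin_transformer_sim.onnx", "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("failed to parse the ONNX file")

config = builder.create_builder_config()
config.max_workspace_size = 20 << 30  # 20 GB, same as --workspace=20480 (MB)

serialized_engine = builder.build_serialized_network(network, config)
with open("swin_transformer.engine", "wb") as f:
    f.write(serialized_engine)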

Inference with TensorRT

Python inference: (to be added)
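
A minimal sketch of the Python path, assuming the tensorrt and pycuda packages are installed and that img has already been preprocessed exactly as in the onnxruntime section above (a 1x3x224x224 float32 RGB array, normalized):

import numpy as np
import tensorrt as trt
import pycuda.autoinit  # creates a CUDA context
import pycuda.driver as cuda

logger = trt.Logger(trt.Logger.WARNING)
with open("swin_transformer.engine", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# Host/device buffers; binding 0 is the input, binding 1 the 1000-class output
h_input = np.ascontiguousarray(img, dtype=np.float32)
h_output = np.empty((1, 1000), dtype=np.float32)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()

cuda.memcpy_htod_async(d_input, h_input, stream)
context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()

print(np.argmax(h_output))
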
C++ inference:

// tensorRT include
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>  // ONNX parser header (not needed when only deserializing a prebuilt engine)

// cuda include
#include <cuda_runtime.h>

// opencv include
#include <opencv2/opencv.hpp>

// system include
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
#include <iostream>
#include <algorithm>


inline const char* severity_string(nvinfer1::ILogger::Severity t) {
	switch (t) {
	case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
	case nvinfer1::ILogger::Severity::kERROR:   return "error";
	case nvinfer1::ILogger::Severity::kWARNING: return "warning";
	case nvinfer1::ILogger::Severity::kINFO:    return "info";
	case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
	default: return "unknown";
	}
}

class TRTLogger : public nvinfer1::ILogger {
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override {
		if (severity <= Severity::kINFO) {
			if (severity == Severity::kWARNING)
				printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
			else if (severity <= Severity::kERROR)
				printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
			else
				printf("%s: %s\n", severity_string(severity), msg);
		}
	}
} logger;

std::vector<unsigned char> load_file(const std::string & file) {
	std::ifstream in(file, std::ios::in | std::ios::binary);
	if (!in.is_open())
		return {};

	in.seekg(0, std::ios::end);
	size_t length = in.tellg();

	std::vector<uint8_t> data;
	if (length > 0) {
		in.seekg(0, std::ios::beg);
		data.resize(length);

		in.read((char*)& data[0], length);
	}
	in.close();
	return data;
}


void inference() {
	// ------------------------------ 1. Prepare and load the model ----------------------------
	TRTLogger logger;
	auto engine_data = load_file("swin_transformer.engine");
	// Before running inference, create a runtime instance; like the builder, the runtime needs a logger
	nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
	// The engine bytes read into engine_data can now be deserialized to obtain the engine
	nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
	if (engine == nullptr) {
		printf("Deserialize cuda engine failed.\n");
		runtime->destroy();
		return;
	}

	nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
	cudaStream_t stream = nullptr;
	// Create a CUDA stream so that this batch's inference runs independently
	cudaStreamCreate(&stream);

	// ------------------------------ 2. Prepare the input data and copy it to the GPU ----------------------------
	cv::Mat image = cv::imread("goldfish.jpg", 1);
	cv::resize(image, image, cv::Size(224, 224));
	cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
	int input_numel = 1 * 3 * image.rows * image.cols;

	float* input_data_host = nullptr;
	cudaMallocHost(&input_data_host, input_numel * sizeof(float));

	// The image is already RGB (cvtColor above); write it as planar R, G, B and
	// normalize each channel with the ImageNet mean/std
	int image_area = image.cols * image.rows;
	unsigned char* pimage = image.data;
	float* phost_r = input_data_host + image_area * 0;
	float* phost_g = input_data_host + image_area * 1;
	float* phost_b = input_data_host + image_area * 2;
	for (int i = 0; i < image_area; ++i, pimage += 3) {
		*phost_r++ = (pimage[0] - 123.675) / 58.395;
		*phost_g++ = (pimage[1] - 116.28) / 57.12;
		*phost_b++ = (pimage[2] - 103.53) / 57.375;
	}

	float* input_data_device = nullptr;
	float output_data_host[1000];
	float* output_data_device = nullptr;
	cudaMalloc(&input_data_device, input_numel * sizeof(float));
	cudaMalloc(&output_data_device, sizeof(output_data_host));
	cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream);

	// Pointer array with the GPU addresses of the input and output bindings
	float* bindings[] = { input_data_device, output_data_device };

	// ------------------------------ 3. Run inference and copy the result back to the CPU ----------------------------
	bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
	cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	int predict_label = std::max_element(output_data_host, output_data_host + 1000) - output_data_host;
	std::cout << "predict_label: " << predict_label << std::endl;

	// ------------------------------ 4. Release resources ----------------------------
	cudaStreamDestroy(stream);
	execution_context->destroy();
	engine->destroy();
	runtime->destroy();
}

int main() {
	inference();
	return 0;
}
