Exporting a Swin Transformer ONNX file with mmpretrain is straightforward. Note that opset_version must be set to 12; this is the first pitfall: I initially set opset_version=11, which caused problems later when converting to a TensorRT engine.
import torch
from mmpretrain import get_model, inference_model
model = get_model('swin-tiny_16xb64_in1k',pretrained='swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth', device='cpu')
input = torch.zeros(1, 3, 224, 224)
out = model(input)
print(torch.argmax(out, dim=1))
torch.onnx.export(model, input, "swin_transformer.onnx", opset_version=12)
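Before moving on, it is worth a quick numerical check that the exported graph matches the PyTorch output. This is a minimal sketch under the assumption that onnxruntime is installed; it reuses the model and input objects from the export script above:
import numpy as np
import onnxruntime
model.eval()  # make the PyTorch reference deterministic (no dropout / stochastic depth)
with torch.no_grad():
    ref = model(input)
sess = onnxruntime.InferenceSession("swin_transformer.onnx", providers=['CPUExecutionProvider'])
onnx_out = sess.run(None, {sess.get_inputs()[0].name: input.numpy()})[0]
print(np.allclose(ref.numpy(), onnx_out, atol=1e-4))  # expect True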
If mmdeploy is installed, the model can also be converted with the following script:
from mmdeploy.apis import torch2onnx
from mmdeploy.backend.sdk.export_info import export2SDK
img = 'demo.JPEG'
work_dir = './work_dir/onnx/swin_transformer'
save_file = './end2end.onnx'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_dynamic.py'
model_cfg = 'mmpretrain/configs/swin_transformer/swin-tiny_16xb64_in1k.py'
model_checkpoint = './checkpoints/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth'
device = 'cpu'
# 1. convert model to onnx
torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, model_checkpoint, device)
# 2. extract pipeline info for sdk use (dump-info)
export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, device=device)
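Optionally, a quick smoke test of the exported end2end.onnx with ONNX Runtime. This is a minimal sketch under some assumptions: the exported graph takes a single NCHW float32 input and outputs class scores, and the SDK preprocessing pipeline is not applied here, so this only verifies that the model runs:
import numpy as np
import onnxruntime
sess = onnxruntime.InferenceSession('./work_dir/onnx/swin_transformer/end2end.onnx',
                                    providers=['CPUExecutionProvider'])
dummy = np.zeros((1, 3, 224, 224), dtype=np.float32)  # dummy input, no real preprocessing
scores = sess.run(None, {sess.get_inputs()[0].name: dummy})[0]
print(scores.shape, np.argmax(scores))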
Python inference:
import cv2
import numpy as np
import onnxruntime
use_letterbox = True
input_shape = (224, 224)
def letterbox(im, new_shape=(416, 416), color=(114, 114, 114)):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]
    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im
if __name__ == '__main__':
    img = cv2.imread('goldfish.jpg')
    if use_letterbox:
        or_img = letterbox(img, input_shape)
    else:
        or_img = cv2.resize(img, input_shape)
    img = or_img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB and HWC to CHW
    img = img.astype(dtype=np.float32)
    # Normalize each channel with the ImageNet mean/std used by mmpretrain
    img[0, :] = (img[0, :] - 123.675) / 58.395
    img[1, :] = (img[1, :] - 116.28) / 57.12
    img[2, :] = (img[2, :] - 103.53) / 57.375
    img = np.expand_dims(img, axis=0)
    onnx_session = onnxruntime.InferenceSession("swin_transformer.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
    input_name = []
    for node in onnx_session.get_inputs():
        input_name.append(node.name)
    output_name = []
    for node in onnx_session.get_outputs():
        output_name.append(node.name)
    input_feed = {}
    for name in input_name:
        input_feed[name] = img
    pred = onnx_session.run(None, input_feed)[0]
    print(pred)
    print(np.argmax(pred))
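If class probabilities rather than raw logits are wanted, a softmax plus top-5 readout can be appended after the run call. This is a small sketch; it does not map indices to ImageNet class names:
# Softmax over the 1000 logits, then the five highest-scoring class indices.
probs = np.exp(pred - pred.max()) / np.exp(pred - pred.max()).sum()
top5 = np.argsort(probs[0])[::-1][:5]
for idx in top5:
    print(int(idx), float(probs[0][idx]))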
C++ inference:
#include <iostream>
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>
int main(int argc, char* argv[])
{
Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "cls");
Ort::SessionOptions session_options;
session_options.SetIntraOpNumThreads(1);
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
//OrtCUDAProviderOptions cuda_option;
//cuda_option.device_id = 0;
//cuda_option.arena_extend_strategy = 0;
//cuda_option.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive;
//cuda_option.gpu_mem_limit = SIZE_MAX;
//cuda_option.do_copy_in_default_stream = 1;
//session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
//session_options.AppendExecutionProvider_CUDA(cuda_option);
const wchar_t* model_path = L"swin_transformer.onnx";
Ort::Session session(env, model_path, session_options);
Ort::AllocatorWithDefaultOptions allocator;
size_t num_input_nodes = session.GetInputCount();
std::vector<const char*> input_node_names = { "input" };
std::vector<const char*> output_node_names = { "output" };
cv::Mat image = cv::imread("goldfish.jpg", 1);
cv::resize(image, image, cv::Size(224, 224));
const size_t input_tensor_size = 1 * 3 * image.cols * image.rows;
std::vector<float> input_tensor_values(input_tensor_size);
// Convert HWC BGR uchar pixels to CHW RGB floats with the ImageNet mean/std.
// i indexes rows, j indexes columns (both 224 after the resize above).
for (int i = 0; i < image.rows; i++)
{
for (int j = 0; j < image.cols; j++)
{
input_tensor_values[0 * image.cols * image.rows + i * image.cols + j] = (image.ptr<uchar>(i)[j * 3 + 2] - 123.675) / 58.395;
input_tensor_values[1 * image.cols * image.rows + i * image.cols + j] = (image.ptr<uchar>(i)[j * 3 + 1] - 116.28) / 57.12;
input_tensor_values[2 * image.cols * image.rows + i * image.cols + j] = (image.ptr<uchar>(i)[j * 3 + 0] - 103.53) / 57.375;
}
}
// NCHW input shape: {1, 3, height, width}
std::vector<int64_t> input_node_dims = { 1, 3, image.rows, image.cols };
auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size());
std::vector<Ort::Value> ort_inputs;
ort_inputs.push_back(std::move(input_tensor));
std::vector<Ort::Value> output_tensors = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), ort_inputs.data(), input_node_names.size(), output_node_names.data(), output_node_names.size());
const float* rawOutput = output_tensors[0].GetTensorData<float>();
std::vector<int64_t> outputShape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
size_t count = output_tensors[0].GetTensorTypeAndShapeInfo().GetElementCount();
std::vector<float> output(rawOutput, rawOutput + count);
int predict_label = std::max_element(output.begin(), output.end()) - output.begin();
std::cout << predict_label << std::endl;
return 0;
}
If mmdeploy is installed, inference can also be done as follows:
Python inference:
import cv2
from mmdeploy_runtime import Classifier
img = cv2.imread('mmpretrain/demo/demo.JPEG')
classifier = Classifier(model_path='work_dir/onnx/swin_transformer', device_name='cpu')
result = classifier(img)
for label_id, score in result:
    print(label_id, score)
For C++ inference, refer to: https://github.com/open-mmlab/mmdeploy/blob/main/demo/csrc/c/image_classification.cpp
The ONNX file is converted with trtexec here; the TensorRT version I used is TensorRT-8.2.1.8.
The model must first be simplified with onnxsim. This is the second pitfall: without this step the conversion fails with errors.
import onnx
from onnxsim import simplify
onnx_model = onnx.load("swin_transformer.onnx") # load onnx model
model_simp, check = simplify(onnx_model)
assert check, "Simplified ONNX model could not be validated"
onnx.save(model_simp, "swin_transformer_sim.onnx")
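Optionally, a quick check that simplification did not change the network's behaviour. This is a minimal sketch, assuming onnxruntime is available; it feeds a dummy input to both models:
import numpy as np
import onnxruntime
x = np.zeros((1, 3, 224, 224), dtype=np.float32)
ref = onnxruntime.InferenceSession("swin_transformer.onnx", providers=['CPUExecutionProvider'])
sim = onnxruntime.InferenceSession("swin_transformer_sim.onnx", providers=['CPUExecutionProvider'])
a = ref.run(None, {ref.get_inputs()[0].name: x})[0]
b = sim.run(None, {sim.get_inputs()[0].name: x})[0]
print(np.allclose(a, b, atol=1e-4))  # should print True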
Then run the following from TensorRT's bin directory:
./trtexec.exe --onnx=swin_transformer_sim.onnx --saveEngine=swin_transformer.engine --workspace=20480
The third pitfall: if the --workspace argument is omitted, the conversion may fail with an out-of-memory error. My machine has 32 GB of RAM, so I simply set a 20 GB workspace; adjust this value according to your available memory.
At this point, barring surprises, the engine file is exported successfully.
I did not manage to export through mmdeploy; it is unclear whether this was an environment configuration problem or a bug in the library on Windows.
Python inference: (to be added; a rough sketch is given below in the meantime)
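Here is a rough sketch of deserializing the engine and running it with the TensorRT Python API. Assumptions: TensorRT 8.x Python bindings and pycuda are installed, and img is the preprocessed (1, 3, 224, 224) float32 array from the ONNX Runtime example above:
import numpy as np
import tensorrt as trt
import pycuda.autoinit  # creates a CUDA context
import pycuda.driver as cuda

trt_logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(trt_logger)
with open("swin_transformer.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# img: preprocessed (1, 3, 224, 224) float32 array, normalized as in the ONNX Runtime example.
img = np.ascontiguousarray(img, dtype=np.float32)
output = np.empty((1, 1000), dtype=np.float32)

# Allocate device buffers and a stream, copy input, run, copy output back.
# Binding order is assumed to be [input, output], matching the exported graph.
d_input = cuda.mem_alloc(img.nbytes)
d_output = cuda.mem_alloc(output.nbytes)
stream = cuda.Stream()
cuda.memcpy_htod_async(d_input, img, stream)
context.execute_async_v2([int(d_input), int(d_output)], stream.handle)
cuda.memcpy_dtoh_async(output, d_output, stream)
stream.synchronize()
print(np.argmax(output))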
C++ inference:
// tensorRT include
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>  // header for the ONNX parser
// cuda include
#include <cuda_runtime.h>
// opencv and system include
#include <opencv2/opencv.hpp>
#include <stdio.h>
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
inline const char* severity_string(nvinfer1::ILogger::Severity t) {
switch (t) {
case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
case nvinfer1::ILogger::Severity::kERROR: return "error";
case nvinfer1::ILogger::Severity::kWARNING: return "warning";
case nvinfer1::ILogger::Severity::kINFO: return "info";
case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
default: return "unknown";
}
}
class TRTLogger : public nvinfer1::ILogger {
public:
virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override {
if (severity <= Severity::kINFO) {
if (severity == Severity::kWARNING)
printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
else if (severity <= Severity::kERROR)
printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
else
printf("%s: %s\n", severity_string(severity), msg);
}
}
} logger;
std::vector<unsigned char> load_file(const std::string & file) {
std::ifstream in(file, std::ios::in | std::ios::binary);
if (!in.is_open())
return {};
in.seekg(0, std::ios::end);
size_t length = in.tellg();
std::vector<uint8_t> data;
if (length > 0) {
in.seekg(0, std::ios::beg);
data.resize(length);
in.read((char*)& data[0], length);
}
in.close();
return data;
}
void inference() {
// ------------------------------ 1. Prepare and load the model ----------------------------
TRTLogger logger;
auto engine_data = load_file("swin_transformer.engine");
// Before running inference, create a runtime instance. Like the builder, the runtime needs a logger:
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
// With the serialized model read into engine_data, deserialize it to obtain the engine
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
if (engine == nullptr) {
printf("Deserialize cuda engine failed.\n");
runtime->destroy();
return;
}
nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
cudaStream_t stream = nullptr;
// Create a CUDA stream so that inference for this batch is independent
cudaStreamCreate(&stream);
// ------------------------------ 2. Prepare the input data and copy it to the GPU ----------------------------
cv::Mat image = cv::imread("goldfish.jpg", 1);
cv::resize(image, image, cv::Size(224, 224));
// Convert BGR (OpenCV default) to RGB, matching the mmpretrain preprocessing
cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
int input_numel = 1 * 3 * image.rows * image.cols;
float* input_data_host = nullptr;
cudaMallocHost(&input_data_host, input_numel * sizeof(float));
// HWC RGB uchar -> CHW RGB float, normalized with the ImageNet mean/std.
// After the conversion above, pimage is laid out R, G, B, so channel 0 of the
// CHW tensor receives R, channel 1 G, channel 2 B.
int image_area = image.cols * image.rows;
unsigned char* pimage = image.data;
float* phost_r = input_data_host + image_area * 0;
float* phost_g = input_data_host + image_area * 1;
float* phost_b = input_data_host + image_area * 2;
for (int i = 0; i < image_area; ++i, pimage += 3) {
*phost_r++ = (pimage[0] - 123.675) / 58.395;
*phost_g++ = (pimage[1] - 116.28) / 57.12;
*phost_b++ = (pimage[2] - 103.53) / 57.375;
}
float* input_data_device = nullptr;
float output_data_host[1000];
float* output_data_device = nullptr;
cudaMalloc(&input_data_device, input_numel * sizeof(float));
cudaMalloc(&output_data_device, sizeof(output_data_host));
cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream);
// A pointer array holding the GPU addresses of the input and output bindings
float* bindings[] = { input_data_device, output_data_device };
// ------------------------------ 3. Run inference and copy the results back to the CPU ----------------------------
bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
int predict_label = std::max_element(output_data_host, output_data_host + 1000) - output_data_host;
std::cout << "predict_label: " << predict_label << std::endl;
// ------------------------------ 4. Release resources ----------------------------
cudaStreamDestroy(stream);
execution_context->destroy();
engine->destroy();
runtime->destroy();
}
int main() {
inference();
return 0;
}