Deploying SwinTransformer with OnnxRuntime

1. Overview

In practice, SwinTransformer really does deliver accuracy gains: fine-tuning downstream tasks with SwinTransformer as the backbone conservatively brings a 2-5 point improvement over a ResNet50 baseline, although the parameter count is noticeably larger. I benchmarked inference speed with OnnxRuntime in CPU mode and in GPU mode (without TensorRT). For most image-recognition tasks the measured speed is acceptable.

Mode    Hardware                                   Input      Average latency
CPU     Intel(R) Xeon(R) W-2102 CPU @ 2.90GHz      224*224
GPU     Nvidia Tesla T4                            224*224
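
The latency column above can be reproduced with a short onnxruntime benchmark. The following is a minimal Python sketch (it assumes the exported swin-transform.onnx produced in section 3 and a dummy 224*224 input; switch the provider to CUDAExecutionProvider for the GPU measurement):

import time
import numpy as np
import onnxruntime as ort

# assumed model file name from the export step below
session = ort.InferenceSession("swin-transform.onnx", providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name
x = np.random.rand(1, 3, 224, 224).astype(np.float32)

session.run(None, {input_name: x})                  # warm-up run
start = time.perf_counter()
runs = 100
for _ in range(runs):
    session.run(None, {input_name: x})
print("average latency: %.2f ms" % ((time.perf_counter() - start) / runs * 1000))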

2. Environment

  • ubuntu 18.04
  • pytorch 1.10 (older versions are not supported)
  • onnxruntime 1.10
  • cuda 11.4
  • cudnn 8.2.4 (the CUDA version needs to be 11.4 or above)
  • opencv 4.4

The correspondence between onnxruntime and CUDA versions is shown in the figure below.
(Figure: CUDA - onnxruntime version mapping)
Onnxruntime download address
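
Before converting or running the model, it helps to confirm that the installed onnxruntime build matches the versions above. A quick sanity check in Python (assuming onnxruntime or onnxruntime-gpu 1.10 is installed):

import onnxruntime as ort

print(ort.__version__)                 # should report 1.10.x
print(ort.get_device())                # "GPU" for the CUDA-enabled build, "CPU" otherwise
print(ort.get_available_providers())   # e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider']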

3. Converting the model to ONNX

Strip off the classification head used during training and only export the feature-extraction part of the timm SwinTransformer.
The converted model can be downloaded here:
Link: https://pan.baidu.com/s/1oKUrPxPtYUFGVXJ2SiBP3g
Extraction code: czfj

import timm
import torch.nn as nn
import torch
# feature extractor: timm Swin backbone with the classification head removed
class ft_net_swin_extract(nn.Module):

    def __init__(self, class_num, droprate=0.5, stride=2):
        super(ft_net_swin_extract, self).__init__()
        model_ft = timm.create_model('swin_base_patch4_window7_224', pretrained=True)
        # avg pooling to global pooling
        #model_ft.avgpool = nn.AdaptiveAvgPool2d((1,1))
        model_ft.head = nn.Sequential() # save memory
        self.model = model_ft
    def forward(self, x):
        x = self.model.forward_features(x)
        return x

# export script (the class above is assumed to be saved as model.py)
from model import ft_net_swin_extract
import numpy as np
import torch
from torchvision import models, transforms, datasets
import cv2
import onnx
import onnxruntime


data_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
image_file = "000_000_gan0002_c3s1_136308_04.jpg"
input = cv2.imread(image_file)                       # OpenCV loads images as BGR
img_h, img_w, _ = input.shape
resize_input = cv2.resize(input, (224, 224))
resize_input = cv2.cvtColor(resize_input, cv2.COLOR_BGR2RGB)  # the ImageNet mean/std above assume RGB
image = data_transforms(resize_input)
image = image.unsqueeze(0)
print(image.shape)

model = ft_net_swin_extract(class_num=751)
model_path = "/home/nemo/DeepLearning/Person_reID_baseline_pytorch/MarketOut/best.pth"
# the checkpoint was trained with a classification head; ignore keys that no longer exist
model.load_state_dict(torch.load(model_path), strict=False)
model.eval()

torch_out = model(image)
# Export the model
torch.onnx.export(model,                     # model being run
                  image,                         # model input (or a tuple for multiple inputs)
                  "swin-transform.onnx",     # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['input'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  )

onnx_model = onnx.load("swin-transform.onnx")
onnx.checker.check_model(onnx_model)
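
Beyond the structural check above, it is worth running the exported model with onnxruntime and comparing the result against the PyTorch output. A minimal sketch that continues the export script (it reuses image, torch_out, np and the onnxruntime import from above):

ort_session = onnxruntime.InferenceSession("swin-transform.onnx",
                                           providers=["CPUExecutionProvider"])
ort_inputs = {ort_session.get_inputs()[0].name: image.numpy()}
ort_out = ort_session.run(None, ort_inputs)[0]

# small numerical differences between PyTorch and onnxruntime are expected
np.testing.assert_allclose(torch_out.detach().numpy(), ort_out, rtol=1e-3, atol=1e-5)
print("onnxruntime output matches PyTorch, feature shape:", ort_out.shape)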

4. Writing the onnxruntime inference code

Code repositories:
https://gitee.com/running_jiang/swintransformer-onnxruntime.git
https://github.com/runningJ/swintransformer-onnxruntime.git
A star is appreciated if you find the code useful.

CPU version

#include <iostream>
#include <string>
#include <vector>
#include <numeric>
#include <functional>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>

using namespace std;
using namespace cv;
using namespace Ort;


template <typename T>
T vectorProduct(const std::vector<T>& v)
{
    return accumulate(v.begin(), v.end(), static_cast<T>(1), std::multiplies<T>());
}

int main(int argc, char** argv)
{
    if (argc != 3)
    {
        cerr << "usage " << argv[0] << " image_path model_path" << endl;
        return -1;
    }
    string image_path = argv[1];
    string model_path = argv[2];

    // create the environment and a CPU inference session
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "swin-transformer");
    Ort::SessionOptions sessionOptions;
    sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    Ort::Session session(env, model_path.c_str(), sessionOptions);

    Ort::AllocatorWithDefaultOptions allocator;
    const char* inputName = session.GetInputName(0, allocator);
    const char* outputName = session.GetOutputName(0, allocator);

    Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
    auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> inputDims = inputTensorInfo.GetShape();
    std::cout << "Input Dimensions: ";
    for (size_t i = 0; i < inputDims.size(); ++i)
    {
        cout << inputDims[i] << " ";
    }
    cout << endl;

    Ort::TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
    auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
    std::cout << "Output Dimensions: ";
    for (size_t i = 0; i < outputDims.size(); ++i)
    {
        cout << outputDims[i] << " ";
    }
    cout << endl;

    // preprocessing: resize to the model input size, BGR->RGB, scale to [0,1],
    // normalize with the ImageNet mean/std and pack the channels as CHW
    Mat image = imread(image_path);
    Mat resized, floatImage, preprocessedImage;
    resize(image, resized, Size(inputDims[3], inputDims[2]));
    cvtColor(resized, resized, COLOR_BGR2RGB);
    resized.convertTo(floatImage, CV_32F, 1.0 / 255.0);
    const float mean[3] = {0.485f, 0.456f, 0.406f};
    const float stdv[3] = {0.229f, 0.224f, 0.225f};
    Mat channels[3];
    split(floatImage, channels);
    for (int c = 0; c < 3; ++c)
    {
        channels[c] = (channels[c] - mean[c]) / stdv[c];
    }
    vconcat(channels, 3, preprocessedImage);   // contiguous CHW float buffer

    size_t inputTensorSize = vectorProduct(inputDims);
    std::vector<float> inputTensorValues(inputTensorSize);
    inputTensorValues.assign(preprocessedImage.begin<float>(),
                             preprocessedImage.end<float>());

    size_t outputTensorSize = vectorProduct(outputDims);
    std::vector<float> outputTensorValues(outputTensorSize);

    std::vector<const char*> inputNames{inputName};
    std::vector<const char*> outputNames{outputName};

    std::vector<Ort::Value> inputTensors;
    std::vector<Ort::Value> outputTensors;

    MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

    inputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, inputTensorValues.data(), inputTensorSize, inputDims.data(),
        inputDims.size()));

    outputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, outputTensorValues.data(), outputTensorSize,
        outputDims.data(), outputDims.size()));

    // run 100 times and report the per-run latency
    for (int i = 0; i < 100; ++i)
    {
        auto s_t = std::chrono::steady_clock::now();
        session.Run(Ort::RunOptions{nullptr}, inputNames.data(),
                    inputTensors.data(), 1, outputNames.data(),
                    outputTensors.data(), 1);
        auto e_t = std::chrono::steady_clock::now();
        double dr_s = std::chrono::duration<double, std::milli>(e_t - s_t).count();
        cout << "running inference cost time " << dr_s << "ms" << endl;
    }
    return 0;
}

CUDA version

#include <iostream>
#include <string>
#include <vector>
#include <numeric>
#include <functional>
#include <chrono>
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>

using namespace std;
using namespace cv;
using namespace Ort;

template <typename T>
T vectorProduct(const std::vector<T>& v)
{
    return accumulate(v.begin(), v.end(), static_cast<T>(1), std::multiplies<T>());
}

int main(int argc, char** argv)
{
    if (argc != 3)
    {
        cerr << "usage " << argv[0] << " image_path model_path" << endl;
        return -1;
    }
    string image_path = argv[1];
    string model_path = argv[2];

    // create the environment and append the CUDA execution provider (device 0);
    // everything else is identical to the CPU version
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "swin-transformer");
    Ort::SessionOptions sessionOptions;
    sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    OrtCUDAProviderOptions cudaOptions;
    cudaOptions.device_id = 0;
    sessionOptions.AppendExecutionProvider_CUDA(cudaOptions);
    Ort::Session session(env, model_path.c_str(), sessionOptions);

    Ort::AllocatorWithDefaultOptions allocator;
    const char* inputName = session.GetInputName(0, allocator);
    const char* outputName = session.GetOutputName(0, allocator);

    Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
    auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> inputDims = inputTensorInfo.GetShape();
    std::cout << "Input Dimensions: ";
    for (size_t i = 0; i < inputDims.size(); ++i)
    {
        cout << inputDims[i] << " ";
    }
    cout << endl;

    Ort::TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
    auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
    std::cout << "Output Dimensions: ";
    for (size_t i = 0; i < outputDims.size(); ++i)
    {
        cout << outputDims[i] << " ";
    }
    cout << endl;

    // same preprocessing as the CPU version: resize, BGR->RGB, [0,1], ImageNet normalize, CHW
    Mat image = imread(image_path);
    Mat resized, floatImage, preprocessedImage;
    resize(image, resized, Size(inputDims[3], inputDims[2]));
    cvtColor(resized, resized, COLOR_BGR2RGB);
    resized.convertTo(floatImage, CV_32F, 1.0 / 255.0);
    const float mean[3] = {0.485f, 0.456f, 0.406f};
    const float stdv[3] = {0.229f, 0.224f, 0.225f};
    Mat channels[3];
    split(floatImage, channels);
    for (int c = 0; c < 3; ++c)
    {
        channels[c] = (channels[c] - mean[c]) / stdv[c];
    }
    vconcat(channels, 3, preprocessedImage);   // contiguous CHW float buffer

    size_t inputTensorSize = vectorProduct(inputDims);
    std::vector<float> inputTensorValues(inputTensorSize);
    inputTensorValues.assign(preprocessedImage.begin<float>(),
                             preprocessedImage.end<float>());
    size_t outputTensorSize = vectorProduct(outputDims);
    std::vector<float> outputTensorValues(outputTensorSize);

    std::vector<const char*> inputNames{inputName};
    std::vector<const char*> outputNames{outputName};

    std::vector<Ort::Value> inputTensors;
    std::vector<Ort::Value> outputTensors;

    MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);

    inputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, inputTensorValues.data(), inputTensorSize, inputDims.data(),
        inputDims.size()));

    outputTensors.push_back(Value::CreateTensor<float>(
        memoryInfo, outputTensorValues.data(), outputTensorSize,
        outputDims.data(), outputDims.size()));

    // run 100 times and report the per-run latency
    for (int i = 0; i < 100; ++i)
    {
        auto s_t = std::chrono::steady_clock::now();
        session.Run(Ort::RunOptions{nullptr}, inputNames.data(),
                    inputTensors.data(), 1, outputNames.data(),
                    outputTensors.data(), 1);
        auto e_t = std::chrono::steady_clock::now();
        double dr_s = std::chrono::duration<double, std::milli>(e_t - s_t).count();
        cout << "running inference cost time " << dr_s << "ms" << endl;
    }
    return 0;
}
