搭建使用TensorRT加速的centerpoint模型环境

github地址:https://github.com/CarkusL/CenterPoint
本文安装的环境是RTX3090+CUDA11.1+CUDNN8.0.5+Pytorch1.8.0+PYTHON3.7.0

1、基础安装

# basic python libraries
conda create --name centerpoint python=3.7
conda activate centerpoint
conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=11.1 -c pytorch -c conda-forge
git clone https://hub.fastgit.org/CarkusL/CenterPoint.git
cd CenterPoint
pip install -r requirements.txt

# add CenterPoint to PYTHONPATH by adding the following line to ~/.bashrc (change the path accordingly)
export PYTHONPATH="${PYTHONPATH}:PATH_TO_CENTERPOINT"

2、安装nuscenes devkit

git clone https://github.com/tianweiy/nuscenes-devkit

# add the following line to ~/.bashrc and reactivate bash (remember to change the PATH_TO_NUSCENES_DEVKIT value)
export PYTHONPATH="${PYTHONPATH}:PATH_TO_NUSCENES_DEVKIT/python-sdk"

3、设置CUDA环境

# 加到~/.bashrc里面
export PATH=/usr/local/cuda-11.1/bin:$PATH
export CUDA_PATH=/usr/local/cuda-11.1
export CUDA_HOME=/usr/local/cuda-11.1
export LD_LIBRARY_PATH=/usr/local/cuda-11.1/lib64:$LD_LIBRARY_PATH

4、编译

source ~/.bashrc
bash setup.sh

中间编译deform_conv_cuda报错
解决办法:
CenterPoint/det3d/ops/dcn/src/deform_conv_cuda.cpp中用TORCH_CHECK替换掉AT_CHECK

5、安装apex

git clone https://github.com/NVIDIA/apex
cd apex
git checkout 5633f6  # recent commit doesn't build in our system 
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

6、安装spconv

sudo apt-get install libboost-all-dev
git clone https://github.com/traveller59/spconv.git --recursive
cd spconv && git checkout 7342772
python setup.py bdist_wheel
cd ./dist && pip install *

python setup.py bdist_wheel时报错:

error: no matching function for call to ‘torch::jit::RegisterOperators::RegisterOperators(const char [28], <unresolved overloaded function type>)

解决办法:
torch::jit::RegisterOperators()替换成torch::RegisterOperators()

7、生成数据集元素

nuscenes数据集目录结构

# For nuScenes Dataset         
└── NUSCENES_DATASET_ROOT
       ├── samples       <-- key frames
       ├── sweeps        <-- frames without annotation
       ├── maps          <-- unused
       ├── v1.0-mini <-- metadata
mkdir data
cd data
mkdir nuScenes
cd nuScenes
#下面使用软连接或者拷贝的方式把前面提到的目录结构放进去
#接着运行以下代码,其中NUSCENES_TRAINVAL_DATASET_ROOT代表到nuScenes文件夹的路径
python tools/create_data.py nuscenes_data_prep --root_path=NUSCENES_TRAINVAL_DATASET_ROOT --version="v1.0-mini" --nsweeps=10

8、转换模型

python tools/export_pointpillars_onnx.py

报错:
搭建使用TensorRT加速的centerpoint模型环境_第1张图片

解决办法:
初步认为是pytorch版本问题,使用CUDA10.2+CUDNN8.2.2+Pytorch1.7.0+PYTHON3.7.0之后没有问题

9、简化模型

python tools/simplify_model.py
python tools/merge_pfe_rpn_model.py

10、下载TensorRT

参考地址:https://developer.nvidia.com/tensorrt(NVIDIA官网下载TensorRT)
CenterPoint/tensorrt/samples文件夹下的文件放到tensorrt主目录下的samples文件夹下,将CenterPoint/tensorrt/data文件夹下的文件放到tensorrt主目录下的data文件夹下

11、编译

进tensorRT主目录下的samples/centerpoint

make

报错:
提示找不到NvInferPlugin.h
解决办法:

sudo cp TENSORRT_ROOT/include/* /usr/include

TENSORRT_ROOT是TensorRT的主目录

继续报错:
搭建使用TensorRT加速的centerpoint模型环境_第2张图片
解决办法:
我这边的原因是因为作者可能使用的低版本的tensorrt,而我的版本较高,导致接口对应不上,因此需要修改头文件和cpp文件,修改后如下:

ScatterNDPlugin.h:

#ifndef BATCHTILEPLUGIN_H
#define BATCHTILEPLUGIN_H
#include "NvInferPlugin.h"

#include <string>
#include <vector>
namespace nvinfer1
{
namespace plugin
{
// ScatterND plugin built on TensorRT's IPluginV2Ext (implicit-batch plugin
// API). It scatters rows of the "updates" input into a zero-initialized
// output tensor of shape [H*W, C] at the row positions given by the index
// input. Shapes are fixed at build time from the ONNX node attributes.
class ScatterNDPlugin : public IPluginV2Ext
{
public:
    // Build-time constructor: output shape and index shape come from the
    // ONNX node attributes (see ScatterNDSamplePluginCreator::createPlugin).
    ScatterNDPlugin(const std::string name, const size_t mOutputSizeAttr[], const size_t inputShapeAttr[], const DataType type);

    // Deserialization constructor: `data`/`length` is the buffer produced by
    // serialize(); fields are read back in the exact order they were written.
    ScatterNDPlugin(const std::string name, const void* data, size_t length);

    // A plugin always needs a name and shapes, so default construction is forbidden.
    ScatterNDPlugin() = delete;

   int32_t getNbOutputs() const noexcept override;

    Dims getOutputDimensions(int32_t index, Dims const* inputs, int32_t nbInputDims) noexcept override;

    int32_t initialize() noexcept override;
    void terminate() noexcept override;

    size_t getWorkspaceSize(int32_t maxBatchSize) const noexcept override;

    int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace,
        cudaStream_t stream) noexcept override;

    DataType getOutputDataType(
        int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override;

    size_t getSerializationSize() const noexcept override;

    void serialize(void* buffer) const noexcept override;

    bool isOutputBroadcastAcrossBatch(
        int32_t outputIndex, bool const* inputIsBroadcasted, int32_t nbInputs) const noexcept override;

    bool canBroadcastInputAcrossBatch(int32_t inputIndex) const noexcept override;

    void configurePlugin(Dims const* inputDims, int32_t nbInputs, Dims const* outputDims, int32_t nbOutputs,
        DataType const* inputTypes, DataType const* outputTypes, bool const* inputIsBroadcast,
        bool const* outputIsBroadcast, PluginFormat floatFormat, int32_t maxBatchSize) noexcept override;

    bool supportsFormat(DataType type, PluginFormat format) const noexcept override;

    AsciiChar const* getPluginType() const noexcept override;

    AsciiChar const* getPluginVersion() const noexcept override;

    void destroy() noexcept override;

    IPluginV2Ext* clone() const noexcept override;

    void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override;

    AsciiChar const* getPluginNamespace() const noexcept override;

private:
    const std::string mLayerName;      // instance name given by the creator
    size_t mCopySize;                  // NOTE(review): never written/read in the .cu -- appears unused
    std::string mNamespace;            // plugin namespace set via setPluginNamespace()
    DataType mDataType;                // kFLOAT or kHALF (see DATA_TYPE in the .cu)
    size_t mOutputSize[2]; // [H*W, C]
    size_t mInputIndexSize[2]; // [H*W, C]

};

// Factory the TensorRT plugin registry uses to create and deserialize
// ScatterNDPlugin instances (registered with REGISTER_TENSORRT_PLUGIN).
class ScatterNDSamplePluginCreator : public IPluginCreator
{
public:
    ScatterNDSamplePluginCreator();

    AsciiChar const* getPluginNamespace() const noexcept override;

    AsciiChar const* getPluginVersion() const noexcept override;

    PluginFieldCollection const* getFieldNames() noexcept override;

    IPluginV2* createPlugin(AsciiChar const* name, PluginFieldCollection const* fc) noexcept override;

    IPluginV2* deserializePlugin(AsciiChar const* name, void const* serialData, size_t serialLength) noexcept override;

    void setPluginNamespace(AsciiChar const* pluginNamespace) noexcept override
    {
        mNamespace = pluginNamespace;
    }

    // BUG FIX: getPluginName() must return the plugin *type* name that the
    // ONNX parser looks up in the registry ("ScatterND"). The original
    // returned mNamespace, so a lookup by plugin name could never match.
    AsciiChar const* getPluginName() const noexcept override
    {
        return "ScatterND";
    }

private:
    static PluginFieldCollection mFC;              // field schema shared by all creators
    static std::vector<PluginField> mPluginAttributes;
    std::string mNamespace;                        // set via setPluginNamespace()
    DataType mDataType;                            // data type chosen at createPlugin() time
};

} // namespace plugin
} // namespace nvinfer1

#endif

ScatterNDPlugin.cu:

/**
 * For the usage of those member function, please refer to the
 * official API doc.
 * https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_plugin_v2_ext.html
 */

#include "ScatterNDPlugin.h"

#include <cassert>
#include <cstring>
#include <iostream>

#include "cuda_runtime.h"
#include "cuda_fp16.h"



// Use fp16 mode for inference
#define DATA_TYPE nvinfer1::DataType::kHALF
#define THREAD_NUM 1024

// Helper function for deserializing plugin
// Deserialization helper: read one T from `buffer` and advance the cursor
// past it. The caller is responsible for buffer bounds and alignment.
template <typename T>
T readFromBuffer(const char*& buffer)
{
    const T* slot = reinterpret_cast<const T*>(buffer);
    buffer += sizeof(T);
    return *slot;
}
// Helper function for serializing plugin
// Serialization helper: copy `val` into `buffer` and advance the cursor past
// it. Must mirror readFromBuffer exactly (same order, same types).
template <typename T>
void writeToBuffer(char*& buffer, const T& val)
{
    T* slot = reinterpret_cast<T*>(buffer);
    *slot = val;
    buffer += sizeof(T);
}

using namespace nvinfer1;
using nvinfer1::plugin::ScatterNDPlugin;
using nvinfer1::plugin::ScatterNDSamplePluginCreator;

static const char* SCATTERND_PLUGIN_VERSION{"1"};
static const char* SCATTERND_PLUGIN_NAME{"ScatterND"};

PluginFieldCollection ScatterNDSamplePluginCreator::mFC{};
std::vector<PluginField> ScatterNDSamplePluginCreator::mPluginAttributes;



// Build-time constructor: captures the output shape [H*W, C] and the first
// two entries of the index shape taken from the ONNX node attributes.
ScatterNDPlugin::ScatterNDPlugin(const std::string name, const size_t outputShapeArray[], 
                                 const size_t indexShapeArray[], const DataType type) : mLayerName(name), mDataType(type)
{
    mOutputSize[0] = outputShapeArray[0];
    mOutputSize[1] = outputShapeArray[1];

    mInputIndexSize[0] = indexShapeArray[0];
    mInputIndexSize[1] = indexShapeArray[1];

}

// Deserialization constructor: reads fields back in the exact order that
// serialize() wrote them (type, output shape, index shape).
ScatterNDPlugin::ScatterNDPlugin(const std::string name, const void* data, size_t length)
    : mLayerName(name)
{
    const char *d = reinterpret_cast<const char *>(data);
    const char *a = d;

    mDataType = readFromBuffer<DataType>(d);
    mOutputSize[0] = readFromBuffer<size_t>(d);
    mOutputSize[1] = readFromBuffer<size_t>(d);
    mInputIndexSize[0] = readFromBuffer<size_t>(d);
    mInputIndexSize[1] = readFromBuffer<size_t>(d);

    // Sanity check: we must have consumed exactly the serialized payload.
    assert(d == a + length);
}

// The plugin produces exactly one output: the scattered tensor.
int32_t ScatterNDPlugin::getNbOutputs() const noexcept
{
    return 1;
}

// Output shape matches the ScatterND "data" input (input 0): [H*W, C].
Dims ScatterNDPlugin::getOutputDimensions(int32_t index, Dims const* inputs, int32_t nbInputDims) noexcept
{   
    // scatterND data input 
    return Dims2(inputs[0].d[0],inputs[0].d[1]);
}

// Nothing to set up beyond the shape members captured in the constructor.
int32_t ScatterNDPlugin::initialize() noexcept
{
    return 0;
}

// The kernel writes directly into the output; no scratch workspace is needed.
size_t ScatterNDPlugin::getWorkspaceSize(int32_t maxBatchSize) const noexcept
{
    return 0;
}

// Output data type follows the "updates" input (input index 2).
DataType ScatterNDPlugin::getOutputDataType(
        int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept
{
    return inputTypes[2];
}

// One thread per index row: copies the channel_num-wide row `idx_num` of
// `updata_input` into row `indicesInputPtr[idx_num*2+1]` of `output`.
// Rows whose target index is negative are treated as padding and skipped.
// NOTE(review): only the second component of each 2-wide index tuple is used;
// presumably the first is a batch index that is always 0 here -- confirm.
template <typename Dtype>
__global__ void _ScatterNDKernel(const Dtype *updata_input, const int *indicesInputPtr , Dtype* output,
        int channel_num, int max_index_num) {
    
    int idx_num = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx_num >= max_index_num) return;    
    
    int idx_output = indicesInputPtr[idx_num*2+1];
    if (idx_output < 0) return;
    
    for(int idx=0; idx < channel_num; idx++){
        output[idx_output*channel_num+idx] = updata_input[idx_num*channel_num+idx];
    }
}

 int32_t ScatterNDPlugin::enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace,
        cudaStream_t stream) noexcept
{
    int channel_num = mOutputSize[1];
    int max_index_num = mInputIndexSize[0];

    int totalElems = mOutputSize[0]*channel_num;
    
    dim3 blockSize(THREAD_NUM);
    dim3 gridsize(max_index_num/blockSize.x+1);
     
    // if you want to inference use fp32, change the DATA_TYPE
    switch (mDataType)
    {
    case nvinfer1::DataType::kFLOAT:
        cudaMemset(outputs[0], 0, totalElems * sizeof(float));
        _ScatterNDKernel<<<gridsize, blockSize,0,stream>>>(static_cast<float const*> (inputs[2]), static_cast<int32_t const*> (inputs[1]), 
                                                    static_cast<float *> (outputs[0]), channel_num, max_index_num);
        break;

    case nvinfer1::DataType::kHALF:
        cudaMemset(outputs[0], 0, totalElems * sizeof(float)/2);
        _ScatterNDKernel<<<gridsize, blockSize,0,stream>>>(static_cast<int16_t const*> (inputs[2]), static_cast<int32_t const*> (inputs[1]), 
                                                           static_cast<int16_t *> (outputs[0]), channel_num, max_index_num);
        
        break;
    
    default:
        std::cout << "[ERROR]: mDataType dones't support" << std::endl;
    }
    return 0;
}

// Writes the plugin state; field order must match the deserialization
// constructor and the total size must match getSerializationSize().
void ScatterNDPlugin::serialize(void* buffer) const noexcept
{
    char* d = static_cast<char*>(buffer);
    char *a = d;
    writeToBuffer<DataType>(d, mDataType);
    writeToBuffer<size_t>(d, mOutputSize[0]);
    writeToBuffer<size_t>(d, mOutputSize[1]);
    writeToBuffer<size_t>(d, mInputIndexSize[0]);
    writeToBuffer<size_t>(d, mInputIndexSize[1]);

    // Sanity check: we wrote exactly the advertised number of bytes.
    assert(d == a + getSerializationSize());
}

// No resources were acquired in initialize(), so nothing to release.
void ScatterNDPlugin::terminate() noexcept {
}

// One DataType plus the four size_t shape fields written by serialize().
size_t ScatterNDPlugin::getSerializationSize() const noexcept
{
    return sizeof(DataType)+ 4*sizeof(size_t);
}

// No broadcasting across the batch dimension for the output.
bool ScatterNDPlugin::isOutputBroadcastAcrossBatch(
        int32_t outputIndex, bool const* inputIsBroadcasted, int32_t nbInputs) const noexcept
{
    return false;
}

// Inputs are never broadcast across the batch either.
bool ScatterNDPlugin::canBroadcastInputAcrossBatch(int32_t inputIndex) const noexcept
{
    return false;
}

// Refreshes the cached shapes from the actual network dimensions:
// output [H*W, C] from output 0, index shape from input 1 (the indices).
void ScatterNDPlugin::configurePlugin(Dims const* inputDims, int32_t nbInputs, Dims const* outputDims, int32_t nbOutputs,
        DataType const* inputTypes, DataType const* outputTypes, bool const* inputIsBroadcast,
        bool const* outputIsBroadcast, PluginFormat floatFormat, int32_t maxBatchSize) noexcept
{
    mOutputSize[0] = outputDims[0].d[0];
    mOutputSize[1] = outputDims[0].d[1];
    mInputIndexSize[0] = inputDims[1].d[0];
    mInputIndexSize[1] = inputDims[1].d[1];
}

// Accepts INT32 (indices) plus FLOAT/HALF payloads.
// NOTE(review): `format` is ignored, exactly as in the original -- only the
// data type is checked.
bool ScatterNDPlugin::supportsFormat(DataType type, PluginFormat format) const noexcept
{
    return type == nvinfer1::DataType::kINT32
        || type == nvinfer1::DataType::kFLOAT
        || type == nvinfer1::DataType::kHALF;
}

/**
 * Plugin type name used for registry lookup; must match the ONNX op name.
 */
AsciiChar const* ScatterNDPlugin::getPluginType() const noexcept
{
    return SCATTERND_PLUGIN_NAME;
}

/**
 * Plugin version string; paired with the type name in the registry.
 */
AsciiChar const* ScatterNDPlugin::getPluginVersion() const noexcept
{
    return SCATTERND_PLUGIN_VERSION;
}

// TensorRT calls destroy() when it is done with this instance.
void ScatterNDPlugin::destroy() noexcept
{
    delete this;
}

// Deep copy used by the builder/runtime; carries over shapes, type, namespace.
IPluginV2Ext* ScatterNDPlugin::clone() const noexcept
{
    auto* plugin = new ScatterNDPlugin(mLayerName, mOutputSize, mInputIndexSize, mDataType);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}

/**
 * Stores the namespace assigned by the registry/creator.
 */
void ScatterNDPlugin::setPluginNamespace(AsciiChar const* pluginNamespace) noexcept
{
    mNamespace = pluginNamespace;
}

/**
 * Returns the namespace previously set via setPluginNamespace().
 */
AsciiChar const* ScatterNDPlugin::getPluginNamespace() const noexcept
{
    return mNamespace.c_str();
}

// Declares the attribute schema the ONNX parser fills in at createPlugin()
// time: two INT32[3] shape attributes. Data pointers are null here; actual
// values arrive in the PluginFieldCollection passed to createPlugin().
ScatterNDSamplePluginCreator::ScatterNDSamplePluginCreator()
{   
    mPluginAttributes.emplace_back(PluginField("output_shape", nullptr, PluginFieldType::kINT32, 3));
    mPluginAttributes.emplace_back(PluginField("index_shape", nullptr, PluginFieldType::kINT32, 3));

    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

/**
 * BUG FIX: return the namespace stored by setPluginNamespace(). The original
 * returned the plugin *name* constant ("ScatterND"), which was swapped with
 * getPluginName() and breaks namespace-qualified registry lookups.
 */
AsciiChar const* ScatterNDSamplePluginCreator::getPluginNamespace() const noexcept
{
    return mNamespace.c_str();
}

/**
 * Creator version must match the plugin's version for registry pairing.
 */
AsciiChar const* ScatterNDSamplePluginCreator::getPluginVersion() const noexcept
{
    return SCATTERND_PLUGIN_VERSION;
}

/**
 * Exposes the attribute schema declared in the creator's constructor.
 */
PluginFieldCollection const* ScatterNDSamplePluginCreator::getFieldNames() noexcept
{   
    return &mFC;
}

// Builds a ScatterNDPlugin from the ONNX node attributes. Only entries [1]
// and [2] of each 3-long shape attribute are kept -- presumably entry [0] is
// the batch dimension and is dropped (TODO confirm against the exported ONNX).
// The inference precision is forced by the DATA_TYPE macro, not the network.
IPluginV2* ScatterNDSamplePluginCreator::createPlugin(AsciiChar const* name, PluginFieldCollection const* fc) noexcept
{
    
    const nvinfer1::PluginField* fields = fc->fields;
    
    mDataType = DATA_TYPE;

    size_t indexShapeArray[2] = {0};
    size_t outputShapeArray[2] = {0};

    for (int i=0; i<fc->nbFields; i++) {
        if(!strcmp(fields[i].name, "output_shape")){
            const auto *outputShapeAttr = static_cast<const int32_t*>(fields[i].data);
            outputShapeArray[0] = outputShapeAttr[1];
            outputShapeArray[1] = outputShapeAttr[2];

        }
        if(!strcmp(fields[i].name, "index_shape")){
            const auto * indexShapeAttr = static_cast<const int32_t*>(fields[i].data);

            indexShapeArray[0] = indexShapeAttr[1];
            indexShapeArray[1] = indexShapeAttr[2];
        }
    }
    
    auto* plugin = new ScatterNDPlugin(name, outputShapeArray, indexShapeArray, mDataType);
    plugin->setPluginNamespace(mNamespace.c_str());
    return plugin;
}

// Rehydrates a plugin from an engine file; the buffer layout is defined by
// ScatterNDPlugin::serialize().
IPluginV2* ScatterNDSamplePluginCreator::deserializePlugin(AsciiChar const* name, void const* serialData, size_t serialLength) noexcept
{   
    return new ScatterNDPlugin(name, serialData, serialLength);
}

REGISTER_TENSORRT_PLUGIN(ScatterNDSamplePluginCreator);

继续make,报错:
在这里插入图片描述
解决办法:
1.下载tensorrt7,将其中lib文件夹下的libmyelin.so,libmyelin.so.1
,libmyelin.so.1.1.116放到现在的TensorRT8根目录中的lib文件夹下
2.将Makefile.config

COMMON_LIBS += $(MYELIN_LIB) $(NVRTC_LIB) 

替换为

COMMON_LIBS += $(NVRTC_LIB)

12、 测试

运行CenterPoint下的TensorRT_Visualize.ipynb可以生成data,将tensorrt/data目录下的文件复制到TENSORRT8根目录/data

接着在TENSORRT8根目录/bin文件夹下执行./centerpoint
发现程序没有打印耗时统计信息
解决办法:
TENSORRT8根目录/samples/centerpoint/samplecenterpoint.cpp中253行中的路径改为绝对路径
253行原本为:

std::vector<std::string> filePath = glob("../"+mParams.dataDirs[0]+"/points/*.bin");

接着报错:
在这里插入图片描述
解决办法:
tensorrt版本换成7.1.3.4,完美解决,记得换完后把/usr/include下面的NvInfer*.h删掉,换成最新的。

**PS:**高版本适配低版本也太难了,最后还是妥协了= =

你可能感兴趣的:(Linux,深度学习,#,3D目标检测,pytorch,深度学习)