TensorRT_mnist_int8校正

  • 定义网络

为INT8执行定义网络与为任何其他精度定义网络完全相同。权重作为FP32导入,构建者将校准网络,以找到适当的量化因子,从而将网络降低到INT8精度。此示例使用NvCaffeParser导入网络:

 // Import the Caffe model. When INT8 is requested the parser is still asked for
 // kFLOAT weights; every other precision is passed through unchanged.
 const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor =
            parser->parse(locateFile(mParams.prototxtFileName,mParams.dataDirs).c_str(),
            locateFile(mParams.weightsFileName,mParams.dataDirs).c_str(),*network,
            dataType == DataType::kINT8 ? DataType::kFLOAT : dataType);
  • 设置校准器

为INT8构建网络时,校准是一个额外的步骤。应用程序必须向TensorRT提供样本输入,即校准数据。然后,TensorRT将在FP32中执行推断,并收集有关中间激活层的统计信息,用于构建精度降低的INT8引擎。

创建INT8校准器

std::unique_ptr calibrator;

    config->setAvgTimingIterations(1);
    config->setMinTimingIterations(1);
    config->setMaxWOrkspaceSize(1_GiB);

(1)校准数据

 // The calibration batch stream is only created for INT8 builds.
 if(dataType == DataType::kINT8)
    {
        // Stream over the "train-images"/"train-labels" files, sized by the calibration batch size and count.
        MNISTBatchStream calibrationStream(mParams.calBatchSize,mParams.nbCalBatches,"train-images-idx3-ubyte",
                                           "train-labels-idx1-ubyte",mParams.dataDirs);
    }

MNISTBatchStream 类提供用于检索批处理数据的辅助方法。校准器使用批处理流对象,在校准时检索批处理数据。通常,批处理流类需要实现如下的IBatchStream接口:

//! Abstract interface for a stream of data batches and their labels.
class IBatchStream
{
public:
    //! Virtual destructor so implementations can be destroyed safely through an IBatchStream pointer.
    virtual ~IBatchStream() = default;

    virtual void reset(int firstBatch) = 0;
    //! Returns a bool; the callers in this file loop while it is true.
    virtual bool next() = 0;
    virtual void skip(int skipCount) = 0;
    //! Pointer to the data of a batch.
    virtual float* getBatch() = 0;
    //! Pointer to the labels of a batch.
    virtual float* getLabels() = 0;
    virtual int getBatchesRead() const = 0;
    virtual int getBatchSize() const = 0;
    virtual nvinfer1::Dims getDims() const = 0;
};

注意:校准数据必须代表运行时提供给TensorRT的输入;例如,对于图像分类网络,它不应该只包含一小部分类别的图像。对于ImageNet,大约500幅图像的校准就足够了。

(2)校准推理

应用程序必须实现“IInt8Calibrator”接口,用以提供校准数据,以及读取和写入校准表文件的辅助方法。TensorRT提供了4种IInt8Calibrator的派生接口:

IInt8EntropyCalibrator
IInt8EntropyCalibrator2
IInt8MinMaxCalibrator
IInt8LegacyCalibrator

本例中使用IInt8EntropyCalibrator2

 // Create the entropy calibrator on top of the calibration batch stream.
 calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>(
                             calibrationStream,0,mParams.networkName.c_str(),mParams.inputTensorNames[0].c_str()));

校准器对象需要使用校准批处理流。

为了执行校准,接口必须提供“getBatchSize()”和“getBatch()”的实现,可以从batchstream对象检索数据。

构建器在开始时调用一次“getBatchSize()”方法,以获取校准集的批次大小。

// Hand the calibrator to the builder config as a raw pointer; `calibrator` keeps ownership.
config->setInt8Calibrator(calibrator.get());

然后重复调用“getBatch()”方法,从应用程序中获取批次数据,直到该方法返回false为止。每个校准批次必须包含与批次大小相同数量的图像。

 // Returns mData's storage advanced by mBatchCount * mBatchSize * volume(mDims) elements.
 float* getBatch() override
    {
        const auto batchOffset = mBatchCount * mBatchSize * samplesCommon::volume(mDims);
        return mData.data() + batchOffset;
    }
// Returns a pointer to the start of the mBatch buffer.
float* getBatch() override
    {
        return mBatch.data();
    }
 // Excerpt: loop for as long as the batch stream's next() returns true.
 while(batchStream.next())
    {
        // Exactly one input tensor name is expected.
        assert(mParams.inputTensorNames.size() == 1);
        // Pass the current batch to processInput; abort when it reports failure.
        if(!processInput(buffers,batchStream.getBatch()))
        {
            return false;
        }
     ...
 }

对于每个输入张量,必须将指向GPU内存中输入数据的指针写入绑定数组。名称数组包含输入张量的名称。绑定数组中每个张量的位置与其名称在名称数组中的位置相匹配。两个数组的大小都为“nbBindings”。由于校准步骤非常耗时,可以实现“writeCalibrationCache()”,将校准表写入适当的位置,以便在以后的运行中使用;然后通过“readCalibrationCache()”方法从所需位置读取校准表文件。校准期间,生成器将使用“readCalibrationCache()”检查校准文件是否存在。仅当校准文件不存在,或与当前TensorRT版本或校准器变体不兼容时,生成器才会重新校准。

(3)校准文件

校准文件存储网络中每个张量的激活比例(scale)。激活比例由校准算法生成的动态范围计算得到:scale = abs(最大动态范围) / 127.0f

校准文件名为“CalibrationTable<NetworkName>”,其中“<NetworkName>”是您的网络名称,例如“mnist”。该文件位于“TensorRT-x.x.x.x/data/mnist”目录中,其中“x.x.x.x”是您安装的TensorRT版本。

如果未找到“CalibrationTable<NetworkName>”文件,构建器将再次运行校准算法以创建它。校准表的内容包括:

TRT-7000-EntropyCalibration2 //格式:TRT-<TensorRT版本>-<校准算法>
//<layer name>: 校准期间为网络中的每个张量确定的浮点激活比例(以十六进制表示)
data: 3c008912
conv1: 3c88edfc
pool1: 3c88edfc
conv2: 3ddc858b
pool2: 3ddc858b
ip1: 3db6bd6e
ip2: 3e691968
prob: 3c010a14

“CalibrationTable”文件是在运行校准算法的构建阶段生成的。创建校准文件后,无需再次运行校准,即可读取该文件进行后续运行。您可以为“readCalibrationCache()”提供实现,以便从所需位置加载校准文件。如果读取的校准文件与校准器类型(用于生成该文件)和TensorRT版本兼容,则生成器将跳过校准步骤,使用校准文件中的每张量刻度值。

  • 配置生成器
 // Builder timing and workspace configuration.
 config->setAvgTimingIterations(1); // number of averaging timing iterations
    config->setMinTimingIterations(1); // minimum number of timing iterations
    config->setMaxWorkspaceSize(1_GiB); // maximum workspace size
    // Enable FP16 precision when half precision is requested.
    if(dataType == DataType::kHALF)
    {
        config->setFlag(BuilderFlag::kFP16);
    }
    // Enable INT8 precision when INT8 is requested.
    if(dataType == DataType::kINT8)
    {
        config->setFlag(BuilderFlag::kINT8);
    }
    // Set the maximum batch size.
    builder->setMaxBatchSize(mParams.batchSize);

    if(dataType == DataType::kINT8)
    {
        // Create the calibration batch stream.
        MNISTBatchStream calibrationStream(mParams.calBatchSize,mParams.nbCalBatches,"train-images-idx3-ubyte",
                                           "train-labels-idx1-ubyte",mParams.dataDirs);
        // Create the calibrator on top of the batch stream.
        calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>(
                             calibrationStream,0,mParams.networkName.c_str(),mParams.inputTensorNames[0].c_str()));
        // Pass the calibrator to the builder config.
        config->setInt8Calibrator(calibrator.get());
    }
  • 构建引擎
 // Build the engine and hold it in a shared_ptr that releases it through InferDeleter.
 mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
                builder->buildEngineWithConfig(*network,*config),samplesCommon::InferDeleter());
  • 运行引擎

输入输出仍然保持32-bit浮点数

bool SampleINT8::infer(std::vector& score,int firstScoreBatch, int nbScoreBatches)
{
    float ms{0.0f};
    //分配输出,输出内存buffer
    samplesCommon::BufferManager buffers(mEngine,mParams.batchSize);
    //创建执行上下文
    auto context = SampleUniquePtr(mEngine->createExecutionContext());
    if(!context)
    {
        return false;
    }
    MNISTBatchStream batchStream(mParams.batchSize,nbScoreBatches+firstScoreBatch,"train-images-idx3-ubyte",
                                 "train-labels-idx1-ubyte",mPArams.dataDirs);
    batchStream.skip(firstScoreBatch);
    //获取数据维度
    Dims outputDims = context->getEngine().getBindingDimensions(
                context->getEngine().getBindingIndex(mParams.outputTensorNames[0].c_str()));
    int outputSize = samplesCommon::volume(outputDims);
    int top1{0},top5{0};
    float totalTime{0.0f};
    while(batchStream.next())
    {
        assert(mParams.inputTensorNames.size() == 1);
        //读取输入数据生成managed buffer
        if(!processInput(buffers,batchStream.getBatch()))
        {
            return false;
        }
        //将输入数据从host拷贝到device中
        buffers.copyInputToDevice();
        cudaStream_t stream;
        CHECK(cudaSTreamCreate(&stream));

        cudaEvent_t start,end;
        CHECK(cudaEventCreateWithFlags(&start,cudaEventBlockingSync));
        CHECK(cudaEventCreatWithFlags(&end,cudaEventBlockingSync));
        cudaEventRecord(start,stream);
        //运行推理
 
        bool status = context->enqueue(mParams.batchSize,buffers.getDeviceBindings().data(),stream,nullptr);
        if(!status)
        {
            return false;
        }
        cudaEventRecord(end,stream);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&ms,start,end);
        cudaEventDestory(start);
        cudaEventDestory(end);

        totalTime += ms;
        //从device拷贝输出到host
        buffers.copyOutputToHost();

        CHECK(cudaStreamDestory(stream));
        //此示例输出FP32和INT8精度的Top-1和Top-5指标,
        //以及FP16(如果硬件本机支持)的Top-1和Top-5指标。这些数字应在1%以内。
        top1 += calculateScore(buffers,batchStream.getLabels(),mParams.batchSize,outputSize,1);
        top5 += calculateScore(buffers,batchStream.getLabels(),mPArams.batchSIze,outputSize,5);

        if(batchStream.getBatchesRead() % 100 ==0)
        {
            sample::gLogInfo <<"Processing next set of max 100 batches"<
  • 验证输出

7.5.2 sample_int8

#include "common/BatchStream.h"
#include "common/EntropyCalibrator.h"
#include "common/argsParser.h"
#include "common/buffers.h"
#include "common/common.h"
#include "common/logger.h"
#include "common/logging.h"

#include "NvCaffeParser.h"
#include "NvInfer.h"
#include <cuda_runtime_api.h>

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

//! Sample name handed to the logger's defineTest() in main().
const std::string gSampleName = "TensorRT.sample_int8";

//! Parameters of the INT8 sample, extending the common Caffe sample parameters.
struct SampleINT8Params : public samplesCommon::CaffeSampleParams
{
    int nbCalBatches{0};     //!< Number of calibration batches (zero until set by the caller)
    int calBatchSize{0};     //!< Calibration batch size (zero until set by the caller)
    std::string networkName; //!< Network name, passed to the calibrator
};
class SampleINT8
{
    template 
    using SampleUniquePtr = std::unique_ptr;
public:
    SampleINT8(const SampleINT8Params& params)
        : mParams(params)
        , mEngine(nullptr)
    {
        initLibNvInferPlugins(&sample::gLogger.getTRTLOgger(),"");
    }

    bool build(DataType dataType);
    bool isSupported(DataType dataType);
    bool infer(std::vector& score,int firstScoreBatch, int nbScoreBatches);
    bool teardown;
private:
    SampleINT8Params mParams;
    std::shared_ptr mEngine;

    nvinfer1::Dims mInputDims;

    bool constructNetwork(SampleUniquePtr& builder,
                          SampleUniquePtr& network,
                          SampleUniquePtr& config,
                          SampleUniquePtr& parser,
                          DataType datattype);
    bool processInput(const samplesCommon::BufferManager& buffers,const float* data);
    int calculateScore(const samplesCommon::BufferManager& buffers,float* labels,int batchSize,int outputSize,int threshold);
};
//! \brief Creates the builder, network, config and parser, then constructs the engine.
//!
//! \param dataType Precision to build for.
//! \return false when any object cannot be created, the platform lacks fast support
//!         for the requested precision, or network construction fails.
bool SampleINT8::build(DataType dataType)
{
    auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
    if (!builder)
    {
        return false;
    }

    auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetwork());
    if (!network)
    {
        return false;
    }

    auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if (!config)
    {
        return false;
    }

    auto parser = SampleUniquePtr<nvcaffeparser1::ICaffeParser>(nvcaffeparser1::createCaffeParser());
    if (!parser)
    {
        return false;
    }

    // Refuse precisions the platform cannot run natively.
    if ((dataType == DataType::kINT8 && !builder->platformHasFastInt8())
        || (dataType == DataType::kHALF && !builder->platformHasFastFp16()))
    {
        return false;
    }

    auto constructed = constructNetwork(builder, network, config, parser, dataType);
    if (!constructed)
    {
        return false;
    }

    // The sample expects a single, three-dimensional input.
    assert(network->getNbInputs() == 1);
    mInputDims = network->getInput(0)->getDimensions();
    assert(mInputDims.nbDims == 3);

    return true;
}
//! \brief Checks whether the platform has fast support for the given precision.
//!
//! \return false when the builder cannot be created, or when INT8 / FP16 is requested
//!         and the builder reports no fast support for it; true otherwise.
bool SampleINT8::isSupported(DataType dataType)
{
    auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
    if (!builder)
    {
        return false;
    }

    const bool int8Unsupported = dataType == DataType::kINT8 && !builder->platformHasFastInt8();
    const bool fp16Unsupported = dataType == DataType::kHALF && !builder->platformHasFastFp16();
    return !(int8Unsupported || fp16Unsupported);
}
bool SampleINT8::constructNetwork(SampleUniquePtr &builder,
                                  SampleUniquePtr &network,
                                  SampleUniquePtr &config, int &parser, int datattype)
{
    mEngine = nullptr;
    const nvcaffeparser1;:IBlobNameToTensor* blobNameToTensor =
            parser->parse(locateFile(mParams.prototxtFileName,mParams.dataDirs).c_str(),
            locateFile(mParams.weightsFileName,mParams.dataDirs).c_str(),*network,
            dataType == DtatType::kINT8 ? DataType::kFLOAT : dataType);
    for(auto & s: mPrams.outputTensorNames)
    {
        network->markOutput(*blobNameToTensor->find(s.c_str()));
    }
    std::unique_ptr calibrator;

    config->setAvgTimingIterations(1);
    config->setMinTimingIterations(1);
    config->setMaxWorkspaceSize(1_GiB);
    if(dataType == DataType::kHALF)
    {
        config->setFlag(BuilderFlag::kFP16);
    }
    if(dataType == DataType::kINT8)
    {
        config->setFlag(BuilderFlag::kINT8);
    }
    builder->setMaxBatchSize(mParams.batchSize);
    if(dataType == DataType::kINT8)
    {
        MNISTBatchStream calibrationStream(mParams.calBatchSize,mParams.nbCalBatches,"train-images-idx3-ubyte",
                                           "train-labels-idx1-ubyte",mParams.dataDirs);
        calibrator.reset(new Int8EntropyCalibrator2(
                             calibrationStream,0,mParams.networkName.c_str(),mParams.inputTensorNames[0].c_str()));
        config->setInt8Calibrator(calibrator.get());
    }
    if(mParams.dlaCore >= 0)
    {
        samplesCommon::enableDLA(builder.get(),config.get(),mParams.dlaCore);
        if(mParams.batchSize > builder->getMaxDLABatchSize())
        {
            sample::gLogError << "Requested batch size "<getMaxDLABatchSize()
                              << ". Reducing batch size accordingly."<(
                builder->buildEngineWithConfig(*network,*config),samplesCommon::InferDeleter());
    if(!mEngine)
    {
        return false;
    }
    return true;
}
bool SampleINT8::infer(std::vector& score,int firstScoreBatch, int nbScoreBatches)
{
    float ms{0.0f};
    samplesCommon::BufferManager buffers(mEngine,mParams.batchSize);
    auto context = SampleUniquePtr(mEngine->createExecutionContext());
    if(!context)
    {
        return false;
    }
    MNISTBatchStream batchStream(mParams.batchSize,nbScoreBatches+firstScoreBatch,"train-images-idx3-ubyte",
                                 "train-labels-idx1-ubyte",mPArams.dataDirs);
    batchStream.skip(firstScoreBatch);

    Dims outputDims = context->getEngine().getBindingDimensions(
                context->getEngine().getBindingIndex(mParams.outputTensorNames[0].c_str()));
    int outputSize = samplesCommon::volume(outputDims);
    int top1{0},top5{0};
    float totalTime{0.0f};
    while(batchStream.next())
    {
        assert(mParams.inputTensorNames.size() == 1);
        if(!processInput(buffers,batchStream.getBatch()))
        {
            return false;
        }
        buffers.copyInputToDevice();
        cudaStream_t stream;
        CHECK(cudaSTreamCreate(&stream));

        cudaEvent_t start,end;
        CHECK(cudaEventCreateWithFlags(&start,cudaEventBlockingSync));
        CHECK(cudaEventCreatWithFlags(&end,cudaEventBlockingSync));
        cudaEventRecord(start,stream);

        bool status = context->enqueue(mParams.batchSize,buffers.getDeviceBindings().data(),stream,nullptr);
        if(!status)
        {
            return false;
        }
        cudaEventRecord(end,stream);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&ms,start,end);
        cudaEventDestory(start);
        cudaEventDestory(end);

        totalTime += ms;

        buffers.copyOutputToHost();

        CHECK(cudaStreamDestory(stream));

        top1 += calculateScore(buffers,batchStream.getLabels(),mParams.batchSize,outputSize,1);
        top5 += calculateScore(buffers,batchStream.getLabels(),mPArams.batchSIze,outputSize,5);

        if(batchStream.getBatchesRead() % 100 ==0)
        {
            sample::gLogInfo <<"Processing next set of max 100 batches"<(buffers.getHstBuffer(mParams.inputTensorNames[0]));
    std::memcpy(hostDataBuffer,data,mParams.batchSize*samplesCommon::volume(mInputDims)*sizeof(float));
    return true;
}
//! \brief Counts the samples in a batch whose labelled class ranks within the top \p threshold.
//!
//! For every sample, the number of classes whose probability is >= the probability of the
//! labelled class is counted; the sample is a success when that count is <= \p threshold.
//!
//! \param labels One label per sample, stored as float and used as a class index.
//! \param outputSize Number of probabilities per sample.
//! \return Number of successful samples in the batch.
int SampleINT8::calculateScore(
        const samplesCommon::BufferManager& buffers, float* labels, int batchSize, int outputSize, int threshold)
{
    float* probs = static_cast<float*>(buffers.getHostBuffer(mParams.outputTensorNames[0]));

    int success = 0;
    for (int i = 0; i < batchSize; i++)
    {
        const float* prob = probs + outputSize * i;
        const float correct = prob[static_cast<int>(labels[i])];

        int better = 0;
        for (int j = 0; j < outputSize; j++)
        {
            if (prob[j] >= correct)
            {
                better++;
            }
        }
        if (better <= threshold)
        {
            success++;
        }
    }
    return success;
}
//! \brief Builds the sample parameters from the parsed command-line arguments.
//!
//! \param args Parsed arguments; dataDirs and useDLACore are read from it.
//! \param batchSize Batch size used for inference.
//! \return Parameters for the MNIST LeNet Caffe model with 10 calibration batches of 50 images.
SampleINT8Params initializeSampleParams(const samplesCommon::Args& args, int batchSize)
{
    SampleINT8Params params;
    // Search the user-supplied directories first, then the default "data/" directory.
    params.dataDirs = args.dataDirs;
    params.dataDirs.emplace_back("data/");

    params.batchSize = batchSize;
    params.dlaCore = args.useDLACore;
    params.nbCalBatches = 10;
    params.calBatchSize = 50;
    params.inputTensorNames.push_back("data");
    params.outputTensorNames.push_back("prob");
    params.prototxtFileName = "deploy.prototxt";
    params.weightsFileName = "mnist_lenet.caffemodel";
    params.networkName = "mnist";
    return params;
}
//! \brief Prints the command-line usage of the sample to standard output.
void printHelpInfo()
{
    // The <...> placeholders of the usage line are spelled out explicitly.
    std::cout << "Usage: ./sample_int8 [-h or --help] [-d or --datadir=<path to data directory>] "
                 "[--useDLACore=<int>]"
              << std::endl;
    std::cout << "--help, -h      Display help information" << std::endl;
    std::cout << "--datadir       Specify path to a data directory, overriding the default. This option can be used "
                 "multiple times to add multiple directories."
              << std::endl;
    std::cout << "--useDLACore=N  Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, "
                 "where n is the number of DLA engines on the platform."
              << std::endl;
    std::cout << "batch=N         Set batch size (default = 32)." << std::endl;
    std::cout << "start=N         Set the first batch to be scored (default = 16). All batches before this batch will "
                 "be used for calibration."
              << std::endl;
    std::cout << "score=N         Set the number of batches to be scored (default = 1800)." << std::endl;
}
int main(int argc,char** argv)
{
    if(argc >= 2 && (!strncmp(argv[1],"--help",6) || !strncmp(argv[1],"-h",2)))
    {
        printHelpInfo();
        return EXIT_SUCCESS;
    }
    int natchSize = 32;
    int firstScoreBatch = 16;
    int nbSxoreBatches = 18000;

    for(int i=1;i 60000)
    {
        sample::gLogError << "Only 60000 images available" << std::endl;
        return EXIT_FAILURE;
    }
    samplesCommon::Args args;
    samplesCommon::parseArgs(args, argc, argv);

    SampleINT8 sample(initializeSampleParams(args, batchSize));

    auto sampleTest = sample::gLogger.defineTest(gSampleName, argc, argv);

    sample::gLogger.reportTestStart(sampleTest);

    sample::gLogInfo << "Building and running a GPU inference engine for INT8 sample" << std::endl;

    std::vector dataTypeNames = {"FP32", "FP16", "INT8"};
    std::vector topNames = {"Top1", "Top5"};
    std::vector dataTypes = {DataType::kFLOAT, DataType::kHALF, DataType::kINT8};
    std::vector> scores(3, std::vector(2, 0.0f));
    for (size_t i = 0; i < dataTypes.size(); i++)
    {
        sample::gLogInfo << dataTypeNames[i] << " run:" << nbScoreBatches << " batches of size " << batchSize
                         << " starting at " << firstScoreBatch << std::endl;

        if (!sample.build(dataTypes[i]))
        {
            if (!sample.isSupported(dataTypes[i]))
            {
                sample::gLogWarning << "Skipping " << dataTypeNames[i]
                                    << " since the platform does not support this data type." << std::endl;
                continue;
            }
            return sample::gLogger.reportFail(sampleTest);
        }
        if (!sample.infer(scores[i], firstScoreBatch, nbScoreBatches))
        {
            return sample::gLogger.reportFail(sampleTest);
        }
    }

    auto isApproximatelyEqual = [](float a, float b, double tolerance) { return (std::abs(a - b) <= tolerance); };
    const double tolerance{0.01};
    const double goldenMNIST{0.99};

    if ((scores[0][0] < goldenMNIST) || (scores[0][1] < goldenMNIST))
    {
        sample::gLogError << "FP32 accuracy is less than 99%: Top1 = " << scores[0][0] << ", Top5 = " << scores[0][1]
                          << "." << std::endl;
        return sample::gLogger.reportFail(sampleTest);
    }

    for (unsigned i = 0; i < topNames.size(); i++)
    {
        for (unsigned j = 1; j < dataTypes.size(); j++)
        {
            if (scores[j][i] != 0.0f && !isApproximatelyEqual(scores[0][i], scores[j][i], tolerance))
            {
                sample::gLogError << "FP32(" << scores[0][i] << ") and " << dataTypeNames[j] << "(" << scores[j][i]
                                  << ") " << topNames[i] << " accuracy differ by more than " << tolerance << "."
                                  << std::endl;
                return sample::gLogger.reportFail(sampleTest);
            }
        }
    }

    if (!sample.teardown())
    {
        return sample::gLogger.reportFail(sampleTest);
    }

    return sample::gLogger.reportPass(sampleTest);

}

你可能感兴趣的:(笔记,pytorch,自动驾驶,深度学习)