Defining a network for INT8 execution is exactly the same as defining it for any other precision. Weights are imported as FP32, and the builder calibrates the network to find the quantization factors that reduce it to INT8 precision. This sample imports the network with NvCaffeParser:
const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor =
parser->parse(locateFile(mParams.prototxtFileName,mParams.dataDirs).c_str(),
locateFile(mParams.weightsFileName,mParams.dataDirs).c_str(),*network,
dataType == DataType::kINT8 ? DataType::kFLOAT : dataType);
Calibration is an additional step when building a network for INT8. The application must provide TensorRT with sample inputs, the calibration data. TensorRT then runs inference in FP32 and collects statistics about the intermediate activation layers, which it uses to build the reduced-precision INT8 engine.
Create the INT8 calibrator:
std::unique_ptr<IInt8Calibrator> calibrator;
config->setAvgTimingIterations(1);
config->setMinTimingIterations(1);
config->setMaxWorkspaceSize(1_GiB);
(1) Calibration data
if(dataType == DataType::kINT8)
{
MNISTBatchStream calibrationStream(mParams.calBatchSize,mParams.nbCalBatches,"train-images-idx3-ubyte",
"train-labels-idx1-ubyte",mParams.dataDirs);
}
The MNISTBatchStream class provides helper methods for retrieving batch data; the calibrator uses a batch stream object to fetch batches during calibration. A typical batch stream interface looks like this:
class IBatchStream
{
public:
virtual void reset(int firstBatch) = 0;
virtual bool next() = 0;
virtual void skip(int skipCount) = 0;
virtual float* getBatch() = 0;
virtual float* getLabels() = 0;
virtual int getBatchesRead() const = 0;
virtual int getBatchSize() const = 0;
virtual nvinfer1::Dims getDims() const = 0;
};
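For data other than MNIST, you would implement this interface over your own storage. Below is a minimal in-memory sketch; the class name VectorBatchStream and its members are illustrative, not part of the sample, and it assumes NvInfer.h, <vector>, and the sample's common.h (for nvinfer1::Dims and samplesCommon::volume) are available:
// Illustrative IBatchStream backed by pre-loaded host buffers. mData holds
// nbBatches * batchSize samples stored contiguously with the shape in dims.
class VectorBatchStream : public IBatchStream
{
public:
    VectorBatchStream(int batchSize, int nbBatches, nvinfer1::Dims dims,
        std::vector<float> data, std::vector<float> labels)
        : mBatchSize(batchSize)
        , mNbBatches(nbBatches)
        , mDims(dims)
        , mData(std::move(data))
        , mLabels(std::move(labels))
    {
    }
    void reset(int firstBatch) override { mBatchCount = firstBatch; }
    bool next() override
    {
        if (mBatchCount >= mNbBatches)
        {
            return false;
        }
        ++mBatchCount;
        return true;
    }
    void skip(int skipCount) override { mBatchCount += skipCount; }
    // Return the batch most recently advanced to by next().
    float* getBatch() override
    {
        return mData.data() + (mBatchCount - 1) * mBatchSize * samplesCommon::volume(mDims);
    }
    float* getLabels() override { return mLabels.data() + (mBatchCount - 1) * mBatchSize; }
    int getBatchesRead() const override { return mBatchCount; }
    int getBatchSize() const override { return mBatchSize; }
    nvinfer1::Dims getDims() const override { return mDims; }

private:
    int mBatchSize{0};
    int mNbBatches{0};
    int mBatchCount{0};
    nvinfer1::Dims mDims{};
    std::vector<float> mData;
    std::vector<float> mLabels;
};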
Note: the calibration data must be representative of the inputs provided to TensorRT at runtime; for example, for an image classification network it should not consist of images from only a small subset of the categories. For ImageNet, calibrating on about 500 images is sufficient.
(2) Calibration inference
The application must implement the IInt8Calibrator interface to provide the calibration data, plus helper methods for reading and writing the calibration table file. TensorRT provides four implementations of IInt8Calibrator:
IInt8EntropyCalibrator
IInt8EntropyCalibrator2
IInt8MinMaxCalibrator
IInt8LegacyCalibrator
This sample uses IInt8EntropyCalibrator2:
calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>(
calibrationStream,0,mParams.networkName.c_str(),mParams.inputTensorNames[0].c_str()));
The calibrator object requires a calibration batch stream. To perform calibration, the interface must provide implementations of getBatchSize() and getBatch() that retrieve data from the batch stream object. The builder calls getBatchSize() once at the start to obtain the batch size of the calibration set.
config->setInt8Calibrator(calibrator.get());
It then calls getBatch() repeatedly to obtain batches from the application until the method returns false. Each calibration batch must contain the number of images specified as the batch size.
// MNISTBatchStream::getBatch(): the full data set lives in mData, so return
// a pointer offset to the current batch.
float* getBatch() override
{
return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims));
}
// BatchStream::getBatch(): only the current batch is kept in mBatch.
float* getBatch() override
{
return mBatch.data();
}
while(batchStream.next())
{
assert(mParams.inputTensorNames.size() == 1);
if(!processInput(buffers,batchStream.getBatch()))
{
return false;
}
...
}
For each input tensor, a pointer to the input data in GPU memory must be written into the bindings array. The names array contains the names of the input tensors, and the position of each tensor in the bindings array matches the position of its name in the names array; both arrays have size nbBindings. Because the calibration step is time-consuming, you can implement writeCalibrationCache to write the calibration table to a suitable location for reuse in later runs, and read it back with the readCalibrationCache method. During calibration, the builder checks whether a calibration file already exists by calling readCalibrationCache(); it re-calibrates only if the file does not exist or is incompatible with the current TensorRT version or with the calibrator variant that generated it.
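For reference, here is a minimal sketch of these three methods, modeled on the Int8EntropyCalibrator2 template in the sample's EntropyCalibrator.h; the members mStream, mDeviceInput, mInputCount, mInputBlobName, mCacheFileName, and mCalibrationCache are illustrative, and the usual sample headers (<fstream>, <iterator>, <cassert>, common.h for CHECK) are assumed:
// Copy the next batch from the batch stream to device memory and publish it
// through the bindings array; returning false ends calibration.
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
    if (!mStream.next())
    {
        return false;
    }
    CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], mInputBlobName)); // single-input network assumed
    bindings[0] = mDeviceInput;
    return true;
}
// Return the cached calibration table if present; a nullptr return makes
// the builder run calibration from scratch.
const void* readCalibrationCache(size_t& length) override
{
    mCalibrationCache.clear();
    std::ifstream input(mCacheFileName, std::ios::binary);
    input >> std::noskipws;
    if (input.good())
    {
        std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
            std::back_inserter(mCalibrationCache));
    }
    length = mCalibrationCache.size();
    return length ? mCalibrationCache.data() : nullptr;
}
// Persist the calibration table so later runs can skip calibration.
void writeCalibrationCache(const void* cache, size_t length) override
{
    std::ofstream output(mCacheFileName, std::ios::binary);
    output.write(reinterpret_cast<const char*>(cache), length);
}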
(3) Calibration file
The calibration file stores an activation scale for each tensor in the network. Scales are computed from the dynamic range determined by the calibration algorithm: scale = abs(maximum dynamic range) / 127.0f; for example, a dynamic range of 1.0 gives a scale of about 0.00787.
The calibration file is named CalibrationTable<NetworkName>, where <NetworkName> is the name of your network, for example mnist. The file is located in the TensorRT-x.x.x.x/data/mnist directory, where x.x.x.x is your installed TensorRT version.
If no CalibrationTable file is found, the builder runs the calibration algorithm again to create one. The contents of the CalibrationTable look like this:
TRT-7000-EntropyCalibration2 // TRT-<TensorRT version used for calibration>-<calibration algorithm>
// <layer name>: <floating-point activation scale determined for this tensor during calibration>
data: 3c008912
conv1: 3c88edfc
pool1: 3c88edfc
conv2: 3ddc858b
pool2: 3ddc858b
ip1: 3db6bd6e
ip2: 3e691968
prob: 3c010a14
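Each scale in the table is the hexadecimal encoding of an IEEE-754 float. As a quick sanity check, the first entry above can be decoded with a few lines of standalone C++; decodeScale is a hypothetical helper, not part of the sample:
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
// Reinterpret the hex digits of a CalibrationTable entry as float bits.
float decodeScale(const std::string& hex)
{
    std::uint32_t bits = static_cast<std::uint32_t>(std::stoul(hex, nullptr, 16));
    float scale;
    std::memcpy(&scale, &bits, sizeof(scale));
    return scale;
}
int main()
{
    float scale = decodeScale("3c008912");            // the "data" entry above
    std::cout << "scale = " << scale                  // ~0.00785
              << ", dynamic range = " << scale * 127.0f // ~0.996, i.e. inputs in [0,1]
              << std::endl;
    return 0;
}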
The CalibrationTable file is generated during the build phase, while the calibration algorithm runs. Once it has been created, subsequent runs can read it instead of recalibrating. You can provide an implementation of readCalibrationCache() to load the calibration file from a desired location. If the file is compatible with the calibrator type that generated it and with the current TensorRT version, the builder skips the calibration step and uses the per-tensor scale values from the file. Putting the build steps together, the INT8 configuration in the sample's constructNetwork() looks like this:
config->setAvgTimingIterations(1); // set the average number of timing iterations
config->setMinTimingIterations(1); // set the minimum number of timing iterations
config->setMaxWorkspaceSize(1_GiB); // set the maximum workspace size
// in addition to FP32, allow the builder to use FP16 (kHALF) kernels
if(dataType == DataType::kHALF)
{
config->setFlag(BuilderFlag::kFP16);
}
// in addition to FP32, allow the builder to use INT8 kernels
if(dataType == DataType::kINT8)
{
config->setFlag(BuilderFlag::kINT8);
}
// set the maximum batch size
builder->setMaxBatchSize(mParams.batchSize);
if(dataType == DataType::kINT8)
{
// create the calibration batch stream
MNISTBatchStream calibrationStream(mParams.calBatchSize,mParams.nbCalBatches,"train-images-idx3-ubyte",
"train-labels-idx1-ubyte",mParams.dataDirs);
// create the calibrator
calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>(
calibrationStream,0,mParams.networkName.c_str(),mParams.inputTensorNames[0].c_str()));
// pass the calibrator object to the builder config
config->setInt8Calibrator(calibrator.get());
}
mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
builder->buildEngineWithConfig(*network,*config),samplesCommon::InferDeleter());
The network inputs and outputs remain 32-bit floating point.
bool SampleINT8::infer(std::vector<float>& score,int firstScoreBatch, int nbScoreBatches)
{
float ms{0.0f};
// allocate host and device buffers for the inputs and outputs
samplesCommon::BufferManager buffers(mEngine,mParams.batchSize);
// create the execution context
auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
if(!context)
{
return false;
}
MNISTBatchStream batchStream(mParams.batchSize,nbScoreBatches+firstScoreBatch,"train-images-idx3-ubyte",
"train-labels-idx1-ubyte",mPArams.dataDirs);
batchStream.skip(firstScoreBatch);
// get the output dimensions
Dims outputDims = context->getEngine().getBindingDimensions(
context->getEngine().getBindingIndex(mParams.outputTensorNames[0].c_str()));
int outputSize = samplesCommon::volume(outputDims);
int top1{0},top5{0};
float totalTime{0.0f};
while(batchStream.next())
{
assert(mParams.inputTensorNames.size() == 1);
// read the input data into the managed buffer
if(!processInput(buffers,batchStream.getBatch()))
{
return false;
}
// copy the input data from host to device
buffers.copyInputToDevice();
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
cudaEvent_t start,end;
CHECK(cudaEventCreateWithFlags(&start,cudaEventBlockingSync));
CHECK(cudaEventCreateWithFlags(&end,cudaEventBlockingSync));
cudaEventRecord(start,stream);
// run inference
bool status = context->enqueue(mParams.batchSize,buffers.getDeviceBindings().data(),stream,nullptr);
if(!status)
{
return false;
}
cudaEventRecord(end,stream);
cudaEventSynchronize(end);
cudaEventElapsedTime(&ms,start,end);
cudaEventDestroy(start);
cudaEventDestroy(end);
totalTime += ms;
// copy the output from device to host
buffers.copyOutputToHost();
CHECK(cudaStreamDestroy(stream));
// This sample prints Top-1 and Top-5 metrics for FP32 and INT8 precisions,
// as well as for FP16 if the hardware natively supports it. The numbers should be within 1% of each other.
top1 += calculateScore(buffers,batchStream.getLabels(),mParams.batchSize,outputSize,1);
top5 += calculateScore(buffers,batchStream.getLabels(),mParams.batchSize,outputSize,5);
if(batchStream.getBatchesRead() % 100 == 0)
{
sample::gLogInfo << "Processing next set of max 100 batches" << std::endl;
}
}
int imagesRead = (batchStream.getBatchesRead() - firstScoreBatch) * mParams.batchSize;
score[0] = float(top1) / float(imagesRead);
score[1] = float(top5) / float(imagesRead);
return true;
}
The complete listing of the sample (sampleINT8.cpp) follows:
#include "common/BatchStream.h"
#include "common/EntropyCalibrator.h"
#include "common/argsParser.h"
#include "common/buffers.h"
#include "common/common.h"
#include "common/logger.h"
#include "common/logging.h"
#include "NvCaffeParser.h"
#include "NvInfer.h"
#include <cstdlib>
#include <cstring>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <sstream>
using namespace nvinfer1;
const std::string gSampleName = "TensorRT.sample_int8";
struct SampleINT8Params : public samplesCommon::CaffeSampleParams
{
int nbCalBatches;
int calBatchSize;
std::string networkName;
};
class SampleINT8
{
template <typename T>
using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;
public:
SampleINT8(const SampleINT8Params& params)
: mParams(params)
, mEngine(nullptr)
{
initLibNvInferPlugins(&sample::gLogger.getTRTLOgger(),"");
}
bool build(DataType dataType);
bool isSupported(DataType dataType);
bool infer(std::vector<float>& score,int firstScoreBatch, int nbScoreBatches);
bool teardown();
private:
SampleINT8Params mParams;
std::shared_ptr<nvinfer1::ICudaEngine> mEngine;
nvinfer1::Dims mInputDims;
bool constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
SampleUniquePtr<nvinfer1::INetworkDefinition>& network,
SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
SampleUniquePtr<nvcaffeparser1::ICaffeParser>& parser,
DataType dataType);
bool processInput(const samplesCommon::BufferManager& buffers,const float* data);
int calculateScore(const samplesCommon::BufferManager& buffers,float* labels,int batchSize,int outputSize,int threshold);
};
bool SampleINT8::build(DataType dataType)
{
auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
if(!builder)
{
return false;
}
auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetwork());
if(!network)
{
return false;
}
auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
if(!config)
{
return false;
}
auto parser = SampleUniquePtr<nvcaffeparser1::ICaffeParser>(nvcaffeparser1::createCaffeParser());
if(!parser)
{
return false;
}
if((dataType == DataType::kINT8 && !builder->platformHasFastInt8())
|| (dataType == DataType::kHALF && !builder->platformHasFastFp16()))
{
return false;
}
auto constructed = constructNetwork(builder,network,config,parser,dataType);
if(!constructed)
{
return false;
}
assert(network->getNbInputs() == 1);
mInputDims = network->getInput(0)->getDimensions();
assert(mInputDims.nbDims == 3);
return true;
}
bool SampleINT8::isSupported(DataType dataType)
{
auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
if(!builder)
{
return false;
}
if((dataType == DataType::kINT8 && !builder->platformHasFastInt8())
|| (dataType == DataType::kHALF && !builder->platformHasFastFp16()))
{
return false;
}
return true;
}
bool SampleINT8::constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
SampleUniquePtr<nvinfer1::INetworkDefinition>& network,
SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
SampleUniquePtr<nvcaffeparser1::ICaffeParser>& parser, DataType dataType)
{
mEngine = nullptr;
const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor =
parser->parse(locateFile(mParams.prototxtFileName,mParams.dataDirs).c_str(),
locateFile(mParams.weightsFileName,mParams.dataDirs).c_str(),*network,
dataType == DataType::kINT8 ? DataType::kFLOAT : dataType);
for(auto& s : mParams.outputTensorNames)
{
network->markOutput(*blobNameToTensor->find(s.c_str()));
}
std::unique_ptr<IInt8Calibrator> calibrator;
config->setAvgTimingIterations(1);
config->setMinTimingIterations(1);
config->setMaxWorkspaceSize(1_GiB);
if(dataType == DataType::kHALF)
{
config->setFlag(BuilderFlag::kFP16);
}
if(dataType == DataType::kINT8)
{
config->setFlag(BuilderFlag::kINT8);
}
builder->setMaxBatchSize(mParams.batchSize);
if(dataType == DataType::kINT8)
{
MNISTBatchStream calibrationStream(mParams.calBatchSize,mParams.nbCalBatches,"train-images-idx3-ubyte",
"train-labels-idx1-ubyte",mParams.dataDirs);
calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>(
calibrationStream,0,mParams.networkName.c_str(),mParams.inputTensorNames[0].c_str()));
config->setInt8Calibrator(calibrator.get());
}
if(mParams.dlaCore >= 0)
{
samplesCommon::enableDLA(builder.get(),config.get(),mParams.dlaCore);
if(mParams.batchSize > builder->getMaxDLABatchSize())
{
sample::gLogError << "Requested batch size "<getMaxDLABatchSize()
<< ". Reducing batch size accordingly."<(
builder->buildEngineWithConfig(*network,*config),samplesCommon::InferDeleter());
if(!mEngine)
{
return false;
}
return true;
}
bool SampleINT8::infer(std::vector<float>& score,int firstScoreBatch, int nbScoreBatches)
{
float ms{0.0f};
samplesCommon::BufferManager buffers(mEngine,mParams.batchSize);
auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
if(!context)
{
return false;
}
MNISTBatchStream batchStream(mParams.batchSize,nbScoreBatches+firstScoreBatch,"train-images-idx3-ubyte",
"train-labels-idx1-ubyte",mPArams.dataDirs);
batchStream.skip(firstScoreBatch);
Dims outputDims = context->getEngine().getBindingDimensions(
context->getEngine().getBindingIndex(mParams.outputTensorNames[0].c_str()));
int outputSize = samplesCommon::volume(outputDims);
int top1{0},top5{0};
float totalTime{0.0f};
while(batchStream.next())
{
assert(mParams.inputTensorNames.size() == 1);
if(!processInput(buffers,batchStream.getBatch()))
{
return false;
}
buffers.copyInputToDevice();
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
cudaEvent_t start,end;
CHECK(cudaEventCreateWithFlags(&start,cudaEventBlockingSync));
CHECK(cudaEventCreateWithFlags(&end,cudaEventBlockingSync));
cudaEventRecord(start,stream);
bool status = context->enqueue(mParams.batchSize,buffers.getDeviceBindings().data(),stream,nullptr);
if(!status)
{
return false;
}
cudaEventRecord(end,stream);
cudaEventSynchronize(end);
cudaEventElapsedTime(&ms,start,end);
cudaEventDestroy(start);
cudaEventDestroy(end);
totalTime += ms;
buffers.copyOutputToHost();
CHECK(cudaStreamDestroy(stream));
top1 += calculateScore(buffers,batchStream.getLabels(),mParams.batchSize,outputSize,1);
top5 += calculateScore(buffers,batchStream.getLabels(),mParams.batchSize,outputSize,5);
if(batchStream.getBatchesRead() % 100 == 0)
{
sample::gLogInfo << "Processing next set of max 100 batches" << std::endl;
}
}
int imagesRead = (batchStream.getBatchesRead() - firstScoreBatch) * mParams.batchSize;
score[0] = float(top1) / float(imagesRead);
score[1] = float(top5) / float(imagesRead);
sample::gLogInfo << "Top1: " << score[0] << ", Top5: " << score[1] << std::endl;
sample::gLogInfo << "Processing " << imagesRead << " images averaged " << totalTime / imagesRead
<< " ms/image and " << totalTime / batchStream.getBatchesRead() << " ms/batch." << std::endl;
return true;
}
bool SampleINT8::processInput(const samplesCommon::BufferManager& buffers,const float* data)
{
// fill the host input buffer
float* hostDataBuffer = static_cast<float*>(buffers.getHostBuffer(mParams.inputTensorNames[0]));
std::memcpy(hostDataBuffer,data,mParams.batchSize*samplesCommon::volume(mInputDims)*sizeof(float));
return true;
}
int SampleINT8::calculateScore(
const samplesCommon::BufferManager& buffers,float* labels,int batchSize,int outputSize,int threshold)
{
float* probs = static_cast<float*>(buffers.getHostBuffer(mParams.outputTensorNames[0]));
int success = 0;
for(int i = 0; i < batchSize; i++)
{
float* prob = probs + outputSize * i;
float correct = prob[(int) labels[i]];
int better = 0;
for(int j = 0; j < outputSize; j++)
{
if(prob[j] >= correct)
{
better++;
}
}
if(better <= threshold)
{
success++;
}
}
return success;
}
SampleINT8Params initializeSampleParams(const samplesCommon::Args& args,int batchSize)
{
SampleINT8Params params;
params.dataDirs = args.dataDirs;
params.dataDirs.emplace_back("data/");
params.batchSize = batchSize;
params.dlaCore = args.useDLACore;
params.nbCalBatches = 10;
params.calBatchSize = 50;
params.inputTensorNames.push_back("data");
params.outputTensorNames.push_back("prob");
params.prototxtFileName = "deploy.prototxt";
params.weightsFileName = "mnist_lenet.caffemodel";
params.networkName = "mnist";
return params;
}
void printHelpInfo()
{
std::cout << "Usage: ./sample_int8 [-h or --help] [-d or --datadir=] "
"[--useDLACore=]"
<< std::endl;
std::cout << "--help, -h Display help information" << std::endl;
std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used "
"multiple times to add multiple directories."
<< std::endl;
std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, "
"where n is the number of DLA engines on the platform."
<< std::endl;
std::cout << "batch=N Set batch size (default = 32)." << std::endl;
std::cout << "start=N Set the first batch to be scored (default = 16). All batches before this batch will "
"be used for calibration."
<< std::endl;
std::cout << "score=N Set the number of batches to be scored (default = 1800)." << std::endl;
}
int main(int argc,char** argv)
{
if(argc >= 2 && (!strncmp(argv[1],"--help",6) || !strncmp(argv[1],"-h",2)))
{
printHelpInfo();
return EXIT_SUCCESS;
}
// by default, score 1800 batches of size 32, starting at batch 16
int batchSize = 32;
int firstScoreBatch = 16;
int nbScoreBatches = 1800;
// parse the extra options batch=N, start=N, and score=N
for(int i = 1; i < argc; ++i)
{
if(!strncmp(argv[i],"batch=",6))
{
batchSize = atoi(argv[i] + 6);
}
else if(!strncmp(argv[i],"start=",6))
{
firstScoreBatch = atoi(argv[i] + 6);
}
else if(!strncmp(argv[i],"score=",6))
{
nbScoreBatches = atoi(argv[i] + 6);
}
}
if(batchSize > 128)
{
sample::gLogError << "Please provide batch size <= 128" << std::endl;
return EXIT_FAILURE;
}
if((firstScoreBatch + nbScoreBatches) * batchSize > 60000)
{
sample::gLogError << "Only 60000 images available" << std::endl;
return EXIT_FAILURE;
}
samplesCommon::Args args;
samplesCommon::parseArgs(args, argc, argv);
SampleINT8 sample(initializeSampleParams(args, batchSize));
auto sampleTest = sample::gLogger.defineTest(gSampleName, argc, argv);
sample::gLogger.reportTestStart(sampleTest);
sample::gLogInfo << "Building and running a GPU inference engine for INT8 sample" << std::endl;
std::vector dataTypeNames = {"FP32", "FP16", "INT8"};
std::vector topNames = {"Top1", "Top5"};
std::vector dataTypes = {DataType::kFLOAT, DataType::kHALF, DataType::kINT8};
std::vector> scores(3, std::vector(2, 0.0f));
for (size_t i = 0; i < dataTypes.size(); i++)
{
sample::gLogInfo << dataTypeNames[i] << " run:" << nbScoreBatches << " batches of size " << batchSize
<< " starting at " << firstScoreBatch << std::endl;
if (!sample.build(dataTypes[i]))
{
if (!sample.isSupported(dataTypes[i]))
{
sample::gLogWarning << "Skipping " << dataTypeNames[i]
<< " since the platform does not support this data type." << std::endl;
continue;
}
return sample::gLogger.reportFail(sampleTest);
}
if (!sample.infer(scores[i], firstScoreBatch, nbScoreBatches))
{
return sample::gLogger.reportFail(sampleTest);
}
}
auto isApproximatelyEqual = [](float a, float b, double tolerance) { return (std::abs(a - b) <= tolerance); };
const double tolerance{0.01};
const double goldenMNIST{0.99};
if ((scores[0][0] < goldenMNIST) || (scores[0][1] < goldenMNIST))
{
sample::gLogError << "FP32 accuracy is less than 99%: Top1 = " << scores[0][0] << ", Top5 = " << scores[0][1]
<< "." << std::endl;
return sample::gLogger.reportFail(sampleTest);
}
for (unsigned i = 0; i < topNames.size(); i++)
{
for (unsigned j = 1; j < dataTypes.size(); j++)
{
if (scores[j][i] != 0.0f && !isApproximatelyEqual(scores[0][i], scores[j][i], tolerance))
{
sample::gLogError << "FP32(" << scores[0][i] << ") and " << dataTypeNames[j] << "(" << scores[j][i]
<< ") " << topNames[i] << " accuracy differ by more than " << tolerance << "."
<< std::endl;
return sample::gLogger.reportFail(sampleTest);
}
}
}
if (!sample.teardown())
{
return sample::gLogger.reportFail(sampleTest);
}
return sample::gLogger.reportPass(sampleTest);
}
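Assuming the sample builds to a sample_int8 binary as shown in the help text above, a typical run that scores 1800 batches of 32 images starting at batch 16 looks like:
./sample_int8 batch=32 start=16 score=1800 --datadir=<path to data directory>
The program prints Top1/Top5 accuracy for FP32, FP16 (when natively supported), and INT8, and the test fails if a reduced-precision score differs from FP32 by more than the 0.01 tolerance.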