Super-resolution restoration/reconstruction takes a low-resolution (LR) input and upscales it to a high-resolution output; for the underlying principles, see the PaddleGAN introduction. For how to accelerate this network with TensorRT on Linux, there is already an article that documents the process in detail.
My environment: Windows with Visual Studio, TensorRT 7.2.3.4, CUDA 11.5, OpenCV 4.5.0 and an sm_86 (RTX 30-series) GPU (the same versions that appear in the CMakeLists below).
The author of Real-ESRGAN (the same author as ESRGAN) made a number of improvements on top of ESRGAN. There is a blog post that describes these improvements in detail; if you want to understand the theory, read it together with the author's paper.
For the network design, you can refer to the official ESRGAN code walkthrough to understand the overall architecture; I excerpt part of the ONNX node graph here.
Since the inference network (i.e. the generator) reuses the Generator from ESRGAN, namely the Residual-in-Residual Dense Block (RRDB), the reimplementation is still based on ESRGAN.
I skipped that part entirely; I gave it a quick try and my machine could not handle it.
Main reference: the tensorrtx code (the real-esrgan example in that repo).
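Install the dependencies of the official Real-ESRGAN repo first (these are the standard installation commands from its README, run inside the cloned repo):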
pip install basicsr
pip install facexlib
pip install gfpgan
pip install -r requirements.txt
python setup.py develop
3) Download the weight file. In the official Real-ESRGAN PyTorch (Python) repo, create an experiments/pretrained_models subdirectory and copy the weight file into it; then copy the gen_wts.py script from the tensorrtx/real-esrgan/ directory into the official Real-ESRGAN repo and run
python gen_wts.py
After it finishes, a real-esrgan.wts file is generated.
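For reference, the generated .wts file is a plain-text weight dump in the usual tensorrtx format: the first line holds the number of tensors, and each following line holds a tensor name, its element count, and the values as hex-encoded 32-bit floats. Below is a minimal sketch of how such a file can be parsed into a TensorRT weight map; it mirrors what loadWeights() in the project's common.hpp does, but treat it as illustrative rather than the exact project code.
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <map>
#include <string>
#include "NvInfer.h"
// Parse a tensorrtx-style .wts file:
//   line 1: <number of tensors>
//   then per tensor: <name> <element count> <hex float> <hex float> ...
std::map<std::string, nvinfer1::Weights> loadWtsSketch(const std::string& file) {
    std::map<std::string, nvinfer1::Weights> weightMap;
    std::ifstream input(file);
    int32_t count = 0;
    input >> count;
    while (count-- > 0) {
        nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
        std::string name;
        uint32_t size = 0;
        input >> name >> std::dec >> size;
        // values are stored as the raw IEEE-754 bit patterns of 32-bit floats
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t i = 0; i < size; ++i) input >> std::hex >> val[i];
        wt.values = val; // released with free() after the engine is built
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}
The CMakeLists.txt for the Windows build follows: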
cmake_minimum_required(VERSION 3.0)
project(real-esrgan) #1
set(OpenCV_DIR "D:\\opencv\\build") #2
set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include) #3
set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc15\\lib) #4
set(OpenCV_Debug_LIBS "opencv_world450d.lib") #5
set(OpenCV_Release_LIBS "opencv_world450.lib") #6
set(TRT_DIR "D:\\lbq\\TensorRT-7.2.3.4") #7
set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include) #8
set(TRT_LIB_DIRS ${TRT_DIR}\\lib) #9
set(Dirent_INCLUDE_DIRS "D:\\lbq\\dirent\\include") #10
add_definitions(-std=c++14)
set(CUDA_BIN_PATH "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.5")
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Release)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads)
# setup CUDA
find_package(CUDA REQUIRED)
message(STATUS " libraries: ${CUDA_LIBRARIES}")
message(STATUS " include path: ${CUDA_INCLUDE_DIRS}")
include_directories(${CUDA_INCLUDE_DIRS})
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++14;-g;-G;-gencode;arch=compute_86,code=sm_86)
####
enable_language(CUDA) # add this line, then no need to setup cuda path in vs
####
include_directories(${PROJECT_SOURCE_DIR}/include) #14
include_directories(${TRT_INCLUDE_DIRS}) #12
link_directories(${TRT_LIB_DIRS}) #13
include_directories(${OpenCV_INCLUDE_DIRS}) #14
link_directories(${OpenCV_LIB_DIRS}) #15
include_directories(${Dirent_INCLUDE_DIRS}) #16
# -D_MWAITXINTRIN_H_INCLUDED for solving error: identifier "__builtin_ia32_mwaitx" is undefined
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED")
# setup opencv
find_package(OpenCV QUIET
NO_MODULE
NO_DEFAULT_PATH
NO_CMAKE_PATH
NO_CMAKE_ENVIRONMENT_PATH
NO_SYSTEM_ENVIRONMENT_PATH
NO_CMAKE_PACKAGE_REGISTRY
NO_CMAKE_BUILDS_PATH
NO_CMAKE_SYSTEM_PATH
NO_CMAKE_SYSTEM_PACKAGE_REGISTRY
)
message(STATUS "OpenCV library status:")
message(STATUS " version: ${OpenCV_VERSION}")
message(STATUS " lib path: ${OpenCV_LIB_DIRS}")
message(STATUS " Debug libraries: ${OpenCV_Debug_LIBS}")
message(STATUS " Release libraries: ${OpenCV_Release_LIBS}")
message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 86)
endif(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
add_executable(real-esrgan ${PROJECT_SOURCE_DIR}/real-esrgan.cpp ${PROJECT_SOURCE_DIR}/common.hpp
${PROJECT_SOURCE_DIR}/preprocess.cu ${PROJECT_SOURCE_DIR}/preprocess.hpp
${PROJECT_SOURCE_DIR}/postprocess.cu ${PROJECT_SOURCE_DIR}/postprocess.hpp
) #17
target_link_libraries(real-esrgan "nvinfer" "nvinfer_plugin") #18
target_link_libraries(real-esrgan debug ${OpenCV_Debug_LIBS}) #19
target_link_libraries(real-esrgan optimized ${OpenCV_Release_LIBS}) #20
target_link_libraries(real-esrgan ${CUDA_LIBRARIES}) #21
target_link_libraries(real-esrgan Threads::Threads)
Open the project with cmake-gui and adapt the paths and versions to your own setup; the steps are similar to yolov5. In the "Where is the source code" field enter the Real_ESRGAN_TRT directory, and in the "Where to build the binaries" field enter the Real_ESRGAN_TRT/buildnew directory. Then click 'Configure', 'Generate' and 'Open Project' to configure, generate and open the Visual Studio solution, and build it.
Copy the real-esrgan.wts generated earlier into the Release directory, and also copy (or symlink) the relevant TensorRT DLLs such as nvinfer.dll into that directory.
Open a command prompt and run real-esrgan.exe -s real-esrgan.wts real-esrgan_f32.engine to build the corresponding engine; this step may take up to half an hour depending on the GPU.
Finally, put -d real-esrgan_f32.engine ../samples into the VS command arguments. Make sure both paths are correct: the first points to the engine file and the second to the image directory. Then you can simply press F5 to run the program. The images are confidential, so I will not show them here.
I found that for certain images a traditional image-restoration algorithm is still needed as a preprocessing step, so I wrapped four traditional image-enhancement methods into the code; contact me if you are interested.
Choose one of the two approaches below (lines 257~273 of the code). Note that line 262 applies a traditional image-enhancement method, a step that is also crucial for improving the cell images. Four common traditional enhancement methods are integrated in the code, all in utils.h; some of their parameters need tuning, and the current choice is the gamma transform (gamma_transform).
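For reference, a gamma transform can be written in a few lines of OpenCV. The sketch below assumes an 8-bit BGR input and a tunable gamma value; the version in utils.h works along these lines, but its exact parameters may differ.
#include <cmath>
#include <opencv2/opencv.hpp>
// Gamma transform via a 256-entry lookup table: out = 255 * (in / 255)^gamma.
// gamma < 1 brightens dark regions, gamma > 1 darkens them; tune per image set.
cv::Mat gamma_transform_sketch(const cv::Mat& src, double gamma = 0.6) {
    cv::Mat lut(1, 256, CV_8U);
    uchar* p = lut.ptr();
    for (int i = 0; i < 256; ++i) {
        p[i] = cv::saturate_cast<uchar>(std::pow(i / 255.0, gamma) * 255.0);
    }
    cv::Mat dst;
    cv::LUT(src, lut, dst); // applied independently to each of the 3 channels
    return dst;
}
The complete real-esrgan.cpp used for engine building and inference follows.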
#include "cuda_utils.h"
#include "common.hpp"
#include "preprocess.hpp" // preprocess plugin
#include "postprocess.hpp" // postprocess plugin
#include "logging.h"
#include "utils.h"
#include <io.h> // access()
#include <iostream>
#include <fstream>
#include <chrono>
#define DEVICE 0 // GPU id
#define BATCH_SIZE 1
#define MAX_IMAGE_INPUT_SIZE_THRESH 4096 * 4096 // ensure it exceeds the maximum size of the input images!
// stuff we know about the network and the input/output blobs
static const int PRECISION_MODE = 32; // fp32 : 32, fp16 : 16
static const bool VISUALIZATION = false;
static const int INPUT_H = 1024;
static const int INPUT_W = 1024;
static const int INPUT_C = 3;
static const int OUT_SCALE = 4;
static const int OUTPUT_SIZE = INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE;
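// i.e. 3 * (1024*4) * (1024*4) = 50,331,648 elements; as uint8 output that is 48 MB per image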
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;
// Create the engine using only the API and not any parser: the author hand-builds the network with the TensorRT API instead of using a parser
ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) {
INetworkDefinition* network = builder->createNetworkV2(0U); // define the network
// Create input tensor of shape {INPUT_H, INPUT_W, INPUT_C} with name INPUT_BLOB_NAME
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ INPUT_H, INPUT_W, INPUT_C });
assert(data);
// generic tensorrtx helper that loads the .wts weight map
std::map<std::string, Weights> weightMap = loadWeights(wts_name);
// Preprocess: custom plugin (NHWC->NCHW, BGR->RGB, [0, 255]->[0, 1] normalization)
Preprocess preprocess{ maxBatchSize, INPUT_C, INPUT_H, INPUT_W };
// TensorRT Plugin: https://zhuanlan.zhihu.com/p/448241566
// Look up the registered PluginCreator for the preprocess plugin; many models in the tensorrtx project reuse this plugin
IPluginCreator* preprocess_creator = getPluginRegistry()->getPluginCreator("preprocess", "1");
// Create the custom plugin object and return it
IPluginV2* preprocess_plugin = preprocess_creator->createPlugin("preprocess_plugin", (PluginFieldCollection*)&preprocess);
// Add a plugin layer to the network using the IPluginV2 interface
IPluginV2Layer* preprocess_layer = network->addPluginV2(&data, 1, *preprocess_plugin);
// https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a0c670938a4aef867545f41b65c52cd93
preprocess_layer->setName("preprocess_layer");
ITensor* prep = preprocess_layer->getOutput(0);
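// Conceptually, the preprocess kernel computes for each pixel (h, w) and channel c:
//   prep[c][h][w] = input[h][w][2 - c] / 255.0f   (BGR->RGB swap plus [0, 1] normalization)
// so the engine's input binding stays uint8 HxWxC while the network itself sees float CxHxW.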
// What follows builds the whole RRDBNet generator / inference network; compare with the wts/onnx model files to see the individual nodes
// conv_first, the first convolution layer (feature extraction): input tensor is *prep, 64 output channels, 3x3 kernel, followed by the weight and bias values
IConvolutionLayer* conv_first = network->addConvolutionNd(*prep, 64, DimsHW{ 3, 3 }, weightMap["conv_first.weight"], weightMap["conv_first.bias"]);
conv_first->setStrideNd(DimsHW{ 1, 1 });
conv_first->setPaddingNd(DimsHW{ 1, 1 });
conv_first->setName("conv_first");
ITensor* feat = conv_first->getOutput(0);
// conv_body, https://www.cnblogs.com/carsonzhu/p/10967369.html
// inference_realesrgan.py
ITensor* body_feat = RRDB(network, weightMap, feat, "body.0");
// https://blog.csdn.net/qq_39751446/article/details/119970924
for (int idx = 1; idx < 23; idx++) { //num_block=23
// the RRDB implementation was written with reference to the links above
body_feat = RRDB(network, weightMap, body_feat, "body." + std::to_string(idx));
}
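// Note: each RRDB() call (helper defined in the project's common.hpp) builds one Residual-in-Residual
// Dense Block as in the ESRGAN paper: three Residual Dense Blocks, each made of five 3x3 convolutions
// with dense (concatenated) skip connections and LeakyReLU(0.2); every block output is scaled by 0.2
// and added back to its input. The trunk stacks 23 such RRDBs (num_block = 23).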
// **************** end of the RRDB trunk ****************
IConvolutionLayer* conv_body = network->addConvolutionNd(*body_feat, 64, DimsHW{ 3, 3 }, weightMap["conv_body.weight"], weightMap["conv_body.bias"]);
conv_body->setStrideNd(DimsHW{ 1, 1 });
conv_body->setPaddingNd(DimsHW{ 1, 1 });
IElementWiseLayer* ew1 = network->addElementWise(*feat, *conv_body->getOutput(0), ElementWiseOperation::kSUM);
feat = ew1->getOutput(0);
// upsample: in the onnx graph this starts at the Resize op that follows the last Add
// Add a resize layer using nearest-neighbour interpolation that doubles the spatial size of its input
IResizeLayer* interpolate_nearest = network->addResize(*feat);
float scales1[] = { 1, 2, 2 }; // scale factors for the channel, height and width dimensions
interpolate_nearest->setScales(scales1, 3);
interpolate_nearest->setResizeMode(ResizeMode::kNEAREST); // alternative: kLINEAR
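// This 2x nearest-neighbour upsample, together with the identical one below, yields the overall
// 4x super-resolution factor (OUT_SCALE = 4).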
IConvolutionLayer* conv_up1 = network->addConvolutionNd(*interpolate_nearest->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_up1.weight"], weightMap["conv_up1.bias"]);
conv_up1->setStrideNd(DimsHW{ 1, 1 });
conv_up1->setPaddingNd(DimsHW{ 1, 1 });
IActivationLayer* leaky_relu_1 = network->addActivation(*conv_up1->getOutput(0), ActivationType::kLEAKY_RELU);
leaky_relu_1->setAlpha(0.2);
IResizeLayer* interpolate_nearest2 = network->addResize(*leaky_relu_1->getOutput(0));
float scales2[] = { 1, 2, 2 };
interpolate_nearest2->setScales(scales2, 3);
interpolate_nearest2->setResizeMode(ResizeMode::kNEAREST);
IConvolutionLayer* conv_up2 = network->addConvolutionNd(*interpolate_nearest2->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_up2.weight"], weightMap["conv_up2.bias"]);
conv_up2->setStrideNd(DimsHW{ 1, 1 });
conv_up2->setPaddingNd(DimsHW{ 1, 1 });
IActivationLayer* leaky_relu_2 = network->addActivation(*conv_up2->getOutput(0), ActivationType::kLEAKY_RELU);
leaky_relu_2->setAlpha(0.2);
IConvolutionLayer* conv_hr = network->addConvolutionNd(*leaky_relu_2->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_hr.weight"], weightMap["conv_hr.bias"]);
conv_hr->setStrideNd(DimsHW{ 1, 1 });
conv_hr->setPaddingNd(DimsHW{ 1, 1 });
IActivationLayer* leaky_relu_hr = network->addActivation(*conv_hr->getOutput(0), ActivationType::kLEAKY_RELU);
leaky_relu_hr->setAlpha(0.2);
IConvolutionLayer* conv_last = network->addConvolutionNd(*leaky_relu_hr->getOutput(0), 3, DimsHW{ 3, 3 }, weightMap["conv_last.weight"], weightMap["conv_last.bias"]);
conv_last->setStrideNd(DimsHW{ 1, 1 });
conv_last->setPaddingNd(DimsHW{ 1, 1 });
ITensor* out = conv_last->getOutput(0);
// Postprocess: custom plugin (RGB->BGR, NCHW->NHWC, *255, round, uint8)
Postprocess postprocess{ maxBatchSize, out->getDimensions().d[0], out->getDimensions().d[1], out->getDimensions().d[2] };
IPluginCreator* postprocess_creator = getPluginRegistry()->getPluginCreator("postprocess", "1");
IPluginV2* postprocess_plugin = postprocess_creator->createPlugin("postprocess_plugin", (PluginFieldCollection*)&postprocess);
IPluginV2Layer* postprocess_layer = network->addPluginV2(&out, 1, *postprocess_plugin);
postprocess_layer->setName("postprocess_layer");
ITensor* final_tensor = postprocess_layer->getOutput(0);
final_tensor->setName(OUTPUT_BLOB_NAME);
network->markOutput(*final_tensor); // mark the network output
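// For reference, the postprocess plugin roughly computes, per pixel:
//   out[h][w][2 - c] = (uint8) round(prob[c][h][w] * 255.0f)   (clamping to [0, 255] is assumed here),
// so the engine's output binding already holds uint8 HxWxC pixels.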
// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16 MB = 16 * 2^20 bytes
if (PRECISION_MODE == 16) {
std::cout << "==== precision f16 ====" << std::endl << std::endl;
config->setFlag(BuilderFlag::kFP16);
}
else {
std::cout << "==== precision f32 ====" << std::endl << std::endl;
}
std::cout << "Building engine, please wait for a while..." << std::endl;
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
std::cout << "Build engine successfully!" << std::endl;
// Don't need the network any more
delete network;
// Release host memory
for (auto& mem : weightMap)
{
free((void*)(mem.second.values));
}
return engine;
}
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, std::string& wts_name) {
// Create the builder, which initializes the TensorRT library
IBuilder* builder = createInferBuilder(gLogger);
// Build configuration for the CudaEngine: optional IOptimizationProfile entries, max workspace size, max batch size, minimum acceptable precision level, FP16 mode, etc.
IBuilderConfig* config = builder->createBuilderConfig();
// Create model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
assert(engine != nullptr);
// Serialize the engine
(*modelStream) = engine->serialize();
// Close everything down
delete engine;
delete builder;
delete config;
}
void doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, uint8_t* output, int batchSize) {
// infer on the batch asynchronously, and DMA output back to host
context.enqueue(batchSize, buffers, stream, nullptr);
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
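// note: only batchSize * OUTPUT_SIZE bytes are copied back, since the postprocess plugin already emits uint8 NHWC pixels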
cudaStreamSynchronize(stream);
}
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir) {
if (argc < 4) return false;
if (std::string(argv[1]) == "-s" && argc == 4) {
wts = std::string(argv[2]);
engine = std::string(argv[3]);
}
else if (std::string(argv[1]) == "-d" && argc == 4) {
engine = std::string(argv[2]);
img_dir = std::string(argv[3]);
}
else {
return false;
}
return true;
}
// ./real-esrgan -s ./real-esrgan.wts ./real-esrgan_f32.engine
// ./real-esrgan -d ./real-esrgan_f32.engine ../samples
int main(int argc, char** argv) {
std::string wts_name = "";
std::string engine_name = "";
std::string img_dir;
if (!parse_args(argc, argv, wts_name, engine_name, img_dir)) {
std::cerr << "arguments not right!" << std::endl;
std::cerr << "./real-esrgan -s [.wts] [.engine] // serialize model to plan file" << std::endl;
std::cerr << "./real-esrgan -d [.engine] ../samples // deserialize plan file and run inference" << std::endl;
return -1;
}
// create a model using the API directly and serialize it to a stream
if (!wts_name.empty()) {
IHostMemory* modelStream{ nullptr };
APIToModel(BATCH_SIZE, &modelStream, wts_name);
assert(modelStream != nullptr);
std::ofstream p(engine_name, std::ios::binary);
if (!p) {
std::cerr << "could not open plan output file" << std::endl;
return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
delete modelStream;
return 0;
}
// deserialize the .engine and run inference
std::ifstream file(engine_name, std::ios::binary);
if (!file.good()) {
std::cerr << "read " << engine_name << " error!" << std::endl;
return -1;
}
char* trtModelStream = nullptr;
size_t size = 0;
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
std::vector<std::string> file_names;
std::cout << "img_dir:" << img_dir.c_str() << std::endl;
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
std::cerr << "read_files_in_dir failed." << std::endl;
return -1;
}
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
assert(engine->getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
assert(inputIndex == 0);
assert(outputIndex == 1);
// Create GPU buffers on device
CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(uint8_t)));
CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(uint8_t)));
std::vector<uint8_t> input(BATCH_SIZE * INPUT_H * INPUT_W * INPUT_C);
std::vector<uint8_t> outputs(BATCH_SIZE * OUTPUT_SIZE);
// Create stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
std::vector<cv::Mat> imgs_buffer(BATCH_SIZE);
for (int f = 0; f < (int)file_names.size(); f++) {
cv::Mat re_img;
for (int b = 0; b < BATCH_SIZE; b++) {
cv::Mat img = cv::imread(img_dir + "/" + file_names[f]);
if (img.empty()) continue;
// choose one of the two approaches below
// 1. static input only: the accepted C, H, W of the input image are fixed
// memcpy(input.data() + b * INPUT_H * INPUT_W * INPUT_C, img.data, INPUT_H * INPUT_W * INPUT_C);
// 2. supports inputs of multiple sizes
cv::Mat traditional_enhance_img = gamma_transform(img);
cv::Mat pr_img;
std::pair<cv::Mat, cv::Mat> preprocess_rst;
if (img.cols != INPUT_W && img.cols != INPUT_H)
{
preprocess_rst = preprocess_img(traditional_enhance_img, INPUT_W, INPUT_H); // letterbox: resize keeping the aspect ratio and pad
// std::cout << "img_dir:" << std::endl;
pr_img = preprocess_rst.first;
re_img = preprocess_rst.second; // reference image for comparison
}
else
{
pr_img = traditional_enhance_img;
}
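// at this point pr_img is expected to be exactly INPUT_W x INPUT_H x 3 (either the original image
// or the letterboxed version), so a flat memcpy into the input buffer is safe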
memcpy(input.data() + b * INPUT_H * INPUT_W * INPUT_C, pr_img.data, INPUT_H * INPUT_W * INPUT_C);
}
CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input.data(), BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(uint8_t), cudaMemcpyHostToDevice, stream));
// Run inference
auto start = std::chrono::system_clock::now();
doInference(*context, stream, (void**)buffers, outputs.data(), BATCH_SIZE);
auto end = std::chrono::system_clock::now();
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
cv::Mat frame = cv::Mat(INPUT_H * OUT_SCALE, INPUT_W * OUT_SCALE, CV_8UC3, outputs.data());
// remove the padding introduced for images whose size is not the standard input size
int dif_h = 0;
int dif_w = 0;
if (re_img.cols == INPUT_H && re_img.rows != INPUT_W)
{
dif_w = (INPUT_W - re_img.rows) * 2;
}
else if (re_img.cols != INPUT_H && re_img.rows == INPUT_W)
{
dif_h = (INPUT_H - re_img.rows) * 2;
}
// std::cout << dif_w << dif_h << std::endl;
cv::Mat result = frame(cv::Rect(dif_h, dif_w, INPUT_H * OUT_SCALE - 2 * dif_h, INPUT_W * OUT_SCALE - 2 * dif_w));
cv::imwrite("../_" + file_names[f], result);
if (VISUALIZATION) {
cv::imshow("result : " + file_names[0], frame);
cv::waitKey(0);
}
}
// Release stream and buffers
cudaStreamDestroy(stream);
CUDA_CHECK(cudaFree(buffers[inputIndex]));
CUDA_CHECK(cudaFree(buffers[outputIndex]));
// Destroy the engine
delete context;
delete engine;
delete runtime;
}