References:
What is the TensorFlow Lite GPU delegate and how does it accelerate model inference - Zhihu
Build TensorFlow Lite for Android: https://www.tensorflow.org/lite/android/lite_build
TensorFlowLite + ArmNN neural network inference - CSDN blog
Install Bazel and Android Prerequisites
Bazel is the primary build system for TensorFlow. To build with it, you must have it and the Android NDK and SDK installed on your system.
NDK version: 20.1.5948944. The NDK can be downloaded on its own, or with the sdkmanager tool described below.
Android Studio's SDK Manager also works. For example, to use adb inside WSL2 you need the adb downloaded by the SDK Manager of the Android Studio installed on Windows.
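One common setup (a sketch; the Windows-side SDK path is an assumption, adjust it to your installation) is to call the Windows adb.exe from inside WSL2 so that both sides share the same adb server:
alias adb='/mnt/c/Users/<user>/AppData/Local/Android/Sdk/platform-tools/adb.exe'
adb devices  # should list the devices attached to the Windows host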
Download the sdkmanager command-line tool:
https://developer.android.com/studio/command-line/sdkmanager
List the installable packages:
First install Java:
sudo apt install -y default-jre default-jdk
./bin/sdkmanager --list --channel=0 --sdk_root=./sdk
For example:
build-tools;33.0.0
cmake;3.18.1
cmdline-tools;latest
emulator
ndk;20.1.5948944
platform-tools
platforms;android-30
sources;android-32
Install the SDK, NDK, and related packages:
./bin/sdkmanager \
"platform-tools" \
"platforms;android-30" \
"sources;android-30" \
"cmake;3.18.1" \
"build-tools;33.0.0" \
"cmdline-tools;latest" \
"ndk;24.0.8215888" \
--channel=0 --sdk_root=./sdk
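If a package fails to install because its license has not been accepted yet, the licenses can be accepted with the same tool:
./bin/sdkmanager --licenses --sdk_root=./sdk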
The SDK is for Java development and the NDK for C++ development; apps usually call C++ from Java via JNI: declare native methods in Java, generate the C++ header from them, then implement the functions in C++.
TensorFlow Bazel build issues - Luchang-Li's CSDN blog
git clone -b v2.9.1 https://github.com/tensorflow/tensorflow.git
Run ./configure.
Configuration options: answer y for the Android build, set the NDK path (ndk-bundle) to the NDK under the SDK directory, e.g. path/sdk/ndk/20.1.5948944/, and set the Android SDK path, e.g. path/sdk/; the remaining options can keep their defaults.
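The same settings can also be supplied non-interactively through environment variables read by the configure script (a sketch; the variable names come from tensorflow/configure.py and may differ between versions, and the paths and API levels are examples):
export ANDROID_SDK_HOME=path/sdk
export ANDROID_NDK_HOME=path/sdk/ndk/20.1.5948944
export ANDROID_NDK_API_LEVEL=21
export ANDROID_API_LEVEL=30
export ANDROID_BUILD_TOOLS_VERSION=33.0.0
export TF_SET_ANDROID_WORKSPACE=1
./configure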
Run the Bazel build:
bazel build -c opt --config=android_arm64 --cpu=arm64-v8a tensorflow/lite:libtensorflowlite.so
Build output: bazel-bin/tensorflow/lite/libtensorflowlite.so
Using XNNPACK:
The XNNPACK backend on Windows, Linux, and Mac is enabled via a build-time opt-in mechanism. When building TensorFlow Lite with Bazel, simply add --define tflite_with_xnnpack=true and the TensorFlow Lite interpreter will use the XNNPACK backend by default.
Alternatively, it can be enabled in code (see the XNNPACK delegate snippet further below).
bazel build -c opt --config=android_arm64 --cpu=arm64-v8a --define tflite_with_xnnpack=true tensorflow/lite:libtensorflowlite.so
Linking a GPU program against only libtensorflowlite.so fails with "undefined reference to TfLiteGpuDelegateOptionsV2Default"; the GPU delegate also needs to be built and linked.
Building the GPU delegate
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/gpu_advanced.md
bazel build -c opt --config android_arm64 --cpu=arm64-v8a tensorflow/lite/delegates/gpu:libtensorflowlite_gpu_delegate.so # for dynamic library
Build output: bazel-bin/tensorflow/lite/delegates/gpu/libtensorflowlite_gpu_delegate.so
Following the steps above, first build libtensorflowlite.so and libtensorflowlite_gpu_delegate.so.
Build script build.sh reference:
Developing C++ command-line programs on Android with the NDK and CMake - Luchang-Li's CSDN blog
Run build.sh together with the source code and CMakeLists.txt below to build the command-line executable.
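A minimal build.sh sketch (assuming the NDK's CMake toolchain file; the NDK path, ABI, and API level are examples):
#!/bin/bash
NDK=path/sdk/ndk/20.1.5948944
cmake -B build -S . \
  -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI=arm64-v8a \
  -DANDROID_PLATFORM=android-29 \
  -DCMAKE_BUILD_TYPE=Release
cmake --build build -j8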
main.cpp:
Note the inference priority settings below: when a performance-oriented priority is configured, TFLite by default runs FP32 models with FP16 arithmetic, which is faster.
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>

#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/tools/gen_op_registration.h"
#include "tensorflow/lite/delegates/gpu/delegate.h"
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"

using namespace std;

int main(int argc, char* argv[]) {
  if (argc <= 1) {
    cout << "please set input model path" << endl;
    return -1;
  }
  int warmup_num = 20;
  int eval_num = 20;

  std::string model_path = argv[1];
  cout << "model_path: " << model_path << endl;

  // Load the flatbuffer model and build the interpreter.
  std::unique_ptr<tflite::FlatBufferModel> model =
      tflite::FlatBufferModel::BuildFromFile(model_path.c_str());
  if (!model) {
    printf("Failed to load model\n");
    exit(0);
  }
  tflite::ops::builtin::BuiltinOpResolver resolver;
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::InterpreterBuilder(*model.get(), resolver)(&interpreter);

  // Create the GPU delegate; the priority/preference settings let TFLite run
  // FP32 models with FP16 arithmetic for better performance.
  TfLiteGpuDelegateOptionsV2 gpu_options = TfLiteGpuDelegateOptionsV2Default();
  gpu_options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY;
  gpu_options.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE;
  gpu_options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
  gpu_options.inference_preference = TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED;
  tflite::Interpreter::TfLiteDelegatePtr gpu_delegate(
      TfLiteGpuDelegateV2Create(&gpu_options),
      [](TfLiteDelegate* delegate) { TfLiteGpuDelegateV2Delete(delegate); });
  if (interpreter->ModifyGraphWithDelegate(gpu_delegate.get()) != kTfLiteOk) {
    std::cout << "modify graph by gpu delegate failed" << std::endl;
    return 1;
  }

  /*
  // use xnnpack delegate
  int cpu_num_threads = 4;
  TfLiteXNNPackDelegateOptions xnnpack_opts = TfLiteXNNPackDelegateOptionsDefault();
  // xnnpack_opts.num_threads = cpu_num_threads;
  tflite::Interpreter::TfLiteDelegatePtr xnnpack_delegate(
      TfLiteXNNPackDelegateCreate(&xnnpack_opts),
      [](TfLiteDelegate* delegate) { TfLiteXNNPackDelegateDelete(delegate); });
  interpreter->ModifyGraphWithDelegate(xnnpack_delegate.get());
  */

  // Resize input tensors, if desired.
  interpreter->AllocateTensors();
  // float* input = interpreter->typed_input_tensor<float>(0);

  // Dummy input for testing: warm up, then time eval_num runs.
  for (int i = 0; i < warmup_num; i++) {
    interpreter->Invoke();
  }
  auto t_eval1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < eval_num; i++) {
    interpreter->Invoke();
  }
  auto t_eval2 = std::chrono::high_resolution_clock::now();
  auto duration_eval =
      std::chrono::duration_cast<std::chrono::microseconds>(t_eval2 - t_eval1).count();
  float eval_time = duration_eval / 1000.0f / eval_num;

  cout << "model_path:" << model_path << endl;
  cout << "eval time ms:" << eval_time << endl;

  // float* output = interpreter->typed_output_tensor<float>(0);
  // printf("Result is: %f\n", *output);
  return 0;
}
CMakeLists.txt:
cmake_minimum_required(VERSION 3.10)
project(cmake_study LANGUAGES CXX)
# set(CMAKE_CXX_STANDARD 11)
# without these flags, the cmake generated binary file is much bigger than ndk-build
# you can also pass -DCMAKE_C_FLAGS="-s" to the CMake call.
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
# add_compile_options(-fPIC)
add_executable(
tflite_eval_gpu
main.cpp
)
target_include_directories(
tflite_eval_gpu
PUBLIC
tensorflow/
flatbuffers-2.0.6/include/
)
target_link_libraries(
tflite_eval_gpu
PUBLIC
tflite_release_2.11/libtensorflowlite.so
tflite_release_2.11/libtensorflowlite_gpu_delegate.so
log # liblog.so not found
z
EGL
GLESv2
)
Download the flatbuffers source matching the version configured in the TensorFlow repo and reference its headers.
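The version can be looked up in the TF source tree and the matching release downloaded, for example (a sketch; the workspace.bzl path follows the TF repo layout and may move between versions):
grep strip_prefix tensorflow/third_party/flatbuffers/workspace.bzl
wget https://github.com/google/flatbuffers/archive/refs/tags/v2.0.6.tar.gz
tar xf v2.0.6.tar.gz  # yields flatbuffers-2.0.6/include referenced by the CMakeLists.txt above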
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
// use xnnpack delegate
int cpu_num_threads = 4;
TfLiteXNNPackDelegateOptions xnnpack_opts =
TfLiteXNNPackDelegateOptionsDefault();
// xnnpack_opts.num_threads = cpu_num_threads;
tflite::Interpreter::TfLiteDelegatePtr xnnpack_delegate(
TfLiteXNNPackDelegateCreate(&xnnpack_opts),
[](TfLiteDelegate* delegate) { TfLiteXNNPackDelegateDelete(delegate); });
interpreter->ModifyGraphWithDelegate(xnnpack_delegate.get());
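Once tflite_eval_gpu is built, push it to a device together with the shared libraries and run it via adb, for example (the directories and model name are examples):
adb shell mkdir -p /data/local/tmp/tflite_test
adb push build/tflite_eval_gpu /data/local/tmp/tflite_test/
adb push tflite_release_2.11/libtensorflowlite.so /data/local/tmp/tflite_test/
adb push tflite_release_2.11/libtensorflowlite_gpu_delegate.so /data/local/tmp/tflite_test/
adb push mobilenet_v2.tflite /data/local/tmp/tflite_test/
adb shell "cd /data/local/tmp/tflite_test && chmod +x tflite_eval_gpu && LD_LIBRARY_PATH=. ./tflite_eval_gpu mobilenet_v2.tflite"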
ArmNN delegate
Here we directly use the prebuilt .so files from https://github.com/ARM-software/armnn/releases. The current release is paired with tf-2.5.0, so download TensorFlow v2.5.0, build tflite from it, and then reference both sets of .so files and headers to compile and test.
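For example (a sketch; the release asset name is inferred from the directory names used in the CMakeLists.txt below and may differ for other releases):
wget https://github.com/ARM-software/armnn/releases/download/v22.05.01/ArmNN-android-29-arm64-v8.2-a.tar.gz
mkdir -p armnn-22.05.01 && tar xf ArmNN-android-29-arm64-v8.2-a.tar.gz -C armnn-22.05.01
git clone -b v2.5.0 https://github.com/tensorflow/tensorflow.git tensorflow-v2.5.0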
main.cpp
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "armnn_delegate.hpp"
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/tools/gen_op_registration.h"

using uint8 = unsigned char;

size_t GetElemNum(const std::vector<int>& shape) {
  int elem_num = 1;
  for (auto elem : shape) {
    elem_num *= elem;
  }
  return elem_num;
}

// Fill a uint8 tensor with random data.
void RandInit(uint8* data, const std::vector<int>& shape) {
  size_t elem_num = GetElemNum(shape);
  for (size_t i = 0; i < elem_num; i++) {
    data[i] = uint8(rand());
  }
}

int main(int argc, char* argv[]) {
  std::string model_path = "models/inception_v3_quant.tflite";
  std::string device_type = "gpu";  // set to "cpu" to use the CpuAcc backend

  std::unique_ptr<tflite::FlatBufferModel> model =
      tflite::FlatBufferModel::BuildFromFile(model_path.c_str());
  if (!model) {
    printf("Failed to mmap model\n");
    return 1;
  }
  tflite::ops::builtin::BuiltinOpResolver resolver;
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::InterpreterBuilder(*model.get(), resolver)(&interpreter);

  // Backend options for the ArmNN GPU and CPU backends.
  std::vector<armnn::BackendOptions> backendOptions;
  armnn::BackendOptions gpuAcc(
      "GpuAcc", {{"FastMathEnabled", true},
                 // {"SaveCachedNetwork", m_SaveCachedNetwork},
                 // {"CachedNetworkFilePath", m_CachedNetworkFilePath},
                 // {"MLGOTuningFilePath", m_MLGOTuningFilePath}
                });
  unsigned int numberOfThreads = 4;
  armnn::BackendOptions cpuAcc(
      "CpuAcc",
      {{"FastMathEnabled", true}, {"NumberOfThreads", numberOfThreads}});

  // Create the ArmNN Delegate, such as GpuAcc, CpuAcc
  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
  if (device_type == "cpu") {
    backends = {armnn::Compute::CpuAcc};
    std::cout << "create CPU delegate" << std::endl;
    backendOptions.push_back(cpuAcc);
  } else {
    std::cout << "create GPU delegate" << std::endl;
    backendOptions.push_back(gpuAcc);
  }
  armnnDelegate::DelegateOptions delegateOptions(backends, backendOptions);
  // armnn::OptimizerOptions optimizerOptions;
  // optimizerOptions.m_ReduceFp32ToFp16 = true;
  // armnn::BackendOptions modelOptionGpu("GpuAcc", {{"FastMathEnabled", true}, {"TuningLevel", 3}});
  // optimizerOptions.m_ModelOptions.push_back(modelOptionGpu);
  // armnnDelegate::DelegateOptions delegateOptions(armnn::Compute::GpuAcc, optimizerOptions);
  std::unique_ptr<TfLiteDelegate, decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)>
      theArmnnDelegate(
          armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions),
          armnnDelegate::TfLiteArmnnDelegateDelete);
  // Modify armnnDelegateInterpreter to use armnnDelegate
  interpreter->ModifyGraphWithDelegate(theArmnnDelegate.get());

  // Resize input tensors, if desired.
  interpreter->AllocateTensors();

  // uint8[1,299,299,3]
  uint8* input = interpreter->typed_input_tensor<uint8>(0);
  // TfLiteTensor* input_tensor = interpreter->input_tensor(0);
  // void* input = input_tensor->data.data;
  RandInit(input, {1, 299, 299, 3});

  int warm_up_num = 50;
  int eval_num = 50;
  if (argc >= 3) {
    warm_up_num = atoi(argv[1]);
    eval_num = atoi(argv[2]);
  }
  std::cout << "model_path: " << model_path << std::endl;

  // Warm up, then time eval_num runs.
  for (int i = 0; i < warm_up_num; i++) {
    interpreter->Invoke();
  }
  auto t1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < eval_num; i++) {
    interpreter->Invoke();
  }
  auto t2 = std::chrono::high_resolution_clock::now();
  auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
  float mean_time = duration / 1000.0f / eval_num;
  std::cout << "mean time per eval (ms): " << mean_time << std::endl;

  // uint8[1,1001]
  uint8* output = interpreter->typed_output_tensor<uint8>(0);
  // TfLiteTensor* output_tensor = interpreter->output_tensor(0);
  // void* output = output_tensor->data.data;
  printf("Result is: %d\n", output[0]);
  return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(cmake_study LANGUAGES CXX)
# set(CMAKE_CXX_STANDARD 11)
# without these flags, the cmake generated binary file is much bigger than ndk-build
# you can also pass -DCMAKE_C_FLAGS="-s" to the CMake call.
# set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
# add_compile_options(-fPIC)
set(TF_VER "tensorflow-v2.5.0")
add_executable(
main
src/main.cpp
)
target_include_directories(
main
PUBLIC
/root/codes/tflite_test/${TF_VER}/
${CMAKE_CURRENT_SOURCE_DIR}/flatbuffers-1.12.0/include/
/root/codes/armnn-22.05.01/armnn-22.05.01/delegate/include/
/root/codes/armnn-22.05.01/armnn-22.05.01/include/
/root/codes/armnn-22.05.01/armnn-22.05.01/profiling/
)
target_link_libraries(
main
PUBLIC
/root/codes/tflite_test/${TF_VER}/tflite_release/libtensorflowlite.so
/root/codes/armnn-22.05.01/ArmNN-android-29-arm64-v8.2-a/libarmnn.so
/root/codes/armnn-22.05.01/ArmNN-android-29-arm64-v8.2-a/libarmnnDelegate.so
log # liblog.so not found
z
)
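As with the GPU-delegate binary, push the ArmNN test program, its libraries, and the model to the device and run it via adb (a sketch; the library paths mirror the CMakeLists.txt above, and the two arguments are the warm-up and eval counts read by main.cpp):
adb shell mkdir -p /data/local/tmp/armnn_test/models
adb push build/main /data/local/tmp/armnn_test/
adb push /root/codes/armnn-22.05.01/ArmNN-android-29-arm64-v8.2-a/libarmnn.so /data/local/tmp/armnn_test/
adb push /root/codes/armnn-22.05.01/ArmNN-android-29-arm64-v8.2-a/libarmnnDelegate.so /data/local/tmp/armnn_test/
adb push /root/codes/tflite_test/tensorflow-v2.5.0/tflite_release/libtensorflowlite.so /data/local/tmp/armnn_test/
adb push models/inception_v3_quant.tflite /data/local/tmp/armnn_test/models/
adb shell "cd /data/local/tmp/armnn_test && chmod +x main && LD_LIBRARY_PATH=. ./main 50 50"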