Tencent's NCNN framework ships with facilities for measuring model inference time and per-layer time. However, there is no documentation explaining how to use them, they are not enabled by default, and they do not report the average time of each layer. To get better speed measurements I modified the NCNN source, and this post records how the features below were implemented.
On a phone, given the ncnn param file of any network, you can get output like the following:
loop_count = 10
num_threads = 4
powersave = 2
output_blob_name: prob
mode: cls
resnet18.param :
Convolution conv1 8.014ms | feature_map: 224 x 224 inch: 3 outch: 64 kernel: 7 x 7 stride: 2
BatchNorm bn_conv1 0.336ms | feature_map: 112 x 112 inch: 64 outch: 64
Scale scale_conv1 0.218ms | feature_map: 112 x 112 inch: 64 outch: 64
ReLU conv1_relu 0.200ms | feature_map: 112 x 112 inch: 64 outch: 64
Pooling pool1 0.534ms | feature_map: 112 x 112 inch: 64 outch: 64 kernel: 3 x 3 stride: 2
Split splitncnn_0 0.001ms |
Convolution res2a_branch1 0.854ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 1 x 1
BatchNorm bn2a_branch1 0.052ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2a_branch1 0.028ms | feature_map: 56 x 56 inch: 64 outch: 64
Convolution res2a_branch2a 3.417ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 3 x 3
BatchNorm bn2a_branch2a 0.120ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2a_branch2a 0.078ms | feature_map: 56 x 56 inch: 64 outch: 64
ReLU res2a_branch2a_relu 0.047ms | feature_map: 56 x 56 inch: 64 outch: 64
Convolution res2a_branch2b 3.274ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 3 x 3
BatchNorm bn2a_branch2b 0.113ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2a_branch2b 0.139ms | feature_map: 56 x 56 inch: 64 outch: 64
Eltwise res2a 0.146ms |
ReLU res2a_relu 0.095ms | feature_map: 56 x 56 inch: 64 outch: 64
Split splitncnn_1 0.001ms |
Convolution res2b_branch2a 3.356ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 3 x 3
BatchNorm bn2b_branch2a 0.107ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2b_branch2a 0.085ms | feature_map: 56 x 56 inch: 64 outch: 64
ReLU res2b_branch2a_relu 0.106ms | feature_map: 56 x 56 inch: 64 outch: 64
Convolution res2b_branch2b 3.544ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 3 x 3
BatchNorm bn2b_branch2b 0.065ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2b_branch2b 0.128ms | feature_map: 56 x 56 inch: 64 outch: 64
Eltwise res2b 0.159ms |
ReLU res2b_relu 0.110ms | feature_map: 56 x 56 inch: 64 outch: 64
Split splitncnn_2 0.001ms |
Convolution res3a_branch1 0.679ms | feature_map: 56 x 56 inch: 64 outch: 128 kernel: 1 x 1 stride: 2
BatchNorm bn3a_branch1 0.044ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3a_branch1 0.119ms | feature_map: 28 x 28 inch: 128 outch: 128
Convolution res3a_branch2a 2.173ms | feature_map: 56 x 56 inch: 64 outch: 128 kernel: 3 x 3 stride: 2
BatchNorm bn3a_branch2a 0.071ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3a_branch2a 0.035ms | feature_map: 28 x 28 inch: 128 outch: 128
ReLU res3a_branch2a_relu 0.115ms | feature_map: 28 x 28 inch: 128 outch: 128
Convolution res3a_branch2b 2.586ms | feature_map: 28 x 28 inch: 128 outch: 128 kernel: 3 x 3
BatchNorm bn3a_branch2b 0.102ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3a_branch2b 0.045ms | feature_map: 28 x 28 inch: 128 outch: 128
Eltwise res3a 0.113ms |
ReLU res3a_relu 0.121ms | feature_map: 28 x 28 inch: 128 outch: 128
Split splitncnn_3 0.000ms |
Convolution res3b_branch2a 2.639ms | feature_map: 28 x 28 inch: 128 outch: 128 kernel: 3 x 3
BatchNorm bn3b_branch2a 0.067ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3b_branch2a 0.087ms | feature_map: 28 x 28 inch: 128 outch: 128
ReLU res3b_branch2a_relu 0.036ms | feature_map: 28 x 28 inch: 128 outch: 128
Convolution res3b_branch2b 2.722ms | feature_map: 28 x 28 inch: 128 outch: 128 kernel: 3 x 3
BatchNorm bn3b_branch2b 0.045ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3b_branch2b 0.085ms | feature_map: 28 x 28 inch: 128 outch: 128
Eltwise res3b 0.106ms |
ReLU res3b_relu 0.063ms | feature_map: 28 x 28 inch: 128 outch: 128
Split splitncnn_4 0.001ms |
Convolution res4a_branch1 0.960ms | feature_map: 28 x 28 inch: 128 outch: 256 kernel: 1 x 1 stride: 2
BatchNorm bn4a_branch1 0.016ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4a_branch1 0.057ms | feature_map: 14 x 14 inch: 256 outch: 256
Convolution res4a_branch2a 2.292ms | feature_map: 28 x 28 inch: 128 outch: 256 kernel: 3 x 3 stride: 2
BatchNorm bn4a_branch2a 0.036ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4a_branch2a 0.014ms | feature_map: 14 x 14 inch: 256 outch: 256
ReLU res4a_branch2a_relu 0.013ms | feature_map: 14 x 14 inch: 256 outch: 256
Convolution res4a_branch2b 3.235ms | feature_map: 14 x 14 inch: 256 outch: 256 kernel: 3 x 3
BatchNorm bn4a_branch2b 0.064ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4a_branch2b 0.013ms | feature_map: 14 x 14 inch: 256 outch: 256
Eltwise res4a 0.026ms |
ReLU res4a_relu 0.012ms | feature_map: 14 x 14 inch: 256 outch: 256
Split splitncnn_5 0.001ms |
Convolution res4b_branch2a 3.222ms | feature_map: 14 x 14 inch: 256 outch: 256 kernel: 3 x 3
BatchNorm bn4b_branch2a 0.076ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4b_branch2a 0.071ms | feature_map: 14 x 14 inch: 256 outch: 256
ReLU res4b_branch2a_relu 0.052ms | feature_map: 14 x 14 inch: 256 outch: 256
Convolution res4b_branch2b 3.355ms | feature_map: 14 x 14 inch: 256 outch: 256 kernel: 3 x 3
BatchNorm bn4b_branch2b 0.015ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4b_branch2b 0.012ms | feature_map: 14 x 14 inch: 256 outch: 256
Eltwise res4b 0.086ms |
ReLU res4b_relu 0.055ms | feature_map: 14 x 14 inch: 256 outch: 256
Split splitncnn_6 0.001ms |
Convolution res5a_branch1 1.202ms | feature_map: 14 x 14 inch: 256 outch: 512 kernel: 1 x 1 stride: 2
BatchNorm bn5a_branch1 0.011ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5a_branch1 0.009ms | feature_map: 7 x 7 inch: 512 outch: 512
Convolution res5a_branch2a 2.877ms | feature_map: 14 x 14 inch: 256 outch: 512 kernel: 3 x 3 stride: 2
BatchNorm bn5a_branch2a 0.062ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5a_branch2a 0.011ms | feature_map: 7 x 7 inch: 512 outch: 512
ReLU res5a_branch2a_relu 0.009ms | feature_map: 7 x 7 inch: 512 outch: 512
Convolution res5a_branch2b 6.811ms | feature_map: 7 x 7 inch: 512 outch: 512 kernel: 3 x 3
BatchNorm bn5a_branch2b 0.028ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5a_branch2b 0.056ms | feature_map: 7 x 7 inch: 512 outch: 512
Eltwise res5a 0.040ms |
ReLU res5a_relu 0.009ms | feature_map: 7 x 7 inch: 512 outch: 512
Split splitncnn_7 0.000ms |
Convolution res5b_branch2a 6.584ms | feature_map: 7 x 7 inch: 512 outch: 512 kernel: 3 x 3
BatchNorm bn5b_branch2a 0.090ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5b_branch2a 0.015ms | feature_map: 7 x 7 inch: 512 outch: 512
ReLU res5b_branch2a_relu 0.032ms | feature_map: 7 x 7 inch: 512 outch: 512
Convolution res5b_branch2b 6.611ms | feature_map: 7 x 7 inch: 512 outch: 512 kernel: 3 x 3
BatchNorm bn5b_branch2b 0.086ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5b_branch2b 0.035ms | feature_map: 7 x 7 inch: 512 outch: 512
Eltwise res5b 0.028ms |
ReLU res5b_relu 0.012ms | feature_map: 7 x 7 inch: 512 outch: 512
Pooling pool5 0.046ms | feature_map: 7 x 7 inch: 512 outch: 512 kernel: 7 x 7
InnerProduct fc1000 0.598ms | feature_map: 1 x 1 inch: 512 outch: 1000
Softmax prob 0.006ms | feature_map: 1000 x 1 inch: 1 outch: 1
Total average inference time = 76.336ms
The number of test iterations, the number of threads, the powersave mode and the name of the output blob can all be controlled; the tool then prints the average time of every layer over all runs, together with its parameters.
In NCNN, much like in Caffe, the data of each layer is stored as a Mat, a class defined by NCNN whose computations are implemented with assembly and are therefore very efficient. A network is an ncnn::Net object: its layers member stores the information of every layer, and its blobs member stores the intermediate data. To run inference, you first instantiate an ncnn::Net from the param and bin files, then create an ncnn::Extractor from that net; inside the extractor the net is held as const, which guarantees that important state such as blobs and layers cannot be modified during computation.
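As a quick reminder of the plain, unmodified API, this is roughly the setup the benchmark will drive later; the file names simply follow the resnet18 example above:

#include "net.h"

void plain_inference_setup()
{
    ncnn::Net net;
    net.load_param("resnet18.param"); // network structure
    net.load_model("resnet18.bin");   // weights

    // the extractor keeps a const view of the net,
    // so layers and blobs cannot change during inference
    ncnn::Extractor ex = net.create_extractor();
}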
When you want to run the network, you can feed data into any blob through the extractor, e.g.
ncnn::Mat in_blob(224, 224, 3); // set input blob with given shape
extractor.input("data", in_blob); // set network input using extractor
and then extract the data of any blob you want:
ncnn::Mat output;
extractor.extract("res5.1.sum", output); // extract any blob you want
Inside the extract method, the net's forward_layer method is used to traverse the network recursively. To measure the time each layer consumes precisely, the place we need to modify is exactly this forward_layer method.
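To make that structure concrete, here is a tiny standalone model of the recursion (purely illustrative, not the real ncnn code): a layer may only run after every layer producing one of its bottom blobs has run, so extracting the output blob ends up visiting every layer the requested blob depends on, once per inference.

#include <vector>

// toy stand-in for ncnn's dependency-driven traversal: bottoms[] holds, for each
// layer, the indices of the layers whose outputs it consumes
struct ToyLayer { std::vector<int> bottoms; };

void toy_forward_layer(int layer_index, const std::vector<ToyLayer>& layers, std::vector<bool>& computed)
{
    const ToyLayer& layer = layers[layer_index];
    for (size_t i = 0; i < layer.bottoms.size(); i++)
    {
        if (!computed[layer.bottoms[i]])
            toy_forward_layer(layer.bottoms[i], layers, computed); // resolve inputs first
    }
    // in the real Net::forward_layer this is where layer->forward() or
    // layer->forward_inplace() runs -- the exact spot we wrap with timing below
    computed[layer_index] = true;
}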
In the ncnn::Net class, first define a map (the code below uses a std::map keyed by layer name) to record the accumulated time of every layer, plus a bool member profile that flags whether timing is being recorded. Because the Net is held as const inside the Extractor, the profiler must be declared mutable so it can still be modified.
public:
#if NCNN_BENCHMARK
void set_profiler(int max_iter);
// define a hash map to store inference time for each layer
mutable std::map<std::string, int> profiler;
#endif
protected:
#if NCNN_BENCHMARK
bool profile; // whether per-layer timing is currently being recorded
#endif
Next, modify ncnn::Net::forward_layer: for every call to layer->forward() or layer->forward_inplace() inside forward_layer, measure its inference time. One of the call sites is shown below as an example (the in-place variant; the non-inplace calls are wrapped in the same way, see the sketch after this block):
#if NCNN_BENCHMARK
int start = get_msec();
int ret = layer->forward_inplace(bottom_top_blob, opt);
int end = get_msec();
int duration = end - start;
if (duration < 0)
{
    // get_msec() only returns the microsecond field of the current second,
    // so correct for the wrap-around when the call crosses a second boundary
    duration = end + (1000000 - start);
}
if (profile)
{
    profiler[layer->name] += duration; // accumulated in microseconds
    // on the last iteration, print this layer's average time in ms
    if (profiler["max_iter"] == profiler["iter"])
    {
        benchmark(layer, bottom_top_blob, bottom_top_blob, profiler[layer->name] / (1000.0 * profiler["max_iter"]));
    }
}
#else
int ret = layer->forward_inplace(bottom_top_blob, opt);
#endif // NCNN_BENCHMARK
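The non-inplace call sites are wrapped in exactly the same way; a sketch, assuming the surrounding code names its blobs bottom_blob and top_blob (use whatever variable names actually appear at each call site):

#if NCNN_BENCHMARK
int start = get_msec();
int ret = layer->forward(bottom_blob, top_blob, opt);
int end = get_msec();
int duration = end - start;
if (duration < 0)
{
    duration = end + (1000000 - start); // same wrap-around correction as above
}
if (profile)
{
    profiler[layer->name] += duration;
    if (profiler["max_iter"] == profiler["iter"])
    {
        benchmark(layer, bottom_blob, top_blob, profiler[layer->name] / (1000.0 * profiler["max_iter"]));
    }
}
#else
int ret = layer->forward(bottom_blob, top_blob, opt);
#endif // NCNN_BENCHMARK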
Define the set_profiler method:
#if NCNN_BENCHMARK
void Net::set_profiler(int max_iter)
{
profiler["max_iter"] = max_iter;
profile = true;
}
#endif // NCNN_BENCHMARK
Then modify the extract method of ncnn::Extractor by adding:
#if NCNN_BENCHMARK
if (net->profile)
{
net->profiler["iter"] ++;
}
#endif // NCNN_BENCHMARK
Note that there are two extract overloads (one taking a blob name, one taking a blob index), and both need to be modified.
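For orientation, the increment lands near the top of each overload, before forward_layer is invoked; roughly like this for the index-based overload (a simplified sketch following the ncnn version I patched, member names may differ slightly in yours):

int Extractor::extract(int blob_index, Mat& feat)
{
#if NCNN_BENCHMARK
    if (net->profile)
    {
        net->profiler["iter"]++; // one full forward pass per extract() call
    }
#endif // NCNN_BENCHMARK

    int ret = 0;
    if (blob_mats[blob_index].dims == 0)
    {
        int layer_index = net->blobs[blob_index].producer;
        ret = net->forward_layer(layer_index, blob_mats, opt); // the recursion measured above
    }
    feat = blob_mats[blob_index];
    return ret;
}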
Define the timing function in benchmark.cpp:
#include <sys/time.h> // gettimeofday()

int get_msec()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_usec; // microseconds within the current second (0.001 ms resolution)
}
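Since get_msec() only returns the microsecond field of the current second, callers have to patch up the once-per-second wrap-around, as the duration < 0 branch above does. If you prefer, a monotonic clock sidesteps that entirely; a sketch of a possible drop-in alternative (POSIX only, not what the patch above uses):

#include <time.h>

// microseconds from a monotonic clock; differences are always non-negative,
// so no wrap-around correction is needed at the call sites
long long get_usec_monotonic()
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long)ts.tv_sec * 1000000LL + ts.tv_nsec / 1000;
}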
Finally, write a benchncnn.cpp containing the main function that drives the benchmark:
#include <stdio.h>
#include <string.h>
#include <stdlib.h> // atoi()
#ifdef _WIN32
#define NOMINMAX
#include <algorithm>
#include <windows.h> // Sleep()
#else
#include <unistd.h> // sleep()
#endif
#include "benchmark.h"
#include "cpu.h"
#include "net.h"
namespace ncnn {
// always return empty weights
class ModelBinFromEmpty : public ModelBin
{
public:
virtual Mat load(int w, int /*type*/) const { return Mat(w); }
};
class BenchNet : public Net
{
public:
int load_model()
{
// load file
int ret = 0;
ModelBinFromEmpty mb;
for (size_t i=0; i<layers.size(); i++)
{
Layer* layer = layers[i];
int lret = layer->load_model(mb);
if (lret != 0)
{
fprintf(stderr, "layer load_model %d failed\n", (int)i);
ret = -1;
break;
}
}
return ret;
}
};
} // namespace ncnn
static int g_loop_count = 4;
static char* g_output_blob_name;
static char* g_mode;
static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
static ncnn::PoolAllocator g_workspace_pool_allocator;
/********************************/
/* My New Benchmark */
/********************************/
void ncnn_run(const ncnn::Net& net)
{
ncnn::Extractor ex = net.create_extractor();
if (strcmp("cls", g_mode) == 0) {
ncnn::Mat in(224, 224, 3);
ex.input("data", in);
} else {
ncnn::Mat in(384, 640, 3);
ex.input("data", in);
}
ncnn::Mat out;
ex.extract(g_output_blob_name, out);
}
void benchmark(char* model_name)
{
ncnn::BenchNet net;
int num_warm_up = 10; // warm up iteration
fprintf(stderr, "%s :\n\n", model_name);
// init ncnn net
net.load_param(model_name);
net.load_model();
g_blob_pool_allocator.clear();
g_workspace_pool_allocator.clear();
// sleep 5 seconds for cooling down SOC :(
#ifdef _WIN32
Sleep(5 * 1000);
#else
sleep(5);
#endif
// warm up
for ( int i=0; i<num_warm_up; i++)
{
ncnn_run(net);
}
// let's profile~
net.set_profiler(g_loop_count); // important!
for (int i=0; i<g_loop_count; i++)
{
ncnn_run(net);
}
float total_time = 0.0;
std::map<std::string, int>::iterator map_iter;
for (map_iter = net.profiler.begin(); map_iter != net.profiler.end(); map_iter++)
{
// skip the bookkeeping entries, only sum the per-layer times
if (map_iter->first == "iter" || map_iter->first == "max_iter")
continue;
total_time += map_iter->second / 1000.0;
}
fprintf(stderr, "\nTotal average inference time = %7.3fms\n\n", total_time / net.profiler["max_iter"]);
}
/********************************/
/* Main Function */
/********************************/
int main(int argc, char** argv)
{
int loop_count = 4;
int num_threads = ncnn::get_cpu_count();
int powersave = 2;
char* model_name;
char* output_blob_name;
char* mode; // cls or seg, if cls, input_blob will be 224x224 else 384x640
if (argc < 2) {
fprintf(stderr, "Usage: %s model.param [loop_count] [num_threads] [powersave] [output_blob_name] [mode]\n", argv[0]);
return -1;
}
model_name = argv[1];
if (argc >= 3) {
loop_count = atoi(argv[2]);
}
if (argc >= 4) {
num_threads = atoi(argv[3]);
}
if (argc >= 5) {
powersave = atoi(argv[4]);
}
if (argc >= 6) {
output_blob_name = argv[5];
} else {
output_blob_name = (char*)"fc";
}
if (argc >= 7) {
mode = argv[6];
} else {
mode = (char*)"cls";
}
g_loop_count = loop_count;
g_output_blob_name = output_blob_name;
g_mode = mode;
g_blob_pool_allocator.set_size_compare_ratio(0.0f);
g_workspace_pool_allocator.set_size_compare_ratio(0.5f);
ncnn::Option opt;
opt.lightmode = true;
opt.num_threads = num_threads;
opt.blob_allocator = &g_blob_pool_allocator;
opt.workspace_allocator = &g_workspace_pool_allocator;
ncnn::set_default_option(opt);
ncnn::set_cpu_powersave(powersave);
ncnn::set_omp_dynamic(0);
ncnn::set_omp_num_threads(num_threads);
fprintf(stderr, "\nloop_count = %d\n", g_loop_count);
fprintf(stderr, "num_threads = %d\n", num_threads);
fprintf(stderr, "powersave = %d\n", ncnn::get_cpu_powersave());
fprintf(stderr, "output_blob_name: %s\n", g_output_blob_name);
fprintf(stderr, "mode: %s\n\n", g_mode);
// run
benchmark(model_name);
return 0;
}
To build it, first turn on the benchmark option in CMakeLists.txt (or pass -DNCNN_BENCHMARK=ON when invoking cmake):
option(NCNN_BENCHMARK "print benchmark information for every layer" ON) # the default is OFF
After that it is just a series of commands (make sure the param file you want to test, resnet18.param here, also ends up in the working directory on the device):
mkdir build-phone
cd build-phone
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-27 ..
make -j2
cd ..
adb push build-phone/ /data/local/tmp/
adb shell
cd /data/local/tmp/build-phone/benchmark/
./benchncnn resnet18.param 100 4 2 prob
That's it.