Measuring Per-Layer Inference Time with the NCNN Benchmark

Tencent's NCNN framework ships with functionality for measuring whole-model inference time and per-layer time. However, there is no documentation explaining how to use it, it is not enabled by default, and it does not compute per-layer averages. To run better speed tests, I modified the NCNN source code, and this post records how the features below are implemented.

Results

On a phone, given a usable ncnn param file for any network, you get results like the following:

loop_count = 10
num_threads = 4
powersave = 2
output_blob_name: prob
mode: cls

resnet18.param :

Convolution conv1 8.014ms | feature_map: 224 x 224 inch: 3 outch: 64 kernel: 7 x 7 stride: 2
BatchNorm bn_conv1 0.336ms | feature_map: 112 x 112 inch: 64 outch: 64
Scale scale_conv1 0.218ms | feature_map: 112 x 112 inch: 64 outch: 64
ReLU conv1_relu 0.200ms | feature_map: 112 x 112 inch: 64 outch: 64
Pooling pool1 0.534ms | feature_map: 112 x 112 inch: 64 outch: 64 kernel: 3 x 3 stride: 2
Split splitncnn_0 0.001ms |
Convolution res2a_branch1 0.854ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 1 x 1
BatchNorm bn2a_branch1 0.052ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2a_branch1 0.028ms | feature_map: 56 x 56 inch: 64 outch: 64
Convolution res2a_branch2a 3.417ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 3 x 3
BatchNorm bn2a_branch2a 0.120ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2a_branch2a 0.078ms | feature_map: 56 x 56 inch: 64 outch: 64
ReLU res2a_branch2a_relu 0.047ms | feature_map: 56 x 56 inch: 64 outch: 64
Convolution res2a_branch2b 3.274ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 3 x 3
BatchNorm bn2a_branch2b 0.113ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2a_branch2b 0.139ms | feature_map: 56 x 56 inch: 64 outch: 64
Eltwise res2a 0.146ms |
ReLU res2a_relu 0.095ms | feature_map: 56 x 56 inch: 64 outch: 64
Split splitncnn_1 0.001ms |
Convolution res2b_branch2a 3.356ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 3 x 3
BatchNorm bn2b_branch2a 0.107ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2b_branch2a 0.085ms | feature_map: 56 x 56 inch: 64 outch: 64
ReLU res2b_branch2a_relu 0.106ms | feature_map: 56 x 56 inch: 64 outch: 64
Convolution res2b_branch2b 3.544ms | feature_map: 56 x 56 inch: 64 outch: 64 kernel: 3 x 3
BatchNorm bn2b_branch2b 0.065ms | feature_map: 56 x 56 inch: 64 outch: 64
Scale scale2b_branch2b 0.128ms | feature_map: 56 x 56 inch: 64 outch: 64
Eltwise res2b 0.159ms |
ReLU res2b_relu 0.110ms | feature_map: 56 x 56 inch: 64 outch: 64
Split splitncnn_2 0.001ms |
Convolution res3a_branch1 0.679ms | feature_map: 56 x 56 inch: 64 outch: 128 kernel: 1 x 1 stride: 2
BatchNorm bn3a_branch1 0.044ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3a_branch1 0.119ms | feature_map: 28 x 28 inch: 128 outch: 128
Convolution res3a_branch2a 2.173ms | feature_map: 56 x 56 inch: 64 outch: 128 kernel: 3 x 3 stride: 2
BatchNorm bn3a_branch2a 0.071ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3a_branch2a 0.035ms | feature_map: 28 x 28 inch: 128 outch: 128
ReLU res3a_branch2a_relu 0.115ms | feature_map: 28 x 28 inch: 128 outch: 128
Convolution res3a_branch2b 2.586ms | feature_map: 28 x 28 inch: 128 outch: 128 kernel: 3 x 3
BatchNorm bn3a_branch2b 0.102ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3a_branch2b 0.045ms | feature_map: 28 x 28 inch: 128 outch: 128
Eltwise res3a 0.113ms |
ReLU res3a_relu 0.121ms | feature_map: 28 x 28 inch: 128 outch: 128
Split splitncnn_3 0.000ms |
Convolution res3b_branch2a 2.639ms | feature_map: 28 x 28 inch: 128 outch: 128 kernel: 3 x 3
BatchNorm bn3b_branch2a 0.067ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3b_branch2a 0.087ms | feature_map: 28 x 28 inch: 128 outch: 128
ReLU res3b_branch2a_relu 0.036ms | feature_map: 28 x 28 inch: 128 outch: 128
Convolution res3b_branch2b 2.722ms | feature_map: 28 x 28 inch: 128 outch: 128 kernel: 3 x 3
BatchNorm bn3b_branch2b 0.045ms | feature_map: 28 x 28 inch: 128 outch: 128
Scale scale3b_branch2b 0.085ms | feature_map: 28 x 28 inch: 128 outch: 128
Eltwise res3b 0.106ms |
ReLU res3b_relu 0.063ms | feature_map: 28 x 28 inch: 128 outch: 128
Split splitncnn_4 0.001ms |
Convolution res4a_branch1 0.960ms | feature_map: 28 x 28 inch: 128 outch: 256 kernel: 1 x 1 stride: 2
BatchNorm bn4a_branch1 0.016ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4a_branch1 0.057ms | feature_map: 14 x 14 inch: 256 outch: 256
Convolution res4a_branch2a 2.292ms | feature_map: 28 x 28 inch: 128 outch: 256 kernel: 3 x 3 stride: 2
BatchNorm bn4a_branch2a 0.036ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4a_branch2a 0.014ms | feature_map: 14 x 14 inch: 256 outch: 256
ReLU res4a_branch2a_relu 0.013ms | feature_map: 14 x 14 inch: 256 outch: 256
Convolution res4a_branch2b 3.235ms | feature_map: 14 x 14 inch: 256 outch: 256 kernel: 3 x 3
BatchNorm bn4a_branch2b 0.064ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4a_branch2b 0.013ms | feature_map: 14 x 14 inch: 256 outch: 256
Eltwise res4a 0.026ms |
ReLU res4a_relu 0.012ms | feature_map: 14 x 14 inch: 256 outch: 256
Split splitncnn_5 0.001ms |
Convolution res4b_branch2a 3.222ms | feature_map: 14 x 14 inch: 256 outch: 256 kernel: 3 x 3
BatchNorm bn4b_branch2a 0.076ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4b_branch2a 0.071ms | feature_map: 14 x 14 inch: 256 outch: 256
ReLU res4b_branch2a_relu 0.052ms | feature_map: 14 x 14 inch: 256 outch: 256
Convolution res4b_branch2b 3.355ms | feature_map: 14 x 14 inch: 256 outch: 256 kernel: 3 x 3
BatchNorm bn4b_branch2b 0.015ms | feature_map: 14 x 14 inch: 256 outch: 256
Scale scale4b_branch2b 0.012ms | feature_map: 14 x 14 inch: 256 outch: 256
Eltwise res4b 0.086ms |
ReLU res4b_relu 0.055ms | feature_map: 14 x 14 inch: 256 outch: 256
Split splitncnn_6 0.001ms |
Convolution res5a_branch1 1.202ms | feature_map: 14 x 14 inch: 256 outch: 512 kernel: 1 x 1 stride: 2
BatchNorm bn5a_branch1 0.011ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5a_branch1 0.009ms | feature_map: 7 x 7 inch: 512 outch: 512
Convolution res5a_branch2a 2.877ms | feature_map: 14 x 14 inch: 256 outch: 512 kernel: 3 x 3 stride: 2
BatchNorm bn5a_branch2a 0.062ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5a_branch2a 0.011ms | feature_map: 7 x 7 inch: 512 outch: 512
ReLU res5a_branch2a_relu 0.009ms | feature_map: 7 x 7 inch: 512 outch: 512
Convolution res5a_branch2b 6.811ms | feature_map: 7 x 7 inch: 512 outch: 512 kernel: 3 x 3
BatchNorm bn5a_branch2b 0.028ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5a_branch2b 0.056ms | feature_map: 7 x 7 inch: 512 outch: 512
Eltwise res5a 0.040ms |
ReLU res5a_relu 0.009ms | feature_map: 7 x 7 inch: 512 outch: 512
Split splitncnn_7 0.000ms |
Convolution res5b_branch2a 6.584ms | feature_map: 7 x 7 inch: 512 outch: 512 kernel: 3 x 3
BatchNorm bn5b_branch2a 0.090ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5b_branch2a 0.015ms | feature_map: 7 x 7 inch: 512 outch: 512
ReLU res5b_branch2a_relu 0.032ms | feature_map: 7 x 7 inch: 512 outch: 512
Convolution res5b_branch2b 6.611ms | feature_map: 7 x 7 inch: 512 outch: 512 kernel: 3 x 3
BatchNorm bn5b_branch2b 0.086ms | feature_map: 7 x 7 inch: 512 outch: 512
Scale scale5b_branch2b 0.035ms | feature_map: 7 x 7 inch: 512 outch: 512
Eltwise res5b 0.028ms |
ReLU res5b_relu 0.012ms | feature_map: 7 x 7 inch: 512 outch: 512
Pooling pool5 0.046ms | feature_map: 7 x 7 inch: 512 outch: 512 kernel: 7 x 7
InnerProduct fc1000 0.598ms | feature_map: 1 x 1 inch: 512 outch: 1000
Softmax prob 0.006ms | feature_map: 1000 x 1 inch: 1 outch: 1

Total average inference time = 76.336ms

The number of test iterations, the thread count, the powersave mode, and the name of the output blob can all be controlled; after the run, the average time of each layer over all iterations is printed, together with the layer's parameters.
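For example, the header above came from an invocation along these lines (the argument order matches the main function shown later):

./benchncnn resnet18.param 10 4 2 prob cls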

NCNN Framework Structure

In NCNN, similar to Caffe, the data of each layer is stored as a Mat, an NCNN-specific class whose computations are implemented in assembly and are very efficient. A network is an ncnn::Net; its layers member stores the information of every layer, and blobs stores the network's intermediate data. To run a computation, you first instantiate an ncnn::Net from the param and bin files, then create an ncnn::Extractor from the net. Inside the extractor the net is held as const, guaranteeing that important state such as blobs and layers cannot be modified during computation.
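To make that flow concrete, here is a minimal sketch (the file names are placeholders):

ncnn::Net net;
net.load_param("resnet18.param"); // network structure
net.load_model("resnet18.bin");   // weights
ncnn::Extractor extractor = net.create_extractor(); // holds the net as const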
When a computation is needed, data can be fed into any layer through the extractor, e.g.

ncnn::Mat in_blob(224, 224, 3);  // set input blob with given shape
extractor.input("data", in_blob);  // set network input using extractor 

and then the data of any layer can be extracted:

ncnn::Mat output;
extractor.extract("res5.1.sum", output);  // extract any blob you want

The extract method uses the net's forward_layer method to traverse the network recursively. To time each layer precisely, the place we need to modify is inside forward_layer.
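For orientation, the recursion has roughly the following shape (a simplified sketch of the actual ncnn source; lightmode handling and error checks are omitted):

int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt) const
{
    const Layer* layer = layers[layer_index];

    // recursively make sure every input blob of this layer has been computed
    for (size_t i = 0; i < layer->bottoms.size(); i++)
    {
        int bottom_blob_index = layer->bottoms[i];
        if (blob_mats[bottom_blob_index].dims == 0)
            forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt);
    }

    // ... then run layer->forward() or layer->forward_inplace();
    // the timing code below wraps exactly these calls

    return 0;
}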

Modification Details

In the ncnn::Net class, first define a map that records the accumulated time of each layer, and a bool flag profile that controls whether timing is recorded. Because the extractor holds the net as const, the mutable keyword is required so that the profiler can still be updated during a forward pass.

public:
#if NCNN_BENCHMARK
    void set_profiler(int max_iter);
    // accumulated per-layer time in microseconds, keyed by layer name;
    // also holds two bookkeeping entries, "max_iter" and "iter"
    mutable std::map<std::string, int> profiler;
#endif
protected:
#if NCNN_BENCHMARK
    bool profile;
#endif

Next, modify ncnn::Net's forward_layer method: wrap every call to layer->forward() or layer->forward_inplace() with timing code. One instance is shown below as an example; the forward() variant is sketched right after it.

#if NCNN_BENCHMARK
            int start = get_msec(); // microseconds within the current second
            int ret = layer->forward_inplace(bottom_top_blob, opt);
            int end = get_msec();
            int duration = end - start;
            if (duration < 0)
            {
                duration = end + (1000000 - start); // tv_usec wrapped past a second
            }
            if (profile)
            {
                profiler[layer->name] += duration; // accumulate in microseconds
                // on the last iteration, print this layer's average time in ms
                if (profiler["max_iter"] == profiler["iter"])
                {
                    benchmark(layer, bottom_top_blob, bottom_top_blob, profiler[layer->name] / (1000.0 * profiler["max_iter"]));
                }
            }
#else
            int ret = layer->forward_inplace(bottom_top_blob, opt);
#endif // NCNN_BENCHMARK
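The layer->forward() call sites are wrapped the same way; only the separate bottom and top blobs differ (a sketch mirroring the snippet above):

#if NCNN_BENCHMARK
            int start = get_msec();
            int ret = layer->forward(bottom_blob, top_blob, opt);
            int end = get_msec();
            int duration = end - start;
            if (duration < 0)
                duration = end + (1000000 - start); // tv_usec wrapped past a second
            if (profile)
            {
                profiler[layer->name] += duration;
                if (profiler["max_iter"] == profiler["iter"])
                    benchmark(layer, bottom_blob, top_blob, profiler[layer->name] / (1000.0 * profiler["max_iter"]));
            }
#else
            int ret = layer->forward(bottom_blob, top_blob, opt);
#endif // NCNN_BENCHMARK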

Define the set_profiler method:

#if NCNN_BENCHMARK
void Net::set_profiler(int max_iter)
{
    profiler["max_iter"] = max_iter; // bookkeeping entry, not a layer time
    profile = true;                  // start recording from now on
}
#endif // NCNN_BENCHMARK

Modify the extract method of the ncnn::Extractor class, adding:

#if NCNN_BENCHMARK
    if (net->profile)
    {
        net->profiler["iter"]++; // count one full forward pass
    }
#endif // NCNN_BENCHMARK

Note that there are two extract overloads that need this change.

In benchmark.cpp, define the function that reads the clock:

#include <sys/time.h> // gettimeofday()

int get_msec()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);

    // returns the microsecond part of the current time (wraps every second);
    // the wrap-around is handled at the call site in forward_layer
    return tv.tv_usec;
}
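Since tv.tv_usec wraps every second, the fix-up in forward_layer only handles a single wrap; a layer taking longer than one second would be mis-measured. A more robust variant (my suggestion, not part of the patch above) folds the seconds in, so plain subtraction gives the duration:

long get_usec()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    // full microsecond timestamp: seconds folded in, no wrap-around to handle
    return tv.tv_sec * 1000000L + tv.tv_usec;
}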

Finally, write a benchncnn.cpp with the main function that drives the benchmark:

#include <stdio.h>  // fprintf()
#include <string.h> // strcmp()
#include <stdlib.h> // atoi()

#ifdef _WIN32
#define NOMINMAX
#include <algorithm>
#include <windows.h> // Sleep()
#else
#include <unistd.h> // sleep()
#endif

#include "benchmark.h"
#include "cpu.h"
#include "net.h"

namespace ncnn {

// always return empty weights
class ModelBinFromEmpty : public ModelBin
{
public:
    virtual Mat load(int w, int /*type*/) const { return Mat(w); }
};

class BenchNet : public Net
{
public:
    int load_model()
    {
        // load file
        int ret = 0;

        ModelBinFromEmpty mb;
        for (size_t i=0; i<layers.size(); i++)
        {
            Layer* layer = layers[i];

            int lret = layer->load_model(mb);
            if (lret != 0)
            {
                fprintf(stderr, "layer load_model %d failed\n", (int)i);
                ret = -1;
                break;
            }
        }

        return ret;
    }
};

} // namespace ncnn

static int g_loop_count = 4;
static char* g_output_blob_name;
static char* g_mode;
static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
static ncnn::PoolAllocator g_workspace_pool_allocator;


/********************************/
/* My New Benchmark */
/********************************/
void ncnn_run(const ncnn::Net& net)
{
    ncnn::Extractor ex = net.create_extractor();
    
    if (strcmp("cls", g_mode) == 0) {
        ncnn::Mat in(224, 224, 3); // dummy 224x224 input; contents don't matter for timing
        ex.input("data", in);
    } else {
        ncnn::Mat in(384, 640, 3); // dummy 384x640 input for seg mode
        ex.input("data", in);
    }
    
    ncnn::Mat out;
    ex.extract(g_output_blob_name, out);
}

void benchmark(char* model_name)
{
    ncnn::BenchNet net;
    int num_warm_up = 10; // warm up iteration
    
    fprintf(stderr, "%s :\n\n", model_name);
    
    // init ncnn net
    net.load_param(model_name);
    net.load_model();
    
    g_blob_pool_allocator.clear();
    g_workspace_pool_allocator.clear();

    // sleep 5 seconds for cooling down SOC :(
#ifdef _WIN32
    Sleep(5 * 1000);
#else
    sleep(5);
#endif
    
    // warm up
    for ( int i=0; i<num_warm_up; i++)
    {
        ncnn_run(net);
    }
    
    // let's profile~
    net.set_profiler(g_loop_count); // important!
    for (int i=0; i<g_loop_count; i++)
    {
        ncnn_run(net);
    }
    
    float total_time = 0.0;
    std::map<std::string, int>::iterator map_iter;
    for (map_iter = net.profiler.begin(); map_iter != net.profiler.end(); map_iter++)
    {
        // skip the bookkeeping entries; they are iteration counts, not times
        if (map_iter->first == "max_iter" || map_iter->first == "iter")
            continue;
        total_time += map_iter->second / 1000.0; // microseconds -> milliseconds
    }
    fprintf(stderr, "\nTotal average inference time = %7.3fms\n\n", total_time / net.profiler["max_iter"]);
}

/********************************/
/* Main Function */
/********************************/
int main(int argc, char** argv)
{
    int loop_count = 4;
    int num_threads = ncnn::get_cpu_count();
    int powersave = 2;
    char* model_name; 
    char* output_blob_name;
    char* mode; // cls or seg, if cls, input_blob will be 224x224 else 384x640
    
    if (argc < 2) {
        fprintf(stderr, "Usage: %s model.param [loop_count] [num_threads] [powersave] [output_blob_name] [mode]\n", argv[0]);
        return -1;
    }
    model_name = argv[1];
    if (argc >= 3) {
        loop_count = atoi(argv[2]);
    }
    if (argc >= 4) {
        num_threads = atoi(argv[3]);
    }
    if (argc >= 5) {
        powersave = atoi(argv[4]);
    }
    if (argc >= 6) {
        output_blob_name = argv[5];
    } else {
        output_blob_name = (char*)"fc";
    }
    if (argc >= 7) {
        mode = argv[6];
    } else {
        mode = (char*)"cls";
    }

    g_loop_count = loop_count;
    g_output_blob_name = output_blob_name;
    g_mode = mode;

    g_blob_pool_allocator.set_size_compare_ratio(0.0f);
    g_workspace_pool_allocator.set_size_compare_ratio(0.5f);

    ncnn::Option opt;
    opt.lightmode = true;
    opt.num_threads = num_threads;
    opt.blob_allocator = &g_blob_pool_allocator;
    opt.workspace_allocator = &g_workspace_pool_allocator;

    ncnn::set_default_option(opt);
    ncnn::set_cpu_powersave(powersave);
    ncnn::set_omp_dynamic(0);
    ncnn::set_omp_num_threads(num_threads);

    fprintf(stderr, "\nloop_count = %d\n", g_loop_count);
    fprintf(stderr, "num_threads = %d\n", num_threads);
    fprintf(stderr, "powersave = %d\n", ncnn::get_cpu_powersave());
    fprintf(stderr, "output_blob_name: %s\n", g_output_blob_name);
    fprintf(stderr, "mode: %s\n\n", g_mode);

    // run
    benchmark(model_name);

    return 0;
}

Building and Usage

First, enable the option in CMakeLists.txt (it controls the NCNN_BENCHMARK macro used by the #if guards above):

option(NCNN_BENCHMARK "print benchmark information for every layer" ON) # default is OFF

Then build and run:

mkdir build-phone 
cd build-phone
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-27 ..
make -j2
cd ..
adb push build-phone/ /data/local/tmp/
adb shell
cd /data/local/tmp/build-phone/benchmark/
./benchncnn resnet18.param 100 4 2 prob

Done!
