Accelerating Algorithms with ARM Inline Assembly

A good starting point is 移动端arm cpu优化学习笔记第4弹--内联汇编入门 (Mobile ARM CPU Optimization Notes, Part 4: Getting Started with Inline Assembly), but it only gives the code, and many readers still don't know how to run it on a phone. In fact, all it takes is a CMakeLists.txt:

cmake_minimum_required(VERSION 2.8)
set(PROJECT_NAME benchmark)
project(${PROJECT_NAME})
set(CMAKE_BUILD_TYPE Release)
aux_source_directory(. SRC_FILES)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
add_executable(${PROJECT_NAME} ${SRC_FILES})

1. Monitoring the Android phone's CPU with htop

git clone https://github.com/LinGeLin/Compiled-Android-htop

In that directory, create a run.sh file with the following content:

#!/bin/sh

install() {
    adb push 2-2installht/xbin/htop /data/local/tmp
    adb push 6-1ncinstall /data/local/tmp/
}

run() {
    adb shell "cd /data/local/tmp && export TERM=xterm && export TERMINFO=6-1ncinstall/share/terminfo && export LD_LIBRARY_PATH=6-1ncinstall/lib && ./htop"
}

install
run

Open a terminal, cd into the folder containing htop, make the script executable with chmod +x run.sh, and run ./run.sh.

Once the first run has pushed the files to the device, comment out the install line. The interface is very similar to htop on Linux.
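
If you just need a quick glance at CPU load without setting up htop, Android's stock tools also work (these are standard Android commands, unrelated to the htop package above):

adb shell top
adb shell dumpsys cpuinfo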

2. Benchmarking the summation code:

#include "iostream"

#if ANDROID
    #include 
#endif

#define ANDROID_LOG 1
#include "mrlog.h"

int rounds = 1000;

float sum_float(const float* data, const long num=1000){
    float sum = 0;
    for(long i = 0; i < num; i++){
        sum += data[i];
    }
    return sum;
}

float sum_intrinsics(const float* data, const long size=1000){
    long nn = size >> 2;              // number of full 4-float groups
    float *ptr = (float*)data;
    float32x4_t sum_vec = vdupq_n_f32(0);
    for(; nn > 0; nn--, ptr += 4){
        float32x4_t tmp_vec = vld1q_f32(ptr);
        sum_vec = vaddq_f32(sum_vec, tmp_vec);
    }
    float sum = vgetq_lane_f32(sum_vec, 0) + vgetq_lane_f32(sum_vec, 1)
              + vgetq_lane_f32(sum_vec, 2) + vgetq_lane_f32(sum_vec, 3);
    int remain = size & 3;            // tail elements that don't fill a vector
    for(; remain > 0; remain--, ptr++){
        sum += (*ptr);
    }
    return sum;
}

float sum_neon(const float* data, const long size=1000){
    float sum = 0;
    long kc = size >> 2;       // number of full 4-float groups; the asm loop
                               // below runs at least once, so this assumes size >= 4
    float *ptr = (float*) data;
    #if __aarch64__
        float sum_vector[4];
        asm volatile(
            "mov            x0, #0                  \n"  // x0 = 0
            "dup            v0.4s, w0               \n"  // v0 = accumulator = {0,0,0,0}
            "0:                                     \n"
            "ld1            {v1.4s}, [%[ptr]], #16  \n"  // load 4 floats, advance ptr
            "fadd           v0.4s, v0.4s, v1.4s     \n"  // accumulate lane-wise
            "subs           %[kc], %[kc], #1        \n"
            "bne            0b                      \n"
            "st1            {v0.4s}, [%[sum_vector]]\n"  // spill the accumulator
            : [kc]  "+r" (kc),
              [ptr] "+r" (ptr)
            : [sum_vector] "r" (sum_vector)
            : "cc", "memory", "x0", "v0", "v1"
        );
        for(int i = 0; i < 4; i++){
            sum += sum_vector[i];
        }
    #else
        asm volatile(
            "veor           q0, q0, q0              \n"  // q0 = accumulator = 0
            "0:                                     \n"
            "vld1.f32       {q1}, [%[ptr]]!         \n"  // load 4 floats, advance ptr
            "vadd.f32       q0, q0, q1              \n"  // accumulate lane-wise
            "subs           %[kc], #1               \n"
            "bne            0b                      \n"
            "vpadd.f32      d0, d0, d1              \n"  // d0 = {s0+s1, s2+s3}
            "vadd.f32       s0, s0, s1              \n"  // s0 = horizontal sum
            "vmov           %[sum], s0              \n"  // move the result out
            : [kc]  "+r" (kc),
              [ptr] "+r" (ptr),
              [sum] "=r" (sum)
            :
            : "cc", "memory", "q0", "q1"
        );
    #endif
    // The asm loop has counted kc down to 0, so derive the start of the
    // scalar tail from size rather than from kc.
    for(long i = (size >> 2) << 2; i < size; ++i) {
        sum += data[i];
    }
    return sum;
}

int test_float(float *a, long num){
    float sum = 0;
    MRTIMER_START(sum_time);
    for(int i = 0; i < rounds; i++){
        sum = sum_float(a, num);
    }
    std::cout << sum << std::endl;
    // MRTIMER_END is assumed to be mrlog.h's matching elapsed-time macro,
    // returning milliseconds since MRTIMER_START.
    std::cout << "float: " << MRTIMER_END(sum_time) / rounds << "ms" << std::endl;
    return 0;
}
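
The original post is cut off at this point and the rest of the driver code is lost. Judging from the benchmark output further below (sums of 1e+06 and the labels float/intrinsics/neon, with rounds = 1000), a minimal sketch of the missing pieces could look like this; test_intrinsics, test_neon, and main are reconstructions under those assumptions, not the author's exact code:

int test_intrinsics(float *a, long num){
    float sum = 0;
    MRTIMER_START(sum_time);
    for(int i = 0; i < rounds; i++){
        sum = sum_intrinsics(a, num);
    }
    std::cout << sum << std::endl;
    std::cout << "intrinsics: " << MRTIMER_END(sum_time) / rounds << "ms" << std::endl;
    return 0;
}

int test_neon(float *a, long num){
    float sum = 0;
    MRTIMER_START(sum_time);
    for(int i = 0; i < rounds; i++){
        sum = sum_neon(a, num);
    }
    std::cout << sum << std::endl;
    std::cout << "neon: " << MRTIMER_END(sum_time) / rounds << "ms" << std::endl;
    return 0;
}

int main(){
    const long num = 1000000;          // one million elements, matching the 1e+06 output
    float *a = new float[num];
    for(long i = 0; i < num; i++) a[i] = 1.0f;
    test_float(a, num);
    test_intrinsics(a, num);
    test_neon(a, num);
    delete[] a;
    return 0;
}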

See the reference ARM NEON常用函数总结 for more; the intrinsics used above are:

float32x4_t vdupq_n_f32 (float32_t value): broadcasts value into all 4 lanes of the returned register.

float32x4_t vld1q_f32 (float32_t const * ptr): loads 4 consecutive floats from memory into a register; vst1q_f32 is the matching store.

float32x4_t vaddq_f32 (float32x4_t a, float32x4_t b): lane-wise addition, r = a + b.

float32_t vgetq_lane_f32 (float32x4_t v, const int lane): returns the value of the given lane.
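
To make the semantics concrete, here is a minimal self-contained snippet (not from the original post) that exercises all four intrinsics on a single 4-float group:

#include <arm_neon.h>
#include <cstdio>

int main(){
    float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float32x4_t acc = vdupq_n_f32(0);    // acc = {0, 0, 0, 0}
    float32x4_t v   = vld1q_f32(data);   // v   = {1, 2, 3, 4}
    acc = vaddq_f32(acc, v);             // acc = {1, 2, 3, 4}
    float sum = vgetq_lane_f32(acc, 0) + vgetq_lane_f32(acc, 1)
              + vgetq_lane_f32(acc, 2) + vgetq_lane_f32(acc, 3);
    printf("%f\n", sum);                 // prints 10.000000
    return 0;
}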

Automated build and run:

#rm -rf build
mkdir build
cd build
# for a 64-bit build, change ANDROID_ABI to "arm64-v8a"
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
make
adb push benchmark /data/local/tmp
adb shell "cd /data/local/tmp/ && ./benchmark"

Runtimes on a OnePlus 6 (Snapdragon 845):

1e+06
float: 1.11039ms
1e+06
intrinsics: 0.351055ms
1e+06
neon: 0.351813ms

That is a speedup of roughly 3.16x (1.11039 / 0.351055). PS: one million float additions take about 1 ms in the plain C++ version.

Matrix multiplication is how deep-learning convolutions are implemented under the hood, so it directly determines the final speed. How do we squeeze all the performance out of the processor? The references below dig in, and a minimal gemm sketch follows the list.

  1. OpenBLAS gemm从零入门
  2. 神经网络arm neon加速实现
  3. 【图像处理】NEON编程
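
The techniques in those references go deep (packing, blocking, NEON micro-kernels), but the first step they all build on is simple: reorder the naive i-j-k loops so the innermost loop walks memory contiguously. Here is a minimal sketch (not from the original post; names and layout are illustrative):

// C[M x N] += A[M x K] * B[K x N], all row-major.
// The naive i-j-k order strides through B column-wise; i-k-j keeps the
// inner loop contiguous in both B and C, which caches (and NEON) like.
void gemm_ikj(const float* A, const float* B, float* C,
              int M, int N, int K){
    for(int i = 0; i < M; i++){
        for(int k = 0; k < K; k++){
            const float a = A[i*K + k];        // reused across the whole inner loop
            for(int j = 0; j < N; j++){
                C[i*N + j] += a * B[k*N + j];  // contiguous loads and stores
            }
        }
    }
}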

The principle behind Winograd convolution is to share intermediate products across overlapping filter windows, eliminating redundant multiplications (a worked F(2,3) example follows the list below).

  1. 卷积神经网络中的Winograd快速卷积算法
  2. 详解Winograd变换矩阵生成原理
  3. 移动端arm cpu优化学习笔记第3弹--绑定cpu(cpu affinity)
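
As a concrete illustration (not from the original post), the 1-D case F(2,3) produces two outputs of a 3-tap filter with 4 multiplications instead of the naive 6; in practice the filter-side factors like (g0+g1+g2)/2 are computed once per filter and reused:

#include <cstdio>

// Winograd F(2,3): the naive cost is 6 multiplies
//   r0 = d0*g0 + d1*g1 + d2*g2
//   r1 = d1*g0 + d2*g1 + d3*g2
// Winograd does it in 4:
void winograd_f23(const float d[4], const float g[3], float r[2]){
    const float m1 = (d[0] - d[2]) * g[0];
    const float m2 = (d[1] + d[2]) * 0.5f * (g[0] + g[1] + g[2]);
    const float m3 = (d[2] - d[1]) * 0.5f * (g[0] - g[1] + g[2]);
    const float m4 = (d[1] - d[3]) * g[2];
    r[0] = m1 + m2 + m3;
    r[1] = m2 - m3 - m4;
}

int main(){
    float d[4] = {1, 2, 3, 4}, g[3] = {1, 0, -1}, r[2];
    winograd_f23(d, g, r);
    printf("%f %f\n", r[0], r[1]);  // both -2, matching the naive formula
    return 0;
}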
