首推移动端arm cpu优化学习笔记第4弹--内联汇编入门,但是其只给出了代码,很多人还不知道怎么在手机上跑起来,其实只需要一个CMakeLists.txt就可以了.
# Minimal CMakeLists.txt for cross-compiling the NEON benchmark with the
# Android NDK toolchain file (see the cmake invocation later in the article).
cmake_minimum_required(VERSION 3.6)
project(benchmark LANGUAGES CXX)

# Default to an optimized build for benchmarking, but honour a
# user-selected build type instead of overwriting it.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# NOTE(review): directory globbing misses newly added files until the next
# re-configure; list sources explicitly in a real project.
aux_source_directory(. SRC_FILES)

add_executable(${PROJECT_NAME} ${SRC_FILES})

# Request C++11 per-target instead of appending -std=c++11 to CMAKE_CXX_FLAGS.
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_11)
1.使用htop监测android手机CPU运行情况
git clone https://github.com/LinGeLin/Compiled-Android-htop
在其下新建一个run.sh文件,内容为
#!/bin/sh
# Push the prebuilt htop binary and its ncurses runtime tree to the device,
# then launch htop inside an adb shell.

# One-time setup: copy htop and the ncurses install tree to /data/local/tmp.
# Comment out the `install` call below after the first successful run.
# NOTE(review): the name `install` shadows the system install(1) utility
# inside this script; harmless here since nothing else uses it.
install (){
adb push 2-2installht/xbin/htop /data/local/tmp
adb push 6-1ncinstall /data/local/tmp/
}

# Run htop on the device with TERM, terminfo and the dynamic-linker search
# path pointing at the ncurses tree pushed above.
run(){
adb shell "cd /data/local/tmp && export TERM=xterm && export TERMINFO=6-1ncinstall/share/terminfo &&export LD_LIBRARY_PATH=6-1ncinstall/lib &&./htop"
}

install
run
打开命令行切换到htop所在文件夹下执行./run.sh
首次运行拷贝完文件后将install那行注释掉,和Linux端还是很像的.
2.测试求和加速代码:
#include <iostream>
#if ANDROID
// The article's extraction dropped the header name here; <arm_neon.h> is
// what the NEON intrinsics used below require.  (ANDROID is defined by the
// NDK's android.toolchain.cmake.)
#include <arm_neon.h>
#endif
#define ANDROID_LOG 1
#include "mrlog.h"

// Number of repetitions used by the timing loops below.
int rounds = 1000;
// Scalar reference implementation: sums `num` floats one at a time.
// NOTE(review): the article's extraction cut this loop body and fused the
// next function's header into it; both are reconstructed from the obvious
// intent and from the benchmark output labels ("float:", "intrinsics:").
float sum_float(const float* data, const long num=1000){
float sum = 0;
for(long i = 0; i < num; i++){
sum += data[i];
}
return sum;
}

// NEON-intrinsics version: processes 4 floats per iteration with 128-bit
// vector loads/adds, then handles the `size % 4` tail scalarly.
float sum_intrinsic(const float* data, const long size=1000){
long nn = size>>2;                       // number of full 4-float blocks
float *ptr = (float*)data;
float32x4_t sum_vec = vdupq_n_f32(0);    // four partial sums, all zero
for(; nn>0; nn--,ptr+=4){
float32x4_t tmp_vec = vld1q_f32(ptr);    // load 4 consecutive floats
sum_vec = vaddq_f32(sum_vec, tmp_vec);   // 4-wide accumulate
}
// Horizontal reduction of the four lanes.
float sum = vgetq_lane_f32(sum_vec, 0)+vgetq_lane_f32(sum_vec, 1)+vgetq_lane_f32(sum_vec, 2)+vgetq_lane_f32(sum_vec, 3);
int remain = size&3;                     // tail elements (size % 4)
for(;remain>0;remain--,ptr++){
sum+=(*ptr);
}
return sum;
}
// Inline-assembly version of the 4-wide NEON sum (AArch64 and ARMv7 paths).
// Fixes vs. the article's listing:
//  * the asm decrements kc to 0, so the original tail loop
//    `i = size - (kc<<2)` started at `size` and the remainder elements
//    were never summed — the tail start is now captured before the asm;
//  * the subs/bne loop would wrap around if entered with kc == 0
//    (size < 4) — now guarded;
//  * duplicate symbolic operand names in the output AND input lists are
//    invalid GCC extended asm — replaced with "+r" read/write operands;
//  * v0/v1 added to the AArch64 clobber list (they are overwritten).
// NOTE(review): the int return truncates the float sum; kept to preserve
// the original interface (the benchmark sums are integral anyway).
int sum_neon(const float* data, const long size=1000){
float sum = 0;
long kc = size >> 2;              // number of full 4-float blocks
const long tail_start = kc << 2;  // first element of the scalar tail
float *ptr = (float*) data;
if (kc > 0) {
#if __aarch64__
// v0 accumulates 4 partial sums; v1 holds the current 4-float load.
float *sum_vector= new float[4];
asm volatile(
"mov x0, #0                    \n"  // zero a GP register ...
"dup v0.4s, w0                 \n"  // ... and broadcast: v0 = {0,0,0,0}
"0:                            \n"
"ld1 {v1.4s}, [%[ptr]], #16    \n"  // load 4 floats, post-increment ptr
"fadd v0.4s, v0.4s, v1.4s      \n"  // 4-wide accumulate
"subs %[kc], %[kc], #1         \n"
"bne 0b                        \n"
"st1 {v0.4s}, [%[sum_vector]]  \n"  // spill the 4 partial sums
: [kc] "+r" (kc),
  [ptr] "+r" (ptr)
: [sum_vector] "r" (sum_vector)
: "cc","memory","x0","v0","v1"
);
// Horizontal reduction of the spilled lanes.
for(int i = 0; i < 4; i++){
sum += sum_vector[i];
}
delete []sum_vector;
#else
asm volatile(
"veor q0,q0,q0                 \n"  // q0 = {0,0,0,0}
"0:                            \n"
"vld1.f32 {q1},[%[ptr]]!       \n"  // load 4 floats, post-increment ptr
"vadd.f32 q0,q0,q1             \n"  // 4-wide accumulate
"subs %[kc], #1                \n"
"bne 0b                        \n"
"vpadd.f32 d0,d0,d1            \n"  // pairwise: d0 = {l0+l1, l2+l3}
"vadd.f32 s0,s0,s1             \n"  // s0 = total
"vmov.32 %[sum],s0             \n"  // move the float's bits out to sum
: [kc] "+r" (kc),
  [ptr] "+r" (ptr),
  [sum] "=r" (sum)
:
: "cc","memory","q0","q1"
);
#endif
}
// Scalar tail: the size & 3 leftover elements the vector loop skipped.
for(long i = tail_start; i < size; ++i) {
sum += data[i];
}
return sum;
}
int test_float(float *a, long num){
float sum =0;
MRTIMER_START(sum_time);
for(int i=0;i
参考ARM NEON常用函数总结其中:float32x4_t vdupq_n_f32 (float32_t value)将value复制4分存到返回的寄存器中
float32x4_t vld1q_f32 (float32_t const * ptr)从数组中依次Load4个元素存到寄存器中,vst1q_f32写
float32x4_t vaddq_f32 (float32x4_t a, float32x4_t b)返回两个寄存器对应元素之和 r = a+b
float32_t vgetq_lane_f32 (float32x4_t v, const int lane)返回寄存器某一lane的值
自动化编译和运行:
# Configure, cross-compile and run the benchmark on a connected device.
#rm -rf build
mkdir -p build   # -p: do not fail when build/ already exists on a re-run
cd build
# Pick the ABI matching your device: arm64-v8a for 64-bit, armeabi-v7a
# (as below) for 32-bit.  (The original comment said arm64-v8a while the
# command actually selects armeabi-v7a.)
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
make
adb push benchmark /data/local/tmp
adb shell "cd /data/local/tmp/ && ./benchmark"
在一加6 (高通845)上的运行时间为:
1e+06
float: 1.11039ms
1e+06
intrinsics: 0.351055ms
1e+06
neon: 0.351813ms
可见其可以加速3.09倍. PS: 一百万次加法运算耗费时间约为1毫秒.
矩阵乘法作为深度学习卷积的实现,直接影响最终的速度,如何处理才能榨干处理器的性能呢?
winograd实现卷积的原理是通过变换提取卷积过程中的公共运算,消除冗余的乘法运算