/* Multiply every lane of __a by the single lane of __b selected by __lane
   and return the resulting vector.  */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
{
  /* Pull the selected lane out as a scalar; the vector-by-scalar product
     below applies it to all lanes of __a.  */
  const float __scalar = __aarch64_vget_lane_f32 (__b, __lane);

  return __a * __scalar;
}
说明:把a中的每个lane与b中由lane参数指定的lane相乘并返回。
/* Multiply every lane of __b by the lane of __c selected by __lane, add the
   product lane-wise to __a, and return the sum.  */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
		float32x2_t __c, const int __lane)
{
  const float __scalar = __aarch64_vget_lane_f32 (__c, __lane);

  /* The multiply and the add are kept in one expression, exactly as in the
     original form.  */
  return __a + __b * __scalar;
}
说明:把b中的每个lane与c中由lane参数指定的lane相乘,然后与a的每个lane相加,并返回。
以如下代码为例:
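/* NOTE: the header names were lost when this listing was extracted; the
   list below is reconstructed from the names the program actually uses. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <arm_neon.h>
#include <iostream>
#include <string>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>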
using namespace std;
using namespace cv;
void altneonmult(const float *matrixA, const float *matrixB, float *matrixR)
{
float32x4_t a0,a1,a2,a3, b, r;
a0 = vld1q_f32(matrixA); /* col 0 of matrixA */
a1 = vld1q_f32(matrixA + 4); /* col 1 of matrixA */
a2 = vld1q_f32(matrixA + 8); /* col 2 of matrixA */
a3 = vld1q_f32(matrixA + 12); /* col 3 of matrixA */
b = vld1q_f32(matrixB); /* load col 0 of matrixB */
r = vmulq_lane_f32(a0, vget_low_f32(b), 0);
r = vmlaq_lane_f32(r, a1, vget_low_f32(b), 1);
r = vmlaq_lane_f32(r, a2, vget_high_f32(b), 0);
r = vmlaq_lane_f32(r, a3, vget_high_f32(b), 1);
vst1q_f32(matrixR, r); /* store col 0 of result */
b = vld1q_f32(matrixB + 4); /* load col 1 of matrixB */
r = vmulq_lane_f32(a0, vget_low_f32(b), 0);
r = vmlaq_lane_f32(r, a1, vget_low_f32(b), 1);
r = vmlaq_lane_f32(r, a2, vget_high_f32(b), 0);
r = vmlaq_lane_f32(r, a3, vget_high_f32(b), 1);
vst1q_f32(matrixR + 4, r); /* store col 1 of result */
b = vld1q_f32(matrixB + 8); /* load col 2 of matrixB */
r = vmulq_lane_f32(a0, vget_low_f32(b), 0);
r = vmlaq_lane_f32(r, a1, vget_low_f32(b), 1);
r = vmlaq_lane_f32(r, a2, vget_high_f32(b), 0);
r = vmlaq_lane_f32(r, a3, vget_high_f32(b), 1);
vst1q_f32(matrixR + 8, r); /* store col 2 of result */
b = vld1q_f32(matrixB + 12); /* load col 3 of matrixB */
r = vmulq_lane_f32(a0, vget_low_f32(b), 0);
r = vmlaq_lane_f32(r, a1, vget_low_f32(b), 1);
r = vmlaq_lane_f32(r, a2, vget_high_f32(b), 0);
r = vmlaq_lane_f32(r, a3, vget_high_f32(b), 1);
vst1q_f32(matrixR + 12, r); /* store col 3 of result */
}
int main()
{
struct timeval tv_start, tv_end;
Mat matrixA(4, 4, CV_32FC1), matrixB(4, 4, CV_32FC1),matrixR(4, 4, CV_32FC1);
float *pATmp = (float*)(matrixA.data);
float *pBTmp = (float*)(matrixB.data);
float *pRTmp = (float*)(matrixR.data);
for(int i = 0; i < 4; i++)
{
for(int j = 0; j < 4; j++ )
{
*pATmp = i*4+j;
*pBTmp = i*5+j;
*pRTmp = 0.0f;
pATmp++;
pBTmp++;
pRTmp++;
}
}
///transpose A and B, let cols align first
transpose(matrixA, matrixA);
transpose(matrixB, matrixB);
pATmp = (float*)(matrixA.data);
pBTmp = (float*)(matrixB.data);
for(int i = 0; i < 4; i++)
{
for(int j = 0; j < 4; j++ )
{
printf("%f ", *pATmp);
pATmp++;
}
printf("\n");
}
printf("\nA-------over\n");
for(int i = 0; i < 4; i++)
{
for(int j = 0; j < 4; j++ )
{
printf("%f ", *pBTmp);
pBTmp++;
}
printf("\n");
}
printf("\nB-------over\n");
call altneonmult to calcute the matrix multiply
gettimeofday(&tv_start, 0);
altneonmult( (float*)(matrixA.data), (float*)(matrixB.data), (float*)(matrixR.data));
gettimeofday(&tv_end, 0);
printf("using neon %dμs\n", (tv_end.tv_sec * 1000000 + tv_end.tv_usec - tv_start.tv_sec * 1000000 - tv_start.tv_usec));
pRTmp = (float*)(matrixR.data);
for(int i = 0; i < 4; i++)
{
for(int j = 0; j < 4; j++ )
{
printf("%f ", *pRTmp);
pRTmp++;
}
printf("\n");
}
///using standard matrix multiply
transpose(matrixA, matrixA);
transpose(matrixB, matrixB);
pATmp = (float*)(matrixA.data);
pBTmp = (float*)(matrixB.data);
pRTmp = (float*)(matrixR.data);
memset((void*)pRTmp, 0, 4*4*sizeof(float));
gettimeofday(&tv_start, 0);
{
for(int i = 0; i < 4; i++)
{
for(int j = 0; j < 4; j++)
{
for(int k=0; k < 4; k++)
{
*(pRTmp+4*i+j) += (*(pATmp + 4*i + k)) * (*(pBTmp + k*4 + j));
}
}
}
}
gettimeofday(&tv_end, 0);
printf("using arm general calc %dμs\n", (tv_end.tv_sec * 1000000 + tv_end.tv_usec - tv_start.tv_sec * 1000000 - tv_start.tv_usec));
pRTmp = (float*)(matrixR.data);
for(int i = 0; i < 4; i++)
{
for(int j = 0; j < 4; j++ )
{
printf("%f ", *pRTmp);
pRTmp++;
}
printf("\n");
}
return 0;
}
输出的结果如下:
Jelly-Pro:/data/local/tmp/neon # ./a.out
WARNING: linker: /data/local/tmp/neon/a.out: unsupported flags DT_FLAGS_1=0x8000001
0.000000 4.000000 8.000000 12.000000
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000
using neon 3μs
70.000000 190.000000 310.000000 430.000000
76.000000 212.000000 348.000000 484.000000
82.000000 234.000000 386.000000 538.000000
88.000000 256.000000 424.000000 592.000000
Jelly-Pro:/data/local/tmp/neon # ./a.out
WARNING: linker: /data/local/tmp/neon/a.out: unsupported flags DT_FLAGS_1=0x8000001
0.000000 4.000000 8.000000 12.000000
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000
A-------over
0.000000 5.000000 10.000000 15.000000
1.000000 6.000000 11.000000 16.000000
2.000000 7.000000 12.000000 17.000000
3.000000 8.000000 13.000000 18.000000
B-------over
using neon 2μs
70.000000 190.000000 310.000000 430.000000
76.000000 212.000000 348.000000 484.000000
82.000000 234.000000 386.000000 538.000000
88.000000 256.000000 424.000000 592.000000
Jelly-Pro:/data/local/tmp/neon # ./a.out
WARNING: linker: /data/local/tmp/neon/a.out: unsupported flags DT_FLAGS_1=0x8000001
0.000000 4.000000 8.000000 12.000000
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000
A-------over
0.000000 5.000000 10.000000 15.000000
1.000000 6.000000 11.000000 16.000000
2.000000 7.000000 12.000000 17.000000
3.000000 8.000000 13.000000 18.000000
B-------over
using neon 2μs
70.000000 190.000000 310.000000 430.000000
76.000000 212.000000 348.000000 484.000000
82.000000 234.000000 386.000000 538.000000
88.000000 256.000000 424.000000 592.000000
using arm general calc 3μs
140.000000 266.000000 392.000000 518.000000
266.000000 424.000000 582.000000 740.000000
392.000000 582.000000 772.000000 962.000000
518.000000 740.000000 962.000000 1184.000000
Jelly-Pro:/data/local/tmp/neon # ./a.out
WARNING: linker: /data/local/tmp/neon/a.out: unsupported flags DT_FLAGS_1=0x8000001
0.000000 4.000000 8.000000 12.000000
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000
A-------over
0.000000 5.000000 10.000000 15.000000
1.000000 6.000000 11.000000 16.000000
2.000000 7.000000 12.000000 17.000000
3.000000 8.000000 13.000000 18.000000
B-------over
using neon 3μs
70.000000 190.000000 310.000000 430.000000
76.000000 212.000000 348.000000 484.000000
82.000000 234.000000 386.000000 538.000000
88.000000 256.000000 424.000000 592.000000
using arm general calc 3μs
70.000000 76.000000 82.000000 88.000000
190.000000 212.000000 234.000000 256.000000
310.000000 348.000000 386.000000 424.000000
430.000000 484.000000 538.000000 592.000000
Jelly-Pro:/data/local/tmp/neon #
编译选项是:
# Root of the Android NDK used for the aarch64 cross build.
NDK_ROOT=/home/android-ndk-r16b
# Cross compiler driver; used for both compiling and linking.
CC=$(NDK_ROOT)/build/toolchains/aarch64-linux-android-clang++/bin/aarch64-linux-android-g++
# Every .cpp file in the current directory is built into the binary.
SOURCES := $(shell ls ./*.cpp)
OBJS := $(patsubst %.cpp, %.o, $(SOURCES))
BIN := a.out
# Header search paths: NDK sysroot and the OpenCV Android SDK.
INCLUDES := \
    -I $(NDK_ROOT)/sysroot/usr/include/ \
    -I /data_1/songqing/tk1/3rdparty/OpenCV-android-sdk-3.1.0/sdk/native/jni/include
# -pie/-fPIE: build a position-independent executable.
CFLAGS = -O3 -march=armv8-a -pie -fPIE -ffast-math
# Alternative flag set kept for reference (not used by this build).
#CFLAGS = -O3 -march=armv8-a -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=hard -ffast-math
# OpenCV libraries plus the third-party libraries shipped with the SDK.
LIBS := \
    -pie -fPIE \
    -L /data_1/songqing/tk1/3rdparty/OpenCV-android-sdk-3.1.0/sdk/native/libs/arm64-v8a \
    -lopencv_core -lopencv_imgproc -lopencv_highgui -lopencv_imgcodecs \
    -L /data_1/songqing/tk1/3rdparty/OpenCV-android-sdk-3.1.0/sdk/native/3rdparty/libs/arm64-v8a \
    -lIlmImf -llibjasper -llibjpeg -llibpng -llibtiff -llibwebp -ltbb \
    -lm -llog -lz

# Link step.  Recipe lines must begin with a TAB character.
$(BIN): $(OBJS)
	$(CC) $(OBJS) $(LIBS) -o $(BIN)

# Compile each .cpp into the matching .o.
%.o: %.cpp
	$(CC) $(INCLUDES) $(CFLAGS) -c $< -o $@