使用WEC7的NEON内在函数功能

    对于ARM交叉编译器,GCC或RVCT能够支持内联汇编,而WINCE6的编译器是不支持的。高版本的GCC(如4.6)或RVCT(如4.0)针对Cortex-A8还增加了一种新的方式,那就是内在函数

(intrinsic),即在编写代码时引入头文件arm_neon.h,在C或C++文件中调用与NEON指令对应的内在函数,编译器会将这些函数按类似inline的方式展开,这样就简化了汇编代码的编写。举一个简单的例子如下(调用两个内在函数,第一个是向量乘法,第二个是向量与标量的乘法):

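#include <stdio.h>
#include <stdlib.h>   /* NOTE(review): header name was lost in extraction; stdlib.h is an assumption */
#include <arm_neon.h>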

//armcc -g --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c

#define c_minus_cephes_DP1 -0.78515625

typedef float32x4_t v4sf;  // vector of 4 float

/*
 * Write the four float lanes of a NEON quad-word vector to stdout in lane
 * order (0..3), each formatted with "%13.8g", enclosed in square brackets.
 * No trailing newline is emitted.
 */
void print4(float32x4_t v) {
  const float lane0 = vgetq_lane_f32(v, 0);
  const float lane1 = vgetq_lane_f32(v, 1);
  const float lane2 = vgetq_lane_f32(v, 2);
  const float lane3 = vgetq_lane_f32(v, 3);

  printf("[%13.8g, %13.8g, %13.8g, %13.8g]", lane0, lane1, lane2, lane3);
}

/*
 * Demonstrates two NEON intrinsics and prints each result with print4():
 *   - vmulq_f32   : element-wise product of two 4-lane float vectors
 *   - vmulq_n_f32 : product of a 4-lane float vector and a scalar
 *
 * All four lanes of x and y are filled with vdupq_n_f32.  The earlier brace
 * form (`v4sf x = {1};`) is not a portable way to initialise a NEON vector
 * type; the constant data in the compiler listing shows it stored as a
 * double 1.0 followed by 0 rather than as four float lanes.  The listing
 * that follows this program was generated from that earlier brace form.
 *
 * Returns 0.
 */
int main()
{
	v4sf x = vdupq_n_f32(1.0f);	/* every lane = 1.0f */
	v4sf y = vdupq_n_f32(2.0f);	/* every lane = 2.0f */
	v4sf z;				/* assigned before each use below */

	z = vmulq_f32(x,y);	/* VMUL.F32 q0,q0,q0 */
	print4(z);

	z = vmulq_n_f32(x,c_minus_cephes_DP1);	/* VMUL.F32 q0,q0,d0[0] */
	print4(z);

	return 0;
}
生成的汇编代码如下:
; generated by ARM NEON C/C++ Compiler, RVCT4.0 [Build 400]
; commandline armcc [--cpp --debug --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c]
                          ARM

                          AREA ||.text||, CODE, READONLY, ALIGN=2

                  _Z6print419__simd128_float32_t PROC ; print4(__simd128_float32_t)
000000  eeb73ae0          VCVT.F64.F32 d3,s1
000004  eeb70ac0          VCVT.F64.F32 d0,s0
000008  eeb72ae1          VCVT.F64.F32 d2,s3
00000c  eeb71ac1          VCVT.F64.F32 d1,s2
000010  e52de004          PUSH     {lr}
000014  e28f001c          ADR      r0,|L1.56|
000018  ec532b10          VMOV     r2,r3,d0
00001c  e24dd01c          SUB      sp,sp,#0x1c
000020  ed8d2b04          VSTR     d2,[sp,#0x10]
000024  ed8d1b02          VSTR     d1,[sp,#8]
000028  ed8d3b00          VSTR     d3,[sp,#0]
00002c  ebfffffe          BL       __2printf
000030  e28dd01c          ADD      sp,sp,#0x1c
000034  e49df004          POP      {pc}
                  |L1.56|
000038  5b253133          DCB      "[%13"
00003c  2e38672c          DCB      ".8g,"
000040  20253133          DCB      " %13"
000044  2e38672c          DCB      ".8g,"
000048  20253133          DCB      " %13"
00004c  2e38672c          DCB      ".8g,"
000050  20253133          DCB      " %13"
000054  2e38675d          DCB      ".8g]"
000058  00000000          DCB      "\0\0\0\0"
                          ENDP

                  main PROC
;;;14     
;;;15     int main()
00005c  e59f10c0          LDR      r1,|L1.292|
;;;16     {
000060  e52de004          PUSH     {lr}
000064  e24dd04c          SUB      sp,sp,#0x4c
;;;17     	v4sf x = {1};
000068  e891500c          LDM      r1,{r2,r3,r12,lr}
00006c  e28d0038          ADD      r0,sp,#0x38
;;;18     	v4sf y = {2};
000070  e2811010          ADD      r1,r1,#0x10
000074  e880500c          STM      r0,{r2,r3,r12,lr}     ;17
000078  e28d0028          ADD      r0,sp,#0x28
00007c  e891500c          LDM      r1,{r2,r3,r12,lr}
;;;19     	v4sf z = {3};
000080  e2811010          ADD      r1,r1,#0x10
000084  e880500c          STM      r0,{r2,r3,r12,lr}     ;18
000088  e28d0018          ADD      r0,sp,#0x18
00008c  e891500c          LDM      r1,{r2,r3,r12,lr}
000090  e880500c          STM      r0,{r2,r3,r12,lr}
000094  e24f0064          ADR      r0,|L1.56|
000098  ed9d0b0e          VLDR     d0,[sp,#0x38]
00009c  ed9d1b10          VLDR     d1,[sp,#0x40]
0000a0  ed9d2b0a          VLDR     d2,[sp,#0x28]
0000a4  ed9d3b0c          VLDR     d3,[sp,#0x30]
;;;20     
;;;21     	z = vmulq_f32(x,y);
0000a8  f3000d52          VMUL.F32 q0,q0,q1		;已展开为NEON指令
0000ac  ed8d0b06          VSTR     d0,[sp,#0x18]
0000b0  eeb73ae0          VCVT.F64.F32 d3,s1
0000b4  eeb70ac0          VCVT.F64.F32 d0,s0
0000b8  ed8d1b08          VSTR     d1,[sp,#0x20]
0000bc  eeb72ae1          VCVT.F64.F32 d2,s3
0000c0  eeb71ac1          VCVT.F64.F32 d1,s2
0000c4  ec532b10          VMOV     r2,r3,d0
0000c8  ed8d2b04          VSTR     d2,[sp,#0x10]
0000cc  ed8d1b02          VSTR     d1,[sp,#8]
0000d0  ed8d3b00          VSTR     d3,[sp,#0]
0000d4  ebfffffe          BL       __2printf
0000d8  ed9d0b0e          VLDR     d0,[sp,#0x38]
0000dc  ed9d1b10          VLDR     d1,[sp,#0x40]
;;;22     	print4(z);
;;;23     	
;;;24     	z = vmulq_n_f32(x,c_minus_cephes_DP1);
0000e0  ed9f2a10          VLDR     s4,|L1.296|
0000e4  f3a00942          VMUL.F32 q0,q0,d2[0]		;已展开为NEON指令
0000e8  e24f00b8          ADR      r0,|L1.56|
0000ec  ed8d0b06          VSTR     d0,[sp,#0x18]
0000f0  eeb73ae0          VCVT.F64.F32 d3,s1
0000f4  eeb70ac0          VCVT.F64.F32 d0,s0
0000f8  ed8d1b08          VSTR     d1,[sp,#0x20]
0000fc  eeb72ae1          VCVT.F64.F32 d2,s3
000100  eeb71ac1          VCVT.F64.F32 d1,s2
000104  ec532b10          VMOV     r2,r3,d0
000108  ed8d2b04          VSTR     d2,[sp,#0x10]
00010c  ed8d1b02          VSTR     d1,[sp,#8]
000110  ed8d3b00          VSTR     d3,[sp,#0]
000114  ebfffffe          BL       __2printf
;;;25     	print4(z);
;;;26     	
;;;27     	return 0;
000118  e28dd04c          ADD      sp,sp,#0x4c
00011c  e3a00000          MOV      r0,#0
;;;28     }
000120  e49df004          POP      {pc}
                  |L1.292|
                          DCD      ||.constdata||
                  |L1.296|
000128  bf490000          DCFS     0xbf490000 ; -0.78515625
                          ENDP


                          AREA ||.ARM.exidx||, LINKORDER=||.text||, DATA, READONLY, SECTYPE={SHT_ARM_EXIDX}, ALIGN=2

                          DCD      0x00000000
                          RELOC 42, ||.text||
                          DCD      0x00000001

                          AREA ||.constdata||, DATA, READONLY, ALIGN=3

000000  00000000
000004  3ff00000          DCFD     0x3ff0000000000000 ; 1
000008  00000000
00000c  00000000          DCFD     0x0000000000000000 ; 0
000010  00000000
000014  40000000          DCFD     0x4000000000000000 ; 2
000018  00000000
00001c  00000000          DCFD     0x0000000000000000 ; 0
000020  00000000
000024  40080000          DCFD     0x4008000000000000 ; 3
000028  00000000
00002c  00000000          DCFD     0x0000000000000000 ; 0

                  __ARM_use_no_argv EQU 0

有趣的是,偶然发现WEC7中也加入了NEON内在函数支持:头文件名称同样是arm_neon.h,函数名也基本一致。不过有些函数是没有的,像vmulq_n_f32就没有。通过对比两个编译器生成的汇编代码,发现vmulq_n_f32在WEC7中要定义成

/*
 * Supplies the vmulq_n_f32 intrinsic that the WEC7 arm_neon.h does not define.
 * 0xf3a00940 is the base encoding of VMUL.F32 Qd,Qn,Dm[x]; the RVCT listing
 * emits f3a00942 for VMUL.F32 q0,q0,d2[0].
 * _NENC_5(0) presumably encodes lane index 0 -- TODO confirm against arm_neon.h.
 * Dm is a 64-bit __n64 value here (see the __neon_QdQnDmx prototype), not a
 * float32_t scalar as in the RVCT declaration.
 */
#define vmulq_n_f32(Qn, Dm)				( __neon_QdQnDmx( 0xf3a00940 | _NENC_5(0), (Qn), (Dm)) )

在宏定义中包含的__neon_QdQnDmx是原有的内在函数,原型定义是

__n128 __neon_QdQnDmx(unsigned int, __n128, __n64);

对比RVCT中的定义

__EXTERNC __PURE __VALUE_IN_REGS float32x4_t vmulq_n_f32(float32x4_t vec1, float32_t val2); /* VMUL.F32 q0,q0,d0[0] */

RVCT的第二个参数是单字(float32_t标量),而WEC7中的第二个参数是双字(__n64,即float32x2_t),两者不一致,因此需要修改测试程序才能在WEC7下编译通过

#include "stdafx.h"
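#include <stdio.h>
#include <stdlib.h>   /* NOTE(review): header name was lost in extraction; stdlib.h is an assumption */
#include <arm_neon.h>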

//armcc -g --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c

#define c_minus_cephes_DP1 -0.78515625

typedef float32x4_t v4sf;

/*
 * Write the four float lanes of a NEON quad-word vector to stdout in lane
 * order (0..3), each formatted with "%13.8g", enclosed in square brackets
 * and followed by a newline.
 */
void print44(v4sf v) {
  const float lane0 = vgetq_lane_f32(v, 0);
  const float lane1 = vgetq_lane_f32(v, 1);
  const float lane2 = vgetq_lane_f32(v, 2);
  const float lane3 = vgetq_lane_f32(v, 3);

  printf("[%13.8g, %13.8g, %13.8g, %13.8g]\n", lane0, lane1, lane2, lane3);
}

/*
 * WEC7 version of the intrinsic test: multiplies two 4-lane float vectors
 * with vmulq_f32, then multiplies a vector by a scalar with vmulq_n_f32,
 * printing each result with print44().
 *
 * argc/argv are unused.  Returns 0.
 *
 * Lanes are written through the float members (n128_f32 / n64_f32).  The
 * earlier brace form (`v4sf x = {1};`) stored the constants as integers --
 * the disassembly shows MOV R1,#1 / MOV R3,#2 written straight into the
 * vector -- so the floating-point multiplies produced wrong results.  The
 * disassembly that follows this program was generated from that earlier
 * brace-initialised form.
 */
int _tmain(int argc, _TCHAR* argv[])
{
	v4sf x;
	v4sf y;
	v4sf z;		/* assigned before each use below */
	float32x2_t f;
	int lane;

	/* Fill every lane with a real float value: x = 1.0, y = 2.0. */
	for (lane = 0; lane < 4; ++lane) {
		x.n128_f32[lane] = 1;
		y.n128_f32[lane] = 2;
	}

	/*
	 * The WEC7 vmulq_n_f32 macro takes the scalar in a 64-bit D register and
	 * multiplies by element [0]; element [1] is set too so that no
	 * indeterminate value is passed.
	 */
	f.n64_f32[0] = c_minus_cephes_DP1;
	f.n64_f32[1] = c_minus_cephes_DP1;

	z = vmulq_f32(x, y);
	print44(z);

	z = vmulq_n_f32(x, f);
	print44(z);

	return 0;
}

生成的汇编代码如下

.text:00011070 ; int __cdecl wmain(int argc, wchar_t **argv)
.text:00011070 wmain                                   ; CODE XREF: mainCRTStartupHelper+94p
.text:00011070                                         ; DATA XREF: .pdata:00014008o
.text:00011070
.text:00011070 var_24          = -0x24
.text:00011070 var_1C          = -0x1C
.text:00011070 var_18          = -0x18
.text:00011070 var_4           = -4
.text:00011070
.text:00011070                 STR             LR, [SP,#var_4]!
.text:00011074                 VPUSH           {D8-D9}
.text:00011078                 SUB             SP, SP, #0x10
.text:0001107C
.text:0001107C $M32277
.text:0001107C                 MOV             R2, #0
.text:00011080                 STR             R2, [SP,#0x24+var_24+4]
.text:00011084                 MOV             R2, #0
.text:00011088                 MOV             R3, #2
.text:0001108C                 STR             R2, [SP,#0x24+var_1C]
.text:00011090                 STR             R2, [SP,#0x24+var_18]
.text:00011094                 STR             R3, [SP,#0x24+var_24]
.text:00011098                 LDR             R3, =__real_bf490000
.text:0001109C                 VLDMEA          SP, {D16-D17}
.text:000110A0                 MOV             R2, #0
.text:000110A4                 MOV             R1, #1
.text:000110A8                 MOV             R0, #0  ; __n128
.text:000110AC                 STR             R2, [SP,#0x24+var_1C]
.text:000110B0                 STR             R2, [SP,#0x24+var_18]
.text:000110B4                 STR             R1, [SP,#0x24+var_24]
.text:000110B8                 STR             R0, [SP,#0x24+var_24+4]
.text:000110BC                 VLDMEA          SP, {D8-D9}
.text:000110C0                 VLDR            S4, [R3]
.text:000110C4                 VMUL.F32        Q0, Q4, Q8 ; __n128	;已展开为NEON指令
.text:000110C8                 VSTR            S4, [SP,#0x24+var_24]
.text:000110CC                 BL              print44
.text:000110D0                 VLDR            D0, [SP,#0x24+var_24]
.text:000110D4                 VMUL.F32        Q0, Q4, D0[0] ; __n128	;已展开为NEON指令
.text:000110D8                 BL              print44
.text:000110DC                 MOV             R0, #0
.text:000110E0                 ADD             SP, SP, #0x10
.text:000110E4                 VPOP            {D8-D9}
.text:000110E8                 LDR             PC, [SP+4+var_4],#4
.text:000110E8 ; ---------------------------------------------------------------------------
.text:000110EC ; unsigned int off_110EC
.text:000110EC off_110EC       DCD __real_bf490000     ; DATA XREF: wmain+28r
.text:000110EC ; End of function wmain

对比两个编译器生成的汇编代码,发现在内在函数的展开上RVCT使用的寄存器更少,生成的总指令数也更少。相应的在优化程度上也优于WEC7。
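补充:在WEC7下运行测试程序时,发现vmulq_n_f32的计算结果完全不对。原因是NEON指令执行的是浮点运算,而从上面的汇编代码可以看到,{1}、{2}这种初始化方式是把整数1、2(MOV R1, #1 / MOV R3, #2)直接写入了向量的低位,并没有转换成浮点数1.0、2.0。将x、y、z的赋值方式改为按浮点成员逐个元素赋值(如下所示,y、z同理)后,结果正常: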
x.n128_f32[0] = 1;
x.n128_f32[1] = 1;
x.n128_f32[2] = 1;
x.n128_f32[3] = 1;

你可能感兴趣的:(使用WEC7的NEON内在函数功能)