对于ARM交叉编译器,GCC或RVCT能够支持内联汇编,而WINCE6是不支持的。高版本的GCC(如4.6)或RVCT(如4.0)针对Cortex-A8还增加了新的方式,那就是内在函数
(intrinsic),即在代码编写时通过引入头文件arm_neon.h,在C或C++文件中加入相关NEON指令的内在函数,编译器会将此函数按类似inline的方式进行展开,这样就简化了汇编代码的编写。举一个简单的例子如下(调用两个内在函数,第一个是向量乘法,第二个是向量与标量的乘法):
/*
 * NEON intrinsic demo for the RVCT (armcc) build.
 *
 * Calls two NEON intrinsics -- vmulq_f32 (vector * vector) and
 * vmulq_n_f32 (vector * scalar) -- and prints the four float lanes of
 * each result.
 *
 * The program is laid out one statement per line: each preprocessor
 * directive needs its own line, and the `//` build-command comment below
 * must end at its line break so it does not swallow the code after it.
 */
#include <arm_neon.h>
#include <stdio.h>
#include <stdlib.h>

//armcc -g --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c

/* Scalar multiplier used in the vmulq_n_f32 call. */
#define c_minus_cephes_DP1 -0.78515625

typedef float32x4_t v4sf; // vector of 4 float

/*
 * Print the four float lanes of v as "[a, b, c, d]".
 * No newline is written after the closing bracket.
 */
void print4(float32x4_t v)
{
    printf("[%13.8g, %13.8g, %13.8g, %13.8g]",
           vgetq_lane_f32(v,0), vgetq_lane_f32(v, 1),
           vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
}

/*
 * Entry point: multiplies x by y, prints the product, then multiplies x by
 * the scalar c_minus_cephes_DP1 and prints that product. Always returns 0.
 */
int main()
{
    /*
     * NOTE(review): the .constdata section of the RVCT listing for this
     * program holds these initializers as 64-bit doubles (1, 0 / 2, 0 /
     * 3, 0), so `{1}` may not yield the float lanes one would expect --
     * confirm before relying on the printed values.
     */
    v4sf x = {1};
    v4sf y = {2};
    v4sf z = {3}; /* initial value is overwritten before z is first read */

    z = vmulq_f32(x,y); /* VMUL.F32 q0,q0,q0 */
    print4(z);

    z = vmulq_n_f32(x,c_minus_cephes_DP1); /* VMUL.F32 q0,q0,d0[0] */
    print4(z);

    return 0;
}
生成的汇编代码如下:
; generated by ARM NEON C/C++ Compiler, RVCT4.0 [Build 400] ; commandline armcc [--cpp --debug --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c] ARM AREA ||.text||, CODE, READONLY, ALIGN=2 _Z6print419__simd128_float32_t PROC ; print4(__simd128_float32_t) 000000 eeb73ae0 VCVT.F64.F32 d3,s1 000004 eeb70ac0 VCVT.F64.F32 d0,s0 000008 eeb72ae1 VCVT.F64.F32 d2,s3 00000c eeb71ac1 VCVT.F64.F32 d1,s2 000010 e52de004 PUSH {lr} 000014 e28f001c ADR r0,|L1.56| 000018 ec532b10 VMOV r2,r3,d0 00001c e24dd01c SUB sp,sp,#0x1c 000020 ed8d2b04 VSTR d2,[sp,#0x10] 000024 ed8d1b02 VSTR d1,[sp,#8] 000028 ed8d3b00 VSTR d3,[sp,#0] 00002c ebfffffe BL __2printf 000030 e28dd01c ADD sp,sp,#0x1c 000034 e49df004 POP {pc} |L1.56| 000038 5b253133 DCB "[%13" 00003c 2e38672c DCB ".8g," 000040 20253133 DCB " %13" 000044 2e38672c DCB ".8g," 000048 20253133 DCB " %13" 00004c 2e38672c DCB ".8g," 000050 20253133 DCB " %13" 000054 2e38675d DCB ".8g]" 000058 00000000 DCB "\0\0\0\0" ENDP main PROC ;;;14 ;;;15 int main() 00005c e59f10c0 LDR r1,|L1.292| ;;;16 { 000060 e52de004 PUSH {lr} 000064 e24dd04c SUB sp,sp,#0x4c ;;;17 v4sf x = {1}; 000068 e891500c LDM r1,{r2,r3,r12,lr} 00006c e28d0038 ADD r0,sp,#0x38 ;;;18 v4sf y = {2}; 000070 e2811010 ADD r1,r1,#0x10 000074 e880500c STM r0,{r2,r3,r12,lr} ;17 000078 e28d0028 ADD r0,sp,#0x28 00007c e891500c LDM r1,{r2,r3,r12,lr} ;;;19 v4sf z = {3}; 000080 e2811010 ADD r1,r1,#0x10 000084 e880500c STM r0,{r2,r3,r12,lr} ;18 000088 e28d0018 ADD r0,sp,#0x18 00008c e891500c LDM r1,{r2,r3,r12,lr} 000090 e880500c STM r0,{r2,r3,r12,lr} 000094 e24f0064 ADR r0,|L1.56| 000098 ed9d0b0e VLDR d0,[sp,#0x38] 00009c ed9d1b10 VLDR d1,[sp,#0x40] 0000a0 ed9d2b0a VLDR d2,[sp,#0x28] 0000a4 ed9d3b0c VLDR d3,[sp,#0x30] ;;;20 ;;;21 z = vmulq_f32(x,y); 0000a8 f3000d52 VMUL.F32 q0,q0,q1 ;已展开为NEON指令 0000ac ed8d0b06 VSTR d0,[sp,#0x18] 0000b0 eeb73ae0 VCVT.F64.F32 d3,s1 0000b4 eeb70ac0 VCVT.F64.F32 d0,s0 0000b8 ed8d1b08 VSTR d1,[sp,#0x20] 0000bc eeb72ae1 VCVT.F64.F32 d2,s3 0000c0 
eeb71ac1 VCVT.F64.F32 d1,s2 0000c4 ec532b10 VMOV r2,r3,d0 0000c8 ed8d2b04 VSTR d2,[sp,#0x10] 0000cc ed8d1b02 VSTR d1,[sp,#8] 0000d0 ed8d3b00 VSTR d3,[sp,#0] 0000d4 ebfffffe BL __2printf 0000d8 ed9d0b0e VLDR d0,[sp,#0x38] 0000dc ed9d1b10 VLDR d1,[sp,#0x40] ;;;22 print4(z); ;;;23 ;;;24 z = vmulq_n_f32(x,c_minus_cephes_DP1); 0000e0 ed9f2a10 VLDR s4,|L1.296| 0000e4 f3a00942 VMUL.F32 q0,q0,d2[0] ;已展开为NEON指令 0000e8 e24f00b8 ADR r0,|L1.56| 0000ec ed8d0b06 VSTR d0,[sp,#0x18] 0000f0 eeb73ae0 VCVT.F64.F32 d3,s1 0000f4 eeb70ac0 VCVT.F64.F32 d0,s0 0000f8 ed8d1b08 VSTR d1,[sp,#0x20] 0000fc eeb72ae1 VCVT.F64.F32 d2,s3 000100 eeb71ac1 VCVT.F64.F32 d1,s2 000104 ec532b10 VMOV r2,r3,d0 000108 ed8d2b04 VSTR d2,[sp,#0x10] 00010c ed8d1b02 VSTR d1,[sp,#8] 000110 ed8d3b00 VSTR d3,[sp,#0] 000114 ebfffffe BL __2printf ;;;25 print4(z); ;;;26 ;;;27 return 0; 000118 e28dd04c ADD sp,sp,#0x4c 00011c e3a00000 MOV r0,#0 ;;;28 } 000120 e49df004 POP {pc} |L1.292| DCD ||.constdata|| |L1.296| 000128 bf490000 DCFS 0xbf490000 ; -0.78515625 ENDP AREA ||.ARM.exidx||, LINKORDER=||.text||, DATA, READONLY, SECTYPE={SHT_ARM_EXIDX}, ALIGN=2 DCD 0x00000000 RELOC 42, ||.text|| DCD 0x00000001 AREA ||.constdata||, DATA, READONLY, ALIGN=3 000000 00000000 000004 3ff00000 DCFD 0x3ff0000000000000 ; 1 000008 00000000 00000c 00000000 DCFD 0x0000000000000000 ; 0 000010 00000000 000014 40000000 DCFD 0x4000000000000000 ; 2 000018 00000000 00001c 00000000 DCFD 0x0000000000000000 ; 0 000020 00000000 000024 40080000 DCFD 0x4008000000000000 ; 3 000028 00000000 00002c 00000000 DCFD 0x0000000000000000 ; 0 __ARM_use_no_argv EQU 0
有趣的是,在WEC7中偶然发现也加入了NEON内在函数支持:头文件名称也是arm_neon.h,函数名也基本一样。不过有些函数是没有的,像vmulq_n_f32就没有。通过对比两个编译器生成的汇编代码,发现vmulq_n_f32在WEC7中可以定义成:
/*
 * WEC7 stand-in for the vmulq_n_f32 intrinsic, which the WEC7 arm_neon.h
 * does not provide. It expands to the compiler intrinsic __neon_QdQnDmx.
 *
 *   Qn - 128-bit vector operand.
 *   Dm - 64-bit vector operand; note this is NOT a plain float32_t scalar
 *        as in the RVCT declaration of vmulq_n_f32.
 *
 * The constant 0xf3a00940 was chosen by comparing with the code RVCT emits
 * for this intrinsic (f3a00942 = VMUL.F32 q0,q0,d2[0]).
 * NOTE(review): _NENC_5(0) presumably encodes lane index 0 of Dm -- confirm
 * against the WEC7 arm_neon.h.
 */
#define vmulq_n_f32(Qn, Dm) ( __neon_QdQnDmx( 0xf3a00940 | _NENC_5(0), (Qn), (Dm)) )
在宏定义中包含的__neon_QdQnDmx是原有的内在函数,原型定义是
/*
 * Prototype of the WEC7 compiler intrinsic used by the vmulq_n_f32 macro.
 * Arguments, in order: an unsigned int (the macro passes the instruction
 * encoding here), a 128-bit vector (__n128) and a 64-bit vector (__n64).
 * Returns a 128-bit vector.
 */
__n128 __neon_QdQnDmx(unsigned int, __n128, __n64);
对比RVCT中的定义
/*
 * RVCT declaration of vmulq_n_f32: takes a vector of four floats (vec1) and
 * a single float32_t value (val2) and returns a vector of four floats.
 * The trailing comment names the instruction form it corresponds to.
 */
__EXTERNC __PURE __VALUE_IN_REGS float32x4_t vmulq_n_f32(float32x4_t vec1, float32_t val2); /* VMUL.F32 q0,q0,d0[0] */
RVCT的第二个参数是单个32位浮点标量(float32_t),而WEC7中的第二个参数是64位向量(__n64),两者不一致,因此需要修改测试程序才能在WEC7下编译通过:
/*
 * NEON intrinsic demo adapted for the WEC7 compiler.
 *
 * Same flow as the RVCT version: multiply x by y with vmulq_f32, print the
 * result, multiply x by a scalar with vmulq_n_f32, print the result.
 * Here the scalar is passed inside a float32x2_t, because the WEC7
 * vmulq_n_f32 macro takes a 64-bit vector as its second argument.
 *
 * The program is laid out one statement per line: each preprocessor
 * directive needs its own line, and the `//` build-command comment below
 * must end at its line break so it does not swallow the code after it.
 */
#include "stdafx.h"
#include <arm_neon.h>
#include <stdio.h>
#include <stdlib.h>

//armcc -g --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c

/* Scalar multiplier used in the vmulq_n_f32 call. */
#define c_minus_cephes_DP1 -0.78515625

typedef float32x4_t v4sf;

/*
 * Print the four float lanes of v as "[a, b, c, d]" followed by a newline.
 */
void print44(v4sf v)
{
    printf("[%13.8g, %13.8g, %13.8g, %13.8g]\n",
           vgetq_lane_f32(v,0), vgetq_lane_f32(v, 1),
           vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
}

/*
 * Entry point. argc/argv are not used. Always returns 0.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    /*
     * NOTE(review): in the WEC7 disassembly of this program the integers 1
     * and 2 are stored into the first word of x and y, so `{1}` does not
     * appear to produce 1.0f here; the article's addendum switches to
     * per-lane n128_f32 assignments for that reason.
     */
    v4sf x = {1};
    v4sf y = {2};
    v4sf z = {3}; /* initial value is overwritten before z is first read */

    /* Only element 0 of f is assigned; the other element is left unset. */
    float32x2_t f ;
    f.n64_f32[0] = c_minus_cephes_DP1;

    z = vmulq_f32(x, y);
    print44(z);

    z = vmulq_n_f32(x, f);
    print44(z);

    return 0;
}
生成的汇编代码如下
.text:00011070 ; int __cdecl wmain(int argc, wchar_t **argv) .text:00011070 wmain ; CODE XREF: mainCRTStartupHelper+94p .text:00011070 ; DATA XREF: .pdata:00014008o .text:00011070 .text:00011070 var_24 = -0x24 .text:00011070 var_1C = -0x1C .text:00011070 var_18 = -0x18 .text:00011070 var_4 = -4 .text:00011070 .text:00011070 STR LR, [SP,#var_4]! .text:00011074 VPUSH {D8-D9} .text:00011078 SUB SP, SP, #0x10 .text:0001107C .text:0001107C $M32277 .text:0001107C MOV R2, #0 .text:00011080 STR R2, [SP,#0x24+var_24+4] .text:00011084 MOV R2, #0 .text:00011088 MOV R3, #2 .text:0001108C STR R2, [SP,#0x24+var_1C] .text:00011090 STR R2, [SP,#0x24+var_18] .text:00011094 STR R3, [SP,#0x24+var_24] .text:00011098 LDR R3, =__real_bf490000 .text:0001109C VLDMEA SP, {D16-D17} .text:000110A0 MOV R2, #0 .text:000110A4 MOV R1, #1 .text:000110A8 MOV R0, #0 ; __n128 .text:000110AC STR R2, [SP,#0x24+var_1C] .text:000110B0 STR R2, [SP,#0x24+var_18] .text:000110B4 STR R1, [SP,#0x24+var_24] .text:000110B8 STR R0, [SP,#0x24+var_24+4] .text:000110BC VLDMEA SP, {D8-D9} .text:000110C0 VLDR S4, [R3] .text:000110C4 VMUL.F32 Q0, Q4, Q8 ; __n128 ;已展开为NEON指令 .text:000110C8 VSTR S4, [SP,#0x24+var_24] .text:000110CC BL print44 .text:000110D0 VLDR D0, [SP,#0x24+var_24] .text:000110D4 VMUL.F32 Q0, Q4, D0[0] ; __n128 ;已展开为NEON指令 .text:000110D8 BL print44 .text:000110DC MOV R0, #0 .text:000110E0 ADD SP, SP, #0x10 .text:000110E4 VPOP {D8-D9} .text:000110E8 LDR PC, [SP+4+var_4],#4 .text:000110E8 ; --------------------------------------------------------------------------- .text:000110EC ; unsigned int off_110EC .text:000110EC off_110EC DCD __real_bf490000 ; DATA XREF: wmain+28r .text:000110EC ; End of function wmain
对比两个编译器生成的汇编代码,发现在内在函数的展开上RVCT使用的寄存器更少,生成的总指令数也更少,相应地在优化程度上也优于WEC7。
补充:在WEC7下运行测试程序时,发现vmulq_n_f32的计算结果完全不对。考虑到它执行的是浮点运算,而从上面的反汇编可以看到 v4sf x = {1} 这种初始化写入内存的是整数1(MOV R1, #1 后 STR),并不是浮点数1.0,因此将x,y,z的赋值方式修改为如下形式(以x为例)后结果正常:
/*
 * Assign each of the four float elements of x individually through the
 * n128_f32 member, instead of using the `{1}` brace initializer.
 * x is declared outside this fragment.
 */
x.n128_f32[0] = 1;
x.n128_f32[1] = 1;
x.n128_f32[2] = 1;
x.n128_f32[3] = 1;