对于ARM交叉编译器,GCC或RVCT能够支持内联汇编,而WINCE6是不支持的。在高版本的GCC(如4.6)或RVCT(如4.0)中,针对Cortex-A8还增加了一种新的方式,那就是内在函数
(intrinsic),即在编写代码时引入头文件arm_neon.h,在C或C++文件中调用与NEON指令对应的内在函数,编译器会将这些函数按类似inline的方式展开,这样就简化了汇编代码的编写。下面举一个简单的例子(调用两个内在函数,第一个是向量乘法,第二个是向量与标量的乘法):
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
//armcc -g --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c
/* Scalar factor used by the vmulq_n_f32 demo. It is parenthesised and carries
   an 'f' suffix so that it is a float constant rather than a double; the value
   is exactly representable as a float. */
#define c_minus_cephes_DP1 (-0.78515625f)
typedef float32x4_t v4sf; // shorthand for the NEON vector of four 32-bit floats
/* Print the four float lanes of v as "[a, b, c, d]", without a trailing newline. */
void print4(float32x4_t v) {
    /* The lane index must be a compile-time constant, so each lane is read explicitly. */
    const float lane0 = vgetq_lane_f32(v, 0);
    const float lane1 = vgetq_lane_f32(v, 1);
    const float lane2 = vgetq_lane_f32(v, 2);
    const float lane3 = vgetq_lane_f32(v, 3);

    printf("[%13.8g, %13.8g, %13.8g, %13.8g]", lane0, lane1, lane2, lane3);
}
/*
 * Demo entry point: multiplies two NEON vectors lane by lane, then scales a
 * vector by a scalar, printing each result with print4().
 * Returns 0.
 */
int main(void)
{
    /* Fill every lane with vdupq_n_f32. A brace initialiser such as `= {1}`
       is not a portable way to set float lanes: in the RVCT listing below it
       is emitted as 64-bit double constants copied into the vector. */
    v4sf x = vdupq_n_f32(1.0f);
    v4sf y = vdupq_n_f32(2.0f);
    v4sf z;

    z = vmulq_f32(x, y);                    /* vector * vector */
    print4(z);

    z = vmulq_n_f32(x, c_minus_cephes_DP1); /* vector * scalar */
    print4(z);

    return 0;
}
生成的汇编代码如下:
; generated by ARM NEON C/C++ Compiler, RVCT4.0 [Build 400]
; commandline armcc [--cpp --debug --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c]
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
_Z6print419__simd128_float32_t PROC ; print4(__simd128_float32_t)
000000 eeb73ae0 VCVT.F64.F32 d3,s1
000004 eeb70ac0 VCVT.F64.F32 d0,s0
000008 eeb72ae1 VCVT.F64.F32 d2,s3
00000c eeb71ac1 VCVT.F64.F32 d1,s2
000010 e52de004 PUSH {lr}
000014 e28f001c ADR r0,|L1.56|
000018 ec532b10 VMOV r2,r3,d0
00001c e24dd01c SUB sp,sp,#0x1c
000020 ed8d2b04 VSTR d2,[sp,#0x10]
000024 ed8d1b02 VSTR d1,[sp,#8]
000028 ed8d3b00 VSTR d3,[sp,#0]
00002c ebfffffe BL __2printf
000030 e28dd01c ADD sp,sp,#0x1c
000034 e49df004 POP {pc}
|L1.56|
000038 5b253133 DCB "[%13"
00003c 2e38672c DCB ".8g,"
000040 20253133 DCB " %13"
000044 2e38672c DCB ".8g,"
000048 20253133 DCB " %13"
00004c 2e38672c DCB ".8g,"
000050 20253133 DCB " %13"
000054 2e38675d DCB ".8g]"
000058 00000000 DCB "\0\0\0\0"
ENDP
main PROC
;;;14
;;;15 int main()
00005c e59f10c0 LDR r1,|L1.292|
;;;16 {
000060 e52de004 PUSH {lr}
000064 e24dd04c SUB sp,sp,#0x4c
;;;17 v4sf x = {1};
000068 e891500c LDM r1,{r2,r3,r12,lr}
00006c e28d0038 ADD r0,sp,#0x38
;;;18 v4sf y = {2};
000070 e2811010 ADD r1,r1,#0x10
000074 e880500c STM r0,{r2,r3,r12,lr} ;17
000078 e28d0028 ADD r0,sp,#0x28
00007c e891500c LDM r1,{r2,r3,r12,lr}
;;;19 v4sf z = {3};
000080 e2811010 ADD r1,r1,#0x10
000084 e880500c STM r0,{r2,r3,r12,lr} ;18
000088 e28d0018 ADD r0,sp,#0x18
00008c e891500c LDM r1,{r2,r3,r12,lr}
000090 e880500c STM r0,{r2,r3,r12,lr}
000094 e24f0064 ADR r0,|L1.56|
000098 ed9d0b0e VLDR d0,[sp,#0x38]
00009c ed9d1b10 VLDR d1,[sp,#0x40]
0000a0 ed9d2b0a VLDR d2,[sp,#0x28]
0000a4 ed9d3b0c VLDR d3,[sp,#0x30]
;;;20
;;;21 z = vmulq_f32(x,y);
0000a8 f3000d52 VMUL.F32 q0,q0,q1 ;已展开为NEON指令
0000ac ed8d0b06 VSTR d0,[sp,#0x18]
0000b0 eeb73ae0 VCVT.F64.F32 d3,s1
0000b4 eeb70ac0 VCVT.F64.F32 d0,s0
0000b8 ed8d1b08 VSTR d1,[sp,#0x20]
0000bc eeb72ae1 VCVT.F64.F32 d2,s3
0000c0 eeb71ac1 VCVT.F64.F32 d1,s2
0000c4 ec532b10 VMOV r2,r3,d0
0000c8 ed8d2b04 VSTR d2,[sp,#0x10]
0000cc ed8d1b02 VSTR d1,[sp,#8]
0000d0 ed8d3b00 VSTR d3,[sp,#0]
0000d4 ebfffffe BL __2printf
0000d8 ed9d0b0e VLDR d0,[sp,#0x38]
0000dc ed9d1b10 VLDR d1,[sp,#0x40]
;;;22 print4(z);
;;;23
;;;24 z = vmulq_n_f32(x,c_minus_cephes_DP1);
0000e0 ed9f2a10 VLDR s4,|L1.296|
0000e4 f3a00942 VMUL.F32 q0,q0,d2[0] ;已展开为NEON指令
0000e8 e24f00b8 ADR r0,|L1.56|
0000ec ed8d0b06 VSTR d0,[sp,#0x18]
0000f0 eeb73ae0 VCVT.F64.F32 d3,s1
0000f4 eeb70ac0 VCVT.F64.F32 d0,s0
0000f8 ed8d1b08 VSTR d1,[sp,#0x20]
0000fc eeb72ae1 VCVT.F64.F32 d2,s3
000100 eeb71ac1 VCVT.F64.F32 d1,s2
000104 ec532b10 VMOV r2,r3,d0
000108 ed8d2b04 VSTR d2,[sp,#0x10]
00010c ed8d1b02 VSTR d1,[sp,#8]
000110 ed8d3b00 VSTR d3,[sp,#0]
000114 ebfffffe BL __2printf
;;;25 print4(z);
;;;26
;;;27 return 0;
000118 e28dd04c ADD sp,sp,#0x4c
00011c e3a00000 MOV r0,#0
;;;28 }
000120 e49df004 POP {pc}
|L1.292|
DCD ||.constdata||
|L1.296|
000128 bf490000 DCFS 0xbf490000 ; -0.78515625
ENDP
AREA ||.ARM.exidx||, LINKORDER=||.text||, DATA, READONLY, SECTYPE={SHT_ARM_EXIDX}, ALIGN=2
DCD 0x00000000
RELOC 42, ||.text||
DCD 0x00000001
AREA ||.constdata||, DATA, READONLY, ALIGN=3
000000 00000000
000004 3ff00000 DCFD 0x3ff0000000000000 ; 1
000008 00000000
00000c 00000000 DCFD 0x0000000000000000 ; 0
000010 00000000
000014 40000000 DCFD 0x4000000000000000 ; 2
000018 00000000
00001c 00000000 DCFD 0x0000000000000000 ; 0
000020 00000000
000024 40080000 DCFD 0x4008000000000000 ; 3
000028 00000000
00002c 00000000 DCFD 0x0000000000000000 ; 0
__ARM_use_no_argv EQU 0
有趣的是,偶然发现WEC7中也加入了NEON内在函数支持:头文件名称也是arm_neon.h,函数名也基本一样。不过有些函数是没有的,像vmulq_n_f32就没有。通过对比两个编译器生成的汇编代码,发现vmulq_n_f32在WEC7中可以定义成
/* WEC7 substitute for the missing vmulq_n_f32 intrinsic.
   0xf3a00940 corresponds to the VMUL.F32 Qd,Qn,Dm[x] opcode in the RVCT
   listing above (f3a00942 for q0,q0,d2[0]) with the register fields cleared.
   NOTE(review): _NENC_5(0) presumably encodes lane index 0 - confirm against
   the WEC7 arm_neon.h. Dm is a 64-bit vector (__n64), not a float scalar. */
#define vmulq_n_f32(Qn, Dm) ( __neon_QdQnDmx( 0xf3a00940 | _NENC_5(0), (Qn), (Dm)) )
在宏定义中包含的__neon_QdQnDmx是原有的内在函数,原型定义是
__n128 __neon_QdQnDmx(unsigned int, __n128, __n64);
对比RVCT中的定义
__EXTERNC __PURE __VALUE_IN_REGS float32x4_t vmulq_n_f32(float32x4_t vec1, float32_t val2); /* VMUL.F32 q0,q0,d0[0] */
RVCT的第二个参数是单字,即一个32位浮点标量(float32_t),而WEC7中的第二个参数是双字,即一个64位向量(__n64),两者不一致,因此需要修改测试程序才能在WEC7下编译通过:
#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
//armcc -g --asm --interleave --cpu=Cortex-A8 -O3 -Otime --vectorize itest.c
/* Scalar factor used by the vmulq_n_f32 demo. It is parenthesised and carries
   an 'f' suffix so that it is a float constant rather than a double, which
   avoids a double-to-float narrowing when it is stored into a float lane. */
#define c_minus_cephes_DP1 (-0.78515625f)
typedef float32x4_t v4sf; // shorthand for the four-lane float vector type
/* Print the four float lanes of v as "[a, b, c, d]" followed by a newline. */
void print44(v4sf v) {
    /* The lane index must be a compile-time constant, so each lane is read explicitly. */
    const float lane0 = vgetq_lane_f32(v, 0);
    const float lane1 = vgetq_lane_f32(v, 1);
    const float lane2 = vgetq_lane_f32(v, 2);
    const float lane3 = vgetq_lane_f32(v, 3);

    printf("[%13.8g, %13.8g, %13.8g, %13.8g]\n", lane0, lane1, lane2, lane3);
}
/*
 * Demo entry point (WEC7 build): multiplies two NEON vectors lane by lane,
 * then scales a vector by the scalar held in lane 0 of a 64-bit vector,
 * printing each result with print44().
 * argc and argv are unused. Returns 0.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    v4sf x;
    v4sf y;
    v4sf z;
    float32x2_t f;
    int lane;

    /* Write the float lanes explicitly. With a brace initialiser such as
       `= {1}` this compiler stored the integer 1 instead of 1.0f, which made
       the results wrong. */
    for (lane = 0; lane < 4; ++lane) {
        x.n128_f32[lane] = 1.0f;
        y.n128_f32[lane] = 2.0f;
    }

    /* The vmulq_n_f32 macro takes a 64-bit vector; the generated instruction
       multiplies by D0[0], so the scalar goes in lane 0. Lane 1 is zeroed so
       that no indeterminate value is passed along. */
    f.n64_f32[0] = c_minus_cephes_DP1;
    f.n64_f32[1] = 0.0f;

    z = vmulq_f32(x, y);     /* vector * vector */
    print44(z);

    z = vmulq_n_f32(x, f);   /* vector * scalar in lane 0 of f */
    print44(z);

    return 0;
}
生成的汇编代码如下
.text:00011070 ; int __cdecl wmain(int argc, wchar_t **argv)
.text:00011070 wmain ; CODE XREF: mainCRTStartupHelper+94p
.text:00011070 ; DATA XREF: .pdata:00014008o
.text:00011070
.text:00011070 var_24 = -0x24
.text:00011070 var_1C = -0x1C
.text:00011070 var_18 = -0x18
.text:00011070 var_4 = -4
.text:00011070
.text:00011070 STR LR, [SP,#var_4]!
.text:00011074 VPUSH {D8-D9}
.text:00011078 SUB SP, SP, #0x10
.text:0001107C
.text:0001107C $M32277
.text:0001107C MOV R2, #0
.text:00011080 STR R2, [SP,#0x24+var_24+4]
.text:00011084 MOV R2, #0
.text:00011088 MOV R3, #2
.text:0001108C STR R2, [SP,#0x24+var_1C]
.text:00011090 STR R2, [SP,#0x24+var_18]
.text:00011094 STR R3, [SP,#0x24+var_24]
.text:00011098 LDR R3, =__real_bf490000
.text:0001109C VLDMEA SP, {D16-D17}
.text:000110A0 MOV R2, #0
.text:000110A4 MOV R1, #1
.text:000110A8 MOV R0, #0 ; __n128
.text:000110AC STR R2, [SP,#0x24+var_1C]
.text:000110B0 STR R2, [SP,#0x24+var_18]
.text:000110B4 STR R1, [SP,#0x24+var_24]
.text:000110B8 STR R0, [SP,#0x24+var_24+4]
.text:000110BC VLDMEA SP, {D8-D9}
.text:000110C0 VLDR S4, [R3]
.text:000110C4 VMUL.F32 Q0, Q4, Q8 ; __n128 ;已展开为NEON指令
.text:000110C8 VSTR S4, [SP,#0x24+var_24]
.text:000110CC BL print44
.text:000110D0 VLDR D0, [SP,#0x24+var_24]
.text:000110D4 VMUL.F32 Q0, Q4, D0[0] ; __n128 ;已展开为NEON指令
.text:000110D8 BL print44
.text:000110DC MOV R0, #0
.text:000110E0 ADD SP, SP, #0x10
.text:000110E4 VPOP {D8-D9}
.text:000110E8 LDR PC, [SP+4+var_4],#4
.text:000110E8 ; ---------------------------------------------------------------------------
.text:000110EC ; unsigned int off_110EC
.text:000110EC off_110EC DCD __real_bf490000 ; DATA XREF: wmain+28r
.text:000110EC ; End of function wmain
对比两个编译器生成的汇编代码,发现在内在函数的展开上RVCT使用的寄存器更少,生成的总指令数也更少。相应地,在优化程度上也优于WEC7。
补充:在WEC7下运行测试程序时,发现vmulq_n_f32的计算结果完全不对。考虑到它执行的是浮点运算,而从上面的汇编可以看到,v4sf x = {1};这样的初始化实际存入的是整数1(MOV R1, #1),而不是浮点数1.0,于是将x,y,z的赋值方式修改为如下按浮点成员逐个赋值的形式,之后结果正常:
x.n128_f32[0] = 1;
x.n128_f32[1] = 1;
x.n128_f32[2] = 1;
x.n128_f32[3] = 1;