# ----------------------------------------------------------------------
# uint64_t mp_mod_384 (uint64_t r[6], uint64_t a[12])
#
# Reduces the 768-bit integer a[] modulo the prime
#     p384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
# and stores the 384-bit result (six 64-bit limbs, least significant
# limb first) in r[].
#
# Syntax : AT&T (GAS), ELF
# ABI    : System V AMD64
# In     : rdi = r[]  (output, 48 bytes, any alignment)
#          rsi = a[]  (input,  96 bytes, any alignment)
# Out    : rax = 0 (always), rdx = 0
# Clobb  : rax, rcx, rdx, rsi, r8-r11, xmm0-xmm8, xmm14, xmm15, flags
#          r12-r15 are used as scratch but are saved in xmm14/xmm15 and
#          restored before returning.
# Stack  : none (no memory is touched except a[] and r[])
# ISA    : pinsrq / pextrq / pinsrd / pextrd are used, i.e. SSE4.1
#
# Notation used in the diagrams below:
#   aNN is the NN-th 32-bit word of a[] (a00 = least significant).
#   A row such as |a21|a22| 0 |a20| lists 32-bit words from the least
#   significant position on the left; (+) / (-) / (=) tells whether the
#   row is added to, subtracted from, or assigned to the accumulator.
#
# Accumulator layout: r10 (lowest) .. r15 hold six 64-bit limbs and rdx
# is a seventh limb that collects carries and borrows.
#
# NOTE(review): the final reduction loop runs a value-dependent number
# of iterations, so the routine is not constant time.
# ----------------------------------------------------------------------
        .file   "mp_mod_384.c"
        .text
        .p2align 4,,15
        .globl  mp_mod_384
        .type   mp_mod_384, @function
mp_mod_384:
.LFB0:
        .cfi_startproc

        # load (xmm0 ~ xmm5) = (a[0] ~ a[11])
        # movdqu instead of movdqa: a uint64_t array is only guaranteed
        # to be 8-byte aligned, and movdqa would fault on such input.
        movdqu  (%rsi), %xmm0
        movdqu  16(%rsi), %xmm1
        movdqu  32(%rsi), %xmm2
        movdqu  48(%rsi), %xmm3
        movdqu  64(%rsi), %xmm4
        movdqu  80(%rsi), %xmm5

        # backup (r12, r13, r14, r15): callee-saved under System V, kept
        # in xmm14 / xmm15 so that the stack is not needed
        movq    %r12, %xmm14
        movq    %r14, %xmm15
        pinsrq  $1, %r13, %xmm14
        pinsrq  $1, %r15, %xmm15

        # init: upper accumulator limbs and the overflow limb start at 0
        xorq    %r14, %r14
        xorq    %r15, %r15
        xorq    %rdx, %rdx

        # xmm6 = |a21|a22|a20|a23|
        # rsi  = | 0 |a20|
        # r8   = |a21|a22|
        # r9   = |a23<<1 |
        # (rsi is free for scratch use: a[] is fully loaded by now)
        pshufd  $0xc9, %xmm5, %xmm6
        pextrd  $3, %xmm5, %r9d
        movd    %xmm5, %esi
        movq    %xmm6, %r8
        shlq    $1, %r9
        shlq    $32, %rsi

        # |a21|a22| 0 |a20|a21|a22|a23<<1 |(=)
        movq    %r8, %r10
        movq    %rsi, %r11
        movq    %r8, %r12
        movq    %r9, %r13

        # The next four groups work on r10..r13 only; carries and
        # borrows out of r13 are deliberately dropped (arithmetic
        # modulo 2^256).  The combined value of these rows is
        # non-negative and below 2^256, so nothing is lost once the
        # groups are complete.

        # | 0 |a20|a21|a22|a23<<1 | 0 | 0 |(-)
        subq    %rsi, %r10
        sbbq    %r8, %r11
        sbbq    %r9, %r12
        sbbq    $0, %r13

        # rsi = |a20|a23|
        pextrq  $1, %xmm6, %rsi

        # | 0 | 0 |a20|a23| 0 | 0 | 0 | 0 |(-)
        subq    %rsi, %r11
        sbbq    $0, %r12
        sbbq    $0, %r13

        # | 0 | 0 | 0 | 0 |a21|a22| 0 | 0 |(+)
        addq    %r8, %r12
        adcq    $0, %r13

        # r8 = |a20|a21|
        # r9 = |a22|a23|
        movq    %xmm5, %r8
        pextrq  $1, %xmm5, %r9

        # |a20|a23|a20|a21|a22|a23| 0 | 0 |(+)
        addq    %rsi, %r10
        adcq    %r8, %r11
        adcq    %r9, %r12
        adcq    $0, %r13

        # From here on every carry is propagated up to rdx.

        # | 0 | 0 | 0 | 0 |a20|a21|a22|a23|a20|a21|a22|a23|(+)
        addq    %r8, %r12
        adcq    %r9, %r13
        adcq    %r8, %r14
        adcq    %r9, %r15
        adcq    $0, %rdx

        # rax = |a12|a13|
        # rsi = |a14|a15|
        # r8  = |a16|a17|
        # r9  = |a18|a19|
        movq    %xmm3, %rax
        pextrq  $1, %xmm3, %rsi
        movq    %xmm4, %r8
        pextrq  $1, %xmm4, %r9

        # |a12|a13|a14|a15|a16|a17|a18|a19| 0 | 0 | 0 | 0 |(+)
        addq    %rax, %r10
        adcq    %rsi, %r11
        adcq    %r8, %r12
        adcq    %r9, %r13
        adcq    $0, %r14
        adcq    $0, %r15
        adcq    $0, %rdx

        # | 0 | 0 | 0 | 0 |a12|a13|a14|a15|a16|a17|a18|a19|(+)
        addq    %rax, %r12
        adcq    %rsi, %r13
        adcq    %r8, %r14
        adcq    %r9, %r15
        adcq    $0, %rdx

        # |a00|a01|a02|a03|a04|a05|a06|a07|a08|a09|a10|a11|(+)
        # movq / pextrq leave the flags alone, so the carry chain is
        # unbroken across the whole group.
        movq    %xmm0, %rax
        addq    %rax, %r10
        pextrq  $1, %xmm0, %rax
        adcq    %rax, %r11
        # ----
        movq    %xmm1, %rax
        adcq    %rax, %r12
        pextrq  $1, %xmm1, %rax
        adcq    %rax, %r13
        # ----
        movq    %xmm2, %rax
        adcq    %rax, %r14
        pextrq  $1, %xmm2, %rax
        adcq    %rax, %r15
        # ----
        adcq    $0, %rdx

        # Rotate the upper half of a[] by one 32-bit word:
        #
        #     |---------------|---------------|---------------|
        #     |     xmm3      |     xmm4      |     xmm5      |
        # |---|---|---|---|---|---|---|---|---|---|---|---|---|
        # |a23|a12|a13|a14|a15|a16|a17|a18|a19|a20|a21|a22|
        # |---|---|---|---|---|---|---|---|---|---|---|---|
        # |     xmm0      |     xmm1      |     xmm2      |
        # |---------------|---------------|---------------|
        pshufd  $0x93, %xmm3, %xmm0
        pshufd  $0x93, %xmm4, %xmm1
        pshufd  $0x93, %xmm5, %xmm2
        pextrd  $3, %xmm3, %r8d
        pextrd  $3, %xmm4, %r9d
        pextrd  $3, %xmm5, %esi
        pinsrd  $0, %r8d, %xmm1
        pinsrd  $0, %r9d, %xmm2
        pinsrd  $0, %esi, %xmm0

        # rax = |a23|a12|
        # rsi = |a13|a14|
        # r8  = |a15|a16|
        # r9  = |a17|a18|
        # rcx = |a19|a20|
        movq    %xmm0, %rax
        pextrq  $1, %xmm0, %rsi
        movq    %xmm1, %r8
        pextrq  $1, %xmm1, %r9
        movq    %xmm2, %rcx

        # | 0 | 0 |a23|a12|a13|a14|a15|a16|a17|a18|a19|a20|(+)
        addq    %rax, %r11
        adcq    %rsi, %r12
        adcq    %r8, %r13
        adcq    %r9, %r14
        adcq    %rcx, %r15
        adcq    $0, %rdx

        # |a23|a12|a13|a14|a15|a16|a17|a18|a19|a20| 0 | 0 |(-)
        # The same ten words were just added two word positions higher,
        # so the accumulator cannot go negative here.
        subq    %rax, %r10
        sbbq    %rsi, %r11
        sbbq    %r8, %r12
        sbbq    %r9, %r13
        sbbq    %rcx, %r14
        sbbq    $0, %r15
        sbbq    $0, %rdx

        # ----
        # p384, least significant limb first:
        #   r8  = 0x00000000ffffffff
        #   r9  = 0xffffffff00000000
        #   rsi = 0xfffffffffffffffe
        #   rax = 0xffffffffffffffff   (limbs 3, 4 and 5)
        # ----
        xorq    %rax, %rax
        notq    %rax
        movq    %rax, %r8
        movq    %rax, %r9
        movq    %rax, %rsi
        shrq    $32, %r8
        shlq    $32, %r9
        decq    %rsi

        # | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |a21|a22|(-)
        # xmm6 still holds |a21|a22| in its low quadword.
        movq    %xmm6, %rcx
        subq    %rcx, %r15
        sbbq    $0, %rdx
        jc      .Lminus                 # went below zero: one +p fixes it

        # Subtract p until the accumulator becomes negative ...
.Lreduce:
        subq    %r8, %r10
        sbbq    %r9, %r11
        sbbq    %rsi, %r12
        sbbq    %rax, %r13
        sbbq    %rax, %r14
        sbbq    %rax, %r15
        sbbq    $0, %rdx
        jnc     .Lreduce

        # ... then add p back once; the low six limbs are the result.
.Lminus:
        addq    %r8, %r10
        adcq    %r9, %r11
        adcq    %rsi, %r12
        adcq    %rax, %r13
        adcq    %rax, %r14
        adcq    %rax, %r15

        # ---- pack the result before r12-r15 are overwritten
        movq    %r10, %xmm6
        movq    %r12, %xmm7
        movq    %r14, %xmm8
        pinsrq  $1, %r11, %xmm6
        pinsrq  $1, %r13, %xmm7
        pinsrq  $1, %r15, %xmm8

        # restore (r12, r13, r14, r15)
        movq    %xmm14, %r12
        movq    %xmm15, %r14
        pextrq  $1, %xmm14, %r13
        pextrq  $1, %xmm15, %r15

        # output (xmm6, xmm7, xmm8)
        movdqu  %xmm6, (%rdi)
        movdqu  %xmm7, 16(%rdi)
        movdqu  %xmm8, 32(%rdi)

        # return 0; no emms is needed because no MMX or x87 register
        # is used anywhere in this routine
        xorq    %rax, %rax
        xorq    %rdx, %rdx
        ret
        .cfi_endproc
.LFE0:
        .size   mp_mod_384, .-mp_mod_384
        .ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-11)"
        .section        .note.GNU-stack,"",@progbits
测试使用时将程序文本保存为mp_mod_384.s,相应C程序头文件为mp_mod_384.h,其全部内容就下面一行
uint64_t mp_mod_384 (uint64_t r[6], uint64_t a[12]);
在C程序中调用此汇编函数时,必须保证输入输出数组r和a的首地址16字节对齐,CPU须支持SSE4.1指令集(pinsrq、pextrq等指令所需)。本函数实现满足可重入要求,无须加锁即可在多线程环境中以最高效能运行。