        .file   "mp_mod_sm2.c"
        .text
        .p2align 4,,15
        .globl  mp_mod_sm2
        .type   mp_mod_sm2, @function
mp_mod_sm2:
.LFB0:
        .cfi_startproc

        # uint64_t mp_mod_sm2(uint64_t r[4], uint64_t a[8])
        # rdi = r
        # rsi = a

        # ---------------------------
        # backup (r12, r13, r14, r15)
        # ---------------------------
        movq    %r12, %xmm14
        movq    %r14, %xmm15
        pinsrq  $1, %r13, %xmm14
        pinsrq  $1, %r15, %xmm15

        # ------------------------------------
        # load (xmm10 ~ xmm13) = (a[0] ~ a[7])
        # ------------------------------------
        movdqa  (%rsi), %xmm10
        movdqa  16(%rsi), %xmm11
        movdqa  32(%rsi), %xmm12
        movdqa  48(%rsi), %xmm13

        # ---------
        # r15 = a15
        # ---------
        pextrd  $3, %xmm13, %r15d

        # ---------------
        # r14 = a14 + a15
        # ---------------
        pextrd  $2, %xmm13, %r14d
        addq    %r15, %r14

        # ---------------------
        # r13 = a13 + a14 + a15
        # ---------------------
        pextrd  $1, %xmm13, %r13d
        addq    %r14, %r13

        # ---------------------------
        # r12 = a12 + a13 + a14 + a15
        # ---------------------------
        movd    %xmm13, %r12d
        addq    %r13, %r12

        # --------
        # r8 = a08
        # --------
        movd    %xmm12, %r8d

        # --------
        # r9 = a09
        # --------
        pextrd  $1, %xmm12, %r9d

        # ---------
        # r10 = a10
        # ---------
        pextrd  $2, %xmm12, %r10d

        # ---------
        # r11 = a11
        # ---------
        pextrd  $3, %xmm12, %r11d

        # ---------------------------
        # rsi = a08 + a09 + a10 + a11
        # ---------------------------
        movq    %r8, %rsi
        addq    %r9, %rsi
        addq    %r10, %rsi
        addq    %r11, %rsi

        # ---------------------------------------------------------------------------
        # a00 + (a08 + a09 + a10 + a11) + (a12 + a13 + a14 + a15) + (a13 + a14 + a15)
        # ---------------------------------------------------------------------------
        movd    %xmm10, %eax
        addq    %rsi, %rax
        addq    %r12, %rax
        addq    %r13, %rax
        movd    %eax, %xmm0

        # --------------------------------------------------------------------------------
        # up + a01 + (a08 + a09 + a10 + a11) + (a12 + a13 + a14 + a15) + (a14 + a15) - a08
        # --------------------------------------------------------------------------------
        shr     $32, %rax
        pextrd  $1, %xmm10, %edx
        addq    %rdx, %rax
        addq    %rsi, %rax
        addq    %r12, %rax
        addq    %r14, %rax
        subq    %r8, %rax
        pinsrd  $1, %eax, %xmm0

        # -------------------------------------------
        # up + a02 + (2 ^ 34) - a08 - a09 - a13 - a14
        # -------------------------------------------
        shr     $32, %rax
        pextrd  $2, %xmm10, %edx
        addq    %rdx, %rax
        movq    $1, %rdx
        shl     $34, %rdx
        addq    %rdx, %rax
        subq    %r8, %rax
        subq    %r9, %rax
        pextrd  $1, %xmm13, %edx
        subq    %rdx, %rax
        pextrd  $2, %xmm13, %edx
        subq    %rdx, %rax
        pinsrd  $2, %eax, %xmm0

        # -------------------------------------------------------------------
        # up + a03 + (2 ^ 32) + (a12 + a13 + a14 + a15) + a08 + a11 + a13 - 4
        # -------------------------------------------------------------------
        shr     $32, %rax
        pextrd  $3, %xmm10, %edx
        addq    %rdx, %rax
        movq    $1, %rdx
        shl     $32, %rdx
        addq    %rdx, %rax
        addq    %r12, %rax
        addq    %r8, %rax
        addq    %r11, %rax
        pextrd  $1, %xmm13, %edx
        addq    %rdx, %rax
        subq    $4, %rax
        pinsrd  $3, %eax, %xmm0

        # --------------------------------------------------
        # up + a04 + (a12 + a13 + a14 + a15) + a09 + a14 - 1
        # --------------------------------------------------
        shr     $32, %rax
        movd    %xmm11, %edx
        addq    %rdx, %rax
        addq    %r12, %rax
        addq    %r9, %rax
        pextrd  $2, %xmm13, %edx
        addq    %rdx, %rax
        decq    %rax
        movd    %eax, %xmm1

        # ----------------------------------------
        # up + a05 + (a13 + a14 + a15) + a10 + a15
        # ----------------------------------------
        shr     $32, %rax
        pextrd  $1, %xmm11, %edx
        addq    %rdx, %rax
        addq    %r13, %rax
        addq    %r10, %rax
        addq    %r15, %rax
        pinsrd  $1, %eax, %xmm1

        # ----------------------------
        # up + a06 + (a14 + a15) + a11
        # ----------------------------
        shr     $32, %rax
        pextrd  $2, %xmm11, %edx
        addq    %rdx, %rax
        addq    %r14, %rax
        addq    %r11, %rax
        pinsrd  $2, %eax, %xmm1

        # --------------------------------------------------------------------------------------------
        # up + a07 + (a08 + a09 + a10 + a11) + (a12 + a13 + a14 + a15) + (a12 + a13 + a14 + a15) + a15
        # --------------------------------------------------------------------------------------------
        shr     $32, %rax
        pextrd  $3, %xmm11, %edx
        addq    %rdx, %rax
        addq    %rsi, %rax
        addq    %r12, %rax
        addq    %r12, %rax
        addq    %r15, %rax
        pinsrd  $3, %eax, %xmm1

        # -----
        # final
        # -----
        movq    %xmm0, %r12
        movq    %xmm1, %r14
        pextrq  $1, %xmm0, %r13
        pextrq  $1, %xmm1, %r15
        shr     $32, %rax
        movq    %rax, %rdx
        shl     $32, %rdx
        movq    %rdx, %rsi
        subq    %rax, %rsi
        addq    %rax, %r12
        adcq    %rsi, %r13
        adcq    $0, %r14
        adcq    %rdx, %r15
        movq    %r12, %xmm0
        movq    %r14, %xmm1
        pinsrq  $1, %r13, %xmm0
        pinsrq  $1, %r15, %xmm1

        # ----------------------------
        # restore (r12, r13, r14, r15)
        # ----------------------------
        movq    %xmm14, %r12
        movq    %xmm15, %r14
        pextrq  $1, %xmm14, %r13
        pextrq  $1, %xmm15, %r15

        # -------------------
        # output (xmm0, xmm1)
        # -------------------
        movdqa  %xmm0, (%rdi)
        movdqa  %xmm1, 16(%rdi)

        # return
        emms
        xorq    %rax, %rax
        xorq    %rdx, %rdx
        ret
        .cfi_endproc
.LFE0:
        .size   mp_mod_sm2, .-mp_mod_sm2
        .ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-4)"
        .section        .note.GNU-stack,"",@progbits
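For readers who prefer C, the sketch below mirrors what the listing computes. It is an illustrative reference only, not code from the original source: the name mp_mod_sm2_ref, the word-splitting layout, and the use of GCC/Clang's unsigned __int128 are my own assumptions. The per-word sums are copied from the comments above; the 2^34, 2^32 - 4 and -1 constants in words 2..4 add up to zero overall (2^34 * 2^64 + (2^32 - 4) * 2^96 - 2^128 = 0) and exist only to keep the unsigned accumulator non-negative, and the final carry is folded back through 2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p) for the SM2 prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1.

    #include <stdint.h>

    /* Illustrative mirror of the listing above -- not the original source. */
    static void mp_mod_sm2_ref(uint64_t r[4], const uint64_t a[8])
    {
        uint32_t w[16];                   /* 32-bit words a00 .. a15, little endian */
        uint64_t t[8], acc, c;
        int i;

        for (i = 0; i < 8; i++) {
            w[2 * i]     = (uint32_t)a[i];
            w[2 * i + 1] = (uint32_t)(a[i] >> 32);
        }

        /* partial sums that the assembly keeps in rsi and r12..r14 */
        uint64_t s8  = (uint64_t)w[8]  + w[9]  + w[10] + w[11];   /* a08+a09+a10+a11 */
        uint64_t s12 = (uint64_t)w[12] + w[13] + w[14] + w[15];   /* a12+a13+a14+a15 */
        uint64_t s13 = (uint64_t)w[13] + w[14] + w[15];
        uint64_t s14 = (uint64_t)w[14] + w[15];

        /* word-by-word accumulation, exactly as in the comments of the listing */
        acc = w[0] + s8 + s12 + s13;                               t[0] = (uint32_t)acc;
        acc = (acc >> 32) + w[1] + s8 + s12 + s14 - w[8];          t[1] = (uint32_t)acc;
        acc = (acc >> 32) + w[2] + (1ULL << 34)
            - w[8] - w[9] - w[13] - w[14];                         t[2] = (uint32_t)acc;
        acc = (acc >> 32) + w[3] + (1ULL << 32)
            + s12 + w[8] + w[11] + w[13] - 4;                      t[3] = (uint32_t)acc;
        acc = (acc >> 32) + w[4] + s12 + w[9] + w[14] - 1;         t[4] = (uint32_t)acc;
        acc = (acc >> 32) + w[5] + s13 + w[10] + w[15];            t[5] = (uint32_t)acc;
        acc = (acc >> 32) + w[6] + s14 + w[11];                    t[6] = (uint32_t)acc;
        acc = (acc >> 32) + w[7] + s8 + 2 * s12 + w[15];           t[7] = (uint32_t)acc;
        c = acc >> 32;                     /* leftover carry, only a few bits */

        /* Fold c back in with 2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p), as the
         * "final" block does: +c at limb 0, +c*2^32 - c at limb 1, +c*2^32 at
         * limb 3.  As in the listing, the last carry out is dropped and the
         * result is not guaranteed to be < p (see the remark below). */
        unsigned __int128 s;
        s = (unsigned __int128)(t[0] | (t[1] << 32)) + c;              r[0] = (uint64_t)s;
        s = (s >> 64) + (t[2] | (t[3] << 32)) + ((c << 32) - c);       r[1] = (uint64_t)s;
        s = (s >> 64) + (t[4] | (t[5] << 32));                         r[2] = (uint64_t)s;
        s = (s >> 64) + (t[6] | (t[7] << 32)) + (c << 32);             r[3] = (uint64_t)s;
    }

The register mapping follows directly from the listing: s8 lives in rsi, s12/s13/s14 in r12/r13/r14, a15 in r15, and acc is the running rax whose upper 32 bits carry into the next word.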
Strictly speaking, the wrap-up code at the end is not entirely rigorous mathematically, but it would be easy to fix.
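One way to tighten that final step is to keep the carry that the last adcq currently drops, fold it back with the same identity 2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p), and finish with a conditional subtraction of p. The sketch below is only an illustration under my own assumptions (the names sm2_reduce_final and SM2_P are not from the original source, and unsigned __int128 is a GCC/Clang extension); it is not constant time.

    #include <stdint.h>

    /* p = 2^256 - 2^224 - 2^96 + 2^64 - 1, little-endian 64-bit limbs */
    static const uint64_t SM2_P[4] = {
        0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFF00000000ULL,
        0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFEFFFFFFFFULL
    };

    /* r[0..3]: the 256-bit value left in xmm0/xmm1; hi: the carry out of the
     * adc chain that the listing discards.  On return r holds the value mod p. */
    static void sm2_reduce_final(uint64_t r[4], uint64_t hi)
    {
        for (;;) {
            if (hi) {
                /* fold the overflow limb: hi * 2^256 = hi * (2^224 + 2^96 - 2^64 + 1) mod p */
                unsigned __int128 s;
                s = (unsigned __int128)r[0] + hi;              r[0] = (uint64_t)s;
                s = (s >> 64) + r[1] + ((hi << 32) - hi);      r[1] = (uint64_t)s;
                s = (s >> 64) + r[2];                          r[2] = (uint64_t)s;
                s = (s >> 64) + r[3] + (hi << 32);             r[3] = (uint64_t)s;
                hi = (uint64_t)(s >> 64);
                continue;
            }

            /* hi == 0: if r >= p subtract p once more, otherwise we are done */
            int i, ge = 1;
            for (i = 3; i >= 0; i--) {
                if (r[i] != SM2_P[i]) { ge = (r[i] > SM2_P[i]); break; }
            }
            if (!ge)
                return;

            uint64_t borrow = 0;
            for (i = 0; i < 4; i++) {
                uint64_t pi = SM2_P[i];
                uint64_t di = r[i] - pi - borrow;
                borrow = (r[i] < pi) || (r[i] == pi && borrow);
                r[i] = di;
            }
        }
    }

A constant-time variant would replace the comparison and early return with a masked subtraction, but for correctness the loop above is enough: after one fold the overflow limb is at most 1, and after at most one subtraction of p the result is fully reduced.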