;-----------------------------------------------------------------------
; SSE2_Store4x8p  dst, a, b, c, d, tmp
;
; Pairs up the 64-bit halves of (a, b) and of (c, d) with
; punpcklqdq/punpckhqdq and writes the four resulting 128-bit values to
; the 64 bytes starting at [dst].
;
;   %1  destination address expression (used as [%1 + offset])
;   %2  in: a    out: low qword of a  | low qword of b
;   %3  in: b    out: high qword of c | high qword of d  (reused as scratch)
;   %4  in: c    out: low qword of c  | low qword of d
;   %5  in: d    only read
;   %6  scratch  out: high qword of a | high qword of b
;
; Resulting memory layout:
;   [%1 +  0]  lo(a), lo(b)
;   [%1 + 16]  lo(c), lo(d)
;   [%1 + 32]  hi(a), hi(b)
;   [%1 + 48]  hi(c), hi(d)
;
; MOVDQ is defined outside this macro; whether the destination must be
; 16-byte aligned depends on that definition -- confirm before relying
; on either behaviour.
;-----------------------------------------------------------------------
%macro SSE2_Store4x8p 6
    ; The first swap must come first: it reads b before the second swap
    ; overwrites that register as its scratch output.
    SSE2_XSawp  qdq, %2, %3, %6
    SSE2_XSawp  qdq, %4, %5, %3

    MOVDQ       [%1 +  0], %2
    MOVDQ       [%1 + 16], %4
    MOVDQ       [%1 + 32], %6
    MOVDQ       [%1 + 48], %3
%endmacro
;-----------------------------------------------------------------------
; SSE2_XSawp  suffix, a, b, hi        (helper for TRANSPOSE)
;
; Interleaves the low-order and the high-order elements of two XMM
; registers, keeping both results.
;
;   %1  element-size suffix pasted onto punpckl/punpckh to form the
;       mnemonic (e.g. qdq gives punpcklqdq / punpckhqdq)
;   %2  in: a    out: punpckl<suffix> of a with b
;   %3  in: b    only read
;   %4  out: punpckh<suffix> of the original a with b; its previous
;       contents are overwritten
;
; The instruction order is significant: the copy must be taken before
; the low unpack destroys a.
;-----------------------------------------------------------------------
%macro SSE2_XSawp 4
movdqa %4, %2           ; save a copy of a for the high-half unpack
punpckl%1 %2, %3        ; a  <- low elements of a interleaved with b
punpckh%1 %4, %3        ; hi <- high elements of original a interleaved with b
%endmacro
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5 =>
SSE2_XSawp qdq, xmm4, xmm2, xmm5 =>
movdqa xmm5, xmm4
punpcklqdq xmm4, xmm2 =>(xmm4 = low 4 words of xmm4 combined with low 4 words of xmm2)
punpckhqdq xmm5, xmm2 =>(xmm5 = high 4 words of the original xmm4 combined with high 4 words of xmm2)
SSE2_XSawp qdq, xmm3, xmm0, xmm2 =>
movdqa xmm2, xmm3
punpcklqdq xmm3, xmm0 =>(xmm3 = low 4 words of xmm3 combined with low 4 words of xmm0)
punpckhqdq xmm2, xmm0 =>(xmm2 = high 4 words of the original xmm3 combined with high 4 words of xmm0)
movdqa [r0], xmm4
movdqa [r0 + 16], xmm3
movdqa [r0 + 32], xmm5
movdqa [r0 + 48], xmm2