; in: xmm1, xmm2, xmm3, xmm4, xmm5 pOut: xmm1, xmm4, xmm5, mm3
%macro SSE2_Trans4x4D 5
SSE2_XSawp dq, %1, %2, %5
SSE2_XSawp dq, %3, %4, %2
SSE2_XSawp qdq, %1, %3, %4
SSE2_XSawp qdq, %5, %2, %3
%endmacro
;for TRANSPOSE
%macro SSE2_XSawp 4
movdqa %4, %2
punpckl%1 %2, %3
punpckh%1 %4, %3
%endmacro
SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
类似 MMX_Trans4x4W, MMX_Trans4x4W操作的是16bit的字, 而SSE2_Trans4x4D 操作的是双字。