;-----------------------------------------------------------------------
; SSE2_TransTwo4x4W r0, r1, r2, r3, tmp
; Transposes two side-by-side 4x4 blocks of 16-bit words held in four
; 128-bit SSE registers.
;
; In:    %1..%4 = rows a, b, c, d; each row is eight words, w0 in the
;                 lowest lane
;        %5     = scratch; its incoming value is overwritten before it is
;                 ever read
; Out (lanes listed low -> high):
;        %1 = a0 b0 c0 d0 | a4 b4 c4 d4
;        %2 = a1 b1 c1 d1 | a5 b5 c5 d5
;        %4 = a2 b2 c2 d2 | a6 b6 c6 d6
;        %5 = a3 b3 c3 d3 | a7 b7 c7 d7
; Clobb: %3 (left holding the intermediate a6 b6 c6 d6 | a7 b7 c7 d7)
;
; Example: "SSE2_TransTwo4x4W xmm0, xmm1, xmm2, xmm3, xmm4" gives
;   in: xmm0..xmm3 (xmm4 scratch)    out: xmm0, xmm1, xmm3, xmm4
;
; Each SSE2_XSawp call is (suffix, lo-dest/source, source, hi-dest).
; The six steps must stay in this order: every step after the first
; uses, as its hi-dest temporary, a register that the previous step has
; just finished reading.
;-----------------------------------------------------------------------
%macro SSE2_TransTwo4x4W 5
; Pass 1 - interleave words of row pairs:
;   (a,b) -> low halves in %1, high halves in %5
;   (c,d) -> low halves in %3, high halves in %2 (%2 is free after the first step)
SSE2_XSawp wd, %1, %2, %5
SSE2_XSawp wd, %3, %4, %2
; Pass 2 - interleave dwords, so each 64-bit half becomes one column a..d:
;   %1 = columns 0|1, %4 = columns 2|3, %5 = columns 4|5, %3 = columns 6|7
SSE2_XSawp dq, %1, %3, %4
SSE2_XSawp dq, %5, %2, %3
; Pass 3 - interleave qwords to pair column n with column n+4:
;   %1 = columns 0|4, %2 = columns 1|5, %4 = columns 2|6, %5 = columns 3|7
SSE2_XSawp qdq, %1, %5, %2
SSE2_XSawp qdq, %4, %3, %5
%endmacro
;-----------------------------------------------------------------------
; SSE2_XSawp sfx, lo, src, hi      (building block for the transposes)
; Interleaves the elements of two registers, producing both halves.
;
; In:    %1 = punpck element-size suffix appended to punpckl/punpckh
;             (the transpose macro passes wd, dq and qdq)
;        %2 = first source; also receives the low-half result
;        %3 = second source; only read, never written
;        %4 = receives the high-half result; previous contents are lost
; Out:   %2 = punpckl<sfx>(old %2, %3)
;        %4 = punpckh<sfx>(old %2, %3)
; Note:  %4 must differ from %2 and %3, because it is written first.
;-----------------------------------------------------------------------
%macro SSE2_XSawp 4
; Save %2 before punpckl overwrites it; the copy feeds the high-half unpack.
movdqa %4, %2
punpckl%1 %2, %3
punpckh%1 %4, %3
%endmacro
xmm2 = aw7aw6aw5aw4aw3aw2aw1aw0, xmm0 = bw7bw6bw5bw4bw3bw2bw1bw0
xmm3 = cw7cw6cw5cw4cw3cw2cw1cw0, xmm4 = dw7dw6dw5dw4dw3dw2dw1dw0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
Expands as follows:
SSE2_XSawp wd, xmm2, xmm0, xmm1 =>
movdqa xmm1, xmm2 => xmm1 = xmm2 = aw7.....aw0
punpcklwd xmm2, xmm0 => xmm2 = bw3aw3bw2aw2bw1aw1bw0aw0
punpckhwd xmm1, xmm0 => xmm1 = bw7aw7bw6aw6bw5aw5bw4aw4
SSE2_XSawp wd, xmm3, xmm4, xmm0 =>
movdqa xmm0, xmm3 => xmm0 = xmm3 = cw7......cw0
punpcklwd xmm3, xmm4 => xmm3 = dw3cw3dw2cw2dw1cw1dw0cw0
punpckhwd xmm0, xmm4 => xmm0 = dw7cw7dw6cw6dw5cw5dw4cw4
SSE2_XSawp dq, xmm2, xmm3, xmm4 =>
movdqa xmm4, xmm2 => xmm4 = xmm2 = bw3aw3bw2aw2bw1aw1bw0aw0
punpckldq xmm2, xmm3 => xmm2 = dw1cw1bw1aw1dw0cw0bw0aw0
punpckhdq xmm4, xmm3 => xmm4 = dw3cw3bw3aw3dw2cw2bw2aw2
SSE2_XSawp dq, xmm1, xmm0, xmm3 =>
movdqa xmm3, xmm1 => xmm3 = xmm1 = bw7aw7bw6aw6bw5aw5bw4aw4
punpckldq xmm1, xmm0 => xmm1 = dw5cw5bw5aw5dw4cw4bw4aw4
punpckhdq xmm3, xmm0 => xmm3 = dw7cw7bw7aw7dw6cw6bw6aw6
SSE2_XSawp qdq, xmm2, xmm1, xmm0 =>
movdqa xmm0, xmm2 => xmm0 = xmm2 = dw1cw1bw1aw1dw0cw0bw0aw0
punpcklqdq xmm2, xmm1 => xmm2 = dw4cw4bw4aw4dw0cw0bw0aw0
punpckhqdq xmm0, xmm1 => xmm0 = dw5cw5bw5aw5dw1cw1bw1aw1
SSE2_XSawp qdq, xmm4, xmm3, xmm1 =>
movdqa xmm1, xmm4 => xmm1 = xmm4 = dw3cw3bw3aw3dw2cw2bw2aw2
punpcklqdq xmm4, xmm3 => xmm4 = dw6cw6bw6aw6dw2cw2bw2aw2
punpckhqdq xmm1, xmm3 => xmm1 = dw7cw7bw7aw7dw3cw3bw3aw3