Assembly x64 Intro - SSE2 2x4x4W Transpose




; SSE2_TransTwo4x4W: transpose two side-by-side 4x4 blocks of 16-bit words.
;
; Arguments (all XMM registers):
;   %1, %2, %3, %4 : input rows a, b, c, d, eight words each
;                    (words 0-3 belong to the first block, words 4-7 to the second)
;   %5             : scratch on entry; its previous contents are overwritten
;
; Results (written most-significant word first, x<n> = word n of row x):
;   %1 = d4 c4 b4 a4 | d0 c0 b0 a0
;   %2 = d5 c5 b5 a5 | d1 c1 b1 a1
;   %4 = d6 c6 b6 a6 | d2 c2 b2 a2
;   %5 = d7 c7 b7 a7 | d3 c3 b3 a3
; %3 is clobbered and is left holding an intermediate value, not a result.
;
; Each SSE2_XSawp step leaves the low-half interleave in its 2nd argument
; and the high-half interleave in its 4th argument.
%macro SSE2_TransTwo4x4W 5
    ; word interleave of rows a/b: low half -> %1, high half -> %5
    SSE2_XSawp wd,  %1, %2, %5
    ; word interleave of rows c/d: low half -> %3, high half -> %2
    SSE2_XSawp wd,  %3, %4, %2
    ; dword interleave of the two low halves: columns 0,1 -> %1, columns 2,3 -> %4
    SSE2_XSawp dq,  %1, %3, %4
    ; dword interleave of the two high halves: columns 4,5 -> %5, columns 6,7 -> %3
    SSE2_XSawp dq,  %5, %2, %3
    ; qword interleave: columns 0 and 4 -> %1, columns 1 and 5 -> %2
    SSE2_XSawp qdq, %1, %5, %2
    ; qword interleave: columns 2 and 6 -> %4, columns 3 and 7 -> %5
    SSE2_XSawp qdq, %4, %3, %5
%endmacro


; SSE2_XSawp: one interleave step, used as the building block of the transpose.
;
; Arguments:
;   %1 : suffix appended to punpckl/punpckh to select the element size
;        (wd, dq and qdq are the values used by SSE2_TransTwo4x4W)
;   %2 : first source register; on exit holds the low-half interleave of %2 and %3
;   %3 : second source register; not written by this macro
;   %4 : destination register, overwritten; on exit holds the high-half
;        interleave of the original %2 and %3
%macro SSE2_XSawp 4
    ; save a copy of %2, because punpckl below overwrites it
    movdqa      %4, %2
    ; %2 = elements from the low halves of %2 and %3, interleaved
    punpckl%1   %2, %3
    ; %4 = elements from the high halves of the saved %2 and %3, interleaved
    punpckh%1   %4, %3
%endmacro

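Each register holds one row of eight 16-bit words, written most-significant word first (word 0 is the rightmost):

xmm2 = aw7aw6aw5aw4aw3aw2aw1aw0,  xmm0 = bw7bw6bw5bw4bw3bw2bw1bw0
xmm3 = cw7cw6cw5cw4cw3cw2cw1cw0,  xmm4 = dw7dw6dw5dw4dw3dw2dw1dw0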

SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1

展开如下:

SSE2_XSawp wd, xmm2, xmm0, xmm1 =>
         movdqa        xmm1, xmm2     => xmm1 = xmm2 = aw7.....aw0 
         punpcklwd    xmm2, xmm0    => xmm2 = bw3aw3bw2aw2bw1aw1bw0aw0
         punpckhwd   xmm1, xmm0    => xmm1 = bw7aw7bw6aw6bw5aw5bw4aw4
SSE2_XSawp wd, xmm3, xmm4, xmm0 =>
         movdqa        xmm0, xmm3    => xmm0 = xmm3 = cw7......cw0
         punpcklwd   xmm3, xmm4    =>  xmm3 = dw3cw3dw2cw2dw1cw1dw0cw0
         punpckhwd  xmm0, xmm4    =>  xmm0 = dw7cw7dw6cw6dw5cw5dw4cw4
SSE2_XSawp dq, xmm2, xmm3, xmm4 =>
         movdqa        xmm4, xmm2   =>  xmm4 = xmm2 = bw3aw3bw2aw2bw1aw1bw0aw0
         punpckldq     xmm2, xmm3  =>  xmm2 = dw1cw1bw1aw1dw0cw0bw0aw0
         punpckhdq    xmm4, xmm3  =>  xmm4 = dw3cw3bw3aw3dw2cw2bw2aw2
SSE2_XSawp dq, xmm1, xmm0, xmm3 =>
         movdqa        xmm3, xmm1   => xmm3 = xmm1 = bw7aw7bw6aw6bw5aw5bw4aw4
         punpckldq    xmm1, xmm0   =>  xmm1 = dw5cw5bw5aw5dw4cw4bw4aw4
         punpckhdq   xmm3, xmm0   =>  xmm3 = dw7cw7bw7aw7dw6cw6bw6aw6
SSE2_XSawp qdq, xmm2, xmm1, xmm0 =>
         movdqa        xmm0, xmm2   =>  xmm0 = xmm2 = dw1cw1bw1aw1dw0cw0bw0aw0
         punpcklqdq  xmm2, xmm1   =>  xmm2 = dw4cw4bw4aw4dw0cw0bw0aw0
         punpckhqdq xmm0, xmm1   =>  xmm0 = dw5cw5bw5aw5dw1cw1bw1aw1
SSE2_XSawp qdq, xmm4, xmm3, xmm1 =>
         movdqa        xmm1, xmm4   => xmm1 = xmm4 = dw3cw3bw3aw3dw2cw2bw2aw2
         punpcklqdq  xmm4, xmm3   => xmm4 = dw6cw6bw6aw6dw2cw2bw2aw2
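         punpckhqdq xmm1, xmm3   => xmm1 = dw7cw7bw7aw7dw3cw3bw3aw3

Result: the transposed data ends up in xmm2, xmm0, xmm4 and xmm1 (macro arguments %1, %2, %4 and %5), holding columns 0/4, 1/5, 2/6 and 3/7 respectively; xmm3 (%3) is left holding an intermediate value.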








你可能感兴趣的:(Assembly x64 Intro - SSE2 2x4x4W Transpose)