; SSE2_Load4x8p: load four 16-byte rows starting at [%1] and regroup them
; by 64-bit halves using SSE2_XSawp with the qdq suffix.
;   %1 = base address; rows are read at offsets 0x00, 0x10, 0x20, 0x30
;   %2 = out: low half of row 0 (low qword) and low half of row 2 (high qword)
;   %3 = out: high halves of rows 0 and 2; holds row 3 until the last step
;   %4 = out: low half of row 1 (low qword) and low half of row 3 (high qword)
;   %5 = out: high halves of rows 1 and 3
;   %6 = receives row 2 and is not modified afterwards
; MOVDQ is not defined inside this macro; the note after it says it is
; movdqa, which would require [%1] to be 16-byte aligned - TODO confirm.
%macro SSE2_Load4x8p 6
MOVDQ %2, [%1+0x00]
MOVDQ %4, [%1+0x10]
MOVDQ %6, [%1+0x20]
MOVDQ %3, [%1+0x30]
SSE2_XSawp qdq, %4, %3, %5 ; open question from the author: why are these two steps needed? (presumably preparation for a transpose - TODO confirm)
SSE2_XSawp qdq, %2, %6, %3 ; must come second: it overwrites %3, which the previous line still reads as row 3
%endmacro
notes: MOVDQ defined as movdqa
; for TRANSPOSE
; SSE2_XSawp: produce both the low and the high interleave of two registers.
;   %1 = punpck suffix appended to punpckl/punpckh (e.g. qdq)
;   %2 = in: first source; out: punpckl%1 of (%2, %3)
;   %3 = in: second source; never written
;   %4 = out: punpckh%1 of (original %2, %3)
; %4 is written before %3 is read, so %4 must not be the same register as %3.
%macro SSE2_XSawp 4
movdqa %4, %2 ; save a copy of %2 before the next instruction overwrites it
punpckl%1 %2, %3 ; %2 = low-half interleave of %2 and %3
punpckh%1 %4, %3 ; %4 = high-half interleave of the saved %2 and %3
%endmacro
如:
;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
=> (r4: int16_t *)
movdqa xmm0, [r4 + 0x00]; // mov first 8 x 16 into 128-bit xmm0 => xmm0 = aw7aw6aw5aw4aw3aw2aw1aw0
movdqa xmm4, [r4 + 0x10]; // 2nd 8x16 into xmm4 => xmm4 = bw7bw6bw5bw4bw3bw2bw1bw0
movdqa xmm5, [r4 + 0x20]; // 3rd 8x16 into xmm5 => xmm5 = cw7cw6cw5cw4cw3cw2cw1cw0
movdqa xmm1, [r4 + 0x30]; // 4th 8x16 into xmm1 => xmm1= dw7dw6dw5dw4dw3dw2dw1dw0
SSE2_XSawp qdq, xmm4, xmm1, xmm2 =>
movdqa xmm2, xmm4 => xmm2 = xmm4 = bw7bw6bw5bw4bw3bw2bw1bw0
punpcklqdq xmm4, xmm1 => xmm1 = dw7dw6dw5dw4dw3dw2dw1dw0, xmm4 = dw3dw2dw1dw0bw3bw2bw1bw0
punpckhqdq xmm2, xmm1 => xmm1 = dw7dw6dw5dw4dw3dw2dw1dw0, xmm2 = dw7dw6dw5dw4bw7bw6bw5bw4
SSE2_XSawp qdq, xmm0, xmm5, xmm1 =>
movdqa xmm1, xmm0 => xmm1 = xmm0 = aw7aw6aw5aw4aw3aw2aw1aw0
punpcklqdq xmm0, xmm5 => xmm5 = cw7cw6cw5cw4cw3cw2cw1cw0, xmm0 = cw3cw2cw1cw0aw3aw2aw1aw0
punpckhqdq xmm1, xmm5 => xmm5 = cw7cw6cw5cw4cw3cw2cw1cw0, xmm1 = cw7cw6cw5cw4aw7aw6aw5aw4
; SSE2_Load4x8p: read four 16-byte rows from [%1] and pair them up by
; 64-bit halves (rows 1 and 3 first, then rows 0 and 2).
;   %1 = base address; rows are at offsets 0x00, 0x10, 0x20, 0x30
;   %2 = out: low halves of rows 0 and 2
;   %3 = out: high halves of rows 0 and 2 (temporarily holds row 3)
;   %4 = out: low halves of rows 1 and 3
;   %5 = out: high halves of rows 1 and 3
;   %6 = receives row 2 and is left unchanged
; MOVDQ is defined outside this macro; if it is an aligned move, [%1]
; must be 16-byte aligned - TODO confirm.
%macro SSE2_Load4x8p 6
MOVDQ %2, [%1+0x00]
MOVDQ %4, [%1+0x10]
MOVDQ %6, [%1+0x20]
MOVDQ %3, [%1+0x30]
SSE2_XSawp qdq, %4, %3, %5 ; rows 1 and 3: %4 = low halves, %5 = high halves
SSE2_XSawp qdq, %2, %6, %3 ; rows 0 and 2: %2 = low halves, %3 = high halves (row 3 in %3 is no longer needed)
%endmacro
; for TRANSPOSE
; SSE2_XSawp: unpack two registers into their low and high interleaves.
;   %1 = suffix selecting the punpckl/punpckh variant (e.g. qdq)
;   %2 = in/out: first source, replaced by punpckl%1 of (%2, %3)
;   %3 = in: second source, read only
;   %4 = out: punpckh%1 of (original %2, %3)
; Because %4 is overwritten first, it must be a different register from %3.
%macro SSE2_XSawp 4
movdqa %4, %2 ; copy %2 so the high interleave can still see its original value
punpckl%1 %2, %3 ; low interleave, written in place over %2
punpckh%1 %4, %3 ; high interleave, built from the copy in %4
%endmacro
;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
=> movdqa xmm0, [r4]
movdqa xmm4, [r4 + 16]
movdqa xmm5, [r4 + 32]
movdqa xmm1, [r4 + 48]
; 隔行置换, xmm0 <=> xmm5, xmm4 <=> xmm1
(xmm00, xmm01等分别表示4个16bit的word, 低高位)
xmm00 | xmm01 |
xmm40 | xmm41 |
xmm50 | xmm51 |
xmm10 | xmm11 |
SSE2_XSawp qdq, xmm4, xmm1, xmm2 =>
movdqa xmm2, xmm4
punpcklqdq xmm4, xmm1
punpckhqdq xmm2, xmm1
SSE2_XSawp qdq, xmm0, xmm5, xmm1 =>
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm5
punpckhqdq xmm1, xmm5