Assembly x64 Intro - SSE2 Diff 8 Load




;***********************************************************************
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;
; Computes the residual (pix1 - pix2) over 8 rows of 8 pixels and applies
; the 4x4 forward transform  Z = Cf4 * X * T(Cf4)  to it, where
;
;         | 1  1  1  1 |
;   Cf4 = | 2  1 -1 -2 |      (see the worked expansion of SSE2_DCT below)
;         | 1 -1 -1  1 |
;         | 1 -2  2 -1 |
;
; The work is done in two passes of 4 rows x 8 pixels each; every XMM
; register holds one row of eight 16-bit differences, i.e. one row from
; each of two side-by-side 4x4 blocks.
;
; Registers as used in this body:
;   r0      = output pointer (advanced by 64 bytes before the second store)
;   r1, r3  = the two source row pointers (pix1 / pix2)
;   r2, r4  = their strides, sign-extended from 32 bits
;   xmm7    = zero, used to widen bytes to words
;   xmm6    = scratch for SSE2_LoadDiff8P; xmm4/xmm5 = scratch for SSE2_DCT
; NOTE(review): r0..r4 are set up by LOAD_5_PARA, which is defined
; elsewhere; the mapping to the C arguments above is assumed to follow
; argument order -- confirm against the macro.
;***********************************************************************
WELS_EXTERN WelsDctFourT4_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r4, r4d
    pxor    xmm7, xmm7
    ;Load 4x8
    SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
; Arguments: xmm0 receives the result, xmm6 is scratch, xmm7 = 0,
; [r1] is the pix1 row and [r3] is the pix2 row.
;%macro SSE2_LoadDiff8P 5
;    movq         %1, %4 => movq      xmm0, [r1]
;    punpcklbw    %1, %3 => punpcklbw xmm0, xmm7 => widen to 8 words (16 bits each)
;    movq         %2, %5 => movq      xmm6, [r3]
;    punpcklbw    %2, %3 => punpcklbw xmm6, xmm7 => widen to 8 words (16 bits each)
;    psubw        %1, %2 => xmm0 = xmm0 - xmm6 = [r1] - [r3]
;%endmacro

    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
; Likewise, xmm1 = [r1 + r2] - [r3 + r4]
    lea     r1, [r1 + 2 * r2]
    lea     r3, [r3 + 2 * r4]
    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
; Likewise, xmm2 = [r1 + 2*r2] - [r3 + 2*r4]   (r1/r3 as on entry)
    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
; Likewise, xmm3 = [r1 + 3*r2] - [r3 + 3*r4]   (r1/r3 as on entry)

;

;   Column transform: Y = Cf4 * X

    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0

; %macro SSE2_DCT 6
;    SSE2_SumSub     %6, %3, %5
;    SSE2_SumSub     %1, %2, %5
;    SSE2_SumSub     %3, %2, %5
;    SSE2_SumSubMul2     %6, %1, %4
; %endmacro

;; m2 = m1 + m2, m1 = m1 - m2   (third argument is a temporary)
;%macro SSE2_SumSub 3
;    movdqa  %3, %2
;    paddw   %2, %1
;    psubw   %1, %3
;%endmacro


;; m1 = 2*m1 + m2, m3 = m1 - 2*m2   (m1, m2 are the values on entry)
;%macro SSE2_SumSubMul2 3
;    movdqa  %3, %1
;    paddw   %1, %1
;    paddw   %1, %2
;    psubw   %3, %2
;    psubw   %3, %2
;%endmacro


; Worked expansion of the SSE2_DCT call above, with the four input rows
; named a = xmm0, b = xmm1, c = xmm2, d = xmm3:
;
; => SSE2_SumSub   xmm0, xmm3, xmm5
;
;     => movdqa  xmm5, xmm3    => xmm5 = xmm3 = d
;
;     => paddw   xmm3, xmm0    => xmm3 = a + d
;
;     => psubw   xmm0, xmm5    => xmm0 = a - d
;
; => SSE2_SumSub   xmm1, xmm2, xmm5
;
;     => movdqa  xmm5, xmm2    => xmm5 = xmm2 = c
;
;     => paddw   xmm2, xmm1    => xmm2 = b + c
;
;     => psubw   xmm1, xmm5    => xmm1 = b - c
;
; => SSE2_SumSub   xmm3, xmm2, xmm5
;
;     => movdqa  xmm5, xmm2    => xmm5 = xmm2 = b + c
;
;     => paddw   xmm2, xmm3    => xmm2 = a + b + c + d = NEWa
;
;     => psubw   xmm3, xmm5    => xmm3 = a - b - c + d = NEWc
;
; => SSE2_SumSubMul2  xmm0, xmm1, xmm4
;
;     => movdqa  xmm4, xmm0    => xmm4 = xmm0 = a - d
;
;     => paddw   xmm0, xmm0    => xmm0 = 2(a - d)
;
;     => paddw   xmm0, xmm1    => xmm0 = 2a + b - c - 2d = NEWb
;
;     => psubw   xmm4, xmm1    => xmm4 = a - d - b + c
;
;     => psubw   xmm4, xmm1    => xmm4 = a - 2b + 2c - d = NEWd
;
; So the result is: xmm2 = NEWa, xmm0 = NEWb, xmm3 = NEWc, xmm4 = NEWd

;

;   Transpose: Y1 = T(Y)   (xmm1 is passed as the fifth, extra register)
    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1

; Together with the transpose above, running the column transform a second
; time performs the row transform: Z1 = Cf4 * Y1.
; By the same expansion as above the outputs of this call are
; xmm4 = NEWa, xmm2 = NEWb, xmm1 = NEWc, xmm3 = NEWd.
    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2

; Transpose back: Z = T(Z1) = T(Y1) * T(Cf4) = Y * T(Cf4), which shows that
; the net effect of this second stage is the row transform.
    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0

 ; Store the transformed 4x8 tile through r0.
 ; NOTE(review): SSE2_Store4x8p is defined elsewhere; the exact output layout
 ; is not visible here. The later "lea r0, [r0+64]" suggests it writes
 ; 64 bytes (4 rows x 8 int16) -- confirm against the macro.

    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5

    ; r1/r3 currently point at row 2; step two more rows to reach row 4.
    lea     r1, [r1 + 2 * r2]
    lea     r3, [r3 + 2 * r4]

    ;Load 4x8 (rows 4..7), same sequence as the first pass
    SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2  ], [r3+r4]
    lea     r1, [r1 + 2 * r2]
    lea     r3, [r3 + 2 * r4]
    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]

    ; Column transform, transpose, row transform, transpose back.
    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0

    ; Second tile goes 64 bytes (32 int16 values) after the first.
    lea     r0, [r0+64]
    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5

    ; Undo PUSH_XMM / LOAD_5_PARA in reverse order before returning.
    POP_XMM
    LOAD_5_PARA_POP
    ret




你可能感兴趣的:(Assembly x64 Intro - SSE2 Diff 8 Load)