/*
 * Runs WelsDctT4_c on the four 4x4 sub-blocks of an 8x8 region.
 *
 * Sub-blocks are visited left-to-right, top-to-bottom; the k-th sub-block
 * writes its output starting at pDct + 16 * k.
 *
 *   pDct     - output buffer; receives 4 * 16 int16_t values
 *   pPixel1  - top-left pixel of the first 8x8 region
 *   iStride1 - distance between consecutive rows of pPixel1
 *   pPixel2  - top-left pixel of the second 8x8 region
 *   iStride2 - distance between consecutive rows of pPixel2
 *
 * NOTE(review): WelsDctT4_c is defined elsewhere; presumably it transforms
 * the 4x4 difference pPixel1 - pPixel2 -- confirm against its definition.
 */
void WelsDctFourT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2)
{
  /* Offset of the lower pair of sub-blocks: four pixel rows down. */
  const int32_t kiRowOff1 = iStride1 << 2;
  const int32_t kiRowOff2 = iStride2 << 2;
  /* Per-sub-block pixel offsets: top-left, top-right, bottom-left, bottom-right. */
  const int32_t kiOff1[4] = { 0, 4, kiRowOff1, kiRowOff1 + 4 };
  const int32_t kiOff2[4] = { 0, 4, kiRowOff2, kiRowOff2 + 4 };

  for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
    WelsDctT4_c (pDct + 16 * iBlk, pPixel1 + kiOff1[iBlk], iStride1, pPixel2 + kiOff2[iBlk], iStride2);
  }
}
;***********************************************************************
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;
; Computes the difference pix1 - pix2 for 8 rows of 8 pixels, applies the
; SSE2_DCT butterfly as a column pass and then (via transposes) as a row
; pass, and stores the result through pDct. The work is done in two passes
; of 4 rows x 8 pixels each; the second pass writes 64 bytes further on.
;
; NOTE(review): after LOAD_5_PARA the arguments are presumably available as
; r0 = pDct, r1 = pix1, r2 = i_pix1, r3 = pix2, r4 = i_pix2 -- confirm
; against the macro definition (not visible here).
;***********************************************************************
WELS_EXTERN WelsDctFourT4_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
; The strides are 32-bit ints; SIGN_EXTENSION (defined elsewhere) presumably
; widens them to full register width so they can be used in addressing.
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
; xmm7 = 0, used as the zero operand when unpacking bytes to words.
pxor xmm7, xmm7
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
; Arguments: xmm0, xmm6, xmm7, [r1], [r3]
; xmm7 = 0, r1 = pix1, r3 = pix2, xmm0 receives the result, xmm6 is scratch
;%macro SSE2_LoadDiff8P 5
; movq %1, %4 => movq xmm0, [r1]
; punpcklbw %1, %3 => punpcklbw xmm0, xmm7 => widened to 8 words (16 bits each)
; movq %2, %5 => movq xmm6, [r3]
; punpcklbw %2, %3 => punpcklbw xmm6, xmm7 => widened to 8 words (16 bits each)
; psubw %1, %2 => xmm0 = xmm0 - xmm6 = [r1] - [r3]
;%endmacro
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
; Likewise, xmm1 = [r1 + r2] - [r3 + r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
; Likewise, xmm2 = [r1 + 2*r2] - [r3 + 2*r4] (relative to the original r1/r3)
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
; Likewise, xmm3 = [r1 + 3*r2] - [r3 + 3*r4] (relative to the original r1/r3)
;
; Perform the column DCT, Y = Cf4 * X
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
; %macro SSE2_DCT 6
; SSE2_SumSub %6, %3, %5
; SSE2_SumSub %1, %2, %5
; SSE2_SumSub %3, %2, %5
; SSE2_SumSubMul2 %6, %1, %4
; %endmacro
;; SSE2_SumSub: %2 = %1 + %2, %1 = %1 - %2 (%3 is scratch)
;%macro SSE2_SumSub 3
; movdqa %3, %2
; paddw %2, %1
; psubw %1, %3
;%endmacro
;; SSE2_SumSubMul2: %1 = 2 * %1 + %2, %3 = %1 - 2 * %2
;%macro SSE2_SumSubMul2 3
; movdqa %3, %1
; paddw %1, %1
; paddw %1, %2
; psubw %3, %2
; psubw %3, %2
;%endmacro
;
; Expansion of the call above, with a = xmm0, b = xmm1, c = xmm2, d = xmm3
; (the four loaded difference rows):
; => SSE2_SumSub xmm0, xmm3, xmm5
; => movdqa xmm5, xmm3 => xmm5 = xmm3 = d
; => paddw xmm3, xmm0 => xmm3 = a + d
; => psubw xmm0, xmm5 => xmm0 = a - d
; => SSE2_SumSub xmm1, xmm2, xmm5
; => movdqa xmm5, xmm2 => xmm5 = xmm2 = c
; => paddw xmm2, xmm1 => xmm2 = b + c
; => psubw xmm1, xmm5 => xmm1 = b - c
; => SSE2_SumSub xmm3, xmm2, xmm5
; => movdqa xmm5, xmm2 => xmm5 = xmm2 = b + c
; => paddw xmm2, xmm3 => xmm2 = a + b + c + d = NEWa
; => psubw xmm3, xmm5 => xmm3 = a - b - c + d = NEWc
; => SSE2_SumSubMul2 xmm0, xmm1, xmm4
; => movdqa xmm4, xmm0 => xmm4 = xmm0 = a - d
; => paddw xmm0, xmm0 => xmm0 = 2(a - d)
; => paddw xmm0, xmm1 => xmm0 = 2a + b - c - 2d = NEWb
; => psubw xmm4, xmm1 => xmm4 = a - d - b + c
; => psubw xmm4, xmm1 => xmm4 = a - 2b + 2c - d = NEWd
; So the result is: xmm2 = NEWa, xmm0 = NEWb, xmm3 = NEWc, xmm4 = NEWd
;
; Transpose: Y1 = T(Y)
; NOTE(review): SSE2_TransTwo4x4W is defined elsewhere; judging by its name it
; transposes two 4x4 word blocks, with the last argument as scratch/output --
; confirm against the macro definition.
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
; Together with the transpose above, applying the column DCT once more
; carries out the row DCT: Z1 = Cf4 * Y1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
; Transpose again: Z = T(Z1) = T(Y1) * T(Cf4) = Y * T(Cf4); this shows that
; what is ultimately performed here is the row DCT.
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
; Store4x8
; NOTE(review): SSE2_Store4x8p is defined elsewhere; presumably it writes the
; four result registers to [r0] with xmm5 as scratch -- the exact output
; layout is not visible here.
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
; r1/r3 already point at row 2; advance two more rows to reach row 4.
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
lea r1, [r1 + 2 * r2]
lea r3, [r3 + 2 * r4]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
; Same column pass / transpose / row pass / transpose sequence as above,
; now for rows 4..7.
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
; Advance the output pointer by 64 bytes (32 int16_t values) past the
; results of the first pass.
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
POP_XMM
LOAD_5_PARA_POP
ret