void WelsIDctFourT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct)
{
int32_t iDstStridex4 = iStride << 2;
int32_t iPredStridex4 = iPredStride << 2;
WelsIDctT4Rec_c (pRec, iStride, pPred, iPredStride, pDct);
WelsIDctT4Rec_c (&pRec[4], iStride, &pPred[4], iPredStride, pDct + 16);
WelsIDctT4Rec_c (&pRec[iDstStridex4 ], iStride, &pPred[iPredStridex4 ], iPredStride, pDct + 32);
WelsIDctT4Rec_c (&pRec[iDstStridex4 + 4], iStride, &pPred[iPredStridex4 + 4], iPredStride, pDct + 48);
}
;***********************************************************************
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
;Load 4x8
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
add r4, 64
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
POP_XMM
LOAD_5_PARA_POP
; pop esi
; pop ebx
ret