/*
 * DC pick-up followed by a 4x4 Hadamard-style butterfly.
 *
 * Reads the sixteen values pDct[16 * k], k = 0..15 (per the original note
 * these are the DC values; presumably one per 4x4 block of a macroblock --
 * confirm against the caller), transforms them in two butterfly passes and
 * writes sixteen results to pLumaDc[0..15].
 *
 * pLumaDc  destination, 16 entries are written.
 * pDct     source coefficients, read-only here; indices up to 240 are read.
 *
 * Each result is rounded with (x + 1) >> 1 and then passed through
 * WELS_CLIP3 with the bounds -32768 and 32767 (macro defined elsewhere;
 * presumably a clamp to the int16_t range).
 */
void WelsHadamardT4Dc_c (int16_t* pLumaDc, int16_t* pDct)
{
  int32_t iStage[16];
  int32_t iGroup, iCol;

  /* Pass 1: one butterfly per group of four source values.
   * Group base offsets into pDct are 0, 32, 128 and 160. */
  for (iGroup = 0; iGroup < 4; iGroup++) {
    const int16_t* pSrc = pDct + ((iGroup & 2) << 6) + ((iGroup & 1) << 5);
    int32_t* pRow = &iStage[iGroup << 2];
    const int32_t iOuterSum  = pSrc[0]  + pSrc[80];
    const int32_t iOuterDiff = pSrc[0]  - pSrc[80];
    const int32_t iInnerSum  = pSrc[16] + pSrc[64];
    const int32_t iInnerDiff = pSrc[16] - pSrc[64];

    pRow[0] = iOuterSum  + iInnerSum;
    pRow[1] = iOuterDiff + iInnerDiff;
    pRow[2] = iOuterSum  - iInnerSum;
    pRow[3] = iOuterDiff - iInnerDiff;
  }

  /* Pass 2: same butterfly across the groups (stride 4 in iStage),
   * then round, halve and clip into the output. */
  for (iCol = 0; iCol < 4; iCol++) {
    const int32_t iOuterSum  = iStage[iCol]     + iStage[iCol + 12];
    const int32_t iOuterDiff = iStage[iCol]     - iStage[iCol + 12];
    const int32_t iInnerSum  = iStage[iCol + 4] + iStage[iCol + 8];
    const int32_t iInnerDiff = iStage[iCol + 4] - iStage[iCol + 8];

    pLumaDc[iCol]      = WELS_CLIP3 ((iOuterSum  + iInnerSum  + 1) >> 1, -32768, 32767);
    pLumaDc[iCol + 4]  = WELS_CLIP3 ((iOuterDiff + iInnerDiff + 1) >> 1, -32768, 32767);
    pLumaDc[iCol + 8]  = WELS_CLIP3 ((iOuterSum  - iInnerSum  + 1) >> 1, -32768, 32767);
    pLumaDc[iCol + 12] = WELS_CLIP3 ((iOuterDiff - iInnerDiff + 1) >> 1, -32768, 32767);
  }
}
;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;
; SSE2 routine; presumably the SIMD counterpart of WelsHadamardT4Dc_c
; (same name stem and same argument list -- confirm against the caller).
;
; Reads from four regions of pDct, runs sum/difference butterflies on
; 32-bit lanes, applies a rounding halving step and stores 32 bytes
; (sixteen 16-bit values) to luma_dc.
;
; The output is written with movdqa, so luma_dc must be 16-byte aligned.
; NOTE(review): the macros used below are defined elsewhere; the per-macro
; notes describe what their names and the existing pOut comments suggest
; and should be checked against the macro definitions.
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
%assign push_num 0
; presumably places the two arguments in r0 (luma_dc) and r1 (pDct)
LOAD_2_PARA
; presumably preserves callee-saved xmm registers where the ABI needs it
PUSH_XMM 8
; Four loads at byte offsets 0, 0x40, 0x100 and 0x140 from r1. With 16-bit
; elements these are element offsets 0, 32, 128 and 160, the same group
; bases that WelsHadamardT4Dc_c uses for iIdx.
; xmm5, xmm6 and xmm0 appear to be scratch registers for the macro.
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
; First butterfly pass across the four loaded vectors (xmm7 as scratch).
SSE2_SumSubD xmm1, xmm2, xmm7
SSE2_SumSubD xmm3, xmm4, xmm7
SSE2_SumSubD xmm2, xmm4, xmm7
SSE2_SumSubD xmm1, xmm3, xmm7
; Transpose so the second pass can run along the other dimension.
SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
; Second butterfly pass, first half.
SSE2_SumSubD xmm4, xmm3, xmm7
SSE2_SumSubD xmm5, xmm1, xmm7
; presumably fills xmm6 with dword 1s, used as the rounding term below
WELS_DD1 xmm6
; Second butterfly pass, second half, with the (x + 1) / 2 rounding step.
SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
; Transpose back before narrowing to the output layout.
SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
; packssdw narrows signed dwords to signed words with saturation, i.e.
; results are limited to [-32768, 32767]; each pack merges two 4-dword
; vectors into one 8-word vector.
packssdw xmm3, xmm4
packssdw xmm2, xmm1
; Aligned 16-byte stores: output words 0..7, then words 8..15.
movdqa [r0+ 0], xmm3
movdqa [r0+16], xmm2
; presumably restores whatever PUSH_XMM saved
POP_XMM
ret