Assembly x64 Intro - SSE2 Hadamard 4 DC


/* dc value pick up and hdm_4x4 */
void WelsHadamardT4Dc_c (int16_t* pLumaDc, int16_t* pDct)

{
  int32_t p[16], s[4];
  int32_t i, iIdx;

  for (i = 0 ; i < 16 ; i += 4) {
    iIdx = ((i & 0x08) << 4) + ((i & 0x04) << 3);
    s[0] = pDct[iIdx ] + pDct[iIdx + 80];
    s[3] = pDct[iIdx ] - pDct[iIdx + 80];
    s[1] = pDct[iIdx + 16] + pDct[iIdx + 64];
    s[2] = pDct[iIdx + 16] - pDct[iIdx + 64];

    p[i  ] = s[0] + s[1];
    p[i + 2] = s[0] - s[1];
    p[i + 1] = s[3] + s[2];
    p[i + 3] = s[3] - s[2];
  }

  for (i = 0 ; i < 4 ; i ++) {
    s[0] = p[i ] + p[i + 12];
    s[3] = p[i ] - p[i + 12];
    s[1] = p[i + 4] + p[i + 8];
    s[2] = p[i + 4] - p[i + 8];

    pLumaDc[i  ] = WELS_CLIP3 ((s[0] + s[1] + 1) >> 1, -32768, 32767);
    pLumaDc[i + 8 ] = WELS_CLIP3 ((s[0] - s[1] + 1) >> 1, -32768, 32767);
    pLumaDc[i + 4 ] = WELS_CLIP3 ((s[3] + s[2] + 1) >> 1, -32768, 32767);
    pLumaDc[i + 12] = WELS_CLIP3 ((s[3] - s[2] + 1) >> 1, -32768, 32767);
  }
}


;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SSE2_Load4Col       xmm1, xmm5, xmm6, xmm0, r1
    SSE2_Load4Col       xmm2, xmm5, xmm6, xmm0, r1 + 0x40
    SSE2_Load4Col       xmm3, xmm5, xmm6, xmm0, r1 + 0x100
    SSE2_Load4Col       xmm4, xmm5, xmm6, xmm0, r1 + 0x140

    SSE2_SumSubD        xmm1, xmm2, xmm7
    SSE2_SumSubD        xmm3, xmm4, xmm7
    SSE2_SumSubD        xmm2, xmm4, xmm7
    SSE2_SumSubD        xmm1, xmm3, xmm7

    SSE2_Trans4x4D      xmm4, xmm2, xmm1, xmm3, xmm5    ; pOut: xmm4,xmm3,xmm5,xmm1

    SSE2_SumSubD        xmm4, xmm3, xmm7
    SSE2_SumSubD        xmm5, xmm1, xmm7

    WELS_DD1 xmm6
    SSE2_SumSubDiv2D    xmm3, xmm1, xmm6, xmm0          ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
    SSE2_SumSubDiv2D    xmm4, xmm5, xmm6, xmm1          ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
    SSE2_Trans4x4D      xmm3, xmm0, xmm1, xmm4, xmm2    ; pOut: xmm3,xmm4,xmm2,xmm1

    packssdw    xmm3,   xmm4
    packssdw    xmm2,   xmm1
    movdqa  [r0+ 0],   xmm3
    movdqa  [r0+16],   xmm2

    POP_XMM
    ret




你可能感兴趣的:(Assembly x64 Intro - SSE2 Hadamard 4 DC)