; Assembly x64 Intro - dct.asm of the OpenH264 decoder







;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  dct.asm
;*
;*  Abstract
;*      WelsDctFourT4_sse2
;*
;*  History
;*      8/4/2009 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
;-------------------------------------------------------------------------------
; MMX_SumSubDiv2 a, b, tmp          (macro taking three register operands)
;
;   All operands are treated as packed signed 16-bit words.
;   With A and B denoting the incoming values of %1 and %2:
;       %3 = (B >> 1) + A
;       %1 = (A >> 1) - B
;       %2 is left unchanged; the incoming value of %3 is discarded.
;
; Instruction notes:
;   MOVQ dst, src
;       MMX register / memory operands : DEST <- SRC
;       XMM source and XMM destination : DEST[63-0] <- SRC[63-0]
;       XMM source, memory destination : DEST <- SRC[63-0]
;       memory source, XMM destination : DEST[63-0] <- SRC,
;                                        DEST[127-64] <- 0
;   PSRAW : packed shift right arithmetic on words (sign bits shifted in).
;   PADDB / PADDW / PADDD : packed integer add on bytes / words / dwords.
;       A result that overflows its 8 / 16 / 32-bit lane wraps around: only
;       the low 8 / 16 / 32 bits are written (the carry is ignored).
;   PSUBB / PSUBW / PSUBD : packed integer subtract on bytes / words / dwords.
;       A result too large or too small for its lane wraps around: only the
;       low 8 / 16 / 32 bits are written.
;-------------------------------------------------------------------------------
%macro MMX_SumSubDiv2 3
    movq    %3, %2                  ; tmp = B
    psraw   %3, $01                 ; tmp = B >> 1 (arithmetic)
    paddw   %3, %1                  ; tmp = (B >> 1) + A
    psraw   %1, $01                 ; %1  = A >> 1 (arithmetic)
    psubw   %1, %2                  ; %1  = (A >> 1) - B
%endmacro



; MMX_SumSub a, b, tmp -- sum/difference of two registers of packed words.
;   With A and B denoting the incoming values of %1 and %2:
;       %1 = A + B
;       %2 = B - A
;       %3 = B (the incoming value of %3 is discarded)
%macro MMX_SumSub 3
    movq    %3, %2          ; save B, because %2 is overwritten next
    psubw   %2, %1          ; %2 = B - A
    paddw   %1, %3          ; %1 = A + B
%endmacro

; MMX_IDCT %1, %2, %3, %4, %5, %6 -- one pass made of three MMX_SumSub steps
; and one MMX_SumSubDiv2 step, on packed 16-bit words.
;   Read on entry : %2, %3, %4, %5
;   Scratch       : %6 (and %1, whose incoming value is never read)
;   With a = %5, b = %3, c = %4, d = %2 (incoming values):
;       s = a + c            t = a - c
;       u = (d >> 1) + b     v = (b >> 1) - d
;   On exit: %1 = s + u,  %3 = t + v,  %5 = t - v,  %4 = s - u.
;   %2 still holds d.
%macro MMX_IDCT 6
    MMX_SumSub      %4, %5, %6      ; %4 = s, %5 = t
    MMX_SumSubDiv2  %3, %2, %1      ; %1 = u, %3 = v
    MMX_SumSub      %1, %4, %6      ; %1 = s + u, %4 = s - u
    MMX_SumSub      %3, %5, %6      ; %3 = t + v, %5 = t - v
%endmacro


;-------------------------------------------------------------------------------
; MMX_StoreDiff4P res, tmp, round, hibytes, mem
;
;   %1 res     : register of packed signed words; modified in place
;   %2 tmp     : scratch register
;   %3 round   : register of packed words added to %1 before the shift
;   %4 hibytes : register whose low bytes become the high byte of each word
;                built from memory (a zeroed register gives zero-extension)
;   %5 mem     : memory operand; 32 bits are read from it and 32 bits are
;                written back to it
;
;   Effect: mem[0..3] = pack_unsigned_sat( widen(mem[0..3]) + ((%1 + %3) >> 6) )
;
; Instruction notes:
;   MOVD dst, src
;       MMX destination      : DEST[31-0] <- SRC, DEST[63-32]  <- 0
;       XMM destination      : DEST[31-0] <- SRC, DEST[127-32] <- 0
;       MMX or XMM source    : DEST <- SRC[31-0]
;   PUNPCKLBW / PUNPCKLWD / PUNPCKLDQ / PUNPCKLQDQ dst, src
;       Interleave the low-order bytes / words / dwords / qwords of DEST and
;       SRC. Element i of DEST goes to even position 2i of the result and
;       element i of SRC goes to odd position 2i+1. For example, the 64-bit
;       PUNPCKLBW gives
;           DEST[7..0]   <- DEST[7..0]     DEST[15..8]  <- SRC[7..0]
;           DEST[23..16] <- DEST[15..8]    DEST[31..24] <- SRC[15..8]
;           DEST[39..32] <- DEST[23..16]   DEST[47..40] <- SRC[23..16]
;           DEST[55..48] <- DEST[31..24]   DEST[63..56] <- SRC[31..24]
;       The 64-bit forms consume the low 32 bits of each operand, the 128-bit
;       forms the low 64 bits. PUNPCKLQDQ gives DEST[63-0] <- DEST[63-0],
;       DEST[127-64] <- SRC[63-0].
;   PADDSB / PADDSW : packed signed add with saturation. A byte result
;       outside the signed byte range saturates to 7FH or 80H; a word result
;       outside the signed word range saturates to 7FFFH or 8000H.
;   PACKUSWB dst, src : pack with unsigned saturation. The 64-bit form packs
;       the 4 signed words of DEST and the 4 signed words of SRC into 8
;       unsigned bytes in DEST (DEST's words land in the low 4 bytes); the
;       128-bit form does the same with 8 + 8 words into 16 bytes.
;-------------------------------------------------------------------------------
%macro MMX_StoreDiff4P 5
    movd       %2, %5               ; tmp = 4 bytes loaded from mem
    punpcklbw  %2, %4               ; widen: word i = (hibytes.b[i] << 8) | mem.b[i]
    paddw      %1, %3               ; res += round
    psraw      %1, $06              ; res >>= 6 (arithmetic)
    paddsw     %1, %2               ; res += widened memory words, signed saturation
    packuswb   %1, %2               ; words -> bytes, unsigned saturation; res in low 4 bytes
    movd       %5, %1               ; store the low 4 bytes back to mem
%endmacro


; WELS_EXTERN name
;   Aligns to 16 bytes, declares the symbol global and emits its label.
;   Without PREFIX the symbol is exported as `name`.
;   With PREFIX it is exported as `_name`, and `name` is %define'd to `_name`
;   so that the label below (and later uses of `name`) get the underscore.
%macro WELS_EXTERN 1
    ALIGN 16
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
    %1:
%endmacro


; SIGN_EXTENSION dst, src
;   Sign-extends the 32-bit src into dst with movsxd, unless X86_32 is
;   defined, in which case the macro emits no code.
%macro SIGN_EXTENSION 2
    %ifdef X86_32
        ; 32-bit build: nothing to emit
    %else
        movsxd %1, %2
    %endif
%endmacro


; MMX_Trans4x4W r0, r1, r2, r3, tmp -- transpose a 4x4 block of 16-bit words.
;   In : rows in %1, %2, %3, %4 (four words each); %5 is a spare register.
;   Out: transposed rows in %1, %4, %5, %3 (in that order); %2 is clobbered.
;   e.g. pOut mm1, mm4, mm5, mm3 when invoked with mm1, mm2, mm3, mm4, mm5.
%macro MMX_Trans4x4W 5
    MMX_XSwap wd, %1, %2, %5        ; %1 = low words of rows 0/1 interleaved, %5 = high words
    MMX_XSwap wd, %3, %4, %2        ; %3 = low words of rows 2/3 interleaved, %2 = high words
    MMX_XSwap dq, %1, %3, %4        ; %1 = column 0, %4 = column 1
    MMX_XSwap dq, %5, %2, %3        ; %5 = column 2, %3 = column 3
%endmacro


; MMX_XSwap sfx, a, b, hi -- interleave two registers at element size `sfx`.
;   %1 is the punpck suffix (this file passes wd or dq).
;   With A and B denoting the incoming values of %2 and %3:
;       %4 = punpckh<sfx>(A, B)   (high halves interleaved)
;       %2 = punpckl<sfx>(A, B)   (low halves interleaved)
;   %3 is left unchanged.
%macro MMX_XSwap  4
    movq        %4, %2              ; copy A, because %2 is overwritten below
    punpckh%1   %4, %3              ; %4 = high-half interleave of A and B
    punpckl%1   %2, %3              ; %2 = low-half interleave of A and B
%endmacro


;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

;*******************************************************************************
;   void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
;*******************************************************************************

;-------------------------------------------------------------------------------
; IdctResAddPred_mmx
;   Reads 16 int16 values (32 bytes) at [r2], applies two
;   transpose + MMX_IDCT passes, then adds the rounded, >>6 result to four
;   rows of 4 bytes at [r0], [r0+r1], [r0+2*r1], [r0+3*r1] with unsigned
;   saturation.
;   r0 / r1 / r2 are set up by LOAD_3_PARA (defined in asm_inc.asm);
;   presumably they map to pPred / kiStride / pRs in that order -- confirm
;   against asm_inc.asm.
;   Uses mm0-mm7; ends with emms so the x87 state is usable afterwards.
;-------------------------------------------------------------------------------
WELS_EXTERN IdctResAddPred_mmx                         ; define the IdctResAddPred_mmx entry point
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r1, r1d
    movq    mm0, [r2+ 0]            ; words  0..3
    movq    mm1, [r2+ 8]            ; words  4..7
    movq    mm2, [r2+16]            ; words  8..11
    movq    mm3, [r2+24]            ; words 12..15

    ; First pass. The transpose leaves its result in mm0, mm3, mm4, mm2.
    ; For reference, this invocation expands to:
    ;     movq      mm4, mm0
    ;     punpckhwd mm4, mm1        ; interleave high words
    ;     punpcklwd mm0, mm1        ; interleave low words
    ;     movq      mm1, mm2
    ;     punpckhwd mm1, mm3
    ;     punpcklwd mm2, mm3
    ;     movq      mm3, mm0
    ;     punpckhdq mm3, mm2
    ;     punpckldq mm0, mm2
    ;     movq      mm2, mm4
    ;     punpckhdq mm2, mm1
    ;     punpckldq mm4, mm1
    ; (Shown as a comment only: executing it in addition to the macro would
    ;  transpose twice and corrupt the data.)
    MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
    MMX_IDCT            mm1, mm2, mm3, mm4, mm0, mm6    ; results in mm1, mm3, mm0, mm4

    ; Second pass on the transposed intermediate.
    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2        ; results in mm1, mm4, mm2, mm0
    MMX_IDCT            mm3, mm0, mm4, mm2, mm1, mm6    ; results in mm3, mm4, mm1, mm2

    ; WELS_Zero / WELS_DW32 come from asm_inc.asm; presumably mm7 = 0 and
    ; mm6 = 32 in every word (the rounding term for the >> 6 below) -- confirm.
    WELS_Zero           mm7
    WELS_DW32           mm6

    ; Add each result row to 4 bytes at the destination; mm0 is scratch.
    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [r0+r1]
    lea     r0, [r0+2*r1]           ; advance two rows
    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [r0+r1]


    emms                            ; leave MMX state before returning
    ret

;-------------------------------------------------------------------------------
; void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
;   Stores zeros to 16 rows of 32 bytes each, starting at r0.
;   Consecutive rows are (r1 * 2) bytes apart.
;   Uses movdqa, so every row address must be 16-byte aligned.
;-------------------------------------------------------------------------------
WELS_EXTERN WelsBlockZero16x16_sse2
    %assign  push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    pxor    xmm0, xmm0              ; xmm0 = 0, the value written to every row
    shl     r1, 1                   ; r1 *= 2: row step in bytes
%rep 16
    movdqa  [r0], xmm0              ; bytes  0..15 of the row
    movdqa  [r0+16], xmm0           ; bytes 16..31 of the row
    add     r0, r1                  ; next row
%endrep
    ret

;-------------------------------------------------------------------------------
; void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
;   Stores zeros to 8 rows of 16 bytes each, starting at r0.
;   Consecutive rows are (r1 * 2) bytes apart.
;   Uses movdqa, so every row address must be 16-byte aligned.
;-------------------------------------------------------------------------------
WELS_EXTERN WelsBlockZero8x8_sse2
    %assign  push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    pxor    xmm0, xmm0              ; xmm0 = 0, the value written to every row
    shl     r1, 1                   ; r1 *= 2: row step in bytes
%rep 8
    movdqa  [r0], xmm0              ; clear one 16-byte row
    add     r0, r1                  ; next row
%endrep
    ret





; You may also be interested in: (Assembly x64 Intro - Dct.asm of OpenH264 Decode)