;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* dct.asm
;*
;* Abstract
;* WelsDctFourT4_sse2
;*
;* History
;* 8/4/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
;-------------------------------------------------------------------------------
; MMX_SumSubDiv2 a, b, out
;   Word-wise (4 x int16 per MMX register) half-butterfly:
;       out = a + (b >> 1)
;       a   = (a >> 1) - b
;   b is read only. Shifts are arithmetic (sign preserving); adds/subs wrap.
;   %1 = a   (in/out)
;   %2 = b   (in)
;   %3 = out (overwritten)
;-------------------------------------------------------------------------------
%macro MMX_SumSubDiv2 3
    movq    %3, %2                  ; out = b
    psraw   %3, 1                   ; out = b >> 1
    paddw   %3, %1                  ; out = a + (b >> 1)
    psraw   %1, 1                   ; a   = a >> 1
    psubw   %1, %2                  ; a   = (a >> 1) - b
%endmacro
;-------------------------------------------------------------------------------
; MMX_SumSub a, b, tmp
;   Word-wise butterfly:
;       a   = a + b
;       b   = b - a        (using the value of a on entry)
;   %1 = a   (in/out: sum)
;   %2 = b   (in/out: difference)
;   %3 = tmp (clobbered; holds the value of b on entry)
;-------------------------------------------------------------------------------
%macro MMX_SumSub 3
    movq    %3, %2                  ; tmp = b
    psubw   %2, %1                  ; b   = b - a
    paddw   %1, %3                  ; a   = a + tmp
%endmacro
;-------------------------------------------------------------------------------
; MMX_IDCT out0, x3, x1, x2, x0, tmp
;   One 1-D pass of a 4-point inverse transform on four words per register,
;   built from MMX_SumSub / MMX_SumSubDiv2.
;   With e = x0 + x2, f = x0 - x2, g = x1 + (x3 >> 1), h = (x1 >> 1) - x3:
;       %1 = e + g      %3 = f + h      %5 = f - h      %4 = e - g
;   %1 = pure output (entry value ignored)
;   %2 = x3, left unchanged
;   %3, %4, %5 = x1, x2, x0 on entry; overwritten with results as listed
;   %6 = scratch, clobbered
;-------------------------------------------------------------------------------
%macro MMX_IDCT 6
    MMX_SumSub      %4, %5, %6      ; %4 = e,     %5 = f
    MMX_SumSubDiv2  %3, %2, %1      ; %1 = g,     %3 = h
    MMX_SumSub      %1, %4, %6      ; %1 = e + g, %4 = e - g
    MMX_SumSub      %3, %5, %6      ; %3 = f + h, %5 = f - h
%endmacro
;-------------------------------------------------------------------------------
; MMX_StoreDiff4P res, tmp, bias, zero, mem
;   Adds four 16-bit residuals to four 8-bit values at `mem` and writes the
;   clipped bytes back:
;       mem[0..3] = clip_u8( ((res + bias) >> 6) + mem[0..3] )
;   %1 = res  : four int16 residuals (clobbered)
;   %2 = tmp  : scratch, receives the widened bytes from mem
;   %3 = bias : added before the shift (callers in this file pass a register
;               prepared by WELS_DW32 -- presumably the rounding constant 32)
;   %4 = zero : interleaved as the high byte when widening; callers in this file
;               pass a register prepared by WELS_Zero
;   %5 = mem  : 32-bit memory operand, read then written
;-------------------------------------------------------------------------------
%macro MMX_StoreDiff4P 5
    movd        %2, %5              ; load 4 bytes, upper bits of tmp zeroed
    punpcklbw   %2, %4              ; widen bytes to words (high byte from %4)
    paddw       %1, %3              ; res += bias
    psraw       %1, 6               ; res >>= 6 (arithmetic)
    paddsw      %1, %2              ; res += widened bytes, signed saturation
    packuswb    %1, %2              ; words -> bytes with unsigned saturation
    movd        %5, %1              ; store the low 4 bytes
%endmacro
;-------------------------------------------------------------------------------
; WELS_EXTERN name
;   Emits a 16-byte aligned, globally visible label for `name`.
;   When PREFIX is defined the exported symbol is `_name`, and `name` is
;   %define'd to `_name` so later references in this file resolve to it.
;-------------------------------------------------------------------------------
%macro WELS_EXTERN 1
    ALIGN 16
  %ifndef PREFIX
    global %1
  %else
    global _%1
    %define %1 _%1
  %endif
%1:
%endmacro
;-------------------------------------------------------------------------------
; SIGN_EXTENSION dst, src
;   Unless X86_32 is defined, sign-extends the 32-bit `src` into `dst`
;   (movsxd). Expands to nothing when X86_32 is defined.
;   %1 = dst (destination of movsxd)
;   %2 = src (32-bit source operand)
;-------------------------------------------------------------------------------
%macro SIGN_EXTENSION 2
  %ifndef X86_32
    movsxd  %1, %2
  %endif
%endmacro
;-------------------------------------------------------------------------------
; MMX_Trans4x4W r0, r1, r2, r3, tmp
;   Transposes a 4x4 matrix of 16-bit words held one row per MMX register.
;   %1..%4 = rows 0..3 on entry
;   %5     = extra register (entry value ignored)
;   On exit the transposed rows 0..3 are in %1, %4, %5, %3 (in that order);
;   %2 holds an intermediate value and must be treated as clobbered.
;   Relies on MMX_XSwap being defined by the time this macro is invoked.
;-------------------------------------------------------------------------------
%macro MMX_Trans4x4W 5
    MMX_XSwap   wd, %1, %2, %5      ; interleave words of rows 0/1
    MMX_XSwap   wd, %3, %4, %2      ; interleave words of rows 2/3
    MMX_XSwap   dq, %1, %3, %4      ; %1 = transposed row 0, %4 = row 1
    MMX_XSwap   dq, %5, %2, %3      ; %5 = transposed row 2, %3 = row 3
%endmacro
;-------------------------------------------------------------------------------
; MMX_XSwap sfx, a, b, hi
;   Interleaves a and b using the punpckl<sfx>/punpckh<sfx> instruction pair.
;   %1 = sfx : unpack suffix pasted onto the mnemonic (this file uses wd and dq)
;   %2 = a   : in; on exit the low-half interleave of a and b
;   %3 = b   : in, unchanged
;   %4 = hi  : out; the high-half interleave of a and b
;-------------------------------------------------------------------------------
%macro MMX_XSwap 4
    movq        %4, %2              ; hi = a
    punpckh%1   %4, %3              ; hi = high-half interleave(a, b)
    punpckl%1   %2, %3              ; a  = low-half interleave(a, b)
%endmacro
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
;*******************************************************************************
; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
;*******************************************************************************
; Applies a 4x4 inverse transform to the 16 coefficients at pRs and adds the
; result to the 4x4 block of bytes at pPred (row pitch kiStride), in place.
;
; Registers (r0/r1/r2 are set up by LOAD_3_PARA from asm_inc.asm -- presumably
; r0 = pPred, r1 = kiStride, r2 = pRs, matching the C prototype above):
;   r0        : destination/prediction row pointer, advanced by 2*r1 midway
;   r1        : stride, sign-extended to pointer width on 64-bit targets
;   r2        : coefficient pointer, 4 rows of 4 int16 (8 bytes per row)
;   mm0-mm4   : working rows
;   mm6, mm7  : scratch during the transform, then bias / zero for the store
; The function uses MMX registers and executes emms before returning.
;*******************************************************************************
WELS_EXTERN IdctResAddPred_mmx
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION  r1, r1d

    ; Load the four coefficient rows.
    movq    mm0, [r2+ 0]
    movq    mm1, [r2+ 8]
    movq    mm2, [r2+16]
    movq    mm3, [r2+24]

    ; First pass: transpose, then 1-D inverse transform.
    ; MMX_Trans4x4W leaves its result in %1, %4, %5, %3 = mm0, mm3, mm4, mm2.
    MMX_Trans4x4W   mm0, mm1, mm2, mm3, mm4
    ; MMX_IDCT leaves its result in %1, %3, %5, %4 = mm1, mm3, mm0, mm4.
    MMX_IDCT        mm1, mm2, mm3, mm4, mm0, mm6

    ; Second pass on the other dimension.
    ; Transpose result: mm1, mm4, mm2, mm0.
    MMX_Trans4x4W   mm1, mm3, mm0, mm4, mm2
    ; IDCT result rows 0..3: mm3, mm4, mm1, mm2 (mm0 is free afterwards).
    MMX_IDCT        mm3, mm0, mm4, mm2, mm1, mm6

    ; Constants for the store stage (macros from asm_inc.asm; presumably
    ; mm7 = 0 and mm6 = 32 in every word -- confirm against asm_inc.asm).
    WELS_Zero       mm7
    WELS_DW32       mm6

    ; dst = clip_u8(((row + mm6) >> 6) + dst) for each of the four rows.
    MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]
    lea     r0, [r0+2*r1]           ; advance two rows
    MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]

    emms                            ; leave MMX state before returning to C
    ret
;*******************************************************************************
; void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
;*******************************************************************************
; Writes zeros to 16 rows of 32 bytes (16 int16 each) starting at `block`.
; Consecutive rows are `stride` int16 elements (stride*2 bytes) apart.
; Uses movdqa, so every row address must be 16-byte aligned.
; r0/r1 are set up by LOAD_2_PARA from asm_inc.asm -- presumably r0 = block
; and r1 = stride, matching the prototype above.
; Clobbers: r0, r1, xmm0, flags.
;*******************************************************************************
WELS_EXTERN WelsBlockZero16x16_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION  r1, r1d
    pxor    xmm0, xmm0              ; xmm0 = 0
    shl     r1, 1                   ; stride in elements -> stride in bytes
  %rep 16
    movdqa  [r0],    xmm0           ; columns 0..7
    movdqa  [r0+16], xmm0           ; columns 8..15
    add     r0, r1                  ; next row
  %endrep
    ret
;*******************************************************************************
; void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
;*******************************************************************************
; Writes zeros to 8 rows of 16 bytes (8 int16 each) starting at `block`.
; Consecutive rows are `stride` int16 elements (stride*2 bytes) apart.
; Uses movdqa, so every row address must be 16-byte aligned.
; r0/r1 are set up by LOAD_2_PARA from asm_inc.asm -- presumably r0 = block
; and r1 = stride, matching the prototype above.
; Clobbers: r0, r1, xmm0, flags.
;*******************************************************************************
WELS_EXTERN WelsBlockZero8x8_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION  r1, r1d
    pxor    xmm0, xmm0              ; xmm0 = 0
    shl     r1, 1                   ; stride in elements -> stride in bytes
  %rep 8
    movdqa  [r0], xmm0              ; one full row of 8 int16
    add     r0, r1                  ; next row
  %endrep
    ret