CMOVcc r,r 1 1
CMOVcc r,m 1 1 1
XCHG r,r 3
XCHG r,m 4 1 1 1 high b)
XLAT 1 1
PUSH r/i 1 1 1
POP r 1 1
POP (E)SP 2 1
PUSH m 1 1 1 1
POP m 5 1 1 1
PUSH sr 2 1 1
POP sr 8 1
PUSHF(D) 3 11 1 1
POPF(D) 10 6 1
PUSHA(D) 2 8 8
POPA(D) 2 8
LAHF SAHF 1
LEA r,m 1 1 c)
LDS LES LFS LGS LSS m 8 3
ADD SUB AND OR XOR r,r/i 1
ADD SUB AND OR XOR r,m 1 1
ADD SUB AND OR XOR m,r/i 1 1 1 1
ADC SBB r,r/i 2
ADC SBB r,m 2 1
ADC SBB m,r/i 3 1 1 1
CMP TEST r,r/i 1
CMP TEST m,r/i 1 1
INC DEC NEG NOT r 1
INC DEC NEG NOT m 1 1 1 1
AAS DAA DAS 1
AAD 1 2 4
AAM 1 1 2 15
MUL IMUL r,(r),(i) 1 4 1/1
MUL IMUL (r),m 1 1 4 1/1
DIV IDIV r8 2 1 19 1/12
DIV IDIV r16 3 1 23 1/21
DIV IDIV r32 3 1 39 1/37
DIV IDIV m8 2 1 1 19 1/12
DIV IDIV m16 2 1 1 23 1/21
DIV IDIV m32 2 1 1 39 1/37
CBW CWDE 1
CWD CDQ 1
SHR SHL SAR ROR ROL r,i/CL 1
SHR SHL SAR ROR ROL m,i/CL 1 1 1 1
RCR RCL r,1 1 1
RCR RCL r8,i/CL 4 4
RCR RCL r16/32,i/CL 3 3
RCR RCL m,1 1 2 1 1 1
RCR RCL m8,i/CL 4 3 1 1 1
RCR RCL m16/32,i/CL 4 2 1 1 1
SHLD SHRD r,r,i/CL 2
SHLD SHRD m,r,i/CL 2 1 1 1 1
BT r,r/i 1
BT m,r/i 1 6 1
BTR BTS BTC r,r/i 1
BTR BTS BTC m,r/i 1 6 1 1 1
BSF BSR r,r 1 1
BSF BSR r,m 1 1 1
SETcc r 1
SETcc m 1 1 1
JMP short/near 1 1/2
JMP far 21 1
JMP r 1 1/2
JMP m(near) 1 1 1/2
JMP m(far) 21 2
conditional jump short/near 1 1/2
CALL near 1 1 1 1 1/2
CALL far 28 1 2 2
CALL r 1 2 1 1 1/2
CALL m(near) 1 4 1 1 1 1/2
CALL m (far) 28 2 2 2
RETN 1 2 1 1/2
RETN i 1 3 1 1/2
RETF 23 3
RETF i 23 3
J(E)CXZ short 1 1
LOOP short 2 1 8
LOOP(N)E short 2 1 8
ENTER i,0 12 1 1
ENTER a,b ca. 18+4b b-1 2b
LEAVE 2 1
BOUND r,m 7 6 2
CLC STC CMC 1
CLD STD 4
CLI 9
STI 17
INTO 5
LODS 2
REP LODS 10+6n
STOS 1 1 1
REP STOS ca. 5n a)
MOVS 1 3 1 1
REP MOVS ca. 6n a)
SCAS 1 2
REP(N)E SCAS 12+7n
CMPS 4 2
REP(N)E CMPS 12+9n
BSWAP 1 1
CPUID 23-48
RDTSC 31
IN 18 >300
OUT 18 >300
PREFETCHNTA d) m 1
PREFETCHT0 d) m 1
PREFETCHT1 d) m 1
PREFETCHT2 d) m 1
SFENCE d) 1 1 1/6
Notes:
a) faster under certain conditions: see chapter 26.3.
b) see chapter 26.1.
c) 3 if constant without base or index register
d) PIII only.
2 Floating point instructions (PPro, PII and PIII)
Instruction Operands micro-ops delay throughput
p0 p1 p01 p2 p3 p4
FLD r 1
FLD m32/64 1 1
FLD m80 2 2
FBLD m80 38 2
FST(P) r 1
FST(P) m32/m64 1 1 1
FSTP m80 2 2 2
FBSTP m80 165 2 2
FXCH r 0 3/1 f)
FILD m 3 1 5
FIST(P) m 2 1 1 5
FLDZ 1
FLD1 FLDPI FLDL2E etc. 2
FCMOVcc r 2 2
FNSTSW AX 3 7
FNSTSW m16 1 1 1
FLDCW m16 1 1 1 10
FNSTCW m16 1 1 1
FADD(P) FSUB(R)(P) r 1 3 1/1
FADD(P) FSUB(R)(P) m 1 1 3-4 1/1
FMUL(P) r 1 5 1/2 g)
FMUL(P) m 1 1 5-6 1/2 g)
FDIV(R)(P) r 1 38 h) 1/37
FDIV(R)(P) m 1 1 38 h) 1/37
FABS 1
FCHS 3 2
FCOM(P) FUCOM r 1 1
FCOM(P) FUCOM m 1 1 1
FCOMPP FUCOMPP 1 1 1
FCOMI(P) FUCOMI(P) r 1 1
FCOMI(P) FUCOMI(P) m 1 1 1
FIADD FISUB(R) m 6 1
FIMUL m 6 1
FIDIV(R) m 6 1
FICOM(P) m 6 1
FTST 1 1
FXAM 1 2
FPREM 23
FPREM1 33
FRNDINT 30
FSCALE 56
FXTRACT 15
FSQRT 1 69 e,i)
FSIN FCOS 17-97 27-103 e)
FSINCOS 18-110 29-130 e)
F2XM1 17-48 66 e)
FYL2X 36-54 103 e)
FYL2XP1 31-53 98-107 e)
FPTAN 21-102 13-143 e)
FPATAN 25-86 44-143 e)
FNOP 1
FINCSTP FDECSTP 1
FFREE r 1
FFREEP r 2
FNCLEX 3
FNINIT 13
FNSAVE 141
FRSTOR 72
WAIT 2
Notes:
e) not pipelined
f) FXCH generates 1 micro-op that is resolved by register renaming without going to any port.
g) FMUL uses the same circuitry as integer multiplication. Therefore, the combined throughput of mixed floating point and integer multiplications is 1 FMUL + 1 IMUL per 3 clock cycles.
h) FDIV delay depends on precision specified in control word: precision 64 bits gives delay 38, precision 53 bits gives delay 32, precision 24 bits gives delay 18. Division by a power of 2 takes 9 clocks. Throughput is 1/(delay-1).
i) faster for lower precision.
3 MMX instructions (PII and PIII)
Instruction Operands micro-ops delay throughput
p0 p1 p01 p2 p3 p4
MOVD MOVQ r,r 1 2/1
MOVD MOVQ r64,m32/64 1 1/1
MOVD MOVQ m32/64,r64 1 1 1/1
PADD PSUB PCMP r64,r64 1 1/1
PADD PSUB PCMP r64,m64 1 1 1/1
PMUL PMADD r64,r64 1 3 1/1
PMUL PMADD r64,m64 1 1 3 1/1
PAND PANDN POR PXOR r64,r64 1 2/1
PAND PANDN POR PXOR r64,m64 1 1 1/1
PSRA PSRL PSLL r64,r64/i 1 1/1
PSRA PSRL PSLL r64,m64 1 1 1/1
PACK PUNPCK r64,r64 1 1/1
PACK PUNPCK r64,m64 1 1 1/1
EMMS 11 6 k)
MASKMOVQ d) r64,r64 1 1 1 2-8 1/30-1/2
PMOVMSKB d) r32,r64 1 1 1/1
MOVNTQ d) m64,r64 1 1 1/30-1/1
PSHUFW d) r64,r64,i 1 1 1/1
PSHUFW d) r64,m64,i 1 1 2 1/1
PEXTRW d) r32,r64,i 1 1 2 1/1
PINSRW d) r64,r32,i 1 1 1/1
PINSRW d) r64,m16,i 1 1 2 1/1
PAVGB PAVGW d) r64,r64 1 1 2/1
PAVGB PAVGW d) r64,m64 1 1 2 1/1
PMINUB PMAXUB PMINSW PMAXSW d) r64,r64 1 1 2/1
PMINUB PMAXUB PMINSW PMAXSW d) r64,m64 1 1 2 1/1
PMULHUW d) r64,r64 1 3 1/1
PMULHUW d) r64,m64 1 1 4 1/1
PSADBW d) r64,r64 2 1 5 1/2
PSADBW d) r64,m64 2 1 1 6 1/2
Notes:
d) PIII only.
k) you may hide the delay by inserting other instructions between EMMS and any subsequent floating point instruction.
4 XMM instructions (PIII)
Instruction Operands micro-ops delay throughput
p0 p1 p01 p2 p3 p4
MOVAPS r128,r128 2 1 1/1
MOVAPS r128,m128 2 2 1/2
MOVAPS m128,r128 2 2 3 1/2
MOVUPS r128,m128 4 2 1/4
MOVUPS m128,r128 1 4 4 3 1/4
MOVSS r128,r128 1 1 1/1
MOVSS r128,m32 1 1 1 1/1
MOVSS m32,r128 1 1 1 1/1
MOVHPS MOVLPS r128,m64 1 1 1/1
MOVHPS MOVLPS m64,r128 1 1 1 1/1
MOVLHPS MOVHLPS r128,r128 1 1 1/1
MOVMSKPS r32,r128 1 1 1/1
MOVNTPS m128,r128 2 2 1/15-1/2
CVTPI2PS r128,r64 2 3 1/1
CVTPI2PS r128,m64 2 1 4 1/2
CVTPS2PI CVTTPS2PI r64,r128 2 3 1/1
CVTPS2PI r64,m128 1 2 4 1/1
CVTSI2SS r128,r32 2 1 4 1/2
CVTSI2SS r128,m32 2 2 5 1/2
CVTSS2SI CVTTSS2SI r32,r128 1 1 3 1/1
CVTSS2SI r32,m128 1 2 4 1/2
ADDPS SUBPS r128,r128 2 3 1/2
ADDPS SUBPS r128,m128 2 2 3 1/2
ADDSS SUBSS r128,r128 1 3 1/1
ADDSS SUBSS r128,m32 1 1 3 1/1
MULPS r128,r128 2 4 1/2
MULPS r128,m128 2 2 4 1/2
MULSS r128,r128 1 4 1/1
MULSS r128,m32 1 1 4 1/1
DIVPS r128,r128 2 48 1/34
DIVPS r128,m128 2 2 48 1/34
DIVSS r128,r128 1 18 1/17
DIVSS r128,m32 1 1 18 1/17
ANDPS ANDNPS ORPS XORPS r128,r128 2 2 1/2
ANDPS ANDNPS ORPS XORPS r128,m128 2 2 2 1/2
MAXPS MINPS r128,r128 2 3 1/2
MAXPS MINPS r128,m128 2 2 3 1/2
MAXSS MINSS r128,r128 1 3 1/1
MAXSS MINSS r128,m32 1 1 3 1/1
CMPccPS r128,r128 2 3 1/2
CMPccPS r128,m128 2 2 3 1/2
CMPccSS r128,r128 1 1 3 1/1
CMPccSS r128,m32 1 1 3 1/1
COMISS UCOMISS r128,r128 1 1 1/1
COMISS UCOMISS r128,m32 1 1 1 1/1
SQRTPS r128,r128 2 56 1/56
SQRTPS r128,m128 2 2 57 1/56
SQRTSS r128,r128 2 30 1/28
SQRTSS r128,m32 2 1 31 1/28
RSQRTPS r128,r128 2 2 1/2
RSQRTPS r128,m128 2 2 3 1/2
RSQRTSS r128,r128 1 1 1/1
RSQRTSS r128,m32 1 1 2 1/1
RCPPS r128,r128 2 2 1/2
RCPPS r128,m128 2 2 3 1/2
RCPSS r128,r128 1 1 1/1
RCPSS r128,m32 1 1 2 1/1
SHUFPS r128,r128,i 2 1 2 1/2
SHUFPS r128,m128,i 2 2 2 1/2
UNPCKHPS UNPCKLPS r128,r128 2 2 3 1/2
UNPCKHPS UNPCKLPS r128,m128 2 2 3 1/2
LDMXCSR m32 11 15 1/15
STMXCSR m32 6 7 1/9
FXSAVE m4096 116 62
FXRSTOR m4096 89 68
====================================================================