简要过程如下:
开始剖析:
首先写一个lua的demo程序
local x = 0
for out=1,100 do
for n=1,200 do
x = x+1
end
print(x)
end
这种固定次数的循环必然会触发热点优化。
$ ./luajit -jv x.lua
[TRACE 1 x.lua:3 loop]
200
400
600
800
1000
1200
1400
1600
1800
[TRACE --- (1/3) x.lua:6 -- NYI: FastFunc print]
2000
[TRACE --- (1/3) x.lua:6 -- NYI: FastFunc print]
2200
[TRACE --- (1/3) x.lua:6 -- NYI: FastFunc print]
2400
[TRACE --- (1/3) x.lua:6 -- NYI: FastFunc print]
2600
[TRACE 2 (1/3) x.lua:6 -- fallback to interpreter]
$ ./luajit -jdump x2.lua
---- TRACE 1 start x2.lua:3
0010 ADDVN 0 0 0 ; 1
0011 FORL 5 => 0010
---- TRACE 1 IR
0001 int SLOAD #6 CI
0002 > num SLOAD #1 T
0003 + num ADD 0002 +1
0004 + int ADD 0001 +1
0005 > int LE 0004 +200
0006 ------ LOOP ------------
0007 + num ADD 0003 +1
0008 + int ADD 0004 +1
0009 > int LE 0008 +200
0010 int PHI 0004 0008
0011 num PHI 0003 0007
---- TRACE 1 mcode 81
2a4fffa3 mov dword [0x4092d4a0], 0x1
2a4fffae movsd xmm0, [0x4093dae0]
2a4fffb7 cvtsd2si ebp, [rdx+0x28]
2a4fffbc cmp dword [rdx+0x4], 0xfffeffff
2a4fffc3 jnb 0x2a4f0010 ->0
2a4fffc9 movsd xmm7, [rdx]
2a4fffcd addsd xmm7, xmm0
2a4fffd1 add ebp, +0x01
2a4fffd4 cmp ebp, 0xc8
2a4fffda jg 0x2a4f0014 ->1
->LOOP:
2a4fffe0 addsd xmm7, xmm0
2a4fffe4 add ebp, +0x01
2a4fffe7 cmp ebp, 0xc8
2a4fffed jle 0x2a4fffe0 ->LOOP
2a4fffef jmp 0x2a4f001c ->3
---- TRACE 1 stop -> loop
200
400
600
800
1000
1200
1400
1600
1800
---- TRACE 2 start 1/3 x2.lua:6
0012 GGET 5 0 ; "print"
0013 MOV 6 0
0014 CALL 5 1 2
0000 . FUNCC ; print
---- TRACE 2 abort x2.lua:6 -- NYI: FastFunc print
2000
---- TRACE 2 start 1/3 x2.lua:6
0012 GGET 5 0 ; "print"
0013 MOV 6 0
0014 CALL 5 1 2
0000 . FUNCC ; print
---- TRACE 2 abort x2.lua:6 -- NYI: FastFunc print
2200
---- TRACE 2 start 1/3 x2.lua:6
0012 GGET 5 0 ; "print"
0013 MOV 6 0
0014 CALL 5 1 2
0000 . FUNCC ; print
---- TRACE 2 abort x2.lua:6 -- NYI: FastFunc print
通过浏览源码 初步定位
LuaJIT用一张hash表,维护了相关指令(跳转和调用)的热度
HotCount hotcount[HOTCOUNT_SIZE]; /* Hot counters. */
(gdb) r
Starting program: /home/x/luajit-2.0/src/./luajit -jv x.lua
Temporary breakpoint 5, lj_trace_hot (J=0x40000550, pc=0x4000b498) at lj_trace.c:662
662 {
(gdb) bt
#0 lj_trace_hot (J=0x40000550, pc=0x4000b498) at lj_trace.c:662
#1 0x000000000041d5a4 in lj_vm_hotloop ()
#2 0x000000000040b9f0 in lua_pcall (L=L@entry=0x40000378, nargs=nargs@entry=0, nresults=<optimized out>, errfunc=errfunc@entry=2) at lj_api.c:1052
#3 0x000000000040450c in docall (L=0x40000378, narg=0, clear=0) at luajit.c:122
#4 0x000000000040529e in handle_script (n=<optimized out>, argv=<optimized out>, L=<optimized out>) at luajit.c:290
#5 pmain (L=0x40000378) at luajit.c:540
#6 0x000000000041bb86 in lj_BC_FUNCC ()
#7 0x000000000040ba6a in lua_cpcall (L=L@entry=0x40000378, func=func@entry=0x404ac0 <pmain>, ud=ud@entry=0x0) at lj_api.c:1074
#8 0x0000000000404054 in main (argc=3, argv=0x7fffffffdff8) at luajit.c:568
继续深入分析lj_vm_hotloop ()
lj_vm_hotloop:
.byte 139,106,248,139,109,16,15,182,69,199,141,4,194,139,108,36
.byte 24,137,85,16,137,69,24,137,222,65,141,190,224,245,255,255
.byte 73,137,174,64,246,255,255,137,92,36,28,232
.long lj_trace_hot-.-4
.byte 235,171
.globl lj_vm_callhook
.hidden lj_vm_callhook
.type lj_vm_callhook, @function
.size lj_vm_callhook, 6
这里只能看到机器码字节,要找它的实现代码,需要换个方向。
源码中的vm_xxx.dasc文件,本文以vm_x86.dasc为例。
vm_x86.dasc由dynasm.lua解析,然后跟buildvm.c一起链接。
分析阶段
源码lj_dispatch.h
/* Type of hot counter. Must match the code in the assembler VM. */
/* 16 bits are sufficient. Only 0.0015% overhead with maximum slot penalty. */
typedef uint16_t HotCount;
/* Number of hot counter hash table entries (must be a power of two). */
#define HOTCOUNT_SIZE 64
#define HOTCOUNT_PCMASK ((HOTCOUNT_SIZE-1)*sizeof(HotCount))
/* Hotcount decrements. */
#define HOTCOUNT_LOOP 2
#define HOTCOUNT_CALL 1
#define hotcount_get(gg, pc) \
(gg)->hotcount[(u32ptr(pc)>>2) & (HOTCOUNT_SIZE-1)]
#define hotcount_set(gg, pc, val) \
(hotcount_get((gg), (pc)) = (HotCount)(val))
以及lj_jit.h
_(\007, hotloop, 56) /* # of iter. to detect a hot loop/call. */ \
|// Decrement hashed hotcount and trigger trace recorder if zero.
|.macro hotloop, reg
| mov reg, PC
| shr reg, 1
| and reg, HOTCOUNT_PCMASK
| sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP
| jb ->vm_hotloop
|.endmacro
解释器每次执行循环类字节码(如FORL、ITERL、LOOP)和函数入口字节码(如FUNCF)时,会通过hotloop和hotcall宏来更新并检查上面的热点计数器。
每行汇编前面的竖线(|)是DynASM的标记:以竖线开头的行由dynasm.lua预处理器当作汇编来解析,其余的行则是普通的C代码。
最终会跳转到vm_hotloop,如下
|->vm_hotloop: // Hot loop counter underflow.
|.if JIT
| mov LFUNC:RB, [BASE-8] // Same as curr_topL(L).
| mov RB, LFUNC:RB->pc
| movzx RD, byte [RB+PC2PROTO(framesize)]
| lea RD, [BASE+RD*8]
| mov L:RB, SAVE_L
| mov L:RB->base, BASE
| mov L:RB->top, RD
| mov FCARG2, PC
| lea FCARG1, [DISPATCH+GG_DISP2J]
| mov aword [DISPATCH+DISPATCH_J(L)], L:RBa
| mov SAVE_PC, PC
| call extern lj_trace_hot@8 // (jit_State *J, const BCIns *pc)
| jmp <3
|.endif
最后调用lj_trace_hot,进入追踪阶段。
综上,探测热点阶段粗略的理解就是根据循环迭代和函数调用的次数进行计数,启发式判断热点是否达到阈值,达到后进入追踪阶段。
追踪阶段
从lj_trace_hot开始,tracing subsystem除空闲状态LJ_TRACE_IDLE外,有五个活动状态(取值从LJ_TRACE_ACTIVE = 0x10之后开始)
trace_state
LJ_TRACE_IDLE, /* Trace compiler idle. */
LJ_TRACE_ACTIVE = 0x10,
LJ_TRACE_RECORD, /* Bytecode recording active. */
LJ_TRACE_START, /* New trace started. */
LJ_TRACE_END, /* End of trace. */
LJ_TRACE_ASM, /* Assemble trace. */
LJ_TRACE_ERR /* Trace aborted with error. */
追踪阶段沿着实际执行的代码路径,把执行到的字节码记录成一条线性序列;这条字节码流被翻译成中间表示(IR),然后经过优化,最终编译成机器码。