%%%-------------------------------------------------------------------
%%% @author cnkizy
%%% @copyright (C) 2020
%%% @doc
%%% VM内部调优排查
%%% https://www.cnblogs.com/lulu/p/4149204.html
%%% @end
%%% Created : 2021/1/23-12-05
%%%-------------------------------------------------------------------
-module(analysis_lib).
-description("analysis_lib").
-author("cnkizy").
-vsn(1).
%% API
-export([fp_pid/1, fp_apply/3, pstack/1, etop/0, etop_mem/0, etop_stop/0, gc_all/0, eprof_all/1, eprof/2]).
-export([scheduler_usage/0, scheduler_usage/1]).
-export([scheduler_stat/0, scheduler_stat/1]).
-export([trace/1, trace/2, trace_stop/0]).
-export([proc_mem_all/1, proc_mem/1, proc_mem/2]).
%% -----------------------------------------------------------------
%% Function: fp_apply/3
%% Description: Runs M:F(A) once under fprof and writes the analysis
%% to "profile.txt" in the current working directory. fprof output is
%% fairly detailed and can be used to find hot call paths.
%% In the report, CNT is the number of calls, ACC is the accumulated
%% time including called functions, and OWN is the time spent in the
%% function itself excluding called functions. Times are in ms.
%% Detail Reference: https://www.it610.com/article/519481.htm
%% Returns: the result of fprof:analyse/1.
%% -----------------------------------------------------------------
fp_apply(M, F, A) ->
    fprof:start(),
    try
        fprof:apply(M, F, A),
        fprof:profile(),
        fprof:analyse({dest, "profile.txt"})
    after
        %% Stop the server last: stopping it before profile/analyse only
        %% causes it to be started again implicitly and left running
        %% (holding the profile data) after this function returns.
        fprof:stop()
    end.
%% -----------------------------------------------------------------
%% Function: fp_pid/1
%% Description: Traces the process(es) given by Pid with fprof for a
%% fixed window of 1000 ms, then writes the analysis to "profile.txt"
%% in the current working directory.
%% Returns: the result of fprof:analyse/1.
%% -----------------------------------------------------------------
fp_pid(Pid) ->
    fprof:start(),
    try
        fprof:trace([start, {procs, Pid}]),
        %% Give the traced process time to do some work; without a
        %% sampling window the trace is stopped before anything is captured.
        timer:sleep(1000),
        fprof:trace([stop]),
        fprof:profile(),
        fprof:analyse({dest, "profile.txt"})
    after
        fprof:stop()
    end.
%% ================================== 进程栈 =======================================
%% -----------------------------------------------------------------
%% Function: pstack/1
%% Description: Prints the backtrace of a process, similar to jstack.
%% Useful when many processes are suspended, the process count is too
%% high, the node is slow, or something appears to hang.
%% Accepts either a registered name (atom) or a pid.
%% Returns: ok after printing, or undefined when the name is not
%% registered or the process is no longer alive.
%% -----------------------------------------------------------------
pstack(Reg) when is_atom(Reg) ->
    case whereis(Reg) of
        undefined -> undefined;
        Pid -> pstack(Pid)
    end;
pstack(Pid) ->
    case process_info(Pid, backtrace) of
        {backtrace, Backtrace} ->
            io:format("~s~n", [Backtrace]);
        undefined ->
            %% process_info/2 returns undefined for a dead process.
            undefined
    end.
%% -----------------------------------------------------------------
%% Function: etop/0
%% Description: Starts etop in a separate process with text output,
%% an interval of 10, 20 lines, sorted by reductions (i.e. a ranking
%% of processes by CPU work).
%% Returns: the pid of the spawned process.
%% -----------------------------------------------------------------
etop() ->
    Options = [{output, text}, {interval, 10}, {lines, 20}, {sort, reductions}],
    spawn(fun() -> etop:start(Options) end).
%% -----------------------------------------------------------------
%% Function: etop_mem/0
%% Description: Starts etop in a separate process with text output,
%% an interval of 10, 20 lines, sorted by memory (i.e. a ranking of
%% processes by memory usage).
%% Returns: the pid of the spawned process.
%% -----------------------------------------------------------------
etop_mem() ->
    Options = [{output, text}, {interval, 10}, {lines, 20}, {sort, memory}],
    spawn(fun() -> etop:start(Options) end).
%% -----------------------------------------------------------------
%% Function: etop_stop/0
%% Description: Stops etop by delegating to etop:stop/0.
%% Returns: whatever etop:stop/0 returns.
%% -----------------------------------------------------------------
etop_stop() ->
    etop:stop().
%% -----------------------------------------------------------------
%% Function: gc_all/0
%% Description: Forces a garbage collection on every process returned
%% by processes/0. Handy when process memory is high, to tell a real
%% memory leak apart from GC simply not keeping up.
%% Returns: the list of erlang:garbage_collect/1 results, one per process.
%% -----------------------------------------------------------------
gc_all() ->
    lists:map(fun(Pid) -> erlang:garbage_collect(Pid) end, processes()).
%% -----------------------------------------------------------------
%% Function: eprof_all/1, eprof/2
%% Description:
%%
%% eprof_all/1 runs eprof against every process on the node (minus the
%% process registered as eprof, if any); eprof/2 runs it against the
%% given list of Pids. TimeoutSec is the profiling window in SECONDS.
%%
%% eprof has a noticeable impact on live traffic - use with care!
%% The original author's guidance is TimeoutSec < 10 and fewer than
%% 1000 processes, otherwise the node may crash.
%%
%% Result:
%% eprof:analyze(total) prints, per Mod:Fun, the call count and the
%% time actually spent in that function (time spent in functions it
%% calls is not accumulated into it).
%% Returns: the result of eprof:stop/0.
%% -----------------------------------------------------------------
eprof_all(TimeoutSec) ->
    eprof(processes() -- [whereis(eprof)], TimeoutSec).

eprof(Pids, TimeoutSec) ->
    eprof:start(),
    eprof:start_profiling(Pids),
    %% timer:sleep/1 takes milliseconds, the argument is in seconds.
    timer:sleep(TimeoutSec * 1000),
    eprof:stop_profiling(),
    eprof:analyze(total),
    eprof:stop().
%% -----------------------------------------------------------------
%% Function: scheduler_usage/0, scheduler_usage/1
%% Description: Samples scheduler_wall_time twice, RunMs milliseconds
%% apart (1000 ms for scheduler_usage/0), and prints the utilisation
%% of each scheduler plus the overall total as
%% [{total, Ratio}, {SchedulerId, Ratio}, ...].
%% Because of spin-waiting and scheduling work, the figure may be
%% much lower than what top reports.
%% Returns: ok (the result of io:format/2).
%% -----------------------------------------------------------------
scheduler_usage() ->
    scheduler_usage(1000).

scheduler_usage(RunMs) ->
    erlang:system_flag(scheduler_wall_time, true),
    Before = lists:sort(erlang:statistics(scheduler_wall_time)),
    timer:sleep(RunMs),
    After = lists:sort(erlang:statistics(scheduler_wall_time)),
    erlang:system_flag(scheduler_wall_time, false),
    %% Per-scheduler deltas of {active time, total wall time}.
    Deltas = lists:zipwith(
        fun({Id, Active0, Wall0}, {Id, Active1, Wall1}) ->
            {Id, Active1 - Active0, Wall1 - Wall0}
        end,
        Before,
        After
    ),
    PerScheduler = [{Id, Active / Wall} || {Id, Active, Wall} <- Deltas],
    ActiveSum = lists:sum([Active || {_, Active, _} <- Deltas]),
    WallSum = lists:sum([Wall || {_, _, Wall} <- Deltas]),
    Overall = ActiveSum / WallSum,
    io:format("~p~n", [[{total, Overall} | PerScheduler]]).
%% -----------------------------------------------------------------
%% Function: scheduler_stat/0, scheduler_stat/1
%% Description: Enables scheduling statistics, samples
%% total_scheduling_statistics twice, RunMs milliseconds apart
%% (1000 ms for scheduler_stat/0), and returns the difference per key
%% as [{Key, Delta1, Delta2}].
%% Per the original author's note, the first number is the count of
%% executed processes and the second the count of migrated processes.
%% -----------------------------------------------------------------
scheduler_stat() ->
    scheduler_stat(1000).

scheduler_stat(RunMs) ->
    erlang:system_flag(scheduling_statistics, enable),
    Before = erlang:system_info(total_scheduling_statistics),
    timer:sleep(RunMs),
    After = erlang:system_info(total_scheduling_statistics),
    erlang:system_flag(scheduling_statistics, disable),
    lists:zipwith(
        fun(Old, New) -> scheduler_stat_({Old, New}) end,
        Before,
        After
    ).

%% Subtracts the earlier sample from the later one for a single key.
scheduler_stat_({{Key, In0, Out0}, {Key, In1, Out1}}) ->
    {Key, In1 - In0, Out1 - Out0}.
%% -----------------------------------------------------------------
%% Function: trace/1
%% Description: Traces calls to every function (local and exported)
%% of Mod in all processes, logging the MFA of each call. Output gets
%% hard to read when the arguments are large.
%% Returns: the result of dbg:p/2.
%% -----------------------------------------------------------------
trace(Mod) ->
    dbg:tracer(),
    dbg:tpl(Mod, '_', []),
    dbg:p(all, [call]).
%% -----------------------------------------------------------------
%% Function: trace/2
%% Description: Same as trace/1, but also adds Node to the set of
%% traced nodes via dbg:n/1, so calls to Mod made on Node are traced
%% as well; the output is printed in the local shell.
%% Returns: the result of dbg:p/2.
%% -----------------------------------------------------------------
trace(Node, Mod) ->
    dbg:tracer(),
    dbg:n(Node),
    dbg:tpl(Mod, '_', []),
    dbg:p(all, [call]).
%% -----------------------------------------------------------------
%% Function: trace_stop/0
%% Description: Stops tracing started by trace/1 or trace/2 by
%% delegating to dbg:stop_clear/0.
%% Returns: whatever dbg:stop_clear/0 returns.
%% -----------------------------------------------------------------
trace_stop() ->
    dbg:stop_clear().
%% ================================== 内存高OOM 排查工具 =======================================
%% -----------------------------------------------------------------
%% Function: proc_mem_all/1
%% Description: Reports every process on the node whose heap exceeds
%% SizeLimitKb, via proc_mem/2. Each process is labelled with the
%% name `undefined`. The original author notes that etop cannot cope
%% with nodes running 100k+ processes, while this can; once a suspect
%% process is found, investigate it with pstack/1 and its
%% message_queue_len.
%% -----------------------------------------------------------------
proc_mem_all(SizeLimitKb) ->
    Unnamed = lists:map(fun(Pid) -> {undefined, Pid} end, erlang:processes()),
    proc_mem(Unnamed, SizeLimitKb).
%% -----------------------------------------------------------------
%% Function: proc_mem/1
%% Description: Like proc_mem_all/1, but only considers the processes
%% returned by release_handler_1:get_supervised_procs/0 that are still
%% alive, labelling each with its name from that list.
%% -----------------------------------------------------------------
proc_mem(SizeLimitKb) ->
    Supervised = release_handler_1:get_supervised_procs(),
    Alive = [
        {Name, Pid}
     || {_SupPid, Name, Pid, _Modules} <- Supervised,
        is_process_alive(Pid)
    ],
    proc_mem(Alive, SizeLimitKb).
%% -----------------------------------------------------------------
%% Function: proc_mem/2
%% Description: Given Procs = [{Name, Pid}], finds the processes whose
%% total heap size in bytes is strictly greater than SizeLimitKb KiB.
%% Processes that have already exited are skipped.
%% Returns: {Total, [{Name, Pid, SizeBytes}]} where the list is sorted
%% by size, largest first, and Total is the sum of the sizes of the
%% listed processes only (not of all processes inspected).
%% -----------------------------------------------------------------
proc_mem(Procs, SizeLimitKb) ->
    SizeLimit = SizeLimitKb * 1024,
    %% total_heap_size is reported in words; the word size depends on
    %% the emulator (8 bytes on 64-bit, 4 bytes on 32-bit).
    WordSize = erlang:system_info(wordsize),
    Collect = fun({Name, Pid}, {Acc, TotalSize} = Unchanged) ->
        case erlang:process_info(Pid, total_heap_size) of
            {total_heap_size, Words} when Words * WordSize > SizeLimit ->
                Size = Words * WordSize,
                {[{Name, Pid, Size} | Acc], TotalSize + Size};
            _ ->
                %% Below the limit, or the process is gone.
                Unchanged
        end
    end,
    {Over, Total} = lists:foldl(Collect, {[], 0}, Procs),
    {Total, lists:reverse(lists:keysort(3, Over))}.
%% 分析单个函数时,fprof较为实用。
%% 褚霸发过一篇关于fprof的文章: http://blog.yufeng.info/archives/tag/fprof
%% 只收集自己感兴趣的数据才有意义。