【Erlang】优化排查工具

%%%-------------------------------------------------------------------
%%% @author cnkizy
%%% @copyright (C) 2020
%%% @doc
%%%    VM内部调优排查
%%%    https://www.cnblogs.com/lulu/p/4149204.html
%%% @end 
%%% Created : 2021/1/23-12-05
%%%-------------------------------------------------------------------
-module(analysis_lib).
-description("analysis_lib").
-author("cnkizy").
-vsn(1).
%% API
-export([fp_pid/1, fp_apply/3, pstack/1, etop/0, etop_mem/0, etop_stop/0, gc_all/0, eprof_all/1, eprof/2]).
-export([scheduler_usage/0, scheduler_usage/1]).
-export([scheduler_stat/0, scheduler_stat/1]).
-export([trace/1, trace/2, trace_stop/0]).
-export([proc_mem_all/1, proc_mem/1, proc_mem/2]).
%% -----------------------------------------------------------------
%% Function: fp_apply/3
%% Description: Runs M:F(A) once under fprof and writes the analysis to
%% the relative path "profile.txt". fprof output is fairly detailed and
%% can show hot call paths.
%% In the report, CNT is the total number of calls, ACC is the time
%% elapsed in the function including the functions it calls, and OWN is
%% the time spent in the function itself (excluding called functions).
%% Time unit: ms
%% Detail Reference: https://www.it610.com/article/519481.htm
%% Returns: the result of fprof:analyse/1. An exception raised by
%% M:F(A) propagates to the caller.
%% -----------------------------------------------------------------
fp_apply(M, F, A) ->
	fprof:start(),
	try
		fprof:apply(M, F, A),
		fprof:profile(),
		fprof:analyse({dest, "profile.txt"})
	after
		%% Stop the server only after the analysis has been written.
		%% Stopping it before profile/0 makes profile/0 start a fresh
		%% server that would then be left running.
		fprof:stop()
	end.

%% -----------------------------------------------------------------
%% Function: fp_pid/1
%% Description: Profiles an already running process with fprof for a
%% 1000 ms window and writes the analysis to the relative path
%% "profile.txt". The calling process blocks for the whole window.
%% Returns: the result of fprof:analyse/1, or whatever fprof:trace/1
%% returned when tracing could not be started.
%% -----------------------------------------------------------------
fp_pid(Pid) ->
	fp_pid(Pid, 1000).

%% Internal helper: traces Pid for DurationMs milliseconds, then profiles
%% the collected trace and writes the analysis.
fp_pid(Pid, DurationMs) ->
	fprof:start(),
	try fprof:trace([start, {procs, Pid}]) of
		ok ->
			%% Without a collection window the trace would be stopped
			%% right after it was started and contain no useful data.
			timer:sleep(DurationMs),
			fprof:trace(stop),
			fprof:profile(),
			fprof:analyse({dest, "profile.txt"});
		Other ->
			Other
	after
		fprof:stop()
	end.

%% ================================== 进程栈 =======================================
%% -----------------------------------------------------------------
%% Function: pstack/1
%% Description: Prints the backtrace of a process, similar to jstack.
%% Useful when many processes are suspended, the process count is too
%% high, the node runs slowly or appears to hang.
%% Accepts either a registered name (atom) or a pid.
%% Returns: ok after printing, or undefined when the name is not
%% registered or the process is no longer alive.
%% -----------------------------------------------------------------
pstack(Reg) when is_atom(Reg) ->
	case whereis(Reg) of
		undefined -> undefined;
		Pid -> pstack(Pid)
	end;
pstack(Pid) ->
	case process_info(Pid, backtrace) of
		{backtrace, Backtrace} ->
			io:format("~s~n", [Backtrace]);
		undefined ->
			%% process_info/2 returns undefined for a dead process;
			%% report it the same way as an unregistered name.
			undefined
	end.

%% -----------------------------------------------------------------
%% Function: etop/0
%% Description: Starts etop in text mode from a separate process,
%% with an interval of 10 (seconds) and 20 lines per report, sorted by
%% reductions (used here as the CPU-usage ranking).
%% Returns: the pid of the spawned process.
%% -----------------------------------------------------------------
etop() ->
	Options = [{output, text}, {interval, 10}, {lines, 20}, {sort, reductions}],
	spawn(fun() -> etop:start(Options) end).
%% -----------------------------------------------------------------
%% Function: etop_mem/0
%% Description: Starts etop in text mode from a separate process,
%% with an interval of 10 (seconds) and 20 lines per report, sorted by
%% memory usage.
%% Returns: the pid of the spawned process.
%% -----------------------------------------------------------------
etop_mem() ->
	Options = [{output, text}, {interval, 10}, {lines, 20}, {sort, memory}],
	spawn(fun() -> etop:start(Options) end).
%% -----------------------------------------------------------------
%% Function: etop_stop/0
%% Description: Stops etop by delegating to etop:stop/0 and returns
%% its result.
%% -----------------------------------------------------------------
etop_stop() ->
	etop:stop().

%% -----------------------------------------------------------------
%% Function: gc_all/0
%% Description: Forces a garbage collection on every process of the
%% node. Handy when process memory is high, to tell a real leak apart
%% from garbage that simply has not been collected yet.
%% Returns: the list of erlang:garbage_collect/1 results, one per
%% process.
%% -----------------------------------------------------------------
gc_all() ->
	lists:map(fun erlang:garbage_collect/1, processes()).

%% -----------------------------------------------------------------
%% Function: eprof_all/1, eprof/2
%% Description:
%%
%% eprof_all/1 runs eprof over all processes of the node (minus the
%% process registered as eprof, if any); eprof/2 runs it over the given
%% list of pids. eprof has a noticeable impact on a live system, use it
%% with care!
%% Recommended: TimeoutSec < 10 and fewer than 1000 processes, otherwise
%% the node may crash.
%%
%% TimeoutSec is the length of the profiling window in SECONDS. The
%% calling process blocks for that long.
%%
%% Result:
%% Prints the time actually spent in each function (time spent in calls
%% to other functions is not accumulated into the caller), i.e. only
%% module/function, call count and execution time are available.
%% Returns the result of eprof:stop/0.
%% -----------------------------------------------------------------
eprof_all(TimeoutSec) ->
	eprof(processes() -- [whereis(eprof)], TimeoutSec).
eprof(Pids, TimeoutSec) ->
	eprof:start(),
	case eprof:start_profiling(Pids) of
		profiling ->
			%% timer:sleep/1 expects milliseconds, the argument is seconds.
			timer:sleep(TimeoutSec * 1000),
			eprof:stop_profiling(),
			eprof:analyze(total);
		Error ->
			%% Do not wait and analyse when profiling never started.
			io:format("eprof:start_profiling failed: ~p~n", [Error])
	end,
	eprof:stop().

%% -----------------------------------------------------------------
%% Function: scheduler_usage/0, scheduler_usage/1
%% Description: Samples erlang:statistics(scheduler_wall_time) before
%% and after a pause of RunMs milliseconds (1000 for scheduler_usage/0)
%% and prints, per scheduler id, the ratio of the active-time delta to
%% the total-time delta, preceded by a {total, Ratio} entry computed
%% from the summed deltas.
%% Because of spin waiting and scheduling work, the reported usage may
%% be much lower than what top shows.
%% Returns: the result of io:format/2.
%% -----------------------------------------------------------------
scheduler_usage() ->
	scheduler_usage(1000).
scheduler_usage(RunMs) ->
	erlang:system_flag(scheduler_wall_time, true),
	Before = lists:sort(erlang:statistics(scheduler_wall_time)),
	timer:sleep(RunMs),
	After = lists:sort(erlang:statistics(scheduler_wall_time)),
	erlang:system_flag(scheduler_wall_time, false),
	%% Pair up the two samples of each scheduler once and reuse the pairs.
	Samples = lists:zip(Before, After),
	PerScheduler = lists:map(fun scheduler_ratio_/1, Samples),
	{ActiveSum, ElapsedSum} = lists:foldl(fun scheduler_delta_sum_/2, {0, 0}, Samples),
	Overall = ActiveSum / ElapsedSum,
	io:format("~p~n", [[{total, Overall} | PerScheduler]]).

%% Usage ratio of one scheduler between its two samples.
scheduler_ratio_({{Id, Active0, Elapsed0}, {Id, Active1, Elapsed1}}) ->
	{Id, (Active1 - Active0) / (Elapsed1 - Elapsed0)}.

%% Adds one scheduler's active/total deltas to the running sums.
scheduler_delta_sum_({{_, Active0, Elapsed0}, {_, Active1, Elapsed1}}, {ActiveSum, ElapsedSum}) ->
	{ActiveSum + (Active1 - Active0), ElapsedSum + (Elapsed1 - Elapsed0)}.

%% -----------------------------------------------------------------
%% Function: scheduler_stat/0, scheduler_stat/1
%% Description: Enables scheduling statistics, samples
%% erlang:system_info(total_scheduling_statistics) before and after a
%% pause of RunMs milliseconds (1000 for scheduler_stat/0), disables the
%% statistics again and returns the per-key differences as
%% {Key, Delta1, Delta2} tuples.
%% Meaning of the numbers: the first is the number of processes
%% executed, the second the number of processes migrated.
%% -----------------------------------------------------------------
scheduler_stat() ->
	scheduler_stat(1000).
scheduler_stat(RunMs) ->
	erlang:system_flag(scheduling_statistics, enable),
	Before = erlang:system_info(total_scheduling_statistics),
	timer:sleep(RunMs),
	After = erlang:system_info(total_scheduling_statistics),
	erlang:system_flag(scheduling_statistics, disable),
	[scheduler_stat_(Pair) || Pair <- lists:zip(Before, After)].

%% Difference between two samples carrying the same key.
scheduler_stat_({{Key, Executed0, Migrated0}, {Key, Executed1, Migrated1}}) ->
	{Key, Executed1 - Executed0, Migrated1 - Migrated0}.

%% -----------------------------------------------------------------
%% Function: trace/1
%% Description: Traces calls to every function of Mod (local and
%% exported) in all processes, logging the full MFA of each call.
%% Output becomes hard to read when the arguments are large.
%% Returns: the result of dbg:p/2.
%% -----------------------------------------------------------------
trace(Mod) ->
	dbg:tracer(),
	dbg:tpl({Mod, '_', '_'}, []),
	dbg:p(all, [call]).
%% -----------------------------------------------------------------
%% Function: trace/2
%% Description: Same as trace/1, but first adds Node to the traced
%% nodes via dbg:n/1, so calls to Mod on that node are traced as well.
%% The trace output is printed in the local shell.
%% Returns: the result of dbg:p/2.
%% -----------------------------------------------------------------
trace(Node, Mod) ->
	dbg:tracer(),
	dbg:n(Node),
	dbg:tpl({Mod, '_', '_'}, []),
	dbg:p(all, [call]).
%% -----------------------------------------------------------------
%% Function: trace_stop/0
%% Description: Stops tracing by delegating to dbg:stop_clear/0 and
%% returns its result.
%% -----------------------------------------------------------------
trace_stop() ->
	dbg:stop_clear().

%% ================================== 内存高OOM 排查工具 =======================================
%% -----------------------------------------------------------------
%% Function: proc_mem_all/1
%% Description: Reports the heap usage of every process on the node
%% whose usage exceeds SizeLimitKb, via proc_mem/2. etop cannot cope
%% with nodes running 100k+ processes; this function can. Once a
%% suspicious process is found, investigate it with pstack/1 and its
%% message_queue_len.
%% Every process is reported under the name undefined.
%% -----------------------------------------------------------------
proc_mem_all(SizeLimitKb) ->
	Candidates = lists:map(fun(Pid) -> {undefined, Pid} end, erlang:processes()),
	proc_mem(Candidates, SizeLimitKb).
%% -----------------------------------------------------------------
%% Function: proc_mem/1
%% Description: Reports the heap usage of the processes returned by
%% release_handler_1:get_supervised_procs/0 that are still alive and
%% whose usage exceeds SizeLimitKb, via proc_mem/2. Each process is
%% reported under the name found in that list. Once a suspicious
%% process is found, investigate it with pstack/1 and its
%% message_queue_len.
%% -----------------------------------------------------------------
proc_mem(SizeLimitKb) ->
	Supervised = release_handler_1:get_supervised_procs(),
	Alive = [{Name, Pid} || {_, Name, Pid, _} <- Supervised, is_process_alive(Pid)],
	proc_mem(Alive, SizeLimitKb).
%% -----------------------------------------------------------------
%% Function: proc_mem/2
%% Description: Given a list of {Name, Pid} pairs, selects the
%% processes whose total_heap_size, converted to bytes, is strictly
%% greater than SizeLimitKb * 1024.
%% Processes for which process_info/2 returns no size (e.g. they have
%% exited) are skipped.
%% Returns: {TotalBytes, [{Name, Pid, Bytes}]} where TotalBytes is the
%% sum over the selected processes only and the list is sorted by Bytes
%% in descending order.
%% -----------------------------------------------------------------
proc_mem(Procs, SizeLimitKb) ->
	SizeLimit = SizeLimitKb * 1024,
	%% total_heap_size is reported in words; ask the VM for the word
	%% size instead of assuming 8 bytes (it is 4 on a 32-bit VM).
	WordSize = erlang:system_info(wordsize),
	{Selected, Total} = lists:foldl(
		fun({Name, Pid}, {Acc, TotalSize} = Unchanged) ->
			case erlang:process_info(Pid, total_heap_size) of
				{total_heap_size, Words} when Words * WordSize > SizeLimit ->
					Size = Words * WordSize,
					{[{Name, Pid, Size} | Acc], TotalSize + Size};
				_ ->
					Unchanged
			end
		end, {[], 0}, Procs),
	{Total, lists:reverse(lists:keysort(3, Selected))}.

分析单个函数时,fprof较为实用。

褚霸发过一篇关于 fprof 的文章: http://blog.yufeng.info/archives/tag/fprof ,只收集自己感兴趣的数据才有意义。

你可能感兴趣的:(erlang)