membase之ns_server监控数据采集

整体过程:由master节点上的tick定时器定期向所有节点(包括master自身)发送tick事件;系统状态采集模块(system_stats_collector)和membase状态采集模块(stats_collector)订阅这些事件,每收到一次事件就采集并上报一次状态

该tick守护进程只在master节点上启动

参与的类有

 

采集执行模块:

system_stats_collector 采集CPU,内存,磁盘情况

stats_collector 通过二进制命令往membase采集bucket数据

stats_archiver把采集的数据存储到mnesia中

stats_reader 读取模块

以上模块中,除stats_reader外,其余都是gen_server

 

辅助模块:

ns_tick 定时器守护进程,只在master运行

mb_master 逻辑模块,根据情况决定当前节点是master,candidate或者worker

mb_master_sup master节点的监督(supervisor)模块,负责启动master节点需要的服务,如ns_tick

 

ns_pubsub 订阅辅助模块,采集执行体中相关的gen_server模块都是通过它从而获得订阅通知功能。


代码细节分析

mb_master:判断当前节点是否应成为master,并在需要时启动master相关服务

Master节点启动
%% Start-up callback (gen_fsm-style: returns {ok, StateName, StateData}).
%%
%% Side effects, in order:
%%   1. subscribes to ns_config_events so that every {nodes_wanted, Nodes}
%%      event is relayed to this process as {peers, Nodes};
%%   2. enables trap_exit;
%%   3. arms a repeating send_heartbeat message every ?HEARTBEAT_INTERVAL.
%%
%% The initial state name depends on ns_node_disco:nodes_wanted():
%%   master    - this node is the only wanted node;
%%   candidate - this node is one of several wanted nodes;
%%   worker    - this node is not in the wanted list.
%% A non-list result from nodes_wanted/0 crashes with case_clause.
init([]) ->
    Parent = self(),
    %% The handler state (`empty`) is threaded through untouched; only the
    %% nodes_wanted event produces a message.
    RelayPeers =
        fun ({nodes_wanted, Nodes}, HandlerState) ->
                Parent ! {peers, Nodes},
                HandlerState;
            (_OtherEvent, HandlerState) ->
                HandlerState
        end,
    ns_pubsub:subscribe(ns_config_events, RelayPeers, empty),
    erlang:process_flag(trap_exit, true),
    {ok, _} = timer:send_interval(?HEARTBEAT_INTERVAL, send_heartbeat),
    ThisNode = node(),
    case ns_node_disco:nodes_wanted() of
        [ThisNode] = OnlyPeer ->
            ?log_info("I'm the only node, so I'm the master.", []),
            {ok, master, start_master(#state{last_heard=now(), peers=OnlyPeer})};
        Peers when is_list(Peers) ->
            case lists:member(ThisNode, Peers) of
                true ->
                    %% We're a candidate
                    ?log_info("Starting as candidate. Peers: ~p", [Peers]),
                    {ok, candidate, #state{last_heard=now(), peers=Peers}};
                false ->
                    %% We're a worker, but don't know who the master is yet
                    ?log_info("Starting as worker. Peers: ~p", [Peers]),
                    {ok, worker, #state{last_heard=now()}}
            end
    end.
自动接管:
%% Heartbeat timer fired while in the `candidate` state.
%%
%% Sends a heartbeat to the known peers, then checks how long it has been
%% since `last_heard` was updated. If at least ?TIMEOUT (compared against
%% timer:now_diff/2, i.e. microseconds) has elapsed, this node starts
%% mb_master_sup and moves to the `master` state; otherwise it stays a
%% candidate with unchanged state data.
handle_info(send_heartbeat, candidate,
            #state{peers=Peers, last_heard=LastHeard} = StateData) ->
    send_heartbeat(Peers, candidate, StateData),
    SilentFor = timer:now_diff(now(), LastHeard),
    case SilentFor < ?TIMEOUT of
        true ->
            {next_state, candidate, StateData};
        false ->
            %% Take over
            ?log_info("Haven't heard from a higher priority node or "
                      "a master, so I'm taking over.", []),
            {ok, SupPid} = mb_master_sup:start_link(),
            {next_state, master, StateData#state{child=SupPid, master=node()}}
    end;

ns_tick对所有的节点发送统计命令

系统运行一个tick服务(gen_server),它定义了一个定时器(timer:send_interval(Interval, tick)),在定时器触发时对所有的node发出tick事件
%% Timer tick: broadcast {tick, TimestampMs} to the ?EVENT_MANAGER event
%% manager on this node and on every connected node, then remember the
%% timestamp in the server state.
handle_info(tick, State) ->
    misc:verify_name(?MODULE), % MB-3180: make sure we're still registered
    TickTime = misc:time_to_epoch_ms_int(now()),
    Broadcast =
        fun (Target) ->
                gen_event:notify({?EVENT_MANAGER, Target}, {tick, TickTime})
        end,
    %% The local node is listed first, followed by all visible remote nodes.
    lists:foreach(Broadcast, [node() | nodes()]),
    {noreply, State#state{time=TickTime}};

通用的事件订阅模块(no embed:订阅方只需传入一个回调函数,无需自己实现gen_event处理模块)
-module(ns_pubsub).
-behaviour(gen_event).
%% Subscribes the calling process to the event manager `Name`: every event
%% is forwarded to the caller as a plain message. Returns the handler
%% reference produced by subscribe/3.
subscribe(Name) ->
    Forwarder = msg_fun(self()),
    subscribe(Name, Forwarder, ignored).
%% Installs a supervised handler on event manager `Name`.
%%
%% `Fun` is called as Fun(Event, FunState) for every event and must return
%% the next FunState; `State` is the initial FunState. The handler is
%% identified by {?MODULE, Ref}, and the fresh Ref is returned to the caller.
%% Crashes with badmatch if gen_event:add_sup_handler/3 does not return ok.
subscribe(Name, Fun, State) ->
    HandlerRef = make_ref(),
    HandlerArgs = #state{func=Fun, func_state=State},
    ok = gen_event:add_sup_handler(Name, {?MODULE, HandlerRef}, HandlerArgs),
    HandlerRef.
%% Builds a subscription callback that forwards each event to `Pid` as a
%% message. The callback only accepts the atom `ignored` as its state and
%% returns it unchanged; any other state raises function_clause.
msg_fun(Pid) ->
    fun (Event, ignored) ->
            erlang:send(Pid, Event),
            ignored
    end.
%% gen_event callback: runs the subscriber's fun on the event and stores the
%% fun's return value as the new fun state.
handle_event(Event, #state{func=Callback, func_state=CallbackState} = State) ->
    {ok, State#state{func_state=Callback(Event, CallbackState)}};


系统监控:依赖于portsigar(sigar system-level stats for erlang);相比之下,RabbitMQ是直接调用/usr/bin/vm_stat、/usr/sbin/prtconf,以及读取“文件”/proc/meminfo

%% Starts the external sigar_port program as an Erlang port.
%%
%% On success the process subscribes to ns_tick_event and keeps the port in
%% its state. If the executable is missing (open_port raises error:enoent),
%% a warning is logged, no subscription is made, and the port is stored as
%% `undefined`.
init([]) ->
    Path = path_config:component_path(bin, "sigar_port"),
    %% arg0 sets the program name the spawned executable is started with.
    Arg0 = lists:flatten(io_lib:format("portsigar for ~s", [node()])),
    PortOpts = [stream, use_stdio, exit_status, binary, eof, {arg0, Arg0}],
    Port =
        try open_port({spawn_executable, Path}, PortOpts) of
            Opened ->
                ns_pubsub:subscribe(ns_tick_event),
                Opened
        catch
            error:enoent ->
                ?log_warning("~s is missing. Will not collect system-level stats", [Path]),
                undefined
        end,
    {ok, #state{port = Port}}.
%% Tick handler: requests one stats block from the sigar port, decodes it,
%% and publishes the derived sample on ns_stats_event under the pseudo-bucket
%% "@system", stamped with the tick's timestamp TS.
%%
%% flush_ticks/1, recv_data/3 and unpack_data/2 are defined elsewhere in the
%% module. NOTE(review): judging by the log text, flush_ticks/1 presumably
%% drains queued tick messages and returns how many were dropped - confirm.
%%
%% Nothing is published when unpack_data/2 yields `undefined` for the sample;
%% the raw values are still stored as prev_sample for the next tick.
handle_info({tick, TS}, #state{port = Port, prev_sample = PrevSample}) ->
    case flush_ticks(0) of
        0 -> ok;
        N -> ?log_warning("lost ~p ticks", [N])
    end,
    %% A 32-bit zero word is the request sent to the port for one stats block.
    port_command(Port, <<0:32/native>>),
    Binary = recv_data(Port, [], ?STATS_BLOCK_SIZE),
    {Stats0, NewPrevSample} = unpack_data(Binary, PrevSample),
    case Stats0 of
        undefined -> ok;
        _ ->
            %% Sort once; the original sorted the already-sorted list again.
            Stats = lists:sort(Stats0),
            gen_event:notify(ns_stats_event,
                             {stats, "@system", #stat_entry{timestamp = TS,
                                                            values = Stats}})
    end,
    {noreply, #state{port = Port, prev_sample = NewPrevSample}};

协议解析
%% Decodes one stats block received from the port.
%%
%% Layout (all fields native-endian): a 32-bit version, a 32-bit struct size,
%% then ten 64-bit counters. The size field must equal the size of Bin in
%% bytes and the version must be 0; otherwise the call crashes with badmatch.
%% The swap page-in/page-out counters are parsed but not reported.
%%
%% Returns {Sample, RawStats}:
%%   RawStats - the decoded counters as a key/value list, for use as
%%              PrevSample on the next call;
%%   Sample   - `undefined` when PrevSample is `undefined`; otherwise the
%%              counters with cpu_local_ms / cpu_idle_ms replaced by their
%%              difference from PrevSample, plus mem_free and
%%              cpu_utilization_rate (0 when the rate cannot be computed,
%%              e.g. no CPU time elapsed).
unpack_data(Bin, PrevSample) ->
    <<Version:32/native, StructSize:32/native,
      CPULocalMS:64/native, CPUIdleMS:64/native,
      SwapTotal:64/native, SwapUsed:64/native,
      _SwapPageIn:64/native, _SwapPageOut:64/native,
      MemTotal:64/native, MemUsed:64/native,
      MemActualUsed:64/native, MemActualFree:64/native>> = Bin,
    StructSize = byte_size(Bin),
    Version = 0,
    %% Entries shared verbatim by the raw stats and the derived sample.
    CommonStats = [{swap_total, SwapTotal},
                   {swap_used, SwapUsed},
                   {mem_total, MemTotal},
                   {mem_used, MemUsed},
                   {mem_actual_used, MemActualUsed},
                   {mem_actual_free, MemActualFree}],
    RawStats = [{cpu_local_ms, CPULocalMS},
                {cpu_idle_ms, CPUIdleMS}
                | CommonStats],
    NowSamples =
        case PrevSample of
            undefined ->
                undefined;
            _ ->
                {_, PrevLocal} = lists:keyfind(cpu_local_ms, 1, PrevSample),
                {_, PrevIdle} = lists:keyfind(cpu_idle_ms, 1, PrevSample),
                LocalDiff = CPULocalMS - PrevLocal,
                IdleDiff = CPUIdleMS - PrevIdle,
                Utilization =
                    try
                        100 * (LocalDiff - IdleDiff) / LocalDiff
                    catch
                        error:badarith -> 0
                    end,
                [{mem_free, MemTotal - MemUsed},
                 {cpu_utilization_rate, Utilization},
                 {cpu_local_ms, LocalDiff},
                 {cpu_idle_ms, IdleDiff}
                 | CommonStats]
        end,
    {NowSamples, RawStats}.

stats_collector

连接本地memcached采集memcached状态数据。


持久化:stats_archiver

%% Persists a stats sample for this archiver's own bucket.
%%
%% The clause only matches when the bucket in the message equals the bucket
%% held in the state. The sample is written to the bucket's `minute` table
%% inside an mnesia transaction (retried up to ?RETRIES times); any result
%% other than {atomic, ok} crashes with badmatch. Afterwards a
%% {sample_archived, Bucket, Sample} event is published on ns_stats_event.
do_handle_info({stats, Bucket, Sample}, #state{bucket=Bucket} = State) ->
    MinuteTab = table(Bucket, minute),
    WriteSample = fun () -> mnesia:write(MinuteTab, Sample, write) end,
    {atomic, ok} = mnesia:transaction(WriteSample, ?RETRIES),
    gen_event:notify(ns_stats_event, {sample_archived, Bucket, Sample}),
    {noreply, State};

stats_reader读取数据

%% Reads the most recent samples of `Bucket` from the `Period` table and
%% averages them into buckets of `Step`.
%%
%% The window covers N * Step seconds, ending at the newest key in the table
%% (keys are compared with millisecond arithmetic here: seconds * 1000, with
%% a 500 offset applied to the lower bound). Samples are grouped by
%% misc:trunc_ts(Timestamp, Step); each group is reduced with avg/2.
%%
%% Returns {ok, Averages} in the order the fold visited the groups,
%% {ok, []} for an empty table or an empty window, or the
%% {error, _, _} tuple produced by the qlc fold.
resample(Bucket, Period, Step, N) ->
    Seconds = N * Step,
    Tab = stats_archiver:table(Bucket, Period),
    case mnesia:dirty_last(Tab) of
        '$end_of_table' ->
            {ok, []};
        NewestKey ->
            WindowMs = Seconds * 1000,
            Cutoff = NewestKey - WindowMs + 500,
            Recent = qlc:q([Entry || #stat_entry{timestamp=EntryTS} = Entry
                                         <- mnesia:table(Tab), EntryTS > Cutoff]),
            %% Accumulator: {CurrentSlot, FinishedAverages, SamplesInCurrentSlot}.
            %% Clause order matters: "same slot" is tested first, then the
            %% very first sample, then a slot change that closes the chunk.
            Group =
                fun (#stat_entry{timestamp = SampleTS} = Entry,
                     {Slot, Done, Pending}) ->
                        case misc:trunc_ts(SampleTS, Step) of
                            Slot ->
                                {Slot, Done, [Entry|Pending]};
                            NextSlot when Slot == undefined ->
                                {NextSlot, Done, [Entry]};
                            NextSlot ->
                                {NextSlot, [avg(Slot, Pending)|Done], [Entry]}
                        end
                end,
            FoldArgs = [Group, {undefined, [], []}, Recent],
            case mnesia:activity(async_dirty, fun qlc:fold/3, FoldArgs) of
                {error, _, _} = Failure ->
                    Failure;
                {undefined, [], []} ->
                    {ok, []};
                {LastSlot, Done, LastChunk} ->
                    %% Flush the still-open chunk before restoring order.
                    {ok, lists:reverse([avg(LastSlot, LastChunk)|Done])}
            end
    end.



你可能感兴趣的:(membase之ns_server监控数据采集)