Fix restart logic and add watchdog
This commit is contained in:
@@ -0,0 +1,146 @@
|
||||
-module(gmhc_watchdog).
|
||||
-behavior(gen_server).
|
||||
|
||||
-export([ watch/2
|
||||
, unwatch/0
|
||||
, ping/0 ]).
|
||||
|
||||
-export([ note_started/2
|
||||
, get_restart_info/0 ]).
|
||||
|
||||
-export([ start_link/0
|
||||
, init/1
|
||||
, handle_call/3
|
||||
, handle_cast/2
|
||||
, handle_info/2
|
||||
, terminate/2
|
||||
, code_change/3 ]).
|
||||
|
||||
-record(st, {services = #{}}).
|
||||
-record(svc, { n = 5 :: pos_integer()
|
||||
, n0 = 5 :: pos_integer()
|
||||
, interval = 5000 :: pos_integer()
|
||||
, tref :: reference()
|
||||
, mref :: reference()
|
||||
}).
|
||||
|
||||
-include_lib("kernel/include/logger.hrl").
|
||||
|
||||
watch(Interval, N) ->
|
||||
gen_server:call(?MODULE, {watch, self(), Interval, N}).
|
||||
|
||||
unwatch() ->
|
||||
gen_server:cast({unwatch, self()}).
|
||||
|
||||
ping() ->
|
||||
gen_server:cast(?MODULE, {ping, self()}).
|
||||
|
||||
note_started(Id, Info) ->
|
||||
gen_server:call(?MODULE, {note_started, Id, Info}).
|
||||
|
||||
get_restart_info() ->
|
||||
gen_server:call(?MODULE, get_restart_info).
|
||||
|
||||
start_link() ->
|
||||
gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
|
||||
|
||||
init([]) ->
|
||||
{ok, #st{}}.
|
||||
|
||||
handle_call({note_started, Id, Info}, _From, S) ->
|
||||
update_pt(Id, Info),
|
||||
{reply, ok, S};
|
||||
handle_call(get_restart_info, _From, S) ->
|
||||
{reply, get_pt(), S};
|
||||
handle_call({watch, Pid, Interval, N}, _From, S) ->
|
||||
{reply, ok, add_watch(Pid, Interval, N, S)};
|
||||
handle_call(_Req, _From, S) ->
|
||||
{reply, {error, unknown_method}, S}.
|
||||
|
||||
handle_cast({ping, Pid}, S) ->
|
||||
{noreply, reset_watch(Pid, S)};
|
||||
handle_cast({unwatch, Pid}, S) ->
|
||||
{noreply, delete_watch(Pid, S)};
|
||||
handle_cast(Msg, S) ->
|
||||
?LOG_DEBUG("Unknown cast: ~p", [Msg]),
|
||||
{noreply, S}.
|
||||
|
||||
handle_info({timeout, _, Pid}, S) ->
|
||||
?LOG_INFO("Timeout for pid ~p", [Pid]),
|
||||
{noreply, ping_timeout(Pid, S)};
|
||||
handle_info({'DOWN', _, process, Pid, _}, S) ->
|
||||
{noreply, delete_watch(Pid, S)};
|
||||
handle_info(Msg, S) ->
|
||||
?LOG_DEBUG("Unknown msg: ~p", [Msg]),
|
||||
{noreply, S}.
|
||||
|
||||
terminate(_, _) ->
|
||||
ok.
|
||||
|
||||
code_change(_FromVsn, S, _Extra) ->
|
||||
{ok, S}.
|
||||
|
||||
add_watch(Pid, Interval, N, #st{services = Svcs} = S) ->
|
||||
MRef = erlang:monitor(process, Pid),
|
||||
Svc0 = #svc{ interval = Interval
|
||||
, mref = MRef
|
||||
, n = N
|
||||
, n0 = N},
|
||||
Svc = start_timer(Pid, Svc0),
|
||||
S#st{services = Svcs#{Pid => Svc}}.
|
||||
|
||||
reset_watch(Pid, #st{services = Svcs} = S) ->
|
||||
case maps:find(Pid, Svcs) of
|
||||
{ok, #svc{ n0 = N0 } = Svc} ->
|
||||
Svc1 = restart_timer(Pid, Svc#svc{n = N0}),
|
||||
S#st{services = Svcs#{Pid := Svc1}};
|
||||
error ->
|
||||
S
|
||||
end.
|
||||
|
||||
delete_watch(Pid, #st{services = Svcs} = S) ->
|
||||
case maps:find(Pid, Svcs) of
|
||||
{ok, #svc{tref = TRef, mref = MRef}} ->
|
||||
erlang:cancel_timer(TRef),
|
||||
erlang:demonitor(MRef),
|
||||
S#st{services = maps:remove(Pid, Svcs)};
|
||||
error ->
|
||||
S
|
||||
end.
|
||||
|
||||
ping_timeout(Pid, #st{services = Svcs} = S) ->
|
||||
case maps:find(Pid, Svcs) of
|
||||
{ok, #svc{ n = N } = Svc} ->
|
||||
N1 = N-1,
|
||||
if N1 =< 0 ->
|
||||
?LOG_ERROR("Will exit Pid ~p", [Pid]),
|
||||
exit(Pid, kill),
|
||||
S#st{services = maps:remove(Pid, Svcs)};
|
||||
true ->
|
||||
Svc1 = restart_timer(Pid, Svc#svc{n = N1}),
|
||||
S#st{services = Svcs#{Pid := Svc1}}
|
||||
end;
|
||||
error ->
|
||||
S
|
||||
end.
|
||||
|
||||
start_timer(Pid, #svc{interval = T} = Svc) ->
|
||||
TRef = erlang:start_timer(T, self(), Pid),
|
||||
Svc#svc{tref = TRef}.
|
||||
|
||||
restart_timer(Pid, #svc{tref = TRef} = Svc) ->
|
||||
erlang:cancel_timer(TRef),
|
||||
start_timer(Pid, Svc#svc{tref = undefined}).
|
||||
|
||||
update_pt(Id, Info) ->
|
||||
Pt = get_pt(),
|
||||
put_pt(Pt#{Id => Info}).
|
||||
|
||||
get_pt() ->
|
||||
persistent_term:get(pt_key(), #{}).
|
||||
|
||||
put_pt(Pt) ->
|
||||
persistent_term:put(pt_key(), Pt).
|
||||
|
||||
pt_key() ->
|
||||
{?MODULE, restart_info}.
|
||||
Reference in New Issue
Block a user