Fix restart logic and add watchdog

This commit is contained in:
Ulf Wiger
2025-10-23 22:22:59 +02:00
parent 847ffc810a
commit 1f6066705c
8 changed files with 180 additions and 10 deletions
+146
View File
@@ -0,0 +1,146 @@
%% Watchdog server.
%%
%% Processes register themselves with watch/2 and are then expected to call
%% ping/0 regularly. Each registration carries a timer interval and a budget
%% of tolerated timeouts; every ping restores the budget, every timer expiry
%% consumes one unit of it, and a process whose budget is used up is killed.
%%
%% The server also keeps a map of restart information in persistent_term,
%% written through note_started/2 and read through get_restart_info/0.
-module(gmhc_watchdog).
-behavior(gen_server).
%% Watchdog API, called from the process that wants to be watched.
-export([ watch/2
, unwatch/0
, ping/0 ]).
%% Restart bookkeeping API.
-export([ note_started/2
, get_restart_info/0 ]).
%% Server start function and gen_server callbacks.
-export([ start_link/0
, init/1
, handle_call/3
, handle_cast/2
, handle_info/2
, terminate/2
, code_change/3 ]).
%% Server state: `services' maps each watched pid to its #svc{} entry.
-record(st, {services = #{}}).

%% Per-process watch entry.
%%   n        - timeouts still tolerated before the process is killed
%%   n0       - the budget `n' is reset to whenever the process pings
%%   interval - timer length in milliseconds
%%   tref     - reference of the currently running timer; `undefined' while
%%              no timer is armed (before the first start and during a
%%              restart), hence the widened type
%%   mref     - monitor reference for the watched process
-record(svc, { n = 5           :: pos_integer()
             , n0 = 5          :: pos_integer()
             , interval = 5000 :: pos_integer()
             , tref            :: reference() | undefined
             , mref            :: reference()
             }).
-include_lib("kernel/include/logger.hrl").
%% Registers the calling process with the watchdog.
%%
%% `Interval' is the timer length in milliseconds and `N' the number of
%% timer expiries tolerated without a ping/0 before the caller is killed.
%% Synchronous: blocks until the server has replied.
watch(Interval, N) ->
    Caller = self(),
    Request = {watch, Caller, Interval, N},
    gen_server:call(?MODULE, Request).
%% Asks the watchdog to stop watching the calling process.
%%
%% Asynchronous: the request is cast to the registered server and `ok' is
%% returned immediately. The server reference must be passed explicitly;
%% gen_server has no cast/1, so omitting it fails with `undef'.
unwatch() ->
    gen_server:cast(?MODULE, {unwatch, self()}).
%% Heartbeat from the calling process.
%%
%% Casts a `{ping, Pid}' message to the watchdog, which restores the
%% caller's timeout budget and restarts its timer. Returns `ok' without
%% waiting for the server.
ping() ->
    Heartbeat = {ping, self()},
    gen_server:cast(?MODULE, Heartbeat).
%% Records `Info' under `Id' in the restart-info map kept by the watchdog.
%%
%% Synchronous; the write is performed inside the server process, so
%% concurrent callers are serialized.
note_started(Id, Info) ->
    Request = {note_started, Id, Info},
    gen_server:call(?MODULE, Request).
%% Fetches the restart-info map from the watchdog server.
%%
%% Synchronous call; the reply is whatever the server currently has stored
%% (an empty map if nothing has been recorded).
get_restart_info() ->
    Request = get_restart_info,
    gen_server:call(?MODULE, Request).
%% Starts the watchdog, linked to the caller and registered locally under
%% the module name. init/1 receives an empty argument list.
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).
%% gen_server callback: starts with an empty set of watched services.
init([]) ->
    InitialState = #st{},
    {ok, InitialState}.
%% gen_server callback for synchronous requests.
%%
%%   {note_started, Id, Info}   - store Info under Id in the restart info
%%   get_restart_info           - reply with the stored restart-info map
%%   {watch, Pid, Interval, N}  - start watching Pid
%%
%% Anything else is answered with {error, unknown_method}.
handle_call({note_started, Id, Info}, _From, State) ->
    update_pt(Id, Info),
    {reply, ok, State};
handle_call(get_restart_info, _From, State) ->
    RestartInfo = get_pt(),
    {reply, RestartInfo, State};
handle_call({watch, Pid, Interval, N}, _From, State) ->
    Watching = add_watch(Pid, Interval, N, State),
    {reply, ok, Watching};
handle_call(_Unknown, _From, State) ->
    Reply = {error, unknown_method},
    {reply, Reply, State}.
%% gen_server callback for asynchronous requests.
%%
%%   {ping, Pid}    - heartbeat: restore Pid's budget and restart its timer
%%   {unwatch, Pid} - stop watching Pid
%%
%% Unrecognized casts are logged at debug level and otherwise ignored.
handle_cast({ping, Pid}, State) ->
    Refreshed = reset_watch(Pid, State),
    {noreply, Refreshed};
handle_cast({unwatch, Pid}, State) ->
    Pruned = delete_watch(Pid, State),
    {noreply, Pruned};
handle_cast(Other, State) ->
    ?LOG_DEBUG("Unknown cast: ~p", [Other]),
    {noreply, State}.
%% gen_server callback for raw messages.
%%
%%   {timeout, TRef, Pid}        - the watch timer for Pid expired
%%   {'DOWN', _, process, Pid, _} - a watched process died; forget it
%%
%% A timeout is only acted upon when TRef is the timer currently stored for
%% Pid. erlang:cancel_timer/1 cannot retract a timeout message that has
%% already been delivered to the mailbox, so a timer that was cancelled or
%% re-armed (by a ping, an unwatch or a new watch) can still show up here;
%% counting such a stale expiry would wrongly consume the process's budget.
%% Other messages are logged at debug level and ignored.
handle_info({timeout, TRef, Pid}, #st{services = Svcs} = S) ->
    case maps:find(Pid, Svcs) of
        {ok, #svc{tref = TRef}} ->
            ?LOG_INFO("Timeout for pid ~p", [Pid]),
            {noreply, ping_timeout(Pid, S)};
        _ ->
            %% Either the pid is no longer watched or the timer has been
            %% superseded by a newer one.
            ?LOG_DEBUG("Ignoring stale timeout for pid ~p", [Pid]),
            {noreply, S}
    end;
handle_info({'DOWN', _, process, Pid, _}, S) ->
    {noreply, delete_watch(Pid, S)};
handle_info(Msg, S) ->
    ?LOG_DEBUG("Unknown msg: ~p", [Msg]),
    {noreply, S}.
%% gen_server callback: nothing to clean up on shutdown.
terminate(_Reason, _State) ->
    ok.
%% gen_server callback: the state is carried over unchanged on code upgrade.
code_change(_OldVsn, State, _Extra) ->
    Unchanged = State,
    {ok, Unchanged}.
%% Starts watching Pid: monitors it, arms a timer of Interval milliseconds
%% and stores an entry with a budget of N timeouts.
%%
%% Any existing entry for Pid is torn down first. Without this, a process
%% calling watch/2 a second time would leave its previous timer running and
%% its previous monitor active; the orphaned timer would later fire and eat
%% into the new budget.
add_watch(Pid, Interval, N, #st{} = S0) ->
    #st{services = Svcs} = S = delete_watch(Pid, S0),
    MRef = erlang:monitor(process, Pid),
    Svc0 = #svc{ interval = Interval
               , mref = MRef
               , n = N
               , n0 = N },
    Svc = start_timer(Pid, Svc0),
    S#st{services = Svcs#{Pid => Svc}}.
%% Handles a heartbeat from Pid: the remaining budget is put back to its
%% initial value and the timer is restarted. A ping from a pid that is not
%% being watched leaves the state untouched.
reset_watch(Pid, #st{services = Services} = State) ->
    case maps:find(Pid, Services) of
        error ->
            State;
        {ok, #svc{n0 = Budget} = Entry} ->
            Replenished = Entry#svc{n = Budget},
            Rearmed = restart_timer(Pid, Replenished),
            State#st{services = maps:update(Pid, Rearmed, Services)}
    end.
%% Stops watching Pid: cancels its timer, drops the monitor and removes the
%% entry. Unknown pids are ignored, which also makes it safe to call this
%% for a 'DOWN' message that arrives after the entry is already gone.
delete_watch(Pid, #st{services = Services} = State) ->
    case maps:find(Pid, Services) of
        error ->
            State;
        {ok, #svc{tref = Timer, mref = Monitor}} ->
            erlang:cancel_timer(Timer),
            erlang:demonitor(Monitor),
            Remaining = maps:remove(Pid, Services),
            State#st{services = Remaining}
    end.
%% Handles a timer expiry for Pid.
%%
%% One unit of the budget is consumed. If nothing is left, the process is
%% killed and its entry removed; otherwise the reduced budget is stored and
%% the timer re-armed. Expiries for unknown pids are ignored.
ping_timeout(Pid, #st{services = Services} = State) ->
    case maps:find(Pid, Services) of
        {ok, #svc{n = Left}} when Left - 1 =< 0 ->
            ?LOG_ERROR("Will exit Pid ~p", [Pid]),
            exit(Pid, kill),
            State#st{services = maps:remove(Pid, Services)};
        {ok, #svc{n = Left} = Entry} ->
            Rearmed = restart_timer(Pid, Entry#svc{n = Left - 1}),
            State#st{services = maps:update(Pid, Rearmed, Services)};
        error ->
            State
    end.
%% Arms a timer for the entry's interval. On expiry the server receives
%% {timeout, TRef, Pid}; the new reference is stored in the entry.
start_timer(Pid, #svc{interval = Millis} = Entry) ->
    Entry#svc{tref = erlang:start_timer(Millis, self(), Pid)}.
%% Cancels the entry's current timer and arms a fresh one for the same
%% interval, returning the entry with the new timer reference.
restart_timer(Pid, #svc{tref = OldTimer} = Entry) ->
    erlang:cancel_timer(OldTimer),
    Disarmed = Entry#svc{tref = undefined},
    start_timer(Pid, Disarmed).
%% Adds or replaces the restart info stored under Id: reads the current
%% map, updates it and writes the whole map back.
update_pt(Id, Info) ->
    Current = get_pt(),
    Updated = maps:put(Id, Info, Current),
    put_pt(Updated).
%% Reads the restart-info map from persistent_term; an empty map is
%% returned when nothing has been stored yet.
get_pt() ->
    Key = pt_key(),
    persistent_term:get(Key, #{}).
%% Stores the restart-info map in persistent_term, replacing any previous
%% value.
put_pt(Pt) ->
    Key = pt_key(),
    persistent_term:put(Key, Pt).
%% The persistent_term key under which the restart-info map is kept,
%% namespaced by this module's name.
pt_key() ->
    Owner = ?MODULE,
    {Owner, restart_info}.