You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by ja...@apache.org on 2023/06/01 16:19:19 UTC
[couchdb] branch main updated: Fix prometheus to survive mem3_sync termination
This is an automated email from the ASF dual-hosted git repository.
jaydoane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git
The following commit(s) were added to refs/heads/main by this push:
new e996fa07b Fix prometheus to survive mem3_sync termination
e996fa07b is described below
commit e996fa07b4878c6ab9a542e22e27210991abcb54
Author: Jay Doane <ja...@apache.org>
AuthorDate: Wed May 31 11:01:31 2023 -0700
Fix prometheus to survive mem3_sync termination
Currently, if `mem3_sync` is terminated, `couch_prometheus_server` will
crash the BEAM when it tries to get internal replication jobs:
```
[error] 2023-05-31T15:52:13.989437Z node1@127.0.0.1 <0.1065.0> -------- gen_server couch_prometheus_server terminated with reason: no such process or port in call to gen_server:call(mem3_sync, get_backlog) at gen_server:call/2(line:370) <= couch_prometheus_server:get_internal_replication_jobs_stat/0(line:131) <= couch_prometheus_server:get_system_stats/0(line:118) <= couch_prometheus_server:refresh_metrics/0(line:90) <= couch_prometheus_server:handle_info/2(line:76) <= gen_server:try_ [...]
```
and eventually
```
[os_mon] cpu supervisor port (cpu_sup): Erlang has closed
{"Kernel pid terminated",application_controller,"{application_terminated,couch_prometheus,shutdown}"}
```
This adds a try/catch around the `mem3_sync:get_backlog()` call that logs a
warning and reports 0 when the call fails, to prevent the crash.
---
src/couch_prometheus/src/couch_prometheus_server.erl | 14 +++++++++++++-
.../test/eunit/couch_prometheus_e2e_tests.erl | 16 +++++++++++++++-
2 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index d40efc702..1649898c7 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -36,6 +36,12 @@
terminate/2
]).
+-ifdef(TEST).
+-export([
+ get_internal_replication_jobs_stat/0
+]).
+-endif.
+
-include("couch_prometheus.hrl").
start_link() ->
@@ -128,7 +134,13 @@ get_internal_replication_jobs_stat() ->
internal_replication_jobs,
gauge,
"count of internal replication changes to process",
- mem3_sync:get_backlog()
+ try
+ mem3_sync:get_backlog()
+ catch
+ _:_ ->
+ couch_log:warning("~p mem3_sync down", [?MODULE]),
+ 0
+ end
).
get_membership_stat() ->
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
index 2a1016099..d24a01b20 100644
--- a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
+++ b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
@@ -41,7 +41,8 @@ e2e_test_() ->
?TDEF_FE(t_prometheus_port),
?TDEF_FE(t_metric_updated),
?TDEF_FE(t_no_duplicate_metrics),
- ?TDEF_FE(t_starts_with_couchdb)
+ ?TDEF_FE(t_starts_with_couchdb),
+ ?TDEF_FE(t_survives_mem3_sync_termination)
]
}
}
@@ -173,6 +174,19 @@ t_starts_with_couchdb(Port) ->
Lines
).
+t_survives_mem3_sync_termination(_) ->
+ ServerPid = whereis(couch_prometheus_server),
+ ?assertNotEqual(undefined, ServerPid),
+ ?assertNotEqual(undefined, whereis(mem3_sync)),
+ ok = supervisor:terminate_child(mem3_sup, mem3_sync),
+ ?assertEqual(undefined, whereis(mem3_sync)),
+ ?assertMatch(
+ [[_, _], <<"couchdb_internal_replication_jobs 0">>],
+ couch_prometheus_server:get_internal_replication_jobs_stat()
+ ),
+ {ok, _} = supervisor:restart_child(mem3_sup, mem3_sync),
+ ?assertEqual(ServerPid, whereis(couch_prometheus_server)).
+
node_local_url(Port) ->
Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
lists:concat(["http://", Addr, ":", Port, "/_node/_local/_prometheus"]).