You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by ja...@apache.org on 2023/05/31 18:01:50 UTC
[couchdb] 01/01: Fix prometheus to survive mem3_sync termination
This is an automated email from the ASF dual-hosted git repository.
jaydoane pushed a commit to branch prometheus-mem3-sync-crash
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 6ecb2b1abf3c0e4dfaa4964b451a86bdc8cf4795
Author: Jay Doane <ja...@apache.org>
AuthorDate: Wed May 31 11:01:31 2023 -0700
Fix prometheus to survive mem3_sync termination
Currently, if `mem3_sync` is terminated, `couch_prometheus_server`
crashes whenever it refreshes its metrics and tries to read the internal replication backlog:
```
[error] 2023-05-31T15:52:13.989437Z node1@127.0.0.1 <0.1065.0> -------- gen_server couch_prometheus_server terminated with reason: no such process or port in call to gen_server:call(mem3_sync, get_backlog) at gen_server:call/2(line:370) <= couch_prometheus_server:get_internal_replication_jobs_stat/0(line:131) <= couch_prometheus_server:get_system_stats/0(line:118) <= couch_prometheus_server:refresh_metrics/0(line:90) <= couch_prometheus_server:handle_info/2(line:76) <= gen_server:try_ [...]
```
and eventually the `couch_prometheus` application terminates, taking the whole node down:
```
[os_mon] cpu supervisor port (cpu_sup): Erlang has closed
{"Kernel pid terminated",application_controller,"{application_terminated,couch_prometheus,shutdown}"}
```
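This wraps the `mem3_sync:get_backlog()` call in a try/catch so that, when `mem3_sync` is down, a warning is logged and the metric reports 0 instead of crashing the server.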
---
src/couch_prometheus/src/couch_prometheus_server.erl | 10 +++++++++-
.../test/eunit/couch_prometheus_e2e_tests.erl | 16 +++++++++++++++-
2 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index d40efc702..7182fe3a0 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -22,6 +22,7 @@
]).
-export([
+ get_internal_replication_jobs_stat/0, % for testing
scrape/0,
version/0
]).
@@ -128,7 +129,14 @@ get_internal_replication_jobs_stat() ->
internal_replication_jobs,
gauge,
"count of internal replication changes to process",
- mem3_sync:get_backlog()
+ try mem3_sync:get_backlog() of
+ Backlog ->
+ Backlog
+ catch
+ _:_ ->
+ couch_log:warning("~p mem3_sync down", [?MODULE]),
+ 0
+ end
).
get_membership_stat() ->
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
index 2a1016099..d2c50fc5d 100644
--- a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
+++ b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
@@ -41,7 +41,8 @@ e2e_test_() ->
?TDEF_FE(t_prometheus_port),
?TDEF_FE(t_metric_updated),
?TDEF_FE(t_no_duplicate_metrics),
- ?TDEF_FE(t_starts_with_couchdb)
+ ?TDEF_FE(t_starts_with_couchdb),
+ ?TDEF_FE(t_survives_mem3_sync_termination)
]
}
}
@@ -173,6 +174,19 @@ t_starts_with_couchdb(Port) ->
Lines
).
+t_survives_mem3_sync_termination(_) ->
+ ServerPid = whereis(couch_prometheus_server),
+ ?assertNotEqual(undefined, ServerPid),
+ ?assertNotEqual(undefined, whereis(mem3_sync)),
+ ok = supervisor:terminate_child(mem3_sup, mem3_sync),
+ ?assertEqual(undefined, whereis(mem3_sync)),
+ ?assertMatch(
+ [[_, _], <<"couchdb_internal_replication_jobs 0">>],
+ couch_prometheus_server:get_internal_replication_jobs_stat()
+ ),
+ {ok, _} = supervisor:restart_child(mem3_sup, mem3_sync),
+ ?assertEqual(ServerPid, whereis(couch_prometheus_server)).
+
node_local_url(Port) ->
Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
lists:concat(["http://", Addr, ":", Port, "/_node/_local/_prometheus"]).