You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by ja...@apache.org on 2023/06/01 16:19:19 UTC

[couchdb] branch main updated: Fix prometheus to survive mem3_sync termination

This is an automated email from the ASF dual-hosted git repository.

jaydoane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git


The following commit(s) were added to refs/heads/main by this push:
     new e996fa07b Fix prometheus to survive mem3_sync termination
e996fa07b is described below

commit e996fa07b4878c6ab9a542e22e27210991abcb54
Author: Jay Doane <ja...@apache.org>
AuthorDate: Wed May 31 11:01:31 2023 -0700

    Fix prometheus to survive mem3_sync termination
    
    Currently, if `mem3_sync` is terminated, `couch_prometheus_server`
    will crash, and ultimately take down the BEAM, when it tries to get
    the internal replication jobs backlog:
    ```
    [error] 2023-05-31T15:52:13.989437Z node1@127.0.0.1 <0.1065.0> -------- gen_server couch_prometheus_server terminated with reason: no such process or port in call to gen_server:call(mem3_sync, get_backlog) at gen_server:call/2(line:370) <= couch_prometheus_server:get_internal_replication_jobs_stat/0(line:131) <= couch_prometheus_server:get_system_stats/0(line:118) <= couch_prometheus_server:refresh_metrics/0(line:90) <= couch_prometheus_server:handle_info/2(line:76) <= gen_server:try_ [...]
    ```
    and eventually the `couch_prometheus` application terminates, which
    shuts down the VM:
    ```
    [os_mon] cpu supervisor port (cpu_sup): Erlang has closed
    {"Kernel pid terminated",application_controller,"{application_terminated,couch_prometheus,shutdown}"}
    ```
    
    This wraps the call to `mem3_sync:get_backlog()` in a try/catch that
    logs a warning and reports a backlog of 0, preventing the crash.
---
 src/couch_prometheus/src/couch_prometheus_server.erl     | 14 +++++++++++++-
 .../test/eunit/couch_prometheus_e2e_tests.erl            | 16 +++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index d40efc702..1649898c7 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -36,6 +36,12 @@
     terminate/2
 ]).
 
+-ifdef(TEST).
+-export([
+    get_internal_replication_jobs_stat/0
+]).
+-endif.
+
 -include("couch_prometheus.hrl").
 
 start_link() ->
@@ -128,7 +134,13 @@ get_internal_replication_jobs_stat() ->
         internal_replication_jobs,
         gauge,
         "count of internal replication changes to process",
-        mem3_sync:get_backlog()
+        try
+            mem3_sync:get_backlog()
+        catch
+            _:_ ->
+                couch_log:warning("~p mem3_sync down", [?MODULE]),
+                0
+        end
     ).
 
 get_membership_stat() ->
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
index 2a1016099..d24a01b20 100644
--- a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
+++ b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
@@ -41,7 +41,8 @@ e2e_test_() ->
                     ?TDEF_FE(t_prometheus_port),
                     ?TDEF_FE(t_metric_updated),
                     ?TDEF_FE(t_no_duplicate_metrics),
-                    ?TDEF_FE(t_starts_with_couchdb)
+                    ?TDEF_FE(t_starts_with_couchdb),
+                    ?TDEF_FE(t_survives_mem3_sync_termination)
                 ]
             }
         }
@@ -173,6 +174,19 @@ t_starts_with_couchdb(Port) ->
         Lines
     ).
 
+t_survives_mem3_sync_termination(_) ->
+    ServerPid = whereis(couch_prometheus_server),
+    ?assertNotEqual(undefined, ServerPid),
+    ?assertNotEqual(undefined, whereis(mem3_sync)),
+    ok = supervisor:terminate_child(mem3_sup, mem3_sync),
+    ?assertEqual(undefined, whereis(mem3_sync)),
+    ?assertMatch(
+        [[_, _], <<"couchdb_internal_replication_jobs 0">>],
+        couch_prometheus_server:get_internal_replication_jobs_stat()
+    ),
+    {ok, _} = supervisor:restart_child(mem3_sup, mem3_sync),
+    ?assertEqual(ServerPid, whereis(couch_prometheus_server)).
+
 node_local_url(Port) ->
     Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
     lists:concat(["http://", Addr, ":", Port, "/_node/_local/_prometheus"]).