You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by ja...@apache.org on 2023/05/31 18:01:50 UTC

[couchdb] 01/01: Fix prometheus to survive mem3_sync termination

This is an automated email from the ASF dual-hosted git repository.

jaydoane pushed a commit to branch prometheus-mem3-sync-crash
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 6ecb2b1abf3c0e4dfaa4964b451a86bdc8cf4795
Author: Jay Doane <ja...@apache.org>
AuthorDate: Wed May 31 11:01:31 2023 -0700

    Fix prometheus to survive mem3_sync termination
    
    Currently, if `mem3_sync` is terminated, `couch_prometheus_server` will
    crash, and eventually take down the BEAM, when it tries to get the
    internal replication jobs backlog:
    ```
    [error] 2023-05-31T15:52:13.989437Z node1@127.0.0.1 <0.1065.0> -------- gen_server couch_prometheus_server terminated with reason: no such process or port in call to gen_server:call(mem3_sync, get_backlog) at gen_server:call/2(line:370) <= couch_prometheus_server:get_internal_replication_jobs_stat/0(line:131) <= couch_prometheus_server:get_system_stats/0(line:118) <= couch_prometheus_server:refresh_metrics/0(line:90) <= couch_prometheus_server:handle_info/2(line:76) <= gen_server:try_ [...]
    ```
    and eventually
    ```
    [os_mon] cpu supervisor port (cpu_sup): Erlang has closed
    {"Kernel pid terminated",application_controller,"{application_terminated,couch_prometheus,shutdown}"}
    ```
    
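    This wraps the `mem3_sync:get_backlog()` call in a try/catch that logs
    a warning and reports a backlog of 0 when `mem3_sync` is down, which
    prevents the crash.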
---
 src/couch_prometheus/src/couch_prometheus_server.erl     | 10 +++++++++-
 .../test/eunit/couch_prometheus_e2e_tests.erl            | 16 +++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index d40efc702..7182fe3a0 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -22,6 +22,7 @@
 ]).
 
 -export([
+    get_internal_replication_jobs_stat/0, % for testing
     scrape/0,
     version/0
 ]).
@@ -128,7 +129,14 @@ get_internal_replication_jobs_stat() ->
         internal_replication_jobs,
         gauge,
         "count of internal replication changes to process",
-        mem3_sync:get_backlog()
+        try mem3_sync:get_backlog() of
+            Backlog ->
+                Backlog
+        catch
+            _:_ ->
+                couch_log:warning("~p mem3_sync down", [?MODULE]),
+                0
+        end
     ).
 
 get_membership_stat() ->
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
index 2a1016099..d2c50fc5d 100644
--- a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
+++ b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
@@ -41,7 +41,8 @@ e2e_test_() ->
                     ?TDEF_FE(t_prometheus_port),
                     ?TDEF_FE(t_metric_updated),
                     ?TDEF_FE(t_no_duplicate_metrics),
-                    ?TDEF_FE(t_starts_with_couchdb)
+                    ?TDEF_FE(t_starts_with_couchdb),
+                    ?TDEF_FE(t_survives_mem3_sync_termination)
                 ]
             }
         }
@@ -173,6 +174,19 @@ t_starts_with_couchdb(Port) ->
         Lines
     ).
 
+t_survives_mem3_sync_termination(_) ->
+    ServerPid = whereis(couch_prometheus_server),
+    ?assertNotEqual(undefined, ServerPid),
+    ?assertNotEqual(undefined, whereis(mem3_sync)),
+    ok = supervisor:terminate_child(mem3_sup, mem3_sync),
+    ?assertEqual(undefined, whereis(mem3_sync)),
+    ?assertMatch(
+         [[_, _], <<"couchdb_internal_replication_jobs 0">>],
+         couch_prometheus_server:get_internal_replication_jobs_stat()
+    ),
+    {ok, _} = supervisor:restart_child(mem3_sup, mem3_sync),
+    ?assertEqual(ServerPid, whereis(couch_prometheus_server)).    
+    
 node_local_url(Port) ->
     Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
     lists:concat(["http://", Addr, ":", Port, "/_node/_local/_prometheus"]).