You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by ja...@apache.org on 2023/05/31 18:01:49 UTC

[couchdb] branch prometheus-mem3-sync-crash created (now 6ecb2b1ab)

This is an automated email from the ASF dual-hosted git repository.

jaydoane pushed a change to branch prometheus-mem3-sync-crash
in repository https://gitbox.apache.org/repos/asf/couchdb.git


      at 6ecb2b1ab Fix prometheus to survive mem3_sync termination

This branch includes the following new commits:

     new 6ecb2b1ab Fix prometheus to survive mem3_sync termination

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[couchdb] 01/01: Fix prometheus to survive mem3_sync termination

Posted by ja...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jaydoane pushed a commit to branch prometheus-mem3-sync-crash
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 6ecb2b1abf3c0e4dfaa4964b451a86bdc8cf4795
Author: Jay Doane <ja...@apache.org>
AuthorDate: Wed May 31 11:01:31 2023 -0700

    Fix prometheus to survive mem3_sync termination
    
    Currently, if `mem3_sync` is terminated, `couch_prometheus_server`
    will crash, and ultimately bring down the BEAM, when it tries to
    get internal replication jobs:
    ```
    [error] 2023-05-31T15:52:13.989437Z node1@127.0.0.1 <0.1065.0> -------- gen_server couch_prometheus_server terminated with reason: no such process or port in call to gen_server:call(mem3_sync, get_backlog) at gen_server:call/2(line:370) <= couch_prometheus_server:get_internal_replication_jobs_stat/0(line:131) <= couch_prometheus_server:get_system_stats/0(line:118) <= couch_prometheus_server:refresh_metrics/0(line:90) <= couch_prometheus_server:handle_info/2(line:76) <= gen_server:try_ [...]
    ```
    and eventually
    ```
    [os_mon] cpu supervisor port (cpu_sup): Erlang has closed
    {"Kernel pid terminated",application_controller,"{application_terminated,couch_prometheus,shutdown}"}
    ```
    
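    This adds a try/catch around the `mem3_sync:get_backlog()` call that
    logs a warning and reports a backlog of 0 when the call fails, which
    prevents the crash.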
---
 src/couch_prometheus/src/couch_prometheus_server.erl     | 10 +++++++++-
 .../test/eunit/couch_prometheus_e2e_tests.erl            | 16 +++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index d40efc702..7182fe3a0 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -22,6 +22,7 @@
 ]).
 
 -export([
+    get_internal_replication_jobs_stat/0, % for testing
     scrape/0,
     version/0
 ]).
@@ -128,7 +129,14 @@ get_internal_replication_jobs_stat() ->
         internal_replication_jobs,
         gauge,
         "count of internal replication changes to process",
-        mem3_sync:get_backlog()
+        try mem3_sync:get_backlog() of
+            Backlog ->
+                Backlog
+        catch
+            _:_ ->
+                couch_log:warning("~p mem3_sync down", [?MODULE]),
+                0
+        end
     ).
 
 get_membership_stat() ->
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
index 2a1016099..d2c50fc5d 100644
--- a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
+++ b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
@@ -41,7 +41,8 @@ e2e_test_() ->
                     ?TDEF_FE(t_prometheus_port),
                     ?TDEF_FE(t_metric_updated),
                     ?TDEF_FE(t_no_duplicate_metrics),
-                    ?TDEF_FE(t_starts_with_couchdb)
+                    ?TDEF_FE(t_starts_with_couchdb),
+                    ?TDEF_FE(t_survives_mem3_sync_termination)
                 ]
             }
         }
@@ -173,6 +174,19 @@ t_starts_with_couchdb(Port) ->
         Lines
     ).
 
+t_survives_mem3_sync_termination(_) ->
+    ServerPid = whereis(couch_prometheus_server),
+    ?assertNotEqual(undefined, ServerPid),
+    ?assertNotEqual(undefined, whereis(mem3_sync)),
+    ok = supervisor:terminate_child(mem3_sup, mem3_sync),
+    ?assertEqual(undefined, whereis(mem3_sync)),
+    ?assertMatch(
+         [[_, _], <<"couchdb_internal_replication_jobs 0">>],
+         couch_prometheus_server:get_internal_replication_jobs_stat()
+    ),
+    {ok, _} = supervisor:restart_child(mem3_sup, mem3_sync),
+    ?assertEqual(ServerPid, whereis(couch_prometheus_server)).    
+    
 node_local_url(Port) ->
     Addr = config:get("chttpd", "bind_address", "127.0.0.1"),
     lists:concat(["http://", Addr, ":", Port, "/_node/_local/_prometheus"]).