You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by wi...@apache.org on 2023/04/13 05:46:50 UTC

[couchdb] 02/03: feat (prometheus): couch_db_updater and couch_file queue stats

This is an automated email from the ASF dual-hosted git repository.

willholley pushed a commit to branch prometheus_erlang_dist
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 7d4f0a6702e2540c8d2a4a3e2d88331d3af6edae
Author: Will Holley <wi...@uk.ibm.com>
AuthorDate: Wed Apr 12 15:08:45 2023 +0000

    feat (prometheus): couch_db_updater and couch_file queue stats
    
    # What
    
    Adds summary metrics for couch_db_updater and couch_file, the same as
    returned by the `_system` endpoint.
    
    Unlike the other message queue stats, these are exposed as a Prometheus
    summary made up of the following series, using `couch_db_updater` as
    an example:
    
     * couchdb_erlang_message_queue_couch_db_updater{quantile="0.5"}
     * couchdb_erlang_message_queue_couch_db_updater{quantile="0.9"}
     * couchdb_erlang_message_queue_couch_db_updater{quantile="0.99"}
     * couchdb_erlang_message_queue_couch_db_updater_sum
     * couchdb_erlang_message_queue_couch_db_updater_count
    
    The count metric represents the number of processes and the sum is the
    total size of all message queues for those processes.
    
    In addition, min and max message queue sizes are returned, matching
    the `_system` endpoint response:
    
     * couchdb_erlang_message_queue_couch_db_updater_min
     * couchdb_erlang_message_queue_couch_db_updater_max
    
    # How
    
    This represents a new type of metric in the prometheus endpoint - the
    existing `summary` types have all been for latency histograms - so
    a new utility function `pid_to_prom_summary` is added to format the
    message queue stats into prometheus metrics series.
    
    In `chttpd_node` I've extracted the formatting step from the `db_pid_stats`
    function to allow for re-use between `chttpd_node` and
    `couch_prometheus_server`, where the result is formatted differently.
    `chttpd_node` doesn't seem like the best place for shared code like
    this, but there is no obvious alternative place to extract it to, so
    I've left it there for now.
---
 src/chttpd/src/chttpd_node.erl                     | 11 +++++---
 .../src/couch_prometheus_server.erl                | 31 ++++++++++++++++++++++
 src/couch_prometheus/src/couch_prometheus_util.erl |  1 +
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/src/chttpd/src/chttpd_node.erl b/src/chttpd/src/chttpd_node.erl
index bb3cf4798..ef586e174 100644
--- a/src/chttpd/src/chttpd_node.erl
+++ b/src/chttpd/src/chttpd_node.erl
@@ -287,7 +287,7 @@ get_stats() ->
     {NumberOfGCs, WordsReclaimed, _} = statistics(garbage_collection),
     {{input, Input}, {output, Output}} = statistics(io),
 
-    {CF, CDU} = db_pid_stats(),
+    {CF, CDU} = db_pid_stats_formatted(),
     MessageQueuesHist = [
         {couch_file, {CF}},
         {couch_db_updater, {CDU}}
@@ -315,6 +315,10 @@ get_stats() ->
         {distribution, {get_distribution_stats()}}
     ].
 
+db_pid_stats_formatted() ->
+    {CF, CDU} = db_pid_stats(),
+    {format_pid_stats(CF), format_pid_stats(CDU)}.
+
 db_pid_stats() ->
     {monitors, M} = process_info(whereis(couch_stats_process_tracker), monitors),
     Candidates = [Pid || {process, Pid} <- M],
@@ -323,7 +327,7 @@ db_pid_stats() ->
     {CouchFiles, CouchDbUpdaters}.
 
 db_pid_stats(Mod, Candidates) ->
-    Mailboxes = lists:foldl(
+    lists:foldl(
         fun(Pid, Acc) ->
             case process_info(Pid, [message_queue_len, dictionary]) of
                 undefined ->
@@ -343,8 +347,7 @@ db_pid_stats(Mod, Candidates) ->
         end,
         [],
         Candidates
-    ),
-    format_pid_stats(Mailboxes).
+    ).
 
 format_pid_stats([]) ->
     [];
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index 847ad947d..884d792f0 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -17,6 +17,7 @@
 -import(couch_prometheus_util, [
     couch_to_prom/3,
     to_prom/4,
+    to_prom/2,
     to_prom_summary/2
 ]).
 
@@ -110,6 +111,7 @@ get_system_stats() ->
         get_uptime_stat(),
         get_io_stats(),
         get_message_queue_stats(),
+        get_db_pid_stats(),
         get_run_queue_stats(),
         get_vm_stats(),
         get_ets_stats(),
@@ -220,6 +222,35 @@ get_message_queue_stats() ->
         to_prom(erlang_message_queue_size, gauge, "size of message queue", QueueLenByLabel)
     ].
 
+get_db_pid_stats() ->
+    {CF, CDU} = chttpd_node:db_pid_stats(),
+    [
+        pid_to_prom_summary(
+            "erlang_message_queue_couch_file",
+            "size of message queue across couch_file processes",
+            CF
+        ),
+        pid_to_prom_summary(
+            "erlang_message_queue_couch_db_updater",
+            "size of message queue across couch_db_updater processes",
+            CDU
+        )
+    ].
+
+pid_to_prom_summary(Metric, Desc, Mailboxes) ->
+    Sorted = lists:sort(Mailboxes),
+    Count = length(Sorted),
+    Quantiles = [
+        {[{quantile, <<"0.5">>}], lists:nth(round(Count * 0.5), Sorted)},
+        {[{quantile, <<"0.9">>}], lists:nth(round(Count * 0.9), Sorted)},
+        {[{quantile, <<"0.99">>}], lists:nth(round(Count * 0.99), Sorted)}
+    ],
+    SumStat = to_prom(Metric ++ ["_sum"], lists:sum(Sorted)),
+    CountStat = to_prom(Metric ++ ["_count"], length(Sorted)),
+    MinStat = to_prom(Metric ++ ["_min"], hd(Sorted)),
+    MaxStat = to_prom(Metric ++ ["_max"], lists:nth(Count, Sorted)),
+    to_prom(Metric, summary, Desc, Quantiles) ++ [SumStat, CountStat, MinStat, MaxStat].
+
 get_run_queue_stats() ->
     %% Workaround for https://bugs.erlang.org/browse/ERL-1355
     {SQ, DCQ} = chttpd_node:run_queues(),
diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl
index 5775b9693..4665ba7f9 100644
--- a/src/couch_prometheus/src/couch_prometheus_util.erl
+++ b/src/couch_prometheus/src/couch_prometheus_util.erl
@@ -16,6 +16,7 @@
     couch_to_prom/3,
     to_bin/1,
     to_prom/4,
+    to_prom/2,
     to_prom_summary/2
 ]).