You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by wi...@apache.org on 2023/04/13 05:46:50 UTC
[couchdb] 02/03: feat (prometheus): couch_db_updater and couch_file queue stats
This is an automated email from the ASF dual-hosted git repository.
willholley pushed a commit to branch prometheus_erlang_dist
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 7d4f0a6702e2540c8d2a4a3e2d88331d3af6edae
Author: Will Holley <wi...@uk.ibm.com>
AuthorDate: Wed Apr 12 15:08:45 2023 +0000
feat (prometheus): couch_db_updater and couch_file queue stats
# What
Adds summary metrics for couch_db_updater and couch_file, the same as
returned by the `_system` endpoint.
Unlike the other message queue stats, these are returned as a Prometheus
summary type across the following metrics, using `couch_db_updater` as
an example:
* couchdb_erlang_message_queue_couch_db_updater{quantile="0.5"}
* couchdb_erlang_message_queue_couch_db_updater{quantile="0.9"}
* couchdb_erlang_message_queue_couch_db_updater{quantile="0.99"}
* couchdb_erlang_message_queue_couch_db_updater_sum
* couchdb_erlang_message_queue_couch_db_updater_count
The count metric represents the number of processes and the sum is the
total size of all message queues for those processes.
In addition, min and max message queue sizes are returned, matching
the `_system` endpoint response:
* couchdb_erlang_message_queue_couch_db_updater_min
* couchdb_erlang_message_queue_couch_db_updater_max
# How
This represents a new type of metric in the prometheus endpoint - the
existing `summary` types have all been for latency histograms - so
a new utility function `pid_to_prom_summary` is added to format the
message queue stats into prometheus metrics series.
In `chttpd_node` I've extracted the formatting step from the `db_pid_stats`
function to allow for re-use between `chttpd_node` and
`couch_prometheus_server`, where the result is formatted differently.
`chttpd_node` doesn't seem like the best place for shared code like
this, but there is no obvious alternative module to extract it to,
so I've left it there for now.
---
src/chttpd/src/chttpd_node.erl | 11 +++++---
.../src/couch_prometheus_server.erl | 31 ++++++++++++++++++++++
src/couch_prometheus/src/couch_prometheus_util.erl | 1 +
3 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/src/chttpd/src/chttpd_node.erl b/src/chttpd/src/chttpd_node.erl
index bb3cf4798..ef586e174 100644
--- a/src/chttpd/src/chttpd_node.erl
+++ b/src/chttpd/src/chttpd_node.erl
@@ -287,7 +287,7 @@ get_stats() ->
{NumberOfGCs, WordsReclaimed, _} = statistics(garbage_collection),
{{input, Input}, {output, Output}} = statistics(io),
- {CF, CDU} = db_pid_stats(),
+ {CF, CDU} = db_pid_stats_formatted(),
MessageQueuesHist = [
{couch_file, {CF}},
{couch_db_updater, {CDU}}
@@ -315,6 +315,10 @@ get_stats() ->
{distribution, {get_distribution_stats()}}
].
+%% Collects the raw per-process stats from db_pid_stats/0 and passes each
+%% half (couch_file first, couch_db_updater second) through
+%% format_pid_stats/1, returning the two formatted results as a pair.
+db_pid_stats_formatted() ->
+    {CouchFiles, CouchDbUpdaters} = db_pid_stats(),
+    FormattedFiles = format_pid_stats(CouchFiles),
+    FormattedUpdaters = format_pid_stats(CouchDbUpdaters),
+    {FormattedFiles, FormattedUpdaters}.
+
db_pid_stats() ->
{monitors, M} = process_info(whereis(couch_stats_process_tracker), monitors),
Candidates = [Pid || {process, Pid} <- M],
@@ -323,7 +327,7 @@ db_pid_stats() ->
{CouchFiles, CouchDbUpdaters}.
db_pid_stats(Mod, Candidates) ->
- Mailboxes = lists:foldl(
+ lists:foldl(
fun(Pid, Acc) ->
case process_info(Pid, [message_queue_len, dictionary]) of
undefined ->
@@ -343,8 +347,7 @@ db_pid_stats(Mod, Candidates) ->
end,
[],
Candidates
- ),
- format_pid_stats(Mailboxes).
+ ).
format_pid_stats([]) ->
[];
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index 847ad947d..884d792f0 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -17,6 +17,7 @@
-import(couch_prometheus_util, [
couch_to_prom/3,
to_prom/4,
+ to_prom/2,
to_prom_summary/2
]).
@@ -110,6 +111,7 @@ get_system_stats() ->
get_uptime_stat(),
get_io_stats(),
get_message_queue_stats(),
+ get_db_pid_stats(),
get_run_queue_stats(),
get_vm_stats(),
get_ets_stats(),
@@ -220,6 +222,35 @@ get_message_queue_stats() ->
to_prom(erlang_message_queue_size, gauge, "size of message queue", QueueLenByLabel)
].
+%% Builds the Prometheus summary series for the message queues of the
+%% couch_file and couch_db_updater processes reported by
+%% chttpd_node:db_pid_stats/0. Returns a two-element list: the couch_file
+%% summary followed by the couch_db_updater summary.
+get_db_pid_stats() ->
+    {CouchFileQueues, DbUpdaterQueues} = chttpd_node:db_pid_stats(),
+    CouchFileSummary = pid_to_prom_summary(
+        "erlang_message_queue_couch_file",
+        "size of message queue across couch_file processes",
+        CouchFileQueues
+    ),
+    DbUpdaterSummary = pid_to_prom_summary(
+        "erlang_message_queue_couch_db_updater",
+        "size of message queue across couch_db_updater processes",
+        DbUpdaterQueues
+    ),
+    [CouchFileSummary, DbUpdaterSummary].
+
+%% Formats a list of per-process message queue sizes as a Prometheus
+%% summary: 0.5/0.9/0.99 quantile samples followed by the _sum, _count,
+%% _min and _max series.
+%%
+%% Metric    - metric name as a string; suffixes are appended to it.
+%% Desc      - description passed through to to_prom/4.
+%% Mailboxes - list of queue sizes (presumably non-negative integers taken
+%%             from message_queue_len; confirm against
+%%             chttpd_node:db_pid_stats/0).
+%%
+%% Returns [] when Mailboxes is empty: with no processes there is no
+%% element to pick for the quantiles or min/max, and lists:nth(0, _) /
+%% hd([]) would otherwise crash the whole metrics scrape.
+pid_to_prom_summary(_Metric, _Desc, []) ->
+    [];
+pid_to_prom_summary(Metric, Desc, Mailboxes) ->
+    Sorted = lists:sort(Mailboxes),
+    Count = length(Sorted),
+    %% For Count >= 1, round(Count * Q) is always within 1..Count for the
+    %% quantiles used here, so lists:nth/2 is safe.
+    Quantiles = [
+        {[{quantile, <<"0.5">>}], lists:nth(round(Count * 0.5), Sorted)},
+        {[{quantile, <<"0.9">>}], lists:nth(round(Count * 0.9), Sorted)},
+        {[{quantile, <<"0.99">>}], lists:nth(round(Count * 0.99), Sorted)}
+    ],
+    SumStat = to_prom(Metric ++ ["_sum"], lists:sum(Sorted)),
+    CountStat = to_prom(Metric ++ ["_count"], Count),
+    %% Sorted is ascending, so the head is the minimum and the last
+    %% element is the maximum.
+    MinStat = to_prom(Metric ++ ["_min"], hd(Sorted)),
+    MaxStat = to_prom(Metric ++ ["_max"], lists:last(Sorted)),
+    to_prom(Metric, summary, Desc, Quantiles) ++ [SumStat, CountStat, MinStat, MaxStat].
+
get_run_queue_stats() ->
%% Workaround for https://bugs.erlang.org/browse/ERL-1355
{SQ, DCQ} = chttpd_node:run_queues(),
diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl
index 5775b9693..4665ba7f9 100644
--- a/src/couch_prometheus/src/couch_prometheus_util.erl
+++ b/src/couch_prometheus/src/couch_prometheus_util.erl
@@ -16,6 +16,7 @@
couch_to_prom/3,
to_bin/1,
to_prom/4,
+ to_prom/2,
to_prom_summary/2
]).