You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by wi...@apache.org on 2023/03/13 19:11:14 UTC
[couchdb] 01/01: fix: add type and descriptions to prometheus output
This is an automated email from the ASF dual-hosted git repository.
willholley pushed a commit to branch prometheus-help
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 8026e4dad9320e5da5b88795302821f32002a75e
Author: Will Holley <wi...@apache.org>
AuthorDate: Mon Mar 13 16:48:15 2023 +0000
fix: add type and descriptions to prometheus output
The `/_node/_local/_prometheus` endpoint is missing `TYPE` annotations for
some metrics. In addition, it contains no `HELP` annotations, which
are particularly useful where metrics do not strictly match those
returned by the `_stats` or `_system` endpoints.
This PR adds the missing `TYPE` annotations and adds `HELP` annotations
to all metrics.
The specification for the Prometheus text exposition format is at
https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md,
for reference.
## couch_prometheus_util:to_prom/3
`couch_prometheus_util:to_prom/3` is replaced by `couch_prometheus_util:to_prom/4`,
which now expects a description alongside the metric name and type.
## couch_prometheus_util:couch_to_prom/3
`couch_prometheus_util:couch_to_prom/3` now extracts the metric
description from the object returned for each metric by `couch_stats`.
In some cases, where metrics are transformed (e.g. multiple
metrics combined into a single metric with a tag), the description is explicitly
changed to be more generic.
---
.../src/couch_prometheus_server.erl | 83 ++++++++++++++-----
src/couch_prometheus/src/couch_prometheus_util.erl | 93 +++++++++++++++-------
.../test/eunit/couch_prometheus_util_tests.erl | 70 ----------------
3 files changed, 128 insertions(+), 118 deletions(-)
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index 7597c7e28..5e446a914 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -16,7 +16,7 @@
-import(couch_prometheus_util, [
couch_to_prom/3,
- to_prom/3,
+ to_prom/4,
to_prom_summary/2
]).
@@ -116,7 +116,7 @@ get_system_stats() ->
]).
get_uptime_stat() ->
- to_prom(uptime_seconds, counter, couch_app:uptime() div 1000).
+ to_prom(uptime_seconds, counter, "couchdb uptime", couch_app:uptime() div 1000).
get_vm_stats() ->
MemLabels = lists:map(
@@ -131,29 +131,70 @@ get_vm_stats() ->
ProcCount = erlang:system_info(process_count),
ProcLimit = erlang:system_info(process_limit),
[
- to_prom(erlang_memory_bytes, gauge, MemLabels),
- to_prom(erlang_gc_collections_total, counter, NumGCs),
- to_prom(erlang_gc_words_reclaimed_total, counter, WordsReclaimed),
- to_prom(erlang_context_switches_total, counter, CtxSwitches),
- to_prom(erlang_reductions_total, counter, Reds),
- to_prom(erlang_processes, gauge, ProcCount),
- to_prom(erlang_process_limit, gauge, ProcLimit)
+ to_prom(
+ erlang_memory_bytes,
+ gauge,
+ "size of memory dynamically allocated by the Erlang emulator",
+ MemLabels
+ ),
+ to_prom(
+ erlang_gc_collections_total,
+ counter,
+ "number of garbage collections by the Erlang emulator",
+ NumGCs
+ ),
+ to_prom(
+ erlang_gc_words_reclaimed_total,
+ counter,
+ "number of words reclaimed by garbage collections",
+ WordsReclaimed
+ ),
+ to_prom(
+ erlang_context_switches_total, counter, "total number of context switches", CtxSwitches
+ ),
+ to_prom(erlang_reductions_total, counter, "total number of reductions", Reds),
+ to_prom(erlang_processes, gauge, "the number of Erlang processes", ProcCount),
+ to_prom(
+ erlang_process_limit,
+ gauge,
+ "the maximum number of simultaneously existing Erlang processes",
+ ProcLimit
+ )
].
get_io_stats() ->
{{input, In}, {output, Out}} = erlang:statistics(io),
[
- to_prom(erlang_io_recv_bytes_total, counter, In),
- to_prom(erlang_io_sent_bytes_total, counter, Out)
+ to_prom(
+ erlang_io_recv_bytes_total,
+ counter,
+ "the total number of bytes received through ports",
+ In
+ ),
+ to_prom(
+ erlang_io_sent_bytes_total, counter, "the total number of bytes output to ports", Out
+ )
].
get_message_queue_stats() ->
QLenFun = fun(Name) -> message_queue_len(whereis(Name)) end,
Queues = lists:map(QLenFun, registered()),
[
- to_prom(erlang_message_queues, gauge, lists:sum(Queues)),
- to_prom(erlang_message_queue_min, gauge, lists:min(Queues)),
- to_prom(erlang_message_queue_max, gauge, lists:max(Queues))
+ to_prom(
+ erlang_message_queues, gauge, "total size of all message queues", lists:sum(Queues)
+ ),
+ to_prom(
+ erlang_message_queue_min,
+ gauge,
+ "minimum size across all message queues",
+ lists:min(Queues)
+ ),
+ to_prom(
+ erlang_message_queue_max,
+ gauge,
+ "maximum size across all message queues",
+ lists:max(Queues)
+ )
].
message_queue_len(undefined) ->
@@ -177,13 +218,18 @@ get_run_queue_stats() ->
{lists:sum(SQs), DCQ}
end,
[
- to_prom(erlang_scheduler_queues, gauge, Normal),
- to_prom(erlang_dirty_cpu_scheduler_queues, gauge, Dirty)
+ to_prom(erlang_scheduler_queues, gauge, "the total size of all normal run queues", Normal),
+ to_prom(
+ erlang_dirty_cpu_scheduler_queues,
+ gauge,
+ "the total size of all dirty CPU scheduler run queues",
+ Dirty
+ )
].
get_ets_stats() ->
NumTabs = length(ets:all()),
- to_prom(erlang_ets_table, gauge, NumTabs).
+ to_prom(erlang_ets_table, gauge, "number of ETS tables", NumTabs).
drain_refresh_messages() ->
receive
@@ -205,7 +251,8 @@ system_stats_test() ->
lists:foreach(
fun(Line) ->
?assert(is_binary(Line)),
- ?assert((starts_with(<<"couchdb_">>, Line) orelse starts_with(<<"# TYPE ">>, Line)))
+ Trimmed = string:trim(Line),
+ ?assert(starts_with(<<"couchdb_">>, Trimmed) orelse starts_with(<<"# ">>, Trimmed))
end,
get_system_stats()
).
diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl
index c9563687e..3ac0253b9 100644
--- a/src/couch_prometheus/src/couch_prometheus_util.erl
+++ b/src/couch_prometheus/src/couch_prometheus_util.erl
@@ -15,56 +15,78 @@
-export([
couch_to_prom/3,
to_bin/1,
- to_prom/3,
+ to_prom/4,
to_prom_summary/2
]).
-include("couch_prometheus.hrl").
couch_to_prom([couch_log, level, alert], Info, _All) ->
- to_prom(couch_log_requests_total, counter, {[{level, alert}], val(Info)});
+ to_prom(couch_log_requests_total, counter, "number of logged messages", {
+ [{level, alert}], val(Info)
+ });
couch_to_prom([couch_log, level, Level], Info, _All) ->
to_prom(couch_log_requests_total, {[{level, Level}], val(Info)});
couch_to_prom([couch_replicator, checkpoints, failure], Info, _All) ->
- to_prom(couch_replicator_checkpoints_failure_total, counter, val(Info));
+ to_prom(couch_replicator_checkpoints_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, checkpoints, success], Info, All) ->
Total = val(Info) + val([couch_replicator, checkpoints, failure], All),
- to_prom(couch_replicator_checkpoints_total, counter, Total);
+ to_prom(couch_replicator_checkpoints_total, counter, "number of checkpoint saves", Total);
couch_to_prom([couch_replicator, responses, failure], Info, _All) ->
- to_prom(couch_replicator_responses_failure_total, counter, val(Info));
+ to_prom(couch_replicator_responses_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, responses, success], Info, All) ->
Total = val(Info) + val([couch_replicator, responses, failure], All),
- to_prom(couch_replicator_responses_total, counter, Total);
+ to_prom(
+ couch_replicator_responses_total,
+ counter,
+ "number of HTTP responses received by the replicator",
+ Total
+ );
couch_to_prom([couch_replicator, stream_responses, failure], Info, _All) ->
- to_prom(couch_replicator_stream_responses_failure_total, counter, val(Info));
+ to_prom(couch_replicator_stream_responses_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, stream_responses, success], Info, All) ->
Total = val(Info) + val([couch_replicator, stream_responses, failure], All),
- to_prom(couch_replicator_stream_responses_total, counter, Total);
+ to_prom(
+ couch_replicator_stream_responses_total,
+ counter,
+ "number of streaming HTTP responses received by the replicator",
+ Total
+ );
couch_to_prom([couchdb, auth_cache_hits], Info, All) ->
Total = val(Info) + val([couchdb, auth_cache_misses], All),
- to_prom(auth_cache_requests_total, counter, Total);
+ to_prom(auth_cache_requests_total, counter, "number of authentication cache requests", Total);
couch_to_prom([couchdb, auth_cache_misses], Info, _All) ->
- to_prom(auth_cache_misses_total, counter, val(Info));
+ to_prom(auth_cache_misses_total, counter, desc(Info), val(Info));
+% force a # TYPE definition for httpd_request_methods
couch_to_prom([couchdb, httpd_request_methods, 'COPY'], Info, _All) ->
- to_prom(httpd_request_methods, counter, {[{method, 'COPY'}], val(Info)});
+ to_prom(httpd_request_methods, counter, "number of HTTP requests by method", {
+ [{method, 'COPY'}], val(Info)
+ });
couch_to_prom([couchdb, httpd_request_methods, Method], Info, _All) ->
to_prom(httpd_request_methods, {[{method, Method}], val(Info)});
+% force a # TYPE definition for httpd_status_codes
+couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) ->
+ to_prom(httpd_status_codes, counter, "number of HTTP responses by status code", {
+ [{code, 200}], val(Info)
+ });
couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) ->
to_prom(httpd_status_codes, {[{code, Code}], val(Info)});
couch_to_prom([ddoc_cache, hit], Info, All) ->
Total = val(Info) + val([ddoc_cache, miss], All),
- to_prom(ddoc_cache_requests_total, counter, Total);
+ to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total);
couch_to_prom([ddoc_cache, miss], Info, _All) ->
- to_prom(ddoc_cache_requests_failures_total, counter, val(Info));
+ to_prom(ddoc_cache_requests_failures_total, counter, desc(Info), val(Info));
couch_to_prom([ddoc_cache, recovery], Info, _All) ->
- to_prom(ddoc_cache_requests_recovery_total, counter, val(Info));
+ to_prom(ddoc_cache_requests_recovery_total, counter, desc(Info), val(Info));
couch_to_prom([fabric, read_repairs, failure], Info, _All) ->
- to_prom(fabric_read_repairs_failures_total, counter, val(Info));
+ to_prom(fabric_read_repairs_failures_total, counter, desc(Info), val(Info));
couch_to_prom([fabric, read_repairs, success], Info, All) ->
Total = val(Info) + val([fabric, read_repairs, failure], All),
- to_prom(fabric_read_repairs_total, counter, Total);
+ to_prom(fabric_read_repairs_total, counter, "number of fabric read repairs", Total);
couch_to_prom([rexi, streams, timeout, init_stream], Info, _All) ->
- to_prom(rexi_streams_timeout_total, counter, {[{stage, init_stream}], val(Info)});
+ to_prom(rexi_streams_timeout_total, counter, "number of rexi stream timeouts", {
+ [{stage, init_stream}], val(Info)
+ });
couch_to_prom([rexi_streams, timeout, Stage], Info, _All) ->
to_prom(rexi_streams_timeout_total, {[{stage, Stage}], val(Info)});
couch_to_prom([couchdb | Rest], Info, All) ->
@@ -73,15 +95,22 @@ couch_to_prom(Path, Info, _All) ->
case lists:keyfind(type, 1, Info) of
{type, counter} ->
Metric = counter_metric(Path),
- to_prom(Metric, counter, val(Info));
+ to_prom(Metric, counter, desc(Info), val(Info));
{type, gauge} ->
- to_prom(path_to_name(Path), gauge, val(Info));
+ to_prom(path_to_name(Path), gauge, desc(Info), val(Info));
{type, histogram} ->
to_prom_summary(Path, Info)
end.
-to_prom(Metric, Type, Data) ->
- TypeStr = to_bin(io_lib:format("# TYPE ~s ~s", [to_prom_name(Metric), Type])),
+type_def(Metric, Type, Desc) ->
+ Name = to_prom_name(Metric),
+ [
+ to_bin(io_lib:format("\n# HELP ~s ~s\r", [Name, Desc])),
+ to_bin(io_lib:format("# TYPE ~s ~s", [Name, Type]))
+ ].
+
+to_prom(Metric, Type, Desc, Data) ->
+ TypeStr = type_def(Metric, Type, Desc),
[TypeStr] ++ to_prom(Metric, Data).
to_prom(Metric, Instances) when is_list(Instances) ->
@@ -130,7 +159,7 @@ to_prom_summary(Path, Info) ->
SumStat = to_prom(SumMetric, Count * Mean),
CountMetric = path_to_name(Path ++ ["seconds", "count"]),
CountStat = to_prom(CountMetric, Count),
- to_prom(Metric, summary, Quantiles) ++ [SumStat, CountStat].
+ to_prom(Metric, summary, desc(Info), Quantiles) ++ [SumStat, CountStat].
to_prom_name(Metric) ->
to_bin(io_lib:format("couchdb_~s", [Metric])).
@@ -168,7 +197,9 @@ val(Key, Stats) ->
{Key, Data} = lists:keyfind(Key, 1, Stats),
val(Data).
-
+desc(Info) ->
+ {desc, V} = lists:keyfind(desc, 1, Info),
+ V.
-ifdef(TEST).
-include_lib("couch/include/couch_eunit.hrl").
@@ -177,15 +208,17 @@ to_prom_test_() ->
[
?_assertEqual(
<<"couchdb_ddoc_cache 10">>,
- test_to_prom_output(ddoc_cache, counter, 10)
+ test_to_prom_output(ddoc_cache, counter, "size of ddoc cache", 10)
),
?_assertEqual(
<<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
- test_to_prom_output(httpd_status_codes, counter, {[{code, 200}], 3})
+ test_to_prom_output(httpd_status_codes, counter, "HTTP request status by code", {
+ [{code, 200}], 3
+ })
),
?_assertEqual(
<<"couchdb_temperature_celsius 36">>,
- test_to_prom_output(temperature_celsius, gauge, 36)
+ test_to_prom_output(temperature_celsius, gauge, "temp", 36)
),
?_assertEqual(
<<"couchdb_mango_query_time_seconds{quantile=\"0.75\"} 4.5">>,
@@ -224,16 +257,16 @@ counter_metric_test_() ->
[
?_assertEqual(
"document_purges_total",
- counter_metric([document_purges,total])
+ counter_metric([document_purges, total])
),
?_assertEqual(
"document_purges_total",
- counter_metric([document_purges,total])
+ counter_metric([document_purges, total])
)
].
-test_to_prom_output(Metric, Type, Val) ->
- Out = to_prom(Metric, Type, Val),
+test_to_prom_output(Metric, Type, Desc, Val) ->
+ Out = to_prom(Metric, Type, Desc, Val),
lists:nth(2, Out).
test_to_prom_sum_output(Metric, Info) ->
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_util_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_util_tests.erl
deleted file mode 100644
index 547741c5f..000000000
--- a/src/couch_prometheus/test/eunit/couch_prometheus_util_tests.erl
+++ /dev/null
@@ -1,70 +0,0 @@
-% Licensed under the Apache License, Version 2.0 (the "License"); you may not
-% use this file except in compliance with the License. You may obtain a copy of
-% the License at
-%
-% http://www.apache.org/licenses/LICENSE-2.0
-%
-% Unless required by applicable law or agreed to in writing, software
-% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-% License for the specific language governing permissions and limitations under
-% the License.
-
--module(couch_prometheus_util_tests).
-
--include_lib("couch/include/couch_eunit.hrl").
-
-couch_prometheus_util_test_() ->
- [
- ?_assertEqual(
- <<"couchdb_ddoc_cache 10">>,
- test_to_prom_output(ddoc_cache, counter, 10)
- ),
- ?_assertEqual(
- <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
- test_to_prom_output(httpd_status_codes, counter, {[{code, 200}], 3})
- ),
- ?_assertEqual(
- <<"couchdb_temperature_celsius 36">>,
- test_to_prom_output(temperature_celsius, gauge, 36)
- ),
- ?_assertEqual(
- <<"couchdb_mango_query_time_seconds{quantile=\"0.75\"} 4.5">>,
- test_to_prom_sum_output([mango_query_time], [
- {value, [
- {min, 0.0},
- {max, 0.0},
- {arithmetic_mean, 0.0},
- {geometric_mean, 0.0},
- {harmonic_mean, 0.0},
- {median, 0.0},
- {variance, 0.0},
- {standard_deviation, 0.0},
- {skewness, 0.0},
- {kurtosis, 0.0},
- {percentile, [
- {50, 0.0},
- {75, 4500},
- {90, 0.0},
- {95, 0.0},
- {99, 0.0},
- {999, 0.0}
- ]},
- {histogram, [
- {0, 0}
- ]},
- {n, 0}
- ]},
- {type, histogram},
- {desc, <<"length of time processing a mango query">>}
- ])
- )
- ].
-
-test_to_prom_output(Metric, Type, Val) ->
- Out = couch_prometheus_util:to_prom(Metric, Type, Val),
- lists:nth(2, Out).
-
-test_to_prom_sum_output(Metric, Info) ->
- Out = couch_prometheus_util:to_prom_summary(Metric, Info),
- lists:nth(3, Out).