You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by wi...@apache.org on 2023/03/13 18:57:40 UTC

[couchdb] 02/02: prometheus: add HELP

This is an automated email from the ASF dual-hosted git repository.

willholley pushed a commit to branch prometheus-help
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 07d5807a5a70bf072ab9765ce63df80fddc47853
Author: Will Holley <wi...@apache.org>
AuthorDate: Mon Mar 13 16:48:15 2023 +0000

    prometheus: add HELP
    
    Adds HELP annotations for Prometheus metrics, following the text
    exposition format spec at https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md
---
 .../src/couch_prometheus_server.erl                | 86 +++++++++++++++-----
 src/couch_prometheus/src/couch_prometheus_util.erl | 95 +++++++++++++++-------
 .../test/eunit/couch_prometheus_e2e_tests.erl      | 18 +++-
 .../test/eunit/couch_prometheus_util_tests.erl     | 70 ----------------
 4 files changed, 149 insertions(+), 120 deletions(-)

diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index 7a0eb4bf9..96def1a7d 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -16,7 +16,9 @@
 
 -import(couch_prometheus_util, [
     couch_to_prom/3,
-    to_prom/3,
+    type_def/3,
+    to_prom/2,
+    to_prom/4,
     to_prom_summary/2
 ]).
 
@@ -108,7 +110,6 @@ get_couchdb_stats() ->
 get_system_stats() ->
     lists:flatten([
         get_uptime_stat(),
-        get_vm_stats(),
         get_io_stats(),
         get_message_queue_stats(),
         get_run_queue_stats(),
@@ -117,7 +118,7 @@ get_system_stats() ->
     ]).
 
 get_uptime_stat() ->
-    to_prom(uptime_seconds, counter, couch_app:uptime() div 1000).
+    to_prom(uptime_seconds, counter, "couchdb uptime", couch_app:uptime() div 1000).
 
 get_vm_stats() ->
     MemLabels = lists:map(
@@ -132,29 +133,70 @@ get_vm_stats() ->
     ProcCount = erlang:system_info(process_count),
     ProcLimit = erlang:system_info(process_limit),
     [
-        to_prom(erlang_memory_bytes, gauge, MemLabels),
-        to_prom(erlang_gc_collections_total, counter, NumGCs),
-        to_prom(erlang_gc_words_reclaimed_total, counter, WordsReclaimed),
-        to_prom(erlang_context_switches_total, counter, CtxSwitches),
-        to_prom(erlang_reductions_total, counter, Reds),
-        to_prom(erlang_processes, gauge, ProcCount),
-        to_prom(erlang_process_limit, gauge, ProcLimit)
+        to_prom(
+            erlang_memory_bytes,
+            gauge,
+            "size of memory dynamically allocated by the Erlang emulator",
+            MemLabels
+        ),
+        to_prom(
+            erlang_gc_collections_total,
+            counter,
+            "number of garbage collections by the Erlang emulator",
+            NumGCs
+        ),
+        to_prom(
+            erlang_gc_words_reclaimed_total,
+            counter,
+            "number of words reclaimed by garbage collections",
+            WordsReclaimed
+        ),
+        to_prom(
+            erlang_context_switches_total, counter, "total number of context switches", CtxSwitches
+        ),
+        to_prom(erlang_reductions_total, counter, "total number of reductions", Reds),
+        to_prom(erlang_processes, gauge, "the number of Erlang processes", ProcCount),
+        to_prom(
+            erlang_process_limit,
+            gauge,
+            "the maximum number of simultaneously existing Erlang processes",
+            ProcLimit
+        )
     ].
 
 get_io_stats() ->
     {{input, In}, {output, Out}} = erlang:statistics(io),
     [
-        to_prom(erlang_io_recv_bytes_total, counter, In),
-        to_prom(erlang_io_sent_bytes_total, counter, Out)
+        to_prom(
+            erlang_io_recv_bytes_total,
+            counter,
+            "the total number of bytes received through ports",
+            In
+        ),
+        to_prom(
+            erlang_io_sent_bytes_total, counter, "the total number of bytes output to ports", Out
+        )
     ].
 
 get_message_queue_stats() ->
     QLenFun = fun(Name) -> message_queue_len(whereis(Name)) end,
     Queues = lists:map(QLenFun, registered()),
     [
-        to_prom(erlang_message_queues, gauge, lists:sum(Queues)),
-        to_prom(erlang_message_queue_min, gauge, lists:min(Queues)),
-        to_prom(erlang_message_queue_max, gauge, lists:max(Queues))
+        to_prom(
+            erlang_message_queues, gauge, "total size of all message queues", lists:sum(Queues)
+        ),
+        to_prom(
+            erlang_message_queue_min,
+            gauge,
+            "minimum size across all message queues",
+            lists:min(Queues)
+        ),
+        to_prom(
+            erlang_message_queue_max,
+            gauge,
+            "maximum size across all message queues",
+            lists:max(Queues)
+        )
     ].
 
 message_queue_len(undefined) ->
@@ -178,13 +220,18 @@ get_run_queue_stats() ->
                 {lists:sum(SQs), DCQ}
         end,
     [
-        to_prom(erlang_scheduler_queues, gauge, Normal),
-        to_prom(erlang_dirty_cpu_scheduler_queues, gauge, Dirty)
+        to_prom(erlang_scheduler_queues, gauge, "the total size of all normal run queues", Normal),
+        to_prom(
+            erlang_dirty_cpu_scheduler_queues,
+            gauge,
+            "the total size of all dirty CPU scheduler run queues",
+            Dirty
+        )
     ].
 
 get_ets_stats() ->
     NumTabs = length(ets:all()),
-    to_prom(erlang_ets_table, gauge, NumTabs).
+    to_prom(erlang_ets_table, gauge, "number of ETS tables", NumTabs).
 
 drain_refresh_messages() ->
     receive
@@ -206,7 +253,8 @@ system_stats_test() ->
     lists:foreach(
         fun(Line) ->
             ?assert(is_binary(Line)),
-            ?assert((starts_with(<<"couchdb_">>, Line) orelse starts_with(<<"# TYPE ">>, Line)))
+            Trimmed = string:trim(Line),
+            ?assert(starts_with(<<"couchdb_">>, Trimmed) orelse starts_with(<<"# ">>, Trimmed))
         end,
         get_system_stats()
     ).
diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl
index c9563687e..412aca8ac 100644
--- a/src/couch_prometheus/src/couch_prometheus_util.erl
+++ b/src/couch_prometheus/src/couch_prometheus_util.erl
@@ -15,56 +15,80 @@
 -export([
     couch_to_prom/3,
     to_bin/1,
-    to_prom/3,
+    type_def/3,
+    to_prom/2,
+    to_prom/4,
     to_prom_summary/2
 ]).
 
 -include("couch_prometheus.hrl").
 
 couch_to_prom([couch_log, level, alert], Info, _All) ->
-    to_prom(couch_log_requests_total, counter, {[{level, alert}], val(Info)});
+    to_prom(couch_log_requests_total, counter, "number of logged messages", {
+        [{level, alert}], val(Info)
+    });
 couch_to_prom([couch_log, level, Level], Info, _All) ->
     to_prom(couch_log_requests_total, {[{level, Level}], val(Info)});
 couch_to_prom([couch_replicator, checkpoints, failure], Info, _All) ->
-    to_prom(couch_replicator_checkpoints_failure_total, counter, val(Info));
+    to_prom(couch_replicator_checkpoints_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, checkpoints, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, checkpoints, failure], All),
-    to_prom(couch_replicator_checkpoints_total, counter, Total);
+    to_prom(couch_replicator_checkpoints_total, counter, "number of checkpoint saves", Total);
 couch_to_prom([couch_replicator, responses, failure], Info, _All) ->
-    to_prom(couch_replicator_responses_failure_total, counter, val(Info));
+    to_prom(couch_replicator_responses_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, responses, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, responses, failure], All),
-    to_prom(couch_replicator_responses_total, counter, Total);
+    to_prom(
+        couch_replicator_responses_total,
+        counter,
+        "number of HTTP responses received by the replicator",
+        Total
+    );
 couch_to_prom([couch_replicator, stream_responses, failure], Info, _All) ->
-    to_prom(couch_replicator_stream_responses_failure_total, counter, val(Info));
+    to_prom(couch_replicator_stream_responses_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, stream_responses, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, stream_responses, failure], All),
-    to_prom(couch_replicator_stream_responses_total, counter, Total);
+    to_prom(
+        couch_replicator_stream_responses_total,
+        counter,
+        "number of streaming HTTP responses received by the replicator",
+        Total
+    );
 couch_to_prom([couchdb, auth_cache_hits], Info, All) ->
     Total = val(Info) + val([couchdb, auth_cache_misses], All),
-    to_prom(auth_cache_requests_total, counter, Total);
+    to_prom(auth_cache_requests_total, counter, "number of authentication cache requests", Total);
 couch_to_prom([couchdb, auth_cache_misses], Info, _All) ->
-    to_prom(auth_cache_misses_total, counter, val(Info));
+    to_prom(auth_cache_misses_total, counter, desc(Info), val(Info));
+% force a # TYPE definition for httpd_request_methods
 couch_to_prom([couchdb, httpd_request_methods, 'COPY'], Info, _All) ->
-    to_prom(httpd_request_methods, counter, {[{method, 'COPY'}], val(Info)});
+    to_prom(httpd_request_methods, counter, "number of HTTP requests by method", {
+        [{method, 'COPY'}], val(Info)
+    });
 couch_to_prom([couchdb, httpd_request_methods, Method], Info, _All) ->
     to_prom(httpd_request_methods, {[{method, Method}], val(Info)});
+% force a # TYPE definition for httpd_status_codes
+couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) ->
+    to_prom(httpd_status_codes, counter, "number of HTTP responses by status code", {
+        [{code, 200}], val(Info)
+    });
 couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) ->
     to_prom(httpd_status_codes, {[{code, Code}], val(Info)});
 couch_to_prom([ddoc_cache, hit], Info, All) ->
     Total = val(Info) + val([ddoc_cache, miss], All),
-    to_prom(ddoc_cache_requests_total, counter, Total);
+    to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total);
 couch_to_prom([ddoc_cache, miss], Info, _All) ->
-    to_prom(ddoc_cache_requests_failures_total, counter, val(Info));
+    to_prom(ddoc_cache_requests_failures_total, counter, desc(Info), val(Info));
 couch_to_prom([ddoc_cache, recovery], Info, _All) ->
-    to_prom(ddoc_cache_requests_recovery_total, counter, val(Info));
+    to_prom(ddoc_cache_requests_recovery_total, counter, desc(Info), val(Info));
 couch_to_prom([fabric, read_repairs, failure], Info, _All) ->
-    to_prom(fabric_read_repairs_failures_total, counter, val(Info));
+    to_prom(fabric_read_repairs_failures_total, counter, desc(Info), val(Info));
 couch_to_prom([fabric, read_repairs, success], Info, All) ->
     Total = val(Info) + val([fabric, read_repairs, failure], All),
-    to_prom(fabric_read_repairs_total, counter, Total);
+    to_prom(fabric_read_repairs_total, counter, "number of fabric read repairs", Total);
 couch_to_prom([rexi, streams, timeout, init_stream], Info, _All) ->
-    to_prom(rexi_streams_timeout_total, counter, {[{stage, init_stream}], val(Info)});
+    to_prom(rexi_streams_timeout_total, counter, "number of rexi stream timeouts", {
+        [{stage, init_stream}], val(Info)
+    });
 couch_to_prom([rexi_streams, timeout, Stage], Info, _All) ->
     to_prom(rexi_streams_timeout_total, {[{stage, Stage}], val(Info)});
 couch_to_prom([couchdb | Rest], Info, All) ->
@@ -73,15 +97,22 @@ couch_to_prom(Path, Info, _All) ->
     case lists:keyfind(type, 1, Info) of
         {type, counter} ->
             Metric = counter_metric(Path),
-            to_prom(Metric, counter, val(Info));
+            to_prom(Metric, counter, desc(Info), val(Info));
         {type, gauge} ->
-            to_prom(path_to_name(Path), gauge, val(Info));
+            to_prom(path_to_name(Path), gauge, desc(Info), val(Info));
         {type, histogram} ->
             to_prom_summary(Path, Info)
     end.
 
-to_prom(Metric, Type, Data) ->
-    TypeStr = to_bin(io_lib:format("# TYPE ~s ~s", [to_prom_name(Metric), Type])),
+type_def(Metric, Type, Desc) ->
+    Name = to_prom_name(Metric),
+    [
+        to_bin(io_lib:format("\n# HELP ~s ~s\r", [Name, Desc])),
+        to_bin(io_lib:format("# TYPE ~s ~s", [Name, Type]))
+    ].
+
+to_prom(Metric, Type, Desc, Data) ->
+    TypeStr = type_def(Metric, Type, Desc),
     [TypeStr] ++ to_prom(Metric, Data).
 
 to_prom(Metric, Instances) when is_list(Instances) ->
@@ -130,7 +161,7 @@ to_prom_summary(Path, Info) ->
     SumStat = to_prom(SumMetric, Count * Mean),
     CountMetric = path_to_name(Path ++ ["seconds", "count"]),
     CountStat = to_prom(CountMetric, Count),
-    to_prom(Metric, summary, Quantiles) ++ [SumStat, CountStat].
+    to_prom(Metric, summary, desc(Info), Quantiles) ++ [SumStat, CountStat].
 
 to_prom_name(Metric) ->
     to_bin(io_lib:format("couchdb_~s", [Metric])).
@@ -168,7 +199,9 @@ val(Key, Stats) ->
     {Key, Data} = lists:keyfind(Key, 1, Stats),
     val(Data).
 
-
+desc(Info) ->
+    {desc, V} = lists:keyfind(desc, 1, Info),
+    V.
 
 -ifdef(TEST).
 -include_lib("couch/include/couch_eunit.hrl").
@@ -177,15 +210,17 @@ to_prom_test_() ->
     [
         ?_assertEqual(
             <<"couchdb_ddoc_cache 10">>,
-            test_to_prom_output(ddoc_cache, counter, 10)
+            test_to_prom_output(ddoc_cache, counter, "size of ddoc cache", 10)
         ),
         ?_assertEqual(
             <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
-            test_to_prom_output(httpd_status_codes, counter, {[{code, 200}], 3})
+            test_to_prom_output(httpd_status_codes, counter, "HTTP request status by code", {
+                [{code, 200}], 3
+            })
         ),
         ?_assertEqual(
             <<"couchdb_temperature_celsius 36">>,
-            test_to_prom_output(temperature_celsius, gauge, 36)
+            test_to_prom_output(temperature_celsius, gauge, "temp", 36)
         ),
         ?_assertEqual(
             <<"couchdb_mango_query_time_seconds{quantile=\"0.75\"} 4.5">>,
@@ -224,16 +259,16 @@ counter_metric_test_() ->
     [
         ?_assertEqual(
             "document_purges_total",
-            counter_metric([document_purges,total])
+            counter_metric([document_purges, total])
         ),
         ?_assertEqual(
             "document_purges_total",
-            counter_metric([document_purges,total])
+            counter_metric([document_purges, total])
         )
     ].
 
-test_to_prom_output(Metric, Type, Val) ->
-    Out = to_prom(Metric, Type, Val),
+test_to_prom_output(Metric, Type, Desc, Val) ->
+    Out = to_prom(Metric, Type, Desc, Val),
     lists:nth(2, Out).
 
 test_to_prom_sum_output(Metric, Info) ->
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
index 5f458ba8c..9b1c47633 100644
--- a/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
+++ b/src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
@@ -39,7 +39,8 @@ e2e_test_() ->
                 [
                     ?TDEF_FE(t_chttpd_port),
                     ?TDEF_FE(t_prometheus_port),
-                    ?TDEF_FE(t_metric_updated)
+                    ?TDEF_FE(t_metric_updated),
+                    ?TDEF_FE(t_no_duplicate_metrics)
                 ]
             }
         }
@@ -105,6 +106,21 @@ t_reject_prometheus_port(Port) ->
     Response = test_request:get(node_local_url(Port), [?CONTENT_JSON, ?AUTH]),
     ?assertEqual({error, {conn_failed, {error, econnrefused}}}, Response).
 
+t_no_duplicate_metrics(Port) ->
+    Url = node_local_url(Port),
+    Stats = get_stats(Url),
+    Lines = re:split(Stats, "\n"),
+    % Filter the result to only the lines containing the metric
+    % definition, not the values. These lines always start with
+    % a # character.
+    MetricDefs = lists:filter(fun(S) -> string:find(S, "#") =:= S end, Lines),
+    ?assertNotEqual(erlang:length(MetricDefs), 0),
+    Diff = get_duplicates(MetricDefs),
+    ?assertEqual(erlang:length(Diff), 0).
+
+get_duplicates(List) ->
+    List -- sets:to_list(sets:from_list(List)).
+
 t_metric_updated(Port) ->
     % The passage of time should increment this metric
     Metric = "couchdb_uptime_seconds",
diff --git a/src/couch_prometheus/test/eunit/couch_prometheus_util_tests.erl b/src/couch_prometheus/test/eunit/couch_prometheus_util_tests.erl
deleted file mode 100644
index 547741c5f..000000000
--- a/src/couch_prometheus/test/eunit/couch_prometheus_util_tests.erl
+++ /dev/null
@@ -1,70 +0,0 @@
-% Licensed under the Apache License, Version 2.0 (the "License"); you may not
-% use this file except in compliance with the License. You may obtain a copy of
-% the License at
-%
-%   http://www.apache.org/licenses/LICENSE-2.0
-%
-% Unless required by applicable law or agreed to in writing, software
-% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-% License for the specific language governing permissions and limitations under
-% the License.
-
--module(couch_prometheus_util_tests).
-
--include_lib("couch/include/couch_eunit.hrl").
-
-couch_prometheus_util_test_() ->
-    [
-        ?_assertEqual(
-            <<"couchdb_ddoc_cache 10">>,
-            test_to_prom_output(ddoc_cache, counter, 10)
-        ),
-        ?_assertEqual(
-            <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
-            test_to_prom_output(httpd_status_codes, counter, {[{code, 200}], 3})
-        ),
-        ?_assertEqual(
-            <<"couchdb_temperature_celsius 36">>,
-            test_to_prom_output(temperature_celsius, gauge, 36)
-        ),
-        ?_assertEqual(
-            <<"couchdb_mango_query_time_seconds{quantile=\"0.75\"} 4.5">>,
-            test_to_prom_sum_output([mango_query_time], [
-                {value, [
-                    {min, 0.0},
-                    {max, 0.0},
-                    {arithmetic_mean, 0.0},
-                    {geometric_mean, 0.0},
-                    {harmonic_mean, 0.0},
-                    {median, 0.0},
-                    {variance, 0.0},
-                    {standard_deviation, 0.0},
-                    {skewness, 0.0},
-                    {kurtosis, 0.0},
-                    {percentile, [
-                        {50, 0.0},
-                        {75, 4500},
-                        {90, 0.0},
-                        {95, 0.0},
-                        {99, 0.0},
-                        {999, 0.0}
-                    ]},
-                    {histogram, [
-                        {0, 0}
-                    ]},
-                    {n, 0}
-                ]},
-                {type, histogram},
-                {desc, <<"length of time processing a mango query">>}
-            ])
-        )
-    ].
-
-test_to_prom_output(Metric, Type, Val) ->
-    Out = couch_prometheus_util:to_prom(Metric, Type, Val),
-    lists:nth(2, Out).
-
-test_to_prom_sum_output(Metric, Info) ->
-    Out = couch_prometheus_util:to_prom_summary(Metric, Info),
-    lists:nth(3, Out).