You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2023/05/20 16:29:01 UTC
[couchdb] 01/01: Add a few more commit and query engine metrics
This is an automated email from the ASF dual-hosted git repository.
vatamane pushed a commit to branch add-a-few-more-metrics
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit f36c324c8f75376423c43df92b4f368497b7a7d4
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Sat May 20 11:51:28 2023 -0400
Add a few more commit and query engine metrics
fsync times can vary widely, especially on a remote block storage, so add a
histogram and count for fsync calls. At a higher level, we can coalesce similar
update doc requests from multiple clients, so add metrics to track that as
well.
Improving couch_proc_manager, and experimenting with quickjs js engine [1],
noticed we lacked metrics in that area. Add metrics to track query process
acquires, start/stops (including crashes) as well as how many prompt and prompt
errors we have. These should help gain some confidence when upgrading to a new
engine version or engine type by comparng the before and after metrics output.
Trying to test some of these new metrics, noticed couch_file tests did most of
the test logic in the setup part, which resulted in plenty of log noise during
the test runs about started/stopped applications, so updated a few tests which
were affected by this PR to use the ?TDEF_FE macro. That should make test
output a bit smaller and neater.
[1] https://github.com/apache/couchdb/issues/4448#issuecomment-1447196423
---
src/couch/priv/stats_descriptions.cfg | 44 ++++++++
src/couch/src/couch_bt_engine.erl | 1 +
src/couch/src/couch_db_updater.erl | 4 +
src/couch/src/couch_file.erl | 14 ++-
src/couch/src/couch_os_process.erl | 7 ++
src/couch/src/couch_proc_manager.erl | 8 +-
src/couch/test/eunit/couch_file_tests.erl | 115 +++++++++++++--------
src/couch/test/eunit/couch_js_tests.erl | 8 +-
.../src/cpse_test_read_write_docs.erl | 5 +-
9 files changed, 156 insertions(+), 50 deletions(-)
diff --git a/src/couch/priv/stats_descriptions.cfg b/src/couch/priv/stats_descriptions.cfg
index 7c8fd94cb..38ae37bff 100644
--- a/src/couch/priv/stats_descriptions.cfg
+++ b/src/couch/priv/stats_descriptions.cfg
@@ -278,6 +278,18 @@
{type, histogram},
{desc, <<"length of a request inside CouchDB without MochiWeb">>}
]}.
+{[couchdb, commits], [
+ {type, counter},
+ {desc, <<"number of commits performed">>}
+]}.
+{[couchdb, coalesced_updates, interactive], [
+ {type, counter},
+ {desc, <<"number of coalesced interactive updates">>}
+]}.
+{[couchdb, coalesced_updates, replicated], [
+ {type, counter},
+ {desc, <<"number of coalesced replicated updates">>}
+]}.
{[couchdb, couch_server, lru_skip], [
{type, counter},
{desc, <<"number of couch_server LRU operations skipped">>}
@@ -290,6 +302,30 @@
{type, histogram},
{desc, <<"duration of validate_doc_update function calls">>}
]}.
+{[couchdb, query_server, acquired_processes], [
+ {type, counter},
+ {desc, <<"number of acquired external processes">>}
+]}.
+{[couchdb, query_server, process_starts], [
+ {type, counter},
+ {desc, <<"number of os process starts">>}
+]}.
+{[couchdb, query_server, process_exists], [
+ {type, counter},
+ {desc, <<"number of os normal process exits">>}
+]}.
+{[couchdb, query_server, process_errors], [
+ {type, counter},
+ {desc, <<"number of os error process exits">>}
+]}.
+{[couchdb, query_server, process_prompts], [
+ {type, counter},
+ {desc, <<"number of successful os process prompts">>}
+]}.
+{[couchdb, query_server, process_prompt_errors], [
+ {type, counter},
+ {desc, <<"number of os process prompt errors">>}
+]}.
{[pread, exceed_eof], [
{type, counter},
{desc, <<"number of the attempts to read beyond end of db file">>}
@@ -298,6 +334,14 @@
{type, counter},
{desc, <<"number of the attempts to read beyond set limit">>}
]}.
+{[fsync, time], [
+ {type, histogram},
+ {desc, <<"microseconds to call fsync">>}
+]}.
+{[fsync, count], [
+ {type, counter},
+ {desc, <<"number of fsync calls">>}
+]}.
{[mango, unindexed_queries], [
{type, counter},
{desc, <<"number of mango queries that could not use an index">>}
diff --git a/src/couch/src/couch_bt_engine.erl b/src/couch/src/couch_bt_engine.erl
index e5b7749f3..7bc02146e 100644
--- a/src/couch/src/couch_bt_engine.erl
+++ b/src/couch/src/couch_bt_engine.erl
@@ -570,6 +570,7 @@ commit_data(St) ->
couch_file:sync(Fd),
ok = couch_file:write_header(Fd, NewHeader),
couch_file:sync(Fd),
+ couch_stats:increment_counter([couchdb, commits]),
{ok, St#st{
header = NewHeader,
needs_commit = false
diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl
index 93cdba16e..30f61aea0 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -295,6 +295,10 @@ collect_updates(GroupedDocsAcc, ClientsAcc, MergeConflicts) ->
% updaters than deal with their possible conflicts, and local docs
% writes are relatively rare. Can be optmized later if really needed.
{update_docs, Client, GroupedDocs, [], MergeConflicts} ->
+ case MergeConflicts of
+ true -> couch_stats:increment_counter([couchdb, coalesced_updates, replicated]);
+ false -> couch_stats:increment_counter([couchdb, coalesced_updates, interactive])
+ end,
GroupedDocs2 = sort_and_tag_grouped_docs(Client, GroupedDocs),
GroupedDocsAcc2 =
merge_updates(GroupedDocsAcc, GroupedDocs2),
diff --git a/src/couch/src/couch_file.erl b/src/couch/src/couch_file.erl
index 514d4e3d9..d68f8c49c 100644
--- a/src/couch/src/couch_file.erl
+++ b/src/couch/src/couch_file.erl
@@ -250,7 +250,7 @@ sync(Filepath) when is_list(Filepath) ->
case file:open(Filepath, [append, raw]) of
{ok, Fd} ->
try
- case file:sync(Fd) of
+ case fsync(Fd) of
ok ->
ok;
{error, Reason} ->
@@ -437,7 +437,7 @@ init({Filepath, Options, ReturnPid, Ref}) ->
true ->
{ok, 0} = file:position(Fd, 0),
ok = file:truncate(Fd),
- ok = file:sync(Fd),
+ ok = fsync(Fd),
maybe_track_open_os_files(Options),
erlang:send_after(?INITIAL_WAIT, self(), maybe_close),
{ok, #file{fd = Fd, is_sys = IsSys, pread_limit = Limit}};
@@ -554,7 +554,7 @@ handle_call({set_db_pid, Pid}, _From, #file{db_monitor = OldRef} = File) ->
Ref = monitor(process, Pid),
{reply, ok, File#file{db_monitor = Ref}};
handle_call(sync, _From, #file{fd = Fd} = File) ->
- case file:sync(Fd) of
+ case fsync(Fd) of
ok ->
{reply, ok, File};
{error, _} = Error ->
@@ -642,6 +642,14 @@ format_status(_Opt, [PDict, #file{} = File]) ->
{_Fd, FilePath} = couch_util:get_value(couch_file_fd, PDict),
[{data, [{"State", File}, {"InitialFilePath", FilePath}]}].
+fsync(Fd) ->
+ T0 = erlang:monotonic_time(microsecond),
+ Res = file:sync(Fd),
+ Dt = erlang:monotonic_time(microsecond) - T0,
+ couch_stats:update_histogram([fsync, time], Dt),
+ couch_stats:increment_counter([fsync, count]),
+ Res.
+
find_header(Fd, Block) ->
case (catch load_header(Fd, Block)) of
{ok, Bin} ->
diff --git a/src/couch/src/couch_os_process.erl b/src/couch/src/couch_os_process.erl
index 5ac387fc5..0b6436dda 100644
--- a/src/couch/src/couch_os_process.erl
+++ b/src/couch/src/couch_os_process.erl
@@ -169,6 +169,7 @@ init([Command, Options, PortOptions]) ->
KillCmd = iolist_to_binary(readline(BaseProc)),
Pid = self(),
couch_log:debug("OS Process Start :: ~p", [BaseProc#os_proc.port]),
+ couch_stats:increment_counter([couchdb, query_server, process_starts]),
spawn(fun() ->
% this ensure the real os process is killed when this process dies.
erlang:monitor(process, Pid),
@@ -207,13 +208,17 @@ handle_call({prompt, Data}, _From, #os_proc{idle = Idle} = OsProc) ->
#os_proc{writer = Writer, reader = Reader} = OsProc,
try
Writer(OsProc, Data),
+ couch_stats:increment_counter([couchdb, query_server, process_prompts]),
{reply, {ok, Reader(OsProc)}, OsProc, Idle}
catch
throw:{error, OsError} ->
+ couch_stats:increment_counter([couchdb, query_server, process_errors]),
{reply, OsError, OsProc, Idle};
throw:{fatal, OsError} ->
+ couch_stats:increment_counter([couchdb, query_server, process_errors]),
{stop, normal, OsError, OsProc};
throw:OtherError ->
+ couch_stats:increment_counter([couchdb, query_server, process_errors]),
{stop, normal, OtherError, OsProc}
after
garbage_collect()
@@ -243,9 +248,11 @@ handle_info(timeout, #os_proc{idle = Idle} = OsProc) ->
{noreply, OsProc, Idle};
handle_info({Port, {exit_status, 0}}, #os_proc{port = Port} = OsProc) ->
couch_log:info("OS Process terminated normally", []),
+ couch_stats:increment_counter([couchdb, query_server, process_exits]),
{stop, normal, OsProc};
handle_info({Port, {exit_status, Status}}, #os_proc{port = Port} = OsProc) ->
couch_log:error("OS Process died with status: ~p", [Status]),
+ couch_stats:increment_counter([couchdb, query_server, process_error_exits]),
{stop, {exit_status, Status}, OsProc};
handle_info(Msg, #os_proc{idle = Idle} = OsProc) ->
couch_log:debug("OS Proc: Unknown info: ~p", [Msg]),
diff --git a/src/couch/src/couch_proc_manager.erl b/src/couch/src/couch_proc_manager.erl
index 5055a06f3..623734e6e 100644
--- a/src/couch/src/couch_proc_manager.erl
+++ b/src/couch/src/couch_proc_manager.erl
@@ -81,13 +81,17 @@ get_proc(#doc{body = {Props}} = DDoc, DbKey, {_DDocId, _Rev} = DDocKey) ->
Lang = couch_util:to_binary(LangStr),
Client = #client{lang = Lang, ddoc = DDoc, db_key = DbKey, ddoc_key = DDocKey},
Timeout = get_os_process_timeout(),
- gen_server:call(?MODULE, {get_proc, Client}, Timeout).
+ Res = gen_server:call(?MODULE, {get_proc, Client}, Timeout),
+ couch_stats:increment_counter([couchdb, query_server, acquired_processes]),
+ Res.
get_proc(LangStr) ->
Lang = couch_util:to_binary(LangStr),
Client = #client{lang = Lang},
Timeout = get_os_process_timeout(),
- gen_server:call(?MODULE, {get_proc, Client}, Timeout).
+ Res = gen_server:call(?MODULE, {get_proc, Client}, Timeout),
+ couch_stats:increment_counter([couchdb, query_server, acquired_processes]),
+ Res.
ret_proc(#proc{} = Proc) ->
gen_server:call(?MODULE, {ret_proc, Proc}, infinity).
diff --git a/src/couch/test/eunit/couch_file_tests.erl b/src/couch/test/eunit/couch_file_tests.erl
index 1b54cd70e..061b81e36 100644
--- a/src/couch/test/eunit/couch_file_tests.erl
+++ b/src/couch/test/eunit/couch_file_tests.erl
@@ -70,67 +70,86 @@ read_write_test_() ->
"Common file read/write tests",
{
setup,
- fun() -> test_util:start(?MODULE, [ioq]) end,
- fun test_util:stop/1,
- ?foreach([
- fun should_increase_file_size_on_write/1,
- fun should_return_current_file_size_on_write/1,
- fun should_write_and_read_term/1,
- fun should_write_and_read_binary/1,
- fun should_write_and_read_large_binary/1,
- fun should_return_term_as_binary_for_reading_binary/1,
- fun should_read_term_written_as_binary/1,
- fun should_read_iolist/1,
- fun should_fsync/1,
- fun should_not_read_beyond_eof/1,
- fun should_truncate/1
- ])
+ fun() -> test_util:start_couch() end,
+ fun test_util:stop_couch/1,
+ {
+ foreach,
+ fun setup/0,
+ fun teardown/1,
+ [
+ ?TDEF_FE(should_increase_file_size_on_write),
+ ?TDEF_FE(should_return_current_file_size_on_write),
+ ?TDEF_FE(should_write_and_read_term),
+ ?TDEF_FE(should_write_and_read_binary),
+ ?TDEF_FE(should_write_and_read_large_binary),
+ ?TDEF_FE(should_return_term_as_binary_for_reading_binary),
+ ?TDEF_FE(should_read_term_written_as_binary),
+ ?TDEF_FE(should_read_iolist),
+ ?TDEF_FE(should_fsync),
+ ?TDEF_FE(should_update_fsync_stats),
+ ?TDEF_FE(should_not_read_beyond_eof),
+ ?TDEF_FE(should_truncate)
+ ]
+ }
}
}.
should_increase_file_size_on_write(Fd) ->
{ok, 0, _} = couch_file:append_term(Fd, foo),
{ok, Size} = couch_file:bytes(Fd),
- ?_assert(Size > 0).
+ ?assert(Size > 0).
should_return_current_file_size_on_write(Fd) ->
{ok, 0, _} = couch_file:append_term(Fd, foo),
{ok, Size} = couch_file:bytes(Fd),
- ?_assertMatch({ok, Size, _}, couch_file:append_term(Fd, bar)).
+ ?assertMatch({ok, Size, _}, couch_file:append_term(Fd, bar)).
should_write_and_read_term(Fd) ->
{ok, Pos, _} = couch_file:append_term(Fd, foo),
- ?_assertMatch({ok, foo}, couch_file:pread_term(Fd, Pos)).
+ ?assertMatch({ok, foo}, couch_file:pread_term(Fd, Pos)).
should_write_and_read_binary(Fd) ->
{ok, Pos, _} = couch_file:append_binary(Fd, <<"fancy!">>),
- ?_assertMatch({ok, <<"fancy!">>}, couch_file:pread_binary(Fd, Pos)).
+ ?assertMatch({ok, <<"fancy!">>}, couch_file:pread_binary(Fd, Pos)).
should_return_term_as_binary_for_reading_binary(Fd) ->
{ok, Pos, _} = couch_file:append_term(Fd, foo),
Foo = couch_compress:compress(foo, snappy),
- ?_assertMatch({ok, Foo}, couch_file:pread_binary(Fd, Pos)).
+ ?assertMatch({ok, Foo}, couch_file:pread_binary(Fd, Pos)).
should_read_term_written_as_binary(Fd) ->
{ok, Pos, _} = couch_file:append_binary(Fd, <<131, 100, 0, 3, 102, 111, 111>>),
- ?_assertMatch({ok, foo}, couch_file:pread_term(Fd, Pos)).
+ ?assertMatch({ok, foo}, couch_file:pread_term(Fd, Pos)).
should_write_and_read_large_binary(Fd) ->
BigBin = list_to_binary(lists:duplicate(100000, 0)),
{ok, Pos, _} = couch_file:append_binary(Fd, BigBin),
- ?_assertMatch({ok, BigBin}, couch_file:pread_binary(Fd, Pos)).
+ ?assertMatch({ok, BigBin}, couch_file:pread_binary(Fd, Pos)).
should_read_iolist(Fd) ->
%% append_binary == append_iolist?
%% Possible bug in pread_iolist or iolist() -> append_binary
{ok, Pos, _} = couch_file:append_binary(Fd, ["foo", $m, <<"bam">>]),
{ok, IoList} = couch_file:pread_iolist(Fd, Pos),
- ?_assertMatch(<<"foombam">>, iolist_to_binary(IoList)).
+ ?assertMatch(<<"foombam">>, iolist_to_binary(IoList)).
should_fsync(Fd) ->
- {"How does on test fsync?", ?_assertMatch(ok, couch_file:sync(Fd))}.
-
-should_not_read_beyond_eof(Fd) ->
+ ?assertMatch(ok, couch_file:sync(Fd)).
+
+should_update_fsync_stats(Fd) ->
+ Count0 = couch_stats:sample([fsync, count]),
+ Seq = lists:seq(1, 10),
+ lists:foreach(fun(_) -> ok = couch_file:sync(Fd) end, Seq),
+ Hist = couch_stats:sample([fsync, time]),
+ Count1 = couch_stats:sample([fsync, count]),
+ ?assert(Count1 > Count0),
+ HistMax = proplists:get_value(max, Hist),
+ HistPct = proplists:get_value(percentile, Hist),
+ ?assert(HistMax > 0),
+ ?assertMatch([{50, P50} | _] when P50 > 0, HistPct).
+
+should_not_read_beyond_eof(_) ->
+ {ok, Fd} = couch_file:open(?tempfile(), [create, overwrite]),
BigBin = list_to_binary(lists:duplicate(100000, 0)),
DoubleBin = round(byte_size(BigBin) * 2),
{ok, Pos, _Size} = couch_file:append_binary(Fd, BigBin),
@@ -140,8 +159,10 @@ should_not_read_beyond_eof(Fd) ->
ok = file:pwrite(Io, Pos, <<0:1/integer, DoubleBin:31/integer>>),
file:close(Io),
unlink(Fd),
- ExpectedError = {badmatch, {'EXIT', {bad_return_value, {read_beyond_eof, Filepath}}}},
- ?_assertError(ExpectedError, couch_file:pread_binary(Fd, Pos)).
+ ExpectExit = {bad_return_value, {read_beyond_eof, Filepath}},
+ ExpectError = {badmatch, {'EXIT', ExpectExit}},
+ ?assertError(ExpectError, couch_file:pread_binary(Fd, Pos)),
+ catch file:close(Fd).
should_truncate(Fd) ->
{ok, 0, _} = couch_file:append_term(Fd, foo),
@@ -149,7 +170,7 @@ should_truncate(Fd) ->
BigBin = list_to_binary(lists:duplicate(100000, 0)),
{ok, _, _} = couch_file:append_binary(Fd, BigBin),
ok = couch_file:truncate(Fd, Size),
- ?_assertMatch({ok, foo}, couch_file:pread_term(Fd, 0)).
+ ?assertMatch({ok, foo}, couch_file:pread_term(Fd, 0)).
pread_limit_test_() ->
{
@@ -157,31 +178,39 @@ pread_limit_test_() ->
{
setup,
fun() ->
- Ctx = test_util:start(?MODULE),
- config:set("couchdb", "max_pread_size", "50000"),
+ Ctx = test_util:start_couch([ioq]),
+ config:set("couchdb", "max_pread_size", "50000", _Persist=false),
Ctx
end,
fun(Ctx) ->
- config:delete("couchdb", "max_pread_size"),
- test_util:stop(Ctx)
+ config:delete("couchdb", "max_pread_size", _Persist=false),
+ test_util:stop_couch(Ctx)
end,
- ?foreach([
- fun should_increase_file_size_on_write/1,
- fun should_return_current_file_size_on_write/1,
- fun should_write_and_read_term/1,
- fun should_write_and_read_binary/1,
- fun should_not_read_more_than_pread_limit/1
- ])
+ {
+ foreach,
+ fun setup/0,
+ fun teardown/1,
+ [
+ ?TDEF_FE(should_increase_file_size_on_write),
+ ?TDEF_FE(should_return_current_file_size_on_write),
+ ?TDEF_FE(should_write_and_read_term),
+ ?TDEF_FE(should_write_and_read_binary),
+ ?TDEF_FE(should_not_read_more_than_pread_limit)
+ ]
+ }
}
}.
-should_not_read_more_than_pread_limit(Fd) ->
+should_not_read_more_than_pread_limit(_) ->
+ {ok, Fd} = couch_file:open(?tempfile(), [create, overwrite]),
{_, Filepath} = couch_file:process_info(Fd),
BigBin = list_to_binary(lists:duplicate(100000, 0)),
{ok, Pos, _Size} = couch_file:append_binary(Fd, BigBin),
unlink(Fd),
- ExpectedError = {badmatch, {'EXIT', {bad_return_value, {exceed_pread_limit, Filepath, 50000}}}},
- ?_assertError(ExpectedError, couch_file:pread_binary(Fd, Pos)).
+ ExpectExit = {bad_return_value, {exceed_pread_limit, Filepath, 50000}},
+ ExpectError = {badmatch, {'EXIT', ExpectExit}},
+ ?assertError(ExpectError, couch_file:pread_binary(Fd, Pos)),
+ catch file:close(Fd).
header_test_() ->
{
diff --git a/src/couch/test/eunit/couch_js_tests.erl b/src/couch/test/eunit/couch_js_tests.erl
index ccd2cd0b5..6fa6cad8d 100644
--- a/src/couch/test/eunit/couch_js_tests.erl
+++ b/src/couch/test/eunit/couch_js_tests.erl
@@ -53,6 +53,11 @@ should_create_sandbox() ->
?assertMatch([[[true, <<_/binary>>]]], Result),
[[[true, ErrMsg]]] = Result,
?assertNotEqual([], binary:matches(ErrMsg, <<"not defined">>)),
+ ?assert(couch_stats:sample([couchdb, query_server, process_starts]) > 0),
+ ?assert(couch_stats:sample([couchdb, query_server, process_prompts]) > 0),
+ ?assert(couch_stats:sample([couchdb, query_server, acquired_processes]) > 0),
+ ?assert(couch_stats:sample([couchdb, query_server, process_exists]) >= 0),
+ ?assert(couch_stats:sample([couchdb, query_server, process_errors]) >= 0),
couch_query_servers:ret_os_process(Proc).
%% erlfmt-ignore
@@ -274,7 +279,8 @@ should_exit_on_internal_error() ->
% It may fail and just exit the process. That's expected as well
throw:{os_process_error, _} ->
ok
- end.
+ end,
+ ?assert(couch_stats:sample([couchdb, query_server, process_errors]) > 0).
trigger_oom(Proc) ->
Status =
diff --git a/src/couch_pse_tests/src/cpse_test_read_write_docs.erl b/src/couch_pse_tests/src/cpse_test_read_write_docs.erl
index f51e50aec..16f4283a9 100644
--- a/src/couch_pse_tests/src/cpse_test_read_write_docs.erl
+++ b/src/couch_pse_tests/src/cpse_test_read_write_docs.erl
@@ -43,6 +43,7 @@ cpse_write_one_doc(Db1) ->
?assertEqual(0, couch_db_engine:get_doc_count(Db1)),
?assertEqual(0, couch_db_engine:get_del_doc_count(Db1)),
?assertEqual(0, couch_db_engine:get_update_seq(Db1)),
+ Commits = couch_stats:sample([couchdb, commits]),
Actions = [
{create, {<<"foo">>, {[{<<"vsn">>, 1}]}}}
@@ -57,7 +58,9 @@ cpse_write_one_doc(Db1) ->
?assertEqual(1, couch_db_engine:get_doc_count(Db3)),
?assertEqual(0, couch_db_engine:get_del_doc_count(Db3)),
?assertEqual(1, couch_db_engine:get_update_seq(Db3)),
-
+ ?assert(couch_stats:sample([couchdb, commits]) > Commits),
+ ?assert(couch_stats:sample([couchdb, coalesced_updates, interactive]) >= 0),
+ ?assert(couch_stats:sample([couchdb, coalesced_updates, replicated]) >= 0),
[FDI] = couch_db_engine:open_docs(Db3, [<<"foo">>]),
#rev_info{
rev = {RevPos, PrevRevId},