You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2021/09/08 22:52:10 UTC

[couchdb] branch improve-get-db-timeouts created (now 0fa1d44)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch improve-get-db-timeouts
in repository https://gitbox.apache.org/repos/asf/couchdb.git.


      at 0fa1d44  Improve fabric_util get_db timeout logic

This branch includes the following new commits:

     new 0fa1d44  Improve fabric_util get_db timeout logic

The 1 revision listed above as "new" is entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[couchdb] 01/01: Improve fabric_util get_db timeout logic

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch improve-get-db-timeouts
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 0fa1d448478c72c296d13e68cb0e6231b389d59f
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Wed Sep 8 18:20:46 2021 -0400

    Improve fabric_util get_db timeout logic
    
    Previously, users with low {Q, N} dbs often got the `"No DB shards could be
    opened."` error when the cluster is overloaded. The hard-coded 100 msec timeout
    was too low to open the few available shards and the whole request would crash
    with a 500 error.
    
    Attempt to calculate an optimal timeout value based on the number of shards and
    the max fabric request timeout limit.
    
    The sequence of doubling (by default) timeouts forms a geometric progression.
    Use the well known closed form formula for the sum [0], and the maximum request
    timeout, to calculate the initial timeout. The test case illustrates a few
    examples with some default Q and N values.
    
    Because we don't want the timeout value to be too low, since it takes time to
    open shards, and we don't want to quickly cycle through a few initial shards
    and discard the results, the minimum initial timeout is clipped to the
    previously hard-coded 100 msec timeout. Unlike previously, however, this
    minimum value can now also be configured.
    
    Another issue with the previous code was that it emitted a generic error
    without a specific reason why the shards could not be opened. Timeout was the
    most likely reason, but to confirm it, the user either had to enable debug
    logging or apply clever Erlang tracing on the `couch_log:debug/2` call. So as
    an improvement, pass the reason string into the get_shard/5 recursive call so
    it can be bubbled up with the error tuple.
    
    [0] https://en.wikipedia.org/wiki/Geometric_series
---
 rel/overlay/etc/default.ini    |  1 +
 src/fabric/src/fabric_util.erl | 90 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini
index d3710ce..93aa1ca 100644
--- a/rel/overlay/etc/default.ini
+++ b/rel/overlay/etc/default.ini
@@ -276,6 +276,7 @@ bind_address = 127.0.0.1
 ; all_docs_concurrency = 10
 ; changes_duration = 
 ; shard_timeout_factor = 2
+; shard_timeout_min_msec = 100
 ; uuid_prefix_len = 7
 ; request_timeout = 60000
 ; all_docs_timeout = 10000
diff --git a/src/fabric/src/fabric_util.erl b/src/fabric/src/fabric_util.erl
index 9dd8e71..2ba89f2 100644
--- a/src/fabric/src/fabric_util.erl
+++ b/src/fabric/src/fabric_util.erl
@@ -100,19 +100,22 @@ get_db(DbName) ->
     get_db(DbName, []).
 
 get_db(DbName, Options) ->
-    {Local, SameZone, DifferentZone} = mem3:group_by_proximity(mem3:shards(DbName)),
+    LiveShards = mem3:live_shards(DbName, [node() | nodes()]),
+    {Local, SameZone, DifferentZone} = mem3:group_by_proximity(LiveShards),
     % Prefer shards on the same node over other nodes, prefer shards in the same zone over
    % other zones, and sort each remote list by name so that we don't repeatedly try the same node.
     Shards = Local ++ lists:keysort(#shard.name, SameZone) ++ lists:keysort(#shard.name, DifferentZone),
-    % suppress shards from down nodes
-    Nodes = [node()|erlang:nodes()],
-    Live = [S || #shard{node = N} = S <- Shards, lists:member(N, Nodes)],
-    Factor = list_to_integer(config:get("fabric", "shard_timeout_factor", "2")),
-    get_shard(Live, Options, 100, Factor).
-
-get_shard([], _Opts, _Timeout, _Factor) ->
-    erlang:error({internal_server_error, "No DB shards could be opened."});
-get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) ->
+    % Only accept factors > 1, otherwise our math breaks further down
+    Factor = max(2, config:get_integer("fabric", "shard_timeout_factor", 2)),
+    MinTimeout = config:get_integer("fabric", "shard_timeout_min_msec", 100),
+    MaxTimeout = request_timeout(),
+    Timeout = get_db_timeout(length(Shards), Factor, MinTimeout, MaxTimeout),
+    get_shard(Shards, Options, Timeout, Factor, "No shards found").
+
+get_shard([], _Opts, _Timeout, _Factor, Reason) ->
+    Msg = io_lib:format("No DB shards could be opened. Reason: ~s", [Reason]),
+    erlang:error({internal_server_error, lists:flatten(Msg)});
+get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor, _) ->
     Mon = rexi_monitor:start([rexi_utils:server_pid(Node)]),
     MFA = {fabric_rpc, open_shard, [Name, [{timeout, Timeout} | Opts]]},
     Ref = rexi:cast(Node, self(), MFA, [sync]),
@@ -125,15 +128,40 @@ get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) ->
             throw(Error);
         {Ref, Reason} ->
             couch_log:debug("Failed to open shard ~p because: ~p", [Name, Reason]),
-            get_shard(Rest, Opts, Timeout, Factor)
+            FmtReason = lists:flatten(io_lib:format("~p", [Reason])),
+            get_shard(Rest, Opts, Timeout, Factor, FmtReason)
         after Timeout ->
             couch_log:debug("Failed to open shard ~p after: ~p", [Name, Timeout]),
-            get_shard(Rest, Opts, Factor * Timeout, Factor)
+            NextTimeout = min(request_timeout(), Factor * Timeout),
+            FmtReason = lists:flatten(io_lib:format("timeout ~B msec", [Timeout])),
+            get_shard(Rest, Opts, NextTimeout, Factor, FmtReason)
         end
     after
         rexi_monitor:stop(Mon)
     end.
 
+
+get_db_timeout(N, Factor, MinTimeout, MaxTimeout) ->
+    %
+    % The progression of timeouts forms a geometric series:
+    %
+    %     MaxTimeout = T + T*F + T*F^2 + T*F^3 ...
+    %
+    % Where T is the initial timeout and F is the factor. The formula for
+    % the sum is:
+    %
+    %     Sum[T * F^I, I <- 0..N] = T * (1 - F^(N + 1)) / (1 - F)
+    %
+    % Then, for a given sum and factor we can calculate the initial timeout T:
+    %
+    %     T = Sum / ((1 - F^(N+1)) / (1 - F))
+    %
+    Timeout = MaxTimeout / ((1 - math:pow(Factor, N + 1)) / (1 - Factor)),
+
+    % Apply a minimum timeout value
+    max(MinTimeout, trunc(Timeout)).
+
+
 error_info({{timeout, _} = Error, _Stack}) ->
     Error;
 error_info({{Error, Reason}, Stack}) ->
@@ -400,3 +428,41 @@ do_isolate(Fun) ->
 
 
 -endif.
+
+
+get_db_timeout_test() ->
+    % Q=1, N=1
+    ?assertEqual(20000, get_db_timeout(1, 2, 100, 60000)),
+
+    % Q=2, N=1
+    ?assertEqual(8571, get_db_timeout(2, 2, 100, 60000)),
+
+    % Q=2, N=3 (default)
+    ?assertEqual(472, get_db_timeout(2 * 3, 2, 100, 60000)),
+
+    % Q=3, N=3
+    ?assertEqual(100, get_db_timeout(3 * 3, 2, 100, 60000)),
+
+    % Q=4, N=1
+    ?assertEqual(1935, get_db_timeout(4, 2, 100, 60000)),
+
+    % Q=8, N=1
+    ?assertEqual(117, get_db_timeout(8, 2, 100, 60000)),
+
+    % Q=8, N=3 (default in 2.x)
+    ?assertEqual(100, get_db_timeout(8 * 3, 2, 100, 60000)),
+
+    % Q=256, N=3
+    ?assertEqual(100, get_db_timeout(256 * 3, 2, 100, 60000)),
+
+    % Large factor = 100
+    ?assertEqual(100, get_db_timeout(2 * 3, 100, 100, 60000)),
+
+    % Small total request timeout = 1 sec
+    ?assertEqual(100, get_db_timeout(2 * 3, 2, 100, 1000)),
+
+    % Large total request timeout
+    ?assertEqual(28346, get_db_timeout(2 * 3, 2, 100, 3600000)),
+
+    % No shards at all
+    ?assertEqual(60000, get_db_timeout(0, 2, 100, 60000)).