You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2021/04/26 20:09:40 UTC

[couchdb] branch improve-retry-handling-in-dbs-endpoints created (now 979a597)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch improve-retry-handling-in-dbs-endpoints
in repository https://gitbox.apache.org/repos/asf/couchdb.git.


      at 979a597  Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs

This branch includes the following new commits:

     new 12c34bb  Improve tx retry resilience when transaction restart
     new 979a597  Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[couchdb] 01/02: Improve tx retry resilience when transaction restart

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch improve-retry-handling-in-dbs-endpoints
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 12c34bb5ad970100b08b80cbbb374d07fa8bd485
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Mon Apr 26 15:25:42 2021 -0400

    Improve tx retry resilience when transaction restart
    
    When running integration tests with a "buggified" client [1], sometimes
    `fold_range_not_progressing` is triggered since it's possible retriable errors
    might be thrown 3 times in a row. Instead of bumping it arbitrarily, since we
    already have a retry limit in fabric2_server, start using that.
    
    [1] ```
     ERL_ZFLAGS="-erlfdb network_options '[client_buggify_enable, {client_buggify_section_activated_probability, 25}, {client_buggify_section_fired_probability, 25}]'" make elixir tests=test/elixir/test/basics_test.exs
    ```
---
 src/fabric/src/fabric2_fdb.erl                 | 6 ++----
 src/fabric/src/fabric2_server.erl              | 9 ++++++++-
 src/fabric/test/fabric2_changes_fold_tests.erl | 4 ++++
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index a4c3f89..e86b037 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -91,9 +91,6 @@
 -include("fabric2.hrl").
 
 
--define(MAX_FOLD_RANGE_RETRIES, 3).
-
-
 -record(fold_acc, {
     db,
     restart_tx,
@@ -1882,10 +1879,11 @@ restart_fold(Tx, #fold_acc{} = Acc) ->
 
     ok = erlfdb:reset(Tx),
 
+    MaxRetries = fabric2_server:get_retry_limit(),
     case {erase(?PDICT_FOLD_ACC_STATE), Acc#fold_acc.retries} of
         {#fold_acc{db = Db} = Acc1, _} ->
             Acc1#fold_acc{db = check_db_instance(Db), retries = 0};
-        {undefined, Retries} when Retries < ?MAX_FOLD_RANGE_RETRIES ->
+        {undefined, Retries} when Retries < MaxRetries ->
             Db = check_db_instance(Acc#fold_acc.db),
             Acc#fold_acc{db = Db, retries = Retries + 1};
         {undefined, _} ->
diff --git a/src/fabric/src/fabric2_server.erl b/src/fabric/src/fabric2_server.erl
index e427f20..8a4a8d8 100644
--- a/src/fabric/src/fabric2_server.erl
+++ b/src/fabric/src/fabric2_server.erl
@@ -27,7 +27,8 @@
     maybe_remove/1,
 
     fdb_directory/0,
-    fdb_cluster/0
+    fdb_cluster/0,
+    get_retry_limit/0
 ]).
 
 
@@ -194,6 +195,12 @@ fdb_directory() ->
 fdb_cluster() ->
     get_env(?FDB_CLUSTER).
 
+
+get_retry_limit() ->
+    Default = list_to_integer(?DEFAULT_RETRY_LIMIT),
+    config:get_integer(?TX_OPTIONS_SECTION, "retry_limit", Default).
+
+
 get_env(Key) ->
     case get(Key) of
         undefined ->
diff --git a/src/fabric/test/fabric2_changes_fold_tests.erl b/src/fabric/test/fabric2_changes_fold_tests.erl
index a8578f9..2f63883 100644
--- a/src/fabric/test/fabric2_changes_fold_tests.erl
+++ b/src/fabric/test/fabric2_changes_fold_tests.erl
@@ -81,6 +81,7 @@ changes_fold_test_() ->
 setup_all() ->
     Ctx = test_util:start_couch([fabric]),
     meck:new(erlfdb, [passthrough]),
+    meck:new(fabric2_server, [passthrough]),
     Ctx.
 
 
@@ -91,6 +92,7 @@ teardown_all(Ctx) ->
 
 setup() ->
     fabric2_test_util:tx_too_old_mock_erlfdb(),
+    meck:expect(fabric2_server, get_retry_limit, 0, 3),
     {ok, Db} = fabric2_db:create(?tempdb(), [{user_ctx, ?ADMIN_USER}]),
     Rows = lists:map(fun(Val) ->
         DocId = fabric2_util:uuid(),
@@ -111,6 +113,8 @@ setup() ->
 
 
 cleanup({Db, _DocIdRevs}) ->
+    meck:reset(fabric2_server),
+    meck:expect(fabric2_server, get_retry_limit, 0, meck:passthrough()),
     fabric2_test_util:tx_too_old_reset_errors(),
     ok = fabric2_db:delete(fabric2_db:name(Db), []).
 

[couchdb] 02/02: Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch improve-retry-handling-in-dbs-endpoints
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 979a597dcd6b2a774eab186c19caa9dbc31ee872
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Mon Apr 26 15:30:15 2021 -0400

    Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs
    
    After recent improvements to how retries are handled in `fabric2_fdb`, Elixir
    integration tests can often pass when running under "buggify" mode. The chance
    of re-sending response data during retries is now much lower, too. However,
    there are still some instances of that type of failure when running `_all_dbs`
    tests. To trigger it would have to run the all_dbs test from basics_tests.exs a
    few thousands times in a row. The reason for the failure is that retryable
    errors might be still thrown during the `LayerPrefix = get_dir(Tx)` call, and
    the whole transaction closure would be retried in [2]. When that happens,
    user's callback is called twice with `meta` and it sends `[` twice in a row to
    the client [3], which is incorrect.
    
    A simple fix is to not call `meta` or `complete` from the transaction context.
    That works because we don't pass the transaction object into user's callback
    and the user won't be able to run another transaction in the same process
    anyway.
    
    There are already tests which test retriable errors in _all_dbsa and _dbs_info,
    but they had been updated to only throw retriable errors when rows are emitted
    to match the new behavior.
    
    [0] https://github.com/apache/couchdb/commit/acb43e12fd7fddc6f606246875909f7c7df27324
    
    [1] ```
    ERL_ZFLAGS="-erlfdb network_options '[client_buggify_enable, {client_buggify_section_activated_probability, 35}, {client_buggify_section_fired_probability, 35}]'" make elixir tests=test/elixir/test/basics_test.exs:71
    ```
    [2] https://github.com/apache/couchdb/blob/082f8078411aab7d71cc86afca0fe2eff3104b01/src/fabric/src/fabric2_db.erl#L279-L287
    
    [3] https://github.com/apache/couchdb/blob/082f8078411aab7d71cc86afca0fe2eff3104b01/src/chttpd/src/chttpd_misc.erl#L137
---
 src/fabric/src/fabric2_db.erl             | 67 +++++++++++++------------------
 src/fabric/test/fabric2_db_crud_tests.erl | 18 ++++++---
 2 files changed, 41 insertions(+), 44 deletions(-)

diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl
index 0074865..a5de00c 100644
--- a/src/fabric/src/fabric2_db.erl
+++ b/src/fabric/src/fabric2_db.erl
@@ -275,20 +275,15 @@ list_dbs(UserFun, UserAcc0, Options) ->
     FoldFun = fun
         (DbName, Acc) -> maybe_stop(UserFun({row, [{id, DbName}]}, Acc))
     end,
-    fabric2_fdb:transactional(fun(Tx) ->
-        try
-            UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
-            UserAcc2 = fabric2_fdb:list_dbs(
-                    Tx,
-                    FoldFun,
-                    UserAcc1,
-                    Options
-                ),
-            {ok, maybe_stop(UserFun(complete, UserAcc2))}
-        catch throw:{stop, FinalUserAcc} ->
-            {ok, FinalUserAcc}
-        end
-    end).
+    try
+        UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+        UserAcc2 = fabric2_fdb:transactional(fun(Tx) ->
+            fabric2_fdb:list_dbs(Tx, FoldFun, UserAcc1, Options)
+        end),
+        {ok, maybe_stop(UserFun(complete, UserAcc2))}
+    catch throw:{stop, FinalUserAcc} ->
+        {ok, FinalUserAcc}
+    end.
 
 
 list_dbs_info() ->
@@ -313,22 +308,22 @@ list_dbs_info(UserFun, UserAcc0, Options) ->
         NewFutureQ = queue:in({DbName, InfoFuture}, FutureQ),
         drain_info_futures(NewFutureQ, Count + 1, UserFun, Acc)
     end,
-    fabric2_fdb:transactional(fun(Tx) ->
-        try
-            UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
-            InitAcc = {queue:new(), 0, UserAcc1},
+    try
+        UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+        InitAcc = {queue:new(), 0, UserAcc1},
+        UserAcc3 = fabric2_fdb:transactional(fun(Tx) ->
             {FinalFutureQ, _, UserAcc2} = fabric2_fdb:list_dbs_info(
                     Tx,
                     FoldFun,
                     InitAcc,
                     Options
                 ),
-            UserAcc3 = drain_all_info_futures(FinalFutureQ, UserFun, UserAcc2),
-            {ok, maybe_stop(UserFun(complete, UserAcc3))}
-        catch throw:{stop, FinalUserAcc} ->
-            {ok, FinalUserAcc}
-        end
-    end).
+            drain_all_info_futures(FinalFutureQ, UserFun, UserAcc2)
+        end),
+        {ok, maybe_stop(UserFun(complete, UserAcc3))}
+    catch throw:{stop, FinalUserAcc} ->
+        {ok, FinalUserAcc}
+    end.
 
 
 list_deleted_dbs_info() ->
@@ -390,26 +385,22 @@ list_deleted_dbs_info(UserFun, UserAcc0, Options0) ->
         NewFutureQ = queue:in({DbName, TimeStamp, InfoFuture}, FutureQ),
         drain_deleted_info_futures(NewFutureQ, Count + 1, UserFun, Acc)
     end,
-    fabric2_fdb:transactional(fun(Tx) ->
-        try
-            UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
-            InitAcc = {queue:new(), 0, UserAcc1},
+    try
+        UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+        InitAcc = {queue:new(), 0, UserAcc1},
+        UserAcc3 = fabric2_fdb:transactional(fun(Tx) ->
             {FinalFutureQ, _, UserAcc2} = fabric2_fdb:list_deleted_dbs_info(
                     Tx,
                     FoldFun,
                     InitAcc,
                     Options2
                 ),
-            UserAcc3 = drain_all_deleted_info_futures(
-                    FinalFutureQ,
-                    UserFun,
-                    UserAcc2
-                ),
-            {ok, maybe_stop(UserFun(complete, UserAcc3))}
-        catch throw:{stop, FinalUserAcc} ->
-            {ok, FinalUserAcc}
-        end
-    end).
+            drain_all_deleted_info_futures(FinalFutureQ, UserFun, UserAcc2)
+        end),
+        {ok, maybe_stop(UserFun(complete, UserAcc3))}
+    catch throw:{stop, FinalUserAcc} ->
+        {ok, FinalUserAcc}
+    end.
 
 
 is_admin(Db, {SecProps}) when is_list(SecProps) ->
diff --git a/src/fabric/test/fabric2_db_crud_tests.erl b/src/fabric/test/fabric2_db_crud_tests.erl
index 3d90c65..4479c40 100644
--- a/src/fabric/test/fabric2_db_crud_tests.erl
+++ b/src/fabric/test/fabric2_db_crud_tests.erl
@@ -449,9 +449,12 @@ list_dbs_tx_too_old(_) ->
     ?assertMatch({ok, _}, fabric2_db:create(DbName1, [])),
     ?assertMatch({ok, _}, fabric2_db:create(DbName2, [])),
 
-    UserFun = fun(Row, Acc) ->
-        fabric2_test_util:tx_too_old_raise_in_user_fun(),
-        {ok, [Row | Acc]}
+    UserFun = fun
+        ({row, _} = Row, Acc) ->
+            fabric2_test_util:tx_too_old_raise_in_user_fun(),
+                {ok, [Row | Acc]};
+        (Row, Acc) ->
+                {ok, [Row | Acc]}
     end,
 
     % Get get expected output without any transactions timing out
@@ -492,9 +495,12 @@ list_dbs_info_tx_too_old(_) ->
         DbName
     end, lists:seq(1, DbCount)),
 
-    UserFun = fun(Row, Acc) ->
-        fabric2_test_util:tx_too_old_raise_in_user_fun(),
-        {ok, [Row | Acc]}
+    UserFun = fun
+        ({row, _} = Row, Acc) ->
+            fabric2_test_util:tx_too_old_raise_in_user_fun(),
+            {ok, [Row | Acc]};
+        (Row, Acc) ->
+            {ok, [Row | Acc]}
     end,
 
     % This is the expected return with no tx timeouts