You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2021/04/27 18:33:46 UTC

[couchdb] branch main updated (33ca754 -> ff2242c)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git.


    from 33ca754  Fix flaky couch_jobs metadata test
     new d701176  Improve tx retry resilience when transaction restart
     new ff2242c  Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/fabric/src/fabric2_db.erl                  | 67 +++++++++++---------------
 src/fabric/src/fabric2_fdb.erl                 |  6 +--
 src/fabric/src/fabric2_server.erl              |  9 +++-
 src/fabric/test/fabric2_changes_fold_tests.erl |  4 ++
 src/fabric/test/fabric2_db_crud_tests.erl      | 18 ++++---
 5 files changed, 55 insertions(+), 49 deletions(-)

[couchdb] 02/02: Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit ff2242c5f90fd307eb5df52bc09952b1e580e3f2
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Mon Apr 26 15:30:15 2021 -0400

    Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs
    
    After recent improvements to how retries are handled in `fabric2_fdb`, Elixir
    integration tests can often pass when running under "buggify" mode. The chance
    of re-sending response data during retries is now much lower, too. However,
    there are still some instances of that type of failure when running `_all_dbs`
    tests. To trigger it would have to run the all_dbs test from basics_tests.exs a
    few thousands times in a row. The reason for the failure is that retryable
    errors might be still thrown during the `LayerPrefix = get_dir(Tx)` call, and
    the whole transaction closure would be retried in [2]. When that happens,
    user's callback is called twice with `meta` and it sends `[` twice in a row to
    the client [3], which is incorrect.
    
    A simple fix is to not call `meta` or `complete` from the transaction context.
    That works because we don't pass the transaction object into user's callback
    and the user won't be able to run another transaction in the same process
    anyway.
    
    There are already tests which test retriable errors in _all_dbsa and _dbs_info,
    but they had been updated to only throw retriable errors when rows are emitted
    to match the new behavior.
    
    [0] https://github.com/apache/couchdb/commit/acb43e12fd7fddc6f606246875909f7c7df27324
    
    [1] ```
    ERL_ZFLAGS="-erlfdb network_options '[client_buggify_enable, {client_buggify_section_activated_probability, 35}, {client_buggify_section_fired_probability, 35}]'" make elixir tests=test/elixir/test/basics_test.exs:71
    ```
    [2] https://github.com/apache/couchdb/blob/082f8078411aab7d71cc86afca0fe2eff3104b01/src/fabric/src/fabric2_db.erl#L279-L287
    
    [3] https://github.com/apache/couchdb/blob/082f8078411aab7d71cc86afca0fe2eff3104b01/src/chttpd/src/chttpd_misc.erl#L137
---
 src/fabric/src/fabric2_db.erl             | 67 +++++++++++++------------------
 src/fabric/test/fabric2_db_crud_tests.erl | 18 ++++++---
 2 files changed, 41 insertions(+), 44 deletions(-)

diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl
index 0074865..a5de00c 100644
--- a/src/fabric/src/fabric2_db.erl
+++ b/src/fabric/src/fabric2_db.erl
@@ -275,20 +275,15 @@ list_dbs(UserFun, UserAcc0, Options) ->
     FoldFun = fun
         (DbName, Acc) -> maybe_stop(UserFun({row, [{id, DbName}]}, Acc))
     end,
-    fabric2_fdb:transactional(fun(Tx) ->
-        try
-            UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
-            UserAcc2 = fabric2_fdb:list_dbs(
-                    Tx,
-                    FoldFun,
-                    UserAcc1,
-                    Options
-                ),
-            {ok, maybe_stop(UserFun(complete, UserAcc2))}
-        catch throw:{stop, FinalUserAcc} ->
-            {ok, FinalUserAcc}
-        end
-    end).
+    try
+        UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+        UserAcc2 = fabric2_fdb:transactional(fun(Tx) ->
+            fabric2_fdb:list_dbs(Tx, FoldFun, UserAcc1, Options)
+        end),
+        {ok, maybe_stop(UserFun(complete, UserAcc2))}
+    catch throw:{stop, FinalUserAcc} ->
+        {ok, FinalUserAcc}
+    end.
 
 
 list_dbs_info() ->
@@ -313,22 +308,22 @@ list_dbs_info(UserFun, UserAcc0, Options) ->
         NewFutureQ = queue:in({DbName, InfoFuture}, FutureQ),
         drain_info_futures(NewFutureQ, Count + 1, UserFun, Acc)
     end,
-    fabric2_fdb:transactional(fun(Tx) ->
-        try
-            UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
-            InitAcc = {queue:new(), 0, UserAcc1},
+    try
+        UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+        InitAcc = {queue:new(), 0, UserAcc1},
+        UserAcc3 = fabric2_fdb:transactional(fun(Tx) ->
             {FinalFutureQ, _, UserAcc2} = fabric2_fdb:list_dbs_info(
                     Tx,
                     FoldFun,
                     InitAcc,
                     Options
                 ),
-            UserAcc3 = drain_all_info_futures(FinalFutureQ, UserFun, UserAcc2),
-            {ok, maybe_stop(UserFun(complete, UserAcc3))}
-        catch throw:{stop, FinalUserAcc} ->
-            {ok, FinalUserAcc}
-        end
-    end).
+            drain_all_info_futures(FinalFutureQ, UserFun, UserAcc2)
+        end),
+        {ok, maybe_stop(UserFun(complete, UserAcc3))}
+    catch throw:{stop, FinalUserAcc} ->
+        {ok, FinalUserAcc}
+    end.
 
 
 list_deleted_dbs_info() ->
@@ -390,26 +385,22 @@ list_deleted_dbs_info(UserFun, UserAcc0, Options0) ->
         NewFutureQ = queue:in({DbName, TimeStamp, InfoFuture}, FutureQ),
         drain_deleted_info_futures(NewFutureQ, Count + 1, UserFun, Acc)
     end,
-    fabric2_fdb:transactional(fun(Tx) ->
-        try
-            UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
-            InitAcc = {queue:new(), 0, UserAcc1},
+    try
+        UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+        InitAcc = {queue:new(), 0, UserAcc1},
+        UserAcc3 = fabric2_fdb:transactional(fun(Tx) ->
             {FinalFutureQ, _, UserAcc2} = fabric2_fdb:list_deleted_dbs_info(
                     Tx,
                     FoldFun,
                     InitAcc,
                     Options2
                 ),
-            UserAcc3 = drain_all_deleted_info_futures(
-                    FinalFutureQ,
-                    UserFun,
-                    UserAcc2
-                ),
-            {ok, maybe_stop(UserFun(complete, UserAcc3))}
-        catch throw:{stop, FinalUserAcc} ->
-            {ok, FinalUserAcc}
-        end
-    end).
+            drain_all_deleted_info_futures(FinalFutureQ, UserFun, UserAcc2)
+        end),
+        {ok, maybe_stop(UserFun(complete, UserAcc3))}
+    catch throw:{stop, FinalUserAcc} ->
+        {ok, FinalUserAcc}
+    end.
 
 
 is_admin(Db, {SecProps}) when is_list(SecProps) ->
diff --git a/src/fabric/test/fabric2_db_crud_tests.erl b/src/fabric/test/fabric2_db_crud_tests.erl
index 3d90c65..ab157d8 100644
--- a/src/fabric/test/fabric2_db_crud_tests.erl
+++ b/src/fabric/test/fabric2_db_crud_tests.erl
@@ -449,9 +449,12 @@ list_dbs_tx_too_old(_) ->
     ?assertMatch({ok, _}, fabric2_db:create(DbName1, [])),
     ?assertMatch({ok, _}, fabric2_db:create(DbName2, [])),
 
-    UserFun = fun(Row, Acc) ->
-        fabric2_test_util:tx_too_old_raise_in_user_fun(),
-        {ok, [Row | Acc]}
+    UserFun = fun
+        ({row, _} = Row, Acc) ->
+            fabric2_test_util:tx_too_old_raise_in_user_fun(),
+            {ok, [Row | Acc]};
+        (Row, Acc) ->
+            {ok, [Row | Acc]}
     end,
 
     % Get get expected output without any transactions timing out
@@ -492,9 +495,12 @@ list_dbs_info_tx_too_old(_) ->
         DbName
     end, lists:seq(1, DbCount)),
 
-    UserFun = fun(Row, Acc) ->
-        fabric2_test_util:tx_too_old_raise_in_user_fun(),
-        {ok, [Row | Acc]}
+    UserFun = fun
+        ({row, _} = Row, Acc) ->
+            fabric2_test_util:tx_too_old_raise_in_user_fun(),
+            {ok, [Row | Acc]};
+        (Row, Acc) ->
+            {ok, [Row | Acc]}
     end,
 
     % This is the expected return with no tx timeouts

[couchdb] 01/02: Improve tx retry resilience when transaction restart

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit d7011765673e649ac2adcde3db2beede920f40be
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Mon Apr 26 15:25:42 2021 -0400

    Improve tx retry resilience when transaction restart
    
    When running integration tests with a "buggified" client [1], sometimes
    `fold_range_not_progressing` is triggered since it's possible retriable errors
    might be thrown 3 times in a row. Instead of bumping it arbitrarily, since we
    already have a retry limit in fabric2_server, start using that.
    
    [1] ```
     ERL_ZFLAGS="-erlfdb network_options '[client_buggify_enable, {client_buggify_section_activated_probability, 25}, {client_buggify_section_fired_probability, 25}]'" make elixir tests=test/elixir/test/basics_test.exs
    ```
---
 src/fabric/src/fabric2_fdb.erl                 | 6 ++----
 src/fabric/src/fabric2_server.erl              | 9 ++++++++-
 src/fabric/test/fabric2_changes_fold_tests.erl | 4 ++++
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index a4c3f89..e86b037 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -91,9 +91,6 @@
 -include("fabric2.hrl").
 
 
--define(MAX_FOLD_RANGE_RETRIES, 3).
-
-
 -record(fold_acc, {
     db,
     restart_tx,
@@ -1882,10 +1879,11 @@ restart_fold(Tx, #fold_acc{} = Acc) ->
 
     ok = erlfdb:reset(Tx),
 
+    MaxRetries = fabric2_server:get_retry_limit(),
     case {erase(?PDICT_FOLD_ACC_STATE), Acc#fold_acc.retries} of
         {#fold_acc{db = Db} = Acc1, _} ->
             Acc1#fold_acc{db = check_db_instance(Db), retries = 0};
-        {undefined, Retries} when Retries < ?MAX_FOLD_RANGE_RETRIES ->
+        {undefined, Retries} when Retries < MaxRetries ->
             Db = check_db_instance(Acc#fold_acc.db),
             Acc#fold_acc{db = Db, retries = Retries + 1};
         {undefined, _} ->
diff --git a/src/fabric/src/fabric2_server.erl b/src/fabric/src/fabric2_server.erl
index e427f20..8a4a8d8 100644
--- a/src/fabric/src/fabric2_server.erl
+++ b/src/fabric/src/fabric2_server.erl
@@ -27,7 +27,8 @@
     maybe_remove/1,
 
     fdb_directory/0,
-    fdb_cluster/0
+    fdb_cluster/0,
+    get_retry_limit/0
 ]).
 
 
@@ -194,6 +195,12 @@ fdb_directory() ->
 fdb_cluster() ->
     get_env(?FDB_CLUSTER).
 
+
+get_retry_limit() ->
+    Default = list_to_integer(?DEFAULT_RETRY_LIMIT),
+    config:get_integer(?TX_OPTIONS_SECTION, "retry_limit", Default).
+
+
 get_env(Key) ->
     case get(Key) of
         undefined ->
diff --git a/src/fabric/test/fabric2_changes_fold_tests.erl b/src/fabric/test/fabric2_changes_fold_tests.erl
index a8578f9..2f63883 100644
--- a/src/fabric/test/fabric2_changes_fold_tests.erl
+++ b/src/fabric/test/fabric2_changes_fold_tests.erl
@@ -81,6 +81,7 @@ changes_fold_test_() ->
 setup_all() ->
     Ctx = test_util:start_couch([fabric]),
     meck:new(erlfdb, [passthrough]),
+    meck:new(fabric2_server, [passthrough]),
     Ctx.
 
 
@@ -91,6 +92,7 @@ teardown_all(Ctx) ->
 
 setup() ->
     fabric2_test_util:tx_too_old_mock_erlfdb(),
+    meck:expect(fabric2_server, get_retry_limit, 0, 3),
     {ok, Db} = fabric2_db:create(?tempdb(), [{user_ctx, ?ADMIN_USER}]),
     Rows = lists:map(fun(Val) ->
         DocId = fabric2_util:uuid(),
@@ -111,6 +113,8 @@ setup() ->
 
 
 cleanup({Db, _DocIdRevs}) ->
+    meck:reset(fabric2_server),
+    meck:expect(fabric2_server, get_retry_limit, 0, meck:passthrough()),
     fabric2_test_util:tx_too_old_reset_errors(),
     ok = fabric2_db:delete(fabric2_db:name(Db), []).