You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2021/04/26 20:09:40 UTC
[couchdb] branch improve-retry-handling-in-dbs-endpoints created
(now 979a597)
This is an automated email from the ASF dual-hosted git repository.
vatamane pushed a change to branch improve-retry-handling-in-dbs-endpoints
in repository https://gitbox.apache.org/repos/asf/couchdb.git.
at 979a597 Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs
This branch includes the following new commits:
new 12c34bb Improve tx retry resilience when transaction restart
new 979a597 Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
[couchdb] 01/02: Improve tx retry resilience when transaction
restart
Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
vatamane pushed a commit to branch improve-retry-handling-in-dbs-endpoints
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 12c34bb5ad970100b08b80cbbb374d07fa8bd485
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Mon Apr 26 15:25:42 2021 -0400
Improve tx retry resilience when transaction restart
When running integration tests with a "buggified" client [1], sometimes
`fold_range_not_progressing` is triggered since it's possible retriable errors
might be thrown 3 times in a row. Instead of bumping it arbitrarily, since we
already have a retry limit in fabric2_server, start using that.
[1] ```
ERL_ZFLAGS="-erlfdb network_options '[client_buggify_enable, {client_buggify_section_activated_probability, 25}, {client_buggify_section_fired_probability, 25}]'" make elixir tests=test/elixir/test/basics_test.exs
```
---
src/fabric/src/fabric2_fdb.erl | 6 ++----
src/fabric/src/fabric2_server.erl | 9 ++++++++-
src/fabric/test/fabric2_changes_fold_tests.erl | 4 ++++
3 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index a4c3f89..e86b037 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -91,9 +91,6 @@
-include("fabric2.hrl").
--define(MAX_FOLD_RANGE_RETRIES, 3).
-
-
-record(fold_acc, {
db,
restart_tx,
@@ -1882,10 +1879,11 @@ restart_fold(Tx, #fold_acc{} = Acc) ->
ok = erlfdb:reset(Tx),
+ MaxRetries = fabric2_server:get_retry_limit(),
case {erase(?PDICT_FOLD_ACC_STATE), Acc#fold_acc.retries} of
{#fold_acc{db = Db} = Acc1, _} ->
Acc1#fold_acc{db = check_db_instance(Db), retries = 0};
- {undefined, Retries} when Retries < ?MAX_FOLD_RANGE_RETRIES ->
+ {undefined, Retries} when Retries < MaxRetries ->
Db = check_db_instance(Acc#fold_acc.db),
Acc#fold_acc{db = Db, retries = Retries + 1};
{undefined, _} ->
diff --git a/src/fabric/src/fabric2_server.erl b/src/fabric/src/fabric2_server.erl
index e427f20..8a4a8d8 100644
--- a/src/fabric/src/fabric2_server.erl
+++ b/src/fabric/src/fabric2_server.erl
@@ -27,7 +27,8 @@
maybe_remove/1,
fdb_directory/0,
- fdb_cluster/0
+ fdb_cluster/0,
+ get_retry_limit/0
]).
@@ -194,6 +195,12 @@ fdb_directory() ->
fdb_cluster() ->
get_env(?FDB_CLUSTER).
+
+get_retry_limit() ->
+ Default = list_to_integer(?DEFAULT_RETRY_LIMIT),
+ config:get_integer(?TX_OPTIONS_SECTION, "retry_limit", Default).
+
+
get_env(Key) ->
case get(Key) of
undefined ->
diff --git a/src/fabric/test/fabric2_changes_fold_tests.erl b/src/fabric/test/fabric2_changes_fold_tests.erl
index a8578f9..2f63883 100644
--- a/src/fabric/test/fabric2_changes_fold_tests.erl
+++ b/src/fabric/test/fabric2_changes_fold_tests.erl
@@ -81,6 +81,7 @@ changes_fold_test_() ->
setup_all() ->
Ctx = test_util:start_couch([fabric]),
meck:new(erlfdb, [passthrough]),
+ meck:new(fabric2_server, [passthrough]),
Ctx.
@@ -91,6 +92,7 @@ teardown_all(Ctx) ->
setup() ->
fabric2_test_util:tx_too_old_mock_erlfdb(),
+ meck:expect(fabric2_server, get_retry_limit, 0, 3),
{ok, Db} = fabric2_db:create(?tempdb(), [{user_ctx, ?ADMIN_USER}]),
Rows = lists:map(fun(Val) ->
DocId = fabric2_util:uuid(),
@@ -111,6 +113,8 @@ setup() ->
cleanup({Db, _DocIdRevs}) ->
+ meck:reset(fabric2_server),
+ meck:expect(fabric2_server, get_retry_limit, 0, meck:passthrough()),
fabric2_test_util:tx_too_old_reset_errors(),
ok = fabric2_db:delete(fabric2_db:name(Db), []).
[couchdb] 02/02: Improve transaction retry behavior for _all_dbs,
_dbs_info and _deleted_dbs
Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
vatamane pushed a commit to branch improve-retry-handling-in-dbs-endpoints
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 979a597dcd6b2a774eab186c19caa9dbc31ee872
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Mon Apr 26 15:30:15 2021 -0400
Improve transaction retry behavior for _all_dbs, _dbs_info and _deleted_dbs
After recent improvements to how retries are handled in `fabric2_fdb`, Elixir
integration tests can often pass when running under "buggify" mode. The chance
of re-sending response data during retries is now much lower, too. However,
there are still some instances of that type of failure when running `_all_dbs`
tests. To trigger it would have to run the all_dbs test from basics_tests.exs a
few thousands times in a row. The reason for the failure is that retryable
errors might be still thrown during the `LayerPrefix = get_dir(Tx)` call, and
the whole transaction closure would be retried in [2]. When that happens,
user's callback is called twice with `meta` and it sends `[` twice in a row to
the client [3], which is incorrect.
A simple fix is to not call `meta` or `complete` from the transaction context.
That works because we don't pass the transaction object into user's callback
and the user won't be able to run another transaction in the same process
anyway.
There are already tests which test retriable errors in _all_dbsa and _dbs_info,
but they had been updated to only throw retriable errors when rows are emitted
to match the new behavior.
[0] https://github.com/apache/couchdb/commit/acb43e12fd7fddc6f606246875909f7c7df27324
[1] ```
ERL_ZFLAGS="-erlfdb network_options '[client_buggify_enable, {client_buggify_section_activated_probability, 35}, {client_buggify_section_fired_probability, 35}]'" make elixir tests=test/elixir/test/basics_test.exs:71
```
[2] https://github.com/apache/couchdb/blob/082f8078411aab7d71cc86afca0fe2eff3104b01/src/fabric/src/fabric2_db.erl#L279-L287
[3] https://github.com/apache/couchdb/blob/082f8078411aab7d71cc86afca0fe2eff3104b01/src/chttpd/src/chttpd_misc.erl#L137
---
src/fabric/src/fabric2_db.erl | 67 +++++++++++++------------------
src/fabric/test/fabric2_db_crud_tests.erl | 18 ++++++---
2 files changed, 41 insertions(+), 44 deletions(-)
diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl
index 0074865..a5de00c 100644
--- a/src/fabric/src/fabric2_db.erl
+++ b/src/fabric/src/fabric2_db.erl
@@ -275,20 +275,15 @@ list_dbs(UserFun, UserAcc0, Options) ->
FoldFun = fun
(DbName, Acc) -> maybe_stop(UserFun({row, [{id, DbName}]}, Acc))
end,
- fabric2_fdb:transactional(fun(Tx) ->
- try
- UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
- UserAcc2 = fabric2_fdb:list_dbs(
- Tx,
- FoldFun,
- UserAcc1,
- Options
- ),
- {ok, maybe_stop(UserFun(complete, UserAcc2))}
- catch throw:{stop, FinalUserAcc} ->
- {ok, FinalUserAcc}
- end
- end).
+ try
+ UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+ UserAcc2 = fabric2_fdb:transactional(fun(Tx) ->
+ fabric2_fdb:list_dbs(Tx, FoldFun, UserAcc1, Options)
+ end),
+ {ok, maybe_stop(UserFun(complete, UserAcc2))}
+ catch throw:{stop, FinalUserAcc} ->
+ {ok, FinalUserAcc}
+ end.
list_dbs_info() ->
@@ -313,22 +308,22 @@ list_dbs_info(UserFun, UserAcc0, Options) ->
NewFutureQ = queue:in({DbName, InfoFuture}, FutureQ),
drain_info_futures(NewFutureQ, Count + 1, UserFun, Acc)
end,
- fabric2_fdb:transactional(fun(Tx) ->
- try
- UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
- InitAcc = {queue:new(), 0, UserAcc1},
+ try
+ UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+ InitAcc = {queue:new(), 0, UserAcc1},
+ UserAcc3 = fabric2_fdb:transactional(fun(Tx) ->
{FinalFutureQ, _, UserAcc2} = fabric2_fdb:list_dbs_info(
Tx,
FoldFun,
InitAcc,
Options
),
- UserAcc3 = drain_all_info_futures(FinalFutureQ, UserFun, UserAcc2),
- {ok, maybe_stop(UserFun(complete, UserAcc3))}
- catch throw:{stop, FinalUserAcc} ->
- {ok, FinalUserAcc}
- end
- end).
+ drain_all_info_futures(FinalFutureQ, UserFun, UserAcc2)
+ end),
+ {ok, maybe_stop(UserFun(complete, UserAcc3))}
+ catch throw:{stop, FinalUserAcc} ->
+ {ok, FinalUserAcc}
+ end.
list_deleted_dbs_info() ->
@@ -390,26 +385,22 @@ list_deleted_dbs_info(UserFun, UserAcc0, Options0) ->
NewFutureQ = queue:in({DbName, TimeStamp, InfoFuture}, FutureQ),
drain_deleted_info_futures(NewFutureQ, Count + 1, UserFun, Acc)
end,
- fabric2_fdb:transactional(fun(Tx) ->
- try
- UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
- InitAcc = {queue:new(), 0, UserAcc1},
+ try
+ UserAcc1 = maybe_stop(UserFun({meta, []}, UserAcc0)),
+ InitAcc = {queue:new(), 0, UserAcc1},
+ UserAcc3 = fabric2_fdb:transactional(fun(Tx) ->
{FinalFutureQ, _, UserAcc2} = fabric2_fdb:list_deleted_dbs_info(
Tx,
FoldFun,
InitAcc,
Options2
),
- UserAcc3 = drain_all_deleted_info_futures(
- FinalFutureQ,
- UserFun,
- UserAcc2
- ),
- {ok, maybe_stop(UserFun(complete, UserAcc3))}
- catch throw:{stop, FinalUserAcc} ->
- {ok, FinalUserAcc}
- end
- end).
+ drain_all_deleted_info_futures(FinalFutureQ, UserFun, UserAcc2)
+ end),
+ {ok, maybe_stop(UserFun(complete, UserAcc3))}
+ catch throw:{stop, FinalUserAcc} ->
+ {ok, FinalUserAcc}
+ end.
is_admin(Db, {SecProps}) when is_list(SecProps) ->
diff --git a/src/fabric/test/fabric2_db_crud_tests.erl b/src/fabric/test/fabric2_db_crud_tests.erl
index 3d90c65..4479c40 100644
--- a/src/fabric/test/fabric2_db_crud_tests.erl
+++ b/src/fabric/test/fabric2_db_crud_tests.erl
@@ -449,9 +449,12 @@ list_dbs_tx_too_old(_) ->
?assertMatch({ok, _}, fabric2_db:create(DbName1, [])),
?assertMatch({ok, _}, fabric2_db:create(DbName2, [])),
- UserFun = fun(Row, Acc) ->
- fabric2_test_util:tx_too_old_raise_in_user_fun(),
- {ok, [Row | Acc]}
+ UserFun = fun
+ ({row, _} = Row, Acc) ->
+ fabric2_test_util:tx_too_old_raise_in_user_fun(),
+ {ok, [Row | Acc]};
+ (Row, Acc) ->
+ {ok, [Row | Acc]}
end,
% Get get expected output without any transactions timing out
@@ -492,9 +495,12 @@ list_dbs_info_tx_too_old(_) ->
DbName
end, lists:seq(1, DbCount)),
- UserFun = fun(Row, Acc) ->
- fabric2_test_util:tx_too_old_raise_in_user_fun(),
- {ok, [Row | Acc]}
+ UserFun = fun
+ ({row, _} = Row, Acc) ->
+ fabric2_test_util:tx_too_old_raise_in_user_fun(),
+ {ok, [Row | Acc]};
+ (Row, Acc) ->
+ {ok, [Row | Acc]}
end,
% This is the expected return with no tx timeouts