You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@couchdb.apache.org by GitBox <gi...@apache.org> on 2022/04/23 00:11:06 UTC

[GitHub] [couchdb] nickva opened a new pull request, #4003: Improve index building during shard splitting

nickva opened a new pull request, #4003:
URL: https://github.com/apache/couchdb/pull/4003

   Previously we didn't check responses from get_state/2 or await/2 functions when building indices. If an index updater crashed, and the index never finished building, the get_state/2 call would simply return an error and the process
   would exit normally. Then, the shard splitting job would count that as a success and continue to make progress.
   
   To fix that, make sure to check the response to all the supported indexing types and wait until they return an `ok` result.
   
   Additionally, increase the index building resilience to allow, for more retries on failure and for configurable retries for individual index builders. That might come in handy if there is any operator intervention to to clear up space by killing couch_files forcibly and deleting if there is a disk space alert.
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4003: Improve index building during shard splitting

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4003:
URL: https://github.com/apache/couchdb/pull/4003#discussion_r861256054


##########
src/mem3/src/mem3_reshard_index.erl:
##########
@@ -108,46 +95,102 @@ dreyfus_indices(DbName, Doc) ->
 hastings_indices(DbName, Doc) ->
     try
         Indices = hastings_index:design_doc_to_indexes(Doc),
-        [{hastings, DbName, Index} || Index <- Indices]
+        [{?HASTINGS, DbName, Index} || Index <- Indices]
     catch
         Tag:Err ->
             Msg = "~p couldn't get hasting indices ~p ~p ~p:~p",
             couch_log:error(Msg, [?MODULE, DbName, Doc, Tag, Err]),
             []
     end.
 
-build_index({mrview, DbName, MRSt}) ->
+build_index({?MRVIEW, DbName, MRSt} = Ctx, Try) ->

Review Comment:
   Oh good idea, that would look much better!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4003: Improve index building during shard splitting

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4003:
URL: https://github.com/apache/couchdb/pull/4003#discussion_r861284959


##########
src/mem3/src/mem3_reshard_index.erl:
##########
@@ -108,46 +95,102 @@ dreyfus_indices(DbName, Doc) ->
 hastings_indices(DbName, Doc) ->
     try
         Indices = hastings_index:design_doc_to_indexes(Doc),
-        [{hastings, DbName, Index} || Index <- Indices]
+        [{?HASTINGS, DbName, Index} || Index <- Indices]
     catch
         Tag:Err ->
             Msg = "~p couldn't get hasting indices ~p ~p ~p:~p",
             couch_log:error(Msg, [?MODULE, DbName, Doc, Tag, Err]),
             []
     end.
 
-build_index({mrview, DbName, MRSt}) ->
+build_index({?MRVIEW, DbName, MRSt} = Ctx, Try) ->

Review Comment:
   Looks much better with the your suggestion, thanks @jaydoane!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva merged pull request #4003: Improve index building during shard splitting

Posted by GitBox <gi...@apache.org>.
nickva merged PR #4003:
URL: https://github.com/apache/couchdb/pull/4003


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4003: Improve index building during shard splitting

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4003:
URL: https://github.com/apache/couchdb/pull/4003#discussion_r861268967


##########
src/mem3/test/eunit/mem3_reshard_test.erl:
##########
@@ -38,11 +38,13 @@ setup() ->
     PartProps = [{partitioned, true}, {hash, [couch_partition, hash, []]}],
     create_db(Db2, [{q, 1}, {n, 1}, {props, PartProps}]),
     config:set("reshard", "retry_interval_sec", "0", _Persist = false),
+    config:set("reshard", "index_retry_interval_sec", "0", _Persist = false),

Review Comment:
   Updated



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4003: Improve index building during shard splitting

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4003:
URL: https://github.com/apache/couchdb/pull/4003#discussion_r861255228


##########
src/mem3/src/mem3_reshard_index.erl:
##########
@@ -108,46 +95,102 @@ dreyfus_indices(DbName, Doc) ->
 hastings_indices(DbName, Doc) ->
     try
         Indices = hastings_index:design_doc_to_indexes(Doc),
-        [{hastings, DbName, Index} || Index <- Indices]
+        [{?HASTINGS, DbName, Index} || Index <- Indices]
     catch
         Tag:Err ->
             Msg = "~p couldn't get hasting indices ~p ~p ~p:~p",
             couch_log:error(Msg, [?MODULE, DbName, Doc, Tag, Err]),
             []
     end.
 
-build_index({mrview, DbName, MRSt}) ->
+build_index({?MRVIEW, DbName, MRSt} = Ctx, Try) ->
     case couch_index_server:get_index(couch_mrview_index, MRSt) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(couch_index, get_state, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case couch_index:get_state(Pid, get_update_seq(DbName)) of
+                    {ok, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchError ->
+                    maybe_retry(Ctx, CatchError, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end;
-build_index({dreyfus, DbName, Index}) ->
-    case dreyfus_index_manager:get_index(DbName, Index) of
+build_index({?DREYFUS, DbName, DIndex} = Ctx, Try) ->
+    case dreyfus_index_manager:get_index(DbName, DIndex) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(dreyfus_index, await, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case dreyfus_index:await(Pid, get_update_seq(DbName)) of
+                    {ok, _, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchError ->
+                    maybe_retry(Ctx, CatchError, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end;
-build_index({hastings, DbName, Index}) ->
-    case hastings_index_manager:get_index(DbName, Index) of
+build_index({?HASTINGS, DbName, HIndex} = Ctx, Try) ->
+    case hastings_index_manager:get_index(DbName, HIndex) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(hastings_index, await, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case hastings_index:await(Pid, get_update_seq(DbName)) of
+                    {ok, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchErorr ->
+                    maybe_retry(Ctx, CatchErorr, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end.
 
+maybe_retry(Ctx, killed = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, {killed, _} = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, shutdown = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, Error, 0) ->
+    fail(Ctx, Error);
+maybe_retry(Ctx, Error, Try) when is_integer(Try), Try > 0 ->
+    retry(Ctx, Error, Try - 1).
+
+retry(Ctx, Error, Try) ->
+    IndexInfo = index_info(Ctx),
+    LogMsg = "~p : error ~p when building ~p, retrying (~p)",
+    couch_log:warning(LogMsg, [?MODULE, Error, IndexInfo, Try]),
+    timer:sleep(retry_interval_sec() * 1000),
+    build_index(Ctx, Try).
+
+fail(Ctx, Error) ->
+    IndexInfo = index_info(Ctx),
+    LogMsg = "~p : error ~p when building ~p, max tries exceeded, failing",
+    couch_log:error(LogMsg, [?MODULE, Error, IndexInfo]),
+    exit({error_building_index, IndexInfo}).
+
+index_info({?MRVIEW, DbName, MRSt}) ->
+    GroupName = couch_mrview_index:get(idx_name, MRSt),
+    {DbName, GroupName};
+index_info({?DREYFUS, DbName, Index}) ->
+    {DbName, Index};
+index_info({?HASTINGS, DbName, Index}) ->
+    {DbName, Index}.
+
 has_app(App) ->
     code:lib_dir(App) /= {error, bad_name}.
 
 get_update_seq(DbName) ->
     couch_util:with_db(DbName, fun(Db) ->
         couch_db:get_update_seq(Db)
     end).
+
+max_retries() ->
+    config:get_integer("reshard", "index_max_retries", 5).
+
+retry_interval_sec() ->
+    config:get_integer("reshard", "index_retry_interval_sec", 10).

Review Comment:
   Good catch, definitely should be added



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] jaydoane commented on a diff in pull request #4003: Improve index building during shard splitting

Posted by GitBox <gi...@apache.org>.
jaydoane commented on code in PR #4003:
URL: https://github.com/apache/couchdb/pull/4003#discussion_r861216302


##########
src/mem3/test/eunit/mem3_reshard_test.erl:
##########
@@ -38,11 +38,13 @@ setup() ->
     PartProps = [{partitioned, true}, {hash, [couch_partition, hash, []]}],
     create_db(Db2, [{q, 1}, {n, 1}, {props, PartProps}]),
     config:set("reshard", "retry_interval_sec", "0", _Persist = false),
+    config:set("reshard", "index_retry_interval_sec", "0", _Persist = false),

Review Comment:
   The OTP 24 compiler complains:
   ```
   src/mem3/test/eunit/mem3_reshard_test.erl:41:60: Warning: variable '_Persist' is already bound. If you mean to ignore this value, use '_' or a different underscore-prefixed name
   src/mem3/test/eunit/mem3_reshard_test.erl:48:52: Warning: variable '_Persist' is already bound. If you mean to ignore this value, use '_' or a different underscore-prefixed name
   ```



##########
src/mem3/src/mem3_reshard_index.erl:
##########
@@ -108,46 +95,102 @@ dreyfus_indices(DbName, Doc) ->
 hastings_indices(DbName, Doc) ->
     try
         Indices = hastings_index:design_doc_to_indexes(Doc),
-        [{hastings, DbName, Index} || Index <- Indices]
+        [{?HASTINGS, DbName, Index} || Index <- Indices]
     catch
         Tag:Err ->
             Msg = "~p couldn't get hasting indices ~p ~p ~p:~p",
             couch_log:error(Msg, [?MODULE, DbName, Doc, Tag, Err]),
             []
     end.
 
-build_index({mrview, DbName, MRSt}) ->
+build_index({?MRVIEW, DbName, MRSt} = Ctx, Try) ->

Review Comment:
   This could be condensed somewhat by abstracting the await/retry boilerplate further:
   ```diff
   --- a/src/mem3/src/mem3_reshard_index.erl
   +++ b/src/mem3/src/mem3_reshard_index.erl
   @@ -103,42 +103,36 @@ hastings_indices(DbName, Doc) ->
                []
        end.
    
   -build_index({?MRVIEW, DbName, MRSt} = Ctx, Try) ->
   -    case couch_index_server:get_index(couch_mrview_index, MRSt) of
   -        {ok, Pid} ->
   -            try
   -                case couch_index:get_state(Pid, get_update_seq(DbName)) of
   -                    {ok, _} -> ok;
   -                    Error -> maybe_retry(Ctx, Error, Try)
   -                end
   -            catch
   -                _:CatchError ->
   -                    maybe_retry(Ctx, CatchError, Try)
   -            end;
   -        OpenError ->
   -            maybe_retry(Ctx, OpenError, Try)
   -    end;
   +build_index({?MRVIEW, _DbName, MRSt} = Ctx, Try) ->
   +    await_retry(
   +        couch_index_server:get_index(couch_mrview_index, MRSt),
   +        fun couch_index:get_state/2,
   +        Ctx,
   +        Try
   +    );
    build_index({?DREYFUS, DbName, DIndex} = Ctx, Try) ->
   -    case dreyfus_index_manager:get_index(DbName, DIndex) of
   -        {ok, Pid} ->
   -            try
   -                case dreyfus_index:await(Pid, get_update_seq(DbName)) of
   -                    {ok, _, _} -> ok;
   -                    Error -> maybe_retry(Ctx, Error, Try)
   -                end
   -            catch
   -                _:CatchError ->
   -                    maybe_retry(Ctx, CatchError, Try)
   -            end;
   -        OpenError ->
   -            maybe_retry(Ctx, OpenError, Try)
   -    end;
   +    await_retry(
   +        dreyfus_index_manager:get_index(DbName, DIndex),
   +        fun dreyfus_index:await/2,
   +        Ctx,
   +        Try
   +    );
    build_index({?HASTINGS, DbName, HIndex} = Ctx, Try) ->
   -    case hastings_index_manager:get_index(DbName, HIndex) of
   +    await_retry(
   +        hastings_index_manager:get_index(DbName, HIndex),
   +        fun hastings_index:await/2,
   +        Ctx,
   +        Try
   +    ).
   +
   +await_retry(GetIndex, AwaitIndex, Ctx, Try) ->
   +    case GetIndex of
            {ok, Pid} ->
   +            {_, DbName, _} = Ctx,
                try
   -                case hastings_index:await(Pid, get_update_seq(DbName)) of
   +                case AwaitIndex(Pid, get_update_seq(DbName)) of
                        {ok, _} -> ok;
   +                    {ok, _, _} -> ok;
                        Error -> maybe_retry(Ctx, Error, Try)
                    end
                catch
   ```
   but I'm not convinced it's worth it, other than an exercise for me to see the exact differences in the three cases, which are unfortunate.



##########
src/mem3/src/mem3_reshard_index.erl:
##########
@@ -108,46 +95,102 @@ dreyfus_indices(DbName, Doc) ->
 hastings_indices(DbName, Doc) ->
     try
         Indices = hastings_index:design_doc_to_indexes(Doc),
-        [{hastings, DbName, Index} || Index <- Indices]
+        [{?HASTINGS, DbName, Index} || Index <- Indices]
     catch
         Tag:Err ->
             Msg = "~p couldn't get hasting indices ~p ~p ~p:~p",
             couch_log:error(Msg, [?MODULE, DbName, Doc, Tag, Err]),
             []
     end.
 
-build_index({mrview, DbName, MRSt}) ->
+build_index({?MRVIEW, DbName, MRSt} = Ctx, Try) ->
     case couch_index_server:get_index(couch_mrview_index, MRSt) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(couch_index, get_state, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case couch_index:get_state(Pid, get_update_seq(DbName)) of
+                    {ok, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchError ->
+                    maybe_retry(Ctx, CatchError, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end;
-build_index({dreyfus, DbName, Index}) ->
-    case dreyfus_index_manager:get_index(DbName, Index) of
+build_index({?DREYFUS, DbName, DIndex} = Ctx, Try) ->
+    case dreyfus_index_manager:get_index(DbName, DIndex) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(dreyfus_index, await, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case dreyfus_index:await(Pid, get_update_seq(DbName)) of
+                    {ok, _, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchError ->
+                    maybe_retry(Ctx, CatchError, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end;
-build_index({hastings, DbName, Index}) ->
-    case hastings_index_manager:get_index(DbName, Index) of
+build_index({?HASTINGS, DbName, HIndex} = Ctx, Try) ->
+    case hastings_index_manager:get_index(DbName, HIndex) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(hastings_index, await, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case hastings_index:await(Pid, get_update_seq(DbName)) of
+                    {ok, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchErorr ->
+                    maybe_retry(Ctx, CatchErorr, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end.
 
+maybe_retry(Ctx, killed = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, {killed, _} = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, shutdown = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, Error, 0) ->
+    fail(Ctx, Error);
+maybe_retry(Ctx, Error, Try) when is_integer(Try), Try > 0 ->
+    retry(Ctx, Error, Try - 1).
+
+retry(Ctx, Error, Try) ->
+    IndexInfo = index_info(Ctx),
+    LogMsg = "~p : error ~p when building ~p, retrying (~p)",
+    couch_log:warning(LogMsg, [?MODULE, Error, IndexInfo, Try]),
+    timer:sleep(retry_interval_sec() * 1000),
+    build_index(Ctx, Try).
+
+fail(Ctx, Error) ->
+    IndexInfo = index_info(Ctx),
+    LogMsg = "~p : error ~p when building ~p, max tries exceeded, failing",
+    couch_log:error(LogMsg, [?MODULE, Error, IndexInfo]),
+    exit({error_building_index, IndexInfo}).
+
+index_info({?MRVIEW, DbName, MRSt}) ->
+    GroupName = couch_mrview_index:get(idx_name, MRSt),
+    {DbName, GroupName};
+index_info({?DREYFUS, DbName, Index}) ->
+    {DbName, Index};
+index_info({?HASTINGS, DbName, Index}) ->
+    {DbName, Index}.
+
 has_app(App) ->
     code:lib_dir(App) /= {error, bad_name}.
 
 get_update_seq(DbName) ->
     couch_util:with_db(DbName, fun(Db) ->
         couch_db:get_update_seq(Db)
     end).
+
+max_retries() ->
+    config:get_integer("reshard", "index_max_retries", 5).
+
+retry_interval_sec() ->
+    config:get_integer("reshard", "index_retry_interval_sec", 10).

Review Comment:
   Should these defaults be added to `default.ini`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4003: Improve index building during shard splitting

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4003:
URL: https://github.com/apache/couchdb/pull/4003#discussion_r861255228


##########
src/mem3/src/mem3_reshard_index.erl:
##########
@@ -108,46 +95,102 @@ dreyfus_indices(DbName, Doc) ->
 hastings_indices(DbName, Doc) ->
     try
         Indices = hastings_index:design_doc_to_indexes(Doc),
-        [{hastings, DbName, Index} || Index <- Indices]
+        [{?HASTINGS, DbName, Index} || Index <- Indices]
     catch
         Tag:Err ->
             Msg = "~p couldn't get hasting indices ~p ~p ~p:~p",
             couch_log:error(Msg, [?MODULE, DbName, Doc, Tag, Err]),
             []
     end.
 
-build_index({mrview, DbName, MRSt}) ->
+build_index({?MRVIEW, DbName, MRSt} = Ctx, Try) ->
     case couch_index_server:get_index(couch_mrview_index, MRSt) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(couch_index, get_state, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case couch_index:get_state(Pid, get_update_seq(DbName)) of
+                    {ok, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchError ->
+                    maybe_retry(Ctx, CatchError, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end;
-build_index({dreyfus, DbName, Index}) ->
-    case dreyfus_index_manager:get_index(DbName, Index) of
+build_index({?DREYFUS, DbName, DIndex} = Ctx, Try) ->
+    case dreyfus_index_manager:get_index(DbName, DIndex) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(dreyfus_index, await, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case dreyfus_index:await(Pid, get_update_seq(DbName)) of
+                    {ok, _, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchError ->
+                    maybe_retry(Ctx, CatchError, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end;
-build_index({hastings, DbName, Index}) ->
-    case hastings_index_manager:get_index(DbName, Index) of
+build_index({?HASTINGS, DbName, HIndex} = Ctx, Try) ->
+    case hastings_index_manager:get_index(DbName, HIndex) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(hastings_index, await, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case hastings_index:await(Pid, get_update_seq(DbName)) of
+                    {ok, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchErorr ->
+                    maybe_retry(Ctx, CatchErorr, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end.
 
+maybe_retry(Ctx, killed = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, {killed, _} = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, shutdown = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, Error, 0) ->
+    fail(Ctx, Error);
+maybe_retry(Ctx, Error, Try) when is_integer(Try), Try > 0 ->
+    retry(Ctx, Error, Try - 1).
+
+retry(Ctx, Error, Try) ->
+    IndexInfo = index_info(Ctx),
+    LogMsg = "~p : error ~p when building ~p, retrying (~p)",
+    couch_log:warning(LogMsg, [?MODULE, Error, IndexInfo, Try]),
+    timer:sleep(retry_interval_sec() * 1000),
+    build_index(Ctx, Try).
+
+fail(Ctx, Error) ->
+    IndexInfo = index_info(Ctx),
+    LogMsg = "~p : error ~p when building ~p, max tries exceeded, failing",
+    couch_log:error(LogMsg, [?MODULE, Error, IndexInfo]),
+    exit({error_building_index, IndexInfo}).
+
+index_info({?MRVIEW, DbName, MRSt}) ->
+    GroupName = couch_mrview_index:get(idx_name, MRSt),
+    {DbName, GroupName};
+index_info({?DREYFUS, DbName, Index}) ->
+    {DbName, Index};
+index_info({?HASTINGS, DbName, Index}) ->
+    {DbName, Index}.
+
 has_app(App) ->
     code:lib_dir(App) /= {error, bad_name}.
 
 get_update_seq(DbName) ->
     couch_util:with_db(DbName, fun(Db) ->
         couch_db:get_update_seq(Db)
     end).
+
+max_retries() ->
+    config:get_integer("reshard", "index_max_retries", 5).
+
+retry_interval_sec() ->
+    config:get_integer("reshard", "index_retry_interval_sec", 10).

Review Comment:
   Good catch, definitely should be added. Added in a fixup commit



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org