You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@couchdb.apache.org by GitBox <gi...@apache.org> on 2022/04/28 19:44:11 UTC

[GitHub] [couchdb] nickva commented on a diff in pull request #4003: Improve index building during shard splitting

nickva commented on code in PR #4003:
URL: https://github.com/apache/couchdb/pull/4003#discussion_r861255228


##########
src/mem3/src/mem3_reshard_index.erl:
##########
@@ -108,46 +95,102 @@ dreyfus_indices(DbName, Doc) ->
 hastings_indices(DbName, Doc) ->
     try
         Indices = hastings_index:design_doc_to_indexes(Doc),
-        [{hastings, DbName, Index} || Index <- Indices]
+        [{?HASTINGS, DbName, Index} || Index <- Indices]
     catch
         Tag:Err ->
             Msg = "~p couldn't get hasting indices ~p ~p ~p:~p",
             couch_log:error(Msg, [?MODULE, DbName, Doc, Tag, Err]),
             []
     end.
 
-build_index({mrview, DbName, MRSt}) ->
+build_index({?MRVIEW, DbName, MRSt} = Ctx, Try) ->
     case couch_index_server:get_index(couch_mrview_index, MRSt) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(couch_index, get_state, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case couch_index:get_state(Pid, get_update_seq(DbName)) of
+                    {ok, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchError ->
+                    maybe_retry(Ctx, CatchError, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end;
-build_index({dreyfus, DbName, Index}) ->
-    case dreyfus_index_manager:get_index(DbName, Index) of
+build_index({?DREYFUS, DbName, DIndex} = Ctx, Try) ->
+    case dreyfus_index_manager:get_index(DbName, DIndex) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(dreyfus_index, await, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case dreyfus_index:await(Pid, get_update_seq(DbName)) of
+                    {ok, _, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchError ->
+                    maybe_retry(Ctx, CatchError, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end;
-build_index({hastings, DbName, Index}) ->
-    case hastings_index_manager:get_index(DbName, Index) of
+build_index({?HASTINGS, DbName, HIndex} = Ctx, Try) ->
+    case hastings_index_manager:get_index(DbName, HIndex) of
         {ok, Pid} ->
-            Args = [Pid, get_update_seq(DbName)],
-            WPid = spawn_link(hastings_index, await, Args),
-            {ok, WPid};
-        Error ->
-            Error
+            try
+                case hastings_index:await(Pid, get_update_seq(DbName)) of
+                    {ok, _} -> ok;
+                    Error -> maybe_retry(Ctx, Error, Try)
+                end
+            catch
+                _:CatchErorr ->
+                    maybe_retry(Ctx, CatchErorr, Try)
+            end;
+        OpenError ->
+            maybe_retry(Ctx, OpenError, Try)
     end.
 
+maybe_retry(Ctx, killed = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, {killed, _} = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, shutdown = Error, Try) ->
+    retry(Ctx, Error, Try);
+maybe_retry(Ctx, Error, 0) ->
+    fail(Ctx, Error);
+maybe_retry(Ctx, Error, Try) when is_integer(Try), Try > 0 ->
+    retry(Ctx, Error, Try - 1).
+
+retry(Ctx, Error, Try) ->
+    IndexInfo = index_info(Ctx),
+    LogMsg = "~p : error ~p when building ~p, retrying (~p)",
+    couch_log:warning(LogMsg, [?MODULE, Error, IndexInfo, Try]),
+    timer:sleep(retry_interval_sec() * 1000),
+    build_index(Ctx, Try).
+
+fail(Ctx, Error) ->
+    IndexInfo = index_info(Ctx),
+    LogMsg = "~p : error ~p when building ~p, max tries exceeded, failing",
+    couch_log:error(LogMsg, [?MODULE, Error, IndexInfo]),
+    exit({error_building_index, IndexInfo}).
+
+index_info({?MRVIEW, DbName, MRSt}) ->
+    GroupName = couch_mrview_index:get(idx_name, MRSt),
+    {DbName, GroupName};
+index_info({?DREYFUS, DbName, Index}) ->
+    {DbName, Index};
+index_info({?HASTINGS, DbName, Index}) ->
+    {DbName, Index}.
+
 has_app(App) ->
     code:lib_dir(App) /= {error, bad_name}.
 
 get_update_seq(DbName) ->
     couch_util:with_db(DbName, fun(Db) ->
         couch_db:get_update_seq(Db)
     end).
+
+max_retries() ->
+    config:get_integer("reshard", "index_max_retries", 5).
+
+retry_interval_sec() ->
+    config:get_integer("reshard", "index_retry_interval_sec", 10).

Review Comment:
   Good catch, this definitely should be added.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org