You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2023/10/17 18:57:54 UTC

[couchdb] branch fix-index-server-premature-return created (now a44b3fc1c)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch fix-index-server-premature-return
in repository https://gitbox.apache.org/repos/asf/couchdb.git


      at a44b3fc1c Prevent delayed opener error from crashing index servers

This branch includes the following new commits:

     new 73c8adadf Fix index server crash when async opener returns too early
     new a44b3fc1c Prevent delayed opener error from crashing index servers

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[couchdb] 02/02: Prevent delayed opener error from crashing index servers

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch fix-index-server-premature-return
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit a44b3fc1c5b49e63a361a2a13df8c8151309e2bf
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Tue Oct 17 14:39:18 2023 -0400

    Prevent delayed opener error from crashing index servers
    
    Previously, an index and opener process dying could have resulted in the index
    gen_server crashing. This was observed in the CI test as described in:
    https://github.com/apache/couchdb/issues/4809
    
    The process in more detail was as follows:
    
      * When an async opener result is handled in the index server, there is a
        period of time when the index server is linked to both the index and the
        opener process.
    
      * After we reply early to the waiting clients, a client may do something to
        cause the indexer to crash, which would crash the opener along with it.
        That would generate two `{'EXIT', Pid, _}` messages queued in the indexer
        process' mailbox.
    
      * The index gen_server, is still processing the async opener result callback,
        where it would remove the openener from the `openers` table, then it
        returns `ok` to the async opener.
    
      * Index gen_server continues processing queued `EXIT` messages in `handle_info`:
         - The one for the indexer Pid is handled appropriately
         - The one for the opener leads to an eexit(...)` clause since we ended
           with an unknown process exiting.
    
    To avoid the race condition, and the extra opener `EXIT` message, unlink and
    reply early to the opener, as soon we linked to the indexer or had received the
    error. To avoid the small chance of still getting an `EXIT` message from the
    opener, in case it crashed right before we unlinked, flush any exit messages
    from it. We do a similar flushing in two other places so create small utility
    function to avoid duplicating the code too much.
    
    Fix: https://github.com/apache/couchdb/issues/4809
---
 src/couch_index/src/couch_index_server.erl | 44 ++++++++++++++++++------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/couch_index/src/couch_index_server.erl b/src/couch_index/src/couch_index_server.erl
index 5055382d5..807f87a88 100644
--- a/src/couch_index/src/couch_index_server.erl
+++ b/src/couch_index/src/couch_index_server.erl
@@ -167,19 +167,29 @@ handle_call({get_index, {_Mod, _IdxState, DbName, Sig} = Args}, From, State) ->
         [{_, Pid}] when is_pid(Pid) ->
             {reply, {ok, Pid}, State}
     end;
-handle_call({async_open, {DbName, DDocId, Sig}, {ok, Pid}}, {OpenerPid, _}, State) ->
-    [{_, Waiters}] = ets:lookup(State#st.by_sig, {DbName, Sig}),
+handle_call({async_open, {DbName, DDocId, Sig}, {ok, Pid}}, {OpenerPid, _} = FromOpener, State) ->
     link(Pid),
+    % Once linked with the indexer, dismiss and ignore the opener.
+    unlink(OpenerPid),
     ets:delete(State#st.openers, OpenerPid),
+    gen_server:reply(FromOpener, ok),
+    [{_, Waiters}] = ets:lookup(State#st.by_sig, {DbName, Sig}),
     add_to_ets(DbName, Sig, DDocId, Pid, State),
     [gen_server:reply(From, {ok, Pid}) || From <- Waiters],
-    {reply, ok, State};
-handle_call({async_error, {DbName, _DDocId, Sig}, Error}, {OpenerPid, _}, State) ->
-    [{_, Waiters}] = ets:lookup(State#st.by_sig, {DbName, Sig}),
+    % Flush opener exit messages in case it died before we unlinked it
+    ok = flush_exit_messages_from(OpenerPid),
+    {noreply, State};
+handle_call({async_error, {DbName, _DDocId, Sig}, Error}, {OpenerPid, _} = FromOpener, State) ->
+    % Once opener reported the error, we can dismiss the opener
+    unlink(OpenerPid),
     ets:delete(State#st.openers, OpenerPid),
+    gen_server:reply(FromOpener, ok),
+    [{_, Waiters}] = ets:lookup(State#st.by_sig, {DbName, Sig}),
     ets:delete(State#st.by_sig, {DbName, Sig}),
     [gen_server:reply(From, Error) || From <- Waiters],
-    {reply, ok, State};
+    % Flush opener exit messages in case it died before we unlinked it
+    ok = flush_exit_messages_from(OpenerPid),
+    {noreply, State};
 handle_call({reset_indexes, DbName}, _From, State) ->
     reset_indexes(DbName, State),
     {reply, ok, State}.
@@ -283,12 +293,7 @@ reset_indexes(DbName, #st{} = State) ->
         [{_, Pid}] = ets:lookup(State#st.by_sig, {DbName, Sig}),
         unlink(Pid),
         gen_server:cast(Pid, delete),
-        receive
-            {'EXIT', Pid, _} ->
-                ok
-        after 0 ->
-            ok
-        end,
+        ok = flush_exit_messages_from(Pid),
         rem_from_ets(DbName, Sig, DDocIds, Pid, State)
     end,
     lists:foreach(Fun, dict:to_list(SigDDocIds)),
@@ -327,12 +332,7 @@ rem_from_ets(DbName, #st{} = State) ->
     Fun = fun({Sig, DDocIds}) ->
         [{_, Pid}] = ets:lookup(State#st.by_sig, {DbName, Sig}),
         unlink(Pid),
-        receive
-            {'EXIT', Pid, _} ->
-                ok
-        after 0 ->
-            ok
-        end,
+        ok = flush_exit_messages_from(Pid),
         rem_from_ets(DbName, Sig, DDocIds, Pid, State)
     end,
     lists:foreach(Fun, dict:to_list(SigDDocIds)).
@@ -442,3 +442,11 @@ aggregate_queue_len() ->
      || Name <- Names
     ],
     lists:sum([X || {_, X} <- MQs]).
+
+flush_exit_messages_from(Pid) ->
+    receive
+        {'EXIT', Pid, _} ->
+            ok
+    after 0 ->
+        ok
+    end.


[couchdb] 01/02: Fix index server crash when async opener returns too early

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch fix-index-server-premature-return
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 73c8adadfb39e9ce67ac5c09707a86f6238e3fba
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Tue Oct 17 01:25:57 2023 -0400

    Fix index server crash when async opener returns too early
    
    Previously, the index process and the still linked async opener processes could have
    crashed, after we already replied the the waiting clients. That would genarate
    two `EXIT` messages: one for the indexer pid, and one for the opener. By the
    time the opener `EXIT` was handled, the `st.openers` entry was already missing
    (since handle_call of async_open could delete it), and so the index server
    itself would crash.
---
 src/couch_index/src/couch_index_server.erl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/couch_index/src/couch_index_server.erl b/src/couch_index/src/couch_index_server.erl
index 35df43d2a..5055382d5 100644
--- a/src/couch_index/src/couch_index_server.erl
+++ b/src/couch_index/src/couch_index_server.erl
@@ -169,16 +169,16 @@ handle_call({get_index, {_Mod, _IdxState, DbName, Sig} = Args}, From, State) ->
     end;
 handle_call({async_open, {DbName, DDocId, Sig}, {ok, Pid}}, {OpenerPid, _}, State) ->
     [{_, Waiters}] = ets:lookup(State#st.by_sig, {DbName, Sig}),
-    [gen_server:reply(From, {ok, Pid}) || From <- Waiters],
     link(Pid),
     ets:delete(State#st.openers, OpenerPid),
     add_to_ets(DbName, Sig, DDocId, Pid, State),
+    [gen_server:reply(From, {ok, Pid}) || From <- Waiters],
     {reply, ok, State};
 handle_call({async_error, {DbName, _DDocId, Sig}, Error}, {OpenerPid, _}, State) ->
     [{_, Waiters}] = ets:lookup(State#st.by_sig, {DbName, Sig}),
-    [gen_server:reply(From, Error) || From <- Waiters],
     ets:delete(State#st.openers, OpenerPid),
     ets:delete(State#st.by_sig, {DbName, Sig}),
+    [gen_server:reply(From, Error) || From <- Waiters],
     {reply, ok, State};
 handle_call({reset_indexes, DbName}, _From, State) ->
     reset_indexes(DbName, State),