You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@couchdb.apache.org by va...@apache.org on 2019/11/01 19:28:47 UTC

[couchdb] branch handle-replication-process-restart-better created (now 37a083e)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch handle-replication-process-restart-better
in repository https://gitbox.apache.org/repos/asf/couchdb.git.


      at 37a083e  Do not mark replication jobs as failed if doc processor crashes

This branch includes the following new commits:

     new 37a083e  Do not mark replication jobs as failed if doc processor crashes

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.

[couchdb] 01/01: Do not mark replication jobs as failed if doc processor crashes

Posted by va...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch handle-replication-process-restart-better
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 37a083edf1d8ab3554dc4f07a3861dc515c8dda1
Author: Nick Vatamaniuc <va...@apache.org>
AuthorDate: Fri Nov 1 13:46:44 2019 -0400

    Do not mark replication jobs as failed if doc processor crashes
    
    Previously, if couch_replicator_doc_processor crashed, the job was marked as
    "failed". We now ignore that case. It is safe to do so since the supervisor
    will restart the doc processor anyway, and it will rescan all the docs again.
    Most of all, we want to prevent the job from becoming permanently failed and
    needing manual intervention to restart it.
---
 .../src/couch_replicator_doc_processor.erl         | 30 +++++++++++++++++++---
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/couch_replicator/src/couch_replicator_doc_processor.erl b/src/couch_replicator/src/couch_replicator_doc_processor.erl
index 772037d..b5b5b45 100644
--- a/src/couch_replicator/src/couch_replicator_doc_processor.erl
+++ b/src/couch_replicator/src/couch_replicator_doc_processor.erl
@@ -101,6 +101,9 @@ db_change(DbName, {ChangeProps} = Change, Server) ->
     try
         ok = process_change(DbName, Change)
     catch
+    exit:{Error, {gen_server, call, [?MODULE, _, _]}} ->
+        ErrMsg = "~p exited ~p while processing change from db ~p",
+        couch_log:error(ErrMsg, [?MODULE, Error, DbName]);
     _Tag:Error ->
         {RepProps} = get_json_value(doc, ChangeProps),
         DocId = get_json_value(<<"_id">>, RepProps),
@@ -611,6 +614,7 @@ cluster_membership_foldl(#rdoc{id = {DbName, DocId} = Id, rid = RepId}, nil) ->
 -include_lib("eunit/include/eunit.hrl").
 
 -define(DB, <<"db">>).
+-define(EXIT_DB, <<"exit_db">>).
 -define(DOC1, <<"doc1">>).
 -define(DOC2, <<"doc2">>).
 -define(R1, {"1", ""}).
@@ -625,6 +629,7 @@ doc_processor_test_() ->
         [
             t_bad_change(),
             t_regular_change(),
+            t_change_with_doc_processor_crash(),
             t_change_with_existing_job(),
             t_deleted_change(),
             t_triggered_change(),
@@ -658,6 +663,16 @@ t_regular_change() ->
     end).
 
 
+% Handle cases where doc processor exits or crashes while processing a change
+t_change_with_doc_processor_crash() ->
+    ?_test(begin
+        mock_existing_jobs_lookup([]),
+        unlink(whereis(?MODULE)), % don't want to kill the test process
+        ?assertEqual(acc, db_change(?EXIT_DB, change(), acc)),
+        ?assert(failed_state_not_updated())
+  end).
+
+
 % Regular change, parse to a #rep{} and then add job but there is already
 % a running job with same Id found.
 t_change_with_existing_job() ->
@@ -834,16 +849,19 @@ setup() ->
     meck:expect(couch_replicator_clustering, owner, 2, node()),
     meck:expect(couch_replicator_clustering, link_cluster_event_listener, 3,
         ok),
-    meck:expect(couch_replicator_doc_processor_worker, spawn_worker, 4, pid),
+    meck:expect(couch_replicator_doc_processor_worker, spawn_worker, fun
+        ({?EXIT_DB, _}, _, _, _) -> exit(kapow);
+        (_, _, _, _) -> pid
+    end),
     meck:expect(couch_replicator_scheduler, remove_job, 1, ok),
     meck:expect(couch_replicator_docs, remove_state_fields, 2, ok),
     meck:expect(couch_replicator_docs, update_failed, 3, ok),
     {ok, Pid} = start_link(),
+    unlink(Pid),
     Pid.
 
 
 teardown(Pid) ->
-    unlink(Pid),
     exit(Pid, kill),
     meck:unload().
 
@@ -871,10 +889,14 @@ did_not_spawn_worker() ->
 updated_doc_with_failed_state() ->
     1 == meck:num_calls(couch_replicator_docs, update_failed, '_').
 
+failed_state_not_updated() ->
+    0 == meck:num_calls(couch_replicator_docs, update_failed, '_').
 
 mock_existing_jobs_lookup(ExistingJobs) ->
-    meck:expect(couch_replicator_scheduler, find_jobs_by_doc,
-        fun(?DB, ?DOC1) -> ExistingJobs end).
+    meck:expect(couch_replicator_scheduler, find_jobs_by_doc, fun
+        (?EXIT_DB, ?DOC1) -> [];
+        (?DB, ?DOC1) -> ExistingJobs
+    end).
 
 
 test_rep(Id) ->