You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2023/05/25 03:54:12 UTC

[couchdb] branch handle-upgrade-case-for-instance-start-time created (now 3b18d04cf)

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a change to branch handle-upgrade-case-for-instance-start-time
in repository https://gitbox.apache.org/repos/asf/couchdb.git


      at 3b18d04cf Handle replicator instance start time during upgrades better

This branch includes the following new commits:

     new 3b18d04cf Handle replicator instance start time during upgrades better

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[couchdb] 01/01: Handle replicator instance start time during upgrades better

Posted by va...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch handle-upgrade-case-for-instance-start-time
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 3b18d04cfc5eb723bd514ebc4659d3fd72842b51
Author: Nick Vatamaniuc <va...@gmail.com>
AuthorDate: Wed May 24 23:34:45 2023 -0400

    Handle replicator instance start time during upgrades better
    
    During cluster upgrades from 3.2 to 3.3, when instance start time switched from
    always being `0` to an actual timestamp, replication jobs will crash when
    endpoints are upgraded. Replication jobs were started when the endpoint
    emitted a `0`, and it then becomes a non-`0` value, which will crash the next
    checkpoint attempt.
    
    After the crash, jobs will restart and continue fine where they left off without
    rewinding. However, they will make a logging mess while they crash. All four
    workers will exit with the `{checkpoint_commit_failure,...}` error. This commit
    makes the checkpoint ignore mismatches if one of the instance start times is 0.
---
 .../src/couch_replicator_scheduler_job.erl          | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/couch_replicator/src/couch_replicator_scheduler_job.erl b/src/couch_replicator/src/couch_replicator_scheduler_job.erl
index e16412e4a..cd751d8f2 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler_job.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler_job.erl
@@ -785,9 +785,9 @@ do_checkpoint(State) ->
         current_through_seq = {_Ts, NewSeq} = NewTsSeq,
         source_log = SourceLog,
         target_log = TargetLog,
-        rep_starttime = ReplicationStartTime,
-        src_starttime = SrcInstanceStartTime,
-        tgt_starttime = TgtInstanceStartTime,
+        rep_starttime = RepStartTs,
+        src_starttime = SrcStartTs,
+        tgt_starttime = TgtStartTs,
         stats = Stats,
         rep_details = #rep{options = Options},
         session_id = SessionId
@@ -799,13 +799,16 @@ do_checkpoint(State) ->
         {target_error, Reason} ->
             {checkpoint_commit_failure,
                 <<"Failure on target commit: ", (to_binary(Reason))/binary>>};
-        {SrcInstanceStartTime, TgtInstanceStartTime} ->
+        {<<S/binary>>, <<T/binary>>} when
+            (S =:= SrcStartTs orelse T =:= <<"0">> orelse SrcStartTs =:= <<"0">>) andalso
+                (T =:= TgtStartTs orelse T =:= <<"0">> orelse TgtStartTs =:= <<"0">>)
+        ->
             couch_log:notice(
                 "recording a checkpoint for `~s` -> `~s` at source update_seq ~p",
                 [SourceName, TargetName, NewSeq]
             ),
-            LocalStartTime = calendar:now_to_local_time(ReplicationStartTime),
-            StartTime = ?l2b(httpd_util:rfc1123_date(LocalStartTime)),
+            LocalStartTs = calendar:now_to_local_time(RepStartTs),
+            StartTime = ?l2b(httpd_util:rfc1123_date(LocalStartTs)),
             EndTime = ?l2b(httpd_util:rfc1123_date()),
             NewHistoryEntry =
                 {[
@@ -870,15 +873,15 @@ do_checkpoint(State) ->
                 throw:{checkpoint_commit_failure, _} = Failure ->
                     Failure
             end;
-        {SrcInstanceStartTime, _NewTgtInstanceStartTime} ->
+        {SrcStartTs, _NewTgtStartTs} ->
             {checkpoint_commit_failure, <<
                 "instance_start_time on target database has changed since last checkpoint."
             >>};
-        {_NewSrcInstanceStartTime, TgtInstanceStartTime} ->
+        {_NewSrcStartTs, TgtStartTs} ->
             {checkpoint_commit_failure, <<
                 "instance_start_time on source database has changed since last checkpoint."
             >>};
-        {_NewSrcInstanceStartTime, _NewTgtInstanceStartTime} ->
+        {_NewSrcStartTs, _NewTgtStartTs} ->
             {checkpoint_commit_failure, <<
                 "instance_start_time on source and target database has changed since last checkpoint."
             >>}