You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2022/09/17 23:01:41 UTC

[couchdb] branch main updated: Don't double-encode changes sequence strings in the replicator

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git


The following commit(s) were added to refs/heads/main by this push:
     new 21eebad0f Don't double-encode changes sequence strings in the replicator
21eebad0f is described below

commit 21eebad0fb6ea62786d915b29797983be537908a
Author: Nick Vatamaniuc <va...@apache.org>
AuthorDate: Fri Sep 16 11:21:10 2022 -0400

    Don't double-encode changes sequence strings in the replicator
    
    Previously we always encoded the sequence as json. In case of a strings, the
    most common case, this ended up with something like `"\"1-gA..\""`. Then, the
    endpoints would strip out and ignore the extra `"`. So, avoid sending the extra
    bytes just so they can be striped out in the end anyway. This should make the
    logs look a bit cleaner too.
    
    Integers or any other sequences are still json-encoded. Integers will still
    look like `?since=123` and, in the unlikely we case we replicate with a
    BigCouch 0.4 era endpoint, the `[SeqNum, OpaqueString]` should be properly
    encoded as before and replications should work.
    
    This also fixes a minor annoyance when the `_scheduler/{jobs,docs}` results
    returning the default start sequence as `0` (an integer) even though in the
    majority of case they'd almost always turn into a string. After this we
    consistently return a string as it would be passed in the `_changes?since=...`
    parameter.
---
 src/couch_replicator/src/couch_replicator_api_wrap.erl       |  6 ++++--
 src/couch_replicator/src/couch_replicator_scheduler_job.erl  | 11 +++++++----
 src/couch_replicator/src/couch_replicator_utils.erl          | 12 +++++++++++-
 .../test/eunit/couch_replicator_compact_tests.erl            |  4 ++--
 4 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/couch_replicator/src/couch_replicator_api_wrap.erl b/src/couch_replicator/src/couch_replicator_api_wrap.erl
index a6e39cb02..a44a79da1 100644
--- a/src/couch_replicator/src/couch_replicator_api_wrap.erl
+++ b/src/couch_replicator/src/couch_replicator_api_wrap.erl
@@ -148,7 +148,8 @@ get_pending_count(#httpdb{} = Db, Seq) when is_number(Seq) ->
         end
     end);
 get_pending_count(#httpdb{} = Db, Seq) ->
-    Options = [{path, "_changes"}, {qs, [{"since", ?JSON_ENCODE(Seq)}, {"limit", "0"}]}],
+    EncodedSeq = couch_replicator_utils:seq_encode(Seq),
+    Options = [{path, "_changes"}, {qs, [{"since", EncodedSeq}, {"limit", "0"}]}],
     send_req(Db, Options, fun(200, _, {Props}) ->
         {ok, couch_util:get_value(<<"pending">>, Props, null)}
     end).
@@ -539,6 +540,7 @@ changes_since(
     Options
 ) ->
     Timeout = erlang:max(1000, InactiveTimeout div 3),
+    EncodedSeq = couch_replicator_utils:seq_encode(StartSeq),
     BaseQArgs =
         case get_value(continuous, Options, false) of
             false ->
@@ -548,7 +550,7 @@ changes_since(
         end ++
             [
                 {"style", atom_to_list(Style)},
-                {"since", ?JSON_ENCODE(StartSeq)},
+                {"since", EncodedSeq},
                 {"timeout", integer_to_list(Timeout)}
             ],
     DocIds = get_value(doc_ids, Options),
diff --git a/src/couch_replicator/src/couch_replicator_scheduler_job.erl b/src/couch_replicator/src/couch_replicator_scheduler_job.erl
index e06a1ffea..1ba933a5e 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler_job.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler_job.erl
@@ -154,7 +154,7 @@ do_init(#rep{options = Options, id = {BaseId, Ext}, user_ctx = UserCtx} = Rep) -
             {source, ?l2b(SourceName)},
             {target, ?l2b(TargetName)},
             {continuous, get_value(continuous, Options, false)},
-            {source_seq, HighestSeq},
+            {source_seq, seq_encode(HighestSeq)},
             {checkpoint_interval, CheckpointInterval}
         ] ++ rep_stats(State)
     ),
@@ -1034,6 +1034,9 @@ get_pending_count_int(#rep_state{source = Db} = St) ->
     {ok, Pending} = couch_replicator_api_wrap:get_pending_count(Db, Seq),
     Pending.
 
+seq_encode(Seq) ->
+    couch_replicator_utils:seq_encode(Seq).
+
 update_task(State) ->
     #rep_state{
         rep_details = #rep{id = JobId},
@@ -1043,8 +1046,8 @@ update_task(State) ->
     Status =
         rep_stats(State) ++
             [
-                {source_seq, HighestSeq},
-                {through_seq, ThroughSeq}
+                {source_seq, seq_encode(HighestSeq)},
+                {through_seq, seq_encode(ThroughSeq)}
             ],
     couch_replicator_scheduler:update_job_stats(JobId, Status),
     couch_task_status:update(Status).
@@ -1063,7 +1066,7 @@ rep_stats(State) ->
         {doc_write_failures, couch_replicator_stats:doc_write_failures(Stats)},
         {bulk_get_docs, couch_replicator_stats:bulk_get_docs(Stats)},
         {bulk_get_attempts, couch_replicator_stats:bulk_get_attempts(Stats)},
-        {checkpointed_source_seq, CommittedSeq}
+        {checkpointed_source_seq, seq_encode(CommittedSeq)}
     ].
 
 replication_start_error({unauthorized, DbUri}) ->
diff --git a/src/couch_replicator/src/couch_replicator_utils.erl b/src/couch_replicator/src/couch_replicator_utils.erl
index c9cfac62f..e4a2cd12f 100644
--- a/src/couch_replicator/src/couch_replicator_utils.erl
+++ b/src/couch_replicator/src/couch_replicator_utils.erl
@@ -27,7 +27,8 @@
     ejson_state_info/1,
     get_basic_auth_creds/1,
     remove_basic_auth_creds/1,
-    normalize_basic_auth/1
+    normalize_basic_auth/1,
+    seq_encode/1
 ]).
 
 -include_lib("ibrowse/include/ibrowse.hrl").
@@ -270,6 +271,15 @@ normalize_basic_auth(#httpdb{} = HttpDb) ->
         end,
     set_basic_auth_creds(User, Pass, HttpDb1).
 
+seq_encode(Seq) when is_binary(Seq) ->
+    % Don't encode a string, we already got it encoded from the changes feed
+    Seq;
+seq_encode(Seq) ->
+    % This could be either an integer sequence from CouchDB 1.x, a
+    % [Seq, Opaque] json array from BigCouch 0.4, or any other json
+    % object. We are being maximally compatible here.
+    ?JSON_ENCODE(Seq).
+
 -ifdef(TEST).
 
 -include_lib("eunit/include/eunit.hrl").
diff --git a/src/couch_replicator/test/eunit/couch_replicator_compact_tests.erl b/src/couch_replicator/test/eunit/couch_replicator_compact_tests.erl
index 1f241c753..373bd02ba 100644
--- a/src/couch_replicator/test/eunit/couch_replicator_compact_tests.erl
+++ b/src/couch_replicator/test/eunit/couch_replicator_compact_tests.erl
@@ -93,8 +93,8 @@ check_active_tasks(RepPid, {BaseId, Ext} = _RepId, Src, Tgt) ->
     ?assert(is_integer(couch_util:get_value(doc_write_failures, RepTask))),
     ?assert(is_integer(couch_util:get_value(revisions_checked, RepTask))),
     ?assert(is_integer(couch_util:get_value(missing_revisions_found, RepTask))),
-    ?assert(is_integer(couch_util:get_value(checkpointed_source_seq, RepTask))),
-    ?assert(is_integer(couch_util:get_value(source_seq, RepTask))),
+    ?assert(is_binary(couch_util:get_value(checkpointed_source_seq, RepTask))),
+    ?assert(is_binary(couch_util:get_value(source_seq, RepTask))),
     Pending = couch_util:get_value(changes_pending, RepTask),
     ?assert(is_integer(Pending)).