You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by va...@apache.org on 2017/07/31 23:42:58 UTC

[couchdb] branch master updated: Make replication ID generation more robust.

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git


The following commit(s) were added to refs/heads/master by this push:
     new f8cb6f9  Make replication ID generation more robust.
f8cb6f9 is described below

commit f8cb6f97c3a2d90449065c75dbc5405b34a61d18
Author: Nick Vatamaniuc <va...@apache.org>
AuthorDate: Tue Jul 18 02:16:33 2017 -0400

    Make replication ID generation more robust.
    
    Replications checkpoint to _local documents identified by replication ids. If
    replication ids change replication tasks will not be able to find their
    previous checkpoints and will rewind their change feeds back to 0. For a large
    database that could mean reprocessing millions of documents.
    
    The current version of the replication id generation algorithm hashes the
    full URL of the source and target, their headers (including authorization
    headers), and a few other things. This means that when a user changes their
    password and updates their replication document, the replication id will
    change and all of its checkpoints will be invalidated.
    
    Also, it is fairly common to upgrade services from http:// to https://.
    Replication endpoint URIs then typically just change their scheme part
    accordingly. However, the scheme is part of the replication ID calculation,
    so replication ids would then change as well.
    
    Introduce a more robust replication id generation algorithm which can handle
    some of those issues. The new algorithm:
    
     1. Excludes the source and target URI scheme from the replication id
     calculation. As long as the host and other parts stay the same, changing
     the scheme will have no effect on the replication id.
    
     2. Ignores inline (specified in the URL) basic authentication passwords.
    
     3. Ignores basic authentication password even if provided in the
     basic authorization headers.
    
     4. Is insensitive to switching between providing basic authentication
     credentials inline or in a headers section. However, it includes the
     username used in the basic auth in the calculation. It is a plausible
     scenario that http://user1:pass1@a.host.com is really a different database
     than http://user2:pass2@a.host.com
    
    Issue #688
---
 src/couch_replicator/src/couch_replicator.hrl     |   2 +-
 src/couch_replicator/src/couch_replicator_ids.erl | 175 ++++++++++++++++++++++
 2 files changed, 176 insertions(+), 1 deletion(-)

diff --git a/src/couch_replicator/src/couch_replicator.hrl b/src/couch_replicator/src/couch_replicator.hrl
index ba9a606..d46c347 100644
--- a/src/couch_replicator/src/couch_replicator.hrl
+++ b/src/couch_replicator/src/couch_replicator.hrl
@@ -10,7 +10,7 @@
 % License for the specific language governing permissions and limitations under
 % the License.
 
--define(REP_ID_VERSION, 3).
+-define(REP_ID_VERSION, 4).
 
 -record(rep, {
     id :: rep_id() | '_' | 'undefined',
diff --git a/src/couch_replicator/src/couch_replicator_ids.erl b/src/couch_replicator/src/couch_replicator_ids.erl
index cbfe82a..62cfdf2 100644
--- a/src/couch_replicator/src/couch_replicator_ids.erl
+++ b/src/couch_replicator/src/couch_replicator_ids.erl
@@ -18,6 +18,8 @@
     convert/1
 ]).
 
+-include_lib("ibrowse/include/ibrowse.hrl").
+
 -include_lib("couch/include/couch_db.hrl").
 -include("couch_replicator_api_wrap.hrl").
 -include("couch_replicator.hrl").
@@ -37,6 +39,12 @@ replication_id(#rep{options = Options} = Rep) ->
 % If a change is made to how replications are identified,
 % please add a new clause and increase ?REP_ID_VERSION.
 
+replication_id(#rep{user_ctx = UserCtx} = Rep, 4) ->
+    UUID = couch_server:get_uuid(),
+    SrcInfo = get_v4_endpoint(UserCtx, Rep#rep.source),
+    TgtInfo = get_v4_endpoint(UserCtx, Rep#rep.target),
+    maybe_append_filters([UUID, SrcInfo, TgtInfo], Rep);
+
 replication_id(#rep{user_ctx = UserCtx} = Rep, 3) ->
     UUID = couch_server:get_uuid(),
     Src = get_rep_endpoint(UserCtx, Rep#rep.source),
@@ -125,3 +133,170 @@ get_rep_endpoint(_UserCtx, #httpdb{url=Url, headers=Headers, oauth=OAuth}) ->
     end;
 get_rep_endpoint(UserCtx, <<DbName/binary>>) ->
     {local, DbName, UserCtx}.
+
+
+% Build the version 4 replication id term for a single endpoint.
+%
+% A local endpoint (a database name binary) yields {local, DbName, UserCtx}.
+% A remote #httpdb{} endpoint yields
+% {remote, User, Host, Port, Path, Headers, OAuth}, where Headers no longer
+% contains basic auth headers and User is taken from the URL when present
+% there, otherwise from the basic auth header.
+get_v4_endpoint(UserCtx, <<DbName/binary>>) ->
+    {local, DbName, UserCtx};
+get_v4_endpoint(UserCtx, #httpdb{} = HttpDb) ->
+    {Url, AllHeaders, OAuth} = case get_rep_endpoint(UserCtx, HttpDb) of
+        {remote, U, H, OA} ->
+            {U, H, OA};
+        {remote, U, H} ->
+            {U, H, undefined}
+    end,
+    {HeaderUser, Headers} = remove_basic_auth(AllHeaders),
+    {UrlUser, Host, Port, Path} = get_v4_url_info(Url),
+    % A username given inline in the URL wins over one from the header
+    User = pick_defined_value([UrlUser, HeaderUser]),
+    {remote, User, Host, Port, Path, Headers, OAuth}.
+
+
+% Split basic auth headers out of Headers.
+%
+% Returns {User, OtherHeaders}. User is decoded from the first basic auth
+% header, or is undefined when there is no such header. OtherHeaders keeps
+% the remaining headers in their original order.
+remove_basic_auth(Headers) ->
+    {BasicAuth, OtherHeaders} = lists:partition(fun is_basic_auth/1, Headers),
+    User = case BasicAuth of
+        [] ->
+            undefined;
+        [{_, "Basic " ++ Encoded} | _] ->
+            get_basic_auth_user(Encoded)
+    end,
+    {User, OtherHeaders}.
+
+
+% True only for an {"Authorization", "Basic ..."} header tuple; the header
+% name and the "Basic " prefix are matched exactly (case sensitive).
+is_basic_auth(Header) ->
+    case Header of
+        {"Authorization", "Basic " ++ _} ->
+            true;
+        _ ->
+            false
+    end.
+
+
+% Extract the username from the base64 payload of a basic auth header.
+%
+% Returns the username as a string, or undefined when the payload is not
+% valid base64 or does not have the "user:pass" shape. Only the first ":"
+% separates user from password, so passwords may contain colons.
+get_basic_auth_user(Base64) ->
+    % Only the decode step is protected. base64:decode/1 raises different
+    % error reasons for malformed input depending on the input and the OTP
+    % release (not just function_clause), and any of them must be tolerated
+    % here to avoid a bad header crashing the replicator.
+    try base64:decode(Base64) of
+        Decoded ->
+            case re:split(Decoded, ":", [{return, list}, {parts, 2}]) of
+                [User, _Pass] ->
+                    User;
+                _ ->
+                    undefined
+            end
+    catch
+        error:_ ->
+            undefined
+    end.
+
+
+% Return the first element of Values that is not undefined, or undefined
+% when every element is undefined or the list is empty.
+pick_defined_value(Values) ->
+    case lists:filter(fun(V) -> V /= undefined end, Values) of
+        [First | _] ->
+            First;
+        [] ->
+            undefined
+    end.
+
+
+% Break a URL (string or binary) into the parts used by the version 4
+% replication id: {User, Host, NonDefaultPort, Path}.
+%
+% The protocol is used only to decide whether the port is a default one;
+% it is not part of the result. If ibrowse cannot parse the URL, the whole
+% URL is returned in the Host position and the other fields are undefined.
+get_v4_url_info(Url) when is_binary(Url) ->
+    get_v4_url_info(binary_to_list(Url));
+get_v4_url_info(Url) ->
+    case ibrowse_lib:parse_url(Url) of
+        #url{} = Parsed ->
+            Port = get_non_default_port(Parsed#url.protocol, Parsed#url.port),
+            {Parsed#url.username, Parsed#url.host, Port, Parsed#url.path};
+        {error, invalid_uri} ->
+            % Tolerate errors here to avoid a bad user document
+            % crashing the replicator
+            {undefined, Url, undefined, undefined}
+    end.
+
+
+% Map a scheme's well-known port to the atom default so that spelling the
+% port out explicitly does not change the replication id. The ports treated
+% as default are 443 for https, and 80 and 5984 for http. Any other
+% combination returns Port unchanged.
+get_non_default_port(Schema, Port) ->
+    case {Schema, Port} of
+        {https, 443} ->
+            default;
+        {http, 80} ->
+            default;
+        {http, 5984} ->
+            default;
+        _ ->
+            Port
+    end.
+
+
+-ifdef(TEST).
+
+-include_lib("eunit/include/eunit.hrl").
+
+% Table-driven checks of get_v4_endpoint/2 for remote endpoints.
+%
+% Each table entry is {Expected, Input}, where Expected is
+% {User, Host, Port, Path, HeadersNoAuth} and Input is {Url, Headers}.
+% Expected must always be a 5-tuple: entries that do not match the generator
+% pattern are silently dropped by the list comprehension and never run.
+http_v4_endpoint_test_() ->
+    [?_assertMatch({remote, User, Host, Port, Path, HeadersNoAuth, undefined},
+        get_v4_endpoint(nil, #httpdb{url = Url, headers = Headers})) ||
+            {{User, Host, Port, Path, HeadersNoAuth}, {Url, Headers}} <- [
+                {
+                    {undefined, "host", default, "/", []},
+                    {"http://host", []}
+                },
+                {
+                    {undefined, "host", default, "/", []},
+                    {"https://host", []}
+                },
+                {
+                    {undefined, "host", default, "/", []},
+                    {"http://host:5984", []}
+                },
+                {
+                    {undefined, "host", 1, "/", []},
+                    {"http://host:1", []}
+                },
+                {
+                    {undefined, "host", 2, "/", []},
+                    {"https://host:2", []}
+                },
+                {
+                    {undefined, "host", default, "/", [{"h","v"}]},
+                    {"http://host", [{"h","v"}]}
+                },
+                {
+                    {undefined, "host", default, "/a/b", []},
+                    {"http://host/a/b", []}
+                },
+                {
+                    {"user", "host", default, "/", []},
+                    {"http://user:pass@host", []}
+                },
+                {
+                    {"user", "host", 3, "/", []},
+                    {"http://user:pass@host:3", []}
+                },
+                {
+                    {"user", "host", default, "/", []},
+                    {"http://user:newpass@host", []}
+                },
+                {
+                    {"user", "host", default, "/", []},
+                    {"http://host", [basic_auth("user","pass")]}
+                },
+                {
+                    {"user", "host", default, "/", []},
+                    {"http://host", [basic_auth("user","newpass")]}
+                },
+                {
+                    {"user1", "host", default, "/", []},
+                    {"http://user1:pass1@host", [basic_auth("user2","pass2")]}
+                },
+                {
+                    {"user", "host", default, "/", [{"h", "v"}]},
+                    {"http://host", [{"h", "v"}, basic_auth("user","pass")]}
+                },
+                {
+                    % Unparseable URL: the whole URL lands in the host slot
+                    {undefined, "random_junk", undefined, undefined, []},
+                    {"random_junk", []}
+                },
+                {
+                    % Invalid base64 in the basic auth header is tolerated
+                    {undefined, "host", default, "/", []},
+                    {"http://host", [{"Authorization", "Basic bad"}]}
+                }
+        ]
+    ].
+
+
+% Test helper: build an {"Authorization", "Basic ..."} header tuple from
+% string credentials by base64-encoding "User:Pass".
+basic_auth(User, Pass) ->
+    Credentials = lists:append([User, ":", Pass]),
+    {"Authorization", "Basic " ++ base64:encode_to_string(Credentials)}.
+
+
+-endif.

-- 
To stop receiving notification emails like this one, please contact
['"commits@couchdb.apache.org" <co...@couchdb.apache.org>'].