You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by da...@apache.org on 2018/06/16 21:57:04 UTC

[couchdb] branch master updated (000766c -> 3c98385)

This is an automated email from the ASF dual-hosted git repository.

davisp pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git.


    from 000766c  Fix active size calculations for views
     new aebdbc4  Optimize couch_key_tree:stem/2
     new 3c98385  Fix couch_key_tree_tests.erl

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/couch/src/couch_db.erl              | 10 +----
 src/couch/src/couch_db_updater.erl      | 33 ++++++++-------
 src/couch/src/couch_key_tree.erl        | 74 +++++++++++++++++++++++++++++++++
 src/couch/test/couch_key_tree_tests.erl | 24 ++++++++---
 4 files changed, 114 insertions(+), 27 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
davisp@apache.org.

[couchdb] 01/02: Optimize couch_key_tree:stem/2

Posted by da...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

davisp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit aebdbc452573f70f4e50d88af5814d0fbe936333
Author: Paul J. Davis <pa...@gmail.com>
AuthorDate: Wed Jun 13 13:15:11 2018 -0500

    Optimize couch_key_tree:stem/2
    
    This is two related optimizations for stemming revisions. The first
    optimization re-writes the stemming algorithm to drop it from an O(N^2)
    to O(N) operation by using a depth first search through the tree and
    tracking which revisions are within `revs_limit` revs from a leaf
    dropping any revision that exceeds that limit.
    
    The second optimization is just that we avoid calling stemming more
    often than necessary by switching away from using `merge/3` to `merge/2`
    and then calling `stem/2` only when necessary.
    
    Co-Authored-By: Nick Vatamaniuc <va...@apache.org>
---
 src/couch/src/couch_db.erl         | 10 ++----
 src/couch/src/couch_db_updater.erl | 33 +++++++++--------
 src/couch/src/couch_key_tree.erl   | 74 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/src/couch/src/couch_db.erl b/src/couch/src/couch_db.erl
index 93ea07e..b47cc7e 100644
--- a/src/couch/src/couch_db.erl
+++ b/src/couch/src/couch_db.erl
@@ -870,16 +870,10 @@ prep_and_validate_replicated_updates(Db, [Bucket|RestBuckets], [OldInfo|RestOldI
             {[], AccErrors}, Bucket),
         prep_and_validate_replicated_updates(Db, RestBuckets, RestOldInfo, [ValidatedBucket | AccPrepped], AccErrors3);
     #full_doc_info{rev_tree=OldTree} ->
-        RevsLimit = get_revs_limit(Db),
         OldLeafs = couch_key_tree:get_all_leafs_full(OldTree),
         OldLeafsLU = [{Start, RevId} || {Start, [{RevId, _}|_]} <- OldLeafs],
-        NewRevTree = lists:foldl(
-            fun(NewDoc, AccTree) ->
-                {NewTree, _} = couch_key_tree:merge(AccTree,
-                    couch_doc:to_path(NewDoc), RevsLimit),
-                NewTree
-            end,
-            OldTree, Bucket),
+        NewPaths = lists:map(fun couch_doc:to_path/1, Bucket),
+        NewRevTree = couch_key_tree:multi_merge(OldTree, NewPaths),
         Leafs = couch_key_tree:get_all_leafs_full(NewRevTree),
         LeafRevsFullDict = dict:from_list( [{{Start, RevId}, FullPath} || {Start, [{RevId, _}|_]}=FullPath <- Leafs]),
         {ValidatedBucket, AccErrors3} =
diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl
index a2de3bc..fba99a7 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -504,23 +504,24 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
         [OldDocInfo|RestOldInfo], AccNewInfos, AccRemoveSeqs, AccSeq) ->
     erlang:put(last_id_merged, OldDocInfo#full_doc_info.id), % for debugging
     NewDocInfo0 = lists:foldl(fun({Client, NewDoc}, OldInfoAcc) ->
-        merge_rev_tree(OldInfoAcc, NewDoc, Client, Limit, MergeConflicts)
+        merge_rev_tree(OldInfoAcc, NewDoc, Client, MergeConflicts)
     end, OldDocInfo, NewDocs),
+    NewDocInfo1 = stem_full_doc_info(NewDocInfo0, Limit),
     % When MergeConflicts is false, we updated #full_doc_info.deleted on every
     % iteration of merge_rev_tree. However, merge_rev_tree does not update
     % #full_doc_info.deleted when MergeConflicts is true, since we don't need
     % to know whether the doc is deleted between iterations. Since we still
     % need to know if the doc is deleted after the merge happens, we have to
     % set it here.
-    NewDocInfo1 = case MergeConflicts of
+    NewDocInfo2 = case MergeConflicts of
         true ->
-            NewDocInfo0#full_doc_info{
-                deleted = couch_doc:is_deleted(NewDocInfo0)
+            NewDocInfo1#full_doc_info{
+                deleted = couch_doc:is_deleted(NewDocInfo1)
             };
         false ->
-            NewDocInfo0
+            NewDocInfo1
     end,
-    if NewDocInfo1 == OldDocInfo ->
+    if NewDocInfo2 == OldDocInfo ->
         % nothing changed
         merge_rev_trees(Limit, MergeConflicts, RestDocsList, RestOldInfo,
             AccNewInfos, AccRemoveSeqs, AccSeq);
@@ -529,7 +530,7 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
         % important to note that the update_seq on OldDocInfo should
         % be identical to the value on NewDocInfo1.
         OldSeq = OldDocInfo#full_doc_info.update_seq,
-        NewDocInfo2 = NewDocInfo1#full_doc_info{
+        NewDocInfo3 = NewDocInfo2#full_doc_info{
             update_seq = AccSeq + 1
         },
         RemoveSeqs = case OldSeq of
@@ -537,10 +538,10 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
             _ -> [OldSeq | AccRemoveSeqs]
         end,
         merge_rev_trees(Limit, MergeConflicts, RestDocsList, RestOldInfo,
-            [NewDocInfo2|AccNewInfos], RemoveSeqs, AccSeq+1)
+            [NewDocInfo3|AccNewInfos], RemoveSeqs, AccSeq+1)
     end.
 
-merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
+merge_rev_tree(OldInfo, NewDoc, Client, false)
         when OldInfo#full_doc_info.deleted ->
     % We're recreating a document that was previously
     % deleted. To check that this is a recreation from
@@ -574,7 +575,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
             % Merge our modified new doc into the tree
             #full_doc_info{rev_tree=OldTree} = OldInfo,
             NewTree0 = couch_doc:to_path(NewDoc2),
-            case couch_key_tree:merge(OldTree, NewTree0, Limit) of
+            case couch_key_tree:merge(OldTree, NewTree0) of
                 {NewTree1, new_leaf} ->
                     % We changed the revision id so inform the caller
                     send_result(Client, NewDoc, {ok, {OldPos+1, NewRevId}}),
@@ -589,7 +590,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
             send_result(Client, NewDoc, conflict),
             OldInfo
     end;
-merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
+merge_rev_tree(OldInfo, NewDoc, Client, false) ->
     % We're attempting to merge a new revision into an
     % undeleted document. To not be a conflict we require
     % that the merge results in extending a branch.
@@ -597,7 +598,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
     OldTree = OldInfo#full_doc_info.rev_tree,
     NewTree0 = couch_doc:to_path(NewDoc),
     NewDeleted = NewDoc#doc.deleted,
-    case couch_key_tree:merge(OldTree, NewTree0, Limit) of
+    case couch_key_tree:merge(OldTree, NewTree0) of
         {NewTree, new_leaf} when not NewDeleted ->
             OldInfo#full_doc_info{
                 rev_tree = NewTree,
@@ -615,14 +616,18 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
             send_result(Client, NewDoc, conflict),
             OldInfo
     end;
-merge_rev_tree(OldInfo, NewDoc, _Client, Limit, true) ->
+merge_rev_tree(OldInfo, NewDoc, _Client, true) ->
     % We're merging in revisions without caring about
     % conflicts. Most likely this is a replication update.
     OldTree = OldInfo#full_doc_info.rev_tree,
     NewTree0 = couch_doc:to_path(NewDoc),
-    {NewTree, _} = couch_key_tree:merge(OldTree, NewTree0, Limit),
+    {NewTree, _} = couch_key_tree:merge(OldTree, NewTree0),
     OldInfo#full_doc_info{rev_tree = NewTree}.
 
+stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) ->
+    Stemmed = couch_key_tree:stem(Tree, Limit),
+    Info#full_doc_info{rev_tree = Stemmed}.
+
 update_docs_int(Db, DocsList, LocalDocs, MergeConflicts, FullCommit) ->
     UpdateSeq = couch_db_engine:get_update_seq(Db),
     RevsLimit = couch_db_engine:get_revs_limit(Db),
diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl
index cd661e2..5d53615 100644
--- a/src/couch/src/couch_key_tree.erl
+++ b/src/couch/src/couch_key_tree.erl
@@ -59,6 +59,7 @@ get_key_leafs/2,
 map/2,
 map_leafs/2,
 mapfold/3,
+multi_merge/2,
 merge/3,
 merge/2,
 remove_leafs/2,
@@ -71,6 +72,15 @@ stem/2
 -type revtree() :: [tree()].
 
 
+%% @doc Merge multiple paths into the given tree.
+-spec multi_merge(revtree(), tree()) -> revtree().
+multi_merge(RevTree, Trees) ->
+    lists:foldl(fun(Tree, RevTreeAcc) ->
+        {NewRevTree, _} = merge(RevTreeAcc, Tree),
+        NewRevTree
+    end, RevTree, lists:sort(Trees)).
+
+
 %% @doc Merge a path into the given tree and then stem the result.
 %% Although Tree is of type tree(), it must not contain any branches.
 -spec merge(revtree(), tree() | path(), pos_integer()) ->
@@ -470,6 +480,70 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) ->
 
 
 stem(Trees, Limit) ->
+    try
+        {_, Branches} = lists:foldl(fun(Tree, {Seen, TreeAcc}) ->
+            {NewSeen, NewBranches} = stem_tree(Tree, Limit, Seen),
+            {NewSeen, NewBranches ++ TreeAcc}
+        end, {sets:new(), []}, Trees),
+        lists:sort(Branches)
+    catch throw:dupe_keys ->
+        repair_tree(Trees, Limit)
+    end.
+
+
+stem_tree({Depth, Child}, Limit, Seen) ->
+    case stem_tree(Depth, Child, Limit, Seen) of
+        {NewSeen, _, NewChild, NewBranches} ->
+            {NewSeen, [{Depth, NewChild} | NewBranches]};
+        {NewSeen, _, NewBranches} ->
+            {NewSeen, NewBranches}
+    end.
+
+
+stem_tree(_Depth, {Key, _Val, []} = Leaf, Limit, Seen) ->
+    {check_key(Key, Seen), Limit - 1, Leaf, []};
+
+stem_tree(Depth, {Key, Val, Children}, Limit, Seen0) ->
+    Seen1 = check_key(Key, Seen0),
+    FinalAcc = lists:foldl(fun(Child, Acc) ->
+        {SeenAcc, LimitPosAcc, ChildAcc, BranchAcc} = Acc,
+        case stem_tree(Depth + 1, Child, Limit, SeenAcc) of
+            {NewSeenAcc, LimitPos, NewChild, NewBranches} ->
+                NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+                NewChildAcc = [NewChild | ChildAcc],
+                NewBranchAcc = NewBranches ++ BranchAcc,
+                {NewSeenAcc, NewLimitPosAcc, NewChildAcc, NewBranchAcc};
+            {NewSeenAcc, LimitPos, NewBranches} ->
+                NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+                NewBranchAcc = NewBranches ++ BranchAcc,
+                {NewSeenAcc, NewLimitPosAcc, ChildAcc, NewBranchAcc}
+        end
+    end, {Seen1, -1, [], []}, Children),
+    {FinalSeen, FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
+    case FinalLimitPos of
+        N when N > 0, length(FinalChildren) > 0 ->
+            FinalNode = {Key, Val, lists:reverse(FinalChildren)},
+            {FinalSeen, FinalLimitPos - 1, FinalNode, FinalBranches};
+        0 when length(FinalChildren) > 0 ->
+            NewBranches = lists:map(fun(Child) ->
+                {Depth + 1, Child}
+            end, lists:reverse(FinalChildren)),
+            {FinalSeen, -1, NewBranches ++ FinalBranches};
+        N when N < 0, length(FinalChildren) == 0 ->
+            {FinalSeen, FinalLimitPos - 1, FinalBranches}
+    end.
+
+
+check_key(Key, Seen) ->
+    case sets:is_element(Key, Seen) of
+        true ->
+            throw(dupe_keys);
+        false ->
+            sets:add_element(Key, Seen)
+    end.
+
+
+repair_tree(Trees, Limit) ->
     % flatten each branch in a tree into a tree path, sort by starting rev #
     Paths = lists:sort(lists:map(fun({Pos, Path}) ->
         StemmedPath = lists:sublist(Path, Limit),

-- 
To stop receiving notification emails like this one, please contact
davisp@apache.org.

[couchdb] 02/02: Fix couch_key_tree_tests.erl

Posted by da...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

davisp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 3c9838528972ec55165fdd98bc70cec0bff4db3d
Author: Paul J. Davis <pa...@gmail.com>
AuthorDate: Wed Jun 13 13:35:00 2018 -0500

    Fix couch_key_tree_tests.erl
    
    The `should_merge_tree_to_itself` and `should_merge_tree_of_odd_length`
    tests were both invalid as merging does not support merging of anything
    other than a linear path. This failure was covered up by the fact that
    the stem operation will detect and cover up any errors from a failed
    merge.
    
    Co-Authored-By: Nick Vatamaniuc <va...@apache.org>
---
 src/couch/test/couch_key_tree_tests.erl | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/couch/test/couch_key_tree_tests.erl b/src/couch/test/couch_key_tree_tests.erl
index 88d9203..2b7d5fe 100644
--- a/src/couch/test/couch_key_tree_tests.erl
+++ b/src/couch/test/couch_key_tree_tests.erl
@@ -167,8 +167,23 @@ should_merge_reflexive_for_child_nodes()->
 should_merge_tree_to_itself()->
     TwoChildSibs = {1, {"1","foo", [{"1a", "bar", []},
                                     {"1b", "bar", []}]}},
-    ?_assertEqual({[TwoChildSibs], new_branch},
-                  couch_key_tree:merge([TwoChildSibs], TwoChildSibs, ?DEPTH)).
+    Leafs = couch_key_tree:get_all_leafs([TwoChildSibs]),
+    Paths = lists:map(fun leaf_to_path/1, Leafs),
+    FinalTree = lists:foldl(fun(Path, TreeAcc) ->
+        {NewTree, internal_node} = couch_key_tree:merge(TreeAcc, Path),
+        NewTree
+    end, [TwoChildSibs], Paths),
+    ?_assertEqual([TwoChildSibs], FinalTree).
+
+leaf_to_path({Value, {Start, Keys}}) ->
+    [Branch] = to_branch(Value, lists:reverse(Keys)),
+    {Start - length(Keys) + 1, Branch}.
+
+to_branch(Value, [Key]) ->
+    [{Key, Value, []}];
+to_branch(Value, [Key | RestKeys]) ->
+    [{Key, [], to_branch(Value, RestKeys)}].
+
 
 should_merge_tree_of_odd_length()->
     TwoChild = {1, {"1","foo", [{"1a", "bar", [{"1aa", "bar", []}]}]}},
@@ -176,9 +191,8 @@ should_merge_tree_of_odd_length()->
                                     {"1b", "bar", []}]}},
     TwoChildPlusSibs = {1, {"1","foo", [{"1a", "bar", [{"1aa", "bar", []}]},
                                         {"1b", "bar", []}]}},
-
-    ?_assertEqual({[TwoChildPlusSibs], new_branch},
-                  couch_key_tree:merge([TwoChild], TwoChildSibs, ?DEPTH)).
+    ?_assertEqual({[TwoChildPlusSibs], new_leaf},
+                  couch_key_tree:merge([TwoChildSibs], TwoChild, ?DEPTH)).
 
 should_merge_tree_with_stem()->
     Stemmed = {2, {"1a", "bar", []}},

-- 
To stop receiving notification emails like this one, please contact
davisp@apache.org.