You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by da...@apache.org on 2018/06/16 21:57:04 UTC
[couchdb] branch master updated (000766c -> 3c98385)
This is an automated email from the ASF dual-hosted git repository.
davisp pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git.
from 000766c Fix active size calculations for views
new aebdbc4 Optimize couch_key_tree:stem/2
new 3c98385 Fix couch_key_tree_tests.erl
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
src/couch/src/couch_db.erl | 10 +----
src/couch/src/couch_db_updater.erl | 33 ++++++++-------
src/couch/src/couch_key_tree.erl | 74 +++++++++++++++++++++++++++++++++
src/couch/test/couch_key_tree_tests.erl | 24 ++++++++---
4 files changed, 114 insertions(+), 27 deletions(-)
--
To stop receiving notification emails like this one, please contact
davisp@apache.org.
[couchdb] 01/02: Optimize couch_key_tree:stem/2
Posted by da...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
davisp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit aebdbc452573f70f4e50d88af5814d0fbe936333
Author: Paul J. Davis <pa...@gmail.com>
AuthorDate: Wed Jun 13 13:15:11 2018 -0500
Optimize couch_key_tree:stem/2
This is two related optimizations for stemming revisions. The first
optimization re-writes the stemming algorithm to drop it from an O(N^2)
to O(N) operation by using a depth first search through the tree and
tracking which revisions are within `revs_limit` revs from a leaf
dropping any revision that exceeds that limit.
The second optimization is just that we avoid calling stemming more
often than necessary by switching away from using `merge/3` to `merge/2`
and then calling `stem/2` only when necessary.
Co-Authored-By: Nick Vatamaniuc <va...@apache.org>
---
src/couch/src/couch_db.erl | 10 ++----
src/couch/src/couch_db_updater.erl | 33 +++++++++--------
src/couch/src/couch_key_tree.erl | 74 ++++++++++++++++++++++++++++++++++++++
3 files changed, 95 insertions(+), 22 deletions(-)
diff --git a/src/couch/src/couch_db.erl b/src/couch/src/couch_db.erl
index 93ea07e..b47cc7e 100644
--- a/src/couch/src/couch_db.erl
+++ b/src/couch/src/couch_db.erl
@@ -870,16 +870,10 @@ prep_and_validate_replicated_updates(Db, [Bucket|RestBuckets], [OldInfo|RestOldI
{[], AccErrors}, Bucket),
prep_and_validate_replicated_updates(Db, RestBuckets, RestOldInfo, [ValidatedBucket | AccPrepped], AccErrors3);
#full_doc_info{rev_tree=OldTree} ->
- RevsLimit = get_revs_limit(Db),
OldLeafs = couch_key_tree:get_all_leafs_full(OldTree),
OldLeafsLU = [{Start, RevId} || {Start, [{RevId, _}|_]} <- OldLeafs],
- NewRevTree = lists:foldl(
- fun(NewDoc, AccTree) ->
- {NewTree, _} = couch_key_tree:merge(AccTree,
- couch_doc:to_path(NewDoc), RevsLimit),
- NewTree
- end,
- OldTree, Bucket),
+ NewPaths = lists:map(fun couch_doc:to_path/1, Bucket),
+ NewRevTree = couch_key_tree:multi_merge(OldTree, NewPaths),
Leafs = couch_key_tree:get_all_leafs_full(NewRevTree),
LeafRevsFullDict = dict:from_list( [{{Start, RevId}, FullPath} || {Start, [{RevId, _}|_]}=FullPath <- Leafs]),
{ValidatedBucket, AccErrors3} =
diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl
index a2de3bc..fba99a7 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -504,23 +504,24 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
[OldDocInfo|RestOldInfo], AccNewInfos, AccRemoveSeqs, AccSeq) ->
erlang:put(last_id_merged, OldDocInfo#full_doc_info.id), % for debugging
NewDocInfo0 = lists:foldl(fun({Client, NewDoc}, OldInfoAcc) ->
- merge_rev_tree(OldInfoAcc, NewDoc, Client, Limit, MergeConflicts)
+ merge_rev_tree(OldInfoAcc, NewDoc, Client, MergeConflicts)
end, OldDocInfo, NewDocs),
+ NewDocInfo1 = stem_full_doc_info(NewDocInfo0, Limit),
% When MergeConflicts is false, we updated #full_doc_info.deleted on every
% iteration of merge_rev_tree. However, merge_rev_tree does not update
% #full_doc_info.deleted when MergeConflicts is true, since we don't need
% to know whether the doc is deleted between iterations. Since we still
% need to know if the doc is deleted after the merge happens, we have to
% set it here.
- NewDocInfo1 = case MergeConflicts of
+ NewDocInfo2 = case MergeConflicts of
true ->
- NewDocInfo0#full_doc_info{
- deleted = couch_doc:is_deleted(NewDocInfo0)
+ NewDocInfo1#full_doc_info{
+ deleted = couch_doc:is_deleted(NewDocInfo1)
};
false ->
- NewDocInfo0
+ NewDocInfo1
end,
- if NewDocInfo1 == OldDocInfo ->
+ if NewDocInfo2 == OldDocInfo ->
% nothing changed
merge_rev_trees(Limit, MergeConflicts, RestDocsList, RestOldInfo,
AccNewInfos, AccRemoveSeqs, AccSeq);
@@ -529,7 +530,7 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
% important to note that the update_seq on OldDocInfo should
% be identical to the value on NewDocInfo1.
OldSeq = OldDocInfo#full_doc_info.update_seq,
- NewDocInfo2 = NewDocInfo1#full_doc_info{
+ NewDocInfo3 = NewDocInfo2#full_doc_info{
update_seq = AccSeq + 1
},
RemoveSeqs = case OldSeq of
@@ -537,10 +538,10 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
_ -> [OldSeq | AccRemoveSeqs]
end,
merge_rev_trees(Limit, MergeConflicts, RestDocsList, RestOldInfo,
- [NewDocInfo2|AccNewInfos], RemoveSeqs, AccSeq+1)
+ [NewDocInfo3|AccNewInfos], RemoveSeqs, AccSeq+1)
end.
-merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
+merge_rev_tree(OldInfo, NewDoc, Client, false)
when OldInfo#full_doc_info.deleted ->
% We're recreating a document that was previously
% deleted. To check that this is a recreation from
@@ -574,7 +575,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
% Merge our modified new doc into the tree
#full_doc_info{rev_tree=OldTree} = OldInfo,
NewTree0 = couch_doc:to_path(NewDoc2),
- case couch_key_tree:merge(OldTree, NewTree0, Limit) of
+ case couch_key_tree:merge(OldTree, NewTree0) of
{NewTree1, new_leaf} ->
% We changed the revision id so inform the caller
send_result(Client, NewDoc, {ok, {OldPos+1, NewRevId}}),
@@ -589,7 +590,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
send_result(Client, NewDoc, conflict),
OldInfo
end;
-merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
+merge_rev_tree(OldInfo, NewDoc, Client, false) ->
% We're attempting to merge a new revision into an
% undeleted document. To not be a conflict we require
% that the merge results in extending a branch.
@@ -597,7 +598,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
OldTree = OldInfo#full_doc_info.rev_tree,
NewTree0 = couch_doc:to_path(NewDoc),
NewDeleted = NewDoc#doc.deleted,
- case couch_key_tree:merge(OldTree, NewTree0, Limit) of
+ case couch_key_tree:merge(OldTree, NewTree0) of
{NewTree, new_leaf} when not NewDeleted ->
OldInfo#full_doc_info{
rev_tree = NewTree,
@@ -615,14 +616,18 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
send_result(Client, NewDoc, conflict),
OldInfo
end;
-merge_rev_tree(OldInfo, NewDoc, _Client, Limit, true) ->
+merge_rev_tree(OldInfo, NewDoc, _Client, true) ->
% We're merging in revisions without caring about
% conflicts. Most likely this is a replication update.
OldTree = OldInfo#full_doc_info.rev_tree,
NewTree0 = couch_doc:to_path(NewDoc),
- {NewTree, _} = couch_key_tree:merge(OldTree, NewTree0, Limit),
+ {NewTree, _} = couch_key_tree:merge(OldTree, NewTree0),
OldInfo#full_doc_info{rev_tree = NewTree}.
+stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) ->
+ Stemmed = couch_key_tree:stem(Tree, Limit),
+ Info#full_doc_info{rev_tree = Stemmed}.
+
update_docs_int(Db, DocsList, LocalDocs, MergeConflicts, FullCommit) ->
UpdateSeq = couch_db_engine:get_update_seq(Db),
RevsLimit = couch_db_engine:get_revs_limit(Db),
diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl
index cd661e2..5d53615 100644
--- a/src/couch/src/couch_key_tree.erl
+++ b/src/couch/src/couch_key_tree.erl
@@ -59,6 +59,7 @@ get_key_leafs/2,
map/2,
map_leafs/2,
mapfold/3,
+multi_merge/2,
merge/3,
merge/2,
remove_leafs/2,
@@ -71,6 +72,15 @@ stem/2
-type revtree() :: [tree()].
+%% @doc Merge multiple paths into the given tree.
+-spec multi_merge(revtree(), tree()) -> revtree().
+multi_merge(RevTree, Trees) ->
+ lists:foldl(fun(Tree, RevTreeAcc) ->
+ {NewRevTree, _} = merge(RevTreeAcc, Tree),
+ NewRevTree
+ end, RevTree, lists:sort(Trees)).
+
+
%% @doc Merge a path into the given tree and then stem the result.
%% Although Tree is of type tree(), it must not contain any branches.
-spec merge(revtree(), tree() | path(), pos_integer()) ->
@@ -470,6 +480,70 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) ->
stem(Trees, Limit) ->
+ try
+ {_, Branches} = lists:foldl(fun(Tree, {Seen, TreeAcc}) ->
+ {NewSeen, NewBranches} = stem_tree(Tree, Limit, Seen),
+ {NewSeen, NewBranches ++ TreeAcc}
+ end, {sets:new(), []}, Trees),
+ lists:sort(Branches)
+ catch throw:dupe_keys ->
+ repair_tree(Trees, Limit)
+ end.
+
+
+stem_tree({Depth, Child}, Limit, Seen) ->
+ case stem_tree(Depth, Child, Limit, Seen) of
+ {NewSeen, _, NewChild, NewBranches} ->
+ {NewSeen, [{Depth, NewChild} | NewBranches]};
+ {NewSeen, _, NewBranches} ->
+ {NewSeen, NewBranches}
+ end.
+
+
+stem_tree(_Depth, {Key, _Val, []} = Leaf, Limit, Seen) ->
+ {check_key(Key, Seen), Limit - 1, Leaf, []};
+
+stem_tree(Depth, {Key, Val, Children}, Limit, Seen0) ->
+ Seen1 = check_key(Key, Seen0),
+ FinalAcc = lists:foldl(fun(Child, Acc) ->
+ {SeenAcc, LimitPosAcc, ChildAcc, BranchAcc} = Acc,
+ case stem_tree(Depth + 1, Child, Limit, SeenAcc) of
+ {NewSeenAcc, LimitPos, NewChild, NewBranches} ->
+ NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+ NewChildAcc = [NewChild | ChildAcc],
+ NewBranchAcc = NewBranches ++ BranchAcc,
+ {NewSeenAcc, NewLimitPosAcc, NewChildAcc, NewBranchAcc};
+ {NewSeenAcc, LimitPos, NewBranches} ->
+ NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+ NewBranchAcc = NewBranches ++ BranchAcc,
+ {NewSeenAcc, NewLimitPosAcc, ChildAcc, NewBranchAcc}
+ end
+ end, {Seen1, -1, [], []}, Children),
+ {FinalSeen, FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
+ case FinalLimitPos of
+ N when N > 0, length(FinalChildren) > 0 ->
+ FinalNode = {Key, Val, lists:reverse(FinalChildren)},
+ {FinalSeen, FinalLimitPos - 1, FinalNode, FinalBranches};
+ 0 when length(FinalChildren) > 0 ->
+ NewBranches = lists:map(fun(Child) ->
+ {Depth + 1, Child}
+ end, lists:reverse(FinalChildren)),
+ {FinalSeen, -1, NewBranches ++ FinalBranches};
+ N when N < 0, length(FinalChildren) == 0 ->
+ {FinalSeen, FinalLimitPos - 1, FinalBranches}
+ end.
+
+
+check_key(Key, Seen) ->
+ case sets:is_element(Key, Seen) of
+ true ->
+ throw(dupe_keys);
+ false ->
+ sets:add_element(Key, Seen)
+ end.
+
+
+repair_tree(Trees, Limit) ->
% flatten each branch in a tree into a tree path, sort by starting rev #
Paths = lists:sort(lists:map(fun({Pos, Path}) ->
StemmedPath = lists:sublist(Path, Limit),
--
To stop receiving notification emails like this one, please contact
davisp@apache.org.
[couchdb] 02/02: Fix couch_key_tree_tests.erl
Posted by da...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
davisp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 3c9838528972ec55165fdd98bc70cec0bff4db3d
Author: Paul J. Davis <pa...@gmail.com>
AuthorDate: Wed Jun 13 13:35:00 2018 -0500
Fix couch_key_tree_tests.erl
The `should_merge_tree_to_itself` and `should_merge_tree_of_odd_length`
tests were both invalid as merging does not support merging of anything
other than a linear path. This failure was covered up by the fact that
the stem operation will detect and cover up any errors from a failed
merge.
Co-Authored-By: Nick Vatamaniuc <va...@apache.org>
---
src/couch/test/couch_key_tree_tests.erl | 24 +++++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/src/couch/test/couch_key_tree_tests.erl b/src/couch/test/couch_key_tree_tests.erl
index 88d9203..2b7d5fe 100644
--- a/src/couch/test/couch_key_tree_tests.erl
+++ b/src/couch/test/couch_key_tree_tests.erl
@@ -167,8 +167,23 @@ should_merge_reflexive_for_child_nodes()->
should_merge_tree_to_itself()->
TwoChildSibs = {1, {"1","foo", [{"1a", "bar", []},
{"1b", "bar", []}]}},
- ?_assertEqual({[TwoChildSibs], new_branch},
- couch_key_tree:merge([TwoChildSibs], TwoChildSibs, ?DEPTH)).
+ Leafs = couch_key_tree:get_all_leafs([TwoChildSibs]),
+ Paths = lists:map(fun leaf_to_path/1, Leafs),
+ FinalTree = lists:foldl(fun(Path, TreeAcc) ->
+ {NewTree, internal_node} = couch_key_tree:merge(TreeAcc, Path),
+ NewTree
+ end, [TwoChildSibs], Paths),
+ ?_assertEqual([TwoChildSibs], FinalTree).
+
+leaf_to_path({Value, {Start, Keys}}) ->
+ [Branch] = to_branch(Value, lists:reverse(Keys)),
+ {Start - length(Keys) + 1, Branch}.
+
+to_branch(Value, [Key]) ->
+ [{Key, Value, []}];
+to_branch(Value, [Key | RestKeys]) ->
+ [{Key, [], to_branch(Value, RestKeys)}].
+
should_merge_tree_of_odd_length()->
TwoChild = {1, {"1","foo", [{"1a", "bar", [{"1aa", "bar", []}]}]}},
@@ -176,9 +191,8 @@ should_merge_tree_of_odd_length()->
{"1b", "bar", []}]}},
TwoChildPlusSibs = {1, {"1","foo", [{"1a", "bar", [{"1aa", "bar", []}]},
{"1b", "bar", []}]}},
-
- ?_assertEqual({[TwoChildPlusSibs], new_branch},
- couch_key_tree:merge([TwoChild], TwoChildSibs, ?DEPTH)).
+ ?_assertEqual({[TwoChildPlusSibs], new_leaf},
+ couch_key_tree:merge([TwoChildSibs], TwoChild, ?DEPTH)).
should_merge_tree_with_stem()->
Stemmed = {2, {"1a", "bar", []}},
--
To stop receiving notification emails like this one, please contact
davisp@apache.org.