You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by ko...@apache.org on 2017/03/01 16:39:37 UTC

[50/50] couch commit: updated refs/heads/2971-count-distinct to ee32cd5

Add a cardinality estimator builtin reduce

This introduces a _distinct builtin reduce function, which uses a
HyperLogLog algorithm to estimate the number of distinct keys in the
view index. The precision is currently fixed to 2^11 observables and
therefore uses approximately 1.5 KB of memory.

COUCHDB-2971


Project: http://git-wip-us.apache.org/repos/asf/couchdb-couch/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-couch/commit/ee32cd58
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-couch/tree/ee32cd58
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-couch/diff/ee32cd58

Branch: refs/heads/2971-count-distinct
Commit: ee32cd5825aaf63448651c9521f0927083d2281e
Parents: 38d5180
Author: Adam Kocoloski <ko...@apache.org>
Authored: Wed Mar 1 09:28:45 2017 -0500
Committer: Adam Kocoloski <ko...@apache.org>
Committed: Wed Mar 1 11:28:55 2017 -0500

----------------------------------------------------------------------
 src/couch_query_servers.erl | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-couch/blob/ee32cd58/src/couch_query_servers.erl
----------------------------------------------------------------------
diff --git a/src/couch_query_servers.erl b/src/couch_query_servers.erl
index 92d9e24..ce76c81 100644
--- a/src/couch_query_servers.erl
+++ b/src/couch_query_servers.erl
@@ -17,6 +17,7 @@
 -export([reduce/3, rereduce/3,validate_doc_update/5]).
 -export([filter_docs/5]).
 -export([filter_view/3]).
+-export([finalize/1]).
 -export([rewrite/3]).
 
 -export([with_ddoc_proc/2, proc_prompt/2, ddoc_prompt/3, ddoc_proc_prompt/3, json_doc/1]).
@@ -86,6 +87,16 @@ group_reductions_results(List) ->
      [Heads | group_reductions_results(Tails)]
     end.
 
+finalize(Reductions) ->
+    {ok, lists:map(fun(Reduction) ->
+        case hyper:is_hyper(Reduction) of
+            true ->
+                hyper:card(Reduction);
+            false ->
+                Reduction
+        end
+    end, Reductions)}.
+
 rereduce(_Lang, [], _ReducedValues) ->
     {ok, []};
 rereduce(Lang, RedSrcs, ReducedValues) ->
@@ -152,7 +163,10 @@ builtin_reduce(rereduce, [<<"_count",_/binary>>|BuiltinReds], KVs, Acc) ->
     builtin_reduce(rereduce, BuiltinReds, KVs, [Count|Acc]);
 builtin_reduce(Re, [<<"_stats",_/binary>>|BuiltinReds], KVs, Acc) ->
     Stats = builtin_stats(Re, KVs),
-    builtin_reduce(Re, BuiltinReds, KVs, [Stats|Acc]).
+    builtin_reduce(Re, BuiltinReds, KVs, [Stats|Acc]);
+builtin_reduce(Re, [<<"_distinct",_/binary>>|BuiltinReds], KVs, Acc) ->
+    Distinct = count_distinct_keys(Re, KVs),
+    builtin_reduce(Re, BuiltinReds, KVs, [Distinct|Acc]).
 
 builtin_sum_rows(KVs) ->
     lists:foldl(fun([_Key, Value], Acc) -> sum_values(Value, Acc) end, 0, KVs).
@@ -262,6 +276,13 @@ get_number(Key, Props) ->
         throw({invalid_value, iolist_to_binary(Msg)})
     end.
 
+% TODO allow customization of precision in the ddoc.
+count_distinct_keys(reduce, KVs) ->
+    lists:foldl(fun([[Key, _Id], _Value], Filter) ->
+        hyper:insert(term_to_binary(Key), Filter)
+    end, hyper:new(11), KVs);
+count_distinct_keys(rereduce, Reds) ->
+    hyper:union([Filter || [_, Filter] <- Reds]).
 
 % use the function stored in ddoc.validate_doc_update to test an update.
 -spec validate_doc_update(DDoc, EditDoc, DiskDoc, Ctx, SecObj) -> ok when