You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cp...@apache.org on 2017/04/25 13:59:40 UTC

[13/17] lucene-solr:jira/solr-8668: SOLR-10548: SOLR-10552: numBuckets should use hll and ignore mincount>1 filtering

SOLR-10548: SOLR-10552: numBuckets should use hll and ignore mincount>1 filtering


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/71ce0d31
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/71ce0d31
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/71ce0d31

Branch: refs/heads/jira/solr-8668
Commit: 71ce0d31a6a907bf1566fc51324d5f26e4205c21
Parents: 114a65b
Author: yonik <yo...@apache.org>
Authored: Mon Apr 24 18:17:17 2017 -0400
Committer: yonik <yo...@apache.org>
Committed: Mon Apr 24 18:17:17 2017 -0400

----------------------------------------------------------------------
 solr/CHANGES.txt                                | 12 +++++++
 .../solr/search/facet/FacetFieldMerger.java     | 14 ++------
 .../solr/search/facet/FacetFieldProcessor.java  | 36 +++++++++++---------
 .../solr/search/facet/TestJsonFacets.java       |  8 ++---
 4 files changed, 38 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71ce0d31/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 8fbebdb..19fc3f8 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -139,6 +139,10 @@ Upgrade Notes
 
 * Solr contribs map-reduce, morphlines-core and morphlines-cell have been removed.
 
+* JSON Facet API now uses hyper-log-log for numBuckets cardinality calculation and
+  calculates cardinality before filtering buckets by any mincount greater than 1.
+
+
 Detailed Change List
 ----------------------
 
@@ -192,6 +196,9 @@ Optimizations
 * SOLR-9217: Reduced heap consumption for filter({!join ... score=...}) 
   (Andrey Kudryavtsev, Gopikannan Venugopalsamy via Mikhail Khludnev)
 
+* SOLR-10548: JSON Facet API now uses hyper-log-log++ for determining the number of buckets
+  when merging requests from a multi-shard distributed request. (yonik)
+
 Bug Fixes
 ----------------------
 * SOLR-10281: ADMIN_PATHS is duplicated in two places and inconsistent. This can cause automatic
@@ -230,6 +237,11 @@ Bug Fixes
 
 * SOLR-10493: Investigate SolrCloudExampleTest failures. (Erick Erickson)
 
+* SOLR-10552: JSON Facet API numBuckets was not consistent between distributed and non-distributed requests
+  when there was a mincount > 1.  This has been corrected by changing numBuckets cardinality processing to
+  ignore mincount > 1 for non-distributed requests. (yonik)
+
+
 Other Changes
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71ce0d31/solr/core/src/java/org/apache/solr/search/facet/FacetFieldMerger.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldMerger.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldMerger.java
index 9ec5d79..4f57bcd 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldMerger.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldMerger.java
@@ -82,7 +82,7 @@ public class FacetFieldMerger extends FacetRequestSortedMerger<FacetField> {
       Object nb = facetResult.get("numBuckets");
       if (nb != null) {
         if (numBuckets == null) {
-          numBuckets = new FacetNumBucketsMerger();
+          numBuckets = new HLLAgg("hll_merger").createFacetMerger(nb);
         }
         numBuckets.merge(nb , mcontext);
       }
@@ -98,17 +98,7 @@ public class FacetFieldMerger extends FacetRequestSortedMerger<FacetField> {
     SimpleOrderedMap result = new SimpleOrderedMap();
 
     if (numBuckets != null) {
-      int removed = 0;
-      if (freq.mincount > 1) {
-        for (FacetBucket bucket : buckets.values()) {
-          if (bucket.count < freq.mincount) removed++;
-        }
-      }
-      result.add("numBuckets", ((Number)numBuckets.getMergedResult()).longValue() - removed);
-
-      // TODO: we can further increase this estimate.
-      // If not sorting by count, use a simple ratio to scale
-      // If sorting by count desc, then add up the highest_possible_missing_count from each shard
+      result.add("numBuckets", ((Number)numBuckets.getMergedResult()).longValue());
     }
 
     sortBuckets();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71ce0d31/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
index d4daf08..65b88d8 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
@@ -210,10 +210,6 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
                                         IntFunction<Comparable> bucketValFromSlotNumFunc,
                                         Function<Comparable, String> fieldQueryValFunc) throws IOException {
     int numBuckets = 0;
-    List<Object> bucketVals = null;
-    if (freq.numBuckets && fcontext.isShard()) {
-      bucketVals = new ArrayList<>(100);
-    }
 
     final int off = fcontext.isShard() ? 0 : (int) freq.offset;
 
@@ -257,16 +253,18 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
     Slot bottom = null;
     Slot scratchSlot = new Slot();
     for (int slotNum = 0; slotNum < numSlots; slotNum++) {
-      // screen out buckets not matching mincount immediately (i.e. don't even increment numBuckets)
-      if (effectiveMincount > 0 && countAcc.getCount(slotNum) < effectiveMincount) {
-        continue;
+
+      // screen out buckets not matching mincount
+      if (effectiveMincount > 0) {
+        int count = countAcc.getCount(slotNum);
+        if (count  < effectiveMincount) {
+          if (count > 0)
+            numBuckets++;  // Still increment numBuckets as long as we have some count.  This is for consistency between distrib and non-distrib mode.
+          continue;
+        }
       }
 
       numBuckets++;
-      if (bucketVals != null && bucketVals.size()<100) {
-        Object val = bucketValFromSlotNumFunc.apply(slotNum);
-        bucketVals.add(val);
-      }
 
       if (bottom != null) {
         scratchSlot.slot = slotNum; // scratchSlot is only used to hold this slotNum for the following line
@@ -292,10 +290,17 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
       if (!fcontext.isShard()) {
         res.add("numBuckets", numBuckets);
       } else {
-        SimpleOrderedMap<Object> map = new SimpleOrderedMap<>(2);
-        map.add("numBuckets", numBuckets);
-        map.add("vals", bucketVals);
-        res.add("numBuckets", map);
+        DocSet domain = fcontext.base;
+        if (freq.prefix != null) {
+          Query prefixFilter = sf.getType().getPrefixQuery(null, sf, freq.prefix);
+          domain = fcontext.searcher.getDocSet(prefixFilter, domain);
+        }
+
+        HLLAgg agg = new HLLAgg(freq.field);
+        SlotAcc acc = agg.createSlotAcc(fcontext, domain.size(), 1);
+        acc.collect(domain, 0);
+        acc.key = "numBuckets";
+        acc.setValues(res, 0);
       }
     }
 
@@ -522,7 +527,6 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
      "all",
      {"cat3":{"_l":["A"]}}]]},
    "cat1":{"_l":["A"]}}}
-
    */
 
   static <T> List<T> asList(Object list) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71ce0d31/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
index bad3de5..a8a1eaa 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@@ -770,12 +770,12 @@ public class TestJsonFacets extends SolrTestCaseHS {
             "'f1':{ numBuckets:1, buckets:[{val:B, count:3}]} } "
     );
 
-    // mincount should lower numBuckets
+    // mincount should not lower numBuckets (since SOLR-10552)
     client.testJQ(params(p, "q", "*:*", "rows", "0", "facet", "true"
             , "json.facet", "{f1:{terms:{${terms} field:${cat_s}, numBuckets:true, mincount:3}}}"
         )
         , "facets=={ 'count':6, " +
-            "'f1':{ numBuckets:1, buckets:[{val:B, count:3}]} } "
+            "'f1':{ numBuckets:2, buckets:[{val:B, count:3}]} } "
     );
 
     // basic range facet
@@ -1136,7 +1136,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
                 ",f3:{${terms}  type:field, field:${num_i}, sort:'index asc' }" +
                 ",f4:{${terms}  type:field, field:${num_i}, sort:'index desc' }" +
                 ",f5:{${terms}  type:field, field:${num_i}, sort:'index desc', limit:1, missing:true, allBuckets:true, numBuckets:true }" +
-                ",f6:{${terms}  type:field, field:${num_i}, sort:'index desc', mincount:2, numBuckets:true }" +   // mincount should lower numbuckets
+                ",f6:{${terms}  type:field, field:${num_i}, sort:'index desc', mincount:2, numBuckets:true }" +   // mincount should not lower numbuckets (since SOLR-10552)
                 ",f7:{${terms}  type:field, field:${num_i}, sort:'index desc', offset:2, numBuckets:true }" +     // test offset
                 ",f8:{${terms}  type:field, field:${num_i}, sort:'index desc', offset:100, numBuckets:true }" +   // test high offset
                 ",f9:{${terms}  type:field, field:${num_i}, sort:'x desc', facet:{x:'avg(${num_d})'}, missing:true, allBuckets:true, numBuckets:true }" +            // test stats
@@ -1150,7 +1150,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
             ",f3:{ buckets:[{val:-5,count:2},{val:2,count:1},{val:3,count:1},{val:7,count:1} ] } " +
             ",f4:{ buckets:[{val:7,count:1},{val:3,count:1},{val:2,count:1},{val:-5,count:2} ] } " +
             ",f5:{ buckets:[{val:7,count:1}]   , numBuckets:4, allBuckets:{count:5}, missing:{count:1}  } " +
-            ",f6:{ buckets:[{val:-5,count:2}]  , numBuckets:1  } " +
+            ",f6:{ buckets:[{val:-5,count:2}]  , numBuckets:4  } " +
             ",f7:{ buckets:[{val:2,count:1},{val:-5,count:2}] , numBuckets:4 } " +
             ",f8:{ buckets:[] , numBuckets:4 } " +
             ",f9:{ buckets:[{val:7,count:1,x:11.0},{val:2,count:1,x:4.0},{val:3,count:1,x:2.0},{val:-5,count:2,x:-7.0} ],  numBuckets:4, allBuckets:{count:5,x:0.6},missing:{count:1,x:0.0} } " +  // TODO: should missing exclude "x" because no values were collected?