You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2018/11/05 18:56:39 UTC

[21/26] lucene-solr:jira/solr-12730: SOLR-12954: fix facet.pivot refinement bugs when using facet.sort=index and facet.mincount>1

SOLR-12954: fix facet.pivot refinement bugs when using facet.sort=index and facet.mincount>1


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c5ff4a44
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c5ff4a44
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c5ff4a44

Branch: refs/heads/jira/solr-12730
Commit: c5ff4a4444fe95001bc1baf29f64fb2406fd2ca3
Parents: 31d7dfe
Author: Chris Hostetter <ho...@apache.org>
Authored: Fri Nov 2 20:32:32 2018 -0700
Committer: Chris Hostetter <ho...@apache.org>
Committed: Fri Nov 2 20:32:32 2018 -0700

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  2 +
 .../solr/handler/component/FacetComponent.java  |  3 +-
 .../PivotFacetFieldValueCollection.java         | 34 ++++++--
 .../DistributedFacetPivotLargeTest.java         | 88 ++++++++++++++++++++
 4 files changed, 119 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c5ff4a44/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 6f8c449..90cce5b 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -233,6 +233,8 @@ Bug Fixes
 * SOLR-12875: fix ArrayIndexOutOfBoundsException when unique(field) or uniqueBlock(_root_) is 
 used with DVHASH method in json.facet. (Tim Underwood via Mikhail Khludnev)
 
+* SOLR-12954: fix facet.pivot refinement bugs when using facet.sort=index and facet.mincount>1 (hossman)
+
 Improvements
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c5ff4a44/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java
index 01f8875..e01c958 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/FacetComponent.java
@@ -650,7 +650,7 @@ public class FacetComponent extends SearchComponent {
       (fieldToOverRequest, FacetParams.FACET_SORT, defaultSort);
 
     int shardLimit = requestedLimit + offset;
-    int shardMinCount = requestedMinCount;
+    int shardMinCount = Math.min(requestedMinCount, 1);
 
     // per-shard mincount & overrequest
     if ( FacetParams.FACET_SORT_INDEX.equals(sort) && 
@@ -670,7 +670,6 @@ public class FacetComponent extends SearchComponent {
       if ( 0 < requestedLimit ) {
         shardLimit = doOverRequestMath(shardLimit, overRequestRatio, overRequestCount);
       }
-      shardMinCount = Math.min(requestedMinCount, 1);
     } 
     sreq.params.set(paramStart + FacetParams.FACET_LIMIT, shardLimit);
     sreq.params.set(paramStart + FacetParams.FACET_PIVOT_MINCOUNT, shardMinCount);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c5ff4a44/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java b/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java
index 5c2b07f..d9aeaea 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java
@@ -118,14 +118,36 @@ public class PivotFacetFieldValueCollection implements Iterable<PivotFacetValue>
    */
   public List<PivotFacetValue> getNextLevelValuesToRefine() {
     final int numRefinableValues = getExplicitValuesListSize();
-    if (facetFieldOffset < numRefinableValues) {
-      final int offsetPlusCount = (facetFieldLimit >= 0) 
-        ? Math.min(facetFieldLimit + facetFieldOffset, numRefinableValues) 
-        : numRefinableValues;
-      return getExplicitValuesList().subList(facetFieldOffset,  offsetPlusCount);
-    } else {
+    if (numRefinableValues < facetFieldOffset) {
       return Collections.<PivotFacetValue>emptyList();
     }
+    
+    final int offsetPlusCount = (facetFieldLimit >= 0) 
+      ? Math.min(facetFieldLimit + facetFieldOffset, numRefinableValues) 
+      : numRefinableValues;
+    
+    if (1 < facetFieldMinimumCount && facetFieldSort.equals(FacetParams.FACET_SORT_INDEX)) {
+      // we have to skip any values that (still) don't meet the mincount
+      //
+      // TODO: in theory we could avoid this extra check by trimming sooner (SOLR-6331)
+      // but since that's a destructive op that blows away the `valuesMap` which we (might?) still need
+      // (and pre-emptively skips the offsets) we're avoiding re-working that optimization
+      // for now until/unless someone gives it more careful thought...
+      final List<PivotFacetValue> results = new ArrayList<>(numRefinableValues);
+      for (PivotFacetValue pivotValue : explicitValues) {
+        if (pivotValue.getCount() >= facetFieldMinimumCount) {
+          results.add(pivotValue);
+          if (numRefinableValues <= results.size()) {
+            break;
+          }
+        }
+      }
+      return results;
+    }
+    
+    // in the non "sort==count OR mincount==1" situation, we can just return the first N values
+    // because any viable candidate is already in the top N
+    return getExplicitValuesList().subList(facetFieldOffset,  offsetPlusCount);
   }
   
   /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c5ff4a44/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java
index eb6f54d..cc31135 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedFacetPivotLargeTest.java
@@ -17,6 +17,7 @@
 package org.apache.solr.handler.component;
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
 
@@ -264,6 +265,93 @@ public class DistributedFacetPivotLargeTest extends BaseDistributedSearchTestCas
       }
     }
 
+    // check of a single level pivot using sort=index w/mincount big enough
+    // to triggers per-shard mincount > num docs on one shard
+    // (beefed up test of same with nested pivot below)
+    for (int limit : Arrays.asList(4, 444444, -1)) {
+      SolrParams p = params("q", "*:*",
+                            "rows", "0",
+                            // skip place_s:Nplaceholder buckets
+                            "fq","-hiredate_dt:\"2012-10-01T12:30:00Z\"", 
+                            // skip company_t:compHolderN buckets from twoShard
+                            "fq","-(+company_t:compHolder* +real_b:true)",
+                            "facet","true",
+                            "facet.pivot","place_s",
+                            FacetParams.FACET_PIVOT_MINCOUNT, "50",
+                            FacetParams.FACET_LIMIT, ""+limit,
+                            "facet.sort", "index");
+      rsp = null;
+      try {
+        rsp = query( p );
+        assertPivot("place_s", "cardiff", 107, rsp.getFacetPivot().get("place_s").get(0));
+        // - zeroShard  = 50 ... above per-shard min of 50/(numShards=4)
+        // - oneShard   =  5 ... below per-shard min of 50/(numShards=4) .. should be refined
+        // - twoShard   = 52 ... above per-shard min of 50/(numShards=4)
+        // = threeShard =  0 ... should be refined and still match nothing
+      } catch (AssertionError ae) {
+        throw new AssertionError(ae.getMessage() + ": " + p.toString() + " ==> " + rsp, ae);
+      }
+    }
+    
+    // test permutations of mincount & limit with sort=index
+    // (there is a per-shard optimization on mincount when sort=index is used)
+    for (int limit : Arrays.asList(4, 444444, -1)) {
+      SolrParams p = params("q", "*:*",
+                            "rows", "0",
+                            // skip place_s:Nplaceholder buckets
+                            "fq","-hiredate_dt:\"2012-10-01T12:30:00Z\"", 
+                            // skip company_t:compHolderN buckets from twoShard
+                            "fq","-(+company_t:compHolder* +real_b:true)",
+                            "facet","true",
+                            "facet.pivot","place_s,company_t",
+                            FacetParams.FACET_PIVOT_MINCOUNT, "50",
+                            FacetParams.FACET_LIMIT, ""+limit,
+                            "facet.sort", "index");
+      rsp = null;
+      try {
+        rsp = query( p );
+        pivots = rsp.getFacetPivot().get("place_s,company_t");
+        firstPlace = pivots.get(0);
+        assertPivot("place_s", "cardiff", 107, firstPlace);
+        //
+        assertPivot("company_t", "bbc",      101, firstPlace.getPivot().get(0)); 
+        assertPivot("company_t", "honda",     50, firstPlace.getPivot().get(1)); 
+        assertPivot("company_t", "microsoft", 56, firstPlace.getPivot().get(2)); 
+        assertPivot("company_t", "polecat",   52, firstPlace.getPivot().get(3)); 
+      } catch (AssertionError ae) {
+        throw new AssertionError(ae.getMessage() + ": " + p.toString() + " ==> " + rsp, ae);
+      }
+    }
+
+    { // similar to the test above, but now force a restriction on the over request and allow
+      // terms that are early in index sort -- but don't meet the mincount overall -- to be considered
+      // in the first phase. (SOLR-12954)
+      SolrParams p = params("q", "*:*",
+                            "rows", "0",
+                            // skip company_t:compHolderN buckets from twoShard
+                            "fq","-(+company_t:compHolder* +real_b:true)",
+                            "facet","true",
+                            "facet.pivot","place_s,company_t",
+                            // the (50) Nplaceholder place_s values exist in 6 each on oneShard
+                            FacetParams.FACET_PIVOT_MINCOUNT, ""+(6 * shardsArr.length),
+                            FacetParams.FACET_LIMIT, "4",
+                            "facet.sort", "index");
+      rsp = null;
+      try {
+        rsp = query( p ); 
+        pivots = rsp.getFacetPivot().get("place_s,company_t");
+        firstPlace = pivots.get(0);
+        assertPivot("place_s", "cardiff", 107, firstPlace);
+        //
+        assertPivot("company_t", "bbc",      101, firstPlace.getPivot().get(0)); 
+        assertPivot("company_t", "honda",     50, firstPlace.getPivot().get(1)); 
+        assertPivot("company_t", "microsoft", 56, firstPlace.getPivot().get(2)); 
+        assertPivot("company_t", "polecat",   52, firstPlace.getPivot().get(3)); 
+      } catch (AssertionError ae) {
+        throw new AssertionError(ae.getMessage() + ": " + p.toString() + " ==> " + rsp, ae);
+      }
+    }
+    
     // Pivot Faceting (combined wtih Field Faceting)
     for (SolrParams facetParams : 
            // with and w/o an excluded fq