You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2011/04/18 00:24:45 UTC
svn commit: r1094214 - in /lucene/dev/trunk/solr: CHANGES.txt src/java/org/apache/solr/handler/component/FacetComponent.java src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java src/test/org/apache/solr/TestDistributedSearch.java

Author: yonik
Date: Sun Apr 17 22:24:45 2011
New Revision: 1094214

URL: http://svn.apache.org/viewvc?rev=1094214&view=rev
Log:
SOLR-2403: fix facet.mincount>0 && facet.sort=index errors, add some algorithmic improvements, introduce more random testing for distrib faceted testing

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java
    lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1094214&r1=1094213&r2=1094214&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Sun Apr 17 22:24:45 2011
@@ -270,7 +270,12 @@ Bug Fixes
 * SOLR-2409: edismax parser - treat the text of a fielded query as a literal if the
   fieldname does not exist.  For example Mission: Impossible should not search on
   the "Mission" field unless it's a valid field in the schema.  (Ryan McKinley, yonik)
-  
+
+* SOLR-2403: facet.sort=index reported incorrect results for distributed search
+  in a number of scenarios when facet.mincount>0.  This patch also adds some
+  performance/algorithmic improvements when (facet.sort=count && facet.mincount=1
+  && facet.limit=-1) and when (facet.sort=index && facet.mincount>0)  (yonik)
+
 
 
 Other Changes

Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java?rev=1094214&r1=1094213&r2=1094214&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java Sun Apr 17 22:24:45 2011
@@ -222,11 +222,37 @@ public class FacetComponent extends Sear
           sreq.params.remove(paramStart + FacetParams.FACET_MINCOUNT);
           sreq.params.remove(paramStart + FacetParams.FACET_OFFSET);
 
-          dff.initialLimit = dff.offset + dff.limit;
+          dff.initialLimit = dff.limit <= 0 ? dff.limit : dff.offset + dff.limit;
 
-          if(dff.sort.equals(FacetParams.FACET_SORT_COUNT) && dff.limit > 0) {
-            // set the initial limit higher to increase accuracy
-            dff.initialLimit = (int)(dff.initialLimit * 1.5) + 10;
+          if (dff.sort.equals(FacetParams.FACET_SORT_COUNT)) {
+            if (dff.limit > 0) {
+              // set the initial limit higher to increase accuracy
+              dff.initialLimit = (int)(dff.initialLimit * 1.5) + 10;
+              dff.initialMincount = 0;      // TODO: we could change this to 1, but would then need more refinement for small facet result sets?
+            } else {
+              // if limit==-1, then no need to artificially lower mincount to 0 if it's 1
+              dff.initialMincount = Math.min(dff.minCount, 1);
+            }
+          } else {
+            // we're sorting by index order.
+            // if minCount==0, we should always be able to get accurate results w/o over-requesting or refining
+            // if minCount==1, we should be able to get accurate results w/o over-requesting, but we'll need to refine
+            // if minCount==n (>1), we can set the initialMincount to minCount/nShards, rounded up.
+            // For example, we know that if minCount=10 and we have 3 shards, then at least one shard must have a count of 4 for the term
+            // For the minCount>1 case, we can generate too short of a list (miss terms at the end of the list) unless limit==-1
+            // For example: each shard could produce a list of top 10, but some of those could fail to make it into the combined list (i.e.
+            //   we needed to go beyond the top 10 to generate the top 10 combined).  Overrequesting can help a little here, but not as
+            //   much as when sorting by count.
+            if (dff.minCount <= 1) {
+              dff.initialMincount = dff.minCount;
+            } else {
+              dff.initialMincount = (int)Math.ceil((double)dff.minCount / rb.slices.length);
+              // dff.initialMincount = 1;
+            }
+          }
+
+          if (dff.initialMincount != 0) {
+            sreq.params.set(paramStart + FacetParams.FACET_MINCOUNT, dff.initialMincount);
           }
 
           // Currently this is for testing only and allows overriding of the
@@ -296,15 +322,18 @@ public class FacetComponent extends Sear
     //
 
     for (DistribFieldFacet dff : fi.facets.values()) {
-      if (dff.limit <= 0) continue; // no need to check these facets for refinement
-      if (dff.minCount <= 1 && dff.sort.equals(FacetParams.FACET_SORT_INDEX)) continue;
+       // no need to check these facets for refinement
+      if (dff.initialLimit <= 0 && dff.initialMincount == 0) continue;
+
+      // only other case where index-sort doesn't need refinement is if minCount==0
+      if (dff.minCount == 0 && dff.sort.equals(FacetParams.FACET_SORT_INDEX)) continue;
 
-      @SuppressWarnings("unchecked") // generic array's are anoying
+      @SuppressWarnings("unchecked") // generic array's are annoying
       List<String>[] tmp = (List<String>[]) new List[rb.shards.length];
       dff._toRefine = tmp;
 
       ShardFacetCount[] counts = dff.getCountSorted();
-      int ntop = Math.min(counts.length, dff.offset + dff.limit);
+      int ntop = Math.min(counts.length, dff.limit >= 0 ? dff.offset + dff.limit : Integer.MAX_VALUE);
       long smallestCount = counts.length == 0 ? 0 : counts[ntop-1].count;
 
       for (int i=0; i<counts.length; i++) {
@@ -313,8 +342,11 @@ public class FacetComponent extends Sear
 
         if (i<ntop) {
           // automatically flag the top values for refinement
+          // this should always be true for facet.sort=index
           needRefinement = true;
         } else {
+          // this logic should only be invoked for facet.sort=index (for now)
+
           // calculate the maximum value that this term may have
           // and if it is >= smallestCount, then flag for refinement
           long maxCount = sfc.count;
@@ -422,13 +454,32 @@ public class FacetComponent extends Sear
           counts = dff.getLexSorted();
       }
 
-      int end = dff.limit < 0 ? counts.length : Math.min(dff.offset + dff.limit, counts.length);
-      for (int i=dff.offset; i<end; i++) {
-        if (counts[i].count < dff.minCount) {
-          if (countSorted) break;  // if sorted by count, we can break out of loop early
-          else continue;
+      if (countSorted) {
+        int end = dff.limit < 0 ? counts.length : Math.min(dff.offset + dff.limit, counts.length);
+        for (int i=dff.offset; i<end; i++) {
+          if (counts[i].count < dff.minCount) {
+            break;
+          }
+          fieldCounts.add(counts[i].name, num(counts[i].count));
+        }
+      } else {
+        int off = dff.offset;
+        int lim = dff.limit >= 0 ? dff.limit : Integer.MAX_VALUE;
+
+        // index order...
+        for (int i=0; i<counts.length; i++) {
+          long count = counts[i].count;
+          if (count < dff.minCount) continue;
+          if (off > 0) {
+            off--;
+            continue;
+          }
+          if (lim <= 0) {
+            break;
+          }
+          lim--;
+          fieldCounts.add(counts[i].name, num(count));
         }
-        fieldCounts.add(counts[i].name, num(counts[i].count));
       }
 
       if (dff.missing) {
@@ -631,7 +682,8 @@ public class FacetComponent extends Sear
     public HashMap<String,ShardFacetCount> counts = new HashMap<String,ShardFacetCount>(128);
     public int termNum;
 
-    public int initialLimit;  // how many terms requested in first phase
+    public int initialLimit;     // how many terms requested in first phase
+    public int initialMincount;  // mincount param sent to each shard
     public boolean needRefinements;
     public ShardFacetCount[] countSorted;
 
@@ -671,11 +723,10 @@ public class FacetComponent extends Sear
         }
       }
 
-      // the largest possible missing term is 0 if we received less
-      // than the number requested (provided mincount==0 like it should be for
-      // a shard request)
+      // the largest possible missing term is initialMincount if we received less
+      // than the number requested.
       if (numRequested<0 || numRequested != 0 && numReceived < numRequested) {
-        last = 0;
+        last = initialMincount;
       }
 
       missingMaxPossible += last;

Modified: lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java?rev=1094214&r1=1094213&r2=1094214&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java (original)
+++ lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java Sun Apr 17 22:24:45 2011
@@ -323,12 +323,15 @@ public abstract class BaseDistributedSea
 
   protected void query(Object... q) throws Exception {
     final ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add("reqid",Integer.toString(random.nextInt())); // just to help correlate top-level requests w/ sub requests
 
     for (int i = 0; i < q.length; i += 2) {
       params.add(q[i].toString(), q[i + 1].toString());
     }
 
+    params.add("controlClient","true"); // just to enable easier sorting through log files
     final QueryResponse controlRsp = controlClient.query(params);
+    params.remove("controlClient");
 
     setDistributedParams(params);
 
@@ -418,7 +421,7 @@ public abstract class BaseDistributedSea
           break;
         }
         if (ordered) {
-          return "." + namea + "!=" + nameb + " (unordered or missing)";
+          return err("." + namea + "!=" + nameb + " (unordered or missing)");
         }
         // if unordered, continue until we find the right field.
       }
@@ -432,7 +435,7 @@ public abstract class BaseDistributedSea
 
 
     if (a.size() - aSkipped != b.size() - bSkipped) {
-      return ".size()==" + a.size() + "," + b.size() + "skipped=" + aSkipped + "," + bSkipped;
+      return err(".size()==" + a.size() + "," + b.size() + "skipped=" + aSkipped + "," + bSkipped);
     }
 
     return null;
@@ -446,7 +449,7 @@ public abstract class BaseDistributedSea
       int flagsa = flags(handle, keya);
       if ((flagsa & SKIP) != 0) continue;
       if (!b.containsKey(keya)) {
-        return "[" + keya + "]==null";
+        return err("[" + keya + "]==null");
       }
       if ((flagsa & SKIPVAL) != 0) continue;
       Object valb = b.get(keya);
@@ -478,7 +481,7 @@ public abstract class BaseDistributedSea
     } else {
       if (b.getMaxScore() != null) {
         if (a.getMaxScore() == null) {
-          return ".maxScore missing";
+          return err(".maxScore missing");
         }
       }
     }
@@ -524,7 +527,7 @@ public abstract class BaseDistributedSea
 
   public static String compare(Object[] a, Object[] b, int flags, Map<String, Integer> handle) {
     if (a.length != b.length) {
-      return ".length:" + a.length + "!=" + b.length;
+      return err(".length:" + a.length + "!=" + b.length);
     }
     for (int i = 0; i < a.length; i++) {
       String cmp = compare(a[i], b[i], flags, handle);
@@ -535,7 +538,7 @@ public abstract class BaseDistributedSea
 
   public static String compare(Object a, Object b, int flags, Map<String, Integer> handle) {
     if (a == b) return null;
-    if (a == null || b == null) return ":" + a + "!=" + b;
+    if (a == null || b == null) return err(":" + a + "!=" + b);
 
     if (a instanceof NamedList && b instanceof NamedList) {
       return compare((NamedList) a, (NamedList) b, flags, handle);
@@ -559,7 +562,7 @@ public abstract class BaseDistributedSea
 
     if (a instanceof byte[] && b instanceof byte[]) {
       if (!Arrays.equals((byte[]) a, (byte[]) b)) {
-        return ":" + a + "!=" + b;
+        return err(":" + a + "!=" + b);
       }
       return null;
     }
@@ -570,12 +573,19 @@ public abstract class BaseDistributedSea
     }
 
     if (!(a.equals(b))) {
-      return ":" + a + "!=" + b;
+      return err(":" + a + "!=" + b);
     }
 
     return null;
   }
 
+  /** This method is called for root level comparison errors and can be helpful to set a
+   * breakpoint in to debug comparison failures.
+   */
+  public static String err(String msg) {
+    return msg;
+  }
+
   protected void compareResponses(QueryResponse a, QueryResponse b) {
     String cmp;
     cmp = compare(a.getResponse(), b.getResponse(), flags, handle);

Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java?rev=1094214&r1=1094213&r2=1094214&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java Sun Apr 17 22:24:45 2011
@@ -48,6 +48,9 @@ public class TestDistributedSearch exten
 
   @Override
   public void doTest() throws Exception {
+    int backupStress = stress; // make a copy so we can restore
+
+
     del("*:*");
     indexr(id,1, i1, 100, tlong, 100,t1,"now is the time for all good men"
             ,"foo_f", 1.414f, "foo_b", "true", "foo_d", 1.414d);
@@ -132,23 +135,36 @@ public class TestDistributedSearch exten
     // then the primary sort should always be a tie and then the secondary should always decide
     query("q","{!func}ms(NOW)", "sort","score desc,"+i1+" desc","fl","id");    
 
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1);
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.limit",-1, "facet.sort","count");
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.limit",-1, "facet.sort","count", "facet.mincount",2);
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.limit",-1, "facet.sort","index");
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.limit",-1, "facet.sort","index", "facet.mincount",2);
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.offset",10, "facet.limit",1, "facet.sort","index");
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1,"facet.limit",1);
-    query("q","*:*", "rows",100, "facet","true", "facet.query","quick", "facet.query","all", "facet.query","*:*");
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.offset",1);
-    query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.mincount",2);
+    query("q","*:*", "rows",0, "facet","true", "facet.field",t1);
+    query("q","*:*", "rows",0, "facet","true", "facet.field",t1,"facet.limit",1);
+    query("q","*:*", "rows",0, "facet","true", "facet.query","quick", "facet.query","all", "facet.query","*:*");
+    query("q","*:*", "rows",0, "facet","true", "facet.field",t1, "facet.mincount",2);
+
+    stress=0;  // turn off stress... we want to tex max combos in min time
+    for (int i=0; i<25*RANDOM_MULTIPLIER; i++) {
+      String f = fieldNames[random.nextInt(fieldNames.length)];
+      if (random.nextBoolean()) f = t1;  // the text field is a really interesting one to facet on (and it's multi-valued too)
+
+      // we want a random query and not just *:* so we'll get zero counts in facets also
+      // TODO: do a better random query
+      String q = random.nextBoolean() ? "*:*" : "id:(1 3 5 7 9 11 13) OR id:[100 TO " + random.nextInt(50) + "]";
+
+      int nolimit = random.nextBoolean() ? -1 : 10000;  // these should be equivalent
+
+      // if limit==-1, we should always get exact matches
+      query("q",q, "rows",0, "facet","true", "facet.field",f, "facet.limit",nolimit, "facet.sort","count", "facet.mincount",random.nextInt(5), "facet.offset",random.nextInt(10));
+      query("q",q, "rows",0, "facet","true", "facet.field",f, "facet.limit",nolimit, "facet.sort","index", "facet.mincount",random.nextInt(5), "facet.offset",random.nextInt(10));
+      // for index sort, we should get exact results for mincount <= 1
+      query("q",q, "rows",0, "facet","true", "facet.field",f, "facet.sort","index", "facet.mincount",random.nextInt(2), "facet.offset",random.nextInt(10), "facet.limit",random.nextInt(11)-1);
+    }
+    stress = backupStress;  // restore stress
 
     // test faceting multiple things at once
-    query("q","*:*", "rows",100, "facet","true", "facet.query","quick", "facet.query","all", "facet.query","*:*"
+    query("q","*:*", "rows",0, "facet","true", "facet.query","quick", "facet.query","all", "facet.query","*:*"
     ,"facet.field",t1);
 
     // test filter tagging, facet exclusion, and naming (multi-select facet support)
-    query("q","*:*", "rows",100, "facet","true", "facet.query","{!key=myquick}quick", "facet.query","{!key=myall ex=a}all", "facet.query","*:*"
+    query("q","*:*", "rows",0, "facet","true", "facet.query","{!key=myquick}quick", "facet.query","{!key=myall ex=a}all", "facet.query","*:*"
     ,"facet.field","{!key=mykey ex=a}"+t1
     ,"facet.field","{!key=other ex=b}"+t1
     ,"facet.field","{!key=again ex=a,b}"+t1