You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2013/08/15 19:50:30 UTC
svn commit: r1514402 - in /lucene/dev/trunk/solr: ./ core/src/java/org/apache/solr/search/ core/src/java/org/apache/solr/spelling/ core/src/test/org/apache/solr/spelling/

Author: hossman
Date: Thu Aug 15 17:50:30 2013
New Revision: 1514402

URL: http://svn.apache.org/r1514402
Log:
SOLR-5122: Fixed bug in spellcheck.collateMaxCollectDocs.  Eliminates risk of divide by zero, and makes estimated hit counts meaningful in non-optimized indexes.

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollector.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollectorException.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1514402&r1=1514401&r2=1514402&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Aug 15 17:50:30 2013
@@ -126,6 +126,10 @@ Bug Fixes
 * SOLR-4764: When using NRT, just init the first reader from IndexWriter.
   (Robert Muir, Mark Miller)
   
+* SOLR-5122: Fixed bug in spellcheck.collateMaxCollectDocs.  Eliminates risk 
+  of divide by zero, and makes estimated hit counts meaningful in non-optimized
+  indexes.  (hossman)
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollector.java?rev=1514402&r1=1514401&r2=1514402&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollector.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollector.java Thu Aug 15 17:50:30 2013
@@ -29,11 +29,14 @@ import org.apache.lucene.search.Scorer;
  * </p>
  */
 public class EarlyTerminatingCollector extends Collector {
-  private int numCollected;
-  private int lastDocId = -1;
-  private int maxDocsToCollect;
-  private Collector delegate;
-  
+
+  private final int maxDocsToCollect;
+  private final Collector delegate;
+
+  private int numCollected = 0;
+  private int prevReaderCumulativeSize = 0;
+  private int currentReaderSize = 0;  
+
   /**
    * <p>
    *  Wraps a {@link Collector}, throwing {@link EarlyTerminatingCollectorException} 
@@ -44,42 +47,40 @@ public class EarlyTerminatingCollector e
    * 
    */
   public EarlyTerminatingCollector(Collector delegate, int maxDocsToCollect) {
+    assert 0 < maxDocsToCollect;
+    assert null != delegate;
+
     this.delegate = delegate;
     this.maxDocsToCollect = maxDocsToCollect;
   }
 
+  /**
+   * This collector requires that docs be collected in order, otherwise
+   * the computed number of scanned docs in the resulting 
+   * {@link EarlyTerminatingCollectorException} will be meaningless.
+   */
   @Override
   public boolean acceptsDocsOutOfOrder() {
-    return delegate.acceptsDocsOutOfOrder();
+    return false;
   }
 
   @Override
   public void collect(int doc) throws IOException {
     delegate.collect(doc);
-    lastDocId = doc;    
     numCollected++;  
-    if(numCollected==maxDocsToCollect) {
-      throw new EarlyTerminatingCollectorException(numCollected, lastDocId);
+    if(maxDocsToCollect <= numCollected) {
+      throw new EarlyTerminatingCollectorException
+        (numCollected, prevReaderCumulativeSize + (doc + 1));
     }
   }
   @Override
   public void setNextReader(AtomicReaderContext context) throws IOException {
-    delegate.setNextReader(context);    
+    prevReaderCumulativeSize += currentReaderSize; // not current any more
+    currentReaderSize = context.reader().maxDoc() - 1;
+    delegate.setNextReader(context);
   }
   @Override
   public void setScorer(Scorer scorer) throws IOException {
     delegate.setScorer(scorer);    
   }
-  public int getNumCollected() {
-    return numCollected;
-  }
-  public void setNumCollected(int numCollected) {
-    this.numCollected = numCollected;
-  }
-  public int getLastDocId() {
-    return lastDocId;
-  }
-  public void setLastDocId(int lastDocId) {
-    this.lastDocId = lastDocId;
-  }
 }

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollectorException.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollectorException.java?rev=1514402&r1=1514401&r2=1514402&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollectorException.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/EarlyTerminatingCollectorException.java Thu Aug 15 17:50:30 2013
@@ -23,23 +23,40 @@ package org.apache.solr.search;
  */
 public class EarlyTerminatingCollectorException extends RuntimeException {
   private static final long serialVersionUID = 5939241340763428118L;  
-  private int lastDocId = -1;
+  private int numberScanned;
   private int numberCollected;
   
-  public EarlyTerminatingCollectorException(int numberCollected, int lastDocId) {
+  public EarlyTerminatingCollectorException(int numberCollected, int numberScanned) {
+    assert numberCollected <= numberScanned : numberCollected+"<="+numberScanned;
+    assert 0 < numberCollected;
+    assert 0 < numberScanned;
+
     this.numberCollected = numberCollected;
-    this.lastDocId = lastDocId;
+    this.numberScanned = numberScanned;
   }
-  public int getLastDocId() {
-    return lastDocId;
-  }
-  public void setLastDocId(int lastDocId) {
-    this.lastDocId = lastDocId;
+  /**
+   * The total number of documents in the index that were "scanned" by 
+   * the index when collecting the {@see #getNumberCollected()} documents 
+   * that triggered this exception.
+   * <p>
+   * This number represents the sum of:
+   * </p>
+   * <ul>
+   *  <li>The total number of documents in all AtomicReaders
+   *      that were fully exhausted during collection
+   *  </li>
+   *  <li>The id of the last doc collected in the last AtomicReader
+   *      consulted during collection.
+   *  </li>
+   * </ul>
+   **/
+  public int getNumberScanned() {
+    return numberScanned;
   }
+  /**
+   * The number of documents collected that resulted in early termination
+   */
   public int getNumberCollected() {
     return numberCollected;
   }
-  public void setNumberCollected(int numberCollected) {
-    this.numberCollected = numberCollected;
-  }
 }

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java?rev=1514402&r1=1514401&r2=1514402&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellCheckCollator.java Thu Aug 15 17:50:30 2013
@@ -147,10 +147,14 @@ public class SpellCheckCollator {
           hits = (Integer) checkResponse.rsp.getToLog().get("hits");
         } catch (EarlyTerminatingCollectorException etce) {
           assert (docCollectionLimit > 0);
-          if (etce.getLastDocId() + 1 == maxDocId) {
-            hits = docCollectionLimit;
+          assert 0 < etce.getNumberScanned();
+          assert 0 < etce.getNumberCollected();
+
+          if (etce.getNumberScanned() == maxDocId) {
+            hits = etce.getNumberCollected();
           } else {
-            hits = maxDocId / ((etce.getLastDocId() + 1) / docCollectionLimit);
+            hits = (int) ( ((float)( maxDocId * etce.getNumberCollected() )) 
+                           / (float)etce.getNumberScanned() );
           }
         } catch (Exception e) {
           LOG.warn("Exception trying to re-query to check if a spell check possibility would return any hits.", e);

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java?rev=1514402&r1=1514401&r2=1514402&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java Thu Aug 15 17:50:30 2013
@@ -21,7 +21,9 @@ import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.util.LuceneTestCase.Slow;
+import org.apache.lucene.util._TestUtil;
 import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.GroupParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
@@ -44,24 +46,52 @@ public class SpellCheckCollatorTest exte
   @BeforeClass
   public static void beforeClass() throws Exception {
     initCore("solrconfig-spellcheckcomponent.xml", "schema.xml");
-    assertNull(h.validateUpdate(adoc("id", "0", "lowerfilt", "faith hope and love", "teststop", "metanoia")));
-    assertNull(h.validateUpdate(adoc("id", "1", "lowerfilt", "faith hope and loaves")));
-    assertNull(h.validateUpdate(adoc("id", "2", "lowerfilt", "fat hops and loaves")));
-    assertNull(h.validateUpdate(adoc("id", "3", "lowerfilt", "faith of homer", "teststop", "metanoia")));
-    assertNull(h.validateUpdate(adoc("id", "4", "lowerfilt", "fat of homer")));
-    assertNull(h.validateUpdate(adoc("id", "5", "lowerfilt1", "peace")));
-    assertNull(h.validateUpdate(adoc("id", "6", "lowerfilt", "hyphenated word")));
-    assertNull(h.validateUpdate(adoc("id", "7", "teststop", "Jane filled out a form at Charles De Gaulle")));
-    assertNull(h.validateUpdate(adoc("id", "8", "teststop", "Dick flew from Heathrow")));
-    assertNull(h.validateUpdate(adoc("id", "9", "teststop", "Jane is stuck in customs because Spot chewed up the form")));
-    assertNull(h.validateUpdate(adoc("id", "10", "teststop", "Once in Paris Dick built a fire on the hearth")));
-    assertNull(h.validateUpdate(adoc("id", "11", "teststop", "Dick waited for Jane as he watched the sparks flow upward")));
-    assertNull(h.validateUpdate(adoc("id", "12", "teststop", "This June parisian rendez-vous is ruined because of a customs snafu")));
-    assertNull(h.validateUpdate(adoc("id", "13", "teststop", "partisan political machine", "teststop", "metanoia")));
-    assertNull(h.validateUpdate(adoc("id", "14", "teststop", "metanoia")));
-    assertNull(h.validateUpdate(adoc("id", "15", "teststop", "metanoia")));
-    assertNull(h.validateUpdate(adoc("id", "16", "teststop", "metanoia")));
-    assertNull(h.validateUpdate(commit()));
+    assertU(adoc("id", "0", 
+                 "lowerfilt", "faith hope and love", 
+                 "teststop", "metanoia"));
+    assertU(adoc("id", "1", 
+                 "lowerfilt", "faith hope and loaves",
+                 "teststop", "everyother"));
+    assertU(adoc("id", "2", 
+                 "lowerfilt", "fat hops and loaves"));
+    assertU(adoc("id", "3", 
+                 "lowerfilt", "faith of homer", 
+                 "teststop", "metanoia",
+                 "teststop", "everyother"));
+    assertU(adoc("id", "4", 
+                 "lowerfilt", "fat of homer"));
+    assertU(adoc("id", "5", 
+                 "lowerfilt1", "peace",
+                 "teststop", "everyother"));
+    assertU(adoc("id", "6", 
+                 "lowerfilt", "hyphenated word"));
+    assertU(adoc("id", "7", 
+                 "teststop", "Jane filled out a form at Charles De Gaulle",
+                 "teststop", "everyother"));
+    assertU(adoc("id", "8", 
+                 "teststop", "Dick flew from Heathrow"));
+    assertU(adoc("id", "9", 
+                 "teststop", "Jane is stuck in customs because Spot chewed up the form",
+                 "teststop", "everyother"));
+    assertU(adoc("id", "10", 
+                 "teststop", "Once in Paris Dick built a fire on the hearth"));
+    assertU(adoc("id", "11", 
+                 "teststop", "Dick waited for Jane as he watched the sparks flow upward",
+                 "teststop", "everyother"));
+    assertU(adoc("id", "12", 
+                 "teststop", "This June parisian rendez-vous is ruined because of a customs snafu"));
+    assertU(adoc("id", "13", 
+                 "teststop", "partisan political machine", 
+                 "teststop", "metanoia",
+                 "teststop", "everyother"));
+    assertU(adoc("id", "14", 
+                 "teststop", "metanoia"));
+    assertU(adoc("id", "15", 
+                 "teststop", "metanoia",
+                 "teststop", "everyother"));
+    assertU(adoc("id", "16", 
+                 "teststop", "metanoia"));
+    assertU(commit());
   }
 
   @Test
@@ -436,71 +466,66 @@ public class SpellCheckCollatorTest exte
   }
 
   @Test
-  @Ignore("SOLR-5122: estimates don't seem to make any sense")
   public void testEstimatedHitCounts() throws Exception {
-   assertQ(
-        req(
-          SpellCheckComponent.COMPONENT_NAME, "true",
-          SpellCheckComponent.SPELLCHECK_DICT, "direct",
-          SpellingParams.SPELLCHECK_COUNT, "1",   
-          SpellingParams.SPELLCHECK_COLLATE, "true",
-          SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "1",
-          SpellingParams.SPELLCHECK_MAX_COLLATIONS, "1",
-          SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true",          
-          "qt", "spellCheckCompRH",          
-          CommonParams.Q, "teststop:metnoia"
-        ),
-        "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/str[@name='collationQuery']='teststop:metanoia'",
-        "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/int[@name='hits']=6"        
-      );
-    assertQ(
-        req(
-          SpellCheckComponent.COMPONENT_NAME, "true",
-          SpellCheckComponent.SPELLCHECK_DICT, "direct",
-          SpellingParams.SPELLCHECK_COUNT, "1",   
-          SpellingParams.SPELLCHECK_COLLATE, "true",
-          SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "1",
-          SpellingParams.SPELLCHECK_MAX_COLLATIONS, "1",
-          SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true",
-          SpellingParams.SPELLCHECK_COLLATE_MAX_COLLECT_DOCS, "1",
-          "qt", "spellCheckCompRH",          
-          CommonParams.Q, "teststop:metnoia"
-        ),
-        "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/str[@name='collationQuery']='teststop:metanoia'",
-        "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/int[@name='hits']=17"        
-      );
-    assertQ(
-        req(
-          SpellCheckComponent.COMPONENT_NAME, "true",
-          SpellCheckComponent.SPELLCHECK_DICT, "direct",
-          SpellingParams.SPELLCHECK_COUNT, "1",   
-          SpellingParams.SPELLCHECK_COLLATE, "true",
-          SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "1",
-          SpellingParams.SPELLCHECK_MAX_COLLATIONS, "1",
-          SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true",
-          SpellingParams.SPELLCHECK_COLLATE_MAX_COLLECT_DOCS, "3",
-          "qt", "spellCheckCompRH",          
-          CommonParams.Q, "teststop:metnoia"
-        ),
-        "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/str[@name='collationQuery']='teststop:metanoia'",
-        "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/int[@name='hits']=4"        
-      );
-    assertQ(
-        req(
-          SpellCheckComponent.COMPONENT_NAME, "true",
-          SpellCheckComponent.SPELLCHECK_DICT, "direct",
-          SpellingParams.SPELLCHECK_COUNT, "1",   
-          SpellingParams.SPELLCHECK_COLLATE, "true",
-          SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "1",
-          SpellingParams.SPELLCHECK_MAX_COLLATIONS, "1",
-          SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true",
-          SpellingParams.SPELLCHECK_COLLATE_MAX_COLLECT_DOCS, "100",
-          "qt", "spellCheckCompRH",          
-          CommonParams.Q, "teststop:metnoia"
-        ),
-        "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/str[@name='collationQuery']='teststop:metanoia'",
-        "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/int[@name='hits']=6"        
-      );
+    final String xpathPrefix = 
+      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/";
+    final SolrParams reusedParams = params
+      (SpellCheckComponent.COMPONENT_NAME, "true",
+       SpellCheckComponent.SPELLCHECK_DICT, "direct",
+       SpellingParams.SPELLCHECK_COUNT, "1",   
+       SpellingParams.SPELLCHECK_COLLATE, "true",
+       SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "1",
+       SpellingParams.SPELLCHECK_MAX_COLLATIONS, "1",
+       SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true",          
+       "qt", "spellCheckCompRH");       
+
+    // default case, no SPELLCHECK_COLLATE_MAX_COLLECT_DOCS should be exact num hits
+    assertQ(req(reusedParams, 
+                CommonParams.Q, "teststop:metnoia")
+            , xpathPrefix + "str[@name='collationQuery']='teststop:metanoia'"
+            , xpathPrefix + "int[@name='hits']=6"        
+            );
+
+    // specifying 0 means "exact" same as default, but specifing a value greater 
+    // then the total number of docs in the index should also result in it
+    // "estimating" and getting exact number as well.
+    for (String val : new String[] { "0", "30", "100", "10000" }) {
+      assertQ(req(reusedParams,
+                  CommonParams.Q, "teststop:metnoia",
+                  SpellingParams.SPELLCHECK_COLLATE_MAX_COLLECT_DOCS, val)
+              , xpathPrefix + "str[@name='collationQuery']='teststop:metanoia'"
+              , xpathPrefix + "int[@name='hits']=6"        
+              );
+    }
+
+    // values between 0 and the num docs in the index should not error, and should 
+    // produce an estimate no more then the total number of docs
+    final int iters = atLeast(10);
+    for (int iter = 0; iter < iters; iter++) {
+      final int val = _TestUtil.nextInt(random(), 1, 17);
+      assertQ(req(reusedParams,
+                  CommonParams.Q, "teststop:metnoia",
+                  SpellingParams.SPELLCHECK_COLLATE_MAX_COLLECT_DOCS, ""+val)
+              , xpathPrefix + "str[@name='collationQuery']='teststop:metanoia'"
+              , xpathPrefix + "int[@name='hits' and . <= 17 and 0 < .]"        
+              );
+    }
+
+    // "everYother" appears in every other doc in the index, so "everother" 
+    // should produce a "decent" aproximation of the number of hits (8)
+    // for any 5 <= SPELLCHECK_COLLATE_MAX_COLLECT_DOCS
+    //
+    // (we have to be kind of flexible with our definition of "decent"
+    // since we're dealing with a fairly small index here)
+    for (int val = 5; val <= 20; val++) {
+      assertQ(req(reusedParams,
+                  CommonParams.Q, "teststop:everother",
+                  SpellingParams.SPELLCHECK_COLLATE_MAX_COLLECT_DOCS, ""+val)
+              , xpathPrefix + "str[@name='collationQuery']='teststop:everyother'"
+              , xpathPrefix + "int[@name='hits' and 6 <= . and . <= 10]"        
+              );
+    }
+
   }  
   
 }