You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ji...@apache.org on 2019/01/29 07:58:37 UTC

[lucene-solr] branch master updated: LUCENE-8660: TopDocsCollectors now return an accurate count (instead of a lower bound) if the total hit count is equal to the provided threshold.

This is an automated email from the ASF dual-hosted git repository.

jimczi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new a269a4d  LUCENE-8660: TopDocsCollectors now return an accurate count (instead of a lower bound) if the total hit count is equal to the provided threshold.
a269a4d is described below

commit a269a4d1cb7889e5a69aa042316dffdf2a1050a5
Author: jimczi <ji...@apache.org>
AuthorDate: Mon Jan 28 22:43:20 2019 +0100

    LUCENE-8660: TopDocsCollectors now return an accurate count (instead of a lower bound) if the total hit count is equal to the provided threshold.
---
 lucene/CHANGES.txt                                 |  3 +++
 .../apache/lucene/search/TopFieldCollector.java    | 22 ++++++++---------
 .../apache/lucene/search/TopScoreDocCollector.java | 28 ++++++++++------------
 .../lucene/search/TestMatchAllDocsQuery.java       | 12 ++++++++--
 .../apache/lucene/search/TestTopDocsCollector.java |  6 ++---
 .../lucene/search/TestTopFieldCollector.java       | 12 +++++-----
 6 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4838cc4..57414c9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -216,6 +216,9 @@ Improvements
 
 * LUCENE-8279: CheckIndex now cross-checks terms with norms. (Adrien Grand)
 
+* LUCENE-8660: TopDocsCollectors now return an accurate count (instead of a lower bound)
+  if the total hit count is equal to the provided threshold. (Adrien Grand, Jim Ferenczi)
+
 Optimizations
 
 * LUCENE-8040: Optimize IndexSearcher.collectionStatistics, avoiding MultiFields/MultiTerms
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
index f3a2a3b..5981659 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
@@ -134,7 +134,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
               // this document is largest than anything else in the queue, and
               // therefore not competitive.
               if (canEarlyTerminate) {
-                if (totalHits >= totalHitsThreshold) {
+                if (totalHits > totalHitsThreshold) {
                   totalHitsRelation = Relation.GREATER_THAN_OR_EQUAL_TO;
                   throw new CollectionTerminatedException();
                 } else {
@@ -230,7 +230,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
               // this document is largest than anything else in the queue, and
               // therefore not competitive.
               if (canEarlyTerminate) {
-                if (totalHits >= totalHitsThreshold) {
+                if (totalHits > totalHitsThreshold) {
                   totalHitsRelation = Relation.GREATER_THAN_OR_EQUAL_TO;
                   throw new CollectionTerminatedException();
                 } else {
@@ -324,7 +324,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
   }
 
   protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
-    if (canSetMinScore && totalHits >= totalHitsThreshold && queueFull) {
+    if (canSetMinScore && totalHits > totalHitsThreshold && queueFull) {
       assert bottom != null && firstComparator != null;
       float minScore = firstComparator.value(bottom.slot);
       scorer.setMinCompetitiveScore(minScore);
@@ -345,9 +345,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
    * @param numHits
    *          the number of results to collect.
    * @param totalHitsThreshold
-   *          the number of docs to count accurately. If the query matches
-   *          {@code totalHitsThreshold} hits or more then its hit count will be a
-   *          lower bound. On the other hand if the query matches less than
+   *          the number of docs to count accurately. If the query matches more than
+   *          {@code totalHitsThreshold} hits then its hit count will be a
+   *          lower bound. On the other hand if the query matches less than or exactly
    *          {@code totalHitsThreshold} hits then the hit count of the result will
    *          be accurate. {@link Integer#MAX_VALUE} may be used to make the hit
    *          count accurate, but this will also make query processing slower.
@@ -373,9 +373,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
    * @param after
    *          only hits after this FieldDoc will be collected
    * @param totalHitsThreshold
-   *          the number of docs to count accurately. If the query matches
-   *          {@code totalHitsThreshold} hits or more then its hit count will be a
-   *          lower bound. On the other hand if the query matches less than
+   *          the number of docs to count accurately. If the query matches more than
+   *          {@code totalHitsThreshold} hits then its hit count will be a
+   *          lower bound. On the other hand if the query matches less than or exactly
    *          {@code totalHitsThreshold} hits then the hit count of the result will
    *          be accurate. {@link Integer#MAX_VALUE} may be used to make the hit
    *          count accurate, but this will also make query processing slower.
@@ -393,8 +393,8 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
       throw new IllegalArgumentException("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
     }
 
-    if (totalHitsThreshold <= 0) {
-      throw new IllegalArgumentException("totalHitsThreshold must be > 0, got " + totalHitsThreshold);
+    if (totalHitsThreshold < 0) {
+      throw new IllegalArgumentException("totalHitsThreshold must be >= 0, got " + totalHitsThreshold);
     }
 
     FieldValueHitQueue<Entry> queue = FieldValueHitQueue.create(sort.fields, numHits);
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
index 071ed55..c857f8f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
@@ -72,7 +72,7 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
 
           totalHits++;
           if (score <= pqTop.score) {
-            if (totalHitsRelation == TotalHits.Relation.EQUAL_TO && totalHits >= totalHitsThreshold) {
+            if (totalHitsRelation == TotalHits.Relation.EQUAL_TO && totalHits > totalHitsThreshold) {
               // we just reached totalHitsThreshold, we can start setting the min
               // competitive score now
               updateMinCompetitiveScore(scorer);
@@ -133,7 +133,7 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
 
           if (score > after.score || (score == after.score && doc <= afterDoc)) {
             // hit was collected on a previous page
-            if (totalHitsRelation == TotalHits.Relation.EQUAL_TO && totalHits >= totalHitsThreshold) {
+            if (totalHitsRelation == TotalHits.Relation.EQUAL_TO && totalHits > totalHitsThreshold) {
               // we just reached totalHitsThreshold, we can start setting the min
               // competitive score now
               updateMinCompetitiveScore(scorer);
@@ -161,12 +161,11 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
    * Creates a new {@link TopScoreDocCollector} given the number of hits to
    * collect and the number of hits to count accurately.
    *
-   * <p><b>NOTE</b>: If the total hit count of the top docs is less than
+   * <p><b>NOTE</b>: If the total hit count of the top docs is less than or exactly
    * {@code totalHitsThreshold} then this value is accurate. On the other hand,
-   * if the {@link TopDocs#totalHits} value is greater than or equal to
-   * {@code totalHitsThreshold} then its value is a lower bound of the hit
-   * count. A value of {@link Integer#MAX_VALUE} will make the hit count
-   * accurate but will also likely make query processing slower.
+   * if the {@link TopDocs#totalHits} value is greater than {@code totalHitsThreshold}
+   * then its value is a lower bound of the hit count. A value of {@link Integer#MAX_VALUE}
+   * will make the hit count accurate but will also likely make query processing slower.
    * <p><b>NOTE</b>: The instances returned by this method
    * pre-allocate a full array of length
    * <code>numHits</code>, and fill the array with sentinel
@@ -181,12 +180,11 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
    * collect, the bottom of the previous page, and the number of hits to count
    * accurately.
    *
-   * <p><b>NOTE</b>: If the total hit count of the top docs is less than
+   * <p><b>NOTE</b>: If the total hit count of the top docs is less than or exactly
    * {@code totalHitsThreshold} then this value is accurate. On the other hand,
-   * if the {@link TopDocs#totalHits} value is greater than or equal to
-   * {@code totalHitsThreshold} then its value is a lower bound of the hit
-   * count. A value of {@link Integer#MAX_VALUE} will make the hit count
-   * accurate but will also likely make query processing slower.
+   * if the {@link TopDocs#totalHits} value is greater than {@code totalHitsThreshold}
+   * then its value is a lower bound of the hit count. A value of {@link Integer#MAX_VALUE}
+   * will make the hit count accurate but will also likely make query processing slower.
    * <p><b>NOTE</b>: The instances returned by this method
    * pre-allocate a full array of length
    * <code>numHits</code>, and fill the array with sentinel
@@ -198,8 +196,8 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
       throw new IllegalArgumentException("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
     }
 
-    if (totalHitsThreshold <= 0) {
-      throw new IllegalArgumentException("totalHitsThreshold must be > 0, got " + totalHitsThreshold);
+    if (totalHitsThreshold < 0) {
+      throw new IllegalArgumentException("totalHitsThreshold must be >= 0, got " + totalHitsThreshold);
     }
 
     if (after == null) {
@@ -236,7 +234,7 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
   }
 
   protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
-    if (totalHits >= totalHitsThreshold
+    if (totalHits > totalHitsThreshold
           && pqTop != null
           && pqTop.score != Float.NEGATIVE_INFINITY) { // -Infinity is the score of sentinels
       // since we tie-break on doc id and collect in doc id order, we can require
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java
index a2e794e..2696d7c 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java
@@ -103,7 +103,8 @@ public class TestMatchAllDocsQuery extends LuceneTestCase {
 
     Directory dir = newDirectory();
     IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(analyzer).setMaxBufferedDocs(2).setMergePolicy(newLogMergePolicy()));
-    for (int i = 0; i < 500; i++) {
+    final int numDocs = 500;
+    for (int i = 0; i < numDocs; i++) {
       addDoc("doc" + i, iw);
     }
     IndexReader ir = DirectoryReader.open(iw);
@@ -114,7 +115,14 @@ public class TestMatchAllDocsQuery extends LuceneTestCase {
     TopScoreDocCollector c = TopScoreDocCollector.create(10, null, totalHitsThreshold);
 
     is.search(new MatchAllDocsQuery(), c);
-    assertEquals(totalHitsThreshold, c.totalHits);
+    assertEquals(totalHitsThreshold+1, c.totalHits);
+    assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, c.totalHitsRelation);
+
+    TopScoreDocCollector c1 = TopScoreDocCollector.create(10, null, numDocs);
+
+    is.search(new MatchAllDocsQuery(), c1);
+    assertEquals(numDocs, c1.totalHits);
+    assertEquals(TotalHits.Relation.EQUAL_TO, c1.totalHitsRelation);
 
     iw.close();
     ir.close();
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java
index 0506310..809d835 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java
@@ -286,7 +286,7 @@ public class TestTopDocsCollector extends LuceneTestCase {
     assertEquals(2, reader.leaves().size());
     w.close();
 
-    for (int totalHitsThreshold = 1; totalHitsThreshold < 20; ++ totalHitsThreshold) {
+    for (int totalHitsThreshold = 0; totalHitsThreshold < 20; ++ totalHitsThreshold) {
       TopScoreDocCollector collector = TopScoreDocCollector.create(2, null, totalHitsThreshold);
       ScoreAndDoc scorer = new ScoreAndDoc();
 
@@ -314,8 +314,8 @@ public class TestTopDocsCollector extends LuceneTestCase {
 
       TopDocs topDocs = collector.topDocs();
       assertEquals(4, topDocs.totalHits.value);
-      assertEquals(totalHitsThreshold <= 4, scorer.minCompetitiveScore != null);
-      assertEquals(totalHitsThreshold <= 4 ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO : TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation);
+      assertEquals(totalHitsThreshold < 4, scorer.minCompetitiveScore != null);
+      assertEquals(totalHitsThreshold < 4 ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO : TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation);
     }
 
     reader.close();
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java
index 6daf347..046990b 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java
@@ -148,7 +148,7 @@ public class TestTopFieldCollector extends LuceneTestCase {
     assertEquals(2, reader.leaves().size());
     w.close();
 
-    for (int totalHitsThreshold = 1; totalHitsThreshold < 20; ++ totalHitsThreshold) {
+    for (int totalHitsThreshold = 0; totalHitsThreshold < 20; ++ totalHitsThreshold) {
       for (FieldDoc after : new FieldDoc[] { null, new FieldDoc(4, Float.NaN, new Object[] { 2L })}) {
         TopFieldCollector collector = TopFieldCollector.create(sort, 2, after, totalHitsThreshold);
         ScoreAndDoc scorer = new ScoreAndDoc();
@@ -169,7 +169,7 @@ public class TestTopFieldCollector extends LuceneTestCase {
 
         scorer.doc = 1;
         scorer.score = 3;
-        if (totalHitsThreshold < 4) {
+        if (totalHitsThreshold < 3) {
           expectThrows(CollectionTerminatedException.class, () -> leafCollector2.collect(1));
           TopDocs topDocs = collector.topDocs();
           assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation);
@@ -181,7 +181,7 @@ public class TestTopFieldCollector extends LuceneTestCase {
 
         scorer.doc = 5;
         scorer.score = 4;
-        if (totalHitsThreshold == 4) {
+        if (totalHitsThreshold == 3) {
           expectThrows(CollectionTerminatedException.class, () -> leafCollector2.collect(1));
           TopDocs topDocs = collector.topDocs();
           assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation);
@@ -296,7 +296,7 @@ public class TestTopFieldCollector extends LuceneTestCase {
     assertEquals(2, reader.leaves().size());
     w.close();
 
-    for (int totalHitsThreshold = 1; totalHitsThreshold < 20; ++ totalHitsThreshold) {
+    for (int totalHitsThreshold = 0; totalHitsThreshold < 20; ++ totalHitsThreshold) {
       Sort sort = new Sort(FIELD_SCORE, new SortField("foo", SortField.Type.LONG));
       TopFieldCollector collector = TopFieldCollector.create(sort, 2, null, totalHitsThreshold);
       ScoreAndDoc scorer = new ScoreAndDoc();
@@ -325,8 +325,8 @@ public class TestTopFieldCollector extends LuceneTestCase {
 
       TopDocs topDocs = collector.topDocs();
       assertEquals(4, topDocs.totalHits.value);
-      assertEquals(totalHitsThreshold <= 4, scorer.minCompetitiveScore != null);
-      assertEquals(totalHitsThreshold <= 4 ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO : TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation);
+      assertEquals(totalHitsThreshold < 4, scorer.minCompetitiveScore != null);
+      assertEquals(totalHitsThreshold < 4 ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO : TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation);
     }
 
     reader.close();