You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ji...@apache.org on 2019/01/29 07:59:32 UTC
[lucene-solr] branch branch_8x updated: LUCENE-8660:
TopDocsCollectors now return an accurate count (instead of a lower bound)
if the total hit count is equal to the provided threshold.
This is an automated email from the ASF dual-hosted git repository.
jimczi pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 93d1e67 LUCENE-8660: TopDocsCollectors now return an accurate count (instead of a lower bound) if the total hit count is equal to the provided threshold.
93d1e67 is described below
commit 93d1e67886e75623f3f72526284bdbc3b1fef7e6
Author: jimczi <ji...@apache.org>
AuthorDate: Mon Jan 28 22:43:20 2019 +0100
LUCENE-8660: TopDocsCollectors now return an accurate count (instead of a lower bound) if the total hit count is equal to the provided threshold.
---
lucene/CHANGES.txt | 3 +++
.../apache/lucene/search/TopFieldCollector.java | 22 ++++++++---------
.../apache/lucene/search/TopScoreDocCollector.java | 28 ++++++++++------------
.../lucene/search/TestMatchAllDocsQuery.java | 12 ++++++++--
.../apache/lucene/search/TestTopDocsCollector.java | 6 ++---
.../lucene/search/TestTopFieldCollector.java | 12 +++++-----
6 files changed, 46 insertions(+), 37 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 00c0395..2ac4076 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -208,6 +208,9 @@ Improvements
* LUCENE-8279: CheckIndex now cross-checks terms with norms. (Adrien Grand)
+* LUCENE-8660: TopDocsCollectors now return an accurate count (instead of a lower bound)
+ if the total hit count is equal to the provided threshold. (Adrien Grand, Jim Ferenczi)
+
Optimizations
* LUCENE-8040: Optimize IndexSearcher.collectionStatistics, avoiding MultiFields/MultiTerms
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
index f3a2a3b..5981659 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
@@ -134,7 +134,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
// this document is larger than anything else in the queue, and
// therefore not competitive.
if (canEarlyTerminate) {
- if (totalHits >= totalHitsThreshold) {
+ if (totalHits > totalHitsThreshold) {
totalHitsRelation = Relation.GREATER_THAN_OR_EQUAL_TO;
throw new CollectionTerminatedException();
} else {
@@ -230,7 +230,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
// this document is larger than anything else in the queue, and
// therefore not competitive.
if (canEarlyTerminate) {
- if (totalHits >= totalHitsThreshold) {
+ if (totalHits > totalHitsThreshold) {
totalHitsRelation = Relation.GREATER_THAN_OR_EQUAL_TO;
throw new CollectionTerminatedException();
} else {
@@ -324,7 +324,7 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
}
protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
- if (canSetMinScore && totalHits >= totalHitsThreshold && queueFull) {
+ if (canSetMinScore && totalHits > totalHitsThreshold && queueFull) {
assert bottom != null && firstComparator != null;
float minScore = firstComparator.value(bottom.slot);
scorer.setMinCompetitiveScore(minScore);
@@ -345,9 +345,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
* @param numHits
* the number of results to collect.
* @param totalHitsThreshold
- * the number of docs to count accurately. If the query matches
- * {@code totalHitsThreshold} hits or more then its hit count will be a
- * lower bound. On the other hand if the query matches less than
+ * the number of docs to count accurately. If the query matches more than
+ * {@code totalHitsThreshold} hits then its hit count will be a
+ * lower bound. On the other hand if the query matches less than or exactly
* {@code totalHitsThreshold} hits then the hit count of the result will
* be accurate. {@link Integer#MAX_VALUE} may be used to make the hit
* count accurate, but this will also make query processing slower.
@@ -373,9 +373,9 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
* @param after
* only hits after this FieldDoc will be collected
* @param totalHitsThreshold
- * the number of docs to count accurately. If the query matches
- * {@code totalHitsThreshold} hits or more then its hit count will be a
- * lower bound. On the other hand if the query matches less than
+ * the number of docs to count accurately. If the query matches more than
+ * {@code totalHitsThreshold} hits then its hit count will be a
+ * lower bound. On the other hand if the query matches less than or exactly
* {@code totalHitsThreshold} hits then the hit count of the result will
* be accurate. {@link Integer#MAX_VALUE} may be used to make the hit
* count accurate, but this will also make query processing slower.
@@ -393,8 +393,8 @@ public abstract class TopFieldCollector extends TopDocsCollector<Entry> {
throw new IllegalArgumentException("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
}
- if (totalHitsThreshold <= 0) {
- throw new IllegalArgumentException("totalHitsThreshold must be > 0, got " + totalHitsThreshold);
+ if (totalHitsThreshold < 0) {
+ throw new IllegalArgumentException("totalHitsThreshold must be >= 0, got " + totalHitsThreshold);
}
FieldValueHitQueue<Entry> queue = FieldValueHitQueue.create(sort.fields, numHits);
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
index 071ed55..c857f8f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java
@@ -72,7 +72,7 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
totalHits++;
if (score <= pqTop.score) {
- if (totalHitsRelation == TotalHits.Relation.EQUAL_TO && totalHits >= totalHitsThreshold) {
+ if (totalHitsRelation == TotalHits.Relation.EQUAL_TO && totalHits > totalHitsThreshold) {
// we just reached totalHitsThreshold, we can start setting the min
// competitive score now
updateMinCompetitiveScore(scorer);
@@ -133,7 +133,7 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
if (score > after.score || (score == after.score && doc <= afterDoc)) {
// hit was collected on a previous page
- if (totalHitsRelation == TotalHits.Relation.EQUAL_TO && totalHits >= totalHitsThreshold) {
+ if (totalHitsRelation == TotalHits.Relation.EQUAL_TO && totalHits > totalHitsThreshold) {
// we just reached totalHitsThreshold, we can start setting the min
// competitive score now
updateMinCompetitiveScore(scorer);
@@ -161,12 +161,11 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
* Creates a new {@link TopScoreDocCollector} given the number of hits to
* collect and the number of hits to count accurately.
*
- * <p><b>NOTE</b>: If the total hit count of the top docs is less than
+ * <p><b>NOTE</b>: If the total hit count of the top docs is less than or exactly
* {@code totalHitsThreshold} then this value is accurate. On the other hand,
- * if the {@link TopDocs#totalHits} value is greater than or equal to
- * {@code totalHitsThreshold} then its value is a lower bound of the hit
- * count. A value of {@link Integer#MAX_VALUE} will make the hit count
- * accurate but will also likely make query processing slower.
+ * if the {@link TopDocs#totalHits} value is greater than {@code totalHitsThreshold}
+ * then its value is a lower bound of the hit count. A value of {@link Integer#MAX_VALUE}
+ * will make the hit count accurate but will also likely make query processing slower.
* <p><b>NOTE</b>: The instances returned by this method
* pre-allocate a full array of length
* <code>numHits</code>, and fill the array with sentinel
@@ -181,12 +180,11 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
* collect, the bottom of the previous page, and the number of hits to count
* accurately.
*
- * <p><b>NOTE</b>: If the total hit count of the top docs is less than
+ * <p><b>NOTE</b>: If the total hit count of the top docs is less than or exactly
* {@code totalHitsThreshold} then this value is accurate. On the other hand,
- * if the {@link TopDocs#totalHits} value is greater than or equal to
- * {@code totalHitsThreshold} then its value is a lower bound of the hit
- * count. A value of {@link Integer#MAX_VALUE} will make the hit count
- * accurate but will also likely make query processing slower.
+ * if the {@link TopDocs#totalHits} value is greater than {@code totalHitsThreshold}
+ * then its value is a lower bound of the hit count. A value of {@link Integer#MAX_VALUE}
+ * will make the hit count accurate but will also likely make query processing slower.
* <p><b>NOTE</b>: The instances returned by this method
* pre-allocate a full array of length
* <code>numHits</code>, and fill the array with sentinel
@@ -198,8 +196,8 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
throw new IllegalArgumentException("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
}
- if (totalHitsThreshold <= 0) {
- throw new IllegalArgumentException("totalHitsThreshold must be > 0, got " + totalHitsThreshold);
+ if (totalHitsThreshold < 0) {
+ throw new IllegalArgumentException("totalHitsThreshold must be >= 0, got " + totalHitsThreshold);
}
if (after == null) {
@@ -236,7 +234,7 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
}
protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
- if (totalHits >= totalHitsThreshold
+ if (totalHits > totalHitsThreshold
&& pqTop != null
&& pqTop.score != Float.NEGATIVE_INFINITY) { // -Infinity is the score of sentinels
// since we tie-break on doc id and collect in doc id order, we can require
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java
index a2e794e..2696d7c 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java
@@ -103,7 +103,8 @@ public class TestMatchAllDocsQuery extends LuceneTestCase {
Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(analyzer).setMaxBufferedDocs(2).setMergePolicy(newLogMergePolicy()));
- for (int i = 0; i < 500; i++) {
+ final int numDocs = 500;
+ for (int i = 0; i < numDocs; i++) {
addDoc("doc" + i, iw);
}
IndexReader ir = DirectoryReader.open(iw);
@@ -114,7 +115,14 @@ public class TestMatchAllDocsQuery extends LuceneTestCase {
TopScoreDocCollector c = TopScoreDocCollector.create(10, null, totalHitsThreshold);
is.search(new MatchAllDocsQuery(), c);
- assertEquals(totalHitsThreshold, c.totalHits);
+ assertEquals(totalHitsThreshold+1, c.totalHits);
+ assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, c.totalHitsRelation);
+
+ TopScoreDocCollector c1 = TopScoreDocCollector.create(10, null, numDocs);
+
+ is.search(new MatchAllDocsQuery(), c1);
+ assertEquals(numDocs, c1.totalHits);
+ assertEquals(TotalHits.Relation.EQUAL_TO, c1.totalHitsRelation);
iw.close();
ir.close();
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java
index 0506310..809d835 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java
@@ -286,7 +286,7 @@ public class TestTopDocsCollector extends LuceneTestCase {
assertEquals(2, reader.leaves().size());
w.close();
- for (int totalHitsThreshold = 1; totalHitsThreshold < 20; ++ totalHitsThreshold) {
+ for (int totalHitsThreshold = 0; totalHitsThreshold < 20; ++ totalHitsThreshold) {
TopScoreDocCollector collector = TopScoreDocCollector.create(2, null, totalHitsThreshold);
ScoreAndDoc scorer = new ScoreAndDoc();
@@ -314,8 +314,8 @@ public class TestTopDocsCollector extends LuceneTestCase {
TopDocs topDocs = collector.topDocs();
assertEquals(4, topDocs.totalHits.value);
- assertEquals(totalHitsThreshold <= 4, scorer.minCompetitiveScore != null);
- assertEquals(totalHitsThreshold <= 4 ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO : TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation);
+ assertEquals(totalHitsThreshold < 4, scorer.minCompetitiveScore != null);
+ assertEquals(totalHitsThreshold < 4 ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO : TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation);
}
reader.close();
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java
index 6daf347..046990b 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java
@@ -148,7 +148,7 @@ public class TestTopFieldCollector extends LuceneTestCase {
assertEquals(2, reader.leaves().size());
w.close();
- for (int totalHitsThreshold = 1; totalHitsThreshold < 20; ++ totalHitsThreshold) {
+ for (int totalHitsThreshold = 0; totalHitsThreshold < 20; ++ totalHitsThreshold) {
for (FieldDoc after : new FieldDoc[] { null, new FieldDoc(4, Float.NaN, new Object[] { 2L })}) {
TopFieldCollector collector = TopFieldCollector.create(sort, 2, after, totalHitsThreshold);
ScoreAndDoc scorer = new ScoreAndDoc();
@@ -169,7 +169,7 @@ public class TestTopFieldCollector extends LuceneTestCase {
scorer.doc = 1;
scorer.score = 3;
- if (totalHitsThreshold < 4) {
+ if (totalHitsThreshold < 3) {
expectThrows(CollectionTerminatedException.class, () -> leafCollector2.collect(1));
TopDocs topDocs = collector.topDocs();
assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation);
@@ -181,7 +181,7 @@ public class TestTopFieldCollector extends LuceneTestCase {
scorer.doc = 5;
scorer.score = 4;
- if (totalHitsThreshold == 4) {
+ if (totalHitsThreshold == 3) {
expectThrows(CollectionTerminatedException.class, () -> leafCollector2.collect(1));
TopDocs topDocs = collector.topDocs();
assertEquals(TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO, topDocs.totalHits.relation);
@@ -296,7 +296,7 @@ public class TestTopFieldCollector extends LuceneTestCase {
assertEquals(2, reader.leaves().size());
w.close();
- for (int totalHitsThreshold = 1; totalHitsThreshold < 20; ++ totalHitsThreshold) {
+ for (int totalHitsThreshold = 0; totalHitsThreshold < 20; ++ totalHitsThreshold) {
Sort sort = new Sort(FIELD_SCORE, new SortField("foo", SortField.Type.LONG));
TopFieldCollector collector = TopFieldCollector.create(sort, 2, null, totalHitsThreshold);
ScoreAndDoc scorer = new ScoreAndDoc();
@@ -325,8 +325,8 @@ public class TestTopFieldCollector extends LuceneTestCase {
TopDocs topDocs = collector.topDocs();
assertEquals(4, topDocs.totalHits.value);
- assertEquals(totalHitsThreshold <= 4, scorer.minCompetitiveScore != null);
- assertEquals(totalHitsThreshold <= 4 ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO : TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation);
+ assertEquals(totalHitsThreshold < 4, scorer.minCompetitiveScore != null);
+ assertEquals(totalHitsThreshold < 4 ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO : TotalHits.Relation.EQUAL_TO, topDocs.totalHits.relation);
}
reader.close();