You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/08/06 04:15:49 UTC

[16/48] lucene-solr:jira/http2: LUCENE-8060: IndexSearcher's search and searchAfter methods now only compute total hit counts accurately up to 1,000.

LUCENE-8060: IndexSearcher's search and searchAfter methods now only compute total hit counts accurately up to 1,000.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/99dbe936
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/99dbe936
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/99dbe936

Branch: refs/heads/jira/http2
Commit: 99dbe936818add5723f2014a90bd0ea8a17c8f19
Parents: 0dc124a
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Aug 1 09:00:40 2018 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Aug 1 09:01:21 2018 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  4 ++++
 lucene/MIGRATE.txt                              |  8 ++++---
 .../org/apache/lucene/search/IndexSearcher.java | 23 +++++++++++++++++---
 .../org/apache/lucene/search/TestBoolean2.java  |  5 ++---
 .../apache/lucene/search/TestBooleanScorer.java |  2 +-
 .../apache/lucene/search/TestLRUQueryCache.java |  8 ++++++-
 .../apache/lucene/search/TestNeedsScores.java   |  2 +-
 .../lucene/search/TestShardSearching.java       |  2 +-
 .../apache/lucene/search/TestTopDocsMerge.java  |  2 +-
 .../search/join/ToParentBlockJoinQuery.java     |  8 +------
 .../java/org/apache/lucene/util/TestUtil.java   | 18 ++++++++++++---
 11 files changed, 58 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0f13dd3..76815f5 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -96,6 +96,10 @@ Changes in Runtime Behavior
 * LUCENE-7444: StandardAnalyzer no longer defaults to removing English stopwords
   (Alan Woodward)
 
+* LUCENE-8060: IndexSearcher's search and searchAfter methods now only compute
+  total hit counts accurately up to 1,000 in order to enable top-hits
+  optimizations such as block-max WAND (LUCENE-8135). (Adrien Grand)
+
 Improvements
 
 * LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/MIGRATE.txt
----------------------------------------------------------------------
diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt
index fc0930c..7dfe5c7 100644
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@@ -90,10 +90,12 @@ request in order to only compute scores for documents that made it to the top
 hits. As a consequence, the trackDocScores option has been removed and can be
 replaced with the new TopFieldCollector#populateScores helper method.
 
-## TopDocs.totalHits is no longer a long ##
+## IndexSearcher.search(After) may return lower bounds of the hit count and TopDocs.totalHits is no longer a long ##
 
 Lucene 8 received optimizations for collection of top-k matches by not visiting
 all matches. However these optimizations won't help if all matches still need
 to be visited in order to compute the total number of hits. As a consequence,
-TopDocs.totalHits is now an TotalHits object that is either an exact hit count
-or a lower bound of the hit count.
+IndexSearcher's search and searchAfter methods were changed to only count hits
+accurately up to 1,000, and TopDocs.totalHits was changed from a long to an
+object that says whether the hit count is accurate or a lower bound of the
+actual hit count.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
index d2e5d01..689409f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -63,7 +63,19 @@ import org.apache.lucene.util.ThreadInterruptedException;
  * reader ({@link DirectoryReader#open(IndexWriter)}).
  * Once you have a new {@link IndexReader}, it's relatively
  * cheap to create a new IndexSearcher from it.
- * 
+ *
+ * <p><b>NOTE</b>: The {@link #search} and {@link #searchAfter} methods are
+ * configured to only count hits accurately up to {@code 1,000} and may
+ * return a {@link TotalHits.Relation lower bound} of the hit count if the
+ * hit count is greater than or equal to {@code 1,000}. On queries that match
+ * lots of documents, counting the number of hits may take much longer than
+ * computing the top hits, so this trade-off allows getting some minimal
+ * information about the hit count without slowing down search too much. The
+ * {@link TopDocs#scoreDocs} array is always accurate however. If this behavior
+ * doesn't suit your needs, you should create collectors manually with either
+ * {@link TopScoreDocCollector#create} or {@link TopFieldCollector#create} and
+ * call {@link #search(Query, Collector)}.
+ *
  * <a name="thread-safety"></a><p><b>NOTE</b>: <code>{@link
  * IndexSearcher}</code> instances are completely
  * thread safe, meaning multiple threads can call any of its
@@ -82,6 +94,11 @@ public class IndexSearcher {
     final long maxRamBytesUsed = Math.min(1L << 25, Runtime.getRuntime().maxMemory() / 20);
     DEFAULT_QUERY_CACHE = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed);
   }
+  /**
+   * By default we count hits accurately up to 1000. This makes sure that we
+   * don't spend most of the search time computing hit counts.
+   */
+  private static final int TOTAL_HITS_THRESHOLD = 1000;
 
   final IndexReader reader; // package private for testing!
   
@@ -384,7 +401,7 @@ public class IndexSearcher {
 
       @Override
       public TopScoreDocCollector newCollector() throws IOException {
-        return TopScoreDocCollector.create(cappedNumHits, after, Integer.MAX_VALUE);
+        return TopScoreDocCollector.create(cappedNumHits, after, TOTAL_HITS_THRESHOLD);
       }
 
       @Override
@@ -513,7 +530,7 @@ public class IndexSearcher {
       @Override
       public TopFieldCollector newCollector() throws IOException {
         // TODO: don't pay the price for accurate hit counts by default
-        return TopFieldCollector.create(rewrittenSort, cappedNumHits, after, Integer.MAX_VALUE);
+        return TopFieldCollector.create(rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD);
       }
 
       @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
index e15ba97..9478841 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
@@ -255,7 +255,7 @@ public class TestBoolean2 extends LuceneTestCase {
     
     // sanity check expected num matches in bigSearcher
     assertEquals(mulFactor * collector.totalHits,
-                 bigSearcher.search(query, 1).totalHits.value);
+                 bigSearcher.count(query));
 
     // now check 2 diff scorers from the bigSearcher as well
     collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
@@ -398,8 +398,7 @@ public class TestBoolean2 extends LuceneTestCase {
         BooleanQuery.Builder q3 = new BooleanQuery.Builder();
         q3.add(q1, BooleanClause.Occur.SHOULD);
         q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD);
-        TopDocs hits4 = bigSearcher.search(q3.build(), 1);
-        assertEquals(mulFactor*collector.totalHits + NUM_EXTRA_DOCS/2, hits4.totalHits.value);
+        assertEquals(mulFactor*collector.totalHits + NUM_EXTRA_DOCS/2, bigSearcher.count(q3.build()));
 
         // test diff (randomized) scorers produce the same results on bigSearcher as well
         collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java
index 4d3d8d8..86733a4 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java
@@ -149,7 +149,7 @@ public class TestBooleanScorer extends LuceneTestCase {
     q2.add(q1.build(), BooleanClause.Occur.SHOULD);
     q2.add(new CrazyMustUseBulkScorerQuery(), BooleanClause.Occur.SHOULD);
 
-    assertEquals(1, s.search(q2.build(), 10).totalHits.value);
+    assertEquals(1, s.count(q2.build()));
     r.close();
     dir.close();
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java b/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
index a30e026..5633607 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
@@ -148,7 +148,13 @@ public class TestLRUQueryCache extends LuceneTestCase {
                 TotalHitCountCollector collector = new TotalHitCountCollector();
                 searcher.search(q, collector); // will use the cache
                 final int totalHits1 = collector.getTotalHits();
-                final long totalHits2 = searcher.search(q, 1).totalHits.value; // will not use the cache because of scores
+                TotalHitCountCollector collector2 = new TotalHitCountCollector();
+                searcher.search(q, new FilterCollector(collector2) {
+                  public ScoreMode scoreMode() {
+                    return ScoreMode.COMPLETE; // will not use the cache because of scores
+                  }
+                });
+                final long totalHits2 = collector2.getTotalHits();
                 assertEquals(totalHits2, totalHits1);
               } finally {
                 mgr.release(searcher);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/core/src/test/org/apache/lucene/search/TestNeedsScores.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestNeedsScores.java b/lucene/core/src/test/org/apache/lucene/search/TestNeedsScores.java
index 88860c0..75b6da1 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestNeedsScores.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestNeedsScores.java
@@ -62,7 +62,7 @@ public class TestNeedsScores extends LuceneTestCase {
     Query required = new TermQuery(new Term("field", "this"));
     Query prohibited = new TermQuery(new Term("field", "3"));
     BooleanQuery.Builder bq = new BooleanQuery.Builder();
-    bq.add(new AssertNeedsScores(required, ScoreMode.COMPLETE), BooleanClause.Occur.MUST);
+    bq.add(new AssertNeedsScores(required, ScoreMode.TOP_SCORES), BooleanClause.Occur.MUST);
     bq.add(new AssertNeedsScores(prohibited, ScoreMode.COMPLETE_NO_SCORES), BooleanClause.Occur.MUST_NOT);
     assertEquals(4, searcher.search(bq.build(), 5).totalHits.value); // we exclude 3
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java b/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java
index 043c943..5b9a62b 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java
@@ -384,7 +384,7 @@ public class TestShardSearching extends ShardSearchingTestBase {
       sd.doc += base[sd.shardIndex];
     }
 
-    TestUtil.assertEquals(hits, shardHits);
+    TestUtil.assertConsistent(hits, shardHits);
 
     if (moreHits) {
       // Return a continuation:

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java
index bf92642..43db2f2 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java
@@ -372,7 +372,7 @@ public class TestTopDocsMerge extends LuceneTestCase {
         }
       }
 
-      TestUtil.assertEquals(topHits, mergedHits);
+      TestUtil.assertConsistent(topHits, mergedHits);
     }
     reader.close();
     dir.close();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
----------------------------------------------------------------------
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
index 3b99ccf..04e8959 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
@@ -311,13 +311,7 @@ public class ToParentBlockJoinQuery extends Query {
 
     @Override
     public float getMaxScore(int upTo) throws IOException {
-      switch(scoreMode) {
-        case Max:
-        case Min:
-          return childScorer.getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
-        default:
-          return Float.POSITIVE_INFINITY;
-      }
+      return Float.POSITIVE_INFINITY;
     }
 
     private void setScoreAndFreq() throws IOException {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/99dbe936/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
index bc31b44..b12d7b8 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
@@ -92,6 +92,7 @@ import org.apache.lucene.mockfile.WindowsFS;
 import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TotalHits;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.FilterDirectory;
@@ -1040,9 +1041,20 @@ public final class TestUtil {
     Assert.assertEquals("Reflection does not produce same map", reflectedValues, map);
   }
 
-  public static void assertEquals(TopDocs expected, TopDocs actual) {
-    Assert.assertEquals("wrong total hits", expected.totalHits.value, actual.totalHits.value);
-    Assert.assertEquals("wrong total hits", expected.totalHits.relation, actual.totalHits.relation);
+  /**
+   * Assert that the given {@link TopDocs} have the same top docs and consistent hit counts.
+   */
+  public static void assertConsistent(TopDocs expected, TopDocs actual) {
+    Assert.assertEquals("wrong total hits", expected.totalHits.value == 0, actual.totalHits.value == 0);
+    if (expected.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
+      if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
+        Assert.assertEquals("wrong total hits", expected.totalHits.value, actual.totalHits.value);
+      } else {
+        Assert.assertTrue("wrong total hits", expected.totalHits.value >= actual.totalHits.value);
+      }
+    } else if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
+      Assert.assertTrue("wrong total hits", expected.totalHits.value <= actual.totalHits.value);
+    }
     Assert.assertEquals("wrong hit count", expected.scoreDocs.length, actual.scoreDocs.length);
     for(int hitIDX=0;hitIDX<expected.scoreDocs.length;hitIDX++) {
       final ScoreDoc expectedSD = expected.scoreDocs[hitIDX];