You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dn...@apache.org on 2021/10/03 17:10:25 UTC
[lucene-solr] 01/02: LUCENE-10126: Fix competitive iterator wrongly skip docs

This is an automated email from the ASF dual-hosted git repository.

dnhatn pushed a commit to branch branch_8_10
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit dffca84172a2645e0f144b6565ea64565f861267
Author: Nhat Nguyen <nh...@elastic.co>
AuthorDate: Sun Oct 3 11:57:54 2021 -0400

    LUCENE-10126: Fix competitive iterator wrongly skip docs
    
    The competitive iterator can wrongly skip a document that is advanced
    but not collected in the previous scoreRange.
---
 .../src/java/org/apache/lucene/search/Weight.java  | 30 ++++++-----
 .../lucene/search/comparators/DocComparator.java   |  2 +-
 .../search/comparators/NumericComparator.java      |  2 +-
 .../apache/lucene/search/TestSortOptimization.java | 60 ++++++++++++----------
 .../apache/lucene/search/AssertingBulkScorer.java  | 26 ++++++++--
 .../apache/lucene/search/AssertingScorable.java    | 40 ++++++++++++---
 6 files changed, 105 insertions(+), 55 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/search/Weight.java b/lucene/core/src/java/org/apache/lucene/search/Weight.java
index a26d1a3..7ea490f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/Weight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/Weight.java
@@ -216,17 +216,23 @@ public abstract class Weight implements SegmentCacheable {
     public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
       collector.setScorer(scorer);
       DocIdSetIterator scorerIterator = twoPhase == null ? iterator : twoPhase.approximation();
-      DocIdSetIterator collectorIterator = collector.competitiveIterator();
+      DocIdSetIterator competitiveIterator = collector.competitiveIterator();
       DocIdSetIterator filteredIterator;
-      if (collectorIterator == null) {
+      if (competitiveIterator == null) {
         filteredIterator = scorerIterator;
       } else {
+        // Wrap CompetitiveIterator and ScorerIterator start with (i.e., calling nextDoc()) the last
+        // visited docID because ConjunctionDISI might have advanced to it in the previous
+        // scoreRange, but we didn't process due to the range limit of scoreRange.
         if (scorerIterator.docID() != -1) {
-          // Wrap ScorerIterator to start from -1 for conjunction 
           scorerIterator = new StartDISIWrapper(scorerIterator);
         }
+        if (competitiveIterator.docID() != -1) {
+          competitiveIterator = new StartDISIWrapper(competitiveIterator);
+        }
         // filter scorerIterator to keep only competitive docs as defined by collector
-        filteredIterator = ConjunctionDISI.intersectIterators(Arrays.asList(scorerIterator, collectorIterator));
+        filteredIterator =
+            ConjunctionDISI.intersectIterators(Arrays.asList(scorerIterator, competitiveIterator));
       }
       if (filteredIterator.docID() == -1 && min == 0 && max == DocIdSetIterator.NO_MORE_DOCS) {
         scoreAll(collector, filteredIterator, twoPhase, acceptDocs);
@@ -287,17 +293,15 @@ public abstract class Weight implements SegmentCacheable {
     }
   }
 
-  /**
-   * Wraps an internal docIdSetIterator for it to start with docID = -1
-   */
-  protected static class StartDISIWrapper extends DocIdSetIterator {
+  /** Wraps an internal docIdSetIterator for it to start with the last visited docID */
+  private static class StartDISIWrapper extends DocIdSetIterator {
     private final DocIdSetIterator in;
-    private final int min;
+    private final int startDocID;
     private int docID = -1;
 
-    public StartDISIWrapper(DocIdSetIterator in) {
+    StartDISIWrapper(DocIdSetIterator in) {
       this.in = in;
-      this.min = in.docID();
+      this.startDocID = in.docID();
     }
 
     @Override
@@ -312,8 +316,8 @@ public abstract class Weight implements SegmentCacheable {
 
     @Override
     public int advance(int target) throws IOException {
-      if (target <= min) {
-        return docID = min;
+      if (target <= startDocID) {
+        return docID = startDocID;
       }
       return docID = in.advance(target);
     }
diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java
index 0bda127..fec4085 100644
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java
@@ -139,7 +139,7 @@ public class DocComparator extends FieldComparator<Integer> {
         return null;
       } else {
         return new DocIdSetIterator() {
-          private int docID = -1;
+          private int docID = competitiveIterator.docID();
 
           @Override
           public int nextDoc() throws IOException {
diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
index 8c04748..fed13a0 100644
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
@@ -250,7 +250,7 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
     public DocIdSetIterator competitiveIterator() {
       if (enableSkipping == false) return null;
       return new DocIdSetIterator() {
-        private int docID = -1;
+        private int docID = competitiveIterator.docID();
 
         @Override
         public int nextDoc() throws IOException {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
index 90e978d..390e974 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
@@ -59,7 +59,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
     final IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader);
     final SortField sortField = new SortField("my_field", SortField.Type.LONG);
     sortField.setCanUsePoints();
     final Sort sort = new Sort(sortField);
@@ -76,7 +76,7 @@ public class TestSortOptimization extends LuceneTestCase {
         assertEquals(i, ((Long) fieldDoc.fields[0]).intValue());
       }
       assertTrue(collector.isEarlyTerminated());
-      assertTrue(topDocs.totalHits.value < numDocs);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
     }
 
     { // paging sort with after
@@ -91,7 +91,7 @@ public class TestSortOptimization extends LuceneTestCase {
         assertEquals(afterValue + 1 + i, fieldDoc.fields[0]);
       }
       assertTrue(collector.isEarlyTerminated());
-      assertTrue(topDocs.totalHits.value < numDocs);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
     }
 
     { // test that if there is the secondary sort on _score, scores are filled correctly
@@ -106,7 +106,7 @@ public class TestSortOptimization extends LuceneTestCase {
         assertEquals(1.0, score, 0.001);
       }
       assertTrue(collector.isEarlyTerminated());
-      assertTrue(topDocs.totalHits.value < numDocs);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
     }
 
     { // test that if numeric field is a secondary sort, no optimization is run
@@ -148,7 +148,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
     final IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     final SortField sortField = new SortField("my_field", SortField.Type.LONG);
     sortField.setCanUsePoints();
     final Sort sort = new Sort(sortField);
@@ -185,7 +185,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
     final IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     final int numHits = 3;
     final int totalHitsThreshold = 3;
 
@@ -209,7 +209,7 @@ public class TestSortOptimization extends LuceneTestCase {
       searcher.search(new MatchAllDocsQuery(), collector);
       TopDocs topDocs = collector.topDocs();
       assertEquals(topDocs.scoreDocs.length, numHits);
-      assertTrue(topDocs.totalHits.value < numDocs); // assert that some docs were skipped => optimization was run
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
     }
 
     reader.close();
@@ -219,7 +219,7 @@ public class TestSortOptimization extends LuceneTestCase {
   public void testSortOptimizationEqualValues() throws IOException {
     final Directory dir = newDirectory();
     final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
-    final int numDocs = atLeast(10000);
+    final int numDocs = atLeast(TEST_NIGHTLY ? 50_000 : 10_000);
     for (int i = 1; i <= numDocs; ++i) {
       final Document doc = new Document();
       doc.add(new NumericDocValuesField("my_field1", 100)); // all docs have the same value of my_field1
@@ -230,7 +230,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
     final IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     final int numHits = 3;
     final int totalHitsThreshold = 3;
 
@@ -246,7 +246,7 @@ public class TestSortOptimization extends LuceneTestCase {
         FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[i];
         assertEquals(100, fieldDoc.fields[0]);
       }
-      assertTrue(topDocs.totalHits.value < numDocs); // assert that some docs were skipped => optimization was run
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
     }
 
     { // test that sorting on a single field with equal values and after parameter
@@ -305,7 +305,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
     final IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     final SortField sortField = new SortField("my_field", SortField.Type.FLOAT);
     sortField.setCanUsePoints();
     final Sort sort = new Sort(sortField);
@@ -322,7 +322,7 @@ public class TestSortOptimization extends LuceneTestCase {
         assertEquals(1f * i, fieldDoc.fields[0]);
       }
       assertTrue(collector.isEarlyTerminated());
-      assertTrue(topDocs.totalHits.value < numDocs);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
     }
 
     reader.close();
@@ -362,7 +362,7 @@ public class TestSortOptimization extends LuceneTestCase {
     int numHits = 0;
     do {
       for (int i = 0; i < numIndices; i++) {
-        IndexSearcher searcher = newSearcher(readers[i]);
+        IndexSearcher searcher = newSearcher(readers[i], random().nextBoolean(), random().nextBoolean());
         final TopFieldCollector collector =
                 TopFieldCollector.create(sort, size, after, totalHitsThreshold);
         searcher.search(new MatchAllDocsQuery(), collector);
@@ -388,8 +388,7 @@ public class TestSortOptimization extends LuceneTestCase {
 
     final int expectedNumHits = numDocsInIndex * numIndices;
     assertEquals(expectedNumHits, numHits);
-    // check that the optimization was run, as very few docs were collected
-    assertTrue(collectedDocs < totalDocs);
+    assertNonCompetitiveHitsAreSkipped(collectedDocs, totalDocs);
   }
 
   public void testDocSortOptimizationWithAfter() throws IOException {
@@ -405,7 +404,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
     final IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = newSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     final int numHits = 10;
     final int totalHitsThreshold = 10;
     final int[] searchAfters = {3, 10, numDocs - 10};
@@ -424,8 +423,7 @@ public class TestSortOptimization extends LuceneTestCase {
           assertEquals(expectedDocID, topDocs.scoreDocs[i].doc);
         }
         assertTrue(collector.isEarlyTerminated());
-        // check that very few docs were collected
-        assertTrue(topDocs.totalHits.value < numDocs);
+        assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
       }
 
       // sort by _doc + _score with search after should trigger optimization
@@ -442,8 +440,7 @@ public class TestSortOptimization extends LuceneTestCase {
           assertEquals(expectedDocID, topDocs.scoreDocs[i].doc);
         }
         assertTrue(collector.isEarlyTerminated());
-        // assert that very few docs were collected
-        assertTrue(topDocs.totalHits.value < numDocs);
+        assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
       }
 
       // sort by _doc desc should not trigger optimization
@@ -487,7 +484,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
     final IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = newSearcher(reader);;
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     final int numHits = 3;
     final int totalHitsThreshold = 3;
     final Sort sort = new Sort(FIELD_DOC);
@@ -502,7 +499,7 @@ public class TestSortOptimization extends LuceneTestCase {
         assertEquals(i, topDocs.scoreDocs[i].doc);
       }
       assertTrue(collector.isEarlyTerminated());
-      assertTrue(topDocs.totalHits.value < 10); // assert that very few docs were collected
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, 10);
     }
 
     // sort by _doc with a bool query should skip all non-competitive documents
@@ -522,7 +519,7 @@ public class TestSortOptimization extends LuceneTestCase {
         assertEquals("seg1", d.get("tf"));
       }
       assertTrue(collector.isEarlyTerminated());
-      assertTrue(topDocs.totalHits.value < 10); // assert that very few docs were collected
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, 10);
     }
 
     reader.close();
@@ -551,7 +548,7 @@ public class TestSortOptimization extends LuceneTestCase {
     final IndexReader reader = DirectoryReader.open(writer);
     writer.close();
 
-    IndexSearcher searcher = newSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     searcher.setQueryCache(null);
     final int numHits = 10;
     final int totalHitsThreshold = 10;
@@ -589,7 +586,7 @@ public class TestSortOptimization extends LuceneTestCase {
     IndexReader reader = writer.getReader();
     writer.close();
 
-    IndexSearcher searcher = newSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     SortField sort1 = new SortField("intField", SortField.Type.LONG);
     sort1.setCanUsePoints();
     assertThrows(
@@ -646,7 +643,7 @@ public class TestSortOptimization extends LuceneTestCase {
     }
     IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     SortField sortField = new SortField("my_field", SortField.Type.LONG);
     sortField.setCanUsePoints();
     TopFieldDocs topDocs =
@@ -692,7 +689,7 @@ public class TestSortOptimization extends LuceneTestCase {
     seqNos.sort(Long::compare);
     IndexReader reader = DirectoryReader.open(writer);
     writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader, random().nextBoolean(), random().nextBoolean());
     SortField sortField = new SortField("seq_no", SortField.Type.LONG);
     sortField.setCanUsePoints();
     int visitedHits = 0;
@@ -718,4 +715,13 @@ public class TestSortOptimization extends LuceneTestCase {
     dir.close();
   }
 
+  private void assertNonCompetitiveHitsAreSkipped(long collectedHits, long numDocs) {
+    if (collectedHits >= numDocs) {
+      fail(
+          "Expected some non-competitive hits are skipped; got collected_hits="
+              + collectedHits
+              + " num_docs="
+              + numDocs);
+    }
+  }
 }
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingBulkScorer.java b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingBulkScorer.java
index 4445f3d..309d31a 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingBulkScorer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingBulkScorer.java
@@ -73,13 +73,30 @@ final class AssertingBulkScorer extends BulkScorer {
   }
 
   @Override
-  public int score(LeafCollector collector, Bits acceptDocs, int min, final int max) throws IOException {
-    assert min >= this.max: "Scoring backward: min=" + min + " while previous max was max=" + this.max;
+  public int score(LeafCollector collector, Bits acceptDocs, int min, final int max)
+      throws IOException {
+    assert min >= this.max
+        : "Scoring backward: min=" + min + " while previous max was max=" + this.max;
     assert min <= max : "max must be greater than min, got min=" + min + ", and max=" + max;
     this.max = max;
     collector = new AssertingLeafCollector(collector, min, max);
-    final int next = in.score(collector, acceptDocs, min, max);
-    assert next >= max;
+    int next = min;
+    do {
+      final int upTo;
+      if (random.nextBoolean()) {
+        upTo = max;
+      } else {
+        final long interval;
+        if (random.nextInt(100) <= 5) {
+          interval = 1 + random.nextInt(10);
+        } else {
+          interval = 1 + random.nextInt(random.nextBoolean() ? 100 : 5000);
+        }
+        upTo = Math.toIntExact(Math.min(next + interval, max));
+      }
+      next = in.score(new AssertingLeafCollector(collector, next, upTo), acceptDocs, next, upTo);
+    } while (next < max);
+
     if (max >= maxDoc || next >= maxDoc) {
       assert next == DocIdSetIterator.NO_MORE_DOCS;
       return DocIdSetIterator.NO_MORE_DOCS;
@@ -92,5 +109,4 @@ final class AssertingBulkScorer extends BulkScorer {
   public String toString() {
     return "AssertingBulkScorer(" + in + ")";
   }
-
 }
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingScorable.java b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingScorable.java
index 208eb4b..662d486 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingScorable.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingScorable.java
@@ -47,18 +47,42 @@ public class AssertingScorable extends FilterScorable {
     if (in instanceof AssertingScorable) {
       return in;
     }
-    return new AssertingScorable(in);
+    // If `in` is Scorer, we need to wrap it as a Scorer instead of Scorable because
+    // NumericComparator uses the iterator cost of a Scorer in sort optimization.
+    if (in instanceof Scorer) {
+      return new WrappedScorer((Scorer) in);
+    } else {
+      return new AssertingScorable(in);
+    }
+  }
+
+  private static class WrappedScorer extends FilterScorer {
+    WrappedScorer(Scorer in) {
+      super(in);
+    }
+
+    @Override
+    public float score() throws IOException {
+      return new AssertingScorable(in).score();
+    }
+
+    @Override
+    public void setMinCompetitiveScore(float minScore) throws IOException {
+      in.setMinCompetitiveScore(minScore);
+    }
+
+    @Override
+    public float getMaxScore(int upTo) throws IOException {
+      return in.getMaxScore(upTo);
+    }
   }
 
   public static Scorable unwrap(Scorable in) {
     while (true) {
-      if (in instanceof AssertingScorable)
-        in = ((AssertingScorable)in).in;
-      else if (in instanceof AssertingScorer)
-        in = ((AssertingScorer)in).in;
-      else
-        return in;
+      if (in instanceof AssertingScorable) in = ((AssertingScorable) in).in;
+      else if (in instanceof AssertingScorer) in = ((AssertingScorer) in).in;
+      else if (in instanceof WrappedScorer) in = ((WrappedScorer) in).in;
+      else return in;
     }
   }
-
 }