You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2021/07/07 00:30:05 UTC
[lucene-solr] branch branch_8x updated: LUCENE-10020 DocComparator
don't skip docs of same docID (#2530)
This is an automated email from the ASF dual-hosted git repository.
mayya pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new bdef1be LUCENE-10020 DocComparator don't skip docs of same docID (#2530)
bdef1be is described below
commit bdef1be78fb46ab7bf1924386daab2a7db1a4316
Author: Mayya Sharipova <ma...@elastic.co>
AuthorDate: Tue Jul 6 20:29:50 2021 -0400
LUCENE-10020 DocComparator don't skip docs of same docID (#2530)
DocComparator should not skip docs with the same docID on multiple
sorts with search after.
Because of the optimization introduced in LUCENE-9449, currently when
searching with sort on [_doc, other fields] with search after,
DocComparator can efficiently skip all docs before and including
the provided [search after docID]. This is a desirable behaviour
in a single index search. But in a distributed search, where multiple
indices have docs with the same docID, and when searching on
[_doc, other fields], the sort optimization should NOT skip
documents with the same docIDs.
This PR fixes this.
Backport for https://github.com/apache/lucene/pull/204
Relates to LUCENE-9449
---
lucene/CHANGES.txt | 3 +
.../lucene/search/comparators/DocComparator.java | 6 +-
...tionSkipping.java => TestSortOptimization.java} | 65 +++++++++++++++++++++-
3 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3ed4cd4..624b555 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -54,6 +54,9 @@ Bug Fixes
* LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)
+* LUCENE-10020: DocComparator should not skip docs with the same docID on
+ multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)
+
Other
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java
index ec98fbf..0bda127 100644
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java
@@ -89,7 +89,11 @@ public class DocComparator extends FieldComparator<Integer> {
public DocLeafComparator(LeafReaderContext context) {
this.docBase = context.docBase;
if (enableSkipping) {
- this.minDoc = topValue + 1;
+ // Skip docs before topValue, but include docs starting with topValue.
+ // Including topValue is necessary when doing sort on [_doc, other fields]
+ // in a distributed search where there are docs from different indices
+ // with the same docID.
+ this.minDoc = topValue;
this.maxDoc = context.reader().maxDoc();
this.competitiveIterator = DocIdSetIterator.all(maxDoc);
} else {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFieldSortOptimizationSkipping.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
similarity index 90%
rename from lucene/core/src/test/org/apache/lucene/search/TestFieldSortOptimizationSkipping.java
rename to lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
index 96e6ad9..db61af9 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFieldSortOptimizationSkipping.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
@@ -38,7 +38,7 @@ import java.io.IOException;
import static org.apache.lucene.search.SortField.FIELD_DOC;
import static org.apache.lucene.search.SortField.FIELD_SCORE;
-public class TestFieldSortOptimizationSkipping extends LuceneTestCase {
+public class TestSortOptimization extends LuceneTestCase {
public void testLongSortOptimization() throws IOException {
@@ -321,6 +321,69 @@ public class TestFieldSortOptimizationSkipping extends LuceneTestCase {
dir.close();
}
+ /**
+ * Test that a search with sort on [_doc, other fields] across multiple indices doesn't miss any
+ * documents.
+ */
+ public void testDocSortOptimizationMultipleIndices() throws IOException {
+ final int numIndices = 3;
+ final int numDocsInIndex = atLeast(50);
+ Directory[] dirs = new Directory[numIndices];
+ IndexReader[] readers = new IndexReader[numIndices];
+ for (int i = 0; i < numIndices; i++) {
+ dirs[i] = newDirectory();
+ try (IndexWriter writer = new IndexWriter(dirs[i], new IndexWriterConfig())) {
+ for (int docID = 0; docID < numDocsInIndex; docID++) {
+ final Document doc = new Document();
+ doc.add(new NumericDocValuesField("my_field", docID * numIndices + i));
+ writer.addDocument(doc);
+ }
+ writer.flush();
+ }
+ readers[i] = DirectoryReader.open(dirs[i]);
+ }
+
+ final int size = 7;
+ final int totalHitsThreshold = 7;
+ final Sort sort = new Sort(FIELD_DOC, new SortField("my_field", SortField.Type.LONG));
+ TopFieldDocs[] topDocs = new TopFieldDocs[numIndices];
+ int curNumHits;
+ FieldDoc after = null;
+ long collectedDocs = 0;
+ long totalDocs = 0;
+ int numHits = 0;
+ do {
+ for (int i = 0; i < numIndices; i++) {
+ IndexSearcher searcher = newSearcher(readers[i]);
+ final TopFieldCollector collector =
+ TopFieldCollector.create(sort, size, after, totalHitsThreshold);
+ searcher.search(new MatchAllDocsQuery(), collector);
+ topDocs[i] = collector.topDocs();
+ for (int docID = 0; docID < topDocs[i].scoreDocs.length; docID++) {
+ topDocs[i].scoreDocs[docID].shardIndex = i;
+ }
+ collectedDocs += topDocs[i].totalHits.value;
+ totalDocs += numDocsInIndex;
+ }
+ TopFieldDocs mergedTopDcs = TopDocs.merge(sort, size, topDocs);
+ curNumHits = mergedTopDcs.scoreDocs.length;
+ numHits += curNumHits;
+ if (curNumHits > 0) {
+ after = (FieldDoc) mergedTopDcs.scoreDocs[curNumHits - 1];
+ }
+ } while (curNumHits > 0);
+
+ for (int i = 0; i < numIndices; i++) {
+ readers[i].close();
+ dirs[i].close();
+ }
+
+ final int expectedNumHits = numDocsInIndex * numIndices;
+ assertEquals(expectedNumHits, numHits);
+ // check that the optimization was run, as very few docs were collected
+ assertTrue(collectedDocs < totalDocs);
+ }
+
public void testDocSortOptimizationWithAfter() throws IOException {
final Directory dir = newDirectory();
final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());