You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2021/10/25 13:38:43 UTC

[lucene] branch main updated: LUCENE-10154 NumericLeafComparator to define getPointValues (#364)

This is an automated email from the ASF dual-hosted git repository.

mayya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 2ed6e4a  LUCENE-10154 NumericLeafComparator to define getPointValues (#364)
2ed6e4a is described below

commit 2ed6e4aa78eb6d1fbb90c21c9723313ab5077e83
Author: Mayya Sharipova <ma...@elastic.co>
AuthorDate: Mon Oct 25 09:38:37 2021 -0400

    LUCENE-10154 NumericLeafComparator to define getPointValues (#364)
    
    This patch adds getPointValues to NumericLeafComparatorsimilar how it
    has getNumericDocValues.
    
    Numeric Sort optimization with points relies on the assumption that
    points and doc values record the same information, as we substitute
    iterator over doc_values with one over points.
    
    If we override getNumericDocValues it almost certainly means that whatever
    PointValues NumericComparator is going to look at shouldn't be used to
    skip non-competitive documents. Returning null for pointValues in this
    case will force comparator NOT to use sort optimization with points,
    and continue with a traditional way of iterating over doc values.
---
 lucene/CHANGES.txt                                 |  2 +
 .../apache/lucene/search/DoubleValuesSource.java   |  6 ++
 .../org/apache/lucene/search/LongValuesSource.java |  6 ++
 .../lucene/search/SortedNumericSortField.java      | 49 +++++++++++++
 .../search/comparators/NumericComparator.java      | 41 ++++++++++-
 .../apache/lucene/search/TestSortOptimization.java | 82 ++++++++++++++++++++++
 .../search/join/ToParentBlockJoinSortField.java    | 21 ++++++
 7 files changed, 205 insertions(+), 2 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3cf94ac..96d42cb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -478,6 +478,8 @@ Bug Fixes
 * LUCENE-10134: ConcurrentSortedSetDocValuesFacetCounts shouldn't share liveDocs Bits across threads.
   (Ankur Goel)
 
+* LUCENE-10154: NumericLeafComparator to define getPointValues. (Mayya Sharipova, Adrien Grand)
+
 Build
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java
index a665ecc..d59bc9b 100644
--- a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java
@@ -25,6 +25,7 @@ import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.search.comparators.DoubleComparator;
 
 /**
@@ -493,6 +494,11 @@ public abstract class DoubleValuesSource implements SegmentCacheable {
             }
 
             @Override
+            protected PointValues getPointValues(LeafReaderContext context, String field) {
+              return null;
+            }
+
+            @Override
             public void setScorer(Scorable scorer) throws IOException {
               holder.values = producer.getValues(ctx, fromScorer(scorer));
               super.setScorer(scorer);
diff --git a/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java
index 4386446..4443b05 100644
--- a/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java
+++ b/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java
@@ -22,6 +22,7 @@ import java.util.Objects;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.search.comparators.LongComparator;
 
 /**
@@ -344,6 +345,11 @@ public abstract class LongValuesSource implements SegmentCacheable {
             }
 
             @Override
+            protected PointValues getPointValues(LeafReaderContext context, String field) {
+              return null;
+            }
+
+            @Override
             public void setScorer(Scorable scorer) throws IOException {
               holder.values = producer.getValues(ctx, DoubleValuesSource.fromScorer(scorer));
               super.setScorer(scorer);
diff --git a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
index 60ddaa9..e01461e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
@@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexSorter;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SortFieldProvider;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.search.comparators.DoubleComparator;
@@ -257,6 +258,18 @@ public class SortedNumericSortField extends SortField {
                     return SortedNumericSelector.wrap(
                         DocValues.getSortedNumeric(context.reader(), field), selector, type);
                   }
+                  // we can use sort optimization with points if selector is MIN or MAX,
+                  // because we can still build successful iterator over points in this case.
+                  @Override
+                  protected PointValues getPointValues(LeafReaderContext context, String field)
+                      throws IOException {
+                    if (selector == SortedNumericSelector.Type.MAX
+                        || selector == SortedNumericSelector.Type.MIN) {
+                      return super.getPointValues(context, field);
+                    } else {
+                      return null;
+                    }
+                  }
                 };
               }
             };
@@ -274,6 +287,18 @@ public class SortedNumericSortField extends SortField {
                     return SortedNumericSelector.wrap(
                         DocValues.getSortedNumeric(context.reader(), field), selector, type);
                   }
+                  // we can use sort optimization with points if selector is MIN or MAX,
+                  // because we can still build successful iterator over points in this case.
+                  @Override
+                  protected PointValues getPointValues(LeafReaderContext context, String field)
+                      throws IOException {
+                    if (selector == SortedNumericSelector.Type.MAX
+                        || selector == SortedNumericSelector.Type.MIN) {
+                      return super.getPointValues(context, field);
+                    } else {
+                      return null;
+                    }
+                  }
                 };
               }
             };
@@ -291,6 +316,18 @@ public class SortedNumericSortField extends SortField {
                     return SortedNumericSelector.wrap(
                         DocValues.getSortedNumeric(context.reader(), field), selector, type);
                   }
+                  // we can use sort optimization with points if selector is MIN or MAX,
+                  // because we can still build successful iterator over points in this case.
+                  @Override
+                  protected PointValues getPointValues(LeafReaderContext context, String field)
+                      throws IOException {
+                    if (selector == SortedNumericSelector.Type.MAX
+                        || selector == SortedNumericSelector.Type.MIN) {
+                      return super.getPointValues(context, field);
+                    } else {
+                      return null;
+                    }
+                  }
                 };
               }
             };
@@ -308,6 +345,18 @@ public class SortedNumericSortField extends SortField {
                     return SortedNumericSelector.wrap(
                         DocValues.getSortedNumeric(context.reader(), field), selector, type);
                   }
+                  // we can use sort optimization with points if selector is MIN or MAX,
+                  // because we can still build successful iterator over points in this case.
+                  @Override
+                  protected PointValues getPointValues(LeafReaderContext context, String field)
+                      throws IOException {
+                    if (selector == SortedNumericSelector.Type.MAX
+                        || selector == SortedNumericSelector.Type.MIN) {
+                      return super.getPointValues(context, field);
+                    } else {
+                      return null;
+                    }
+                  }
                 };
               }
             };
diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
index d706f35..d8a597d 100644
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
@@ -35,6 +35,11 @@ import org.apache.lucene.util.DocIdSetBuilder;
 /**
  * Abstract numeric comparator for comparing numeric values. This comparator provides a skipping
  * functionality – an iterator that can skip over non-competitive documents.
+ *
+ * <p>Parameter {@code field} provided in the constructor is used as a field name in the default
+ * implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve
+ * doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script),
+ * but in this case you must override both of these methods.
  */
 public abstract class NumericComparator<T extends Number> extends FieldComparator<T> {
   protected final T missingValue;
@@ -92,7 +97,7 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
 
     public NumericLeafComparator(LeafReaderContext context) throws IOException {
       this.docValues = getNumericDocValues(context, field);
-      this.pointValues = canSkipDocuments ? context.reader().getPointValues(field) : null;
+      this.pointValues = canSkipDocuments ? getPointValues(context, field) : null;
       if (pointValues != null) {
         FieldInfo info = context.reader().getFieldInfos().fieldInfo(field);
         if (info == null || info.getPointDimensionCount() == 0) {
@@ -130,12 +135,44 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
       }
     }
 
-    /** Retrieves the NumericDocValues for the field in this segment */
+    /**
+     * Retrieves the NumericDocValues for the field in this segment
+     *
+     * <p>If you override this method, you must also override {@link
+     * #getPointValues(LeafReaderContext, String)} This class uses sort optimization that leverages
+     * points to filter out non-competitive matches, which relies on the assumption that points and
+     * doc values record the same information.
+     *
+     * @param context – reader context
+     * @param field - field name
+     * @return numeric doc values for the field in this segment.
+     * @throws IOException If there is a low-level I/O error
+     */
     protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field)
         throws IOException {
       return DocValues.getNumeric(context.reader(), field);
     }
 
+    /**
+     * Retrieves point values for the field in this segment
+     *
+     * <p>If you override this method, you must also override {@link
+     * #getNumericDocValues(LeafReaderContext, String)} This class uses sort optimization that
+     * leverages points to filter out non-competitive matches, which relies on the assumption that
+     * points and doc values record the same information. Return {@code null} even if no points
+     * implementation is available, in this case sort optimization with points will be disabled.
+     *
+     * @param context – reader context
+     * @param field - field name
+     * @return point values for the field in this segment if they are available or {@code null} if
+     *     sort optimization with points should be disabled.
+     * @throws IOException If there is a low-level I/O error
+     */
+    protected PointValues getPointValues(LeafReaderContext context, String field)
+        throws IOException {
+      return context.reader().getPointValues(field);
+    }
+
     @Override
     public void setBottom(int slot) throws IOException {
       queueFull = true; // if we are setting bottom, it means that we have collected enough hits
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
index 52ef0c8..3e91135 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search;
 import static org.apache.lucene.search.SortField.FIELD_DOC;
 import static org.apache.lucene.search.SortField.FIELD_SCORE;
 
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
@@ -31,6 +32,7 @@ import org.apache.lucene.document.IntPoint;
 import org.apache.lucene.document.IntRange;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
@@ -762,6 +764,86 @@ public class TestSortOptimization extends LuceneTestCase {
     dir.close();
   }
 
+  // Test that sort on sorted numeric field without sort optimization and
+  // with sort optimization produce the same results
+  public void testSortOptimizationOnSortedNumericField() throws IOException {
+    final Directory dir = newDirectory();
+    final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
+    final int numDocs = atLeast(5000);
+    for (int i = 0; i < numDocs; ++i) {
+      int value = random().nextInt();
+      int value2 = random().nextInt();
+      final Document doc = new Document();
+      doc.add(new SortedNumericDocValuesField("my_field", value));
+      doc.add(new SortedNumericDocValuesField("my_field", value2));
+      doc.add(new LongPoint("my_field", value));
+      doc.add(new LongPoint("my_field", value2));
+      writer.addDocument(doc);
+    }
+    final IndexReader reader = DirectoryReader.open(writer);
+    writer.close();
+    IndexSearcher searcher = newSearcher(reader);
+
+    SortedNumericSelector.Type type =
+        RandomPicks.randomFrom(random(), SortedNumericSelector.Type.values());
+    boolean reverse = random().nextBoolean();
+    final SortField sortField =
+        new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type);
+    sortField.setOptimizeSortWithPoints(false);
+    final Sort sort = new Sort(sortField); // sort without sort optimization
+    final SortField sortField2 =
+        new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type);
+    final Sort sort2 = new Sort(sortField2); // sort with sort optimization
+    Query query = new MatchAllDocsQuery();
+    final int totalHitsThreshold = 3;
+
+    long expectedCollectedHits = 0;
+    long collectedHits = 0;
+    long collectedHits2 = 0;
+    int visitedHits = 0;
+    ScoreDoc after = null;
+    while (visitedHits < numDocs) {
+      int batch = 1 + random().nextInt(100);
+      int expectedHits = Math.min(numDocs - visitedHits, batch);
+
+      final TopFieldCollector collector =
+          TopFieldCollector.create(sort, batch, (FieldDoc) after, totalHitsThreshold);
+      searcher.search(query, collector);
+      TopDocs topDocs = collector.topDocs();
+      ScoreDoc[] scoreDocs = topDocs.scoreDocs;
+
+      final TopFieldCollector collector2 =
+          TopFieldCollector.create(sort2, batch, (FieldDoc) after, totalHitsThreshold);
+      searcher.search(query, collector2);
+      TopDocs topDocs2 = collector2.topDocs();
+      ScoreDoc[] scoreDocs2 = topDocs2.scoreDocs;
+
+      // assert that the resulting hits are the same
+      assertEquals(expectedHits, topDocs.scoreDocs.length);
+      assertEquals(topDocs.scoreDocs.length, topDocs2.scoreDocs.length);
+      for (int i = 0; i < scoreDocs.length; i++) {
+        FieldDoc fieldDoc = (FieldDoc) scoreDocs[i];
+        FieldDoc fieldDoc2 = (FieldDoc) scoreDocs2[i];
+        assertEquals(fieldDoc.fields[0], fieldDoc2.fields[0]);
+        assertEquals(fieldDoc.doc, fieldDoc2.doc);
+        visitedHits++;
+      }
+
+      expectedCollectedHits += numDocs;
+      collectedHits += topDocs.totalHits.value;
+      collectedHits2 += topDocs2.totalHits.value;
+      after = scoreDocs[expectedHits - 1];
+    }
+    assertEquals(visitedHits, numDocs);
+    assertEquals(expectedCollectedHits, collectedHits);
+    // assert that the second sort with optimization collected less or equal hits
+    assertTrue(collectedHits >= collectedHits2);
+    // System.out.println(expectedCollectedHits + "\t" + collectedHits + "\t" + collectedHits2);
+
+    reader.close();
+    dir.close();
+  }
+
   private void assertNonCompetitiveHitsAreSkipped(long collectedHits, long numDocs) {
     if (collectedHits >= numDocs) {
       fail(
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
index 4d0aa29..b97a3ca 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
@@ -23,6 +23,7 @@ import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.FilterNumericDocValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
@@ -172,6 +173,11 @@ public class ToParentBlockJoinSortField extends SortField {
             }
             return BlockJoinSelector.wrap(sortedNumeric, type, parents, toIter(children));
           }
+          // no sort optimization with points
+          @Override
+          protected PointValues getPointValues(LeafReaderContext context, String field) {
+            return null;
+          }
         };
       }
     };
@@ -196,6 +202,11 @@ public class ToParentBlockJoinSortField extends SortField {
             }
             return BlockJoinSelector.wrap(sortedNumeric, type, parents, toIter(children));
           }
+          // no sort optimization with points
+          @Override
+          protected PointValues getPointValues(LeafReaderContext context, String field) {
+            return null;
+          }
         };
       }
     };
@@ -227,6 +238,11 @@ public class ToParentBlockJoinSortField extends SortField {
               }
             };
           }
+          // no sort optimization with points
+          @Override
+          protected PointValues getPointValues(LeafReaderContext context, String field) {
+            return null;
+          }
         };
       }
       ;
@@ -260,6 +276,11 @@ public class ToParentBlockJoinSortField extends SortField {
               }
             };
           }
+          // no sort optimization with points
+          @Override
+          protected PointValues getPointValues(LeafReaderContext context, String field) {
+            return null;
+          }
         };
       }
     };