You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2021/10/25 14:21:29 UTC
[lucene-solr] branch branch_8x updated: LUCENE-10154 NumericLeafComparator to define getPointValues (#2593)

This is an automated email from the ASF dual-hosted git repository.

mayya pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 3630c9f  LUCENE-10154 NumericLeafComparator to define getPointValues (#2593)
3630c9f is described below

commit 3630c9f5c30ddfb57835a763d2eda5fc61efad62
Author: Mayya Sharipova <ma...@elastic.co>
AuthorDate: Mon Oct 25 10:21:10 2021 -0400

    LUCENE-10154 NumericLeafComparator to define getPointValues (#2593)
    
    This patch adds getPointValues to NumericLeafComparatorsimilar how it
    has getNumericDocValues.
    
    Numeric Sort optimization with points relies on the assumption that
    points and doc values record the same information, as we substitute
    iterator over doc_values with one over points.
    
    If we override getNumericDocValues it almost certainly means that whatever
    PointValues NumericComparator is going to look at shouldn't be used to
    skip non-competitive documents. Returning null for pointValues in this
    case will force comparator NOT to use sort optimization with points,
    and continue with a traditional way of iterating over doc values.
    
    Backport for https://github.com/apache/lucene/pull/364
---
 lucene/CHANGES.txt                                 |  2 +
 .../apache/lucene/search/DoubleValuesSource.java   |  6 ++
 .../org/apache/lucene/search/LongValuesSource.java |  6 ++
 .../lucene/search/SortedNumericSortField.java      | 49 +++++++++++++
 .../search/comparators/NumericComparator.java      | 37 +++++++++-
 .../apache/lucene/search/TestSortOptimization.java | 82 ++++++++++++++++++++++
 .../search/join/ToParentBlockJoinSortField.java    | 21 ++++++
 7 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index be50560..a1f0410 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -41,6 +41,8 @@ Bug Fixes
 
 * LUCENE-10008: Respect ignoreCase in CommonGramsFilterFactory (Vigya Sharma)
 
+* LUCENE-10154: NumericLeafComparator to define getPointValues. (Mayya Sharipova, Adrien Grand)
+
 Build
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java
index ca82a2c..eee0108 100644
--- a/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DoubleValuesSource.java
@@ -26,6 +26,7 @@ import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.search.comparators.DoubleComparator;
 
 /**
@@ -501,6 +502,11 @@ public abstract class DoubleValuesSource implements SegmentCacheable {
             }
 
             @Override
+            protected PointValues getPointValues(LeafReaderContext context, String field) {
+              return null;
+            }
+
+            @Override
             public void setScorer(Scorable scorer) throws IOException {
               holder.values = producer.getValues(ctx, fromScorer(scorer));
               super.setScorer(scorer);
diff --git a/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java b/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java
index 73575fd..fa2a2f6 100644
--- a/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java
+++ b/lucene/core/src/java/org/apache/lucene/search/LongValuesSource.java
@@ -23,6 +23,7 @@ import java.util.Objects;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.search.comparators.LongComparator;
 
 /**
@@ -349,6 +350,11 @@ public abstract class LongValuesSource implements SegmentCacheable {
             }
 
             @Override
+            protected PointValues getPointValues(LeafReaderContext context, String field) {
+              return null;
+            }
+
+            @Override
             public void setScorer(Scorable scorer) throws IOException {
               holder.values = producer.getValues(ctx, DoubleValuesSource.fromScorer(scorer));
               super.setScorer(scorer);
diff --git a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
index 5e5d0a5..ec21cba 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
@@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexSorter;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SortFieldProvider;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.search.comparators.DoubleComparator;
@@ -242,6 +243,18 @@ public class SortedNumericSortField extends SortField {
                     return SortedNumericSelector.wrap(
                         DocValues.getSortedNumeric(context.reader(), field), selector, type);
                   }
+                  // we can use sort optimization with points if selector is MIN or MAX,
+                  // because we can still build successful iterator over points in this case.
+                  @Override
+                  protected PointValues getPointValues(LeafReaderContext context, String field)
+                      throws IOException {
+                    if (selector == SortedNumericSelector.Type.MAX
+                        || selector == SortedNumericSelector.Type.MIN) {
+                      return super.getPointValues(context, field);
+                    } else {
+                      return null;
+                    }
+                  }
                 };
               }
             };
@@ -259,6 +272,18 @@ public class SortedNumericSortField extends SortField {
                     return SortedNumericSelector.wrap(
                         DocValues.getSortedNumeric(context.reader(), field), selector, type);
                   }
+                  // we can use sort optimization with points if selector is MIN or MAX,
+                  // because we can still build successful iterator over points in this case.
+                  @Override
+                  protected PointValues getPointValues(LeafReaderContext context, String field)
+                      throws IOException {
+                    if (selector == SortedNumericSelector.Type.MAX
+                        || selector == SortedNumericSelector.Type.MIN) {
+                      return super.getPointValues(context, field);
+                    } else {
+                      return null;
+                    }
+                  }
                 };
               }
             };
@@ -276,6 +301,18 @@ public class SortedNumericSortField extends SortField {
                     return SortedNumericSelector.wrap(
                         DocValues.getSortedNumeric(context.reader(), field), selector, type);
                   }
+                  // we can use sort optimization with points if selector is MIN or MAX,
+                  // because we can still build successful iterator over points in this case.
+                  @Override
+                  protected PointValues getPointValues(LeafReaderContext context, String field)
+                      throws IOException {
+                    if (selector == SortedNumericSelector.Type.MAX
+                        || selector == SortedNumericSelector.Type.MIN) {
+                      return super.getPointValues(context, field);
+                    } else {
+                      return null;
+                    }
+                  }
                 };
               }
             };
@@ -293,6 +330,18 @@ public class SortedNumericSortField extends SortField {
                     return SortedNumericSelector.wrap(
                         DocValues.getSortedNumeric(context.reader(), field), selector, type);
                   }
+                  // we can use sort optimization with points if selector is MIN or MAX,
+                  // because we can still build successful iterator over points in this case.
+                  @Override
+                  protected PointValues getPointValues(LeafReaderContext context, String field)
+                      throws IOException {
+                    if (selector == SortedNumericSelector.Type.MAX
+                        || selector == SortedNumericSelector.Type.MIN) {
+                      return super.getPointValues(context, field);
+                    } else {
+                      return null;
+                    }
+                  }
                 };
               }
             };
diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
index fed13a0..86aff36 100644
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
@@ -35,6 +35,11 @@ import java.io.IOException;
 /**
  * Abstract numeric comparator for comparing numeric values.
  * This comparator provides a skipping functionality – an iterator that can skip over non-competitive documents.
+ *
+ * <p>Parameter {@code field} provided in the constructor is used as a field name in the default
+ * implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve
+ * doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script),
+ * but in this case you must override both of these methods.
  */
 public abstract class NumericComparator<T extends Number> extends FieldComparator<T> {
   protected final T missingValue;
@@ -89,7 +94,7 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
 
     public NumericLeafComparator(LeafReaderContext context) throws IOException {
       this.docValues = getNumericDocValues(context, field);
-      this.pointValues = canSkipDocuments ? context.reader().getPointValues(field) : null;
+      this.pointValues = canSkipDocuments ? getPointValues(context, field) : null;
       if (pointValues != null) {
         FieldInfo info = context.reader().getFieldInfos().fieldInfo(field);
         if (info == null || info.getPointDimensionCount() == 0) {
@@ -127,11 +132,41 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
 
     /**
      * Retrieves the NumericDocValues for the field in this segment
+     *
+     * <p>If you override this method, you must also override {@link
+     * #getPointValues(LeafReaderContext, String)} This class uses sort optimization that leverages
+     * points to filter out non-competitive matches, which relies on the assumption that points and
+     * doc values record the same information.
+     *
+     * @param context – reader context
+     * @param field - field name
+     * @return numeric doc values for the field in this segment.
+     * @throws IOException If there is a low-level I/O error
      */
     protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field) throws IOException {
       return DocValues.getNumeric(context.reader(), field);
     }
 
+    /**
+     * Retrieves point values for the field in this segment
+     *
+     * <p>If you override this method, you must also override {@link
+     * #getNumericDocValues(LeafReaderContext, String)} This class uses sort optimization that
+     * leverages points to filter out non-competitive matches, which relies on the assumption that
+     * points and doc values record the same information. Return {@code null} even if no points
+     * implementation is available, in this case sort optimization with points will be disabled.
+     *
+     * @param context – reader context
+     * @param field - field name
+     * @return point values for the field in this segment if they are available or {@code null} if
+     *     sort optimization with points should be disabled.
+     * @throws IOException If there is a low-level I/O error
+     */
+    protected PointValues getPointValues(LeafReaderContext context, String field)
+        throws IOException {
+      return context.reader().getPointValues(field);
+    }
+
     @Override
     public void setBottom(int slot) throws IOException {
       queueFull = true; // if we are setting bottom, it means that we have collected enough hits
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
index 9270cc2..c3170d8 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
@@ -16,6 +16,7 @@
  */
 package org.apache.lucene.search;
 
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FloatDocValuesField;
@@ -24,6 +25,7 @@ import org.apache.lucene.document.IntPoint;
 import org.apache.lucene.document.IntRange;
 import org.apache.lucene.document.FloatPoint;
 import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
@@ -751,6 +753,86 @@ public class TestSortOptimization extends LuceneTestCase {
     dir.close();
   }
 
+  // Test that sort on sorted numeric field without sort optimization and
+  // with sort optimization produce the same results
+  public void testSortOptimizationOnSortedNumericField() throws IOException {
+    final Directory dir = newDirectory();
+    final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
+    final int numDocs = atLeast(5000);
+    for (int i = 0; i < numDocs; ++i) {
+      int value = random().nextInt();
+      int value2 = random().nextInt();
+      final Document doc = new Document();
+      doc.add(new SortedNumericDocValuesField("my_field", value));
+      doc.add(new SortedNumericDocValuesField("my_field", value2));
+      doc.add(new LongPoint("my_field", value));
+      doc.add(new LongPoint("my_field", value2));
+      writer.addDocument(doc);
+    }
+    final IndexReader reader = DirectoryReader.open(writer);
+    writer.close();
+    IndexSearcher searcher = newSearcher(reader);
+
+    SortedNumericSelector.Type type =
+        RandomPicks.randomFrom(random(), SortedNumericSelector.Type.values());
+    boolean reverse = random().nextBoolean();
+    final SortField sortField =
+        new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type);
+    final Sort sort = new Sort(sortField); // sort without sort optimization
+    final SortField sortField2 =
+        new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type);
+    sortField2.setCanUsePoints();
+    final Sort sort2 = new Sort(sortField2);  // sort with sort optimization
+    Query query = new MatchAllDocsQuery();
+    final int totalHitsThreshold = 3;
+
+    long expectedCollectedHits = 0;
+    long collectedHits = 0;
+    long collectedHits2 = 0;
+    int visitedHits = 0;
+    ScoreDoc after = null;
+    while (visitedHits < numDocs) {
+      int batch = 1 + random().nextInt(100);
+      int expectedHits = Math.min(numDocs - visitedHits, batch);
+
+      final TopFieldCollector collector =
+          TopFieldCollector.create(sort, batch, (FieldDoc) after, totalHitsThreshold);
+      searcher.search(query, collector);
+      TopDocs topDocs = collector.topDocs();
+      ScoreDoc[] scoreDocs = topDocs.scoreDocs;
+
+      final TopFieldCollector collector2 =
+          TopFieldCollector.create(sort2, batch, (FieldDoc) after, totalHitsThreshold);
+      searcher.search(query, collector2);
+      TopDocs topDocs2 = collector2.topDocs();
+      ScoreDoc[] scoreDocs2 = topDocs2.scoreDocs;
+
+      // assert that the resulting hits are the same
+      assertEquals(expectedHits, topDocs.scoreDocs.length);
+      assertEquals(topDocs.scoreDocs.length, topDocs2.scoreDocs.length);
+      for (int i = 0; i < scoreDocs.length; i++) {
+        FieldDoc fieldDoc = (FieldDoc) scoreDocs[i];
+        FieldDoc fieldDoc2 = (FieldDoc) scoreDocs2[i];
+        assertEquals(fieldDoc.fields[0], fieldDoc2.fields[0]);
+        assertEquals(fieldDoc.doc, fieldDoc2.doc);
+        visitedHits++;
+      }
+
+      expectedCollectedHits += numDocs;
+      collectedHits += topDocs.totalHits.value;
+      collectedHits2 += topDocs2.totalHits.value;
+      after = scoreDocs[expectedHits - 1];
+    }
+    assertEquals(visitedHits, numDocs);
+    assertEquals(expectedCollectedHits, collectedHits);
+    // assert that the second sort with optimization collected less or equal hits
+    assertTrue(collectedHits >= collectedHits2);
+    // System.out.println(expectedCollectedHits + "\t" + collectedHits + "\t" + collectedHits2);
+
+    reader.close();
+    dir.close();
+  }
+
   private void assertNonCompetitiveHitsAreSkipped(long collectedHits, long numDocs) {
     if (collectedHits >= numDocs) {
       fail(
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
index 2227405..ae72314 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
@@ -22,6 +22,7 @@ import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.FilterNumericDocValues;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
@@ -147,6 +148,11 @@ public class ToParentBlockJoinSortField extends SortField {
             }
             return BlockJoinSelector.wrap(sortedNumeric, type, parents, children);
           }
+          // no sort optimization with points
+          @Override
+          protected PointValues getPointValues(LeafReaderContext context, String field) {
+            return null;
+          }
         };
       }
     };
@@ -170,6 +176,11 @@ public class ToParentBlockJoinSortField extends SortField {
             }
             return BlockJoinSelector.wrap(sortedNumeric, type, parents, children);
           }
+          // no sort optimization with points
+          @Override
+          protected PointValues getPointValues(LeafReaderContext context, String field) {
+            return null;
+          }
         };
       }
     };
@@ -199,6 +210,11 @@ public class ToParentBlockJoinSortField extends SortField {
               }
             };
           }
+          // no sort optimization with points
+          @Override
+          protected PointValues getPointValues(LeafReaderContext context, String field) {
+            return null;
+          }
         };
       };
     };
@@ -228,6 +244,11 @@ public class ToParentBlockJoinSortField extends SortField {
               }
             };
           }
+          // no sort optimization with points
+          @Override
+          protected PointValues getPointValues(LeafReaderContext context, String field) {
+            return null;
+          }
         };
       }
     };