You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/09/26 12:57:28 UTC

[lucene] branch branch_9x updated (8ccb59eb035 -> b69ed36b6d2)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a change to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


    from 8ccb59eb035 Upgrade several build dependencies and regenerate sources (#11812) (#11816)
     new f31292c1441 LUCENE-10425:speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocSetIdIterator construction  using bkd binary search (#687)
     new b69ed36b6d2 Fix codec name in index header for Lucene94FieldInfosFormat. (#11818)

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 lucene/CHANGES.txt                                 |   3 +
 .../codecs/lucene94/Lucene94FieldInfosFormat.java  |   2 +-
 .../IndexSortSortedNumericDocValuesRangeQuery.java | 167 +++++++++++++++++++++
 ...tIndexSortSortedNumericDocValuesRangeQuery.java | 112 ++++++++++++++
 4 files changed, 283 insertions(+), 1 deletion(-)


[lucene] 01/02: LUCENE-10425:speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocSetIdIterator construction using bkd binary search (#687)

Posted by jp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit f31292c14414759acd5fbec52d925a2b95be1e44
Author: jianping weng <xi...@alibaba-inc.com>
AuthorDate: Thu Sep 22 14:51:13 2022 +0800

    LUCENE-10425:speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocSetIdIterator construction  using bkd binary search (#687)
---
 lucene/CHANGES.txt                                 |   3 +
 .../IndexSortSortedNumericDocValuesRangeQuery.java | 167 +++++++++++++++++++++
 ...tIndexSortSortedNumericDocValuesRangeQuery.java | 112 ++++++++++++++
 3 files changed, 282 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 15e97e0afa8..3ddf6dc305a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -17,6 +17,9 @@ Improvements
 ---------------------
 * GITHUB#11785: Improve Tessellator performance by delaying calls to the method
   #isIntersectingPolygon (Ignacio Vera) 
+
+* GITHUB#687: speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator
+  construction using bkd binary search. (Jianping Weng)
   
 Bug Fixes
 ---------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
index 6e517a5594c..a3baa9d48bd 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
@@ -18,12 +18,17 @@ package org.apache.lucene.sandbox.search;
 
 import java.io.IOException;
 import java.util.Objects;
+import java.util.function.Predicate;
+import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
@@ -43,6 +48,8 @@ import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.SortField.Type;
 import org.apache.lucene.search.SortedNumericSortField;
 import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;
 
 /**
  * A range query that can take advantage of the fact that the index is sorted to speed up execution.
@@ -214,12 +221,172 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
     };
   }
 
+  /**
+   * Returns the first document whose packed value is greater than or equal (if allowEqual is true)
+   * to the provided packed value or -1 if all packed values are smaller than the provided one,
+   */
+  public final int nextDoc(PointValues values, byte[] packedValue, boolean allowEqual)
+      throws IOException {
+    assert values.getNumDimensions() == 1;
+    final int bytesPerDim = values.getBytesPerDimension();
+    final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
+    final Predicate<byte[]> biggerThan =
+        testPackedValue -> {
+          int cmp = comparator.compare(testPackedValue, 0, packedValue, 0);
+          return cmp > 0 || (cmp == 0 && allowEqual);
+        };
+    return nextDoc(values.getPointTree(), biggerThan);
+  }
+
+  private int nextDoc(PointValues.PointTree pointTree, Predicate<byte[]> biggerThan)
+      throws IOException {
+    if (biggerThan.test(pointTree.getMaxPackedValue()) == false) {
+      // doc is before us
+      return -1;
+    } else if (pointTree.moveToChild()) {
+      // navigate down
+      do {
+        final int doc = nextDoc(pointTree, biggerThan);
+        if (doc != -1) {
+          return doc;
+        }
+      } while (pointTree.moveToSibling());
+      pointTree.moveToParent();
+      return -1;
+    } else {
+      // doc is in this leaf
+      final int[] doc = {-1};
+      pointTree.visitDocValues(
+          new IntersectVisitor() {
+            @Override
+            public void visit(int docID) {
+              throw new AssertionError("Invalid call to visit(docID)");
+            }
+
+            @Override
+            public void visit(int docID, byte[] packedValue) {
+              if (doc[0] == -1 && biggerThan.test(packedValue)) {
+                doc[0] = docID;
+              }
+            }
+
+            @Override
+            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+              return Relation.CELL_CROSSES_QUERY;
+            }
+          });
+      return doc[0];
+    }
+  }
+
+  private boolean matchNone(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
+      throws IOException {
+    final ByteArrayComparator comparator =
+        ArrayUtil.getUnsignedComparator(points.getBytesPerDimension());
+    for (int dim = 0; dim < points.getNumDimensions(); dim++) {
+      int offset = dim * points.getBytesPerDimension();
+      if (comparator.compare(points.getMinPackedValue(), offset, queryUpperPoint, offset) > 0
+          || comparator.compare(points.getMaxPackedValue(), offset, queryLowerPoint, offset) < 0) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private boolean matchAll(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
+      throws IOException {
+    final ByteArrayComparator comparator =
+        ArrayUtil.getUnsignedComparator(points.getBytesPerDimension());
+    for (int dim = 0; dim < points.getNumDimensions(); dim++) {
+      int offset = dim * points.getBytesPerDimension();
+      if (comparator.compare(points.getMinPackedValue(), offset, queryUpperPoint, offset) > 0) {
+        return false;
+      }
+      if (comparator.compare(points.getMaxPackedValue(), offset, queryLowerPoint, offset) < 0) {
+        return false;
+      }
+      if (comparator.compare(points.getMinPackedValue(), offset, queryLowerPoint, offset) < 0
+          || comparator.compare(points.getMaxPackedValue(), offset, queryUpperPoint, offset) > 0) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private BoundedDocIdSetIterator getDocIdSetIteratorOrNullFromBkd(
+      LeafReaderContext context, DocIdSetIterator delegate) throws IOException {
+    Sort indexSort = context.reader().getMetaData().getSort();
+    if (indexSort != null
+        && indexSort.getSort().length > 0
+        && indexSort.getSort()[0].getField().equals(field)
+        && indexSort.getSort()[0].getReverse() == false) {
+      PointValues points = context.reader().getPointValues(field);
+      if (points == null) {
+        return null;
+      }
+
+      if (points.getNumDimensions() != 1) {
+        return null;
+      }
+
+      if (points.getBytesPerDimension() != Long.BYTES
+          && points.getBytesPerDimension() != Integer.BYTES) {
+        return null;
+      }
+
+      // Each doc that has points has exactly one point.
+      if (points.size() == points.getDocCount()) {
+
+        byte[] queryLowerPoint;
+        byte[] queryUpperPoint;
+        if (points.getBytesPerDimension() == Integer.BYTES) {
+          queryLowerPoint = IntPoint.pack((int) lowerValue).bytes;
+          queryUpperPoint = IntPoint.pack((int) upperValue).bytes;
+        } else {
+          queryLowerPoint = LongPoint.pack(lowerValue).bytes;
+          queryUpperPoint = LongPoint.pack(upperValue).bytes;
+        }
+        if (lowerValue > upperValue || matchNone(points, queryLowerPoint, queryUpperPoint)) {
+          return new BoundedDocIdSetIterator(0, 0, null);
+        }
+        int minDocId, maxDocId;
+        if (matchAll(points, queryLowerPoint, queryUpperPoint)) {
+          minDocId = 0;
+          maxDocId = context.reader().maxDoc();
+        } else {
+          // >=queryLowerPoint
+          minDocId = nextDoc(points, queryLowerPoint, true);
+
+          if (minDocId == -1) {
+            return new BoundedDocIdSetIterator(0, 0, null);
+          }
+          // >queryUpperPoint,
+          maxDocId = nextDoc(points, queryUpperPoint, false);
+          if (maxDocId == -1) {
+            maxDocId = context.reader().maxDoc();
+          }
+        }
+
+        if ((points.getDocCount() == context.reader().maxDoc())) {
+          return new BoundedDocIdSetIterator(minDocId, maxDocId, null);
+        } else {
+          return new BoundedDocIdSetIterator(minDocId, maxDocId, delegate);
+        }
+      }
+    }
+    return null;
+  }
+
   private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
       throws IOException {
     SortedNumericDocValues sortedNumericValues =
         DocValues.getSortedNumeric(context.reader(), field);
     NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
     if (numericValues != null) {
+      BoundedDocIdSetIterator iterator = getDocIdSetIteratorOrNullFromBkd(context, numericValues);
+      if (iterator != null) {
+        return iterator;
+      }
       Sort indexSort = context.reader().getMetaData().getSort();
       if (indexSort != null
           && indexSort.getSort().length > 0
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
index 69477837292..74d671aa774 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
@@ -19,6 +19,7 @@ package org.apache.lucene.sandbox.search;
 import static org.hamcrest.CoreMatchers.instanceOf;
 
 import java.io.IOException;
+import java.util.Random;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.LongPoint;
@@ -641,4 +642,115 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
     return new IndexSortSortedNumericDocValuesRangeQuery(
         field, lowerValue, upperValue, fallbackQuery);
   }
+
+  public void testCountWithBkd() throws IOException {
+    String filedName = "field";
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    Sort indexSort = new Sort(new SortedNumericSortField(filedName, SortField.Type.LONG, false));
+    iwc.setIndexSort(indexSort);
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+    addDocWithBkd(writer, filedName, 6, 500);
+    addDocWithBkd(writer, filedName, 5, 500);
+    addDocWithBkd(writer, filedName, 8, 500);
+    addDocWithBkd(writer, filedName, 9, 500);
+    addDocWithBkd(writer, filedName, 7, 500);
+    writer.flush();
+    writer.forceMerge(1);
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    Query fallbackQuery = LongPoint.newRangeQuery(filedName, 6, 8);
+    Query query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 6, 8, fallbackQuery);
+    Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(1500, weight.count(context));
+    }
+
+    fallbackQuery = LongPoint.newRangeQuery(filedName, 6, 10);
+    query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 6, 10, fallbackQuery);
+    weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(2000, weight.count(context));
+    }
+
+    fallbackQuery = LongPoint.newRangeQuery(filedName, 4, 6);
+    query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 4, 6, fallbackQuery);
+    weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(1000, weight.count(context));
+    }
+
+    fallbackQuery = LongPoint.newRangeQuery(filedName, 2, 10);
+    query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 2, 10, fallbackQuery);
+    weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(2500, weight.count(context));
+    }
+
+    fallbackQuery = LongPoint.newRangeQuery(filedName, 2, 3);
+    query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 2, 3, fallbackQuery);
+    weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(0, weight.count(context));
+    }
+
+    fallbackQuery = LongPoint.newRangeQuery(filedName, 10, 11);
+    query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 10, 11, fallbackQuery);
+    weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(0, weight.count(context));
+    }
+
+    writer.close();
+    reader.close();
+    dir.close();
+  }
+
+  public void testRandomCountWithBkd() throws IOException {
+    String filedName = "field";
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    Sort indexSort = new Sort(new SortedNumericSortField(filedName, SortField.Type.LONG, false));
+    iwc.setIndexSort(indexSort);
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+    Random random = random();
+    for (int i = 0; i < 100; i++) {
+      addDocWithBkd(writer, filedName, random.nextInt(1000), random.nextInt(1000));
+    }
+    writer.flush();
+    writer.forceMerge(1);
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    for (int i = 0; i < 100; i++) {
+      int random1 = random.nextInt(1100);
+      int random2 = random.nextInt(1100);
+      int low = Math.min(random1, random2);
+      int upper = Math.max(random1, random2);
+      Query rangeQuery = LongPoint.newRangeQuery(filedName, low, upper);
+      Query indexSortRangeQuery =
+          new IndexSortSortedNumericDocValuesRangeQuery(filedName, low, upper, rangeQuery);
+      Weight indexSortRangeQueryWeight =
+          indexSortRangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+      Weight rangeQueryWeight = rangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+      for (LeafReaderContext context : searcher.getLeafContexts()) {
+        assertEquals(rangeQueryWeight.count(context), indexSortRangeQueryWeight.count(context));
+      }
+    }
+
+    writer.close();
+    reader.close();
+    dir.close();
+  }
+
+  private void addDocWithBkd(RandomIndexWriter indexWriter, String field, long value, int repeat)
+      throws IOException {
+    for (int i = 0; i < repeat; i++) {
+      Document doc = new Document();
+      doc.add(new SortedNumericDocValuesField(field, value));
+      doc.add(new LongPoint(field, value));
+      indexWriter.addDocument(doc);
+    }
+  }
 }


[lucene] 02/02: Fix codec name in index header for Lucene94FieldInfosFormat. (#11818)

Posted by jp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit b69ed36b6d2775e646ab14bc8c4d7405bb805c39
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Mon Sep 26 14:56:30 2022 +0200

    Fix codec name in index header for Lucene94FieldInfosFormat. (#11818)
---
 .../org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java
index d8b556c2720..5a3957aa3c1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java
@@ -373,7 +373,7 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat {
   static final String EXTENSION = "fnm";
 
   // Codec header
-  static final String CODEC_NAME = "Lucene90FieldInfos";
+  static final String CODEC_NAME = "Lucene94FieldInfos";
   static final int FORMAT_START = 0;
   static final int FORMAT_CURRENT = FORMAT_START;