You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by iv...@apache.org on 2018/09/08 13:59:33 UTC

lucene-solr:branch_7x: LUCENE-7862: Store the real bounds of the leaf cells in the BKD index when the number of dimensions is bigger than 1

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x 68beae49c -> 46a3f1e6f


LUCENE-7862: Store the real bounds of the leaf cells in the BKD index when the number of dimensions is bigger than 1


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/46a3f1e6
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/46a3f1e6
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/46a3f1e6

Branch: refs/heads/branch_7x
Commit: 46a3f1e6f551fd3c4b506f44e8632a310656f828
Parents: 68beae4
Author: iverase <iv...@apache.org>
Authored: Sat Sep 8 15:58:56 2018 +0200
Committer: iverase <iv...@apache.org>
Committed: Sat Sep 8 15:58:56 2018 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  4 ++
 .../org/apache/lucene/util/bkd/BKDReader.java   | 57 ++++++++++++++++----
 .../org/apache/lucene/util/bkd/BKDWriter.java   | 55 ++++++++++++++++---
 .../document/FloatPointNearestNeighbor.java     |  2 +-
 .../apache/lucene/search/NearestNeighbor.java   |  2 +-
 5 files changed, 101 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46a3f1e6/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7823d2b..9387b18 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -299,6 +299,10 @@ New Features
   suggesters, and as a component of a named entity recognition system.  This was excised
   out of CompletionTokenStream in the NRT doc suggester.  (David Smiley, Jim Ferenczi)
 
+* LUCENE-7862: Store the real bounds of the leaf cells in the BKD index when the
+   number of dimensions is bigger than 1. It improves performance when there is
+   correlation between the dimensions, for example ranges. (Ignacio Vera, Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-8221: MoreLikeThis.setMaxDocFreqPct can easily int-overflow on larger

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46a3f1e6/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 8a27e65..194ac82 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -497,7 +497,7 @@ public final class BKDReader extends PointValues implements Accountable {
   public static final class IntersectState {
     final IndexInput in;
     final int[] scratchDocIDs;
-    final byte[] scratchPackedValue;
+    final byte[] scratchPackedValue1, scratchPackedValue2;
     final int[] commonPrefixLengths;
 
     final IntersectVisitor visitor;
@@ -512,7 +512,8 @@ public final class BKDReader extends PointValues implements Accountable {
       this.visitor = visitor;
       this.commonPrefixLengths = new int[numDims];
       this.scratchDocIDs = new int[maxPointsInLeafNode];
-      this.scratchPackedValue = new byte[packedBytesLength];
+      this.scratchPackedValue1 = new byte[packedBytesLength];
+      this.scratchPackedValue2 = new byte[packedBytesLength];
       this.index = indexVisitor;
     }
   }
@@ -579,7 +580,7 @@ public final class BKDReader extends PointValues implements Accountable {
     int count = readDocIDs(state.in, index.getLeafBlockFP(), state.scratchDocIDs);
 
     // Again, this time reading values and checking with the visitor
-    visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+    visitDocValues(state.commonPrefixLengths, state.scratchPackedValue1, state.scratchPackedValue2, state.in, state.scratchDocIDs, count, state.visitor);
   }
 
   private void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
@@ -612,22 +613,60 @@ public final class BKDReader extends PointValues implements Accountable {
     return count;
   }
 
-  void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
-    visitor.grow(count);
+  void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue1, byte[] scratchPackedValue2, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+
+
+    readCommonPrefixes(commonPrefixLengths, scratchPackedValue1, in);
+
+    if (numDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) {
+      byte[] minPackedValue = scratchPackedValue1;
+      byte[] maxPackedValue = scratchPackedValue2;
+      //Copy common prefixes before reading adjusted box
+      System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedBytesLength);
+      readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in);
+
+      // The index gives us range of values for each dimension, but the actual range of values
+      // might be much more narrow than what the index told us, so we double check the relation
+      // here, which is cheap yet might help figure out that the block either entirely matches
+      // or does not match at all. This is especially more likely in the case that there are
+      // multiple dimensions that have correlation, ie. splitting on one dimension also
+      // significantly changes the range of values in another dimension.
+      Relation r = visitor.compare(minPackedValue, maxPackedValue);
+      if (r == Relation.CELL_OUTSIDE_QUERY) {
+        return;
+      }
+      visitor.grow(count);
+
+      if (r == Relation.CELL_INSIDE_QUERY) {
+        for (int i = 0; i < count; ++i) {
+          visitor.visit(docIDs[i]);
+        }
+        return;
+      }
+    } else {
+      visitor.grow(count);
+    }
 
-    readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
 
     int compressedDim = version < BKDWriter.VERSION_COMPRESSED_VALUES
         ? -1
         : readCompressedDim(in);
 
     if (compressedDim == -1) {
-      visitRawDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor);
+      visitRawDocValues(commonPrefixLengths, scratchPackedValue1, in, docIDs, count, visitor);
     } else {
-      visitCompressedDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor, compressedDim);
+      visitCompressedDocValues(commonPrefixLengths, scratchPackedValue1, in, docIDs, count, visitor, compressedDim);
     }
   }
 
+    private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[] maxPackedValue, IndexInput in) throws IOException {
+      for (int dim = 0; dim < numDims; dim++) {
+        int prefix = commonPrefixLengths[dim];
+        in.readBytes(minPackedValue, dim * bytesPerDim + prefix, bytesPerDim - prefix);
+        in.readBytes(maxPackedValue, dim * bytesPerDim + prefix, bytesPerDim - prefix);
+      }
+    }
+
   // Just read suffixes for every dimension
   private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
     for (int i = 0; i < count; ++i) {
@@ -708,7 +747,7 @@ public final class BKDReader extends PointValues implements Accountable {
         int count = readDocIDs(state.in, state.index.getLeafBlockFP(), state.scratchDocIDs);
 
         // Again, this time reading values and checking with the visitor
-        visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+        visitDocValues(state.commonPrefixLengths, state.scratchPackedValue1, state.scratchPackedValue2, state.in, state.scratchDocIDs, count, state.visitor);
       }
 
     } else {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46a3f1e6/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index edb35c2..e47cbca 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -39,6 +39,7 @@ import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.store.TrackingDirectoryWrapper;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefComparator;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.FutureArrays;
@@ -87,7 +88,8 @@ public class BKDWriter implements Closeable {
   public static final int VERSION_COMPRESSED_VALUES = 2;
   public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
   public static final int VERSION_PACKED_INDEX = 4;
-  public static final int VERSION_CURRENT = VERSION_PACKED_INDEX;
+  public static final int VERSION_LEAF_STORES_BOUNDS = 5;
+  public static final int VERSION_CURRENT = VERSION_LEAF_STORES_BOUNDS;
 
   /** How many bytes each docs takes in the fixed-width offline format */
   private final int bytesPerDoc;
@@ -348,16 +350,16 @@ public class BKDWriter implements Closeable {
           docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs);
           assert docsInBlock > 0;
           docBlockUpto = 0;
-          bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() {
+          bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue1, state.scratchPackedValue2, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() {
             int i = 0;
 
             @Override
-            public void visit(int docID) throws IOException {
+            public void visit(int docID) {
               throw new UnsupportedOperationException();
             }
 
             @Override
-            public void visit(int docID, byte[] packedValue) throws IOException {
+            public void visit(int docID, byte[] packedValue) {
               assert docID == state.scratchDocIDs[i];
               System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength);
               i++;
@@ -365,7 +367,7 @@ public class BKDWriter implements Closeable {
 
             @Override
             public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
-              throw new UnsupportedOperationException();
+              return Relation.CELL_CROSSES_QUERY;
             }
 
           });
@@ -386,7 +388,7 @@ public class BKDWriter implements Closeable {
         if (mappedDocID != -1) {
           // Not deleted!
           docID = mappedDocID;
-          System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue, 0, bkd.packedBytesLength);
+          System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue1, 0, bkd.packedBytesLength);
           return true;
         }
       }
@@ -405,7 +407,7 @@ public class BKDWriter implements Closeable {
     public boolean lessThan(MergeReader a, MergeReader b) {
       assert a != b;
 
-      int cmp = FutureArrays.compareUnsigned(a.state.scratchPackedValue, 0, bytesPerDim, b.state.scratchPackedValue, 0, bytesPerDim);
+      int cmp = FutureArrays.compareUnsigned(a.state.scratchPackedValue1, 0, bytesPerDim, b.state.scratchPackedValue1, 0, bytesPerDim);
       if (cmp < 0) {
         return true;
       } else if (cmp > 0) {
@@ -547,7 +549,7 @@ public class BKDWriter implements Closeable {
       MergeReader reader = queue.top();
       // System.out.println("iter reader=" + reader);
 
-      oneDimWriter.add(reader.state.scratchPackedValue, reader.docID);
+      oneDimWriter.add(reader.state.scratchPackedValue1, reader.docID);
 
       if (reader.next()) {
         queue.updateTop();
@@ -1276,6 +1278,9 @@ public class BKDWriter implements Closeable {
       // all values in this block are equal
       out.writeByte((byte) -1);
     } else {
+      if (numDims != 1) {
+        writeActualBounds(out, commonPrefixLengths, count, packedValues);
+      }
       assert commonPrefixLengths[sortedDim] < bytesPerDim;
       out.writeByte((byte) sortedDim);
       int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
@@ -1295,6 +1300,40 @@ public class BKDWriter implements Closeable {
     }
   }
 
+  private void writeActualBounds(DataOutput out, int[] commonPrefixLengths, int count, IntFunction<BytesRef> packedValues) throws IOException {
+    for (int dim = 0; dim < numDims; ++dim) {
+      int commonPrefixLength = commonPrefixLengths[dim];
+      int suffixLength = bytesPerDim - commonPrefixLength;
+      if (suffixLength > 0) {
+        BytesRef[] minMax = computeMinMax(count, packedValues, dim * bytesPerDim + commonPrefixLength, suffixLength);
+        BytesRef min = minMax[0];
+        BytesRef max = minMax[1];
+        out.writeBytes(min.bytes, min.offset, min.length);
+        out.writeBytes(max.bytes, max.offset, max.length);
+      }
+    }
+  }
+
+  /** Return an array that contains the min and max values for the [offset, offset+length] interval
+   *  of the given {@link BytesRef}s. */
+  private static BytesRef[] computeMinMax(int count, IntFunction<BytesRef> packedValues, int offset, int length) {
+    assert length > 0;
+    BytesRefBuilder min = new BytesRefBuilder();
+    BytesRefBuilder max = new BytesRefBuilder();
+    BytesRef first = packedValues.apply(0);
+    min.copyBytes(first.bytes, first.offset + offset, length);
+    max.copyBytes(first.bytes, first.offset + offset, length);
+    for (int i = 1; i < count; ++i) {
+      BytesRef candidate = packedValues.apply(i);
+      if (FutureArrays.compareUnsigned(min.bytes(), 0, length, candidate.bytes, candidate.offset + offset, candidate.offset + offset + length) > 0) {
+        min.copyBytes(candidate.bytes, candidate.offset + offset, length);
+      } else if (FutureArrays.compareUnsigned(max.bytes(), 0, length, candidate.bytes, candidate.offset + offset, candidate.offset + offset + length) < 0) {
+        max.copyBytes(candidate.bytes, candidate.offset + offset, length);
+      }
+    }
+    return new BytesRef[]{min.get(), max.get()};
+  }
+
   private void writeLeafBlockPackedValuesRange(DataOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
     for (int i = start; i < end; ++i) {
       BytesRef ref = packedValues.apply(i);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46a3f1e6/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java
index 90a80b6..febb8e3 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java
@@ -189,7 +189,7 @@ public class FloatPointNearestNeighbor {
 
     @Override
     public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
-      throw new AssertionError();
+      return PointValues.Relation.CELL_CROSSES_QUERY;
     }
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/46a3f1e6/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
index 8502c45..449d37d 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
@@ -178,7 +178,7 @@ class NearestNeighbor {
 
     @Override
     public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
-      throw new AssertionError();
+      return Relation.CELL_CROSSES_QUERY;
     }
   }