You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2018/09/17 10:08:52 UTC
[07/44] lucene-solr:jira/solr-12709: LUCENE-7862: Store the real
bounds of the leaf cells in the BKD index when the number of dimensions is
bigger than 1
LUCENE-7862: Store the real bounds of the leaf cells in the BKD index when the number of dimensions is bigger than 1
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f406ff91
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f406ff91
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f406ff91
Branch: refs/heads/jira/solr-12709
Commit: f406ff91a8912f13a7652a2802084db1c0da5830
Parents: 3b62f23
Author: iverase <iv...@apache.org>
Authored: Sat Sep 8 15:42:30 2018 +0200
Committer: iverase <iv...@apache.org>
Committed: Sat Sep 8 15:42:30 2018 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 ++
.../org/apache/lucene/util/bkd/BKDReader.java | 58 +++++++++++++++++---
.../org/apache/lucene/util/bkd/BKDWriter.java | 56 ++++++++++++++++---
.../document/FloatPointNearestNeighbor.java | 2 +-
.../apache/lucene/search/NearestNeighbor.java | 2 +-
5 files changed, 103 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f406ff91/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 07163b3..846554e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -298,6 +298,10 @@ Improvements
* LUCENE-8422: IntervalQuery now returns useful Matches (Alan Woodward)
+* LUCENE-7862: Store the real bounds of the leaf cells in the BKD index when the
+ number of dimensions is bigger than 1. It improves performance when there is
+ correlation between the dimensions, for example ranges. (Ignacio Vera, Adrien Grand)
+
Other:
* LUCENE-8485: Update randomizedtesting to version 2.6.4. (Dawid Weiss)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f406ff91/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index f6a9bf4..00a1d7d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -325,7 +325,7 @@ public final class BKDReader extends PointValues implements Accountable {
public static final class IntersectState {
final IndexInput in;
final int[] scratchDocIDs;
- final byte[] scratchPackedValue;
+ final byte[] scratchPackedValue1, scratchPackedValue2;
final int[] commonPrefixLengths;
final IntersectVisitor visitor;
@@ -340,7 +340,8 @@ public final class BKDReader extends PointValues implements Accountable {
this.visitor = visitor;
this.commonPrefixLengths = new int[numDims];
this.scratchDocIDs = new int[maxPointsInLeafNode];
- this.scratchPackedValue = new byte[packedBytesLength];
+ this.scratchPackedValue1 = new byte[packedBytesLength];
+ this.scratchPackedValue2 = new byte[packedBytesLength];
this.index = indexVisitor;
}
}
@@ -402,7 +403,7 @@ public final class BKDReader extends PointValues implements Accountable {
int count = readDocIDs(state.in, index.getLeafBlockFP(), state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
- visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+ visitDocValues(state.commonPrefixLengths, state.scratchPackedValue1, state.scratchPackedValue2, state.in, state.scratchDocIDs, count, state.visitor);
}
private void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
@@ -427,20 +428,59 @@ public final class BKDReader extends PointValues implements Accountable {
return count;
}
- void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
- visitor.grow(count);
+ void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue1, byte[] scratchPackedValue2, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+
+
+ readCommonPrefixes(commonPrefixLengths, scratchPackedValue1, in);
+
+ if (numDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) {
+ byte[] minPackedValue = scratchPackedValue1;
+ byte[] maxPackedValue = scratchPackedValue2;
+ //Copy common prefixes before reading adjusted
+ // box
+ System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedBytesLength);
+ readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in);
+
+ // The index gives us range of values for each dimension, but the actual range of values
+ // might be much more narrow than what the index told us, so we double check the relation
+ // here, which is cheap yet might help figure out that the block either entirely matches
+ // or does not match at all. This is especially more likely in the case that there are
+ // multiple dimensions that have correlation, ie. splitting on one dimension also
+ // significantly changes the range of values in another dimension.
+ Relation r = visitor.compare(minPackedValue, maxPackedValue);
+ if (r == Relation.CELL_OUTSIDE_QUERY) {
+ return;
+ }
+ visitor.grow(count);
+
+ if (r == Relation.CELL_INSIDE_QUERY) {
+ for (int i = 0; i < count; ++i) {
+ visitor.visit(docIDs[i]);
+ }
+ return;
+ }
+ } else {
+ visitor.grow(count);
+ }
- readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
int compressedDim = readCompressedDim(in);
if (compressedDim == -1) {
- visitRawDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor);
+ visitRawDocValues(commonPrefixLengths, scratchPackedValue1, in, docIDs, count, visitor);
} else {
- visitCompressedDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor, compressedDim);
+ visitCompressedDocValues(commonPrefixLengths, scratchPackedValue1, in, docIDs, count, visitor, compressedDim);
}
}
+ private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[] maxPackedValue, IndexInput in) throws IOException {
+ for (int dim = 0; dim < numDims; dim++) {
+ int prefix = commonPrefixLengths[dim];
+ in.readBytes(minPackedValue, dim * bytesPerDim + prefix, bytesPerDim - prefix);
+ in.readBytes(maxPackedValue, dim * bytesPerDim + prefix, bytesPerDim - prefix);
+ }
+ }
+
// Just read suffixes for every dimension
private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
for (int i = 0; i < count; ++i) {
@@ -521,7 +561,7 @@ public final class BKDReader extends PointValues implements Accountable {
int count = readDocIDs(state.in, state.index.getLeafBlockFP(), state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
- visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+ visitDocValues(state.commonPrefixLengths, state.scratchPackedValue1, state.scratchPackedValue2, state.in, state.scratchDocIDs, count, state.visitor);
}
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f406ff91/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index a8b9813..014d470 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -39,6 +39,7 @@ import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.FutureArrays;
@@ -83,7 +84,9 @@ public class BKDWriter implements Closeable {
public static final String CODEC_NAME = "BKD";
public static final int VERSION_START = 4; // version used by Lucene 7.0
- public static final int VERSION_CURRENT = VERSION_START;
+ //public static final int VERSION_CURRENT = VERSION_START;
+ public static final int VERSION_LEAF_STORES_BOUNDS = 5;
+ public static final int VERSION_CURRENT = VERSION_LEAF_STORES_BOUNDS;
/** How many bytes each docs takes in the fixed-width offline format */
private final int bytesPerDoc;
@@ -344,16 +347,16 @@ public class BKDWriter implements Closeable {
docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs);
assert docsInBlock > 0;
docBlockUpto = 0;
- bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() {
+ bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue1, state.scratchPackedValue2, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() {
int i = 0;
@Override
- public void visit(int docID) throws IOException {
+ public void visit(int docID) {
throw new UnsupportedOperationException();
}
@Override
- public void visit(int docID, byte[] packedValue) throws IOException {
+ public void visit(int docID, byte[] packedValue) {
assert docID == state.scratchDocIDs[i];
System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength);
i++;
@@ -361,7 +364,7 @@ public class BKDWriter implements Closeable {
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
- throw new UnsupportedOperationException();
+ return Relation.CELL_CROSSES_QUERY;
}
});
@@ -382,7 +385,7 @@ public class BKDWriter implements Closeable {
if (mappedDocID != -1) {
// Not deleted!
docID = mappedDocID;
- System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue, 0, bkd.packedBytesLength);
+ System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue1, 0, bkd.packedBytesLength);
return true;
}
}
@@ -401,7 +404,7 @@ public class BKDWriter implements Closeable {
public boolean lessThan(MergeReader a, MergeReader b) {
assert a != b;
- int cmp = FutureArrays.compareUnsigned(a.state.scratchPackedValue, 0, bytesPerDim, b.state.scratchPackedValue, 0, bytesPerDim);
+ int cmp = FutureArrays.compareUnsigned(a.state.scratchPackedValue1, 0, bytesPerDim, b.state.scratchPackedValue1, 0, bytesPerDim);
if (cmp < 0) {
return true;
} else if (cmp > 0) {
@@ -543,7 +546,7 @@ public class BKDWriter implements Closeable {
MergeReader reader = queue.top();
// System.out.println("iter reader=" + reader);
- oneDimWriter.add(reader.state.scratchPackedValue, reader.docID);
+ oneDimWriter.add(reader.state.scratchPackedValue1, reader.docID);
if (reader.next()) {
queue.updateTop();
@@ -1272,6 +1275,9 @@ public class BKDWriter implements Closeable {
// all values in this block are equal
out.writeByte((byte) -1);
} else {
+ if (numDims != 1) {
+ writeActualBounds(out, commonPrefixLengths, count, packedValues);
+ }
assert commonPrefixLengths[sortedDim] < bytesPerDim;
out.writeByte((byte) sortedDim);
int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
@@ -1291,6 +1297,40 @@ public class BKDWriter implements Closeable {
}
}
+ private void writeActualBounds(DataOutput out, int[] commonPrefixLengths, int count, IntFunction<BytesRef> packedValues) throws IOException {
+ for (int dim = 0; dim < numDims; ++dim) {
+ int commonPrefixLength = commonPrefixLengths[dim];
+ int suffixLength = bytesPerDim - commonPrefixLength;
+ if (suffixLength > 0) {
+ BytesRef[] minMax = computeMinMax(count, packedValues, dim * bytesPerDim + commonPrefixLength, suffixLength);
+ BytesRef min = minMax[0];
+ BytesRef max = minMax[1];
+ out.writeBytes(min.bytes, min.offset, min.length);
+ out.writeBytes(max.bytes, max.offset, max.length);
+ }
+ }
+ }
+
+ /** Return an array that contains the min and max values for the [offset, offset+length] interval
+ * of the given {@link BytesRef}s. */
+ private static BytesRef[] computeMinMax(int count, IntFunction<BytesRef> packedValues, int offset, int length) {
+ assert length > 0;
+ BytesRefBuilder min = new BytesRefBuilder();
+ BytesRefBuilder max = new BytesRefBuilder();
+ BytesRef first = packedValues.apply(0);
+ min.copyBytes(first.bytes, first.offset + offset, length);
+ max.copyBytes(first.bytes, first.offset + offset, length);
+ for (int i = 1; i < count; ++i) {
+ BytesRef candidate = packedValues.apply(i);
+ if (FutureArrays.compareUnsigned(min.bytes(), 0, length, candidate.bytes, candidate.offset + offset, candidate.offset + offset + length) > 0) {
+ min.copyBytes(candidate.bytes, candidate.offset + offset, length);
+ } else if (FutureArrays.compareUnsigned(max.bytes(), 0, length, candidate.bytes, candidate.offset + offset, candidate.offset + offset + length) < 0) {
+ max.copyBytes(candidate.bytes, candidate.offset + offset, length);
+ }
+ }
+ return new BytesRef[]{min.get(), max.get()};
+ }
+
private void writeLeafBlockPackedValuesRange(DataOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
for (int i = start; i < end; ++i) {
BytesRef ref = packedValues.apply(i);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f406ff91/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java
index 8458746..38a4103 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/document/FloatPointNearestNeighbor.java
@@ -190,7 +190,7 @@ public class FloatPointNearestNeighbor {
@Override
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
- throw new AssertionError();
+ return PointValues.Relation.CELL_CROSSES_QUERY;
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f406ff91/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
index 8502c45..449d37d 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
@@ -178,7 +178,7 @@ class NearestNeighbor {
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
- throw new AssertionError();
+ return Relation.CELL_CROSSES_QUERY;
}
}