You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/12/07 13:47:39 UTC
[1/5] lucene-solr:branch_6x: LUCENE-7563: use a compressed format for
the in-heap BKD index
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 8c943b608 -> 0c8e8e396
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index da73286..2567eef 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -33,6 +33,7 @@ import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -83,7 +84,8 @@ public class BKDWriter implements Closeable {
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
public static final int VERSION_COMPRESSED_VALUES = 2;
public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
- public static final int VERSION_CURRENT = VERSION_IMPLICIT_SPLIT_DIM_1D;
+ public static final int VERSION_PACKED_INDEX = 4;
+ public static final int VERSION_CURRENT = VERSION_PACKED_INDEX;
/** How many bytes each docs takes in the fixed-width offline format */
private final int bytesPerDoc;
@@ -325,15 +327,10 @@ public class BKDWriter implements Closeable {
bkd.numDims,
bkd.packedBytesLength,
bkd.maxPointsInLeafNode,
+ null,
null);
this.docMap = docMap;
- long minFP = Long.MAX_VALUE;
- //System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length);
- for(long fp : bkd.leafBlockFPs) {
- minFP = Math.min(minFP, fp);
- //System.out.println(" leaf fp=" + fp);
- }
- state.in.seek(minFP);
+ state.in.seek(bkd.getMinLeafBlockFP());
this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength];
}
@@ -341,7 +338,7 @@ public class BKDWriter implements Closeable {
//System.out.println("MR.next this=" + this);
while (true) {
if (docBlockUpto == docsInBlock) {
- if (blockID == bkd.leafBlockFPs.length) {
+ if (blockID == bkd.leafNodeOffset) {
//System.out.println(" done!");
return false;
}
@@ -489,7 +486,6 @@ public class BKDWriter implements Closeable {
return indexFP;
}
-
/* In the 1D case, we can simply sort points in ascending order and use the
* same writing logic as we use at merge time. */
private long writeField1Dim(IndexOutput out, String fieldName, MutablePointsReader reader) throws IOException {
@@ -648,6 +644,7 @@ public class BKDWriter implements Closeable {
}
private void writeLeafBlock() throws IOException {
+ //System.out.println("writeLeafBlock pos=" + out.getFilePointer());
assert leafCount != 0;
if (valueCount == 0) {
System.arraycopy(leafValues, 0, minPackedValue, 0, packedBytesLength);
@@ -811,6 +808,24 @@ public class BKDWriter implements Closeable {
}.sort(0, pointCount);
}
+ // useful for debugging:
+ /*
+ private void printPathSlice(String desc, PathSlice slice, int dim) throws IOException {
+ System.out.println(" " + desc + " dim=" + dim + " count=" + slice.count + ":");
+ try(PointReader r = slice.writer.getReader(slice.start, slice.count)) {
+ int count = 0;
+ while (r.next()) {
+ byte[] v = r.packedValue();
+ System.out.println(" " + count + ": " + new BytesRef(v, dim*bytesPerDim, bytesPerDim));
+ count++;
+ if (count == slice.count) {
+ break;
+ }
+ }
+ }
+ }
+ */
+
private PointWriter sort(int dim) throws IOException {
assert dim >= 0 && dim < numDims;
@@ -1019,46 +1034,238 @@ public class BKDWriter implements Closeable {
return indexFP;
}
- /** Subclass can change how it writes the index. */
- protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
+ /** Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure. */
+ private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
+
+ int numLeaves = leafBlockFPs.length;
+
+ // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
+ // if it was created by OneDimensionBKDWriter). In this case the leaf nodes may straddle the two bottom
+ // levels of the binary tree:
+ if (numDims == 1 && numLeaves > 1) {
+ int levelCount = 2;
+ while (true) {
+ if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
+ int lastLevel = 2*(numLeaves - levelCount);
+ assert lastLevel >= 0;
+ if (lastLevel != 0) {
+ // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
+ // at read-time, so that we can still delta code them on disk at write:
+ long[] newLeafBlockFPs = new long[numLeaves];
+ System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
+ System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
+ leafBlockFPs = newLeafBlockFPs;
+ }
+ break;
+ }
+
+ levelCount *= 2;
+ }
+ }
+
+ /** Reused while packing the index */
+ RAMOutputStream writeBuffer = new RAMOutputStream();
+
+ // This is the "file" we append the byte[] to:
+ List<byte[]> blocks = new ArrayList<>();
+ byte[] lastSplitValues = new byte[bytesPerDim * numDims];
+ //System.out.println("\npack index");
+ int totalSize = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, 0l, blocks, 1, lastSplitValues, new boolean[numDims], false);
+
+ // Compact the byte[] blocks into single byte index:
+ byte[] index = new byte[totalSize];
+ int upto = 0;
+ for(byte[] block : blocks) {
+ System.arraycopy(block, 0, index, upto, block.length);
+ upto += block.length;
+ }
+ assert upto == totalSize;
+
+ return index;
+ }
+
+ /** Appends the current contents of writeBuffer as another block on the growing in-memory file */
+ private int appendBlock(RAMOutputStream writeBuffer, List<byte[]> blocks) throws IOException {
+ int pos = Math.toIntExact(writeBuffer.getFilePointer());
+ byte[] bytes = new byte[pos];
+ writeBuffer.writeTo(bytes, 0);
+ writeBuffer.reset();
+ blocks.add(bytes);
+ return pos;
+ }
+
+ /**
+ * lastSplitValues is per-dimension split value previously seen; we use this to prefix-code the split byte[] on each inner node
+ */
+ private int recursePackIndex(RAMOutputStream writeBuffer, long[] leafBlockFPs, byte[] splitPackedValues, long minBlockFP, List<byte[]> blocks,
+ int nodeID, byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft) throws IOException {
+ if (nodeID >= leafBlockFPs.length) {
+ int leafID = nodeID - leafBlockFPs.length;
+ //System.out.println("recursePack leaf nodeID=" + nodeID);
+
+ // In the unbalanced case it's possible the left most node only has one child:
+ if (leafID < leafBlockFPs.length) {
+ long delta = leafBlockFPs[leafID] - minBlockFP;
+ if (isLeft) {
+ assert delta == 0;
+ return 0;
+ } else {
+ assert nodeID == 1 || delta > 0: "nodeID=" + nodeID;
+ writeBuffer.writeVLong(delta);
+ return appendBlock(writeBuffer, blocks);
+ }
+ } else {
+ return 0;
+ }
+ } else {
+ long leftBlockFP;
+ if (isLeft == false) {
+ leftBlockFP = getLeftMostLeafBlockFP(leafBlockFPs, nodeID);
+ long delta = leftBlockFP - minBlockFP;
+ assert nodeID == 1 || delta > 0;
+ writeBuffer.writeVLong(delta);
+ } else {
+ // The left tree's left most leaf block FP is always the minimal FP:
+ leftBlockFP = minBlockFP;
+ }
+
+ int address = nodeID * (1+bytesPerDim);
+ int splitDim = splitPackedValues[address++] & 0xff;
+
+ //System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
+
+ // find common prefix with last split value in this dim:
+ int prefix = 0;
+ for(;prefix<bytesPerDim;prefix++) {
+ if (splitPackedValues[address+prefix] != lastSplitValues[splitDim * bytesPerDim + prefix]) {
+ break;
+ }
+ }
+
+ //System.out.println("writeNodeData nodeID=" + nodeID + " splitDim=" + splitDim + " numDims=" + numDims + " bytesPerDim=" + bytesPerDim + " prefix=" + prefix);
+
+ int firstDiffByteDelta;
+ if (prefix < bytesPerDim) {
+ //System.out.println(" delta byte cur=" + Integer.toHexString(splitPackedValues[address+prefix]&0xFF) + " prev=" + Integer.toHexString(lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF) + " negated?=" + negativeDeltas[splitDim]);
+ firstDiffByteDelta = (splitPackedValues[address+prefix]&0xFF) - (lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF);
+ if (negativeDeltas[splitDim]) {
+ firstDiffByteDelta = -firstDiffByteDelta;
+ }
+ //System.out.println(" delta=" + firstDiffByteDelta);
+ assert firstDiffByteDelta > 0;
+ } else {
+ firstDiffByteDelta = 0;
+ }
+
+ // pack the prefix, splitDim and delta first diff byte into a single vInt:
+ int code = (firstDiffByteDelta * (1+bytesPerDim) + prefix) * numDims + splitDim;
+
+ //System.out.println(" code=" + code);
+ //System.out.println(" splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
+
+ writeBuffer.writeVInt(code);
+
+ // write the split value, prefix coded vs. our parent's split value:
+ int suffix = bytesPerDim - prefix;
+ byte[] savSplitValue = new byte[suffix];
+ if (suffix > 1) {
+ writeBuffer.writeBytes(splitPackedValues, address+prefix+1, suffix-1);
+ }
+
+ byte[] cmp = lastSplitValues.clone();
+
+ System.arraycopy(lastSplitValues, splitDim * bytesPerDim + prefix, savSplitValue, 0, suffix);
+
+ // copy our split value into lastSplitValues for our children to prefix-code against
+ System.arraycopy(splitPackedValues, address+prefix, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
+
+ int numBytes = appendBlock(writeBuffer, blocks);
+
+ // placeholder for left-tree numBytes; we need this so that at search time if we only need to recurse into the right sub-tree we can
+ // quickly seek to its starting point
+ int idxSav = blocks.size();
+ blocks.add(null);
+
+ boolean savNegativeDelta = negativeDeltas[splitDim];
+ negativeDeltas[splitDim] = true;
+
+ int leftNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2*nodeID, lastSplitValues, negativeDeltas, true);
+
+ if (nodeID * 2 < leafBlockFPs.length) {
+ writeBuffer.writeVInt(leftNumBytes);
+ } else {
+ assert leftNumBytes == 0: "leftNumBytes=" + leftNumBytes;
+ }
+ int numBytes2 = Math.toIntExact(writeBuffer.getFilePointer());
+ byte[] bytes2 = new byte[numBytes2];
+ writeBuffer.writeTo(bytes2, 0);
+ writeBuffer.reset();
+ // replace our placeholder:
+ blocks.set(idxSav, bytes2);
+
+ negativeDeltas[splitDim] = false;
+ int rightNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2*nodeID+1, lastSplitValues, negativeDeltas, false);
+
+ negativeDeltas[splitDim] = savNegativeDelta;
+
+ // restore lastSplitValues to what caller originally passed us:
+ System.arraycopy(savSplitValue, 0, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
+
+ assert Arrays.equals(lastSplitValues, cmp);
+
+ return numBytes + numBytes2 + leftNumBytes + rightNumBytes;
+ }
+ }
+
+ private long getLeftMostLeafBlockFP(long[] leafBlockFPs, int nodeID) {
+ int nodeIDIn = nodeID;
+ // TODO: can we do this cheaper, e.g. a closed form solution instead of while loop? Or
+ // change the recursion while packing the index to return this left-most leaf block FP
+ // from each recursion instead?
+ //
+ // Still, the overall cost here is minor: this method's cost is O(log(N)), and while writing
+ // we call it O(N) times (N = number of leaf blocks)
+ while (nodeID < leafBlockFPs.length) {
+ nodeID *= 2;
+ }
+ int leafID = nodeID - leafBlockFPs.length;
+ long result = leafBlockFPs[leafID];
+ if (result < 0) {
+ throw new AssertionError(result + " for leaf " + leafID);
+ }
+ return result;
+ }
+
+ private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
+ byte[] packedIndex = packIndex(leafBlockFPs, splitPackedValues);
+ writeIndex(out, leafBlockFPs.length, packedIndex);
+ }
+
+ private void writeIndex(IndexOutput out, int numLeaves, byte[] packedIndex) throws IOException {
+
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
out.writeVInt(numDims);
out.writeVInt(maxPointsInLeafNode);
out.writeVInt(bytesPerDim);
- assert leafBlockFPs.length > 0;
- out.writeVInt(leafBlockFPs.length);
+ assert numLeaves > 0;
+ out.writeVInt(numLeaves);
out.writeBytes(minPackedValue, 0, packedBytesLength);
out.writeBytes(maxPackedValue, 0, packedBytesLength);
out.writeVLong(pointCount);
out.writeVInt(docsSeen.cardinality());
-
- // NOTE: splitPackedValues[0] is unused, because nodeID is 1-based:
- if (numDims == 1) {
- // write the index, skipping the byte used to store the split dim since it is always 0
- for (int i = 1; i < splitPackedValues.length; i += 1 + bytesPerDim) {
- out.writeBytes(splitPackedValues, i, bytesPerDim);
- }
- } else {
- out.writeBytes(splitPackedValues, 0, splitPackedValues.length);
- }
-
- long lastFP = 0;
- for (int i=0;i<leafBlockFPs.length;i++) {
- long delta = leafBlockFPs[i]-lastFP;
- out.writeVLong(delta);
- lastFP = leafBlockFPs[i];
- }
+ out.writeVInt(packedIndex.length);
+ out.writeBytes(packedIndex, 0, packedIndex.length);
}
- protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
+ private void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode;
out.writeVInt(count);
DocIdsWriter.writeDocIds(docIDs, start, count, out);
}
- protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
+ private void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
if (prefixLenSum == packedBytesLength) {
// all values in this block are equal
@@ -1109,7 +1316,7 @@ public class BKDWriter implements Closeable {
return end - start;
}
- protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
+ private void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
for(int dim=0;dim<numDims;dim++) {
out.writeVInt(commonPrefixes[dim]);
//System.out.println(commonPrefixes[dim] + " of " + bytesPerDim);
@@ -1177,7 +1384,7 @@ public class BKDWriter implements Closeable {
// TODO: find a way to also checksum this reader? If we changed to markLeftTree, and scanned the final chunk, it could work?
try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) {
boolean result = reader.next();
- assert result;
+ assert result: "rightCount=" + rightCount + " source.count=" + source.count + " source.writer=" + source.writer;
System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
if (numDims > 1) {
assert ordBitSet.get(reader.ord()) == false;
@@ -1244,12 +1451,12 @@ public class BKDWriter implements Closeable {
/* Recursively reorders the provided reader and writes the bkd-tree on the fly. */
private void build(int nodeID, int leafNodeOffset,
- MutablePointsReader reader, int from, int to,
- IndexOutput out,
- byte[] minPackedValue, byte[] maxPackedValue,
- byte[] splitPackedValues,
- long[] leafBlockFPs,
- int[] spareDocIds) throws IOException {
+ MutablePointValues reader, int from, int to,
+ IndexOutput out,
+ byte[] minPackedValue, byte[] maxPackedValue,
+ byte[] splitPackedValues,
+ long[] leafBlockFPs,
+ int[] spareDocIds) throws IOException {
if (nodeID >= leafNodeOffset) {
// leaf node
@@ -1311,6 +1518,7 @@ public class BKDWriter implements Closeable {
for (int i = from; i < to; ++i) {
docIDs[i - from] = reader.getDocID(i);
}
+ //System.out.println("writeLeafBlock pos=" + out.getFilePointer());
writeLeafBlockDocs(out, docIDs, 0, count);
// Write the common prefixes:
@@ -1344,6 +1552,7 @@ public class BKDWriter implements Closeable {
break;
}
}
+
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
@@ -1381,7 +1590,7 @@ public class BKDWriter implements Closeable {
for(PathSlice slice : slices) {
assert slice.count == slices[0].count;
}
-
+
if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
// Special case for 1D, to cutover to heap once we recurse deeply enough:
slices[0] = switchToHeap(slices[0], toCloseHeroically);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java
index 0cd4bd2..99182cb 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointReader.java
@@ -18,7 +18,10 @@ package org.apache.lucene.util.bkd;
import java.util.List;
-final class HeapPointReader extends PointReader {
+/** Utility class to read buffered points from in-heap arrays.
+ *
+ * @lucene.internal */
+public final class HeapPointReader extends PointReader {
private int curRead;
final List<byte[]> blocks;
final int valuesPerBlock;
@@ -30,7 +33,7 @@ final class HeapPointReader extends PointReader {
final byte[] scratch;
final boolean singleValuePerDoc;
- HeapPointReader(List<byte[]> blocks, int valuesPerBlock, int packedBytesLength, int[] ords, long[] ordsLong, int[] docIDs, int start, int end, boolean singleValuePerDoc) {
+ public HeapPointReader(List<byte[]> blocks, int valuesPerBlock, int packedBytesLength, int[] ords, long[] ordsLong, int[] docIDs, int start, int end, boolean singleValuePerDoc) {
this.blocks = blocks;
this.valuesPerBlock = valuesPerBlock;
this.singleValuePerDoc = singleValuePerDoc;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java
index 24d248b..e102651 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java
@@ -24,18 +24,21 @@ import java.util.List;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
-final class HeapPointWriter implements PointWriter {
- int[] docIDs;
- long[] ordsLong;
- int[] ords;
+/** Utility class to write new points into in-heap arrays.
+ *
+ * @lucene.internal */
+public final class HeapPointWriter implements PointWriter {
+ public int[] docIDs;
+ public long[] ordsLong;
+ public int[] ords;
private int nextWrite;
private boolean closed;
final int maxSize;
- final int valuesPerBlock;
+ public final int valuesPerBlock;
final int packedBytesLength;
final boolean singleValuePerDoc;
// NOTE: can't use ByteBlockPool because we need random-write access when sorting in heap
- final List<byte[]> blocks = new ArrayList<>();
+ public final List<byte[]> blocks = new ArrayList<>();
public HeapPointWriter(int initSize, int maxSize, int packedBytesLength, boolean longOrds, boolean singleValuePerDoc) {
docIDs = new int[initSize];
@@ -77,7 +80,7 @@ final class HeapPointWriter implements PointWriter {
nextWrite = other.nextWrite;
}
- void readPackedValue(int index, byte[] bytes) {
+ public void readPackedValue(int index, byte[] bytes) {
assert bytes.length == packedBytesLength;
int block = index / valuesPerBlock;
int blockIndex = index % valuesPerBlock;
@@ -85,7 +88,7 @@ final class HeapPointWriter implements PointWriter {
}
/** Returns a reference, in <code>result</code>, to the byte[] slice holding this value */
- void getPackedValueSlice(int index, BytesRef result) {
+ public void getPackedValueSlice(int index, BytesRef result) {
int block = index / valuesPerBlock;
int blockIndex = index % valuesPerBlock;
result.bytes = blocks.get(block);
@@ -138,7 +141,8 @@ final class HeapPointWriter implements PointWriter {
@Override
public PointReader getReader(long start, long length) {
assert start + length <= docIDs.length: "start=" + start + " length=" + length + " docIDs.length=" + docIDs.length;
- return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, nextWrite, singleValuePerDoc);
+ assert start + length <= nextWrite: "start=" + start + " length=" + length + " nextWrite=" + nextWrite;
+ return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, Math.toIntExact(start+length), singleValuePerDoc);
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
index b439208..9e84c8d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
@@ -26,13 +26,16 @@ import org.apache.lucene.util.Selector;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.PackedInts;
-final class MutablePointsReaderUtils {
+/** Utility APIs for sorting and partitioning buffered points.
+ *
+ * @lucene.internal */
+public final class MutablePointsReaderUtils {
MutablePointsReaderUtils() {}
- /** Sort the given {@link MutablePointsReader} based on its packed value then doc ID. */
- static void sort(int maxDoc, int packedBytesLength,
- MutablePointsReader reader, int from, int to) {
+ /** Sort the given {@link MutablePointValues} based on its packed value then doc ID. */
+ public static void sort(int maxDoc, int packedBytesLength,
+ MutablePointValues reader, int from, int to) {
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
new MSBRadixSorter(packedBytesLength + (bitsPerDocId + 7) / 8) {
@@ -88,9 +91,9 @@ final class MutablePointsReaderUtils {
}
/** Sort points on the given dimension. */
- static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
- MutablePointsReader reader, int from, int to,
- BytesRef scratch1, BytesRef scratch2) {
+ public static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
+ MutablePointValues reader, int from, int to,
+ BytesRef scratch1, BytesRef scratch2) {
// No need for a fancy radix sort here, this is called on the leaves only so
// there are not many values to sort
@@ -127,9 +130,9 @@ final class MutablePointsReaderUtils {
/** Partition points around {@code mid}. All values on the left must be less
* than or equal to it and all values on the right must be greater than or
* equal to it. */
- static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
- MutablePointsReader reader, int from, int to, int mid,
- BytesRef scratch1, BytesRef scratch2) {
+ public static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
+ MutablePointValues reader, int from, int to, int mid,
+ BytesRef scratch1, BytesRef scratch2) {
final int offset = splitDim * bytesPerDim + commonPrefixLen;
final int cmpBytes = bytesPerDim - commonPrefixLen;
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
index 17758c0..2861d59 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointReader.java
@@ -27,8 +27,10 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LongBitSet;
-/** Reads points from disk in a fixed-with format, previously written with {@link OfflinePointWriter}. */
-final class OfflinePointReader extends PointReader {
+/** Reads points from disk in a fixed-with format, previously written with {@link OfflinePointWriter}.
+ *
+ * @lucene.internal */
+public final class OfflinePointReader extends PointReader {
long countLeft;
final IndexInput in;
private final byte[] packedValue;
@@ -43,7 +45,7 @@ final class OfflinePointReader extends PointReader {
// File name we are reading
final String name;
- OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length,
+ public OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length,
boolean longOrds, boolean singleValuePerDoc) throws IOException {
this.singleValuePerDoc = singleValuePerDoc;
int bytesPerDoc = packedBytesLength + Integer.BYTES;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java
index 87637ae..7e615a6 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/OfflinePointWriter.java
@@ -26,12 +26,14 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
-/** Writes points to disk in a fixed-with format. */
-final class OfflinePointWriter implements PointWriter {
+/** Writes points to disk in a fixed-with format.
+ *
+ * @lucene.internal */
+public final class OfflinePointWriter implements PointWriter {
final Directory tempDir;
- final IndexOutput out;
- final String name;
+ public final IndexOutput out;
+ public final String name;
final int packedBytesLength;
final boolean singleValuePerDoc;
long count;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java
index 90de0d1..0c31275 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/PointReader.java
@@ -24,20 +24,22 @@ import org.apache.lucene.util.LongBitSet;
/** One pass iterator through all points previously written with a
* {@link PointWriter}, abstracting away whether points a read
- * from (offline) disk or simple arrays in heap. */
-abstract class PointReader implements Closeable {
+ * from (offline) disk or simple arrays in heap.
+ *
+ * @lucene.internal */
+public abstract class PointReader implements Closeable {
/** Returns false once iteration is done, else true. */
- abstract boolean next() throws IOException;
+ public abstract boolean next() throws IOException;
/** Returns the packed byte[] value */
- abstract byte[] packedValue();
+ public abstract byte[] packedValue();
/** Point ordinal */
- abstract long ord();
+ public abstract long ord();
/** DocID for this point */
- abstract int docID();
+ public abstract int docID();
/** Iterates through the next {@code count} ords, marking them in the provided {@code ordBitSet}. */
public void markOrds(long count, LongBitSet ordBitSet) throws IOException {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/PointWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/PointWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/PointWriter.java
index d19f6e5..0222d0e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/PointWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/PointWriter.java
@@ -23,8 +23,10 @@ import java.util.List;
/** Appends many points, and then at the end provides a {@link PointReader} to iterate
* those points. This abstracts away whether we write to disk, or use simple arrays
- * in heap. */
-interface PointWriter extends Closeable {
+ * in heap.
+ *
+ * @lucene.internal */
+public interface PointWriter extends Closeable {
/** Add a new point */
void append(byte[] packedValue, long ord, int docID) throws IOException;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
index cf8372d..c923bf0 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
@@ -619,6 +619,9 @@ public class TestPointQueries extends LuceneTestCase {
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
int sameValuePct = random().nextInt(100);
+ if (VERBOSE) {
+ System.out.println("TEST: sameValuePct=" + sameValuePct);
+ }
byte[][][] docValues = new byte[numValues][][];
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
index af2e463..e30168c 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
@@ -16,6 +16,7 @@
*/
package org.apache.lucene.util.bkd;
+import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
@@ -64,7 +65,10 @@ public class Test2BBKDPoints extends LuceneTestCase {
IndexInput in = dir.openInput("1d.bkd", IOContext.DEFAULT);
in.seek(indexFP);
BKDReader r = new BKDReader(in);
- r.verify(numDocs);
+ CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("1d", numDocs, r);
+ r.intersect(visitor);
+ assertEquals(r.size(), visitor.getPointCountSeen());
+ assertEquals(r.getDocCount(), visitor.getDocCountSeen());
in.close();
dir.close();
}
@@ -101,7 +105,10 @@ public class Test2BBKDPoints extends LuceneTestCase {
IndexInput in = dir.openInput("2d.bkd", IOContext.DEFAULT);
in.seek(indexFP);
BKDReader r = new BKDReader(in);
- r.verify(numDocs);
+ CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("2d", numDocs, r);
+ r.intersect(visitor);
+ assertEquals(r.size(), visitor.getPointCountSeen());
+ assertEquals(r.getDocCount(), visitor.getDocCountSeen());
in.close();
dir.close();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index 9eb1fd3..8b9b7a5 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -28,6 +28,7 @@ import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.CorruptingIndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterDirectory;
@@ -1010,4 +1011,57 @@ public class TestBKD extends LuceneTestCase {
}
}
+ // Claims 16 bytes per dim, but only use the bottom N 1-3 bytes; this would happen e.g. if a user indexes what are actually just short
+ // values as a LongPoint:
+ public void testWastedLeadingBytes() throws Exception {
+ int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
+ int bytesPerDim = PointValues.MAX_NUM_BYTES;
+ int bytesUsed = TestUtil.nextInt(random(), 1, 3);
+
+ Directory dir = newFSDirectory(createTempDir());
+ int numDocs = 100000;
+ BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", numDims, bytesPerDim, 32, 1f, numDocs, true);
+ byte[] tmp = new byte[bytesUsed];
+ byte[] buffer = new byte[numDims * bytesPerDim];
+ for(int i=0;i<numDocs;i++) {
+ for(int dim=0;dim<numDims;dim++) {
+ random().nextBytes(tmp);
+ System.arraycopy(tmp, 0, buffer, dim*bytesPerDim+(bytesPerDim-bytesUsed), tmp.length);
+ }
+ w.add(buffer, i);
+ }
+
+ IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
+ long fp = w.finish(out);
+ out.close();
+
+ IndexInput in = dir.openInput("bkd", IOContext.DEFAULT);
+ in.seek(fp);
+ BKDReader r = new BKDReader(in);
+ int[] count = new int[1];
+ r.intersect(new IntersectVisitor() {
+
+ @Override
+ public void visit(int docID) {
+ count[0]++;
+ }
+
+ @Override
+ public void visit(int docID, byte[] packedValue) {
+ visit(docID);
+ }
+
+ @Override
+ public Relation compare(byte[] minPacked, byte[] maxPacked) {
+ if (random().nextInt(7) == 1) {
+ return Relation.CELL_CROSSES_QUERY;
+ } else {
+ return Relation.CELL_INSIDE_QUERY;
+ }
+ }
+ });
+ assertEquals(numDocs, count[0]);
+ in.close();
+ dir.close();
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
index 39b3282..b0e5427 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
@@ -229,7 +229,7 @@ public class TestFSTs extends LuceneTestCase {
final long value = lastOutput + TestUtil.nextInt(random(), 1, 1000);
lastOutput = value;
pairs.add(new FSTTester.InputOutput<>(terms[idx],
- outputs.newPair((long) idx, value)));
+ outputs.newPair((long) idx, value)));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java
index 3b9f302..587c63f 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/document/NearestNeighbor.java
@@ -26,7 +26,10 @@ import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SloppyMath;
+import org.apache.lucene.util.bkd.BKDReader.IndexTree;
+import org.apache.lucene.util.bkd.BKDReader.IntersectState;
import org.apache.lucene.util.bkd.BKDReader;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
@@ -41,16 +44,16 @@ class NearestNeighbor {
static class Cell implements Comparable<Cell> {
final int readerIndex;
- final int nodeID;
final byte[] minPacked;
final byte[] maxPacked;
+ final IndexTree index;
/** The closest possible distance of all points in this cell */
final double distanceMeters;
- public Cell(int readerIndex, int nodeID, byte[] minPacked, byte[] maxPacked, double distanceMeters) {
+ public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceMeters) {
+ this.index = index;
this.readerIndex = readerIndex;
- this.nodeID = nodeID;
this.minPacked = minPacked.clone();
this.maxPacked = maxPacked.clone();
this.distanceMeters = distanceMeters;
@@ -66,7 +69,7 @@ class NearestNeighbor {
double minLon = decodeLongitude(minPacked, Integer.BYTES);
double maxLat = decodeLatitude(maxPacked, 0);
double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
- return "Cell(readerIndex=" + readerIndex + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")";
+ return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")";
}
}
@@ -219,13 +222,21 @@ class NearestNeighbor {
List<BKDReader.IntersectState> states = new ArrayList<>();
// Add root cell for each reader into the queue:
+ int bytesPerDim = -1;
+
for(int i=0;i<readers.size();i++) {
BKDReader reader = readers.get(i);
+ if (bytesPerDim == -1) {
+ bytesPerDim = reader.getBytesPerDimension();
+ } else if (bytesPerDim != reader.getBytesPerDimension()) {
+ throw new IllegalStateException("bytesPerDim changed from " + bytesPerDim + " to " + reader.getBytesPerDimension() + " across readers");
+ }
byte[] minPackedValue = reader.getMinPackedValue();
byte[] maxPackedValue = reader.getMaxPackedValue();
- states.add(reader.getIntersectState(visitor));
+ IntersectState state = reader.getIntersectState(visitor);
+ states.add(state);
- cellQueue.offer(new Cell(i, 1, reader.getMinPackedValue(), reader.getMaxPackedValue(),
+ cellQueue.offer(new Cell(state.index, i, reader.getMinPackedValue(), reader.getMaxPackedValue(),
approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon)));
}
@@ -236,12 +247,12 @@ class NearestNeighbor {
// TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to break once this "best" cell is fully outside of the hitQueue bottom's radius:
BKDReader reader = readers.get(cell.readerIndex);
- if (reader.isLeafNode(cell.nodeID)) {
+ if (cell.index.isLeafNode()) {
//System.out.println(" leaf");
// Leaf block: visit all points and possibly collect them:
visitor.curDocBase = docBases.get(cell.readerIndex);
visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
- reader.visitLeafBlockValues(cell.nodeID, states.get(cell.readerIndex));
+ reader.visitLeafBlockValues(cell.index, states.get(cell.readerIndex));
//System.out.println(" now " + hitQueue.size() + " hits");
} else {
//System.out.println(" non-leaf");
@@ -257,14 +268,23 @@ class NearestNeighbor {
continue;
}
+ BytesRef splitValue = BytesRef.deepCopyOf(cell.index.getSplitDimValue());
+ int splitDim = cell.index.getSplitDim();
+
+ // we must clone the index so that we we can recurse left and right "concurrently":
+ IndexTree newIndex = cell.index.clone();
byte[] splitPackedValue = cell.maxPacked.clone();
- reader.copySplitValue(cell.nodeID, splitPackedValue);
- cellQueue.offer(new Cell(cell.readerIndex, 2*cell.nodeID, cell.minPacked, splitPackedValue,
+ System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+
+ cell.index.pushLeft();
+ cellQueue.offer(new Cell(cell.index, cell.readerIndex, cell.minPacked, splitPackedValue,
approxBestDistance(cell.minPacked, splitPackedValue, pointLat, pointLon)));
splitPackedValue = cell.minPacked.clone();
- reader.copySplitValue(cell.nodeID, splitPackedValue);
- cellQueue.offer(new Cell(cell.readerIndex, 2*cell.nodeID+1, splitPackedValue, cell.maxPacked,
+ System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+
+ newIndex.pushRight();
+ cellQueue.offer(new Cell(newIndex, cell.readerIndex, splitPackedValue, cell.maxPacked,
approxBestDistance(splitPackedValue, cell.maxPacked, pointLat, pointLon)));
}
}
[4/5] lucene-solr:branch_6x: LUCENE-7563: remove redundant array copy
in PackedIndexTree.clone
Posted by mi...@apache.org.
LUCENE-7563: remove redundant array copy in PackedIndexTree.clone
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/fd1f608b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/fd1f608b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/fd1f608b
Branch: refs/heads/branch_6x
Commit: fd1f608b49a7a8b5f7e6cc805378da2217ec657a
Parents: f51766c
Author: Mike McCandless <mi...@apache.org>
Authored: Mon Dec 5 06:45:16 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Wed Dec 7 06:32:35 2016 -0500
----------------------------------------------------------------------
lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java | 1 -
1 file changed, 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fd1f608b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 8970124..ef9bccc 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -347,7 +347,6 @@ public final class BKDReader extends PointValues implements Accountable {
index.nodeID = nodeID;
index.level = level;
index.splitDim = splitDim;
- System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
index.leafBlockFPStack[level] = leafBlockFPStack[level];
index.leftNodePositions[level] = leftNodePositions[level];
index.rightNodePositions[level] = rightNodePositions[level];
[3/5] lucene-solr:branch_6x: LUCENE-7563: use a compressed format for
the in-heap BKD index
Posted by mi...@apache.org.
LUCENE-7563: use a compressed format for the in-heap BKD index
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f51766c0
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f51766c0
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f51766c0
Branch: refs/heads/branch_6x
Commit: f51766c00fc374a6fc6f407b723bd8458556de7d
Parents: 8c943b6
Author: Mike McCandless <mi...@apache.org>
Authored: Sun Dec 4 05:18:04 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Wed Dec 7 06:32:16 2016 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +
.../codecs/simpletext/SimpleTextBKDReader.java | 282 ++-
.../codecs/simpletext/SimpleTextBKDWriter.java | 1661 ++++++++++++++++++
.../simpletext/SimpleTextPointsReader.java | 5 +-
.../simpletext/SimpleTextPointsWriter.java | 188 +-
.../codecs/lucene60/Lucene60PointsFormat.java | 10 +-
.../lucene/codecs/lucene60/package-info.java | 4 +-
.../org/apache/lucene/index/CheckIndex.java | 319 ++--
.../org/apache/lucene/util/bkd/BKDReader.java | 658 ++++---
.../org/apache/lucene/util/bkd/BKDWriter.java | 293 ++-
.../apache/lucene/util/bkd/HeapPointReader.java | 7 +-
.../apache/lucene/util/bkd/HeapPointWriter.java | 22 +-
.../util/bkd/MutablePointsReaderUtils.java | 23 +-
.../lucene/util/bkd/OfflinePointReader.java | 8 +-
.../lucene/util/bkd/OfflinePointWriter.java | 10 +-
.../org/apache/lucene/util/bkd/PointReader.java | 14 +-
.../org/apache/lucene/util/bkd/PointWriter.java | 6 +-
.../apache/lucene/search/TestPointQueries.java | 3 +
.../apache/lucene/util/bkd/Test2BBKDPoints.java | 11 +-
.../org/apache/lucene/util/bkd/TestBKD.java | 54 +
.../org/apache/lucene/util/fst/TestFSTs.java | 2 +-
.../apache/lucene/document/NearestNeighbor.java | 44 +-
22 files changed, 2989 insertions(+), 639 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 21a80e0..1bf7735 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -83,6 +83,10 @@ Optimizations
* LUCENE-7568: Optimize merging when index sorting is used but the
index is already sorted (Jim Ferenczi via Mike McCandless)
+* LUCENE-7563: The BKD in-memory index for dimensional points now uses
+ a compressed format, using substantially less RAM in some cases
+ (Adrien Grand, Mike McCandless)
+
Other
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
index 35e9448..488547b 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
@@ -16,14 +16,17 @@
*/
package org.apache.lucene.codecs.simpletext;
-
import java.io.IOException;
import java.nio.charset.StandardCharsets;
-import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.codecs.simpletext.SimpleTextUtil;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.bkd.BKDReader;
@@ -31,15 +34,105 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_C
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE;
-class SimpleTextBKDReader extends BKDReader {
+/** Forked from {@link BKDReader} and simplified/specialized for SimpleText's usage */
+
+final class SimpleTextBKDReader extends PointValues implements Accountable {
+ // Packed array of byte[] holding all split values in the full binary tree:
+ final private byte[] splitPackedValues;
+ final long[] leafBlockFPs;
+ final private int leafNodeOffset;
+ final int numDims;
+ final int bytesPerDim;
+ final int bytesPerIndexEntry;
+ final IndexInput in;
+ final int maxPointsInLeafNode;
+ final byte[] minPackedValue;
+ final byte[] maxPackedValue;
+ final long pointCount;
+ final int docCount;
+ final int version;
+ protected final int packedBytesLength;
- public SimpleTextBKDReader(IndexInput datIn, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
+ public SimpleTextBKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
- super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minPackedValue, maxPackedValue, pointCount, docCount);
+ this.in = in;
+ this.numDims = numDims;
+ this.maxPointsInLeafNode = maxPointsInLeafNode;
+ this.bytesPerDim = bytesPerDim;
+ // no version check here because callers of this API (SimpleText) have no back compat:
+ bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1;
+ packedBytesLength = numDims * bytesPerDim;
+ this.leafNodeOffset = leafBlockFPs.length;
+ this.leafBlockFPs = leafBlockFPs;
+ this.splitPackedValues = splitPackedValues;
+ this.minPackedValue = minPackedValue;
+ this.maxPackedValue = maxPackedValue;
+ this.pointCount = pointCount;
+ this.docCount = docCount;
+ this.version = SimpleTextBKDWriter.VERSION_CURRENT;
+ assert minPackedValue.length == packedBytesLength;
+ assert maxPackedValue.length == packedBytesLength;
}
- @Override
- protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
+ /** Used to track all state for a single call to {@link #intersect}. */
+ public static final class IntersectState {
+ final IndexInput in;
+ final int[] scratchDocIDs;
+ final byte[] scratchPackedValue;
+ final int[] commonPrefixLengths;
+
+ final IntersectVisitor visitor;
+
+ public IntersectState(IndexInput in, int numDims,
+ int packedBytesLength,
+ int maxPointsInLeafNode,
+ IntersectVisitor visitor) {
+ this.in = in;
+ this.visitor = visitor;
+ this.commonPrefixLengths = new int[numDims];
+ this.scratchDocIDs = new int[maxPointsInLeafNode];
+ this.scratchPackedValue = new byte[packedBytesLength];
+ }
+ }
+
+ public void intersect(IntersectVisitor visitor) throws IOException {
+ intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
+ }
+
+ /** Fast path: this is called when the query box fully encompasses all cells under this node. */
+ private void addAll(IntersectState state, int nodeID) throws IOException {
+ //System.out.println("R: addAll nodeID=" + nodeID);
+
+ if (nodeID >= leafNodeOffset) {
+ //System.out.println("ADDALL");
+ visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor);
+ // TODO: we can assert that the first value here in fact matches what the index claimed?
+ } else {
+ addAll(state, 2*nodeID);
+ addAll(state, 2*nodeID+1);
+ }
+ }
+
+ /** Create a new {@link IntersectState} */
+ public IntersectState getIntersectState(IntersectVisitor visitor) {
+ return new IntersectState(in.clone(), numDims,
+ packedBytesLength,
+ maxPointsInLeafNode,
+ visitor);
+ }
+
+ /** Visits all docIDs and packed values in a single leaf block */
+ public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException {
+ int leafID = nodeID - leafNodeOffset;
+
+ // Leaf node; scan and filter all points in this block:
+ int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
+
+ // Again, this time reading values and checking with the visitor
+ visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+ }
+
+ void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
in.seek(blockFP);
readLine(in, scratch);
@@ -51,8 +144,7 @@ class SimpleTextBKDReader extends BKDReader {
}
}
- @Override
- protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
+ int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
in.seek(blockFP);
readLine(in, scratch);
@@ -64,8 +156,7 @@ class SimpleTextBKDReader extends BKDReader {
return count;
}
- @Override
- protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+ void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
visitor.grow(count);
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
assert scratchPackedValue.length == packedBytesLength;
@@ -80,6 +171,175 @@ class SimpleTextBKDReader extends BKDReader {
}
}
+ private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor, int compressedDim) throws IOException {
+ // the byte at `compressedByteOffset` is compressed using run-length compression,
+ // other suffix bytes are stored verbatim
+ final int compressedByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim];
+ commonPrefixLengths[compressedDim]++;
+ int i;
+ for (i = 0; i < count; ) {
+ scratchPackedValue[compressedByteOffset] = in.readByte();
+ final int runLen = Byte.toUnsignedInt(in.readByte());
+ for (int j = 0; j < runLen; ++j) {
+ for(int dim=0;dim<numDims;dim++) {
+ int prefix = commonPrefixLengths[dim];
+ in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
+ }
+ visitor.visit(docIDs[i+j], scratchPackedValue);
+ }
+ i += runLen;
+ }
+ if (i != count) {
+ throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
+ }
+ }
+
+ private int readCompressedDim(IndexInput in) throws IOException {
+ int compressedDim = in.readByte();
+ if (compressedDim < -1 || compressedDim >= numDims) {
+ throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
+ }
+ return compressedDim;
+ }
+
+ private void readCommonPrefixes(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException {
+ for(int dim=0;dim<numDims;dim++) {
+ int prefix = in.readVInt();
+ commonPrefixLengths[dim] = prefix;
+ if (prefix > 0) {
+ in.readBytes(scratchPackedValue, dim*bytesPerDim, prefix);
+ }
+ //System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
+ }
+ }
+
+ private void intersect(IntersectState state,
+ int nodeID,
+ byte[] cellMinPacked, byte[] cellMaxPacked)
+ throws IOException {
+
+ /*
+ System.out.println("\nR: intersect nodeID=" + nodeID);
+ for(int dim=0;dim<numDims;dim++) {
+ System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
+ }
+ */
+
+ Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);
+
+ if (r == Relation.CELL_OUTSIDE_QUERY) {
+ // This cell is fully outside of the query shape: stop recursing
+ return;
+ } else if (r == Relation.CELL_INSIDE_QUERY) {
+ // This cell is fully inside of the query shape: recursively add all points in this cell without filtering
+ addAll(state, nodeID);
+ return;
+ } else {
+ // The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering
+ }
+
+ if (nodeID >= leafNodeOffset) {
+ // TODO: we can assert that the first value here in fact matches what the index claimed?
+
+ int leafID = nodeID - leafNodeOffset;
+
+ // In the unbalanced case it's possible the left most node only has one child:
+ if (leafID < leafBlockFPs.length) {
+ // Leaf node; scan and filter all points in this block:
+ int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
+
+ // Again, this time reading values and checking with the visitor
+ visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+ }
+
+ } else {
+
+ // Non-leaf node: recurse on the split left and right nodes
+
+ int address = nodeID * bytesPerIndexEntry;
+ int splitDim;
+ if (numDims == 1) {
+ splitDim = 0;
+ } else {
+ splitDim = splitPackedValues[address++] & 0xff;
+ }
+
+ assert splitDim < numDims;
+
+ // TODO: can we alloc & reuse this up front?
+
+ byte[] splitPackedValue = new byte[packedBytesLength];
+
+ // Recurse on left sub-tree:
+ System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
+ System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ intersect(state,
+ 2*nodeID,
+ cellMinPacked, splitPackedValue);
+
+ // Recurse on right sub-tree:
+ System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
+ System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ intersect(state,
+ 2*nodeID+1,
+ splitPackedValue, cellMaxPacked);
+ }
+ }
+
+ /** Copies the split value for this node into the provided byte array */
+ public void copySplitValue(int nodeID, byte[] splitPackedValue) {
+ int address = nodeID * bytesPerIndexEntry;
+ int splitDim;
+ if (numDims == 1) {
+ splitDim = 0;
+ } else {
+ splitDim = splitPackedValues[address++] & 0xff;
+ }
+
+ assert splitDim < numDims;
+ System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return RamUsageEstimator.sizeOf(splitPackedValues) +
+ RamUsageEstimator.sizeOf(leafBlockFPs);
+ }
+
+ @Override
+ public byte[] getMinPackedValue() {
+ return minPackedValue.clone();
+ }
+
+ @Override
+ public byte[] getMaxPackedValue() {
+ return maxPackedValue.clone();
+ }
+
+ @Override
+ public int getNumDimensions() {
+ return numDims;
+ }
+
+ @Override
+ public int getBytesPerDimension() {
+ return bytesPerDim;
+ }
+
+ @Override
+ public long size() {
+ return pointCount;
+ }
+
+ @Override
+ public int getDocCount() {
+ return docCount;
+ }
+
+ public boolean isLeafNode(int nodeID) {
+ return nodeID >= leafNodeOffset;
+ }
+
private int parseInt(BytesRefBuilder scratch, BytesRef prefix) {
assert startsWith(scratch, prefix);
return Integer.parseInt(stripPrefix(scratch, prefix));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java
new file mode 100644
index 0000000..d7674ed
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java
@@ -0,0 +1,1661 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.simpletext;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.function.IntFunction;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.TrackingDirectoryWrapper;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.BytesRefComparator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LongBitSet;
+import org.apache.lucene.util.MSBRadixSorter;
+import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.bkd.BKDWriter;
+import org.apache.lucene.util.bkd.HeapPointWriter;
+import org.apache.lucene.util.bkd.MutablePointsReaderUtils;
+import org.apache.lucene.util.bkd.OfflinePointReader;
+import org.apache.lucene.util.bkd.OfflinePointWriter;
+import org.apache.lucene.util.bkd.PointReader;
+import org.apache.lucene.util.bkd.PointWriter;
+
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_COUNT;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_FP;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BYTES_PER_DIM;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.DOC_COUNT;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.INDEX_COUNT;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.MAX_LEAF_POINTS;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.MAX_VALUE;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.MIN_VALUE;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.NUM_DIMS;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.POINT_COUNT;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.SPLIT_COUNT;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.SPLIT_DIM;
+import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.SPLIT_VALUE;
+
+
+// TODO
+// - allow variable length byte[] (across docs and dims), but this is quite a bit more hairy
+// - we could also index "auto-prefix terms" here, and use better compression, and maybe only use for the "fully contained" case so we'd
+// only index docIDs
+// - the index could be efficiently encoded as an FST, so we don't have wasteful
+// (monotonic) long[] leafBlockFPs; or we could use MonotonicLongValues ... but then
+// the index is already plenty small: 60M OSM points --> 1.1 MB with 128 points
+// per leaf, and you can reduce that by putting more points per leaf
+// - we could use threads while building; the higher nodes are very parallelizable
+
+/** Forked from {@link BKDWriter} and simplified/specialized for SimpleText's usage */
+
+final class SimpleTextBKDWriter implements Closeable {
+
+ public static final String CODEC_NAME = "BKD";
+ public static final int VERSION_START = 0;
+ public static final int VERSION_COMPRESSED_DOC_IDS = 1;
+ public static final int VERSION_COMPRESSED_VALUES = 2;
+ public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
+ public static final int VERSION_CURRENT = VERSION_IMPLICIT_SPLIT_DIM_1D;
+
+ /** How many bytes each docs takes in the fixed-width offline format */
+ private final int bytesPerDoc;
+
+ /** Default maximum number of point in each leaf block */
+ public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024;
+
+ /** Default maximum heap to use, before spilling to (slower) disk */
+ public static final float DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0f;
+
+ /** Maximum number of dimensions */
+ public static final int MAX_DIMS = 8;
+
+ /** How many dimensions we are indexing */
+ protected final int numDims;
+
+ /** How many bytes each value in each dimension takes. */
+ protected final int bytesPerDim;
+
+ /** numDims * bytesPerDim */
+ protected final int packedBytesLength;
+
+ final BytesRefBuilder scratch = new BytesRefBuilder();
+
+ final TrackingDirectoryWrapper tempDir;
+ final String tempFileNamePrefix;
+ final double maxMBSortInHeap;
+
+ final byte[] scratchDiff;
+ final byte[] scratch1;
+ final byte[] scratch2;
+ final BytesRef scratchBytesRef1 = new BytesRef();
+ final BytesRef scratchBytesRef2 = new BytesRef();
+ final int[] commonPrefixLengths;
+
+ protected final FixedBitSet docsSeen;
+
+ private OfflinePointWriter offlinePointWriter;
+ private HeapPointWriter heapPointWriter;
+
+ private IndexOutput tempInput;
+ protected final int maxPointsInLeafNode;
+ private final int maxPointsSortInHeap;
+
+ /** Minimum per-dim values, packed */
+ protected final byte[] minPackedValue;
+
+ /** Maximum per-dim values, packed */
+ protected final byte[] maxPackedValue;
+
+ protected long pointCount;
+
+ /** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */
+ protected final boolean longOrds;
+
+ /** An upper bound on how many points the caller will add (includes deletions) */
+ private final long totalPointCount;
+
+ /** True if every document has at most one value. We specialize this case by not bothering to store the ord since it's redundant with docID. */
+ protected final boolean singleValuePerDoc;
+
+ /** How much heap OfflineSorter is allowed to use */
+ protected final OfflineSorter.BufferSize offlineSorterBufferMB;
+
+ /** How much heap OfflineSorter is allowed to use */
+ protected final int offlineSorterMaxTempFiles;
+
+ private final int maxDoc;
+
+ public SimpleTextBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
+ int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws IOException {
+ this(maxDoc, tempDir, tempFileNamePrefix, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc,
+ totalPointCount > Integer.MAX_VALUE, Math.max(1, (long) maxMBSortInHeap), OfflineSorter.MAX_TEMPFILES);
+ }
+
+ private SimpleTextBKDWriter(int maxDoc, Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim,
+ int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount,
+ boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws IOException {
+ verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
+ // We use tracking dir to deal with removing files on exception, so each place that
+ // creates temp files doesn't need crazy try/finally/sucess logic:
+ this.tempDir = new TrackingDirectoryWrapper(tempDir);
+ this.tempFileNamePrefix = tempFileNamePrefix;
+ this.maxPointsInLeafNode = maxPointsInLeafNode;
+ this.numDims = numDims;
+ this.bytesPerDim = bytesPerDim;
+ this.totalPointCount = totalPointCount;
+ this.maxDoc = maxDoc;
+ this.offlineSorterBufferMB = OfflineSorter.BufferSize.megabytes(offlineSorterBufferMB);
+ this.offlineSorterMaxTempFiles = offlineSorterMaxTempFiles;
+ docsSeen = new FixedBitSet(maxDoc);
+ packedBytesLength = numDims * bytesPerDim;
+
+ scratchDiff = new byte[bytesPerDim];
+ scratch1 = new byte[packedBytesLength];
+ scratch2 = new byte[packedBytesLength];
+ commonPrefixLengths = new int[numDims];
+
+ minPackedValue = new byte[packedBytesLength];
+ maxPackedValue = new byte[packedBytesLength];
+
+ // If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes).
+ this.longOrds = longOrds;
+
+ this.singleValuePerDoc = singleValuePerDoc;
+
+ // dimensional values (numDims * bytesPerDim) + ord (int or long) + docID (int)
+ if (singleValuePerDoc) {
+ // Lucene only supports up to 2.1 docs, so we better not need longOrds in this case:
+ assert longOrds == false;
+ bytesPerDoc = packedBytesLength + Integer.BYTES;
+ } else if (longOrds) {
+ bytesPerDoc = packedBytesLength + Long.BYTES + Integer.BYTES;
+ } else {
+ bytesPerDoc = packedBytesLength + Integer.BYTES + Integer.BYTES;
+ }
+
+ // As we recurse, we compute temporary partitions of the data, halving the
+ // number of points at each recursion. Once there are few enough points,
+ // we can switch to sorting in heap instead of offline (on disk). At any
+ // time in the recursion, we hold the number of points at that level, plus
+ // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
+ // what that level would consume, so we multiply by 0.5 to convert from
+ // bytes to points here. Each dimension has its own sorted partition, so
+ // we must divide by numDims as wel.
+
+ maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
+
+ // Finally, we must be able to hold at least the leaf node in heap during build:
+ if (maxPointsSortInHeap < maxPointsInLeafNode) {
+ throw new IllegalArgumentException("maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap + ", but this is less than maxPointsInLeafNode=" + maxPointsInLeafNode + "; either increase maxMBSortInHeap or decrease maxPointsInLeafNode");
+ }
+
+ // We write first maxPointsSortInHeap in heap, then cutover to offline for additional points:
+ heapPointWriter = new HeapPointWriter(16, maxPointsSortInHeap, packedBytesLength, longOrds, singleValuePerDoc);
+
+ this.maxMBSortInHeap = maxMBSortInHeap;
+ }
+
+ public static void verifyParams(int numDims, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount) {
+ // We encode dim in a single byte in the splitPackedValues, but we only expose 4 bits for it now, in case we want to use
+ // remaining 4 bits for another purpose later
+ if (numDims < 1 || numDims > MAX_DIMS) {
+ throw new IllegalArgumentException("numDims must be 1 .. " + MAX_DIMS + " (got: " + numDims + ")");
+ }
+ if (maxPointsInLeafNode <= 0) {
+ throw new IllegalArgumentException("maxPointsInLeafNode must be > 0; got " + maxPointsInLeafNode);
+ }
+ if (maxPointsInLeafNode > ArrayUtil.MAX_ARRAY_LENGTH) {
+ throw new IllegalArgumentException("maxPointsInLeafNode must be <= ArrayUtil.MAX_ARRAY_LENGTH (= " + ArrayUtil.MAX_ARRAY_LENGTH + "); got " + maxPointsInLeafNode);
+ }
+ if (maxMBSortInHeap < 0.0) {
+ throw new IllegalArgumentException("maxMBSortInHeap must be >= 0.0 (got: " + maxMBSortInHeap + ")");
+ }
+ if (totalPointCount < 0) {
+ throw new IllegalArgumentException("totalPointCount must be >=0 (got: " + totalPointCount + ")");
+ }
+ }
+
+ /** If the current segment has too many points then we spill over to temp files / offline sort. */
+ private void spillToOffline() throws IOException {
+
+ // For each .add we just append to this input file, then in .finish we sort this input and resursively build the tree:
+ offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, "spill", 0, singleValuePerDoc);
+ tempInput = offlinePointWriter.out;
+ PointReader reader = heapPointWriter.getReader(0, pointCount);
+ for(int i=0;i<pointCount;i++) {
+ boolean hasNext = reader.next();
+ assert hasNext;
+ offlinePointWriter.append(reader.packedValue(), i, heapPointWriter.docIDs[i]);
+ }
+
+ heapPointWriter = null;
+ }
+
+ public void add(byte[] packedValue, int docID) throws IOException {
+ if (packedValue.length != packedBytesLength) {
+ throw new IllegalArgumentException("packedValue should be length=" + packedBytesLength + " (got: " + packedValue.length + ")");
+ }
+
+ if (pointCount >= maxPointsSortInHeap) {
+ if (offlinePointWriter == null) {
+ spillToOffline();
+ }
+ offlinePointWriter.append(packedValue, pointCount, docID);
+ } else {
+ // Not too many points added yet, continue using heap:
+ heapPointWriter.append(packedValue, pointCount, docID);
+ }
+
+ // TODO: we could specialize for the 1D case:
+ if (pointCount == 0) {
+ System.arraycopy(packedValue, 0, minPackedValue, 0, packedBytesLength);
+ System.arraycopy(packedValue, 0, maxPackedValue, 0, packedBytesLength);
+ } else {
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = dim*bytesPerDim;
+ if (StringHelper.compare(bytesPerDim, packedValue, offset, minPackedValue, offset) < 0) {
+ System.arraycopy(packedValue, offset, minPackedValue, offset, bytesPerDim);
+ }
+ if (StringHelper.compare(bytesPerDim, packedValue, offset, maxPackedValue, offset) > 0) {
+ System.arraycopy(packedValue, offset, maxPackedValue, offset, bytesPerDim);
+ }
+ }
+ }
+
+ pointCount++;
+ if (pointCount > totalPointCount) {
+ throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
+ }
+ docsSeen.set(docID);
+ }
+
+ /** How many points have been added so far */
+ public long getPointCount() {
+ return pointCount;
+ }
+
+ private static class MergeReader {
+ final SimpleTextBKDReader bkd;
+ final SimpleTextBKDReader.IntersectState state;
+ final MergeState.DocMap docMap;
+
+ /** Current doc ID */
+ public int docID;
+
+ /** Which doc in this block we are up to */
+ private int docBlockUpto;
+
+ /** How many docs in the current block */
+ private int docsInBlock;
+
+ /** Which leaf block we are up to */
+ private int blockID;
+
+ private final byte[] packedValues;
+
+ public MergeReader(SimpleTextBKDReader bkd, MergeState.DocMap docMap) throws IOException {
+ this.bkd = bkd;
+ state = new SimpleTextBKDReader.IntersectState(bkd.in.clone(),
+ bkd.numDims,
+ bkd.packedBytesLength,
+ bkd.maxPointsInLeafNode,
+ null);
+ this.docMap = docMap;
+ long minFP = Long.MAX_VALUE;
+ //System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length);
+ for(long fp : bkd.leafBlockFPs) {
+ minFP = Math.min(minFP, fp);
+ //System.out.println(" leaf fp=" + fp);
+ }
+ state.in.seek(minFP);
+ this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength];
+ }
+
+ public boolean next() throws IOException {
+ //System.out.println("MR.next this=" + this);
+ while (true) {
+ if (docBlockUpto == docsInBlock) {
+ if (blockID == bkd.leafBlockFPs.length) {
+ //System.out.println(" done!");
+ return false;
+ }
+ //System.out.println(" new block @ fp=" + state.in.getFilePointer());
+ docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs);
+ assert docsInBlock > 0;
+ docBlockUpto = 0;
+ bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() {
+ int i = 0;
+
+ @Override
+ public void visit(int docID) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {
+ assert docID == state.scratchDocIDs[i];
+ System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength);
+ i++;
+ }
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ throw new UnsupportedOperationException();
+ }
+
+ });
+
+ blockID++;
+ }
+
+ final int index = docBlockUpto++;
+ int oldDocID = state.scratchDocIDs[index];
+
+ int mappedDocID;
+ if (docMap == null) {
+ mappedDocID = oldDocID;
+ } else {
+ mappedDocID = docMap.get(oldDocID);
+ }
+
+ if (mappedDocID != -1) {
+ // Not deleted!
+ docID = mappedDocID;
+ System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue, 0, bkd.packedBytesLength);
+ return true;
+ }
+ }
+ }
+ }
+
+ private static class BKDMergeQueue extends PriorityQueue<MergeReader> {
+ private final int bytesPerDim;
+
+ public BKDMergeQueue(int bytesPerDim, int maxSize) {
+ super(maxSize);
+ this.bytesPerDim = bytesPerDim;
+ }
+
+ @Override
+ public boolean lessThan(MergeReader a, MergeReader b) {
+ assert a != b;
+
+ int cmp = StringHelper.compare(bytesPerDim, a.state.scratchPackedValue, 0, b.state.scratchPackedValue, 0);
+ if (cmp < 0) {
+ return true;
+ } else if (cmp > 0) {
+ return false;
+ }
+
+ // Tie break by sorting smaller docIDs earlier:
+ return a.docID < b.docID;
+ }
+ }
+
+ /** Write a field from a {@link MutablePointValues}. This way of writing
+ * points is faster than regular writes with {@link BKDWriter#add} since
+ * there is opportunity for reordering points before writing them to
+ * disk. This method does not use transient disk in order to reorder points.
+ */
+ public long writeField(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
+ if (numDims == 1) {
+ return writeField1Dim(out, fieldName, reader);
+ } else {
+ return writeFieldNDims(out, fieldName, reader);
+ }
+ }
+
+
+ /* In the 2+D case, we recursively pick the split dimension, compute the
+ * median value and partition other values around it. */
+ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValues values) throws IOException {
+ if (pointCount != 0) {
+ throw new IllegalStateException("cannot mix add and writeField");
+ }
+
+ // Catch user silliness:
+ if (heapPointWriter == null && tempInput == null) {
+ throw new IllegalStateException("already finished");
+ }
+
+ // Mark that we already finished:
+ heapPointWriter = null;
+
+ long countPerLeaf = pointCount = values.size();
+ long innerNodeCount = 1;
+
+ while (countPerLeaf > maxPointsInLeafNode) {
+ countPerLeaf = (countPerLeaf+1)/2;
+ innerNodeCount *= 2;
+ }
+
+ int numLeaves = Math.toIntExact(innerNodeCount);
+
+ checkMaxLeafNodeCount(numLeaves);
+
+ final byte[] splitPackedValues = new byte[numLeaves * (bytesPerDim + 1)];
+ final long[] leafBlockFPs = new long[numLeaves];
+
+ // compute the min/max for this slice
+ Arrays.fill(minPackedValue, (byte) 0xff);
+ Arrays.fill(maxPackedValue, (byte) 0);
+ for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
+ values.getValue(i, scratchBytesRef1);
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = dim*bytesPerDim;
+ if (StringHelper.compare(bytesPerDim, scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset) < 0) {
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset, bytesPerDim);
+ }
+ if (StringHelper.compare(bytesPerDim, scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset) > 0) {
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, bytesPerDim);
+ }
+ }
+
+ docsSeen.set(values.getDocID(i));
+ }
+
+ build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out,
+ minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
+ new int[maxPointsInLeafNode]);
+
+ long indexFP = out.getFilePointer();
+ writeIndex(out, leafBlockFPs, splitPackedValues);
+ return indexFP;
+ }
+
+
+ /* In the 1D case, we can simply sort points in ascending order and use the
+ * same writing logic as we use at merge time. */
+ private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
+ MutablePointsReaderUtils.sort(maxDoc, packedBytesLength, reader, 0, Math.toIntExact(reader.size()));
+
+ final OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(out);
+
+ reader.intersect(new IntersectVisitor() {
+
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {
+ oneDimWriter.add(packedValue, docID);
+ }
+
+ @Override
+ public void visit(int docID) throws IOException {
+ throw new IllegalStateException();
+ }
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_CROSSES_QUERY;
+ }
+ });
+
+ return oneDimWriter.finish();
+ }
+
+ // TODO: remove this opto: SimpleText is supposed to be simple!
+
+ /** More efficient bulk-add for incoming {@link SimpleTextBKDReader}s. This does a merge sort of the already
+ * sorted values and currently only works when numDims==1. This returns -1 if all documents containing
+ * dimensional values were deleted. */
+ public long merge(IndexOutput out, List<MergeState.DocMap> docMaps, List<SimpleTextBKDReader> readers) throws IOException {
+ assert docMaps == null || readers.size() == docMaps.size();
+
+ BKDMergeQueue queue = new BKDMergeQueue(bytesPerDim, readers.size());
+
+ for(int i=0;i<readers.size();i++) {
+ SimpleTextBKDReader bkd = readers.get(i);
+ MergeState.DocMap docMap;
+ if (docMaps == null) {
+ docMap = null;
+ } else {
+ docMap = docMaps.get(i);
+ }
+ MergeReader reader = new MergeReader(bkd, docMap);
+ if (reader.next()) {
+ queue.add(reader);
+ }
+ }
+
+ OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(out);
+
+ while (queue.size() != 0) {
+ MergeReader reader = queue.top();
+ // System.out.println("iter reader=" + reader);
+
+ // NOTE: doesn't work with subclasses (e.g. SimpleText!)
+ oneDimWriter.add(reader.state.scratchPackedValue, reader.docID);
+
+ if (reader.next()) {
+ queue.updateTop();
+ } else {
+ // This segment was exhausted
+ queue.pop();
+ }
+ }
+
+ return oneDimWriter.finish();
+ }
+
+ private class OneDimensionBKDWriter {
+
+ final IndexOutput out;
+ final List<Long> leafBlockFPs = new ArrayList<>();
+ final List<byte[]> leafBlockStartValues = new ArrayList<>();
+ final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength];
+ final int[] leafDocs = new int[maxPointsInLeafNode];
+ long valueCount;
+ int leafCount;
+
+ OneDimensionBKDWriter(IndexOutput out) {
+ if (numDims != 1) {
+ throw new UnsupportedOperationException("numDims must be 1 but got " + numDims);
+ }
+ if (pointCount != 0) {
+ throw new IllegalStateException("cannot mix add and merge");
+ }
+
+ // Catch user silliness:
+ if (heapPointWriter == null && tempInput == null) {
+ throw new IllegalStateException("already finished");
+ }
+
+ // Mark that we already finished:
+ heapPointWriter = null;
+
+ this.out = out;
+
+ lastPackedValue = new byte[packedBytesLength];
+ }
+
+ // for asserts
+ final byte[] lastPackedValue;
+ int lastDocID;
+
+ void add(byte[] packedValue, int docID) throws IOException {
+ assert valueInOrder(valueCount + leafCount,
+ 0, lastPackedValue, packedValue, 0, docID, lastDocID);
+
+ System.arraycopy(packedValue, 0, leafValues, leafCount * packedBytesLength, packedBytesLength);
+ leafDocs[leafCount] = docID;
+ docsSeen.set(docID);
+ leafCount++;
+
+ if (valueCount > totalPointCount) {
+ throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
+ }
+
+ if (leafCount == maxPointsInLeafNode) {
+ // We write a block once we hit exactly the max count ... this is different from
+ // when we flush a new segment, where we write between max/2 and max per leaf block,
+ // so merged segments will behave differently from newly flushed segments:
+ writeLeafBlock();
+ leafCount = 0;
+ }
+
+ assert (lastDocID = docID) >= 0; // only assign when asserts are enabled
+ }
+
+ public long finish() throws IOException {
+ if (leafCount > 0) {
+ writeLeafBlock();
+ leafCount = 0;
+ }
+
+ if (valueCount == 0) {
+ return -1;
+ }
+
+ pointCount = valueCount;
+
+ long indexFP = out.getFilePointer();
+
+ int numInnerNodes = leafBlockStartValues.size();
+
+ //System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts=" + leafBlockStartValues.size());
+
+ byte[] index = new byte[(1+numInnerNodes) * (1+bytesPerDim)];
+ rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
+ long[] arr = new long[leafBlockFPs.size()];
+ for(int i=0;i<leafBlockFPs.size();i++) {
+ arr[i] = leafBlockFPs.get(i);
+ }
+ writeIndex(out, arr, index);
+ return indexFP;
+ }
+
+ private void writeLeafBlock() throws IOException {
+ assert leafCount != 0;
+ if (valueCount == 0) {
+ System.arraycopy(leafValues, 0, minPackedValue, 0, packedBytesLength);
+ }
+ System.arraycopy(leafValues, (leafCount - 1) * packedBytesLength, maxPackedValue, 0, packedBytesLength);
+
+ valueCount += leafCount;
+
+ if (leafBlockFPs.size() > 0) {
+ // Save the first (minimum) value in each leaf block except the first, to build the split value index in the end:
+ leafBlockStartValues.add(Arrays.copyOf(leafValues, packedBytesLength));
+ }
+ leafBlockFPs.add(out.getFilePointer());
+ checkMaxLeafNodeCount(leafBlockFPs.size());
+
+ Arrays.fill(commonPrefixLengths, bytesPerDim);
+ // Find per-dim common prefix:
+ for(int dim=0;dim<numDims;dim++) {
+ int offset1 = dim * bytesPerDim;
+ int offset2 = (leafCount - 1) * packedBytesLength + offset1;
+ for(int j=0;j<commonPrefixLengths[dim];j++) {
+ if (leafValues[offset1+j] != leafValues[offset2+j]) {
+ commonPrefixLengths[dim] = j;
+ break;
+ }
+ }
+ }
+
+ writeLeafBlockDocs(out, leafDocs, 0, leafCount);
+
+ final IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
+ final BytesRef scratch = new BytesRef();
+
+ {
+ scratch.length = packedBytesLength;
+ scratch.bytes = leafValues;
+ }
+
+ @Override
+ public BytesRef apply(int i) {
+ scratch.offset = packedBytesLength * i;
+ return scratch;
+ }
+ };
+ assert valuesInOrderAndBounds(leafCount, 0, Arrays.copyOf(leafValues, packedBytesLength),
+ Arrays.copyOfRange(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
+ packedValues, leafDocs, 0);
+ writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues);
+ }
+
+ }
+
+ // TODO: there must be a simpler way?
+ private void rotateToTree(int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
+ //System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + " bpd=" + bytesPerDim + " index.length=" + index.length);
+ if (count == 1) {
+ // Leaf index node
+ //System.out.println(" leaf index node");
+ //System.out.println(" index[" + nodeID + "] = blockStartValues[" + offset + "]");
+ System.arraycopy(leafBlockStartValues.get(offset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
+ } else if (count > 1) {
+ // Internal index node: binary partition of count
+ int countAtLevel = 1;
+ int totalCount = 0;
+ while (true) {
+ int countLeft = count - totalCount;
+ //System.out.println(" cycle countLeft=" + countLeft + " coutAtLevel=" + countAtLevel);
+ if (countLeft <= countAtLevel) {
+ // This is the last level, possibly partially filled:
+ int lastLeftCount = Math.min(countAtLevel/2, countLeft);
+ assert lastLeftCount >= 0;
+ int leftHalf = (totalCount-1)/2 + lastLeftCount;
+
+ int rootOffset = offset + leftHalf;
+ /*
+ System.out.println(" last left count " + lastLeftCount);
+ System.out.println(" leftHalf " + leftHalf + " rightHalf=" + (count-leftHalf-1));
+ System.out.println(" rootOffset=" + rootOffset);
+ */
+
+ System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
+ //System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
+
+ // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
+ // under here, to save this while loop on each recursion
+
+ // Recurse left
+ rotateToTree(2*nodeID, offset, leftHalf, index, leafBlockStartValues);
+
+ // Recurse right
+ rotateToTree(2*nodeID+1, rootOffset+1, count-leftHalf-1, index, leafBlockStartValues);
+ return;
+ }
+ totalCount += countAtLevel;
+ countAtLevel *= 2;
+ }
+ } else {
+ assert count == 0;
+ }
+ }
+
+ // TODO: if we fixed each partition step to just record the file offset at the "split point", we could probably handle variable length
+ // encoding and not have our own ByteSequencesReader/Writer
+
+ /** Sort the heap writer by the specified dim */
+ private void sortHeapPointWriter(final HeapPointWriter writer, int dim) {
+ final int pointCount = Math.toIntExact(this.pointCount);
+ // Tie-break by docID:
+
+ // No need to tie break on ord, for the case where the same doc has the same value in a given dimension indexed more than once: it
+ // can't matter at search time since we don't write ords into the index:
+ new MSBRadixSorter(bytesPerDim + Integer.BYTES) {
+
+ @Override
+ protected int byteAt(int i, int k) {
+ assert k >= 0;
+ if (k < bytesPerDim) {
+ // dim bytes
+ int block = i / writer.valuesPerBlock;
+ int index = i % writer.valuesPerBlock;
+ return writer.blocks.get(block)[index * packedBytesLength + dim * bytesPerDim + k] & 0xff;
+ } else {
+ // doc id
+ int s = 3 - (k - bytesPerDim);
+ return (writer.docIDs[i] >>> (s * 8)) & 0xff;
+ }
+ }
+
+ @Override
+ protected void swap(int i, int j) {
+ int docID = writer.docIDs[i];
+ writer.docIDs[i] = writer.docIDs[j];
+ writer.docIDs[j] = docID;
+
+ if (singleValuePerDoc == false) {
+ if (longOrds) {
+ long ord = writer.ordsLong[i];
+ writer.ordsLong[i] = writer.ordsLong[j];
+ writer.ordsLong[j] = ord;
+ } else {
+ int ord = writer.ords[i];
+ writer.ords[i] = writer.ords[j];
+ writer.ords[j] = ord;
+ }
+ }
+
+ byte[] blockI = writer.blocks.get(i / writer.valuesPerBlock);
+ int indexI = (i % writer.valuesPerBlock) * packedBytesLength;
+ byte[] blockJ = writer.blocks.get(j / writer.valuesPerBlock);
+ int indexJ = (j % writer.valuesPerBlock) * packedBytesLength;
+
+ // scratch1 = values[i]
+ System.arraycopy(blockI, indexI, scratch1, 0, packedBytesLength);
+ // values[i] = values[j]
+ System.arraycopy(blockJ, indexJ, blockI, indexI, packedBytesLength);
+ // values[j] = scratch1
+ System.arraycopy(scratch1, 0, blockJ, indexJ, packedBytesLength);
+ }
+
+ }.sort(0, pointCount);
+ }
+
+ private PointWriter sort(int dim) throws IOException {
+ assert dim >= 0 && dim < numDims;
+
+ if (heapPointWriter != null) {
+
+ assert tempInput == null;
+
+ // We never spilled the incoming points to disk, so now we sort in heap:
+ HeapPointWriter sorted;
+
+ if (dim == 0) {
+ // First dim can re-use the current heap writer
+ sorted = heapPointWriter;
+ } else {
+ // Subsequent dims need a private copy
+ sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds, singleValuePerDoc);
+ sorted.copyFrom(heapPointWriter);
+ }
+
+ //long t0 = System.nanoTime();
+ sortHeapPointWriter(sorted, dim);
+ //long t1 = System.nanoTime();
+ //System.out.println("BKD: sort took " + ((t1-t0)/1000000.0) + " msec");
+
+ sorted.close();
+ return sorted;
+ } else {
+
+ // Offline sort:
+ assert tempInput != null;
+
+ final int offset = bytesPerDim * dim;
+
+ Comparator<BytesRef> cmp;
+ if (dim == numDims - 1) {
+ // in that case the bytes for the dimension and for the doc id are contiguous,
+ // so we don't need a branch
+ cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) {
+ @Override
+ protected int byteAt(BytesRef ref, int i) {
+ return ref.bytes[ref.offset + offset + i] & 0xff;
+ }
+ };
+ } else {
+ cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) {
+ @Override
+ protected int byteAt(BytesRef ref, int i) {
+ if (i < bytesPerDim) {
+ return ref.bytes[ref.offset + offset + i] & 0xff;
+ } else {
+ return ref.bytes[ref.offset + packedBytesLength + i - bytesPerDim] & 0xff;
+ }
+ }
+ };
+ }
+
+ OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) {
+
+ /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
+ @Override
+ protected ByteSequencesWriter getWriter(IndexOutput out) {
+ return new ByteSequencesWriter(out) {
+ @Override
+ public void write(byte[] bytes, int off, int len) throws IOException {
+ assert len == bytesPerDoc: "len=" + len + " bytesPerDoc=" + bytesPerDoc;
+ out.writeBytes(bytes, off, len);
+ }
+ };
+ }
+
+ /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
+ @Override
+ protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
+ return new ByteSequencesReader(in, name) {
+ final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]);
+ @Override
+ public BytesRef next() throws IOException {
+ if (in.getFilePointer() >= end) {
+ return null;
+ }
+ in.readBytes(scratch.bytes, 0, bytesPerDoc);
+ return scratch;
+ }
+ };
+ }
+ };
+
+ String name = sorter.sort(tempInput.getName());
+
+ return new OfflinePointWriter(tempDir, name, packedBytesLength, pointCount, longOrds, singleValuePerDoc);
+ }
+ }
+
+ private void checkMaxLeafNodeCount(int numLeaves) {
+ if ((1+bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
+ throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex");
+ }
+ }
+
+ /** Writes the BKD tree to the provided {@link IndexOutput} and returns the file offset where index was written. */
+ public long finish(IndexOutput out) throws IOException {
+ // System.out.println("\nBKDTreeWriter.finish pointCount=" + pointCount + " out=" + out + " heapWriter=" + heapPointWriter);
+
+ // TODO: specialize the 1D case? it's much faster at indexing time (no partitioning on recurse...)
+
+ // Catch user silliness:
+ if (heapPointWriter == null && tempInput == null) {
+ throw new IllegalStateException("already finished");
+ }
+
+ if (offlinePointWriter != null) {
+ offlinePointWriter.close();
+ }
+
+ if (pointCount == 0) {
+ throw new IllegalStateException("must index at least one point");
+ }
+
+ LongBitSet ordBitSet;
+ if (numDims > 1) {
+ if (singleValuePerDoc) {
+ ordBitSet = new LongBitSet(maxDoc);
+ } else {
+ ordBitSet = new LongBitSet(pointCount);
+ }
+ } else {
+ ordBitSet = null;
+ }
+
+ long countPerLeaf = pointCount;
+ long innerNodeCount = 1;
+
+ while (countPerLeaf > maxPointsInLeafNode) {
+ countPerLeaf = (countPerLeaf+1)/2;
+ innerNodeCount *= 2;
+ }
+
+ int numLeaves = (int) innerNodeCount;
+
+ checkMaxLeafNodeCount(numLeaves);
+
+ // NOTE: we could save the 1+ here, to use a bit less heap at search time, but then we'd need a somewhat costly check at each
+ // step of the recursion to recompute the split dim:
+
+ // Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each recursion says which dim we split on.
+ byte[] splitPackedValues = new byte[Math.toIntExact(numLeaves*(1+bytesPerDim))];
+
+ // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. 7)
+ long[] leafBlockFPs = new long[numLeaves];
+
+ // Make sure the math above "worked":
+ assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode;
+
+ // Sort all docs once by each dimension:
+ PathSlice[] sortedPointWriters = new PathSlice[numDims];
+
+ // This is only used on exception; on normal code paths we close all files we opened:
+ List<Closeable> toCloseHeroically = new ArrayList<>();
+
+ boolean success = false;
+ try {
+ //long t0 = System.nanoTime();
+ for(int dim=0;dim<numDims;dim++) {
+ sortedPointWriters[dim] = new PathSlice(sort(dim), 0, pointCount);
+ }
+ //long t1 = System.nanoTime();
+ //System.out.println("sort time: " + ((t1-t0)/1000000.0) + " msec");
+
+ if (tempInput != null) {
+ tempDir.deleteFile(tempInput.getName());
+ tempInput = null;
+ } else {
+ assert heapPointWriter != null;
+ heapPointWriter = null;
+ }
+
+ build(1, numLeaves, sortedPointWriters,
+ ordBitSet, out,
+ minPackedValue, maxPackedValue,
+ splitPackedValues,
+ leafBlockFPs,
+ toCloseHeroically);
+
+ for(PathSlice slice : sortedPointWriters) {
+ slice.writer.destroy();
+ }
+
+ // If no exception, we should have cleaned everything up:
+ assert tempDir.getCreatedFiles().isEmpty();
+ //long t2 = System.nanoTime();
+ //System.out.println("write time: " + ((t2-t1)/1000000.0) + " msec");
+
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.deleteFilesIgnoringExceptions(tempDir, tempDir.getCreatedFiles());
+ IOUtils.closeWhileHandlingException(toCloseHeroically);
+ }
+ }
+
+ //System.out.println("Total nodes: " + innerNodeCount);
+
+ // Write index:
+ long indexFP = out.getFilePointer();
+ writeIndex(out, leafBlockFPs, splitPackedValues);
+ return indexFP;
+ }
+
+ /** Subclass can change how it writes the index. */
+ private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
+ write(out, NUM_DIMS);
+ writeInt(out, numDims);
+ newline(out);
+
+ write(out, BYTES_PER_DIM);
+ writeInt(out, bytesPerDim);
+ newline(out);
+
+ write(out, MAX_LEAF_POINTS);
+ writeInt(out, maxPointsInLeafNode);
+ newline(out);
+
+ write(out, INDEX_COUNT);
+ writeInt(out, leafBlockFPs.length);
+ newline(out);
+
+ write(out, MIN_VALUE);
+ BytesRef br = new BytesRef(minPackedValue, 0, minPackedValue.length);
+ write(out, br.toString());
+ newline(out);
+
+ write(out, MAX_VALUE);
+ br = new BytesRef(maxPackedValue, 0, maxPackedValue.length);
+ write(out, br.toString());
+ newline(out);
+
+ write(out, POINT_COUNT);
+ writeLong(out, pointCount);
+ newline(out);
+
+ write(out, DOC_COUNT);
+ writeInt(out, docsSeen.cardinality());
+ newline(out);
+
+ for(int i=0;i<leafBlockFPs.length;i++) {
+ write(out, BLOCK_FP);
+ writeLong(out, leafBlockFPs[i]);
+ newline(out);
+ }
+
+ assert (splitPackedValues.length % (1 + bytesPerDim)) == 0;
+ int count = splitPackedValues.length / (1 + bytesPerDim);
+ assert count == leafBlockFPs.length;
+
+ write(out, SPLIT_COUNT);
+ writeInt(out, count);
+ newline(out);
+
+ for(int i=0;i<count;i++) {
+ write(out, SPLIT_DIM);
+ writeInt(out, splitPackedValues[i * (1 + bytesPerDim)] & 0xff);
+ newline(out);
+ write(out, SPLIT_VALUE);
+ br = new BytesRef(splitPackedValues, 1+(i * (1+bytesPerDim)), bytesPerDim);
+ write(out, br.toString());
+ newline(out);
+ }
+ }
+
+ protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
+ write(out, BLOCK_COUNT);
+ writeInt(out, count);
+ newline(out);
+ for(int i=0;i<count;i++) {
+ write(out, BLOCK_DOC_ID);
+ writeInt(out, docIDs[start+i]);
+ newline(out);
+ }
+ }
+
+ protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
+ for (int i = 0; i < count; ++i) {
+ BytesRef packedValue = packedValues.apply(i);
+ // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
+ write(out, BLOCK_VALUE);
+ write(out, packedValue.toString());
+ newline(out);
+ }
+ }
+
+ private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
+ for (int i = start; i < end; ++i) {
+ BytesRef ref = packedValues.apply(i);
+ assert ref.length == packedBytesLength;
+
+ for(int dim=0;dim<numDims;dim++) {
+ int prefix = commonPrefixLengths[dim];
+ out.writeBytes(ref.bytes, ref.offset + dim*bytesPerDim + prefix, bytesPerDim-prefix);
+ }
+ }
+ }
+
+ private static int runLen(IntFunction<BytesRef> packedValues, int start, int end, int byteOffset) {
+ BytesRef first = packedValues.apply(start);
+ byte b = first.bytes[first.offset + byteOffset];
+ for (int i = start + 1; i < end; ++i) {
+ BytesRef ref = packedValues.apply(i);
+ byte b2 = ref.bytes[ref.offset + byteOffset];
+ assert Byte.toUnsignedInt(b2) >= Byte.toUnsignedInt(b);
+ if (b != b2) {
+ return i - start;
+ }
+ }
+ return end - start;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (tempInput != null) {
+ // NOTE: this should only happen on exception, e.g. caller calls close w/o calling finish:
+ try {
+ tempInput.close();
+ } finally {
+ tempDir.deleteFile(tempInput.getName());
+ tempInput = null;
+ }
+ }
+ }
+
+ /** Sliced reference to points in an OfflineSorter.ByteSequencesWriter file. */
+ private static final class PathSlice {
+ final PointWriter writer;
+ final long start;
+ final long count;
+
+ public PathSlice(PointWriter writer, long start, long count) {
+ this.writer = writer;
+ this.start = start;
+ this.count = count;
+ }
+
+ @Override
+ public String toString() {
+ return "PathSlice(start=" + start + " count=" + count + " writer=" + writer + ")";
+ }
+ }
+
+ /** Called on exception, to check whether the checksum is also corrupt in this source, and add that
+ * information (checksum matched or didn't) as a suppressed exception. */
+ private void verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
+ // TODO: we could improve this, to always validate checksum as we recurse, if we shared left and
+ // right reader after recursing to children, and possibly within recursed children,
+ // since all together they make a single pass through the file. But this is a sizable re-org,
+ // and would mean leaving readers (IndexInputs) open for longer:
+ if (writer instanceof OfflinePointWriter) {
+ // We are reading from a temp file; go verify the checksum:
+ String tempFileName = ((OfflinePointWriter) writer).name;
+ try (ChecksumIndexInput in = tempDir.openChecksumInput(tempFileName, IOContext.READONCE)) {
+ CodecUtil.checkFooter(in, priorException);
+ }
+ } else {
+ // We are reading from heap; nothing to add:
+ IOUtils.reThrow(priorException);
+ }
+ }
+
+ /** Marks bits for the ords (points) that belong in the right sub tree (those docs that have values >= the splitValue). */
+ private byte[] markRightTree(long rightCount, int splitDim, PathSlice source, LongBitSet ordBitSet) throws IOException {
+
+ // Now we mark ords that fall into the right half, so we can partition on all other dims that are not the split dim:
+
+ // Read the split value, then mark all ords in the right tree (larger than the split value):
+
+ // TODO: find a way to also checksum this reader? If we changed to markLeftTree, and scanned the final chunk, it could work?
+ try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) {
+ boolean result = reader.next();
+ assert result;
+ System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
+ if (numDims > 1) {
+ assert ordBitSet.get(reader.ord()) == false;
+ ordBitSet.set(reader.ord());
+ // Subtract 1 from rightCount because we already did the first value above (so we could record the split value):
+ reader.markOrds(rightCount-1, ordBitSet);
+ }
+ } catch (Throwable t) {
+ verifyChecksum(t, source.writer);
+ }
+
+ return scratch1;
+ }
+
+ /** Called only in assert */
+ private boolean valueInBounds(BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = bytesPerDim*dim;
+ if (StringHelper.compare(bytesPerDim, packedValue.bytes, packedValue.offset + offset, minPackedValue, offset) < 0) {
+ return false;
+ }
+ if (StringHelper.compare(bytesPerDim, packedValue.bytes, packedValue.offset + offset, maxPackedValue, offset) > 0) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
+ // Find which dim has the largest span so we can split on it:
+ int splitDim = -1;
+ for(int dim=0;dim<numDims;dim++) {
+ NumericUtils.subtract(bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff);
+ if (splitDim == -1 || StringHelper.compare(bytesPerDim, scratchDiff, 0, scratch1, 0) > 0) {
+ System.arraycopy(scratchDiff, 0, scratch1, 0, bytesPerDim);
+ splitDim = dim;
+ }
+ }
+
+ //System.out.println("SPLIT: " + splitDim);
+ return splitDim;
+ }
+
+ /** Pull a partition back into heap once the point count is low enough while recursing. */
+ private PathSlice switchToHeap(PathSlice source, List<Closeable> toCloseHeroically) throws IOException {
+ int count = Math.toIntExact(source.count);
+ // Not inside the try because we don't want to close it here:
+ PointReader reader = source.writer.getSharedReader(source.start, source.count, toCloseHeroically);
+ try (PointWriter writer = new HeapPointWriter(count, count, packedBytesLength, longOrds, singleValuePerDoc)) {
+ for(int i=0;i<count;i++) {
+ boolean hasNext = reader.next();
+ assert hasNext;
+ writer.append(reader.packedValue(), reader.ord(), reader.docID());
+ }
+ return new PathSlice(writer, 0, count);
+ } catch (Throwable t) {
+ verifyChecksum(t, source.writer);
+
+ // Dead code but javac disagrees:
+ return null;
+ }
+ }
+
+ /* Recursively reorders the provided reader and writes the bkd-tree on the fly. */
+ private void build(int nodeID, int leafNodeOffset,
+ MutablePointValues reader, int from, int to,
+ IndexOutput out,
+ byte[] minPackedValue, byte[] maxPackedValue,
+ byte[] splitPackedValues,
+ long[] leafBlockFPs,
+ int[] spareDocIds) throws IOException {
+
+ if (nodeID >= leafNodeOffset) {
+ // leaf node
+ final int count = to - from;
+ assert count <= maxPointsInLeafNode;
+
+ // Compute common prefixes
+ Arrays.fill(commonPrefixLengths, bytesPerDim);
+ reader.getValue(from, scratchBytesRef1);
+ for (int i = from + 1; i < to; ++i) {
+ reader.getValue(i, scratchBytesRef2);
+ for (int dim=0;dim<numDims;dim++) {
+ final int offset = dim * bytesPerDim;
+ for(int j=0;j<commonPrefixLengths[dim];j++) {
+ if (scratchBytesRef1.bytes[scratchBytesRef1.offset+offset+j] != scratchBytesRef2.bytes[scratchBytesRef2.offset+offset+j]) {
+ commonPrefixLengths[dim] = j;
+ break;
+ }
+ }
+ }
+ }
+
+ // Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim]
+ FixedBitSet[] usedBytes = new FixedBitSet[numDims];
+ for (int dim = 0; dim < numDims; ++dim) {
+ if (commonPrefixLengths[dim] < bytesPerDim) {
+ usedBytes[dim] = new FixedBitSet(256);
+ }
+ }
+ for (int i = from + 1; i < to; ++i) {
+ for (int dim=0;dim<numDims;dim++) {
+ if (usedBytes[dim] != null) {
+ byte b = reader.getByteAt(i, dim * bytesPerDim + commonPrefixLengths[dim]);
+ usedBytes[dim].set(Byte.toUnsignedInt(b));
+ }
+ }
+ }
+ int sortedDim = 0;
+ int sortedDimCardinality = Integer.MAX_VALUE;
+ for (int dim = 0; dim < numDims; ++dim) {
+ if (usedBytes[dim] != null) {
+ final int cardinality = usedBytes[dim].cardinality();
+ if (cardinality < sortedDimCardinality) {
+ sortedDim = dim;
+ sortedDimCardinality = cardinality;
+ }
+ }
+ }
+
+ // sort by sortedDim
+ MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths,
+ reader, from, to, scratchBytesRef1, scratchBytesRef2);
+
+ // Save the block file pointer:
+ leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
+
+ // Write doc IDs
+ int[] docIDs = spareDocIds;
+ for (int i = from; i < to; ++i) {
+ docIDs[i - from] = reader.getDocID(i);
+ }
+ writeLeafBlockDocs(out, docIDs, 0, count);
+
+ // Write the common prefixes:
+ reader.getValue(from, scratchBytesRef1);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
+
+ // Write the full values:
+ IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
+ @Override
+ public BytesRef apply(int i) {
+ reader.getValue(from + i, scratchBytesRef1);
+ return scratchBytesRef1;
+ }
+ };
+ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
+ docIDs, 0);
+ writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
+
+ } else {
+ // inner node
+
+ // compute the split dimension and partition around it
+ final int splitDim = split(minPackedValue, maxPackedValue);
+ final int mid = (from + to + 1) >>> 1;
+
+ int commonPrefixLen = bytesPerDim;
+ for (int i = 0; i < bytesPerDim; ++i) {
+ if (minPackedValue[splitDim * bytesPerDim + i] != maxPackedValue[splitDim * bytesPerDim + i]) {
+ commonPrefixLen = i;
+ break;
+ }
+ }
+ MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
+ reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
+
+ // set the split value
+ final int address = nodeID * (1+bytesPerDim);
+ splitPackedValues[address] = (byte) splitDim;
+ reader.getValue(mid, scratchBytesRef1);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
+
+ byte[] minSplitPackedValue = Arrays.copyOf(minPackedValue, packedBytesLength);
+ byte[] maxSplitPackedValue = Arrays.copyOf(maxPackedValue, packedBytesLength);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
+ minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
+ maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
+
+ // recurse
+ build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
+ minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
+ build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out,
+ minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
+ }
+ }
+
+ /** The array (sized numDims) of PathSlice describe the cell we have currently recursed to. */
+ private void build(int nodeID, int leafNodeOffset,
+ PathSlice[] slices,
+ LongBitSet ordBitSet,
+ IndexOutput out,
+ byte[] minPackedValue, byte[] maxPackedValue,
+ byte[] splitPackedValues,
+ long[] leafBlockFPs,
+ List<Closeable> toCloseHeroically) throws IOException {
+
+ for(PathSlice slice : slices) {
+ assert slice.count == slices[0].count;
+ }
+
+ if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
+ // Special case for 1D, to cutover to heap once we recurse deeply enough:
+ slices[0] = switchToHeap(slices[0], toCloseHeroically);
+ }
+
+ if (nodeID >= leafNodeOffset) {
+
+ // Leaf node: write block
+ // We can write the block in any order so by default we write it sorted by the dimension that has the
+ // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient
+ int sortedDim = 0;
+ int sortedDimCardinality = Integer.MAX_VALUE;
+
+ for (int dim=0;dim<numDims;dim++) {
+ if (slices[dim].writer instanceof HeapPointWriter == false) {
+ // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
+ // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
+ slices[dim] = switchToHeap(slices[dim], toCloseHeroically);
+ }
+
+ PathSlice source = slices[dim];
+
+ HeapPointWriter heapSource = (HeapPointWriter) source.writer;
+
+ // Find common prefix by comparing first and last values, already sorted in this dimension:
+ heapSource.readPackedValue(Math.toIntExact(source.start), scratch1);
+ heapSource.readPackedValue(Math.toIntExact(source.start + source.count - 1), scratch2);
+
+ int offset = dim * bytesPerDim;
+ commonPrefixLengths[dim] = bytesPerDim;
+ for(int j=0;j<bytesPerDim;j++) {
+ if (scratch1[offset+j] != scratch2[offset+j]) {
+ commonPrefixLengths[dim] = j;
+ break;
+ }
+ }
+
+ int prefix = commonPrefixLengths[dim];
+ if (prefix < bytesPerDim) {
+ int cardinality = 1;
+ byte previous = scratch1[offset + prefix];
+ for (long i = 1; i < source.count; ++i) {
+ heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch2);
+ byte b = scratch2[offset + prefix];
+ assert Byte.toUnsignedInt(previous) <= Byte.toUnsignedInt(b);
+ if (b != previous) {
+ cardinality++;
+ previous = b;
+ }
+ }
+ assert cardinality <= 256;
+ if (cardinality < sortedDimCardinality) {
+ sortedDim = dim;
+ sortedDimCardinality = cardinality;
+ }
+ }
+ }
+
+ PathSlice source = slices[sortedDim];
+
+ // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
+ HeapPointWriter heapSource = (HeapPointWriter) source.writer;
+
+ // Save the block file pointer:
+ leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
+ //System.out.println(" write leaf block @ fp=" + out.getFilePointer());
+
+ // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
+ // loading the values:
+ int count = Math.toIntExact(source.count);
+ assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
+ writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count);
+
+ // TODO: minor opto: we don't really have to write the actual common prefixes, because BKDReader on recursing can regenerate it for us
+ // from the index, much like how terms dict does so from the FST:
+
+ // Write the full values:
+ IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
+ final BytesRef scratch = new BytesRef();
+
+ {
+ scratch.length = packedBytesLength;
+ }
+
+ @Override
+ public BytesRef apply(int i) {
+ heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratch);
+ return scratch;
+ }
+ };
+ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
+ heapSource.docIDs, Math.toIntExact(source.start));
+ writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
+
+ } else {
+ // Inner node: partition/recurse
+
+ int splitDim;
+ if (numDims > 1) {
+ splitDim = split(minPackedValue, maxPackedValue);
+ } else {
+ splitDim = 0;
+ }
+
+ PathSlice source = slices[splitDim];
+
+ assert nodeID < splitPackedValues.length: "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;
+
+ // How many points will be in the left tree:
+ long rightCount = source.count / 2;
+ long leftCount = source.count - rightCount;
+
+ byte[] splitValue = markRightTree(rightCount, splitDim, source, ordBitSet);
+ int address = nodeID * (1+bytesPerDim);
+ splitPackedValues[address] = (byte) splitDim;
+ System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim);
+
+ // Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse:
+
+ PathSlice[] leftSlices = new PathSlice[numDims];
+ PathSlice[] rightSlices = new PathSlice[numDims];
+
+ byte[] minSplitPackedValue = new byte[packedBytesLength];
+ System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedBytesLength);
+
+ byte[] maxSplitPackedValue = new byte[packedBytesLength];
+ System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, packedBytesLength);
+
+ // When we are on this dim, below, we clear the ordBitSet:
+ int dimToClear;
+ if (numDims - 1 == splitDim) {
+ dimToClear = numDims - 2;
+ } else {
+ dimToClear = numDims - 1;
+ }
+
+ for(int dim=0;dim<numDims;dim++) {
+
+ if (dim == splitDim) {
+ // No need to partition on this dim since it's a simple slice of the incoming already sorted slice, and we
+ // will re-use its shared reader when visiting it as we recurse:
+ leftSlices[dim] = new PathSlice(source.writer, source.start, leftCount);
+ rightSlices[dim] = new PathSlice(source.writer, source.start + leftCount, rightCount);
+ System.arraycopy(splitValue, 0, minSplitPackedValue, dim*bytesPerDim, bytesPerDim);
+ System.arraycopy(splitValue, 0, maxSplitPackedValue, dim*bytesPerDim, bytesPerDim);
+ continue;
+ }
+
+ // Not inside the try because we don't want to close this one now, so that after recursion is done,
+ // we will have done a singel full sweep of the file:
+ PointReader reader = slices[dim].writer.getSharedReader(slices[dim].start, slices[dim].count, toCloseHeroically);
+
+ try (PointWriter leftPointWriter = getPointWriter(leftCount, "left" + dim);
+ PointWriter rightPointWriter = getPointWriter(source.count - leftCount, "right" + dim)) {
+
+ long nextRightCount = reader.split(source.count, ordBitSet, leftPointWriter, rightPointWriter, dim == dimToClear);
+ if (rightCount != nextRightCount) {
+ throw new IllegalStateException("wrong number of points in split: expected=" + rightCount + " but actual=" + nextRightCount);
+ }
+
+ leftSlices[dim] = new PathSlice(leftPointWriter, 0, leftCount);
+ rightSlices[dim] = new PathSlice(rightPointWriter, 0, rightCount);
+ } catch (Throwable t) {
+ verifyChecksum(t, slices[dim].writer);
+ }
+ }
+
+ // Recurse on left tree:
+ build(2*nodeID, leafNodeOffset, leftSlices,
+ ordBitSet, out,
+ minPackedValue, maxSplitPackedValue,
+ splitPackedValues, leafBlockFPs, toCloseHeroically);
+ for(int dim=0;dim<numDims;dim++) {
+ // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
+ if (dim != splitDim) {
+ leftSlices[dim].writer.destroy();
+ }
+ }
+
+ // TODO: we could "tail recurse" here? have our parent discard its refs as we recurse right?
+ // Recurse on right tree:
+ build(2*nodeID+1, leafNodeOffset, rightSlices,
+ ordBitSet, out,
+ minSplitPackedValue, maxPackedValue,
+ splitPackedValues, leafBlockFPs, toCloseHeroically);
+ for(int dim=0;dim<numDims;dim++) {
+ // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
+ if (dim != splitDim) {
+ rightSlices[dim].writer.destroy();
+ }
+ }
+ }
+ }
+
+ // only called from assert
+ private boolean valuesInOrderAndBounds(int count, int sortedDim, byte[] minPackedValue, byte[] maxPackedValue,
+ IntFunction<BytesRef> values, int[] docs, int docsOffset) throws IOException {
+ byte[] lastPackedValue = new byte[packedBytesLength];
+ int lastDoc = -1;
+ for (int i=0;i<count;i++) {
+ BytesRef packedValue = values.apply(i);
+ assert packedValue.length == packedBytesLength;
+ assert valueInOrder(i, sortedDim, lastPackedValue, packedValue.bytes, packedValue.offset,
+ docs[docsOffset + i], lastDoc);
+ lastDoc = docs[docsOffset + i];
+
+ // Make sure this value does in fact fall within this leaf cell:
+ assert valueInBounds(packedValue, minPackedValue, maxPackedValue);
+ }
+ return true;
+ }
+
+ // only called from assert
+ private boolean valueInOrder(long ord, int sortedDim, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset,
+ int doc, int lastDoc) {
+ int dimOffset = sortedDim * bytesPerDim;
+ if (ord > 0) {
+ int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, dimOffset, packedValue, packedValueOffset + dimOffset);
+ if (cmp > 0) {
+ throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord);
+ }
+ if (cmp == 0 && doc < lastDoc) {
+ throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord);
+ }
+ }
+ System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, packedBytesLength);
+ return true;
+ }
+
+ PointWriter getPointWriter(long count, String desc) throws IOException {
+ if (count <= maxPointsSortInHeap) {
+ int size = Math.toIntExact(count);
+ return new HeapPointWriter(size, size, packedBytesLength, longOrds, singleValuePerDoc);
+ } else {
+ return new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength, longOrds, desc, count, singleValuePerDoc);
+ }
+ }
+
+ private void write(IndexOutput out, String s) throws IOException {
+ SimpleTextUtil.write(out, s, scratch);
+ }
+
+ private void writeInt(IndexOutput out, int x) throws IOException {
+ SimpleTextUtil.write(out, Integer.toString(x), scratch);
+ }
+
+ private void writeLong(IndexOutput out, long x) throws IOException {
+ SimpleTextUtil.write(out, Long.toString(x), scratch);
+ }
+
+ private void write(IndexOutput out, BytesRef b) throws IOException {
+ SimpleTextUtil.write(out, b);
+ }
+
+ private void newline(IndexOutput out) throws IOException {
+ SimpleTextUtil.writeNewline(out);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
index 2a01b52..8ca1635 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
@@ -35,7 +35,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.bkd.BKDReader;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_FP;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BYTES_PER_DIM;
@@ -57,7 +56,7 @@ class SimpleTextPointsReader extends PointsReader {
private final IndexInput dataIn;
final SegmentReadState readState;
- final Map<String,BKDReader> readers = new HashMap<>();
+ final Map<String,SimpleTextBKDReader> readers = new HashMap<>();
final BytesRefBuilder scratch = new BytesRefBuilder();
public SimpleTextPointsReader(SegmentReadState readState) throws IOException {
@@ -97,7 +96,7 @@ class SimpleTextPointsReader extends PointsReader {
this.readState = readState;
}
- private BKDReader initReader(long fp) throws IOException {
+ private SimpleTextBKDReader initReader(long fp) throws IOException {
// NOTE: matches what writeIndex does in SimpleTextPointsWriter
dataIn.seek(fp);
readLine(dataIn);
[5/5] lucene-solr:branch_6x: LUCENE-7563: fix 6.x backport
compilation errors
Posted by mi...@apache.org.
LUCENE-7563: fix 6.x backport compilation errors
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/0c8e8e39
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/0c8e8e39
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/0c8e8e39
Branch: refs/heads/branch_6x
Commit: 0c8e8e396a4ccc41e6af78ac7d0342716c36902a
Parents: fd1f608
Author: Mike McCandless <mi...@apache.org>
Authored: Wed Dec 7 07:37:26 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Wed Dec 7 07:37:26 2016 -0500
----------------------------------------------------------------------
.../codecs/simpletext/SimpleTextBKDReader.java | 13 ++--
.../codecs/simpletext/SimpleTextBKDWriter.java | 18 ++---
.../simpletext/SimpleTextPointsReader.java | 16 ++---
.../simpletext/SimpleTextPointsWriter.java | 4 +-
.../lucene/codecs/lucene60/package-info.java | 2 +-
.../org/apache/lucene/index/CheckIndex.java | 34 +++++-----
.../org/apache/lucene/util/bkd/BKDReader.java | 5 +-
.../org/apache/lucene/util/bkd/BKDWriter.java | 2 +-
.../util/bkd/MutablePointsReaderUtils.java | 8 +--
.../apache/lucene/util/bkd/Test2BBKDPoints.java | 70 ++++++++++++++++++--
10 files changed, 113 insertions(+), 59 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
index 488547b..bea7b62 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
@@ -21,7 +21,8 @@ import java.nio.charset.StandardCharsets;
import org.apache.lucene.codecs.simpletext.SimpleTextUtil;
import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
@@ -36,7 +37,7 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_V
/** Forked from {@link BKDReader} and simplified/specialized for SimpleText's usage */
-final class SimpleTextBKDReader extends PointValues implements Accountable {
+final class SimpleTextBKDReader implements Accountable {
// Packed array of byte[] holding all split values in the full binary tree:
final private byte[] splitPackedValues;
final long[] leafBlockFPs;
@@ -306,32 +307,26 @@ final class SimpleTextBKDReader extends PointValues implements Accountable {
RamUsageEstimator.sizeOf(leafBlockFPs);
}
- @Override
public byte[] getMinPackedValue() {
return minPackedValue.clone();
}
- @Override
public byte[] getMaxPackedValue() {
return maxPackedValue.clone();
}
- @Override
public int getNumDimensions() {
return numDims;
}
- @Override
public int getBytesPerDimension() {
return bytesPerDim;
}
- @Override
- public long size() {
+ public long getPointCount() {
return pointCount;
}
- @Override
public int getDocCount() {
return docCount;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java
index d7674ed..0dbc176 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDWriter.java
@@ -25,7 +25,7 @@ import java.util.List;
import java.util.function.IntFunction;
import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.codecs.MutablePointsReader;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
@@ -427,12 +427,12 @@ final class SimpleTextBKDWriter implements Closeable {
}
}
- /** Write a field from a {@link MutablePointValues}. This way of writing
+ /** Write a field from a {@link MutablePointsReader}. This way of writing
* points is faster than regular writes with {@link BKDWriter#add} since
* there is opportunity for reordering points before writing them to
* disk. This method does not use transient disk in order to reorder points.
*/
- public long writeField(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
+ public long writeField(IndexOutput out, String fieldName, MutablePointsReader reader) throws IOException {
if (numDims == 1) {
return writeField1Dim(out, fieldName, reader);
} else {
@@ -443,7 +443,7 @@ final class SimpleTextBKDWriter implements Closeable {
/* In the 2+D case, we recursively pick the split dimension, compute the
* median value and partition other values around it. */
- private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValues values) throws IOException {
+ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointsReader values) throws IOException {
if (pointCount != 0) {
throw new IllegalStateException("cannot mix add and writeField");
}
@@ -456,7 +456,7 @@ final class SimpleTextBKDWriter implements Closeable {
// Mark that we already finished:
heapPointWriter = null;
- long countPerLeaf = pointCount = values.size();
+ long countPerLeaf = pointCount = values.size(fieldName);
long innerNodeCount = 1;
while (countPerLeaf > maxPointsInLeafNode) {
@@ -501,12 +501,12 @@ final class SimpleTextBKDWriter implements Closeable {
/* In the 1D case, we can simply sort points in ascending order and use the
* same writing logic as we use at merge time. */
- private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
- MutablePointsReaderUtils.sort(maxDoc, packedBytesLength, reader, 0, Math.toIntExact(reader.size()));
+ private long writeField1Dim(IndexOutput out, String fieldName, MutablePointsReader reader) throws IOException {
+ MutablePointsReaderUtils.sort(maxDoc, packedBytesLength, reader, 0, Math.toIntExact(reader.size(fieldName)));
final OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(out);
- reader.intersect(new IntersectVisitor() {
+ reader.intersect(fieldName, new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {
@@ -1264,7 +1264,7 @@ final class SimpleTextBKDWriter implements Closeable {
/* Recursively reorders the provided reader and writes the bkd-tree on the fly. */
private void build(int nodeID, int leafNodeOffset,
- MutablePointValues reader, int from, int to,
+ MutablePointsReader reader, int from, int to,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues,
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
index 8ca1635..e6711e7 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsReader.java
@@ -184,7 +184,7 @@ class SimpleTextPointsReader extends PointsReader {
return new String(scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8);
}
- private BKDReader getBKDReader(String fieldName) {
+ private SimpleTextBKDReader getBKDReader(String fieldName) {
FieldInfo fieldInfo = readState.fieldInfos.fieldInfo(fieldName);
if (fieldInfo == null) {
throw new IllegalArgumentException("field=\"" + fieldName + "\" is unrecognized");
@@ -198,7 +198,7 @@ class SimpleTextPointsReader extends PointsReader {
/** Finds all documents and points matching the provided visitor */
@Override
public void intersect(String fieldName, IntersectVisitor visitor) throws IOException {
- BKDReader bkdReader = getBKDReader(fieldName);
+ SimpleTextBKDReader bkdReader = getBKDReader(fieldName);
if (bkdReader == null) {
// Schema ghost corner case! This field did index points in the past, but
// now all docs having this field were deleted in this segment:
@@ -246,7 +246,7 @@ class SimpleTextPointsReader extends PointsReader {
@Override
public byte[] getMinPackedValue(String fieldName) {
- BKDReader bkdReader = getBKDReader(fieldName);
+ SimpleTextBKDReader bkdReader = getBKDReader(fieldName);
if (bkdReader == null) {
// Schema ghost corner case! This field did index points in the past, but
// now all docs having this field were deleted in this segment:
@@ -257,7 +257,7 @@ class SimpleTextPointsReader extends PointsReader {
@Override
public byte[] getMaxPackedValue(String fieldName) {
- BKDReader bkdReader = getBKDReader(fieldName);
+ SimpleTextBKDReader bkdReader = getBKDReader(fieldName);
if (bkdReader == null) {
// Schema ghost corner case! This field did index points in the past, but
// now all docs having this field were deleted in this segment:
@@ -268,7 +268,7 @@ class SimpleTextPointsReader extends PointsReader {
@Override
public int getNumDimensions(String fieldName) {
- BKDReader bkdReader = getBKDReader(fieldName);
+ SimpleTextBKDReader bkdReader = getBKDReader(fieldName);
if (bkdReader == null) {
// Schema ghost corner case! This field did index points in the past, but
// now all docs having this field were deleted in this segment:
@@ -279,7 +279,7 @@ class SimpleTextPointsReader extends PointsReader {
@Override
public int getBytesPerDimension(String fieldName) {
- BKDReader bkdReader = getBKDReader(fieldName);
+ SimpleTextBKDReader bkdReader = getBKDReader(fieldName);
if (bkdReader == null) {
// Schema ghost corner case! This field did index points in the past, but
// now all docs having this field were deleted in this segment:
@@ -290,7 +290,7 @@ class SimpleTextPointsReader extends PointsReader {
@Override
public long size(String fieldName) {
- BKDReader bkdReader = getBKDReader(fieldName);
+ SimpleTextBKDReader bkdReader = getBKDReader(fieldName);
if (bkdReader == null) {
// Schema ghost corner case! This field did index points in the past, but
// now all docs having this field were deleted in this segment:
@@ -301,7 +301,7 @@ class SimpleTextPointsReader extends PointsReader {
@Override
public int getDocCount(String fieldName) {
- BKDReader bkdReader = getBKDReader(fieldName);
+ SimpleTextBKDReader bkdReader = getBKDReader(fieldName);
if (bkdReader == null) {
// Schema ghost corner case! This field did index points in the past, but
// now all docs having this field were deleted in this segment:
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
index e4b2c2c..0c62929 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
@@ -70,7 +70,7 @@ class SimpleTextPointsWriter extends PointsWriter {
boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
- // We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk:
+ // We use our own fork of the BKDWriter to customize how it writes the index and blocks to disk:
try (SimpleTextBKDWriter writer = new SimpleTextBKDWriter(writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
@@ -78,7 +78,7 @@ class SimpleTextPointsWriter extends PointsWriter {
fieldInfo.getPointNumBytes(),
SimpleTextBKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
SimpleTextBKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
- values.size(),
+ values.size(fieldInfo.name),
singleValuePerDoc)) {
values.intersect(fieldInfo.name, new IntersectVisitor() {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
index a914001..f6f0783 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
@@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene70}
+ * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene62}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene60;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index bc22d86..21a28e5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -1799,18 +1799,14 @@ public final class CheckIndex implements Closeable {
}
for (FieldInfo fieldInfo : fieldInfos) {
if (fieldInfo.getPointDimensionCount() > 0) {
- PointValues values = pointsReader.getValues(fieldInfo.name);
- if (values == null) {
- continue;
- }
status.totalValueFields++;
- long size = values.size();
- int docCount = values.getDocCount();
+ long size = values.size(fieldInfo.name);
+ int docCount = values.getDocCount(fieldInfo.name);
VerifyPointsVisitor visitor = new VerifyPointsVisitor(fieldInfo.name, reader.maxDoc(), values);
- values.intersect(visitor);
+ values.intersect(fieldInfo.name, visitor);
if (visitor.getPointCountSeen() != size) {
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + visitor.getPointCountSeen());
@@ -1863,34 +1859,34 @@ public final class CheckIndex implements Closeable {
public VerifyPointsVisitor(String fieldName, int maxDoc, PointValues values) throws IOException {
this.maxDoc = maxDoc;
this.fieldName = fieldName;
- numDims = values.getNumDimensions();
- bytesPerDim = values.getBytesPerDimension();
+ numDims = values.getNumDimensions(fieldName);
+ bytesPerDim = values.getBytesPerDimension(fieldName);
packedBytesCount = numDims * bytesPerDim;
- globalMinPackedValue = values.getMinPackedValue();
- globalMaxPackedValue = values.getMaxPackedValue();
+ globalMinPackedValue = values.getMinPackedValue(fieldName);
+ globalMaxPackedValue = values.getMaxPackedValue(fieldName);
docsSeen = new FixedBitSet(maxDoc);
lastMinPackedValue = new byte[packedBytesCount];
lastMaxPackedValue = new byte[packedBytesCount];
lastPackedValue = new byte[packedBytesCount];
- if (values.getDocCount() > values.size()) {
- throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have size=" + values.size() + " points and inconsistent docCount=" + values.getDocCount());
+ if (values.getDocCount(fieldName) > values.size(fieldName)) {
+ throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have size=" + values.size(fieldName) + " points and inconsistent docCount=" + values.getDocCount(fieldName));
}
- if (values.getDocCount() > maxDoc) {
- throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have docCount=" + values.getDocCount() + " but that's greater than maxDoc=" + maxDoc);
+ if (values.getDocCount(fieldName) > maxDoc) {
+ throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have docCount=" + values.getDocCount(fieldName) + " but that's greater than maxDoc=" + maxDoc);
}
if (globalMinPackedValue == null) {
- if (values.size() != 0) {
- throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size());
+ if (values.size(fieldName) != 0) {
+ throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size(fieldName));
}
} else if (globalMinPackedValue.length != packedBytesCount) {
throw new RuntimeException("getMinPackedValue for field \"" + fieldName + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount);
}
if (globalMaxPackedValue == null) {
- if (values.size() != 0) {
- throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size());
+ if (values.size(fieldName) != 0) {
+ throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size(fieldName));
}
} else if (globalMaxPackedValue.length != packedBytesCount) {
throw new RuntimeException("getMaxPackedValue for field \"" + fieldName + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index ef9bccc..4f4228d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -20,7 +20,8 @@ import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
@@ -33,7 +34,7 @@ import org.apache.lucene.util.StringHelper;
*
* @lucene.experimental */
-public final class BKDReader extends PointValues implements Accountable {
+public final class BKDReader implements Accountable {
// Packed array of byte[] holding all split values in the full binary tree:
final int leafNodeOffset;
final int numDims;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 2567eef..b9fd37cb 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -1451,7 +1451,7 @@ public class BKDWriter implements Closeable {
/* Recursively reorders the provided reader and writes the bkd-tree on the fly. */
private void build(int nodeID, int leafNodeOffset,
- MutablePointValues reader, int from, int to,
+ MutablePointsReader reader, int from, int to,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues,
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
index 9e84c8d..c7be5ba 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
@@ -33,9 +33,9 @@ public final class MutablePointsReaderUtils {
MutablePointsReaderUtils() {}
- /** Sort the given {@link MutablePointValues} based on its packed value then doc ID. */
+ /** Sort the given {@link MutablePointsReader} based on its packed value then doc ID. */
public static void sort(int maxDoc, int packedBytesLength,
- MutablePointValues reader, int from, int to) {
+ MutablePointsReader reader, int from, int to) {
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
new MSBRadixSorter(packedBytesLength + (bitsPerDocId + 7) / 8) {
@@ -92,7 +92,7 @@ public final class MutablePointsReaderUtils {
/** Sort points on the given dimension. */
public static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
- MutablePointValues reader, int from, int to,
+ MutablePointsReader reader, int from, int to,
BytesRef scratch1, BytesRef scratch2) {
// No need for a fancy radix sort here, this is called on the leaves only so
@@ -131,7 +131,7 @@ public final class MutablePointsReaderUtils {
* than or equal to it and all values on the right must be greater than or
* equal to it. */
public static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
- MutablePointValues reader, int from, int to, int mid,
+ MutablePointsReader reader, int from, int to, int mid,
BytesRef scratch1, BytesRef scratch2) {
final int offset = splitDim * bytesPerDim + commonPrefixLen;
final int cmpBytes = bytesPerDim - commonPrefixLen;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c8e8e39/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
index e30168c..a89a184 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
@@ -16,7 +16,10 @@
*/
package org.apache.lucene.util.bkd;
+import java.io.IOException;
+
import org.apache.lucene.index.CheckIndex;
+import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
@@ -65,9 +68,9 @@ public class Test2BBKDPoints extends LuceneTestCase {
IndexInput in = dir.openInput("1d.bkd", IOContext.DEFAULT);
in.seek(indexFP);
BKDReader r = new BKDReader(in);
- CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("1d", numDocs, r);
+ CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("1d", numDocs, new BKDReaderToPointValues("1d", r));
r.intersect(visitor);
- assertEquals(r.size(), visitor.getPointCountSeen());
+ assertEquals(r.getPointCount(), visitor.getPointCountSeen());
assertEquals(r.getDocCount(), visitor.getDocCountSeen());
in.close();
dir.close();
@@ -105,11 +108,70 @@ public class Test2BBKDPoints extends LuceneTestCase {
IndexInput in = dir.openInput("2d.bkd", IOContext.DEFAULT);
in.seek(indexFP);
BKDReader r = new BKDReader(in);
- CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("2d", numDocs, r);
+ CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("2d", numDocs, new BKDReaderToPointValues("2d", r));
r.intersect(visitor);
- assertEquals(r.size(), visitor.getPointCountSeen());
+ assertEquals(r.getPointCount(), visitor.getPointCountSeen());
assertEquals(r.getDocCount(), visitor.getDocCountSeen());
in.close();
dir.close();
}
+
+ private class BKDReaderToPointValues extends PointValues {
+
+ private final BKDReader bkdReader;
+ private final String fieldName;
+
+ public BKDReaderToPointValues(String fieldName, BKDReader bkdReader) {
+ this.fieldName = fieldName;
+ this.bkdReader = bkdReader;
+ }
+
+ @Override
+ public void intersect(String fieldNameIn, IntersectVisitor visitor) throws IOException {
+ verifyFieldName(fieldNameIn);
+ bkdReader.intersect(visitor);
+ }
+
+ @Override
+ public byte[] getMinPackedValue(String fieldNameIn) throws IOException {
+ verifyFieldName(fieldNameIn);
+ return bkdReader.getMinPackedValue();
+ }
+
+ @Override
+ public byte[] getMaxPackedValue(String fieldNameIn) throws IOException {
+ verifyFieldName(fieldNameIn);
+ return bkdReader.getMaxPackedValue();
+ }
+
+ @Override
+ public int getNumDimensions(String fieldNameIn) throws IOException {
+ verifyFieldName(fieldNameIn);
+ return bkdReader.getNumDimensions();
+ }
+
+ @Override
+ public int getBytesPerDimension(String fieldNameIn) throws IOException {
+ verifyFieldName(fieldNameIn);
+ return bkdReader.getBytesPerDimension();
+ }
+
+ @Override
+ public long size(String fieldNameIn) {
+ verifyFieldName(fieldNameIn);
+ return bkdReader.getPointCount();
+ }
+
+ @Override
+ public int getDocCount(String fieldNameIn) {
+ verifyFieldName(fieldNameIn);
+ return bkdReader.getDocCount();
+ }
+
+ private void verifyFieldName(String fieldNameIn) {
+ if (fieldName.equals(fieldNameIn) == false) {
+ throw new IllegalArgumentException("expected fieldName=\"" + fieldName + "\" but got \"" + fieldNameIn + "\"");
+ }
+ }
+ }
}
[2/5] lucene-solr:branch_6x: LUCENE-7563: use a compressed format for
the in-heap BKD index
Posted by mi...@apache.org.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
index 8d5c034..e4b2c2c 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
@@ -20,7 +20,6 @@ package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
-import java.util.function.IntFunction;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PointsWriter;
@@ -32,29 +31,28 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.bkd.BKDWriter;
class SimpleTextPointsWriter extends PointsWriter {
- final static BytesRef NUM_DIMS = new BytesRef("num dims ");
- final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim ");
- final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points ");
- final static BytesRef INDEX_COUNT = new BytesRef("index count ");
- final static BytesRef BLOCK_COUNT = new BytesRef("block count ");
- final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc ");
- final static BytesRef BLOCK_FP = new BytesRef(" block fp ");
- final static BytesRef BLOCK_VALUE = new BytesRef(" block value ");
- final static BytesRef SPLIT_COUNT = new BytesRef("split count ");
- final static BytesRef SPLIT_DIM = new BytesRef(" split dim ");
- final static BytesRef SPLIT_VALUE = new BytesRef(" split value ");
- final static BytesRef FIELD_COUNT = new BytesRef("field count ");
- final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name ");
- final static BytesRef FIELD_FP = new BytesRef(" field fp ");
- final static BytesRef MIN_VALUE = new BytesRef("min value ");
- final static BytesRef MAX_VALUE = new BytesRef("max value ");
- final static BytesRef POINT_COUNT = new BytesRef("point count ");
- final static BytesRef DOC_COUNT = new BytesRef("doc count ");
- final static BytesRef END = new BytesRef("END");
+ public final static BytesRef NUM_DIMS = new BytesRef("num dims ");
+ public final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim ");
+ public final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points ");
+ public final static BytesRef INDEX_COUNT = new BytesRef("index count ");
+ public final static BytesRef BLOCK_COUNT = new BytesRef("block count ");
+ public final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc ");
+ public final static BytesRef BLOCK_FP = new BytesRef(" block fp ");
+ public final static BytesRef BLOCK_VALUE = new BytesRef(" block value ");
+ public final static BytesRef SPLIT_COUNT = new BytesRef("split count ");
+ public final static BytesRef SPLIT_DIM = new BytesRef(" split dim ");
+ public final static BytesRef SPLIT_VALUE = new BytesRef(" split value ");
+ public final static BytesRef FIELD_COUNT = new BytesRef("field count ");
+ public final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name ");
+ public final static BytesRef FIELD_FP = new BytesRef(" field fp ");
+ public final static BytesRef MIN_VALUE = new BytesRef("min value ");
+ public final static BytesRef MAX_VALUE = new BytesRef("max value ");
+ public final static BytesRef POINT_COUNT = new BytesRef("point count ");
+ public final static BytesRef DOC_COUNT = new BytesRef("doc count ");
+ public final static BytesRef END = new BytesRef("END");
private IndexOutput dataOut;
final BytesRefBuilder scratch = new BytesRefBuilder();
@@ -73,105 +71,15 @@ class SimpleTextPointsWriter extends PointsWriter {
boolean singleValuePerDoc = values.size(fieldInfo.name) == values.getDocCount(fieldInfo.name);
// We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk:
- try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),
- writeState.directory,
- writeState.segmentInfo.name,
- fieldInfo.getPointDimensionCount(),
- fieldInfo.getPointNumBytes(),
- BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
- BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
- values.size(fieldInfo.name),
- singleValuePerDoc) {
-
- @Override
- protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
- write(out, NUM_DIMS);
- writeInt(out, numDims);
- newline(out);
-
- write(out, BYTES_PER_DIM);
- writeInt(out, bytesPerDim);
- newline(out);
-
- write(out, MAX_LEAF_POINTS);
- writeInt(out, maxPointsInLeafNode);
- newline(out);
-
- write(out, INDEX_COUNT);
- writeInt(out, leafBlockFPs.length);
- newline(out);
-
- write(out, MIN_VALUE);
- BytesRef br = new BytesRef(minPackedValue, 0, minPackedValue.length);
- write(out, br.toString());
- newline(out);
-
- write(out, MAX_VALUE);
- br = new BytesRef(maxPackedValue, 0, maxPackedValue.length);
- write(out, br.toString());
- newline(out);
-
- write(out, POINT_COUNT);
- writeLong(out, pointCount);
- newline(out);
-
- write(out, DOC_COUNT);
- writeInt(out, docsSeen.cardinality());
- newline(out);
-
- for(int i=0;i<leafBlockFPs.length;i++) {
- write(out, BLOCK_FP);
- writeLong(out, leafBlockFPs[i]);
- newline(out);
- }
-
- assert (splitPackedValues.length % (1 + fieldInfo.getPointNumBytes())) == 0;
- int count = splitPackedValues.length / (1 + fieldInfo.getPointNumBytes());
- assert count == leafBlockFPs.length;
-
- write(out, SPLIT_COUNT);
- writeInt(out, count);
- newline(out);
-
- for(int i=0;i<count;i++) {
- write(out, SPLIT_DIM);
- writeInt(out, splitPackedValues[i * (1 + fieldInfo.getPointNumBytes())] & 0xff);
- newline(out);
- write(out, SPLIT_VALUE);
- br = new BytesRef(splitPackedValues, 1+(i * (1+fieldInfo.getPointNumBytes())), fieldInfo.getPointNumBytes());
- write(out, br.toString());
- newline(out);
- }
- }
-
- @Override
- protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
- write(out, BLOCK_COUNT);
- writeInt(out, count);
- newline(out);
- for(int i=0;i<count;i++) {
- write(out, BLOCK_DOC_ID);
- writeInt(out, docIDs[start+i]);
- newline(out);
- }
- }
-
- @Override
- protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixLengths, byte[] packedValue) {
- // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
- }
-
- @Override
- protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
- for (int i = 0; i < count; ++i) {
- BytesRef packedValue = packedValues.apply(i);
- // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
- write(out, BLOCK_VALUE);
- write(out, packedValue.toString());
- newline(out);
- }
- }
- }) {
+ try (SimpleTextBKDWriter writer = new SimpleTextBKDWriter(writeState.segmentInfo.maxDoc(),
+ writeState.directory,
+ writeState.segmentInfo.name,
+ fieldInfo.getPointDimensionCount(),
+ fieldInfo.getPointNumBytes(),
+ SimpleTextBKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
+ SimpleTextBKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
+ values.size(),
+ singleValuePerDoc)) {
values.intersect(fieldInfo.name, new IntersectVisitor() {
@Override
@@ -196,26 +104,6 @@ class SimpleTextPointsWriter extends PointsWriter {
}
}
- private void write(IndexOutput out, String s) throws IOException {
- SimpleTextUtil.write(out, s, scratch);
- }
-
- private void writeInt(IndexOutput out, int x) throws IOException {
- SimpleTextUtil.write(out, Integer.toString(x), scratch);
- }
-
- private void writeLong(IndexOutput out, long x) throws IOException {
- SimpleTextUtil.write(out, Long.toString(x), scratch);
- }
-
- private void write(IndexOutput out, BytesRef b) throws IOException {
- SimpleTextUtil.write(out, b);
- }
-
- private void newline(IndexOutput out) throws IOException {
- SimpleTextUtil.writeNewline(out);
- }
-
@Override
public void finish() throws IOException {
SimpleTextUtil.write(dataOut, END);
@@ -248,4 +136,24 @@ class SimpleTextPointsWriter extends PointsWriter {
}
}
}
+
+ private void write(IndexOutput out, String s) throws IOException {
+ SimpleTextUtil.write(out, s, scratch);
+ }
+
+ private void writeInt(IndexOutput out, int x) throws IOException {
+ SimpleTextUtil.write(out, Integer.toString(x), scratch);
+ }
+
+ private void writeLong(IndexOutput out, long x) throws IOException {
+ SimpleTextUtil.write(out, Long.toString(x), scratch);
+ }
+
+ private void write(IndexOutput out, BytesRef b) throws IOException {
+ SimpleTextUtil.write(out, b);
+ }
+
+ private void newline(IndexOutput out) throws IOException {
+ SimpleTextUtil.writeNewline(out);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java
index e558d0d..1d2285c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsFormat.java
@@ -28,7 +28,8 @@ import org.apache.lucene.index.SegmentWriteState;
/**
* Lucene 6.0 point format, which encodes dimensional values in a block KD-tree structure
- * for fast shape intersection filtering. See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
+ * for fast 1D range and N dimesional shape intersection filtering.
+ * See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
*
* <p>This data structure is written as a series of blocks on disk, with an in-memory perfectly balanced
* binary tree of split values referencing those blocks at the leaves.
@@ -50,10 +51,13 @@ import org.apache.lucene.index.SegmentWriteState;
* <li> maxPointsInLeafNode (vInt)
* <li> bytesPerDim (vInt)
* <li> count (vInt)
- * <li> byte[bytesPerDim]<sup>count</sup> (packed <code>byte[]</code> all split values)
- * <li> delta-blockFP (vLong)<sup>count</sup> (delta-coded file pointers to the on-disk leaf blocks))
+ * <li> packed index (byte[])
* </ul>
*
+ * <p>The packed index uses hierarchical delta and prefix coding to compactly encode the file pointer for
+ * all leaf blocks, once the tree is traversed, as well as the split dimension and split value for each
+ * inner node of the tree.
+ *
* <p>After all fields blocks + index data are written, {@link CodecUtil#writeFooter} writes the checksum.
*
* <p>The <code>.dii</code> file records the file pointer in the <code>.dim</code> file where each field's
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
index 8968a6d..a914001 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
@@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene62}
- * for an overview of the index format.
+ * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene70}
+ * for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene60;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index 6785f3a..bc22d86 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -1799,158 +1799,32 @@ public final class CheckIndex implements Closeable {
}
for (FieldInfo fieldInfo : fieldInfos) {
if (fieldInfo.getPointDimensionCount() > 0) {
- FixedBitSet docsSeen = new FixedBitSet(reader.maxDoc());
- status.totalValueFields++;
- int dimCount = fieldInfo.getPointDimensionCount();
- int bytesPerDim = fieldInfo.getPointNumBytes();
- int packedBytesCount = dimCount * bytesPerDim;
- byte[] lastMinPackedValue = new byte[packedBytesCount];
- byte[] lastMaxPackedValue = new byte[packedBytesCount];
- BytesRef scratch = new BytesRef();
- scratch.length = bytesPerDim;
- byte[] lastPackedValue = new byte[packedBytesCount];
-
- long[] pointCountSeen = new long[1];
-
- byte[] globalMinPackedValue = values.getMinPackedValue(fieldInfo.name);
- long size = values.size(fieldInfo.name);
- int docCount = values.getDocCount(fieldInfo.name);
-
- if (docCount > size) {
- throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points and inconsistent docCount=" + docCount);
+ PointValues values = pointsReader.getValues(fieldInfo.name);
+ if (values == null) {
+ continue;
}
- if (docCount > reader.maxDoc()) {
- throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but that's greater than maxDoc=" + reader.maxDoc());
- }
+ status.totalValueFields++;
- if (globalMinPackedValue == null) {
- if (size != 0) {
- throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size);
- }
- } else if (globalMinPackedValue.length != packedBytesCount) {
- throw new RuntimeException("getMinPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount);
- }
- byte[] globalMaxPackedValue = values.getMaxPackedValue(fieldInfo.name);
- if (globalMaxPackedValue == null) {
- if (size != 0) {
- throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size);
- }
- } else if (globalMaxPackedValue.length != packedBytesCount) {
- throw new RuntimeException("getMaxPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount);
- }
+ long size = values.size();
+ int docCount = values.getDocCount();
+
+ VerifyPointsVisitor visitor = new VerifyPointsVisitor(fieldInfo.name, reader.maxDoc(), values);
+ values.intersect(visitor);
- values.intersect(fieldInfo.name,
- new PointValues.IntersectVisitor() {
-
- private int lastDocID = -1;
-
- @Override
- public void visit(int docID) {
- throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID);
- }
-
- @Override
- public void visit(int docID, byte[] packedValue) {
- checkPackedValue("packed value", packedValue, docID);
- pointCountSeen[0]++;
- docsSeen.set(docID);
-
- for(int dim=0;dim<dimCount;dim++) {
- int offset = bytesPerDim * dim;
-
- // Compare to last cell:
- if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMinPackedValue, offset) < 0) {
- // This doc's point, in this dimension, is lower than the minimum value of the last cell checked:
- throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
- }
-
- if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMaxPackedValue, offset) > 0) {
- // This doc's point, in this dimension, is greater than the maximum value of the last cell checked:
- throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
- }
- }
-
- // In the 1D case, PointValues must make a single in-order sweep through all values, and tie-break by
- // increasing docID:
- if (dimCount == 1) {
- int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0);
- if (cmp > 0) {
- throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", for docID=" + docID + " is out-of-order vs the previous document's value " + Arrays.toString(lastPackedValue));
- } else if (cmp == 0) {
- if (docID < lastDocID) {
- throw new RuntimeException("packed points value is the same, but docID=" + docID + " is out of order vs previous docID=" + lastDocID + ", field=\"" + fieldInfo.name + "\"");
- }
- }
- System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
- lastDocID = docID;
- }
-
- status.totalValuePoints++;
- }
-
- @Override
- public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
- checkPackedValue("min packed value", minPackedValue, -1);
- System.arraycopy(minPackedValue, 0, lastMinPackedValue, 0, packedBytesCount);
- checkPackedValue("max packed value", maxPackedValue, -1);
- System.arraycopy(maxPackedValue, 0, lastMaxPackedValue, 0, packedBytesCount);
-
- for(int dim=0;dim<dimCount;dim++) {
- int offset = bytesPerDim * dim;
-
- if (StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) > 0) {
- throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
- " is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
- }
-
- // Make sure this cell is not outside of the global min/max:
- if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) {
- throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
- " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
- }
-
- if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) {
- throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
- " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
- }
-
- if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) {
- throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
- " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
- }
- if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) {
- throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
- " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
- }
- }
-
- // We always pretend the query shape is so complex that it crosses every cell, so
- // that packedValue is passed for every document
- return PointValues.Relation.CELL_CROSSES_QUERY;
- }
-
- private void checkPackedValue(String desc, byte[] packedValue, int docID) {
- if (packedValue == null) {
- throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldInfo.name + "\"");
- }
-
- if (packedValue.length != packedBytesCount) {
- throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldInfo.name + "\"");
- }
- }
- });
-
- if (pointCountSeen[0] != size) {
- throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + pointCountSeen[0]);
+ if (visitor.getPointCountSeen() != size) {
+ throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + visitor.getPointCountSeen());
}
- if (docsSeen.cardinality() != docCount) {
- throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + docsSeen.cardinality());
+ if (visitor.getDocCountSeen() != docCount) {
+ throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + visitor.getDocCountSeen());
}
+
+ status.totalValuePoints += visitor.getPointCountSeen();
}
}
}
+
msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d points] [took %.3f sec]", status.totalValueFields, status.totalValuePoints, nsToSec(System.nanoTime()-startNS)));
} catch (Throwable e) {
@@ -1967,6 +1841,167 @@ public final class CheckIndex implements Closeable {
return status;
}
+ /** Walks the entire N-dimensional points space, verifying that all points fall within the last cell's boundaries.
+ *
+ * @lucene.internal */
+ public static class VerifyPointsVisitor implements PointValues.IntersectVisitor {
+ private long pointCountSeen;
+ private int lastDocID = -1;
+ private final int maxDoc;
+ private final FixedBitSet docsSeen;
+ private final byte[] lastMinPackedValue;
+ private final byte[] lastMaxPackedValue;
+ private final byte[] lastPackedValue;
+ private final byte[] globalMinPackedValue;
+ private final byte[] globalMaxPackedValue;
+ private final int packedBytesCount;
+ private final int numDims;
+ private final int bytesPerDim;
+ private final String fieldName;
+
+ /** Sole constructor */
+ public VerifyPointsVisitor(String fieldName, int maxDoc, PointValues values) throws IOException {
+ this.maxDoc = maxDoc;
+ this.fieldName = fieldName;
+ numDims = values.getNumDimensions();
+ bytesPerDim = values.getBytesPerDimension();
+ packedBytesCount = numDims * bytesPerDim;
+ globalMinPackedValue = values.getMinPackedValue();
+ globalMaxPackedValue = values.getMaxPackedValue();
+ docsSeen = new FixedBitSet(maxDoc);
+ lastMinPackedValue = new byte[packedBytesCount];
+ lastMaxPackedValue = new byte[packedBytesCount];
+ lastPackedValue = new byte[packedBytesCount];
+
+ if (values.getDocCount() > values.size()) {
+ throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have size=" + values.size() + " points and inconsistent docCount=" + values.getDocCount());
+ }
+
+ if (values.getDocCount() > maxDoc) {
+ throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have docCount=" + values.getDocCount() + " but that's greater than maxDoc=" + maxDoc);
+ }
+
+ if (globalMinPackedValue == null) {
+ if (values.size() != 0) {
+ throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size());
+ }
+ } else if (globalMinPackedValue.length != packedBytesCount) {
+ throw new RuntimeException("getMinPackedValue for field \"" + fieldName + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount);
+ }
+ if (globalMaxPackedValue == null) {
+ if (values.size() != 0) {
+ throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size());
+ }
+ } else if (globalMaxPackedValue.length != packedBytesCount) {
+ throw new RuntimeException("getMaxPackedValue for field \"" + fieldName + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount);
+ }
+ }
+
+ /** Returns total number of points in this BKD tree */
+ public long getPointCountSeen() {
+ return pointCountSeen;
+ }
+
+ /** Returns total number of unique docIDs in this BKD tree */
+ public long getDocCountSeen() {
+ return docsSeen.cardinality();
+ }
+
+ @Override
+ public void visit(int docID) {
+ throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID);
+ }
+
+ @Override
+ public void visit(int docID, byte[] packedValue) {
+ checkPackedValue("packed value", packedValue, docID);
+ pointCountSeen++;
+ docsSeen.set(docID);
+
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = bytesPerDim * dim;
+
+ // Compare to last cell:
+ if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMinPackedValue, offset) < 0) {
+ // This doc's point, in this dimension, is lower than the minimum value of the last cell checked:
+ throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
+ }
+
+ if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMaxPackedValue, offset) > 0) {
+ // This doc's point, in this dimension, is greater than the maximum value of the last cell checked:
+ throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
+ }
+ }
+
+ // In the 1D case, PointValues must make a single in-order sweep through all values, and tie-break by
+ // increasing docID:
+ if (numDims == 1) {
+ int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0);
+ if (cmp > 0) {
+ throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", for docID=" + docID + " is out-of-order vs the previous document's value " + Arrays.toString(lastPackedValue));
+ } else if (cmp == 0) {
+ if (docID < lastDocID) {
+ throw new RuntimeException("packed points value is the same, but docID=" + docID + " is out of order vs previous docID=" + lastDocID + ", field=\"" + fieldName + "\"");
+ }
+ }
+ System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
+ lastDocID = docID;
+ }
+ }
+
+ @Override
+ public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ checkPackedValue("min packed value", minPackedValue, -1);
+ System.arraycopy(minPackedValue, 0, lastMinPackedValue, 0, packedBytesCount);
+ checkPackedValue("max packed value", maxPackedValue, -1);
+ System.arraycopy(maxPackedValue, 0, lastMaxPackedValue, 0, packedBytesCount);
+
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = bytesPerDim * dim;
+
+ if (StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) > 0) {
+ throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
+ " is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
+ }
+
+ // Make sure this cell is not outside of the global min/max:
+ if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) {
+ throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
+ " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
+ }
+
+ if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) {
+ throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
+ " is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
+ }
+
+ if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) {
+ throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
+ " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
+ }
+ if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) {
+ throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
+ " is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
+ }
+ }
+
+ // We always pretend the query shape is so complex that it crosses every cell, so
+ // that packedValue is passed for every document
+ return PointValues.Relation.CELL_CROSSES_QUERY;
+ }
+
+ private void checkPackedValue(String desc, byte[] packedValue, int docID) {
+ if (packedValue == null) {
+ throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldName + "\"");
+ }
+
+ if (packedValue.length != packedBytesCount) {
+ throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldName + "\"");
+ }
+ }
+ }
+
+
/**
* Test stored fields.
* @lucene.experimental
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f51766c0/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 10285e8..8970124 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -17,15 +17,15 @@
package org.apache.lucene.util.bkd;
import java.io.IOException;
-import java.util.Arrays;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.PointValues.IntersectVisitor;
-import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
@@ -33,14 +33,12 @@ import org.apache.lucene.util.StringHelper;
*
* @lucene.experimental */
-public class BKDReader implements Accountable {
+public final class BKDReader extends PointValues implements Accountable {
// Packed array of byte[] holding all split values in the full binary tree:
- final private byte[] splitPackedValues;
- final long[] leafBlockFPs;
- final private int leafNodeOffset;
+ final int leafNodeOffset;
final int numDims;
final int bytesPerDim;
- final int bytesPerIndexEntry;
+ final int numLeaves;
final IndexInput in;
final int maxPointsInLeafNode;
final byte[] minPackedValue;
@@ -50,6 +48,14 @@ public class BKDReader implements Accountable {
final int version;
protected final int packedBytesLength;
+ // Used for 6.4.0+ index format:
+ final byte[] packedIndex;
+
+ // Used for Legacy (pre-6.4.0) index format, to hold a compact form of the index:
+ final private byte[] splitPackedValues;
+ final int bytesPerIndexEntry;
+ final long[] leafBlockFPs;
+
/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
public BKDReader(IndexInput in) throws IOException {
version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
@@ -60,7 +66,7 @@ public class BKDReader implements Accountable {
packedBytesLength = numDims * bytesPerDim;
// Read index:
- int numLeaves = in.readVInt();
+ numLeaves = in.readVInt();
assert numLeaves > 0;
leafNodeOffset = numLeaves;
@@ -79,205 +85,380 @@ public class BKDReader implements Accountable {
pointCount = in.readVLong();
docCount = in.readVInt();
- splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
-
- // TODO: don't write split packed values[0]!
- in.readBytes(splitPackedValues, 0, splitPackedValues.length);
-
- // Read the file pointers to the start of each leaf block:
- long[] leafBlockFPs = new long[numLeaves];
- long lastFP = 0;
- for(int i=0;i<numLeaves;i++) {
- long delta = in.readVLong();
- leafBlockFPs[i] = lastFP + delta;
- lastFP += delta;
- }
-
- // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
- // if it was created by BKDWriter.merge). In this case the leaf nodes may straddle the two bottom
- // levels of the binary tree:
- if (numDims == 1 && numLeaves > 1) {
- //System.out.println("BKDR: numLeaves=" + numLeaves);
- int levelCount = 2;
- while (true) {
- //System.out.println(" cycle levelCount=" + levelCount);
- if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
- int lastLevel = 2*(numLeaves - levelCount);
- assert lastLevel >= 0;
- /*
- System.out.println("BKDR: lastLevel=" + lastLevel + " vs " + levelCount);
- System.out.println("FPs before:");
- for(int i=0;i<leafBlockFPs.length;i++) {
- System.out.println(" " + i + " " + leafBlockFPs[i]);
- }
- */
- if (lastLevel != 0) {
- // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
- // at read-time, so that we can still delta code them on disk at write:
- //System.out.println("BKDR: now rotate index");
- long[] newLeafBlockFPs = new long[numLeaves];
- System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
- System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
- leafBlockFPs = newLeafBlockFPs;
- }
- /*
- System.out.println("FPs:");
- for(int i=0;i<leafBlockFPs.length;i++) {
- System.out.println(" " + i + " " + leafBlockFPs[i]);
+ if (version >= BKDWriter.VERSION_PACKED_INDEX) {
+ int numBytes = in.readVInt();
+ packedIndex = new byte[numBytes];
+ in.readBytes(packedIndex, 0, numBytes);
+ leafBlockFPs = null;
+ splitPackedValues = null;
+ } else {
+ // legacy un-packed index
+
+ splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
+
+ in.readBytes(splitPackedValues, 0, splitPackedValues.length);
+
+ // Read the file pointers to the start of each leaf block:
+ long[] leafBlockFPs = new long[numLeaves];
+ long lastFP = 0;
+ for(int i=0;i<numLeaves;i++) {
+ long delta = in.readVLong();
+ leafBlockFPs[i] = lastFP + delta;
+ lastFP += delta;
+ }
+
+ // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
+ // if it was created by BKDWriter.merge or OneDimWriter). In this case the leaf nodes may straddle the two bottom
+ // levels of the binary tree:
+ if (numDims == 1 && numLeaves > 1) {
+ int levelCount = 2;
+ while (true) {
+ if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
+ int lastLevel = 2*(numLeaves - levelCount);
+ assert lastLevel >= 0;
+ if (lastLevel != 0) {
+ // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
+ // at read-time, so that we can still delta code them on disk at write:
+ long[] newLeafBlockFPs = new long[numLeaves];
+ System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
+ System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
+ leafBlockFPs = newLeafBlockFPs;
+ }
+ break;
}
- */
- break;
- }
- levelCount *= 2;
+ levelCount *= 2;
+ }
}
+
+ this.leafBlockFPs = leafBlockFPs;
+ packedIndex = null;
}
- this.leafBlockFPs = leafBlockFPs;
this.in = in;
}
- /** Called by consumers that have their own on-disk format for the index (e.g. SimpleText) */
- protected BKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
- byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
- this.in = in;
- this.numDims = numDims;
- this.maxPointsInLeafNode = maxPointsInLeafNode;
- this.bytesPerDim = bytesPerDim;
- // no version check here because callers of this API (SimpleText) have no back compat:
- bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1;
- packedBytesLength = numDims * bytesPerDim;
- this.leafNodeOffset = leafBlockFPs.length;
- this.leafBlockFPs = leafBlockFPs;
- this.splitPackedValues = splitPackedValues;
- this.minPackedValue = minPackedValue;
- this.maxPackedValue = maxPackedValue;
- this.pointCount = pointCount;
- this.docCount = docCount;
- this.version = BKDWriter.VERSION_CURRENT;
- assert minPackedValue.length == packedBytesLength;
- assert maxPackedValue.length == packedBytesLength;
+ long getMinLeafBlockFP() {
+ if (packedIndex != null) {
+ return new ByteArrayDataInput(packedIndex).readVLong();
+ } else {
+ long minFP = Long.MAX_VALUE;
+ for(long fp : leafBlockFPs) {
+ minFP = Math.min(minFP, fp);
+ }
+ return minFP;
+ }
}
- private static class VerifyVisitor implements IntersectVisitor {
- byte[] cellMinPacked;
- byte[] cellMaxPacked;
- byte[] lastPackedValue;
- final int numDims;
- final int bytesPerDim;
- final int maxDoc;
+ /** Used to walk the in-heap index
+ *
+ * @lucene.internal */
+ public abstract class IndexTree implements Cloneable {
+ protected int nodeID;
+ // level is 1-based so that we can do level-1 w/o checking each time:
+ protected int level;
+ protected int splitDim;
+ protected final byte[][] splitPackedValueStack;
+
+ protected IndexTree() {
+ int treeDepth = getTreeDepth();
+ splitPackedValueStack = new byte[treeDepth+1][];
+ nodeID = 1;
+ level = 1;
+ splitPackedValueStack[level] = new byte[packedBytesLength];
+ }
+
+ public void pushLeft() {
+ nodeID *= 2;
+ level++;
+ if (splitPackedValueStack[level] == null) {
+ splitPackedValueStack[level] = new byte[packedBytesLength];
+ }
+ }
+
+ /** Clone, but you are not allowed to pop up past the point where the clone happened. */
+ public abstract IndexTree clone();
+
+ public void pushRight() {
+ nodeID = nodeID * 2 + 1;
+ level++;
+ if (splitPackedValueStack[level] == null) {
+ splitPackedValueStack[level] = new byte[packedBytesLength];
+ }
+ }
+
+ public void pop() {
+ nodeID /= 2;
+ level--;
+ splitDim = -1;
+ //System.out.println(" pop nodeID=" + nodeID);
+ }
- public VerifyVisitor(int numDims, int bytesPerDim, int maxDoc) {
- this.numDims = numDims;
- this.bytesPerDim = bytesPerDim;
- this.maxDoc = maxDoc;
+ public boolean isLeafNode() {
+ return nodeID >= leafNodeOffset;
}
- @Override
- public void visit(int docID) {
- throw new UnsupportedOperationException();
+ public boolean nodeExists() {
+ return nodeID - leafNodeOffset < leafNodeOffset;
+ }
+
+ public int getNodeID() {
+ return nodeID;
+ }
+
+ public byte[] getSplitPackedValue() {
+ assert isLeafNode() == false;
+ assert splitPackedValueStack[level] != null: "level=" + level;
+ return splitPackedValueStack[level];
+ }
+
+ /** Only valid after pushLeft or pushRight, not pop! */
+ public int getSplitDim() {
+ assert isLeafNode() == false;
+ return splitDim;
+ }
+
+ /** Only valid after pushLeft or pushRight, not pop! */
+ public abstract BytesRef getSplitDimValue();
+
+ /** Only valid after pushLeft or pushRight, not pop! */
+ public abstract long getLeafBlockFP();
+ }
+
+ /** Reads the original simple yet heap-heavy index format */
+ private final class LegacyIndexTree extends IndexTree {
+
+ private long leafBlockFP;
+ private final byte[] splitDimValue = new byte[bytesPerDim];
+ private final BytesRef scratch = new BytesRef();
+
+ public LegacyIndexTree() {
+ setNodeData();
+ scratch.bytes = splitDimValue;
+ scratch.length = bytesPerDim;
}
@Override
- public void visit(int docID, byte[] packedValue) {
- if (docID < 0 || docID >= maxDoc) {
- throw new RuntimeException("docID=" + docID + " is out of bounds of 0.." + maxDoc);
- }
- for(int dim=0;dim<numDims;dim++) {
- if (StringHelper.compare(bytesPerDim, cellMinPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) > 0) {
- throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is less than this leaf block's minimum=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim));
- }
- if (StringHelper.compare(bytesPerDim, cellMaxPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) < 0) {
- throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is greater than this leaf block's maximum=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
- }
- }
+ public LegacyIndexTree clone() {
+ LegacyIndexTree index = new LegacyIndexTree();
+ index.nodeID = nodeID;
+ index.level = level;
+ index.splitDim = splitDim;
+ index.leafBlockFP = leafBlockFP;
+ index.splitPackedValueStack[index.level] = splitPackedValueStack[index.level].clone();
+
+ return index;
+ }
+
+ @Override
+ public void pushLeft() {
+ super.pushLeft();
+ setNodeData();
+ }
+
+ @Override
+ public void pushRight() {
+ super.pushRight();
+ setNodeData();
+ }
- if (numDims == 1) {
- // With only 1D, all values should always be in sorted order
- if (lastPackedValue == null) {
- lastPackedValue = Arrays.copyOf(packedValue, packedValue.length);
- } else if (StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0) > 0) {
- throw new RuntimeException("value=" + new BytesRef(packedValue) + " for docID=" + docID + " dim=0" + " sorts before last value=" + new BytesRef(lastPackedValue));
+ private void setNodeData() {
+ if (isLeafNode()) {
+ leafBlockFP = leafBlockFPs[nodeID - leafNodeOffset];
+ splitDim = -1;
+ } else {
+ leafBlockFP = -1;
+ int address = nodeID * bytesPerIndexEntry;
+ if (numDims == 1) {
+ splitDim = 0;
+ if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
+ // skip over wastefully encoded 0 splitDim:
+ assert splitPackedValues[address] == 0;
+ address++;
+ }
} else {
- System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
+ splitDim = splitPackedValues[address++] & 0xff;
}
+ System.arraycopy(splitPackedValues, address, splitDimValue, 0, bytesPerDim);
}
}
@Override
- public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
- throw new UnsupportedOperationException();
+ public long getLeafBlockFP() {
+ assert isLeafNode();
+ return leafBlockFP;
+ }
+
+ @Override
+ public BytesRef getSplitDimValue() {
+ assert isLeafNode() == false;
+ return scratch;
}
- }
- /** Only used for debugging, to make sure all values in each leaf block fall within the range expected by the index */
- // TODO: maybe we can get this into CheckIndex?
- public void verify(int maxDoc) throws IOException {
- //System.out.println("BKDR.verify this=" + this);
- // Visits every doc in every leaf block and confirms that
- // their values agree with the index:
- byte[] rootMinPacked = new byte[packedBytesLength];
- byte[] rootMaxPacked = new byte[packedBytesLength];
- Arrays.fill(rootMaxPacked, (byte) 0xff);
- verify(getIntersectState(new VerifyVisitor(numDims, bytesPerDim, maxDoc)), 1, rootMinPacked, rootMaxPacked);
+ @Override
+ public void pop() {
+ super.pop();
+ leafBlockFP = -1;
+ }
}
- private void verify(IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException {
+ /** Reads the new packed byte[] index format which can be up to ~63% smaller than the legacy index format on 20M NYC taxis tests. This
+ * format takes advantage of the limited access pattern to the BKD tree at search time, i.e. starting at the root node and recursing
+ * downwards one child at a time. */
+ private final class PackedIndexTree extends IndexTree {
+ // used to read the packed byte[]
+ private final ByteArrayDataInput in;
+ // holds the minimum (left most) leaf block file pointer for each level we've recursed to:
+ private final long[] leafBlockFPStack;
+ // holds the address, in the packed byte[] index, of the left-node of each level:
+ private final int[] leftNodePositions;
+ // holds the address, in the packed byte[] index, of the right-node of each level:
+ private final int[] rightNodePositions;
+ // holds the splitDim for each level:
+ private final int[] splitDims;
+ // true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed
+ // 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim]. this will be true if the last time we
+ // split on this dimension, we next pushed to the left sub-tree:
+ private final boolean[] negativeDeltas;
+ // holds the packed per-level split values; the intersect method uses this to save the cell min/max as it recurses:
+ private final byte[][] splitValuesStack;
+ // scratch value to return from getPackedValue:
+ private final BytesRef scratch;
+
+ public PackedIndexTree() {
+ int treeDepth = getTreeDepth();
+ leafBlockFPStack = new long[treeDepth+1];
+ leftNodePositions = new int[treeDepth+1];
+ rightNodePositions = new int[treeDepth+1];
+ splitValuesStack = new byte[treeDepth+1][];
+ splitDims = new int[treeDepth+1];
+ negativeDeltas = new boolean[numDims*(treeDepth+1)];
+
+ in = new ByteArrayDataInput(packedIndex);
+ splitValuesStack[0] = new byte[packedBytesLength];
+ readNodeData(false);
+ scratch = new BytesRef();
+ scratch.length = bytesPerDim;
+ }
- if (nodeID >= leafNodeOffset) {
- int leafID = nodeID - leafNodeOffset;
+ @Override
+ public PackedIndexTree clone() {
+ PackedIndexTree index = new PackedIndexTree();
+ index.nodeID = nodeID;
+ index.level = level;
+ index.splitDim = splitDim;
+ System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
+ index.leafBlockFPStack[level] = leafBlockFPStack[level];
+ index.leftNodePositions[level] = leftNodePositions[level];
+ index.rightNodePositions[level] = rightNodePositions[level];
+ index.splitValuesStack[index.level] = splitValuesStack[index.level].clone();
+ System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
+ index.splitDims[level] = splitDims[level];
+ return index;
+ }
- // In the unbalanced case it's possible the left most node only has one child:
- if (leafID < leafBlockFPs.length) {
- //System.out.println("CHECK nodeID=" + nodeID + " leaf=" + (nodeID-leafNodeOffset) + " offset=" + leafNodeOffset + " fp=" + leafBlockFPs[leafID]);
- //System.out.println("BKDR.verify leafID=" + leafID + " nodeID=" + nodeID + " fp=" + leafBlockFPs[leafID] + " min=" + new BytesRef(cellMinPacked) + " max=" + new BytesRef(cellMaxPacked));
+ @Override
+ public void pushLeft() {
+ int nodePosition = leftNodePositions[level];
+ super.pushLeft();
+ System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
+ assert splitDim != -1;
+ negativeDeltas[level*numDims+splitDim] = true;
+ in.setPosition(nodePosition);
+ readNodeData(true);
+ }
+
+ @Override
+ public void pushRight() {
+ int nodePosition = rightNodePositions[level];
+ super.pushRight();
+ System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
+ assert splitDim != -1;
+ negativeDeltas[level*numDims+splitDim] = false;
+ in.setPosition(nodePosition);
+ readNodeData(false);
+ }
- // Leaf node: check that all values are in fact in bounds:
- VerifyVisitor visitor = (VerifyVisitor) state.visitor;
- visitor.cellMinPacked = cellMinPacked;
- visitor.cellMaxPacked = cellMaxPacked;
+ @Override
+ public void pop() {
+ super.pop();
+ splitDim = splitDims[level];
+ }
- int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
- visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
- } else {
- //System.out.println("BKDR.verify skip leafID=" + leafID);
+ @Override
+ public long getLeafBlockFP() {
+ assert isLeafNode(): "nodeID=" + nodeID + " is not a leaf";
+ return leafBlockFPStack[level];
+ }
+
+ @Override
+ public BytesRef getSplitDimValue() {
+ assert isLeafNode() == false;
+ scratch.bytes = splitValuesStack[level];
+ scratch.offset = splitDim * bytesPerDim;
+ return scratch;
+ }
+
+ private void readNodeData(boolean isLeft) {
+
+ leafBlockFPStack[level] = leafBlockFPStack[level-1];
+
+ // read leaf block FP delta
+ if (isLeft == false) {
+ leafBlockFPStack[level] += in.readVLong();
}
- } else {
- // Non-leaf node:
-
- int address = nodeID * bytesPerIndexEntry;
- int splitDim;
- if (numDims == 1) {
- splitDim = 0;
- if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
- // skip over wastefully encoded 0 splitDim:
- assert splitPackedValues[address] == 0;
- address++;
- }
+
+ if (isLeafNode()) {
+ splitDim = -1;
} else {
- splitDim = splitPackedValues[address++] & 0xff;
- }
-
- assert splitDim < numDims;
- byte[] splitPackedValue = new byte[packedBytesLength];
+ // read split dim, prefix, firstDiffByteDelta encoded as int:
+ int code = in.readVInt();
+ splitDim = code % numDims;
+ splitDims[level] = splitDim;
+ code /= numDims;
+ int prefix = code % (1+bytesPerDim);
+ int suffix = bytesPerDim - prefix;
- // Recurse on left sub-tree:
- System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
- System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
- verify(state,
- 2*nodeID,
- cellMinPacked, splitPackedValue);
+ if (splitValuesStack[level] == null) {
+ splitValuesStack[level] = new byte[packedBytesLength];
+ }
+ System.arraycopy(splitValuesStack[level-1], 0, splitValuesStack[level], 0, packedBytesLength);
+ if (suffix > 0) {
+ int firstDiffByteDelta = code / (1+bytesPerDim);
+ if (negativeDeltas[level*numDims + splitDim]) {
+ firstDiffByteDelta = -firstDiffByteDelta;
+ }
+ int oldByte = splitValuesStack[level][splitDim*bytesPerDim+prefix] & 0xFF;
+ splitValuesStack[level][splitDim*bytesPerDim+prefix] = (byte) (oldByte + firstDiffByteDelta);
+ in.readBytes(splitValuesStack[level], splitDim*bytesPerDim+prefix+1, suffix-1);
+ } else {
+ // our split value is == last split value in this dim, which can happen when there are many duplicate values
+ }
- // Recurse on right sub-tree:
- System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
- System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
- verify(state,
- 2*nodeID+1,
- splitPackedValue, cellMaxPacked);
+ int leftNumBytes;
+ if (nodeID * 2 < leafNodeOffset) {
+ leftNumBytes = in.readVInt();
+ } else {
+ leftNumBytes = 0;
+ }
+
+ leftNodePositions[level] = in.getPosition();
+ rightNodePositions[level] = leftNodePositions[level] + leftNumBytes;
+ }
}
}
+ private int getTreeDepth() {
+ // First +1 because all the non-leave nodes makes another power
+ // of 2; e.g. to have a fully balanced tree with 4 leaves you
+ // need a depth=3 tree:
+
+ // Second +1 because MathUtil.log computes floor of the logarithm; e.g.
+ // with 5 leaves you need a depth=4 tree:
+ return MathUtil.log(numLeaves, 2) + 2;
+ }
+
/** Used to track all state for a single call to {@link #intersect}. */
public static final class IntersectState {
final IndexInput in;
@@ -286,57 +467,73 @@ public class BKDReader implements Accountable {
final int[] commonPrefixLengths;
final IntersectVisitor visitor;
+ public final IndexTree index;
public IntersectState(IndexInput in, int numDims,
int packedBytesLength,
int maxPointsInLeafNode,
- IntersectVisitor visitor) {
+ IntersectVisitor visitor,
+ IndexTree indexVisitor) {
this.in = in;
this.visitor = visitor;
this.commonPrefixLengths = new int[numDims];
this.scratchDocIDs = new int[maxPointsInLeafNode];
this.scratchPackedValue = new byte[packedBytesLength];
+ this.index = indexVisitor;
}
}
public void intersect(IntersectVisitor visitor) throws IOException {
- intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
+ intersect(getIntersectState(visitor), minPackedValue, maxPackedValue);
}
/** Fast path: this is called when the query box fully encompasses all cells under this node. */
- private void addAll(IntersectState state, int nodeID) throws IOException {
+ private void addAll(IntersectState state) throws IOException {
//System.out.println("R: addAll nodeID=" + nodeID);
- if (nodeID >= leafNodeOffset) {
+ if (state.index.isLeafNode()) {
//System.out.println("ADDALL");
- visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor);
+ if (state.index.nodeExists()) {
+ visitDocIDs(state.in, state.index.getLeafBlockFP(), state.visitor);
+ }
// TODO: we can assert that the first value here in fact matches what the index claimed?
} else {
- addAll(state, 2*nodeID);
- addAll(state, 2*nodeID+1);
+ state.index.pushLeft();
+ addAll(state);
+ state.index.pop();
+
+ state.index.pushRight();
+ addAll(state);
+ state.index.pop();
}
}
/** Create a new {@link IntersectState} */
public IntersectState getIntersectState(IntersectVisitor visitor) {
+ IndexTree index;
+ if (packedIndex != null) {
+ index = new PackedIndexTree();
+ } else {
+ index = new LegacyIndexTree();
+ }
return new IntersectState(in.clone(), numDims,
packedBytesLength,
maxPointsInLeafNode,
- visitor);
+ visitor,
+ index);
}
/** Visits all docIDs and packed values in a single leaf block */
- public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException {
- int leafID = nodeID - leafNodeOffset;
+ public void visitLeafBlockValues(IndexTree index, IntersectState state) throws IOException {
// Leaf node; scan and filter all points in this block:
- int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
+ int count = readDocIDs(state.in, index.getLeafBlockFP(), state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
}
- protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
+ private void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
// Leaf node
in.seek(blockFP);
@@ -351,7 +548,7 @@ public class BKDReader implements Accountable {
}
}
- protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
+ int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
in.seek(blockFP);
// How many points are stored in this leaf cell:
@@ -366,7 +563,7 @@ public class BKDReader implements Accountable {
return count;
}
- protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+ void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
visitor.grow(count);
readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
@@ -435,13 +632,10 @@ public class BKDReader implements Accountable {
}
}
- private void intersect(IntersectState state,
- int nodeID,
- byte[] cellMinPacked, byte[] cellMaxPacked)
- throws IOException {
+ private void intersect(IntersectState state, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException {
/*
- System.out.println("\nR: intersect nodeID=" + nodeID);
+ System.out.println("\nR: intersect nodeID=" + state.index.getNodeID());
for(int dim=0;dim<numDims;dim++) {
System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
}
@@ -451,24 +645,18 @@ public class BKDReader implements Accountable {
if (r == Relation.CELL_OUTSIDE_QUERY) {
// This cell is fully outside of the query shape: stop recursing
- return;
} else if (r == Relation.CELL_INSIDE_QUERY) {
// This cell is fully inside of the query shape: recursively add all points in this cell without filtering
- addAll(state, nodeID);
- return;
- } else {
- // The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering
- }
-
- if (nodeID >= leafNodeOffset) {
+ addAll(state);
+ // The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering:
+ } else if (state.index.isLeafNode()) {
+
// TODO: we can assert that the first value here in fact matches what the index claimed?
-
- int leafID = nodeID - leafNodeOffset;
// In the unbalanced case it's possible the left most node only has one child:
- if (leafID < leafBlockFPs.length) {
+ if (state.index.nodeExists()) {
// Leaf node; scan and filter all points in this block:
- int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
+ int count = readDocIDs(state.in, state.index.getLeafBlockFP(), state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
@@ -477,65 +665,45 @@ public class BKDReader implements Accountable {
} else {
// Non-leaf node: recurse on the split left and right nodes
-
- int address = nodeID * bytesPerIndexEntry;
- int splitDim;
- if (numDims == 1) {
- splitDim = 0;
- if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
- // skip over wastefully encoded 0 splitDim:
- assert splitPackedValues[address] == 0;
- address++;
- }
- } else {
- splitDim = splitPackedValues[address++] & 0xff;
- }
-
+ int splitDim = state.index.getSplitDim();
+ assert splitDim >= 0: "splitDim=" + splitDim;
assert splitDim < numDims;
- // TODO: can we alloc & reuse this up front?
+ byte[] splitPackedValue = state.index.getSplitPackedValue();
+ BytesRef splitDimValue = state.index.getSplitDimValue();
+ assert splitDimValue.length == bytesPerDim;
+ //System.out.println(" splitDimValue=" + splitDimValue + " splitDim=" + splitDim);
- byte[] splitPackedValue = new byte[packedBytesLength];
+ // make sure cellMin <= splitValue <= cellMax:
+ assert StringHelper.compare(bytesPerDim, cellMinPacked, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset) <= 0: "bytesPerDim=" + bytesPerDim + " splitDim=" + splitDim + " numDims=" + numDims;
+ assert StringHelper.compare(bytesPerDim, cellMaxPacked, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset) >= 0: "bytesPerDim=" + bytesPerDim + " splitDim=" + splitDim + " numDims=" + numDims;
// Recurse on left sub-tree:
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
- System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
- intersect(state,
- 2*nodeID,
- cellMinPacked, splitPackedValue);
+ System.arraycopy(splitDimValue.bytes, splitDimValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ state.index.pushLeft();
+ intersect(state, cellMinPacked, splitPackedValue);
+ state.index.pop();
+
+ // Restore the split dim value since it may have been overwritten while recursing:
+ System.arraycopy(splitPackedValue, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset, bytesPerDim);
// Recurse on right sub-tree:
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
- System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
- intersect(state,
- 2*nodeID+1,
- splitPackedValue, cellMaxPacked);
+ System.arraycopy(splitDimValue.bytes, splitDimValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ state.index.pushRight();
+ intersect(state, splitPackedValue, cellMaxPacked);
+ state.index.pop();
}
}
- /** Copies the split value for this node into the provided byte array */
- public void copySplitValue(int nodeID, byte[] splitPackedValue) {
- int address = nodeID * bytesPerIndexEntry;
- int splitDim;
- if (numDims == 1) {
- splitDim = 0;
- if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
- // skip over wastefully encoded 0 splitDim:
- assert splitPackedValues[address] == 0;
- address++;
- }
- } else {
- splitDim = splitPackedValues[address++] & 0xff;
- }
-
- assert splitDim < numDims;
- System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
- }
-
@Override
public long ramBytesUsed() {
- return RamUsageEstimator.sizeOf(splitPackedValues) +
- RamUsageEstimator.sizeOf(leafBlockFPs);
+ if (packedIndex != null) {
+ return packedIndex.length;
+ } else {
+ return RamUsageEstimator.sizeOf(splitPackedValues) + RamUsageEstimator.sizeOf(leafBlockFPs);
+ }
}
public byte[] getMinPackedValue() {