You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/11/24 17:06:36 UTC
svn commit: r1716189 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/codecs/
core/src/java/org/apache/lucene/codecs/lucene60/
core/src/java/org/apache/lucene/index/
core/src/java/org/apache/lucene/util/bkd/ core/src/test/org/apache/lu...
Author: mikemccand
Date: Tue Nov 24 16:06:36 2015
New Revision: 1716189
URL: http://svn.apache.org/viewvc?rev=1716189&view=rev
Log:
LUCENE-6901: speed up dimensional values indexing and merging
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/DimensionalWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalReader.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDUtil.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Nov 24 16:06:36 2015
@@ -89,6 +89,11 @@ Optimizations
each leaf block in the default codec, to reduce the index
size (Mike McCandless)
+* LUCENE-6901: Optimize dimensional values indexing: use faster
+ IntroSorter instead of InPlaceMergeSorter, and specialize 1D
+ merging to merge sort the already sorted segments instead of
+ re-indexing (Mike McCandless)
+
Changes in Runtime Behavior
* LUCENE-6789: IndexSearcher's default Similarity is changed to BM25Similarity.
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/DimensionalWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/DimensionalWriter.java?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/DimensionalWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/DimensionalWriter.java Tue Nov 24 16:06:36 2015
@@ -37,66 +37,73 @@ public abstract class DimensionalWriter
/** Write all values contained in the provided reader */
public abstract void writeField(FieldInfo fieldInfo, DimensionalReader values) throws IOException;
+ /** Default naive merge implemenation for one field: it just re-indexes all the values
+ * from the incoming segment. The default codec overrides this for 1D fields and uses
+ * a faster but more complex implementation. */
+ protected void mergeOneField(MergeState mergeState, FieldInfo fieldInfo) throws IOException {
+ writeField(fieldInfo,
+ new DimensionalReader() {
+ @Override
+ public void intersect(String fieldName, IntersectVisitor mergedVisitor) throws IOException {
+ if (fieldName.equals(fieldInfo.name) == false) {
+ throw new IllegalArgumentException("field name must match the field being merged");
+ }
+ for (int i=0;i<mergeState.dimensionalReaders.length;i++) {
+ DimensionalReader dimensionalReader = mergeState.dimensionalReaders[i];
+ if (dimensionalReader == null) {
+ // This segment has no dimensional values
+ continue;
+ }
+ MergeState.DocMap docMap = mergeState.docMaps[i];
+ int docBase = mergeState.docBase[i];
+ dimensionalReader.intersect(fieldInfo.name,
+ new IntersectVisitor() {
+ @Override
+ public void visit(int docID) {
+ // Should never be called because our compare method never returns Relation.CELL_INSIDE_QUERY
+ throw new IllegalStateException();
+ }
+
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {
+ int newDocID = docMap.get(docID);
+ if (newDocID != -1) {
+ // Not deleted:
+ mergedVisitor.visit(docBase + newDocID, packedValue);
+ }
+ }
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ // Forces this segment's DimensionalReader to always visit all docs + values:
+ return Relation.CELL_CROSSES_QUERY;
+ }
+ });
+ }
+ }
+
+ @Override
+ public void checkIntegrity() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return 0L;
+ }
+
+ @Override
+ public void close() {
+ }
+ });
+ }
+
/** Default merge implementation to merge incoming dimensional readers by visiting all their points and
* adding to this writer */
public void merge(MergeState mergeState) throws IOException {
for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
if (fieldInfo.getDimensionCount() != 0) {
- writeField(fieldInfo,
- new DimensionalReader() {
- @Override
- public void intersect(String fieldName, IntersectVisitor mergedVisitor) throws IOException {
- if (fieldName.equals(fieldInfo.name) == false) {
- throw new IllegalArgumentException("field name must match the field being merged");
- }
- for (int i=0;i<mergeState.dimensionalReaders.length;i++) {
- DimensionalReader dimensionalReader = mergeState.dimensionalReaders[i];
- if (dimensionalReader == null) {
- // This segment has no dimensional values
- continue;
- }
- MergeState.DocMap docMap = mergeState.docMaps[i];
- int docBase = mergeState.docBase[i];
- dimensionalReader.intersect(fieldInfo.name,
- new IntersectVisitor() {
- @Override
- public void visit(int docID) {
- // Should never be called because our compare method never returns Relation.CELL_INSIDE_QUERY
- throw new IllegalStateException();
- }
-
- @Override
- public void visit(int docID, byte[] packedValue) throws IOException {
- int newDocID = docMap.get(docID);
- if (newDocID != -1) {
- // Not deleted:
- mergedVisitor.visit(docBase + newDocID, packedValue);
- }
- }
-
- @Override
- public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
- // Forces this segment's DimensionalReader to always visit all docs + values:
- return Relation.CELL_CROSSES_QUERY;
- }
- });
- }
- }
-
- @Override
- public void checkIntegrity() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public long ramBytesUsed() {
- return 0L;
- }
-
- @Override
- public void close() {
- }
- });
+ mergeOneField(mergeState, fieldInfo);
}
}
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalReader.java?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalReader.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalReader.java Tue Nov 24 16:06:36 2015
@@ -71,7 +71,9 @@ public class Lucene60DimensionalReader e
int fieldNumber = indexIn.readVInt();
long fp = indexIn.readVLong();
dataIn.seek(fp);
- readers.put(fieldNumber, new BKDReader(dataIn));
+ BKDReader reader = new BKDReader(dataIn);
+ readers.put(fieldNumber, reader);
+ //reader.verify(readState.segmentInfo.maxDoc());
}
CodecUtil.checkFooter(indexIn);
success = true;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalWriter.java?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60DimensionalWriter.java Tue Nov 24 16:06:36 2015
@@ -19,7 +19,9 @@ package org.apache.lucene.codecs.lucene6
import java.io.Closeable;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
@@ -28,10 +30,13 @@ import org.apache.lucene.codecs.Dimensio
import org.apache.lucene.index.DimensionalValues.IntersectVisitor;
import org.apache.lucene.index.DimensionalValues.Relation;
import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDWriter;
/** Writes dimensional values */
@@ -104,6 +109,61 @@ public class Lucene60DimensionalWriter e
}
}
+ @Override
+ public void merge(MergeState mergeState) throws IOException {
+ for(DimensionalReader reader : mergeState.dimensionalReaders) {
+ if (reader instanceof Lucene60DimensionalReader == false) {
+ // We can only bulk merge when all to-be-merged segments use our format:
+ super.merge(mergeState);
+ return;
+ }
+ }
+
+ for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
+ if (fieldInfo.getDimensionCount() != 0) {
+ if (fieldInfo.getDimensionCount() == 1) {
+ //System.out.println("MERGE: field=" + fieldInfo.name);
+ // Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
+ // already sorted incoming segments, instead of trying to sort all points again as if
+ // we were simply reindexing them:
+ try (BKDWriter writer = new BKDWriter(writeState.directory,
+ writeState.segmentInfo.name,
+ fieldInfo.getDimensionCount(),
+ fieldInfo.getDimensionNumBytes(),
+ maxPointsInLeafNode,
+ maxMBSortInHeap)) {
+ List<BKDReader> bkdReaders = new ArrayList<>();
+ List<MergeState.DocMap> docMaps = new ArrayList<>();
+ List<Integer> docIDBases = new ArrayList<>();
+ for(int i=0;i<mergeState.dimensionalReaders.length;i++) {
+ DimensionalReader reader = mergeState.dimensionalReaders[i];
+
+ Lucene60DimensionalReader reader60 = (Lucene60DimensionalReader) reader;
+ if (reader60 != null) {
+ // TODO: I could just use the merged fieldInfo.number instead of resolving to this
+ // reader's FieldInfo, right? Field numbers are always consistent across segments,
+ // since when?
+ FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
+ FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
+ if (readerFieldInfo != null) {
+ BKDReader bkdReader = reader60.readers.get(readerFieldInfo.number);
+ if (bkdReader != null) {
+ docIDBases.add(mergeState.docBase[i]);
+ bkdReaders.add(bkdReader);
+ docMaps.add(mergeState.docMaps[i]);
+ }
+ }
+ }
+ }
+
+ indexFPs.put(fieldInfo.name, writer.merge(dataOut, docMaps, bkdReaders, docIDBases));
+ }
+ } else {
+ mergeOneField(mergeState, fieldInfo);
+ }
+ }
+ }
+ }
@Override
public void close() throws IOException {
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java Tue Nov 24 16:06:36 2015
@@ -415,6 +415,8 @@ class DocumentsWriterPerThread {
return null;
}
+ long t0 = System.nanoTime();
+
if (infoStream.isEnabled("DWPT")) {
infoStream.message("DWPT", "flush postings as segment " + flushState.segmentInfo.name + " numDocs=" + numDocsInRAM);
}
@@ -458,6 +460,9 @@ class DocumentsWriterPerThread {
FlushedSegment fs = new FlushedSegment(segmentInfoPerCommit, flushState.fieldInfos,
segmentDeletes, flushState.liveDocs, flushState.delCountOnFlush);
sealFlushedSegment(fs);
+ if (infoStream.isEnabled("DWPT")) {
+ infoStream.message("DWPT", "flush time " + ((System.nanoTime() - t0)/1000000.0) + " msec");
+ }
return fs;
} catch (Throwable th) {
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java Tue Nov 24 16:06:36 2015
@@ -25,7 +25,9 @@ import org.apache.lucene.index.Dimension
import org.apache.lucene.index.DimensionalValues.Relation;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.StringHelper;
/** Handles intersection of an multi-dimensional shape in byte[] space with a block KD-tree previously written with {@link BKDWriter}.
*
@@ -34,7 +36,7 @@ import org.apache.lucene.util.RamUsageEs
public class BKDReader implements Accountable {
// Packed array of byte[] holding all split values in the full binary tree:
final private byte[] splitPackedValues;
- final private long[] leafBlockFPs;
+ final long[] leafBlockFPs;
final private int leafNodeOffset;
final int numDims;
final int bytesPerDim;
@@ -55,10 +57,12 @@ public class BKDReader implements Accoun
leafNodeOffset = numLeaves;
splitPackedValues = new byte[(1+bytesPerDim)*numLeaves];
+
+ // TODO: don't write split packed values[0]!
in.readBytes(splitPackedValues, 0, splitPackedValues.length);
- // Tree is fully balanced binary tree, so number of nodes = numLeaves-1, except our nodeIDs are 1-based (splitPackedValues[0] is unused):
- leafBlockFPs = new long[numLeaves];
+ // Read the file pointers to the start of each leaf block:
+ long[] leafBlockFPs = new long[numLeaves];
long lastFP = 0;
for(int i=0;i<numLeaves;i++) {
long delta = in.readVLong();
@@ -66,6 +70,47 @@ public class BKDReader implements Accoun
lastFP += delta;
}
+ // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
+ // if it was created by BKDWriter.merge). In this case the leaf nodes may straddle the two bottom
+ // levels of the binary tree:
+ if (numDims == 1 && numLeaves > 1) {
+ //System.out.println("BKDR: numLeaves=" + numLeaves);
+ int levelCount = 2;
+ while (true) {
+ //System.out.println(" cycle levelCount=" + levelCount);
+ if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
+ int lastLevel = 2*(numLeaves - levelCount);
+ assert lastLevel >= 0;
+ /*
+ System.out.println("BKDR: lastLevel=" + lastLevel + " vs " + levelCount);
+ System.out.println("FPs before:");
+ for(int i=0;i<leafBlockFPs.length;i++) {
+ System.out.println(" " + i + " " + leafBlockFPs[i]);
+ }
+ */
+ if (lastLevel != 0) {
+ // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
+ // at read-time, so that we can still delta code them on disk at write:
+ //System.out.println("BKDR: now rotate index");
+ long[] newLeafBlockFPs = new long[numLeaves];
+ System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
+ System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
+ leafBlockFPs = newLeafBlockFPs;
+ }
+ /*
+ System.out.println("FPs:");
+ for(int i=0;i<leafBlockFPs.length;i++) {
+ System.out.println(" " + i + " " + leafBlockFPs[i]);
+ }
+ */
+ break;
+ }
+
+ levelCount *= 2;
+ }
+ }
+
+ this.leafBlockFPs = leafBlockFPs;
this.in = in;
}
@@ -81,7 +126,120 @@ public class BKDReader implements Accoun
this.splitPackedValues = splitPackedValues;
}
- private static final class IntersectState {
+ private static class VerifyVisitor implements IntersectVisitor {
+ byte[] cellMinPacked;
+ byte[] cellMaxPacked;
+ byte[] lastPackedValue;
+ final int numDims;
+ final int bytesPerDim;
+ final int maxDoc;
+
+ public VerifyVisitor(int numDims, int bytesPerDim, int maxDoc) {
+ this.numDims = numDims;
+ this.bytesPerDim = bytesPerDim;
+ this.maxDoc = maxDoc;
+ }
+
+ @Override
+ public void visit(int docID) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void visit(int docID, byte[] packedValue) {
+ if (docID < 0 || docID >= maxDoc) {
+ throw new RuntimeException("docID=" + docID + " is out of bounds of 0.." + maxDoc);
+ }
+ for(int dim=0;dim<numDims;dim++) {
+ if (StringHelper.compare(bytesPerDim, cellMinPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) > 0) {
+ throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is less than this leaf block's minimum=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim));
+ }
+ if (StringHelper.compare(bytesPerDim, cellMaxPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) < 0) {
+ throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is greater than this leaf block's maximum=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
+ }
+ }
+
+ if (numDims == 1) {
+ // With only 1D, all values should always be in sorted order
+ if (lastPackedValue == null) {
+ lastPackedValue = Arrays.copyOf(packedValue, packedValue.length);
+ } else if (BKDUtil.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0) > 0) {
+ throw new RuntimeException("value=" + new BytesRef(packedValue) + " for docID=" + docID + " dim=0" + " sorts before last value=" + new BytesRef(lastPackedValue));
+ } else {
+ System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
+ }
+ }
+ }
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ /** Only used for debugging, to make sure all values in each leaf block fall within the range expected by the index */
+ // TODO: maybe we can get this into CheckIndex?
+ public void verify(int maxDoc) throws IOException {
+ //System.out.println("BKDR.verify this=" + this);
+ // Visits every doc in every leaf block and confirms that
+ // their values agree with the index:
+ byte[] rootMinPacked = new byte[packedBytesLength];
+ byte[] rootMaxPacked = new byte[packedBytesLength];
+ Arrays.fill(rootMaxPacked, (byte) 0xff);
+
+ IntersectState state = new IntersectState(in.clone(), numDims, packedBytesLength,
+ maxPointsInLeafNode,
+ new VerifyVisitor(numDims, bytesPerDim, maxDoc));
+
+ verify(state, 1, rootMinPacked, rootMaxPacked);
+ }
+
+ private void verify(IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException {
+
+ if (nodeID >= leafNodeOffset) {
+ int leafID = nodeID - leafNodeOffset;
+
+ // In the unbalanced case it's possible the left most node only has one child:
+ if (leafID < leafBlockFPs.length) {
+ //System.out.println("CHECK nodeID=" + nodeID + " leaf=" + (nodeID-leafNodeOffset) + " offset=" + leafNodeOffset + " fp=" + leafBlockFPs[leafID]);
+ //System.out.println("BKDR.verify leafID=" + leafID + " nodeID=" + nodeID + " fp=" + leafBlockFPs[leafID] + " min=" + new BytesRef(cellMinPacked) + " max=" + new BytesRef(cellMaxPacked));
+
+ // Leaf node: check that all values are in fact in bounds:
+ VerifyVisitor visitor = (VerifyVisitor) state.visitor;
+ visitor.cellMinPacked = cellMinPacked;
+ visitor.cellMaxPacked = cellMaxPacked;
+
+ int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
+ visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+ } else {
+ //System.out.println("BKDR.verify skip leafID=" + leafID);
+ }
+ } else {
+ // Non-leaf node:
+
+ int address = nodeID * (bytesPerDim+1);
+ int splitDim = splitPackedValues[address] & 0xff;
+ assert splitDim < numDims;
+
+ byte[] splitPackedValue = new byte[packedBytesLength];
+
+ // Recurse on left sub-tree:
+ System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
+ System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ verify(state,
+ 2*nodeID,
+ cellMinPacked, splitPackedValue);
+
+ // Recurse on right sub-tree:
+ System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
+ System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ verify(state,
+ 2*nodeID+1,
+ splitPackedValue, cellMaxPacked);
+ }
+ }
+
+ static final class IntersectState {
final IndexInput in;
final int[] scratchDocIDs;
final byte[] scratchPackedValue;
@@ -119,6 +277,7 @@ public class BKDReader implements Accoun
if (nodeID >= leafNodeOffset) {
//System.out.println("ADDALL");
visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor);
+ // TODO: we can assert that the first value here in fact matches what the index claimed?
} else {
addAll(state, 2*nodeID);
addAll(state, 2*nodeID+1);
@@ -196,38 +355,43 @@ public class BKDReader implements Accoun
}
if (nodeID >= leafNodeOffset) {
- //System.out.println("FILTER");
- // Leaf node; scan and filter all points in this block:
- int count = readDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.scratchDocIDs);
+ // TODO: we can assert that the first value here in fact matches what the index claimed?
- // Again, this time reading values and checking with the visitor
- visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+ int leafID = nodeID - leafNodeOffset;
+
+ // In the unbalanced case it's possible the left most node only has one child:
+ if (leafID < leafBlockFPs.length) {
+ // Leaf node; scan and filter all points in this block:
+ int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
+
+ // Again, this time reading values and checking with the visitor
+ visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+ }
} else {
// Non-leaf node: recurse on the split left and right nodes
+ // TODO: save the unused 1 byte prefix (it's always 0) in the 1d case here:
int address = nodeID * (bytesPerDim+1);
int splitDim = splitPackedValues[address] & 0xff;
assert splitDim < numDims;
// TODO: can we alloc & reuse this up front?
- byte[] splitValue = new byte[bytesPerDim];
- System.arraycopy(splitPackedValues, address+1, splitValue, 0, bytesPerDim);
// TODO: can we alloc & reuse this up front?
byte[] splitPackedValue = new byte[packedBytesLength];
// Recurse on left sub-tree:
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
- System.arraycopy(splitValue, 0, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
intersect(state,
2*nodeID,
cellMinPacked, splitPackedValue);
// Recurse on right sub-tree:
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
- System.arraycopy(splitValue, 0, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
+ System.arraycopy(splitPackedValues, address+1, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
intersect(state,
2*nodeID+1,
splitPackedValue, cellMaxPacked);
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDUtil.java?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDUtil.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDUtil.java Tue Nov 24 16:06:36 2015
@@ -69,8 +69,12 @@ public final class BKDUtil {
/** Returns positive int if a > b, negative int if a < b and 0 if a == b */
public static int compare(int bytesPerDim, byte[] a, int aIndex, byte[] b, int bIndex) {
+ assert aIndex >= 0;
+ assert bIndex >= 0;
+ int aOffset = aIndex*bytesPerDim;
+ int bOffset = bIndex*bytesPerDim;
for(int i=0;i<bytesPerDim;i++) {
- int cmp = (a[aIndex*bytesPerDim+i]&0xff) - (b[bIndex*bytesPerDim+i]&0xff);
+ int cmp = (a[aOffset+i]&0xff) - (b[bOffset+i]&0xff);
if (cmp != 0) {
return cmp;
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java Tue Nov 24 16:06:36 2015
@@ -20,10 +20,13 @@ package org.apache.lucene.util.bkd;
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
+import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.MergeState;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -33,11 +36,13 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.InPlaceMergeSorter;
+import org.apache.lucene.util.IntroSorter;
import org.apache.lucene.util.LongBitSet;
-import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
+import org.apache.lucene.util.OfflineSorter;
+import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.StringHelper;
// TODO
// - the compression is somewhat stupid now (delta vInt for 1024 docIDs, no compression for the byte[] values even though they have high locality)
@@ -210,16 +215,339 @@ public class BKDWriter implements Closea
pointCount++;
}
+ private static class MergeReader {
+ final BKDReader bkd;
+ final BKDReader.IntersectState state;
+ final MergeState.DocMap docMap;
+
+ /** Base offset for all our docIDs */
+ final int docIDBase;
+
+ /** Current doc ID */
+ public int docID;
+
+ /** Which doc in this block we are up to */
+ private int docBlockUpto;
+
+ /** How many docs in the current block */
+ private int docsInBlock;
+
+ /** Which leaf block we are up to */
+ private int blockID;
+
+ public MergeReader(BKDReader bkd, MergeState.DocMap docMap, int docIDBase) throws IOException {
+ this.bkd = bkd;
+ state = new BKDReader.IntersectState(bkd.in.clone(),
+ bkd.numDims,
+ bkd.packedBytesLength,
+ bkd.maxPointsInLeafNode,
+ null);
+ this.docMap = docMap;
+ this.docIDBase = docIDBase;
+ long minFP = Long.MAX_VALUE;
+ //System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length);
+ for(long fp : bkd.leafBlockFPs) {
+ minFP = Math.min(minFP, fp);
+ //System.out.println(" leaf fp=" + fp);
+ }
+ state.in.seek(minFP);
+ }
+
+ public boolean next() throws IOException {
+ //System.out.println("MR.next this=" + this);
+ while (true) {
+ if (docBlockUpto == docsInBlock) {
+ if (blockID == bkd.leafBlockFPs.length) {
+ //System.out.println(" done!");
+ return false;
+ }
+ //System.out.println(" new block @ fp=" + state.in.getFilePointer());
+ docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs);
+ docBlockUpto = 0;
+ for(int dim=0;dim<bkd.numDims;dim++) {
+ int prefix = state.in.readVInt();
+ state.commonPrefixLengths[dim] = prefix;
+ if (prefix > 0) {
+ state.in.readBytes(state.scratchPackedValue, dim*bkd.bytesPerDim, prefix);
+ }
+ }
+
+ blockID++;
+ }
+
+ int oldDocID = state.scratchDocIDs[docBlockUpto++];
+ int mappedDocID;
+ if (docMap == null) {
+ mappedDocID = oldDocID;
+ } else {
+ mappedDocID = docMap.get(oldDocID);
+ }
+ for(int dim=0;dim<bkd.numDims;dim++) {
+ int prefix = state.commonPrefixLengths[dim];
+ state.in.readBytes(state.scratchPackedValue, dim*bkd.bytesPerDim + prefix, bkd.bytesPerDim - prefix);
+ }
+ if (mappedDocID != -1) {
+ // Not deleted!
+ docID = mappedDocID;
+ return true;
+ }
+ }
+ }
+ }
+
+ private static class BKDMergeQueue extends PriorityQueue<MergeReader> {
+ private final int bytesPerDim;
+
+ public BKDMergeQueue(int bytesPerDim, int maxSize) {
+ super(maxSize);
+ this.bytesPerDim = bytesPerDim;
+ }
+
+ @Override
+ public boolean lessThan(MergeReader a, MergeReader b) {
+ assert a != b;
+
+ int cmp = StringHelper.compare(bytesPerDim, a.state.scratchPackedValue, 0, b.state.scratchPackedValue, 0);
+ if (cmp < 0) {
+ return true;
+ } else if (cmp > 0) {
+ return false;
+ }
+
+ // Tie break by sorting smaller docIDs earlier:
+ return a.docIDBase < b.docIDBase;
+ }
+ }
+
+ /** More efficient bulk-add for incoming {@link BKDReader}s. This does a merge sort of the already
+ * sorted values and currently only works when numDims==1. */
+ public long merge(IndexOutput out, List<MergeState.DocMap> docMaps, List<BKDReader> readers, List<Integer> docIDBases) throws IOException {
+ if (numDims != 1) {
+ throw new UnsupportedOperationException("numDims must be 1 but got " + numDims);
+ }
+ if (pointCount != 0) {
+ throw new IllegalStateException("cannot mix add and merge");
+ }
+
+ //System.out.println("BKDW.merge segs=" + readers.size());
+
+ // Catch user silliness:
+ if (heapPointWriter == null && tempInput == null) {
+ throw new IllegalStateException("already finished");
+ }
+
+ // Mark that we already finished:
+ heapPointWriter = null;
+
+ assert docMaps == null || readers.size() == docMaps.size();
+
+ BKDMergeQueue queue = new BKDMergeQueue(bytesPerDim, readers.size());
+
+ for(int i=0;i<readers.size();i++) {
+ BKDReader bkd = readers.get(i);
+ MergeState.DocMap docMap;
+ if (docMaps == null) {
+ docMap = null;
+ } else {
+ docMap = docMaps.get(i);
+ }
+ MergeReader reader = new MergeReader(bkd, docMap, docIDBases.get(i));
+ if (reader.next()) {
+ queue.add(reader);
+ }
+ }
+
+ int leafCount = 0;
+ List<Long> leafBlockFPs = new ArrayList<>();
+ List<byte[]> leafBlockStartValues = new ArrayList<>();
+
+ // Target halfway between min and max allowed for the leaf:
+ int pointsPerLeafBlock = (int) (0.75 * maxPointsInLeafNode);
+ //System.out.println("POINTS PER: " + pointsPerLeafBlock);
+
+ byte[] lastPackedValue = new byte[bytesPerDim];
+ byte[] firstPackedValue = new byte[bytesPerDim];
+ long valueCount = 0;
+
+ // Buffer up each leaf block's docs and values
+ int[] leafBlockDocIDs = new int[maxPointsInLeafNode];
+ byte[][] leafBlockPackedValues = new byte[maxPointsInLeafNode][];
+ for(int i=0;i<maxPointsInLeafNode;i++) {
+ leafBlockPackedValues[i] = new byte[packedBytesLength];
+ }
+ Arrays.fill(commonPrefixLengths, bytesPerDim);
+
+ while (queue.size() != 0) {
+ MergeReader reader = queue.top();
+ // System.out.println("iter reader=" + reader);
+
+ // NOTE: doesn't work with subclasses (e.g. SimpleText!)
+ leafBlockDocIDs[leafCount] = reader.docIDBase + reader.docID;
+ System.arraycopy(reader.state.scratchPackedValue, 0, leafBlockPackedValues[leafCount], 0, packedBytesLength);
+
+ assert numDims > 1 || valueInOrder(valueCount++, lastPackedValue, reader.state.scratchPackedValue);
+
+ if (leafCount == 0) {
+ if (leafBlockFPs.size() > 0) {
+ // Save the first (minimum) value in each leaf block except the first, to build the split value index in the end:
+ leafBlockStartValues.add(Arrays.copyOf(reader.state.scratchPackedValue, bytesPerDim));
+ }
+ Arrays.fill(commonPrefixLengths, bytesPerDim);
+ System.arraycopy(reader.state.scratchPackedValue, 0, firstPackedValue, 0, bytesPerDim);
+ } else {
+ // Find per-dim common prefix:
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = dim * bytesPerDim;
+ for(int j=0;j<commonPrefixLengths[dim];j++) {
+ if (firstPackedValue[offset+j] != reader.state.scratchPackedValue[offset+j]) {
+ commonPrefixLengths[dim] = j;
+ break;
+ }
+ }
+ }
+ }
+
+ leafCount++;
+
+ if (reader.next()) {
+ queue.updateTop();
+ } else {
+ // This segment was exhausted
+ queue.pop();
+ }
+
+ // We write a block once we hit exactly the max count ... this is different from
+ // when we flush a new segment, where we write between max/2 and max per leaf block,
+ // so merged segments will behave differently from newly flushed segments:
+ if (leafCount == pointsPerLeafBlock || queue.size() == 0) {
+ leafBlockFPs.add(out.getFilePointer());
+ checkMaxLeafNodeCount(leafBlockFPs.size());
+
+ writeLeafBlockDocs(out, leafBlockDocIDs, 0, leafCount);
+ writeCommonPrefixes(out, commonPrefixLengths, firstPackedValue);
+
+ // Write the full values:
+ for (int i=0;i<leafCount;i++) {
+ writeLeafBlockPackedValue(out, commonPrefixLengths, leafBlockPackedValues[i]);
+ }
+
+ leafCount = 0;
+ }
+ }
+
+ long indexFP = out.getFilePointer();
+
+ int numInnerNodes = leafBlockStartValues.size();
+
+ //System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts=" + leafBlockStartValues.size());
+
+ byte[] index = new byte[(1+numInnerNodes) * (1+bytesPerDim)];
+ rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
+ long[] arr = new long[leafBlockFPs.size()];
+ for(int i=0;i<leafBlockFPs.size();i++) {
+ arr[i] = leafBlockFPs.get(i);
+ }
+ writeIndex(out, arr, index);
+ return indexFP;
+ }
+
+ // TODO: there must be a simpler way?
+ private void rotateToTree(int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
+ //System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + " bpd=" + bytesPerDim + " index.length=" + index.length);
+ if (count == 1) {
+ // Leaf index node
+ //System.out.println(" leaf index node");
+ //System.out.println(" index[" + nodeID + "] = blockStartValues[" + offset + "]");
+ System.arraycopy(leafBlockStartValues.get(offset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
+ } else if (count > 1) {
+ // Internal index node: binary partition of count
+ int countAtLevel = 1;
+ int totalCount = 0;
+ while (true) {
+ int countLeft = count - totalCount;
+ //System.out.println(" cycle countLeft=" + countLeft + " coutAtLevel=" + countAtLevel);
+ if (countLeft <= countAtLevel) {
+ // This is the last level, possibly partially filled:
+ int lastLeftCount = Math.min(countAtLevel/2, countLeft);
+ assert lastLeftCount >= 0;
+ int leftHalf = (totalCount-1)/2 + lastLeftCount;
+
+ int rootOffset = offset + leftHalf;
+ /*
+ System.out.println(" last left count " + lastLeftCount);
+ System.out.println(" leftHalf " + leftHalf + " rightHalf=" + (count-leftHalf-1));
+ System.out.println(" rootOffset=" + rootOffset);
+ */
+
+ System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
+ //System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
+
+ // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
+ // under here, to save this while loop on each recursion
+
+ // Recurse left
+ rotateToTree(2*nodeID, offset, leftHalf, index, leafBlockStartValues);
+
+ // Recurse right
+ rotateToTree(2*nodeID+1, rootOffset+1, count-leftHalf-1, index, leafBlockStartValues);
+ return;
+ }
+ totalCount += countAtLevel;
+ countAtLevel *= 2;
+ }
+ } else {
+ assert count == 0;
+ }
+ }
+
// TODO: if we fixed each partition step to just record the file offset at the "split point", we could probably handle variable length
// encoding and not have our own ByteSequencesReader/Writer
- /** If dim=-1 we sort by docID, else by that dim. */
+ /** Sort the heap writer by the specified dim */
private void sortHeapPointWriter(final HeapPointWriter writer, int start, int length, int dim) {
assert pointCount < Integer.MAX_VALUE;
+ //int[] swapCount = new int[1];
+ //int[] cmpCount = new int[1];
+
+ //System.out.println("SORT length=" + length);
// All buffered points are still in heap; just do in-place sort:
- new InPlaceMergeSorter() {
+ new IntroSorter() {
+ private final byte[] pivotPackedValue = new byte[bytesPerDim];
+ private int pivotDocID;
+ private long pivotOrd;
+
+ @Override
+ protected void setPivot(int i) {
+ pivotDocID = writer.docIDs[i];
+ pivotOrd = writer.ords[i];
+
+ int block = i / writer.valuesPerBlock;
+ int index = i % writer.valuesPerBlock;
+ System.arraycopy(writer.blocks.get(block), index*packedBytesLength+dim*bytesPerDim, pivotPackedValue, 0, bytesPerDim);
+ }
+
+ @Override
+ protected int comparePivot(int j) {
+ //cmpCount[0]++;
+ int block = j / writer.valuesPerBlock;
+ int index = j % writer.valuesPerBlock;
+ assert index >= 0: "index=" + index + " j=" + j;
+ int cmp = BKDUtil.compare(bytesPerDim, pivotPackedValue, 0, writer.blocks.get(block), index*numDims+dim);
+ if (cmp != 0) {
+ return cmp;
+ }
+
+ // Tie-break
+ cmp = Integer.compare(pivotDocID, writer.docIDs[j]);
+ if (cmp != 0) {
+ return cmp;
+ }
+
+ return Long.compare(pivotOrd, writer.ords[j]);
+ }
+
@Override
protected void swap(int i, int j) {
int docID = writer.docIDs[i];
@@ -230,21 +558,27 @@ public class BKDWriter implements Closea
writer.ords[i] = writer.ords[j];
writer.ords[j] = ord;
+ byte[] blockI = writer.blocks.get(i / writer.valuesPerBlock);
+ int indexI = (i % writer.valuesPerBlock) * packedBytesLength;
+ byte[] blockJ = writer.blocks.get(j / writer.valuesPerBlock);
+ int indexJ = (j % writer.valuesPerBlock) * packedBytesLength;
+
// scratch1 = values[i]
- writer.readPackedValue(i, scratch1);
- // scratch2 = values[j]
- writer.readPackedValue(j, scratch2);
- // values[i] = scratch2
- writer.writePackedValue(i, scratch2);
+ System.arraycopy(blockI, indexI, scratch1, 0, packedBytesLength);
+ // values[i] = values[j]
+ System.arraycopy(blockJ, indexJ, blockI, indexI, packedBytesLength);
// values[j] = scratch1
- writer.writePackedValue(j, scratch1);
+ System.arraycopy(scratch1, 0, blockJ, indexJ, packedBytesLength);
}
@Override
protected int compare(int i, int j) {
- writer.readPackedValue(i, scratch1);
- writer.readPackedValue(j, scratch2);
- int cmp = BKDUtil.compare(bytesPerDim, scratch1, dim, scratch2, dim);
+ //cmpCount[0]++;
+ int blockI = i / writer.valuesPerBlock;
+ int dimI = i % writer.valuesPerBlock;
+ int blockJ = j / writer.valuesPerBlock;
+ int dimJ = j % writer.valuesPerBlock;
+ int cmp = BKDUtil.compare(bytesPerDim, writer.blocks.get(blockI), dimI*numDims+dim, writer.blocks.get(blockJ), dimJ*numDims+dim);
if (cmp != 0) {
return cmp;
}
@@ -258,6 +592,7 @@ public class BKDWriter implements Closea
return Long.compare(writer.ords[i], writer.ords[j]);
}
}.sort(start, start+length);
+ //System.out.println("LEN=" + length + " SWAP=" + swapCount[0] + " CMP=" + cmpCount[0]);
}
private PointWriter sort(int dim) throws IOException {
@@ -278,7 +613,10 @@ public class BKDWriter implements Closea
sorted.copyFrom(heapPointWriter);
}
+ //long t0 = System.nanoTime();
sortHeapPointWriter(sorted, 0, (int) pointCount, dim);
+ //long t1 = System.nanoTime();
+ //System.out.println("BKD: sort took " + ((t1-t0)/1000000.0) + " msec");
sorted.close();
return sorted;
@@ -366,6 +704,12 @@ public class BKDWriter implements Closea
}
}
+ private void checkMaxLeafNodeCount(int numLeaves) {
+ if ((1+bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
+ throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex");
+ }
+ }
+
/** Writes the BKD tree to the provided {@link IndexOutput} and returns the file offset where index was written. */
public long finish(IndexOutput out) throws IOException {
//System.out.println("\nBKDTreeWriter.finish pointCount=" + pointCount + " out=" + out + " heapWriter=" + heapWriter);
@@ -381,7 +725,12 @@ public class BKDWriter implements Closea
offlinePointWriter.close();
}
- LongBitSet ordBitSet = new LongBitSet(pointCount);
+ LongBitSet ordBitSet;
+ if (numDims > 1) {
+ ordBitSet = new LongBitSet(pointCount);
+ } else {
+ ordBitSet = null;
+ }
long countPerLeaf = pointCount;
long innerNodeCount = 1;
@@ -391,16 +740,9 @@ public class BKDWriter implements Closea
innerNodeCount *= 2;
}
- //System.out.println("innerNodeCount=" + innerNodeCount);
-
- if (1+2*innerNodeCount >= Integer.MAX_VALUE) {
- throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex");
- }
-
- innerNodeCount--;
+ int numLeaves = (int) innerNodeCount;
- int numLeaves = (int) (innerNodeCount+1);
- //System.out.println("LEAVES: " + numLeaves);
+ checkMaxLeafNodeCount(numLeaves);
// NOTE: we could save the 1+ here, to use a bit less heap at search time, but then we'd need a somewhat costly check at each
// step of the recursion to recompute the split dim:
@@ -548,7 +890,7 @@ public class BKDWriter implements Closea
private byte[] markRightTree(long rightCount, int splitDim, PathSlice source, LongBitSet ordBitSet) throws IOException {
// Now we mark ords that fall into the right half, so we can partition on all other dims that are not the split dim:
- assert ordBitSet.cardinality() == 0: "cardinality=" + ordBitSet.cardinality();
+ assert numDims == 1 || ordBitSet.cardinality() == 0: "cardinality=" + ordBitSet.cardinality();
// Read the split value, then mark all ords in the right tree (larger than the split value):
try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount)) {
@@ -556,17 +898,19 @@ public class BKDWriter implements Closea
assert result;
System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
+ if (numDims > 1) {
- ordBitSet.set(reader.ord());
-
- // Start at 1 because we already did the first value above (so we could keep the split value):
- for(int i=1;i<rightCount;i++) {
- result = reader.next();
- assert result;
ordBitSet.set(reader.ord());
- }
- assert rightCount == ordBitSet.cardinality(): "rightCount=" + rightCount + " cardinality=" + ordBitSet.cardinality();
+ // Start at 1 because we already did the first value above (so we could keep the split value):
+ for(int i=1;i<rightCount;i++) {
+ result = reader.next();
+ assert result;
+ ordBitSet.set(reader.ord());
+ }
+
+ assert rightCount == ordBitSet.cardinality(): "rightCount=" + rightCount + " cardinality=" + ordBitSet.cardinality();
+ }
}
return scratch1;
@@ -643,7 +987,8 @@ public class BKDWriter implements Closea
PathSlice source = slices[0];
if (source.writer instanceof HeapPointWriter == false) {
- // Adversarial cases can cause this, e.g. very lopsided data, all equal points
+ // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
+ // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
source = switchToHeap(source);
}
@@ -652,17 +997,19 @@ public class BKDWriter implements Closea
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
+ //System.out.println(" write leaf block @ fp=" + out.getFilePointer());
// Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
// loading the values:
- writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), Math.toIntExact(source.count));
+ int count = Math.toIntExact(source.count);
+ writeLeafBlockDocs(out, heapSource.docIDs, Math.toIntExact(source.start), count);
// TODO: we should delta compress / only write suffix bytes, like terms dict (the values will all be "close together" since we are at
// a leaf cell):
// First pass: find the per-dim common prefix for all values in this block:
Arrays.fill(commonPrefixLengths, bytesPerDim);
- for (int i=0;i<source.count;i++) {
+ for (int i=0;i<count;i++) {
if (i == 0) {
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch1);
} else {
@@ -682,9 +1029,11 @@ public class BKDWriter implements Closea
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
// Second pass: write the full values:
+ byte[] lastPackedValue = new byte[bytesPerDim];
for (int i=0;i<source.count;i++) {
// TODO: we could do bulk copying here, avoiding the intermediate copy:
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
+ assert numDims != 1 || valueInOrder(i, lastPackedValue, scratchPackedValue);
// Make sure this value does in fact fall within this leaf cell:
assert valueInBounds(scratchPackedValue, minPackedValue, maxPackedValue);
@@ -694,7 +1043,12 @@ public class BKDWriter implements Closea
} else {
// Inner node: partition/recurse
- int splitDim = split(minPackedValue, maxPackedValue);
+ int splitDim;
+ if (numDims > 1) {
+ splitDim = split(minPackedValue, maxPackedValue);
+ } else {
+ splitDim = 0;
+ }
PathSlice source = slices[splitDim];
@@ -758,7 +1112,9 @@ public class BKDWriter implements Closea
}
}
- ordBitSet.clear(0, pointCount);
+ if (numDims > 1) {
+ ordBitSet.clear(0, pointCount);
+ }
// Recurse on left tree:
build(2*nodeID, leafNodeOffset, leftSlices,
@@ -787,6 +1143,15 @@ public class BKDWriter implements Closea
}
}
+ // only called from assert
+ private boolean valueInOrder(long ord, byte[] lastPackedValue, byte[] packedValue) {
+ if (ord > 0 && BKDUtil.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0) > 0) {
+ throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue) + " ord=" + ord);
+ }
+ System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
+ return true;
+ }
+
PointWriter getPointWriter(long count) throws IOException {
if (count <= maxPointsSortInHeap) {
int size = Math.toIntExact(count);
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java?rev=1716189&r1=1716188&r2=1716189&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java Tue Nov 24 16:06:36 2015
@@ -114,7 +114,7 @@ public class TestBKD extends LuceneTestC
try (Directory dir = getDirectory(numDocs)) {
int numDims = TestUtil.nextInt(random(), 1, 5);
int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100);
- float maxMB = (float) 0.1 + (3*random().nextFloat());
+ float maxMB = (float) 3.0 + (3*random().nextFloat());
BKDWriter w = new BKDWriter(dir, "tmp", numDims, 4, maxPointsInLeafNode, maxMB);
if (VERBOSE) {
@@ -238,7 +238,7 @@ public class TestBKD extends LuceneTestC
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
int numDims = TestUtil.nextInt(random(), 1, 5);
int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100);
- float maxMB = (float) 0.1 + (3*random().nextFloat());
+ float maxMB = (float) 3.0 + (3*random().nextFloat());
BKDWriter w = new BKDWriter(dir, "tmp", numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
BigInteger[][] docs = new BigInteger[numDocs][];
@@ -425,6 +425,7 @@ public class TestBKD extends LuceneTestC
private void doTestRandomBinary(int count) throws Exception {
int numDocs = TestUtil.nextInt(random(), count, count*2);
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
+
int numDims = TestUtil.nextInt(random(), 1, 5);
byte[][][] docValues = new byte[numDocs][][];
@@ -597,21 +598,13 @@ public class TestBKD extends LuceneTestC
assertEquals("a < b", iae.getMessage());
}
}
-
+
/** docIDs can be null, for the single valued case, else it maps value to docID */
private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim) throws Exception {
try (Directory dir = getDirectory(docValues.length)) {
- while (true) {
- int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 100);
- double maxMB = (float) 0.1 + (3*random().nextDouble());
- try {
- verify(dir, docValues, docIDs, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
- return;
- } catch (IllegalArgumentException iae) {
- // This just means we got a too-small maxMB for the maxPointsInLeafNode; just retry
- assertTrue(iae.getMessage().contains("either increase maxMBSortInHeap or decrease maxPointsInLeafNode"));
- }
- }
+ int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
+ double maxMB = (float) 3.0 + (3*random().nextDouble());
+ verify(dir, docValues, docIDs, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
}
}
@@ -620,10 +613,32 @@ public class TestBKD extends LuceneTestC
if (VERBOSE) {
System.out.println("TEST: numValues=" + numValues + " numDims=" + numDims + " numBytesPerDim=" + numBytesPerDim + " maxPointsInLeafNode=" + maxPointsInLeafNode + " maxMB=" + maxMB);
}
- long indexFP;
- try (BKDWriter w = new BKDWriter(dir, "tmp", numDims, numBytesPerDim, maxPointsInLeafNode, maxMB)) {
+
+ List<Long> toMerge = null;
+ List<Integer> docIDBases = null;
+ int seg = 0;
+
+ BKDWriter w = new BKDWriter(dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+ IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
+ IndexInput in = null;
+
+ boolean success = false;
+
+ try {
byte[] scratch = new byte[numBytesPerDim*numDims];
+ int lastDocIDBase = 0;
+ boolean useMerge = numDims == 1 && numValues >= 10 && random().nextBoolean();
+ int valuesInThisSeg;
+ if (useMerge) {
+ // Sometimes we will call merge with a single segment:
+ valuesInThisSeg = TestUtil.nextInt(random(), numValues/10, numValues);
+ } else {
+ valuesInThisSeg = 0;
+ }
+
+ int segCount = 0;
+
for(int ord=0;ord<numValues;ord++) {
int docID;
if (docIDs == null) {
@@ -632,7 +647,7 @@ public class TestBKD extends LuceneTestC
docID = docIDs[ord];
}
if (VERBOSE) {
- System.out.println(" ord=" + ord + " docID=" + docID);
+ System.out.println(" ord=" + ord + " docID=" + docID + " lastDocIDBase=" + lastDocIDBase);
}
for(int dim=0;dim<numDims;dim++) {
if (VERBOSE) {
@@ -640,21 +655,56 @@ public class TestBKD extends LuceneTestC
}
System.arraycopy(docValues[ord][dim], 0, scratch, dim*numBytesPerDim, numBytesPerDim);
}
- w.add(scratch, docID);
+ w.add(scratch, docID-lastDocIDBase);
+
+ segCount++;
+
+ if (useMerge && segCount == valuesInThisSeg) {
+ if (toMerge == null) {
+ toMerge = new ArrayList<>();
+ docIDBases = new ArrayList<>();
+ }
+ docIDBases.add(lastDocIDBase);
+ toMerge.add(w.finish(out));
+ valuesInThisSeg = TestUtil.nextInt(random(), numValues/10, numValues/2);
+ segCount = 0;
+
+ seg++;
+ maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
+ maxMB = (float) 3.0 + (3*random().nextDouble());
+ w = new BKDWriter(dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+ lastDocIDBase = docID;
+ }
}
- boolean success = false;
- try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+ long indexFP;
+
+ if (toMerge != null) {
+ System.out.println("merge " + toMerge.size());
+ if (segCount > 0) {
+ docIDBases.add(lastDocIDBase);
+ toMerge.add(w.finish(out));
+ }
+ out.close();
+ in = dir.openInput("bkd", IOContext.DEFAULT);
+ seg++;
+ w = new BKDWriter(dir, "_" + seg, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
+ List<BKDReader> readers = new ArrayList<>();
+ for(long fp : toMerge) {
+ in.seek(fp);
+ readers.add(new BKDReader(in));
+ }
+ out = dir.createOutput("bkd2", IOContext.DEFAULT);
+ indexFP = w.merge(out, null, readers, docIDBases);
+ out.close();
+ in.close();
+ in = dir.openInput("bkd2", IOContext.DEFAULT);
+ } else {
indexFP = w.finish(out);
- success = true;
- } finally {
- if (success == false) {
- IOUtils.deleteFilesIgnoringExceptions(dir, "bkd");
- }
+ out.close();
+ in = dir.openInput("bkd", IOContext.DEFAULT);
}
- }
- try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
@@ -751,8 +801,17 @@ public class TestBKD extends LuceneTestC
assertEquals("docID=" + docID, expected.get(docID), hits.get(docID));
}
}
- } finally {
+ in.close();
dir.deleteFile("bkd");
+ if (toMerge != null) {
+ dir.deleteFile("bkd2");
+ }
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(w, in, out);
+ IOUtils.deleteFilesIgnoringExceptions(dir, "bkd", "bkd2");
+ }
}
}