Posted to commits@lucene.apache.org by kr...@apache.org on 2016/12/15 21:35:36 UTC
[21/50] [abbrv] lucene-solr:jira/solr-8593: LUCENE-7583: buffer small leaf-block writes in BKDWriter
LUCENE-7583: buffer small leaf-block writes in BKDWriter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b97d9d74
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b97d9d74
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b97d9d74
Branch: refs/heads/jira/solr-8593
Commit: b97d9d7478f99660c1cfc91ef4461b7405254dea
Parents: cacabc9
Author: Mike McCandless <mi...@apache.org>
Authored: Wed Dec 7 18:59:23 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Wed Dec 7 18:59:23 2016 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +
.../CompressingStoredFieldsWriter.java | 19 ++--
.../CompressingTermVectorsWriter.java | 11 +-
.../GrowableByteArrayDataOutput.java | 83 ---------------
.../util/GrowableByteArrayDataOutput.java | 103 +++++++++++++++++++
.../org/apache/lucene/util/bkd/BKDWriter.java | 85 ++++++++-------
.../apache/lucene/util/bkd/DocIdsWriter.java | 4 +-
.../TestGrowableByteArrayDataOutput.java | 80 --------------
.../store/TestGrowableByteArrayDataOutput.java | 80 ++++++++++++++
9 files changed, 251 insertions(+), 218 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c6c39ac..26a9dec 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -134,6 +134,10 @@ Optimizations
a compressed format, using substantially less RAM in some cases
(Adrien Grand, Mike McCandless)
+* LUCENE-7583: BKD writing now buffers each leaf block in heap before
+ writing to disk, giving a small speedup in points-heavy use cases.
+ (Mike McCandless)
+
Other
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
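The CHANGES entry above captures the core of the optimization: instead of issuing many small writes directly against the IndexOutput, BKDWriter now assembles each leaf block in an on-heap buffer and copies it to disk with a single writeBytes call. A condensed sketch of the pattern, using the names from the BKDWriter diff below (method bodies and surrounding state elided; the real writeLeafBlock takes no parameters and writes to a field):

    // Reused scratch buffer, 32 KB initial capacity, grows on demand.
    private final GrowableByteArrayDataOutput scratchOut = new GrowableByteArrayDataOutput(32 * 1024);

    private void writeLeafBlock(IndexOutput out) throws IOException {
      assert scratchOut.getPosition() == 0;
      // Many small writes land in the heap buffer first ...
      writeLeafBlockDocs(scratchOut, leafDocs, 0, leafCount);
      writeCommonPrefixes(scratchOut, commonPrefixLengths, leafValues);
      writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues);
      // ... then the whole leaf block hits the index file in one call.
      out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
      scratchOut.reset();
    }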
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
index 1956ab7..cda855d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
@@ -33,6 +33,7 @@ import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
@@ -157,7 +158,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
}
this.numStoredFields[numBufferedDocs] = numStoredFieldsInDoc;
numStoredFieldsInDoc = 0;
- endOffsets[numBufferedDocs] = bufferedDocs.length;
+ endOffsets[numBufferedDocs] = bufferedDocs.getPosition();
++numBufferedDocs;
if (triggerFlush()) {
flush();
@@ -210,7 +211,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
}
private boolean triggerFlush() {
- return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
+ return bufferedDocs.getPosition() >= chunkSize || // chunks of at least chunkSize bytes
numBufferedDocs >= maxDocsPerChunk;
}
@@ -223,23 +224,23 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
lengths[i] = endOffsets[i] - endOffsets[i - 1];
assert lengths[i] >= 0;
}
- final boolean sliced = bufferedDocs.length >= 2 * chunkSize;
+ final boolean sliced = bufferedDocs.getPosition() >= 2 * chunkSize;
writeHeader(docBase, numBufferedDocs, numStoredFields, lengths, sliced);
// compress stored fields to fieldsStream
if (sliced) {
// big chunk, slice it
- for (int compressed = 0; compressed < bufferedDocs.length; compressed += chunkSize) {
- compressor.compress(bufferedDocs.bytes, compressed, Math.min(chunkSize, bufferedDocs.length - compressed), fieldsStream);
+ for (int compressed = 0; compressed < bufferedDocs.getPosition(); compressed += chunkSize) {
+ compressor.compress(bufferedDocs.getBytes(), compressed, Math.min(chunkSize, bufferedDocs.getPosition() - compressed), fieldsStream);
}
} else {
- compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream);
+ compressor.compress(bufferedDocs.getBytes(), 0, bufferedDocs.getPosition(), fieldsStream);
}
// reset
docBase += numBufferedDocs;
numBufferedDocs = 0;
- bufferedDocs.length = 0;
+ bufferedDocs.reset();
numChunks++;
}
@@ -459,7 +460,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
flush();
numDirtyChunks++; // incomplete: we had to force this flush
} else {
- assert bufferedDocs.length == 0;
+ assert bufferedDocs.getPosition() == 0;
}
if (docBase != numDocs) {
throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs);
@@ -468,7 +469,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
fieldsStream.writeVLong(numChunks);
fieldsStream.writeVLong(numDirtyChunks);
CodecUtil.writeFooter(fieldsStream);
- assert bufferedDocs.length == 0;
+ assert bufferedDocs.getPosition() == 0;
}
// bulk merge is scary: its caused corruption bugs in the past.
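The stored-fields changes are mechanical fallout from moving GrowableByteArrayDataOutput into org.apache.lucene.store and making its state private: direct field access becomes accessor calls. Condensed from the hunks above (surrounding code elided):

    // Flush once the chunk is full, by buffered bytes or by doc count:
    if (bufferedDocs.getPosition() >= chunkSize || numBufferedDocs >= maxDocsPerChunk) {
      flush();
    }

    // Inside flush(): compress the buffered bytes, then recycle the buffer.
    compressor.compress(bufferedDocs.getBytes(), 0, bufferedDocs.getPosition(), fieldsStream);
    bufferedDocs.reset(); // replaces the old direct assignment "bufferedDocs.length = 0"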
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
index 46a289a..9bd2483 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
@@ -37,6 +37,7 @@ import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
@@ -269,8 +270,8 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
@Override
public void finishDocument() throws IOException {
// append the payload bytes of the doc after its terms
- termSuffixes.writeBytes(payloadBytes.bytes, payloadBytes.length);
- payloadBytes.length = 0;
+ termSuffixes.writeBytes(payloadBytes.getBytes(), payloadBytes.getPosition());
+ payloadBytes.reset();
++numDocs;
if (triggerFlush()) {
flush();
@@ -316,7 +317,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
}
private boolean triggerFlush() {
- return termSuffixes.length >= chunkSize
+ return termSuffixes.getPosition() >= chunkSize
|| pendingDocs.size() >= MAX_DOCUMENTS_PER_CHUNK;
}
@@ -355,14 +356,14 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
flushPayloadLengths();
// compress terms and payloads and write them to the output
- compressor.compress(termSuffixes.bytes, 0, termSuffixes.length, vectorsStream);
+ compressor.compress(termSuffixes.getBytes(), 0, termSuffixes.getPosition(), vectorsStream);
}
// reset
pendingDocs.clear();
curDoc = null;
curField = null;
- termSuffixes.length = 0;
+ termSuffixes.reset();
numChunks++;
}
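The term-vectors writer follows the same buffer-and-reset discipline. Condensed from the hunks above:

    // finishDocument(): append the doc's payload bytes after its term suffixes,
    // then recycle the per-doc payload buffer.
    termSuffixes.writeBytes(payloadBytes.getBytes(), payloadBytes.getPosition());
    payloadBytes.reset();

    // flush(): compress everything buffered so far and start over.
    compressor.compress(termSuffixes.getBytes(), 0, termSuffixes.getPosition(), vectorsStream);
    termSuffixes.reset();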
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
deleted file mode 100644
index ec551d1..0000000
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.compressing;
-
-
-import java.io.IOException;
-
-import org.apache.lucene.store.DataOutput;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.UnicodeUtil;
-
-/**
- * A {@link DataOutput} that can be used to build a byte[].
- * @lucene.internal
- */
-public final class GrowableByteArrayDataOutput extends DataOutput {
-
- /** Minimum utf8 byte size of a string over which double pass over string is to save memory during encode */
- static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536;
-
- /** The bytes */
- public byte[] bytes;
- /** The length */
- public int length;
-
- // scratch for utf8 encoding of small strings
- byte[] scratchBytes = new byte[16];
-
- /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
- public GrowableByteArrayDataOutput(int cp) {
- this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
- this.length = 0;
- }
-
- @Override
- public void writeByte(byte b) {
- if (length >= bytes.length) {
- bytes = ArrayUtil.grow(bytes);
- }
- bytes[length++] = b;
- }
-
- @Override
- public void writeBytes(byte[] b, int off, int len) {
- final int newLength = length + len;
- bytes = ArrayUtil.grow(bytes, newLength);
- System.arraycopy(b, off, bytes, length, len);
- length = newLength;
- }
-
- @Override
- public void writeString(String string) throws IOException {
- int maxLen = UnicodeUtil.maxUTF8Length(string.length());
- if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
- // string is small enough that we don't need to save memory by falling back to double-pass approach
- // this is just an optimized writeString() that re-uses scratchBytes.
- scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
- int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
- writeVInt(len);
- writeBytes(scratchBytes, len);
- } else {
- // use a double pass approach to avoid allocating a large intermediate buffer for string encoding
- int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
- writeVInt(numBytes);
- bytes = ArrayUtil.grow(bytes, length + numBytes);
- length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
new file mode 100644
index 0000000..5f00d4a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.store;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.UnicodeUtil;
+
+/**
+ * A {@link DataOutput} that can be used to build a byte[].
+ *
+ * @lucene.internal
+ */
+public final class GrowableByteArrayDataOutput extends DataOutput {
+
+ /** Minimum utf8 byte size of a string above which a double pass over the string is used to save memory during encoding */
+ static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536;
+
+ /** The bytes */
+ private byte[] bytes;
+
+ /** The length */
+ private int length;
+
+ // scratch for utf8 encoding of small strings
+ private byte[] scratchBytes;
+
+ /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
+ public GrowableByteArrayDataOutput(int cp) {
+ this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
+ this.length = 0;
+ }
+
+ @Override
+ public void writeByte(byte b) {
+ if (length >= bytes.length) {
+ bytes = ArrayUtil.grow(bytes);
+ }
+ bytes[length++] = b;
+ }
+
+ @Override
+ public void writeBytes(byte[] b, int off, int len) {
+ final int newLength = length + len;
+ if (newLength > bytes.length) {
+ bytes = ArrayUtil.grow(bytes, newLength);
+ }
+ System.arraycopy(b, off, bytes, length, len);
+ length = newLength;
+ }
+
+ @Override
+ public void writeString(String string) throws IOException {
+ int maxLen = UnicodeUtil.maxUTF8Length(string.length());
+ if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
+ // string is small enough that we don't need to save memory by falling back to double-pass approach
+ // this is just an optimized writeString() that re-uses scratchBytes.
+ if (scratchBytes == null) {
+ scratchBytes = new byte[ArrayUtil.oversize(maxLen, Character.BYTES)];
+ } else {
+ scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
+ }
+ int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
+ writeVInt(len);
+ writeBytes(scratchBytes, len);
+ } else {
+ // use a double pass approach to avoid allocating a large intermediate buffer for string encoding
+ int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
+ writeVInt(numBytes);
+ bytes = ArrayUtil.grow(bytes, length + numBytes);
+ length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
+ }
+ }
+
+ public byte[] getBytes() {
+ return bytes;
+ }
+
+ public int getPosition() {
+ return length;
+ }
+
+ public void reset() {
+ length = 0;
+ }
+}
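For reference, a minimal usage sketch of the relocated class (hypothetical values; consume(...) stands in for any downstream consumer). Note how writeString chooses between a single-pass encode into the reused scratchBytes array and a double-pass encode once the worst-case UTF-8 length of the string exceeds 64 KB:

    import org.apache.lucene.store.GrowableByteArrayDataOutput;

    GrowableByteArrayDataOutput buffer = new GrowableByteArrayDataOutput(256);
    buffer.writeVInt(42);              // grows the backing array on demand
    buffer.writeString("point data");  // small string: single-pass encode via scratchBytes
    // Hand off the accumulated bytes in one shot, then recycle the buffer:
    consume(buffer.getBytes(), 0, buffer.getPosition());
    buffer.reset();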
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index c82a0c8..9657578 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -30,7 +30,9 @@ import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
@@ -478,8 +480,8 @@ public class BKDWriter implements Closeable {
}
build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out,
- minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
- new int[maxPointsInLeafNode]);
+ minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
+ new int[maxPointsInLeafNode]);
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues);
@@ -556,6 +558,9 @@ public class BKDWriter implements Closeable {
return oneDimWriter.finish();
}
+ // reused when writing leaf blocks
+ private final GrowableByteArrayDataOutput scratchOut = new GrowableByteArrayDataOutput(32*1024);
+
private class OneDimensionBKDWriter {
final IndexOutput out;
@@ -563,8 +568,8 @@ public class BKDWriter implements Closeable {
final List<byte[]> leafBlockStartValues = new ArrayList<>();
final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength];
final int[] leafDocs = new int[maxPointsInLeafNode];
- long valueCount;
- int leafCount;
+ private long valueCount;
+ private int leafCount;
OneDimensionBKDWriter(IndexOutput out) {
if (numDims != 1) {
@@ -589,7 +594,7 @@ public class BKDWriter implements Closeable {
// for asserts
final byte[] lastPackedValue;
- int lastDocID;
+ private int lastDocID;
void add(byte[] packedValue, int docID) throws IOException {
assert valueInOrder(valueCount + leafCount,
@@ -606,8 +611,7 @@ public class BKDWriter implements Closeable {
if (leafCount == maxPointsInLeafNode) {
// We write a block once we hit exactly the max count ... this is different from
- // when we flush a new segment, where we write between max/2 and max per leaf block,
- // so merged segments will behave differently from newly flushed segments:
+ // when we write N > 1 dimensional points where we write between max/2 and max per leaf block
writeLeafBlock();
leafCount = 0;
}
@@ -644,7 +648,6 @@ public class BKDWriter implements Closeable {
}
private void writeLeafBlock() throws IOException {
- //System.out.println("writeLeafBlock pos=" + out.getFilePointer());
assert leafCount != 0;
if (valueCount == 0) {
System.arraycopy(leafValues, 0, minPackedValue, 0, packedBytesLength);
@@ -660,42 +663,39 @@ public class BKDWriter implements Closeable {
leafBlockFPs.add(out.getFilePointer());
checkMaxLeafNodeCount(leafBlockFPs.size());
- Arrays.fill(commonPrefixLengths, bytesPerDim);
// Find per-dim common prefix:
- for(int dim=0;dim<numDims;dim++) {
- int offset1 = dim * bytesPerDim;
- int offset2 = (leafCount - 1) * packedBytesLength + offset1;
- for(int j=0;j<commonPrefixLengths[dim];j++) {
- if (leafValues[offset1+j] != leafValues[offset2+j]) {
- commonPrefixLengths[dim] = j;
- break;
- }
+ int prefix = bytesPerDim;
+ int offset = (leafCount - 1) * packedBytesLength;
+ for(int j=0;j<bytesPerDim;j++) {
+ if (leafValues[j] != leafValues[offset+j]) {
+ prefix = j;
+ break;
}
}
- writeLeafBlockDocs(out, leafDocs, 0, leafCount);
- writeCommonPrefixes(out, commonPrefixLengths, leafValues);
+ commonPrefixLengths[0] = prefix;
- final IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
- final BytesRef scratch = new BytesRef();
-
- {
- scratch.length = packedBytesLength;
- scratch.bytes = leafValues;
- }
+ assert scratchOut.getPosition() == 0;
+ writeLeafBlockDocs(scratchOut, leafDocs, 0, leafCount);
+ writeCommonPrefixes(scratchOut, commonPrefixLengths, leafValues);
+ scratchBytesRef1.length = packedBytesLength;
+ scratchBytesRef1.bytes = leafValues;
+
+ final IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
@Override
public BytesRef apply(int i) {
- scratch.offset = packedBytesLength * i;
- return scratch;
+ scratchBytesRef1.offset = packedBytesLength * i;
+ return scratchBytesRef1;
}
};
assert valuesInOrderAndBounds(leafCount, 0, Arrays.copyOf(leafValues, packedBytesLength),
Arrays.copyOfRange(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
packedValues, leafDocs, 0);
- writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues);
+ writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues);
+ out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
+ scratchOut.reset();
}
-
}
// TODO: there must be a simpler way?
@@ -1259,13 +1259,13 @@ public class BKDWriter implements Closeable {
out.writeBytes(packedIndex, 0, packedIndex.length);
}
- private void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
+ private void writeLeafBlockDocs(DataOutput out, int[] docIDs, int start, int count) throws IOException {
assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode;
out.writeVInt(count);
DocIdsWriter.writeDocIds(docIDs, start, count, out);
}
- private void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
+ private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
if (prefixLenSum == packedBytesLength) {
// all values in this block are equal
@@ -1290,7 +1290,7 @@ public class BKDWriter implements Closeable {
}
}
- private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
+ private void writeLeafBlockPackedValuesRange(DataOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
for (int i = start; i < end; ++i) {
BytesRef ref = packedValues.apply(i);
assert ref.length == packedBytesLength;
@@ -1316,7 +1316,7 @@ public class BKDWriter implements Closeable {
return end - start;
}
- private void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
+ private void writeCommonPrefixes(DataOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
for(int dim=0;dim<numDims;dim++) {
out.writeVInt(commonPrefixes[dim]);
//System.out.println(commonPrefixes[dim] + " of " + bytesPerDim);
@@ -1449,7 +1449,8 @@ public class BKDWriter implements Closeable {
}
}
- /* Recursively reorders the provided reader and writes the bkd-tree on the fly. */
+ /* Recursively reorders the provided reader and writes the bkd-tree on the fly; this method is used
+ * when we are writing a new segment directly from IndexWriter's indexing buffer (MutablePointValues). */
private void build(int nodeID, int leafNodeOffset,
MutablePointValues reader, int from, int to,
IndexOutput out,
@@ -1513,18 +1514,20 @@ public class BKDWriter implements Closeable {
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
+ assert scratchOut.getPosition() == 0;
+
// Write doc IDs
int[] docIDs = spareDocIds;
for (int i = from; i < to; ++i) {
docIDs[i - from] = reader.getDocID(i);
}
//System.out.println("writeLeafBlock pos=" + out.getFilePointer());
- writeLeafBlockDocs(out, docIDs, 0, count);
+ writeLeafBlockDocs(scratchOut, docIDs, 0, count);
// Write the common prefixes:
reader.getValue(from, scratchBytesRef1);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
- writeCommonPrefixes(out, commonPrefixLengths, scratch1);
+ writeCommonPrefixes(scratchOut, commonPrefixLengths, scratch1);
// Write the full values:
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
@@ -1536,7 +1539,10 @@ public class BKDWriter implements Closeable {
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
docIDs, 0);
- writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
+ writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues);
+
+ out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
+ scratchOut.reset();
} else {
// inner node
@@ -1577,7 +1583,8 @@ public class BKDWriter implements Closeable {
}
}
- /** The array (sized numDims) of PathSlice describe the cell we have currently recursed to. */
+ /** The array (sized numDims) of PathSlice describes the cell we have currently recursed to.
+  * This method is used when we are merging previously written segments, in the numDims > 1 case. */
private void build(int nodeID, int leafNodeOffset,
PathSlice[] slices,
LongBitSet ordBitSet,
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
index 9dce5a8..d76c6c7 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
@@ -19,14 +19,14 @@ package org.apache.lucene.util.bkd;
import java.io.IOException;
import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
class DocIdsWriter {
private DocIdsWriter() {}
- static void writeDocIds(int[] docIds, int start, int count, IndexOutput out) throws IOException {
+ static void writeDocIds(int[] docIds, int start, int count, DataOutput out) throws IOException {
// docs can be sorted either when all docs in a block have the same value
// or when a segment is sorted
boolean sorted = true;
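The signature changes above (IndexOutput widened to DataOutput) are what allow the leaf-block helpers to target either destination, since GrowableByteArrayDataOutput and IndexOutput are both DataOutputs. A hypothetical call site, assuming a heap buffer named scratchOut and an index file named indexOut:

    // Same helper, two destinations:
    writeLeafBlockDocs(scratchOut, docIDs, 0, count); // buffered in heap first
    writeLeafBlockDocs(indexOut, docIDs, 0, count);   // or straight to the index file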
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
deleted file mode 100644
index 37a7e4c..0000000
--- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.compressing;
-
-
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.UnicodeUtil;
-import org.junit.Test;
-
-/**
- * Test for {@link GrowableByteArrayDataOutput}
- */
-public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
-
- @Test
- public void testWriteSmallStrings() throws Exception {
- int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
-
- // a simple string encoding test
- int num = atLeast(1000);
- for (int i = 0; i < num; i++) {
- // create a small string such that the single pass approach is used
- int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
- String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
- byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
- int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
-
- GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
- //explicitly write utf8 len so that we know how many bytes it occupies
- dataOutput.writeVInt(len);
- int vintLen = dataOutput.length;
- // now write the string which will internally write number of bytes as a vint and then utf8 bytes
- dataOutput.writeString(unicode);
-
- assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
- for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
- assertEquals(utf8[j], dataOutput.bytes[k]);
- }
- }
- }
-
- @Test
- public void testWriteLargeStrings() throws Exception {
- int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
-
- int num = atLeast(100);
- for (int i = 0; i < num; i++) {
- String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
- byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
- int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
-
- GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
- //explicitly write utf8 len so that we know how many bytes it occupies
- dataOutput.writeVInt(len);
- int vintLen = dataOutput.length;
- // now write the string which will internally write number of bytes as a vint and then utf8 bytes
- dataOutput.writeString(unicode);
-
- assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
- for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
- assertEquals(utf8[j], dataOutput.bytes[k]);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b97d9d74/lucene/core/src/test/org/apache/lucene/store/TestGrowableByteArrayDataOutput.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/store/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/store/TestGrowableByteArrayDataOutput.java
new file mode 100644
index 0000000..10992b7
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/store/TestGrowableByteArrayDataOutput.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.store;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.UnicodeUtil;
+import org.junit.Test;
+
+/**
+ * Test for {@link GrowableByteArrayDataOutput}
+ */
+public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
+
+ @Test
+ public void testWriteSmallStrings() throws Exception {
+ int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+ // a simple string encoding test
+ int num = atLeast(1000);
+ for (int i = 0; i < num; i++) {
+ // create a small string such that the single pass approach is used
+ int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
+ String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
+ byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
+ int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+ GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+ //explicitly write utf8 len so that we know how many bytes it occupies
+ dataOutput.writeVInt(len);
+ int vintLen = dataOutput.getPosition();
+ // now write the string which will internally write number of bytes as a vint and then utf8 bytes
+ dataOutput.writeString(unicode);
+
+ assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.getPosition());
+ for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+ assertEquals(utf8[j], dataOutput.getBytes()[k]);
+ }
+ }
+ }
+
+ @Test
+ public void testWriteLargeStrings() throws Exception {
+ int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+ int num = atLeast(100);
+ for (int i = 0; i < num; i++) {
+ String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
+ byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())];
+ int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+ GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+ //explicitly write utf8 len so that we know how many bytes it occupies
+ dataOutput.writeVInt(len);
+ int vintLen = dataOutput.getPosition();
+ // now write the string which will internally write number of bytes as a vint and then utf8 bytes
+ dataOutput.writeString(unicode);
+
+ assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.getPosition());
+ for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+ assertEquals(utf8[j], dataOutput.getBytes()[k]);
+ }
+ }
+ }
+}