You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/08/03 12:48:50 UTC
[1/4] lucene-solr:master: LUCENE-7403: Use blocks of exactly
maxPointsInLeafNodes values in the 1D case.
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 72750167a -> 33197ec98
refs/heads/master a07425a4e -> 9fc462485
LUCENE-7403: Use blocks of exactly maxPointsInLeafNodes values in the 1D case.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/234ea3ef
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/234ea3ef
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/234ea3ef
Branch: refs/heads/master
Commit: 234ea3ef8954325923f4e85c5c0aa72c3bb15baa
Parents: a07425a
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Aug 3 14:33:18 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Aug 3 14:34:06 2016 +0200
----------------------------------------------------------------------
.../core/src/java/org/apache/lucene/util/bkd/BKDWriter.java | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/234ea3ef/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 8bd66d0..e11fc08 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -562,11 +562,10 @@ public class BKDWriter implements Closeable {
private class OneDimensionBKDWriter {
final IndexOutput out;
- final int pointsPerLeafBlock = (int) (0.75 * maxPointsInLeafNode);
final List<Long> leafBlockFPs = new ArrayList<>();
final List<byte[]> leafBlockStartValues = new ArrayList<>();
- final byte[] leafValues = new byte[pointsPerLeafBlock * packedBytesLength];
- final int[] leafDocs = new int[pointsPerLeafBlock];
+ final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength];
+ final int[] leafDocs = new int[maxPointsInLeafNode];
long valueCount;
int leafCount;
@@ -608,7 +607,7 @@ public class BKDWriter implements Closeable {
throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
}
- if (leafCount == pointsPerLeafBlock) {
+ if (leafCount == maxPointsInLeafNode) {
// We write a block once we hit exactly the max count ... this is different from
// when we flush a new segment, where we write between max/2 and max per leaf block,
// so merged segments will behave differently from newly flushed segments:
[3/4] lucene-solr:branch_6x: LUCENE-7403: Use blocks of exactly
maxPointsInLeafNodes values in the 1D case.
Posted by jp...@apache.org.
LUCENE-7403: Use blocks of exactly maxPointsInLeafNodes values in the 1D case.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/e360afe7
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/e360afe7
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/e360afe7
Branch: refs/heads/branch_6x
Commit: e360afe70ce99d0d0b7c23cec0439f8773618de4
Parents: 7275016
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Aug 3 14:33:18 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Aug 3 14:48:22 2016 +0200
----------------------------------------------------------------------
.../core/src/java/org/apache/lucene/util/bkd/BKDWriter.java | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e360afe7/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 8bd66d0..e11fc08 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -562,11 +562,10 @@ public class BKDWriter implements Closeable {
private class OneDimensionBKDWriter {
final IndexOutput out;
- final int pointsPerLeafBlock = (int) (0.75 * maxPointsInLeafNode);
final List<Long> leafBlockFPs = new ArrayList<>();
final List<byte[]> leafBlockStartValues = new ArrayList<>();
- final byte[] leafValues = new byte[pointsPerLeafBlock * packedBytesLength];
- final int[] leafDocs = new int[pointsPerLeafBlock];
+ final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength];
+ final int[] leafDocs = new int[maxPointsInLeafNode];
long valueCount;
int leafCount;
@@ -608,7 +607,7 @@ public class BKDWriter implements Closeable {
throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + pointCount + " values");
}
- if (leafCount == pointsPerLeafBlock) {
+ if (leafCount == maxPointsInLeafNode) {
// We write a block once we hit exactly the max count ... this is different from
// when we flush a new segment, where we write between max/2 and max per leaf block,
// so merged segments will behave differently from newly flushed segments:
[2/4] lucene-solr:master: LUCENE-7399: Speed up flush of points.
Posted by jp...@apache.org.
LUCENE-7399: Speed up flush of points.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/9fc46248
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/9fc46248
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/9fc46248
Branch: refs/heads/master
Commit: 9fc4624853d170b5982a0d0c9d9e76a477a2b713
Parents: 234ea3e
Author: Adrien Grand <jp...@gmail.com>
Authored: Fri Jul 29 17:32:29 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Aug 3 14:35:48 2016 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +-
.../lucene/codecs/MutablePointsReader.java | 6 +-
.../apache/lucene/index/PointValuesWriter.java | 14 ++-
.../org/apache/lucene/util/ByteBlockPool.java | 21 ++++
.../apache/lucene/util/InPlaceMergeSorter.java | 4 +-
.../org/apache/lucene/util/IntroSelector.java | 2 +
.../org/apache/lucene/util/IntroSorter.java | 13 ++-
.../org/apache/lucene/util/MSBRadixSorter.java | 109 +++++++++++++++----
.../org/apache/lucene/util/RadixSelector.java | 94 ++++++++++++++--
.../src/java/org/apache/lucene/util/Sorter.java | 59 +++++-----
.../org/apache/lucene/util/bkd/BKDWriter.java | 49 ++++-----
.../util/bkd/MutablePointsReaderUtils.java | 19 ++--
.../apache/lucene/util/TestByteBlockPool.java | 25 ++++-
.../apache/lucene/util/TestMSBRadixSorter.java | 3 +
.../apache/lucene/util/TestRadixSelector.java | 31 +++++-
.../util/bkd/TestMutablePointsReaderUtils.java | 49 ++++++---
16 files changed, 368 insertions(+), 133 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 45bc830..e83e6cb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -156,7 +156,8 @@ Optimizations
* LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.
(Adrien Grand)
-* LUCENE-7396: Faster flush of points. (Adrien Grand, Mike McCandless)
+* LUCENE-7396, LUCENE-7399: Faster flush of points.
+ (Adrien Grand, Mike McCandless)
Other
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java
index b6c328b..dccca26 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java
@@ -16,6 +16,8 @@
*/
package org.apache.lucene.codecs;
+import org.apache.lucene.util.BytesRef;
+
/** {@link PointsReader} whose order of points can be changed.
* This class is useful for codecs to optimize flush.
* @lucene.internal */
@@ -24,8 +26,8 @@ public abstract class MutablePointsReader extends PointsReader {
/** Sole constructor. */
protected MutablePointsReader() {}
- /** Fill {@code packedValue} with the packed bytes of the i-th value. */
- public abstract void getValue(int i, byte[] packedValue);
+ /** Set {@code packedValue} with a reference to the packed bytes of the i-th value. */
+ public abstract void getValue(int i, BytesRef packedValue);
/** Get the k-th byte of the i-th value. */
public abstract byte getByteAt(int i, int k);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
index dcc7600..daf1f33 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
@@ -51,9 +51,10 @@ class PointValuesWriter {
if (value == null) {
throw new IllegalArgumentException("field=" + fieldInfo.name + ": point value must not be null");
}
- if (value.length != fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()) {
+ if (value.length != packedBytesLength) {
throw new IllegalArgumentException("field=" + fieldInfo.name + ": this field's value has length=" + value.length + " but should be " + (fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()));
}
+
if (docIDs.length == numPoints) {
docIDs = ArrayUtil.grow(docIDs, numPoints+1);
iwBytesUsed.addAndGet((docIDs.length - numPoints) * Integer.BYTES);
@@ -64,6 +65,7 @@ class PointValuesWriter {
numDocs++;
lastDocID = docID;
}
+
numPoints++;
}
@@ -82,9 +84,12 @@ class PointValuesWriter {
if (fieldName.equals(fieldInfo.name) == false) {
throw new IllegalArgumentException("fieldName must be the same");
}
+ final BytesRef scratch = new BytesRef();
final byte[] packedValue = new byte[packedBytesLength];
for(int i=0;i<numPoints;i++) {
- getValue(i, packedValue);
+ getValue(i, scratch);
+ assert scratch.length == packedValue.length;
+ System.arraycopy(scratch.bytes, scratch.offset, packedValue, 0, packedBytesLength);
visitor.visit(getDocID(i), packedValue);
}
}
@@ -152,9 +157,10 @@ class PointValuesWriter {
}
@Override
- public void getValue(int i, byte[] packedValue) {
+ public void getValue(int i, BytesRef packedValue) {
final long offset = (long) packedBytesLength * ords[i];
- bytes.readBytes(offset, packedValue, 0, packedBytesLength);
+ packedValue.length = packedBytesLength;
+ bytes.setRawBytesRef(packedValue, offset);
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
index e202d63..e575d87 100644
--- a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
+++ b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
@@ -379,6 +379,27 @@ public final class ByteBlockPool {
} while (true);
}
+ /**
+ * Set the given {@link BytesRef} so that its content is equal to the
+ * {@code ref.length} bytes starting at {@code offset}. Most of the time this
+ * method will set pointers to internal data-structures. However, in case a
+ * value crosses a boundary, a fresh copy will be returned.
+ * On the contrary to {@link #setBytesRef(BytesRef, int)}, this does not
+ * expect the length to be encoded with the data.
+ */
+ public void setRawBytesRef(BytesRef ref, final long offset) {
+ int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);
+ int pos = (int) (offset & BYTE_BLOCK_MASK);
+ if (pos + ref.length <= BYTE_BLOCK_SIZE) {
+ ref.bytes = buffers[bufferIndex];
+ ref.offset = pos;
+ } else {
+ ref.bytes = new byte[ref.length];
+ ref.offset = 0;
+ readBytes(offset, ref.bytes, 0, ref.length);
+ }
+ }
+
/** Read a single byte at the given {@code offset}. */
public byte readByte(long offset) {
int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java b/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java
index 1afba99..de8f976 100644
--- a/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java
@@ -33,8 +33,8 @@ public abstract class InPlaceMergeSorter extends Sorter {
}
void mergeSort(int from, int to) {
- if (to - from < INSERTION_SORT_THRESHOLD) {
- insertionSort(from, to);
+ if (to - from < BINARY_SORT_THRESHOLD) {
+ binarySort(from, to);
} else {
final int mid = (from + to) >>> 1;
mergeSort(from, mid);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
index 7898535..2984801 100644
--- a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
+++ b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
@@ -34,6 +34,8 @@ public abstract class IntroSelector extends Selector {
}
// heap sort
+ // TODO: use median of median instead to have linear worst-case rather than
+ // n*log(n)
void slowSelect(int from, int to, int k) {
new Sorter() {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java b/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
index 83d118d..79a39bf 100644
--- a/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
@@ -37,8 +37,8 @@ public abstract class IntroSorter extends Sorter {
}
void quicksort(int from, int to, int maxDepth) {
- if (to - from < INSERTION_SORT_THRESHOLD) {
- insertionSort(from, to);
+ if (to - from < BINARY_SORT_THRESHOLD) {
+ binarySort(from, to);
return;
} else if (--maxDepth < 0) {
heapSort(from, to);
@@ -83,12 +83,13 @@ public abstract class IntroSorter extends Sorter {
quicksort(left + 1, to, maxDepth);
}
- /** Save the value at slot <code>i</code> so that it can later be used as a
- * pivot, see {@link #comparePivot(int)}. */
+ // Don't rely on the slow default impl of setPivot/comparePivot since
+ // quicksort relies on these methods to be fast for good performance
+
+ @Override
protected abstract void setPivot(int i);
- /** Compare the pivot with the slot at <code>j</code>, similarly to
- * {@link #compare(int, int) compare(i, j)}. */
+ @Override
protected abstract int comparePivot(int j);
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java b/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
index 33f20b6..f0e2a61 100644
--- a/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
@@ -38,6 +38,7 @@ public abstract class MSBRadixSorter extends Sorter {
// we store one histogram per recursion level
private final int[][] histograms = new int[LEVEL_THRESHOLD][];
private final int[] endOffsets = new int[HISTOGRAM_SIZE];
+ private final int[] commonPrefix;
private final int maxLength;
@@ -47,6 +48,7 @@ public abstract class MSBRadixSorter extends Sorter {
*/
protected MSBRadixSorter(int maxLength) {
this.maxLength = maxLength;
+ this.commonPrefix = new int[Math.min(24, maxLength)];
}
/** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
@@ -116,14 +118,14 @@ public abstract class MSBRadixSorter extends Sorter {
@Override
public void sort(int from, int to) {
checkRange(from, to);
- sort(from, to, 0);
+ sort(from, to, 0, 0);
}
- private void sort(int from, int to, int k) {
- if (to - from <= LENGTH_THRESHOLD || k >= LEVEL_THRESHOLD) {
+ private void sort(int from, int to, int k, int l) {
+ if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
introSort(from, to, k);
} else {
- radixSort(from, to, k);
+ radixSort(from, to, k, l);
}
}
@@ -131,28 +133,30 @@ public abstract class MSBRadixSorter extends Sorter {
getFallbackSorter(k).sort(from, to);
}
- private void radixSort(int from, int to, int k) {
- int[] histogram = histograms[k];
+ /**
+ * @param k the character number to compare
+ * @param l the level of recursion
+ */
+ private void radixSort(int from, int to, int k, int l) {
+ int[] histogram = histograms[l];
if (histogram == null) {
- histogram = histograms[k] = new int[HISTOGRAM_SIZE];
+ histogram = histograms[l] = new int[HISTOGRAM_SIZE];
} else {
Arrays.fill(histogram, 0);
}
- buildHistogram(from, to, k, histogram);
-
- // short-circuit: if all keys have the same byte at offset k, then recurse directly
- for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
- if (histogram[i] == to - from) {
- // everything is in the same bucket, recurse
- if (i > 0) {
- sort(from, to, k + 1);
- }
- return;
- } else if (histogram[i] != 0) {
- break;
+ final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, k, histogram);
+ if (commonPrefixLength > 0) {
+ // if there are no more chars to compare or if all entries fell into the
+ // first bucket (which means strings are shorter than k) then we are done
+ // otherwise recurse
+ if (k + commonPrefixLength < maxLength
+ && histogram[0] < to - from) {
+ radixSort(from, to, k + commonPrefixLength, l);
}
+ return;
}
+ assert assertHistogram(commonPrefixLength, histogram);
int[] startOffsets = histogram;
int[] endOffsets = this.endOffsets;
@@ -167,24 +171,83 @@ public abstract class MSBRadixSorter extends Sorter {
int h = endOffsets[i];
final int bucketLen = h - prev;
if (bucketLen > 1) {
- sort(from + prev, from + h, k + 1);
+ sort(from + prev, from + h, k + 1, l + 1);
}
prev = h;
}
}
}
+ // only used from assert
+ private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
+ int numberOfUniqueBytes = 0;
+ for (int freq : histogram) {
+ if (freq > 0) {
+ numberOfUniqueBytes++;
+ }
+ }
+ if (numberOfUniqueBytes == 1) {
+ assert commonPrefixLength >= 1;
+ } else {
+ assert commonPrefixLength == 0 : commonPrefixLength;
+ }
+ return true;
+ }
+
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
private int getBucket(int i, int k) {
return byteAt(i, k) + 1;
}
- /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}. */
- private int[] buildHistogram(int from, int to, int k, int[] histogram) {
+ /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
+ * and return a common prefix length for all visited values.
+ * @see #buildHistogram */
+ private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
+ final int[] commonPrefix = this.commonPrefix;
+ int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
+ for (int j = 0; j < commonPrefixLength; ++j) {
+ final int b = byteAt(from, k + j);
+ commonPrefix[j] = b;
+ if (b == -1) {
+ commonPrefixLength = j + 1;
+ break;
+ }
+ }
+
+ int i;
+ outer: for (i = from + 1; i < to; ++i) {
+ for (int j = 0; j < commonPrefixLength; ++j) {
+ final int b = byteAt(i, k + j);
+ if (b != commonPrefix[j]) {
+ commonPrefixLength = j;
+ if (commonPrefixLength == 0) { // we have no common prefix
+ histogram[commonPrefix[0] + 1] = i - from;
+ histogram[b + 1] = 1;
+ break outer;
+ }
+ break;
+ }
+ }
+ }
+
+ if (i < to) {
+ // the loop got broken because there is no common prefix
+ assert commonPrefixLength == 0;
+ buildHistogram(i + 1, to, k, histogram);
+ } else {
+ assert commonPrefixLength > 0;
+ histogram[commonPrefix[0] + 1] = to - from;
+ }
+
+ return commonPrefixLength;
+ }
+
+ /** Build an histogram of the k-th characters of values occurring between
+ * offsets {@code from} and {@code to}, using {@link #getBucket}. */
+ private void buildHistogram(int from, int to, int k, int[] histogram) {
for (int i = from; i < to; ++i) {
histogram[getBucket(i, k)]++;
}
- return histogram;
}
/** Accumulate values of the histogram so that it does not store counts but
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java b/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java
index 543204a..c9b6133 100644
--- a/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java
+++ b/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java
@@ -36,6 +36,7 @@ public abstract class RadixSelector extends Selector {
// we store one histogram per recursion level
private final int[] histogram = new int[HISTOGRAM_SIZE];
+ private final int[] commonPrefix;
private final int maxLength;
@@ -45,6 +46,7 @@ public abstract class RadixSelector extends Selector {
*/
protected RadixSelector(int maxLength) {
this.maxLength = maxLength;
+ this.commonPrefix = new int[Math.min(24, maxLength)];
}
/** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
@@ -112,22 +114,37 @@ public abstract class RadixSelector extends Selector {
@Override
public void select(int from, int to, int k) {
checkArgs(from, to, k);
- select(from, to, k, 0);
+ select(from, to, k, 0, 0);
}
- private void select(int from, int to, int k, int d) {
+ private void select(int from, int to, int k, int d, int l) {
if (to - from <= LENGTH_THRESHOLD || d >= LEVEL_THRESHOLD) {
getFallbackSelector(d).select(from, to, k);
} else {
- radixSelect(from, to, k, d);
+ radixSelect(from, to, k, d, l);
}
}
- private void radixSelect(int from, int to, int k, int d) {
+ /**
+ * @param d the character number to compare
+ * @param l the level of recursion
+ */
+ private void radixSelect(int from, int to, int k, int d, int l) {
final int[] histogram = this.histogram;
Arrays.fill(histogram, 0);
- buildHistogram(from, to, d, histogram);
+ final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, d, histogram);
+ if (commonPrefixLength > 0) {
+ // if there are no more chars to compare or if all entries fell into the
+ // first bucket (which means strings are shorter than d) then we are done
+ // otherwise recurse
+ if (d + commonPrefixLength < maxLength
+ && histogram[0] < to - from) {
+ radixSelect(from, to, k, d + commonPrefixLength, l);
+ }
+ return;
+ }
+ assert assertHistogram(commonPrefixLength, histogram);
int bucketFrom = from;
for (int bucket = 0; bucket < HISTOGRAM_SIZE; ++bucket) {
@@ -138,7 +155,7 @@ public abstract class RadixSelector extends Selector {
if (bucket != 0 && d + 1 < maxLength) {
// all elements in bucket 0 are equal so we only need to recurse if bucket != 0
- select(bucketFrom, bucketTo, k, d + 1);
+ select(bucketFrom, bucketTo, k, d + 1, l + 1);
}
return;
}
@@ -147,17 +164,76 @@ public abstract class RadixSelector extends Selector {
throw new AssertionError("Unreachable code");
}
+ // only used from assert
+ private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
+ int numberOfUniqueBytes = 0;
+ for (int freq : histogram) {
+ if (freq > 0) {
+ numberOfUniqueBytes++;
+ }
+ }
+ if (numberOfUniqueBytes == 1) {
+ assert commonPrefixLength >= 1;
+ } else {
+ assert commonPrefixLength == 0;
+ }
+ return true;
+ }
+
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
private int getBucket(int i, int k) {
return byteAt(i, k) + 1;
}
- /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}. */
- private int[] buildHistogram(int from, int to, int k, int[] histogram) {
+ /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
+ * and return a common prefix length for all visited values.
+ * @see #buildHistogram */
+ private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
+ final int[] commonPrefix = this.commonPrefix;
+ int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
+ for (int j = 0; j < commonPrefixLength; ++j) {
+ final int b = byteAt(from, k + j);
+ commonPrefix[j] = b;
+ if (b == -1) {
+ commonPrefixLength = j + 1;
+ break;
+ }
+ }
+
+ int i;
+ outer: for (i = from + 1; i < to; ++i) {
+ for (int j = 0; j < commonPrefixLength; ++j) {
+ final int b = byteAt(i, k + j);
+ if (b != commonPrefix[j]) {
+ commonPrefixLength = j;
+ if (commonPrefixLength == 0) { // we have no common prefix
+ histogram[commonPrefix[0] + 1] = i - from;
+ histogram[b + 1] = 1;
+ break outer;
+ }
+ break;
+ }
+ }
+ }
+
+ if (i < to) {
+ // the loop got broken because there is no common prefix
+ assert commonPrefixLength == 0;
+ buildHistogram(i + 1, to, k, histogram);
+ } else {
+ assert commonPrefixLength > 0;
+ histogram[commonPrefix[0] + 1] = to - from;
+ }
+
+ return commonPrefixLength;
+ }
+
+ /** Build an histogram of the k-th characters of values occurring between
+ * offsets {@code from} and {@code to}, using {@link #getBucket}. */
+ private void buildHistogram(int from, int to, int k, int[] histogram) {
for (int i = from; i < to; ++i) {
histogram[getBucket(i, k)]++;
}
- return histogram;
}
/** Reorder elements so that all of them that fall into {@code bucket} are
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/Sorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/Sorter.java b/lucene/core/src/java/org/apache/lucene/util/Sorter.java
index 0ac954b..32d27cd 100644
--- a/lucene/core/src/java/org/apache/lucene/util/Sorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/Sorter.java
@@ -23,7 +23,7 @@ import java.util.Comparator;
* @lucene.internal */
public abstract class Sorter {
- static final int INSERTION_SORT_THRESHOLD = 20;
+ static final int BINARY_SORT_THRESHOLD = 20;
/** Sole constructor, used for inheritance. */
protected Sorter() {}
@@ -36,6 +36,20 @@ public abstract class Sorter {
/** Swap values at slots <code>i</code> and <code>j</code>. */
protected abstract void swap(int i, int j);
+ private int pivotIndex;
+
+ /** Save the value at slot <code>i</code> so that it can later be used as a
+ * pivot, see {@link #comparePivot(int)}. */
+ protected void setPivot(int i) {
+ pivotIndex = i;
+ }
+
+ /** Compare the pivot with the slot at <code>j</code>, similarly to
+ * {@link #compare(int, int) compare(i, j)}. */
+ protected int comparePivot(int j) {
+ return compare(pivotIndex, j);
+ }
+
/** Sort the slice which starts at <code>from</code> (inclusive) and ends at
* <code>to</code> (exclusive). */
public abstract void sort(int from, int to);
@@ -163,54 +177,41 @@ public abstract class Sorter {
}
}
- void insertionSort(int from, int to) {
- for (int i = from + 1; i < to; ++i) {
- for (int j = i; j > from; --j) {
- if (compare(j - 1, j) > 0) {
- swap(j - 1, j);
- } else {
- break;
- }
- }
- }
- }
-
+ /**
+ * A binary sort implementation. This performs {@code O(n*log(n))} comparisons
+ * and {@code O(n^2)} swaps. It is typically used by more sophisticated
+ * implementations as a fall-back when the numbers of items to sort has become
+ * less than {@value #BINARY_SORT_THRESHOLD}.
+ */
void binarySort(int from, int to) {
binarySort(from, to, from + 1);
}
void binarySort(int from, int to, int i) {
for ( ; i < to; ++i) {
+ setPivot(i);
int l = from;
int h = i - 1;
while (l <= h) {
final int mid = (l + h) >>> 1;
- final int cmp = compare(i, mid);
+ final int cmp = comparePivot(mid);
if (cmp < 0) {
h = mid - 1;
} else {
l = mid + 1;
}
}
- switch (i - l) {
- case 2:
- swap(l + 1, l + 2);
- swap(l, l + 1);
- break;
- case 1:
- swap(l, l + 1);
- break;
- case 0:
- break;
- default:
- for (int j = i; j > l; --j) {
- swap(j - 1, j);
- }
- break;
+ for (int j = i; j > l; --j) {
+ swap(j - 1, j);
}
}
}
+ /**
+ * Use heap sort to sort items between {@code from} inclusive and {@code to}
+ * exclusive. This runs in {@code O(n*log(n))} and is used as a fall-back by
+ * {@link IntroSorter}.
+ */
void heapSort(int from, int to) {
if (to - from <= 1) {
return;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index e11fc08..88a84e9 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -112,7 +112,8 @@ public class BKDWriter implements Closeable {
final byte[] scratchDiff;
final byte[] scratch1;
final byte[] scratch2;
- final BytesRef scratchBytesRef = new BytesRef();
+ final BytesRef scratchBytesRef1 = new BytesRef();
+ final BytesRef scratchBytesRef2 = new BytesRef();
final int[] commonPrefixLengths;
protected final FixedBitSet docsSeen;
@@ -174,7 +175,6 @@ public class BKDWriter implements Closeable {
packedBytesLength = numDims * bytesPerDim;
scratchDiff = new byte[bytesPerDim];
- scratchBytesRef.length = packedBytesLength;
scratch1 = new byte[packedBytesLength];
scratch2 = new byte[packedBytesLength];
commonPrefixLengths = new int[numDims];
@@ -465,14 +465,14 @@ public class BKDWriter implements Closeable {
Arrays.fill(minPackedValue, (byte) 0xff);
Arrays.fill(maxPackedValue, (byte) 0);
for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
- reader.getValue(i, scratch1);
+ reader.getValue(i, scratchBytesRef1);
for(int dim=0;dim<numDims;dim++) {
int offset = dim*bytesPerDim;
- if (StringHelper.compare(bytesPerDim, scratch1, offset, minPackedValue, offset) < 0) {
- System.arraycopy(scratch1, offset, minPackedValue, offset, bytesPerDim);
+ if (StringHelper.compare(bytesPerDim, scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset) < 0) {
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset, bytesPerDim);
}
- if (StringHelper.compare(bytesPerDim, scratch1, offset, maxPackedValue, offset) > 0) {
- System.arraycopy(scratch1, offset, maxPackedValue, offset, bytesPerDim);
+ if (StringHelper.compare(bytesPerDim, scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset) > 0) {
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, bytesPerDim);
}
}
@@ -811,6 +811,7 @@ public class BKDWriter implements Closeable {
}
private PointWriter sort(int dim) throws IOException {
+ assert dim >= 0 && dim < numDims;
if (heapPointWriter != null) {
@@ -1251,13 +1252,13 @@ public class BKDWriter implements Closeable {
// Compute common prefixes
Arrays.fill(commonPrefixLengths, bytesPerDim);
- reader.getValue(from, scratch1);
+ reader.getValue(from, scratchBytesRef1);
for (int i = from + 1; i < to; ++i) {
- reader.getValue(i, scratch2);
+ reader.getValue(i, scratchBytesRef2);
for (int dim=0;dim<numDims;dim++) {
final int offset = dim * bytesPerDim;
for(int j=0;j<commonPrefixLengths[dim];j++) {
- if (scratch1[offset+j] != scratch2[offset+j]) {
+ if (scratchBytesRef1.bytes[scratchBytesRef1.offset+offset+j] != scratchBytesRef2.bytes[scratchBytesRef2.offset+offset+j]) {
commonPrefixLengths[dim] = j;
break;
}
@@ -1294,7 +1295,7 @@ public class BKDWriter implements Closeable {
// sort by sortedDim
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths,
- reader, from, to, scratch1, scratch2);
+ reader, from, to, scratchBytesRef1, scratchBytesRef2);
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
@@ -1307,22 +1308,16 @@ public class BKDWriter implements Closeable {
writeLeafBlockDocs(out, docIDs, 0, count);
// Write the common prefixes:
- reader.getValue(from, scratch1);
+ reader.getValue(from, scratchBytesRef1);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
// Write the full values:
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
- final BytesRef scratch = new BytesRef(packedBytesLength);
-
- {
- scratch.offset = 0;
- scratch.length = packedBytesLength;
- }
-
@Override
public BytesRef apply(int i) {
- reader.getValue(from + i, scratch.bytes);
- return scratch;
+ reader.getValue(from + i, scratchBytesRef1);
+ return scratchBytesRef1;
}
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
@@ -1344,18 +1339,20 @@ public class BKDWriter implements Closeable {
}
}
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
- reader, from, to, mid, scratch1, scratch2);
+ reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
// set the split value
final int address = nodeID * (1+bytesPerDim);
splitPackedValues[address] = (byte) splitDim;
- reader.getValue(mid, scratch1);
- System.arraycopy(scratch1, splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
+ reader.getValue(mid, scratchBytesRef1);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
byte[] minSplitPackedValue = Arrays.copyOf(minPackedValue, packedBytesLength);
byte[] maxSplitPackedValue = Arrays.copyOf(maxPackedValue, packedBytesLength);
- System.arraycopy(scratch1, splitDim * bytesPerDim, minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
- System.arraycopy(scratch1, splitDim * bytesPerDim, maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
+ minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
+ maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
// recurse
build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
index d499e2b..b439208 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
@@ -17,6 +17,7 @@
package org.apache.lucene.util.bkd;
import org.apache.lucene.codecs.MutablePointsReader;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntroSorter;
import org.apache.lucene.util.MSBRadixSorter;
@@ -54,8 +55,8 @@ final class MutablePointsReaderUtils {
protected org.apache.lucene.util.Sorter getFallbackSorter(int k) {
return new IntroSorter() {
- final byte[] pivot = new byte[packedBytesLength];
- final byte[] scratch = new byte[packedBytesLength];
+ final BytesRef pivot = new BytesRef();
+ final BytesRef scratch = new BytesRef();
int pivotDoc;
@Override
@@ -73,7 +74,7 @@ final class MutablePointsReaderUtils {
protected int comparePivot(int j) {
if (k < packedBytesLength) {
reader.getValue(j, scratch);
- int cmp = StringHelper.compare(packedBytesLength - k, pivot, k, scratch, k);
+ int cmp = StringHelper.compare(packedBytesLength - k, pivot.bytes, pivot.offset + k, scratch.bytes, scratch.offset + k);
if (cmp != 0) {
return cmp;
}
@@ -89,7 +90,7 @@ final class MutablePointsReaderUtils {
/** Sort points on the given dimension. */
static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
MutablePointsReader reader, int from, int to,
- byte[] scratch1, byte[] scratch2) {
+ BytesRef scratch1, BytesRef scratch2) {
// No need for a fancy radix sort here, this is called on the leaves only so
// there are not many values to sort
@@ -97,7 +98,7 @@ final class MutablePointsReaderUtils {
final int numBytesToCompare = bytesPerDim - commonPrefixLengths[sortedDim];
new IntroSorter() {
- final byte[] pivot = scratch1;
+ final BytesRef pivot = scratch1;
int pivotDoc = -1;
@Override
@@ -114,7 +115,7 @@ final class MutablePointsReaderUtils {
@Override
protected int comparePivot(int j) {
reader.getValue(j, scratch2);
- int cmp = StringHelper.compare(numBytesToCompare, pivot, offset, scratch2, offset);
+ int cmp = StringHelper.compare(numBytesToCompare, pivot.bytes, pivot.offset + offset, scratch2.bytes, scratch2.offset + offset);
if (cmp == 0) {
cmp = pivotDoc - reader.getDocID(j);
}
@@ -128,7 +129,7 @@ final class MutablePointsReaderUtils {
* equal to it. */
static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
MutablePointsReader reader, int from, int to, int mid,
- byte[] scratch1, byte[] scratch2) {
+ BytesRef scratch1, BytesRef scratch2) {
final int offset = splitDim * bytesPerDim + commonPrefixLen;
final int cmpBytes = bytesPerDim - commonPrefixLen;
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
@@ -138,7 +139,7 @@ final class MutablePointsReaderUtils {
protected Selector getFallbackSelector(int k) {
return new IntroSelector() {
- final byte[] pivot = scratch1;
+ final BytesRef pivot = scratch1;
int pivotDoc;
@Override
@@ -156,7 +157,7 @@ final class MutablePointsReaderUtils {
protected int comparePivot(int j) {
if (k < cmpBytes) {
reader.getValue(j, scratch2);
- int cmp = StringHelper.compare(cmpBytes - k, pivot, offset + k, scratch2, offset + k);
+ int cmp = StringHelper.compare(cmpBytes - k, pivot.bytes, pivot.offset + offset + k, scratch2.bytes, scratch2.offset + offset + k);
if (cmp != 0) {
return cmp;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
index 388a789..df73687 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
@@ -45,12 +45,25 @@ public class TestByteBlockPool extends LuceneTestCase {
for (BytesRef expected : list) {
ref.grow(expected.length);
ref.setLength(expected.length);
- if (random().nextBoolean()) {
- pool.readBytes(position, ref.bytes(), 0, ref.length());
- } else {
- for (int i = 0; i < ref.length(); ++i) {
- ref.setByteAt(i, pool.readByte(position + i));
- }
+ switch (random().nextInt(3)) {
+ case 0:
+ // copy bytes
+ pool.readBytes(position, ref.bytes(), 0, ref.length());
+ break;
+ case 1:
+ // copy bytes one by one
+ for (int i = 0; i < ref.length(); ++i) {
+ ref.setByteAt(i, pool.readByte(position + i));
+ }
+ break;
+ case 2:
+ BytesRef scratch = new BytesRef();
+ scratch.length = ref.length();
+ pool.setRawBytesRef(scratch, position);
+ System.arraycopy(scratch.bytes, scratch.offset, ref.bytes(), 0, ref.length());
+ break;
+ default:
+ fail();
}
assertEquals(expected, ref.get());
position += ref.length();
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java b/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java
index bc5af7f..c496ff8 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java
@@ -41,9 +41,12 @@ public class TestMSBRadixSorter extends LuceneTestCase {
break;
}
+ final int finalMaxLength = maxLength;
new MSBRadixSorter(maxLength) {
+ @Override
protected int byteAt(int i, int k) {
+ assertTrue(k < finalMaxLength);
BytesRef ref = refs[i];
if (ref.length <= k) {
return -1;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java b/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java
index e0d6bf9..3016580 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java
@@ -36,13 +36,41 @@ public class TestRadixSelector extends LuceneTestCase {
random().nextBytes(bytes);
arr[i] = new BytesRef(bytes);
}
+ doTest(arr, from, to, maxLen);
+ }
+
+ public void testSharedPrefixes() {
+ for (int iter = 0; iter < 100; ++iter) {
+ doTestSharedPrefixes();
+ }
+ }
+
+ private void doTestSharedPrefixes() {
+ final int from = random().nextInt(5);
+ final int to = from + TestUtil.nextInt(random(), 1, 10000);
+ final int maxLen = TestUtil.nextInt(random(), 1, 12);
+ BytesRef[] arr = new BytesRef[from + to + random().nextInt(5)];
+ for (int i = 0; i < arr.length; ++i) {
+ byte[] bytes = new byte[TestUtil.nextInt(random(), 0, maxLen)];
+ random().nextBytes(bytes);
+ arr[i] = new BytesRef(bytes);
+ }
+ final int sharedPrefixLength = Math.min(arr[0].length, TestUtil.nextInt(random(), 1, maxLen));
+ for (int i = 1; i < arr.length; ++i) {
+ System.arraycopy(arr[0].bytes, arr[0].offset, arr[i].bytes, arr[i].offset, Math.min(sharedPrefixLength, arr[i].length));
+ }
+ doTest(arr, from, to, maxLen);
+ }
+
+ private void doTest(BytesRef[] arr, int from, int to, int maxLen) {
final int k = TestUtil.nextInt(random(), from, to - 1);
BytesRef[] expected = arr.clone();
Arrays.sort(expected, from, to);
BytesRef[] actual = arr.clone();
- RadixSelector selector = new RadixSelector(random().nextBoolean() ? maxLen : Integer.MAX_VALUE) {
+ final int enforcedMaxLen = random().nextBoolean() ? maxLen : Integer.MAX_VALUE;
+ RadixSelector selector = new RadixSelector(enforcedMaxLen) {
@Override
protected void swap(int i, int j) {
@@ -51,6 +79,7 @@ public class TestRadixSelector extends LuceneTestCase {
@Override
protected int byteAt(int i, int k) {
+ assertTrue(k < enforcedMaxLen);
BytesRef b = actual[i];
if (k >= b.length) {
return -1;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9fc46248/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
index f1140d4..4616ce3 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
@@ -44,7 +44,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
Arrays.sort(points, new Comparator<Point>() {
@Override
public int compare(Point o1, Point o2) {
- int cmp = StringHelper.compare(bytesPerDim, o1.packedValue, 0, o2.packedValue, 0);
+ int cmp = o1.packedValue.compareTo(o2.packedValue);
if (cmp == 0) {
cmp = Integer.compare(o1.doc, o2.doc);
}
@@ -70,19 +70,25 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
for (int i = 0; i < commonPrefixLengths.length; ++i) {
commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim);
}
+ BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
for (int dim = 0; dim < numDims; ++dim) {
int offset = dim * bytesPerDim;
- System.arraycopy(points[0].packedValue, offset, points[i].packedValue, offset, commonPrefixLengths[dim]);
+ BytesRef packedValue = points[i].packedValue;
+ System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]);
}
}
DummyPointsReader reader = new DummyPointsReader(points);
final int sortedDim = random().nextInt(numDims);
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length,
- new byte[numDims * bytesPerDim], new byte[numDims * bytesPerDim]);
+ new BytesRef(), new BytesRef());
for (int i = 1; i < points.length; ++i) {
final int offset = sortedDim * bytesPerDim;
- int cmp = StringHelper.compare(bytesPerDim, reader.points[i-1].packedValue, offset, reader.points[i].packedValue, offset);
+ BytesRef previousValue = reader.points[i-1].packedValue;
+ BytesRef currentValue = reader.points[i].packedValue;
+ int cmp = StringHelper.compare(bytesPerDim,
+ previousValue.bytes, previousValue.offset + offset,
+ currentValue.bytes, currentValue.offset + offset);
if (cmp == 0) {
cmp = reader.points[i - 1].doc - reader.points[i].doc;
}
@@ -103,17 +109,23 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc);
int commonPrefixLength = TestUtil.nextInt(random(), 0, bytesPerDim);
final int splitDim = random().nextInt(numDims);
+ BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
+ BytesRef packedValue = points[i].packedValue;
int offset = splitDim * bytesPerDim;
- System.arraycopy(points[0].packedValue, offset, points[i].packedValue, offset, commonPrefixLength);
+ System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLength);
}
DummyPointsReader reader = new DummyPointsReader(points);
final int pivot = TestUtil.nextInt(random(), 0, points.length - 1);
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLength, reader, 0, points.length, pivot,
- new byte[numDims * bytesPerDim], new byte[numDims * bytesPerDim]);
+ new BytesRef(), new BytesRef());
+ BytesRef pivotValue = reader.points[pivot].packedValue;
int offset = splitDim * bytesPerDim;
for (int i = 0; i < points.length; ++i) {
- int cmp = StringHelper.compare(bytesPerDim, reader.points[i].packedValue, offset, reader.points[pivot].packedValue, offset);
+ BytesRef value = reader.points[i].packedValue;
+ int cmp = StringHelper.compare(bytesPerDim,
+ value.bytes, value.offset + offset,
+ pivotValue.bytes, pivotValue.offset + offset);
if (cmp == 0) {
cmp = reader.points[i].doc - reader.points[pivot].doc;
}
@@ -140,11 +152,15 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
private static class Point {
- final byte[] packedValue;
+ final BytesRef packedValue;
final int doc;
Point(byte[] packedValue, int doc) {
- this.packedValue = packedValue;
+ // use a non-null offset to make sure MutablePointsReaderUtils does not ignore it
+ this.packedValue = new BytesRef(packedValue.length + 1);
+ this.packedValue.bytes[0] = (byte) random().nextInt(256);
+ this.packedValue.offset = 1;
+ this.packedValue.length = packedValue.length;
this.doc = doc;
}
@@ -154,17 +170,17 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
return false;
}
Point that = (Point) obj;
- return Arrays.equals(packedValue, that.packedValue) && doc == that.doc;
+ return packedValue.equals(that.packedValue) && doc == that.doc;
}
@Override
public int hashCode() {
- return 31 * Arrays.hashCode(packedValue) + doc;
+ return 31 * packedValue.hashCode() + doc;
}
@Override
public String toString() {
- return "value=" + new BytesRef(packedValue) + " doc=" + doc;
+ return "value=" + packedValue + " doc=" + doc;
}
}
@@ -187,13 +203,16 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
@Override
- public void getValue(int i, byte[] packedValue) {
- System.arraycopy(points[i].packedValue, 0, packedValue, 0, points[i].packedValue.length);
+ public void getValue(int i, BytesRef packedValue) {
+ packedValue.bytes = points[i].packedValue.bytes;
+ packedValue.offset = points[i].packedValue.offset;
+ packedValue.length = points[i].packedValue.length;
}
@Override
public byte getByteAt(int i, int k) {
- return points[i].packedValue[k];
+ BytesRef packedValue = points[i].packedValue;
+ return packedValue.bytes[packedValue.offset + k];
}
@Override
[4/4] lucene-solr:branch_6x: LUCENE-7399: Speed up flush of points.
Posted by jp...@apache.org.
LUCENE-7399: Speed up flush of points.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/33197ec9
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/33197ec9
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/33197ec9
Branch: refs/heads/branch_6x
Commit: 33197ec98afbd3be8f9bf48aa4556fcbfab3f37e
Parents: e360afe
Author: Adrien Grand <jp...@gmail.com>
Authored: Fri Jul 29 17:32:29 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Aug 3 14:48:23 2016 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +-
.../lucene/codecs/MutablePointsReader.java | 6 +-
.../apache/lucene/index/PointValuesWriter.java | 14 ++-
.../org/apache/lucene/util/ByteBlockPool.java | 21 ++++
.../apache/lucene/util/InPlaceMergeSorter.java | 4 +-
.../org/apache/lucene/util/IntroSelector.java | 2 +
.../org/apache/lucene/util/IntroSorter.java | 13 ++-
.../org/apache/lucene/util/MSBRadixSorter.java | 109 +++++++++++++++----
.../org/apache/lucene/util/RadixSelector.java | 94 ++++++++++++++--
.../src/java/org/apache/lucene/util/Sorter.java | 59 +++++-----
.../org/apache/lucene/util/bkd/BKDWriter.java | 49 ++++-----
.../util/bkd/MutablePointsReaderUtils.java | 19 ++--
.../apache/lucene/util/TestByteBlockPool.java | 25 ++++-
.../apache/lucene/util/TestMSBRadixSorter.java | 3 +
.../apache/lucene/util/TestRadixSelector.java | 31 +++++-
.../util/bkd/TestMutablePointsReaderUtils.java | 49 ++++++---
16 files changed, 368 insertions(+), 133 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8a15803..29ad129 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -139,7 +139,8 @@ Optimizations
* LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.
(Adrien Grand)
-* LUCENE-7396: Faster flush of points. (Adrien Grand, Mike McCandless)
+* LUCENE-7396, LUCENE-7399: Faster flush of points.
+ (Adrien Grand, Mike McCandless)
Other
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java
index b6c328b..dccca26 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MutablePointsReader.java
@@ -16,6 +16,8 @@
*/
package org.apache.lucene.codecs;
+import org.apache.lucene.util.BytesRef;
+
/** {@link PointsReader} whose order of points can be changed.
* This class is useful for codecs to optimize flush.
* @lucene.internal */
@@ -24,8 +26,8 @@ public abstract class MutablePointsReader extends PointsReader {
/** Sole constructor. */
protected MutablePointsReader() {}
- /** Fill {@code packedValue} with the packed bytes of the i-th value. */
- public abstract void getValue(int i, byte[] packedValue);
+ /** Set {@code packedValue} with a reference to the packed bytes of the i-th value. */
+ public abstract void getValue(int i, BytesRef packedValue);
/** Get the k-th byte of the i-th value. */
public abstract byte getByteAt(int i, int k);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
index dcc7600..daf1f33 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
@@ -51,9 +51,10 @@ class PointValuesWriter {
if (value == null) {
throw new IllegalArgumentException("field=" + fieldInfo.name + ": point value must not be null");
}
- if (value.length != fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()) {
+ if (value.length != packedBytesLength) {
throw new IllegalArgumentException("field=" + fieldInfo.name + ": this field's value has length=" + value.length + " but should be " + (fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes()));
}
+
if (docIDs.length == numPoints) {
docIDs = ArrayUtil.grow(docIDs, numPoints+1);
iwBytesUsed.addAndGet((docIDs.length - numPoints) * Integer.BYTES);
@@ -64,6 +65,7 @@ class PointValuesWriter {
numDocs++;
lastDocID = docID;
}
+
numPoints++;
}
@@ -82,9 +84,12 @@ class PointValuesWriter {
if (fieldName.equals(fieldInfo.name) == false) {
throw new IllegalArgumentException("fieldName must be the same");
}
+ final BytesRef scratch = new BytesRef();
final byte[] packedValue = new byte[packedBytesLength];
for(int i=0;i<numPoints;i++) {
- getValue(i, packedValue);
+ getValue(i, scratch);
+ assert scratch.length == packedValue.length;
+ System.arraycopy(scratch.bytes, scratch.offset, packedValue, 0, packedBytesLength);
visitor.visit(getDocID(i), packedValue);
}
}
@@ -152,9 +157,10 @@ class PointValuesWriter {
}
@Override
- public void getValue(int i, byte[] packedValue) {
+ public void getValue(int i, BytesRef packedValue) {
final long offset = (long) packedBytesLength * ords[i];
- bytes.readBytes(offset, packedValue, 0, packedBytesLength);
+ packedValue.length = packedBytesLength;
+ bytes.setRawBytesRef(packedValue, offset);
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
index e202d63..e575d87 100644
--- a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
+++ b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
@@ -379,6 +379,27 @@ public final class ByteBlockPool {
} while (true);
}
+ /**
+ * Set the given {@link BytesRef} so that its content is equal to the
+ * {@code ref.length} bytes starting at {@code offset}. Most of the time this
+ * method will set pointers to internal data-structures. However, in case a
+ * value crosses a boundary, a fresh copy will be returned.
+ * On the contrary to {@link #setBytesRef(BytesRef, int)}, this does not
+ * expect the length to be encoded with the data.
+ */
+ public void setRawBytesRef(BytesRef ref, final long offset) {
+ int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);
+ int pos = (int) (offset & BYTE_BLOCK_MASK);
+ if (pos + ref.length <= BYTE_BLOCK_SIZE) {
+ ref.bytes = buffers[bufferIndex];
+ ref.offset = pos;
+ } else {
+ ref.bytes = new byte[ref.length];
+ ref.offset = 0;
+ readBytes(offset, ref.bytes, 0, ref.length);
+ }
+ }
+
/** Read a single byte at the given {@code offset}. */
public byte readByte(long offset) {
int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java b/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java
index 1afba99..de8f976 100644
--- a/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/InPlaceMergeSorter.java
@@ -33,8 +33,8 @@ public abstract class InPlaceMergeSorter extends Sorter {
}
void mergeSort(int from, int to) {
- if (to - from < INSERTION_SORT_THRESHOLD) {
- insertionSort(from, to);
+ if (to - from < BINARY_SORT_THRESHOLD) {
+ binarySort(from, to);
} else {
final int mid = (from + to) >>> 1;
mergeSort(from, mid);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
index 7898535..2984801 100644
--- a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
+++ b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
@@ -34,6 +34,8 @@ public abstract class IntroSelector extends Selector {
}
// heap sort
+ // TODO: use median of median instead to have linear worst-case rather than
+ // n*log(n)
void slowSelect(int from, int to, int k) {
new Sorter() {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java b/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
index 83d118d..79a39bf 100644
--- a/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
@@ -37,8 +37,8 @@ public abstract class IntroSorter extends Sorter {
}
void quicksort(int from, int to, int maxDepth) {
- if (to - from < INSERTION_SORT_THRESHOLD) {
- insertionSort(from, to);
+ if (to - from < BINARY_SORT_THRESHOLD) {
+ binarySort(from, to);
return;
} else if (--maxDepth < 0) {
heapSort(from, to);
@@ -83,12 +83,13 @@ public abstract class IntroSorter extends Sorter {
quicksort(left + 1, to, maxDepth);
}
- /** Save the value at slot <code>i</code> so that it can later be used as a
- * pivot, see {@link #comparePivot(int)}. */
+ // Don't rely on the slow default impl of setPivot/comparePivot since
+ // quicksort relies on these methods to be fast for good performance
+
+ @Override
protected abstract void setPivot(int i);
- /** Compare the pivot with the slot at <code>j</code>, similarly to
- * {@link #compare(int, int) compare(i, j)}. */
+ @Override
protected abstract int comparePivot(int j);
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java b/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
index 33f20b6..f0e2a61 100644
--- a/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
@@ -38,6 +38,7 @@ public abstract class MSBRadixSorter extends Sorter {
// we store one histogram per recursion level
private final int[][] histograms = new int[LEVEL_THRESHOLD][];
private final int[] endOffsets = new int[HISTOGRAM_SIZE];
+ private final int[] commonPrefix;
private final int maxLength;
@@ -47,6 +48,7 @@ public abstract class MSBRadixSorter extends Sorter {
*/
protected MSBRadixSorter(int maxLength) {
this.maxLength = maxLength;
+ this.commonPrefix = new int[Math.min(24, maxLength)];
}
/** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
@@ -116,14 +118,14 @@ public abstract class MSBRadixSorter extends Sorter {
@Override
public void sort(int from, int to) {
checkRange(from, to);
- sort(from, to, 0);
+ sort(from, to, 0, 0);
}
- private void sort(int from, int to, int k) {
- if (to - from <= LENGTH_THRESHOLD || k >= LEVEL_THRESHOLD) {
+ private void sort(int from, int to, int k, int l) {
+ if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
introSort(from, to, k);
} else {
- radixSort(from, to, k);
+ radixSort(from, to, k, l);
}
}
@@ -131,28 +133,30 @@ public abstract class MSBRadixSorter extends Sorter {
getFallbackSorter(k).sort(from, to);
}
- private void radixSort(int from, int to, int k) {
- int[] histogram = histograms[k];
+ /**
+ * @param k the character number to compare
+ * @param l the level of recursion
+ */
+ private void radixSort(int from, int to, int k, int l) {
+ int[] histogram = histograms[l];
if (histogram == null) {
- histogram = histograms[k] = new int[HISTOGRAM_SIZE];
+ histogram = histograms[l] = new int[HISTOGRAM_SIZE];
} else {
Arrays.fill(histogram, 0);
}
- buildHistogram(from, to, k, histogram);
-
- // short-circuit: if all keys have the same byte at offset k, then recurse directly
- for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
- if (histogram[i] == to - from) {
- // everything is in the same bucket, recurse
- if (i > 0) {
- sort(from, to, k + 1);
- }
- return;
- } else if (histogram[i] != 0) {
- break;
+ final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, k, histogram);
+ if (commonPrefixLength > 0) {
+ // if there are no more chars to compare or if all entries fell into the
+ // first bucket (which means strings are shorter than k) then we are done
+ // otherwise recurse
+ if (k + commonPrefixLength < maxLength
+ && histogram[0] < to - from) {
+ radixSort(from, to, k + commonPrefixLength, l);
}
+ return;
}
+ assert assertHistogram(commonPrefixLength, histogram);
int[] startOffsets = histogram;
int[] endOffsets = this.endOffsets;
@@ -167,24 +171,83 @@ public abstract class MSBRadixSorter extends Sorter {
int h = endOffsets[i];
final int bucketLen = h - prev;
if (bucketLen > 1) {
- sort(from + prev, from + h, k + 1);
+ sort(from + prev, from + h, k + 1, l + 1);
}
prev = h;
}
}
}
+ // only used from assert
+ private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
+ int numberOfUniqueBytes = 0;
+ for (int freq : histogram) {
+ if (freq > 0) {
+ numberOfUniqueBytes++;
+ }
+ }
+ if (numberOfUniqueBytes == 1) {
+ assert commonPrefixLength >= 1;
+ } else {
+ assert commonPrefixLength == 0 : commonPrefixLength;
+ }
+ return true;
+ }
+
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
private int getBucket(int i, int k) {
return byteAt(i, k) + 1;
}
- /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}. */
- private int[] buildHistogram(int from, int to, int k, int[] histogram) {
+ /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
+ * and return a common prefix length for all visited values.
+ * @see #buildHistogram */
+ private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
+ final int[] commonPrefix = this.commonPrefix;
+ int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
+ for (int j = 0; j < commonPrefixLength; ++j) {
+ final int b = byteAt(from, k + j);
+ commonPrefix[j] = b;
+ if (b == -1) {
+ commonPrefixLength = j + 1;
+ break;
+ }
+ }
+
+ int i;
+ outer: for (i = from + 1; i < to; ++i) {
+ for (int j = 0; j < commonPrefixLength; ++j) {
+ final int b = byteAt(i, k + j);
+ if (b != commonPrefix[j]) {
+ commonPrefixLength = j;
+ if (commonPrefixLength == 0) { // we have no common prefix
+ histogram[commonPrefix[0] + 1] = i - from;
+ histogram[b + 1] = 1;
+ break outer;
+ }
+ break;
+ }
+ }
+ }
+
+ if (i < to) {
+ // the loop got broken because there is no common prefix
+ assert commonPrefixLength == 0;
+ buildHistogram(i + 1, to, k, histogram);
+ } else {
+ assert commonPrefixLength > 0;
+ histogram[commonPrefix[0] + 1] = to - from;
+ }
+
+ return commonPrefixLength;
+ }
+
+ /** Build an histogram of the k-th characters of values occurring between
+ * offsets {@code from} and {@code to}, using {@link #getBucket}. */
+ private void buildHistogram(int from, int to, int k, int[] histogram) {
for (int i = from; i < to; ++i) {
histogram[getBucket(i, k)]++;
}
- return histogram;
}
/** Accumulate values of the histogram so that it does not store counts but
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java b/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java
index 543204a..c9b6133 100644
--- a/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java
+++ b/lucene/core/src/java/org/apache/lucene/util/RadixSelector.java
@@ -36,6 +36,7 @@ public abstract class RadixSelector extends Selector {
// we store one histogram per recursion level
private final int[] histogram = new int[HISTOGRAM_SIZE];
+ private final int[] commonPrefix;
private final int maxLength;
@@ -45,6 +46,7 @@ public abstract class RadixSelector extends Selector {
*/
protected RadixSelector(int maxLength) {
this.maxLength = maxLength;
+ this.commonPrefix = new int[Math.min(24, maxLength)];
}
/** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
@@ -112,22 +114,37 @@ public abstract class RadixSelector extends Selector {
@Override
public void select(int from, int to, int k) {
checkArgs(from, to, k);
- select(from, to, k, 0);
+ select(from, to, k, 0, 0);
}
- private void select(int from, int to, int k, int d) {
+ private void select(int from, int to, int k, int d, int l) {
if (to - from <= LENGTH_THRESHOLD || d >= LEVEL_THRESHOLD) {
getFallbackSelector(d).select(from, to, k);
} else {
- radixSelect(from, to, k, d);
+ radixSelect(from, to, k, d, l);
}
}
- private void radixSelect(int from, int to, int k, int d) {
+ /**
+ * @param d the character number to compare
+ * @param l the level of recursion
+ */
+ private void radixSelect(int from, int to, int k, int d, int l) {
final int[] histogram = this.histogram;
Arrays.fill(histogram, 0);
- buildHistogram(from, to, d, histogram);
+ final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, d, histogram);
+ if (commonPrefixLength > 0) {
+ // if there are no more chars to compare or if all entries fell into the
+ // first bucket (which means strings are shorter than d) then we are done
+ // otherwise recurse
+ if (d + commonPrefixLength < maxLength
+ && histogram[0] < to - from) {
+ radixSelect(from, to, k, d + commonPrefixLength, l);
+ }
+ return;
+ }
+ assert assertHistogram(commonPrefixLength, histogram);
int bucketFrom = from;
for (int bucket = 0; bucket < HISTOGRAM_SIZE; ++bucket) {
@@ -138,7 +155,7 @@ public abstract class RadixSelector extends Selector {
if (bucket != 0 && d + 1 < maxLength) {
// all elements in bucket 0 are equal so we only need to recurse if bucket != 0
- select(bucketFrom, bucketTo, k, d + 1);
+ select(bucketFrom, bucketTo, k, d + 1, l + 1);
}
return;
}
@@ -147,17 +164,76 @@ public abstract class RadixSelector extends Selector {
throw new AssertionError("Unreachable code");
}
+ // only used from assert
+ private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
+ int numberOfUniqueBytes = 0;
+ for (int freq : histogram) {
+ if (freq > 0) {
+ numberOfUniqueBytes++;
+ }
+ }
+ if (numberOfUniqueBytes == 1) {
+ assert commonPrefixLength >= 1;
+ } else {
+ assert commonPrefixLength == 0;
+ }
+ return true;
+ }
+
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
private int getBucket(int i, int k) {
return byteAt(i, k) + 1;
}
- /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}. */
- private int[] buildHistogram(int from, int to, int k, int[] histogram) {
+ /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
+ * and return a common prefix length for all visited values.
+ * @see #buildHistogram */
+ private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
+ final int[] commonPrefix = this.commonPrefix;
+ int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
+ for (int j = 0; j < commonPrefixLength; ++j) {
+ final int b = byteAt(from, k + j);
+ commonPrefix[j] = b;
+ if (b == -1) {
+ commonPrefixLength = j + 1;
+ break;
+ }
+ }
+
+ int i;
+ outer: for (i = from + 1; i < to; ++i) {
+ for (int j = 0; j < commonPrefixLength; ++j) {
+ final int b = byteAt(i, k + j);
+ if (b != commonPrefix[j]) {
+ commonPrefixLength = j;
+ if (commonPrefixLength == 0) { // we have no common prefix
+ histogram[commonPrefix[0] + 1] = i - from;
+ histogram[b + 1] = 1;
+ break outer;
+ }
+ break;
+ }
+ }
+ }
+
+ if (i < to) {
+ // the loop got broken because there is no common prefix
+ assert commonPrefixLength == 0;
+ buildHistogram(i + 1, to, k, histogram);
+ } else {
+ assert commonPrefixLength > 0;
+ histogram[commonPrefix[0] + 1] = to - from;
+ }
+
+ return commonPrefixLength;
+ }
+
+ /** Build an histogram of the k-th characters of values occurring between
+ * offsets {@code from} and {@code to}, using {@link #getBucket}. */
+ private void buildHistogram(int from, int to, int k, int[] histogram) {
for (int i = from; i < to; ++i) {
histogram[getBucket(i, k)]++;
}
- return histogram;
}
/** Reorder elements so that all of them that fall into {@code bucket} are
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/Sorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/Sorter.java b/lucene/core/src/java/org/apache/lucene/util/Sorter.java
index 0ac954b..32d27cd 100644
--- a/lucene/core/src/java/org/apache/lucene/util/Sorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/Sorter.java
@@ -23,7 +23,7 @@ import java.util.Comparator;
* @lucene.internal */
public abstract class Sorter {
- static final int INSERTION_SORT_THRESHOLD = 20;
+ static final int BINARY_SORT_THRESHOLD = 20;
/** Sole constructor, used for inheritance. */
protected Sorter() {}
@@ -36,6 +36,20 @@ public abstract class Sorter {
/** Swap values at slots <code>i</code> and <code>j</code>. */
protected abstract void swap(int i, int j);
+ private int pivotIndex;
+
+ /** Save the value at slot <code>i</code> so that it can later be used as a
+ * pivot, see {@link #comparePivot(int)}. */
+ protected void setPivot(int i) {
+ pivotIndex = i;
+ }
+
+ /** Compare the pivot with the slot at <code>j</code>, similarly to
+ * {@link #compare(int, int) compare(i, j)}. */
+ protected int comparePivot(int j) {
+ return compare(pivotIndex, j);
+ }
+
/** Sort the slice which starts at <code>from</code> (inclusive) and ends at
* <code>to</code> (exclusive). */
public abstract void sort(int from, int to);
@@ -163,54 +177,41 @@ public abstract class Sorter {
}
}
- void insertionSort(int from, int to) {
- for (int i = from + 1; i < to; ++i) {
- for (int j = i; j > from; --j) {
- if (compare(j - 1, j) > 0) {
- swap(j - 1, j);
- } else {
- break;
- }
- }
- }
- }
-
+ /**
+ * A binary sort implementation. This performs {@code O(n*log(n))} comparisons
+ * and {@code O(n^2)} swaps. It is typically used by more sophisticated
+ * implementations as a fall-back when the numbers of items to sort has become
+ * less than {@value #BINARY_SORT_THRESHOLD}.
+ */
void binarySort(int from, int to) {
binarySort(from, to, from + 1);
}
void binarySort(int from, int to, int i) {
for ( ; i < to; ++i) {
+ setPivot(i);
int l = from;
int h = i - 1;
while (l <= h) {
final int mid = (l + h) >>> 1;
- final int cmp = compare(i, mid);
+ final int cmp = comparePivot(mid);
if (cmp < 0) {
h = mid - 1;
} else {
l = mid + 1;
}
}
- switch (i - l) {
- case 2:
- swap(l + 1, l + 2);
- swap(l, l + 1);
- break;
- case 1:
- swap(l, l + 1);
- break;
- case 0:
- break;
- default:
- for (int j = i; j > l; --j) {
- swap(j - 1, j);
- }
- break;
+ for (int j = i; j > l; --j) {
+ swap(j - 1, j);
}
}
}
+ /**
+ * Use heap sort to sort items between {@code from} inclusive and {@code to}
+ * exclusive. This runs in {@code O(n*log(n))} and is used as a fall-back by
+ * {@link IntroSorter}.
+ */
void heapSort(int from, int to) {
if (to - from <= 1) {
return;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index e11fc08..88a84e9 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -112,7 +112,8 @@ public class BKDWriter implements Closeable {
final byte[] scratchDiff;
final byte[] scratch1;
final byte[] scratch2;
- final BytesRef scratchBytesRef = new BytesRef();
+ final BytesRef scratchBytesRef1 = new BytesRef();
+ final BytesRef scratchBytesRef2 = new BytesRef();
final int[] commonPrefixLengths;
protected final FixedBitSet docsSeen;
@@ -174,7 +175,6 @@ public class BKDWriter implements Closeable {
packedBytesLength = numDims * bytesPerDim;
scratchDiff = new byte[bytesPerDim];
- scratchBytesRef.length = packedBytesLength;
scratch1 = new byte[packedBytesLength];
scratch2 = new byte[packedBytesLength];
commonPrefixLengths = new int[numDims];
@@ -465,14 +465,14 @@ public class BKDWriter implements Closeable {
Arrays.fill(minPackedValue, (byte) 0xff);
Arrays.fill(maxPackedValue, (byte) 0);
for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
- reader.getValue(i, scratch1);
+ reader.getValue(i, scratchBytesRef1);
for(int dim=0;dim<numDims;dim++) {
int offset = dim*bytesPerDim;
- if (StringHelper.compare(bytesPerDim, scratch1, offset, minPackedValue, offset) < 0) {
- System.arraycopy(scratch1, offset, minPackedValue, offset, bytesPerDim);
+ if (StringHelper.compare(bytesPerDim, scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset) < 0) {
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, minPackedValue, offset, bytesPerDim);
}
- if (StringHelper.compare(bytesPerDim, scratch1, offset, maxPackedValue, offset) > 0) {
- System.arraycopy(scratch1, offset, maxPackedValue, offset, bytesPerDim);
+ if (StringHelper.compare(bytesPerDim, scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset) > 0) {
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, bytesPerDim);
}
}
@@ -811,6 +811,7 @@ public class BKDWriter implements Closeable {
}
private PointWriter sort(int dim) throws IOException {
+ assert dim >= 0 && dim < numDims;
if (heapPointWriter != null) {
@@ -1251,13 +1252,13 @@ public class BKDWriter implements Closeable {
// Compute common prefixes
Arrays.fill(commonPrefixLengths, bytesPerDim);
- reader.getValue(from, scratch1);
+ reader.getValue(from, scratchBytesRef1);
for (int i = from + 1; i < to; ++i) {
- reader.getValue(i, scratch2);
+ reader.getValue(i, scratchBytesRef2);
for (int dim=0;dim<numDims;dim++) {
final int offset = dim * bytesPerDim;
for(int j=0;j<commonPrefixLengths[dim];j++) {
- if (scratch1[offset+j] != scratch2[offset+j]) {
+ if (scratchBytesRef1.bytes[scratchBytesRef1.offset+offset+j] != scratchBytesRef2.bytes[scratchBytesRef2.offset+offset+j]) {
commonPrefixLengths[dim] = j;
break;
}
@@ -1294,7 +1295,7 @@ public class BKDWriter implements Closeable {
// sort by sortedDim
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths,
- reader, from, to, scratch1, scratch2);
+ reader, from, to, scratchBytesRef1, scratchBytesRef2);
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
@@ -1307,22 +1308,16 @@ public class BKDWriter implements Closeable {
writeLeafBlockDocs(out, docIDs, 0, count);
// Write the common prefixes:
- reader.getValue(from, scratch1);
+ reader.getValue(from, scratchBytesRef1);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
// Write the full values:
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
- final BytesRef scratch = new BytesRef(packedBytesLength);
-
- {
- scratch.offset = 0;
- scratch.length = packedBytesLength;
- }
-
@Override
public BytesRef apply(int i) {
- reader.getValue(from + i, scratch.bytes);
- return scratch;
+ reader.getValue(from + i, scratchBytesRef1);
+ return scratchBytesRef1;
}
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
@@ -1344,18 +1339,20 @@ public class BKDWriter implements Closeable {
}
}
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
- reader, from, to, mid, scratch1, scratch2);
+ reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
// set the split value
final int address = nodeID * (1+bytesPerDim);
splitPackedValues[address] = (byte) splitDim;
- reader.getValue(mid, scratch1);
- System.arraycopy(scratch1, splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
+ reader.getValue(mid, scratchBytesRef1);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
byte[] minSplitPackedValue = Arrays.copyOf(minPackedValue, packedBytesLength);
byte[] maxSplitPackedValue = Arrays.copyOf(maxPackedValue, packedBytesLength);
- System.arraycopy(scratch1, splitDim * bytesPerDim, minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
- System.arraycopy(scratch1, splitDim * bytesPerDim, maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
+ minSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim,
+ maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
// recurse
build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
index d499e2b..b439208 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
@@ -17,6 +17,7 @@
package org.apache.lucene.util.bkd;
import org.apache.lucene.codecs.MutablePointsReader;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntroSorter;
import org.apache.lucene.util.MSBRadixSorter;
@@ -54,8 +55,8 @@ final class MutablePointsReaderUtils {
protected org.apache.lucene.util.Sorter getFallbackSorter(int k) {
return new IntroSorter() {
- final byte[] pivot = new byte[packedBytesLength];
- final byte[] scratch = new byte[packedBytesLength];
+ final BytesRef pivot = new BytesRef();
+ final BytesRef scratch = new BytesRef();
int pivotDoc;
@Override
@@ -73,7 +74,7 @@ final class MutablePointsReaderUtils {
protected int comparePivot(int j) {
if (k < packedBytesLength) {
reader.getValue(j, scratch);
- int cmp = StringHelper.compare(packedBytesLength - k, pivot, k, scratch, k);
+ int cmp = StringHelper.compare(packedBytesLength - k, pivot.bytes, pivot.offset + k, scratch.bytes, scratch.offset + k);
if (cmp != 0) {
return cmp;
}
@@ -89,7 +90,7 @@ final class MutablePointsReaderUtils {
/** Sort points on the given dimension. */
static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
MutablePointsReader reader, int from, int to,
- byte[] scratch1, byte[] scratch2) {
+ BytesRef scratch1, BytesRef scratch2) {
// No need for a fancy radix sort here, this is called on the leaves only so
// there are not many values to sort
@@ -97,7 +98,7 @@ final class MutablePointsReaderUtils {
final int numBytesToCompare = bytesPerDim - commonPrefixLengths[sortedDim];
new IntroSorter() {
- final byte[] pivot = scratch1;
+ final BytesRef pivot = scratch1;
int pivotDoc = -1;
@Override
@@ -114,7 +115,7 @@ final class MutablePointsReaderUtils {
@Override
protected int comparePivot(int j) {
reader.getValue(j, scratch2);
- int cmp = StringHelper.compare(numBytesToCompare, pivot, offset, scratch2, offset);
+ int cmp = StringHelper.compare(numBytesToCompare, pivot.bytes, pivot.offset + offset, scratch2.bytes, scratch2.offset + offset);
if (cmp == 0) {
cmp = pivotDoc - reader.getDocID(j);
}
@@ -128,7 +129,7 @@ final class MutablePointsReaderUtils {
* equal to it. */
static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
MutablePointsReader reader, int from, int to, int mid,
- byte[] scratch1, byte[] scratch2) {
+ BytesRef scratch1, BytesRef scratch2) {
final int offset = splitDim * bytesPerDim + commonPrefixLen;
final int cmpBytes = bytesPerDim - commonPrefixLen;
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
@@ -138,7 +139,7 @@ final class MutablePointsReaderUtils {
protected Selector getFallbackSelector(int k) {
return new IntroSelector() {
- final byte[] pivot = scratch1;
+ final BytesRef pivot = scratch1;
int pivotDoc;
@Override
@@ -156,7 +157,7 @@ final class MutablePointsReaderUtils {
protected int comparePivot(int j) {
if (k < cmpBytes) {
reader.getValue(j, scratch2);
- int cmp = StringHelper.compare(cmpBytes - k, pivot, offset + k, scratch2, offset + k);
+ int cmp = StringHelper.compare(cmpBytes - k, pivot.bytes, pivot.offset + offset + k, scratch2.bytes, scratch2.offset + offset + k);
if (cmp != 0) {
return cmp;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
index 388a789..df73687 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
@@ -45,12 +45,25 @@ public class TestByteBlockPool extends LuceneTestCase {
for (BytesRef expected : list) {
ref.grow(expected.length);
ref.setLength(expected.length);
- if (random().nextBoolean()) {
- pool.readBytes(position, ref.bytes(), 0, ref.length());
- } else {
- for (int i = 0; i < ref.length(); ++i) {
- ref.setByteAt(i, pool.readByte(position + i));
- }
+ switch (random().nextInt(3)) {
+ case 0:
+ // copy bytes
+ pool.readBytes(position, ref.bytes(), 0, ref.length());
+ break;
+ case 1:
+ // copy bytes one by one
+ for (int i = 0; i < ref.length(); ++i) {
+ ref.setByteAt(i, pool.readByte(position + i));
+ }
+ break;
+ case 2:
+ BytesRef scratch = new BytesRef();
+ scratch.length = ref.length();
+ pool.setRawBytesRef(scratch, position);
+ System.arraycopy(scratch.bytes, scratch.offset, ref.bytes(), 0, ref.length());
+ break;
+ default:
+ fail();
}
assertEquals(expected, ref.get());
position += ref.length();
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java b/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java
index bc5af7f..c496ff8 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java
@@ -41,9 +41,12 @@ public class TestMSBRadixSorter extends LuceneTestCase {
break;
}
+ final int finalMaxLength = maxLength;
new MSBRadixSorter(maxLength) {
+ @Override
protected int byteAt(int i, int k) {
+ assertTrue(k < finalMaxLength);
BytesRef ref = refs[i];
if (ref.length <= k) {
return -1;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java b/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java
index e0d6bf9..3016580 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestRadixSelector.java
@@ -36,13 +36,41 @@ public class TestRadixSelector extends LuceneTestCase {
random().nextBytes(bytes);
arr[i] = new BytesRef(bytes);
}
+ doTest(arr, from, to, maxLen);
+ }
+
+ public void testSharedPrefixes() {
+ for (int iter = 0; iter < 100; ++iter) {
+ doTestSharedPrefixes();
+ }
+ }
+
+ private void doTestSharedPrefixes() {
+ final int from = random().nextInt(5);
+ final int to = from + TestUtil.nextInt(random(), 1, 10000);
+ final int maxLen = TestUtil.nextInt(random(), 1, 12);
+ BytesRef[] arr = new BytesRef[from + to + random().nextInt(5)];
+ for (int i = 0; i < arr.length; ++i) {
+ byte[] bytes = new byte[TestUtil.nextInt(random(), 0, maxLen)];
+ random().nextBytes(bytes);
+ arr[i] = new BytesRef(bytes);
+ }
+ final int sharedPrefixLength = Math.min(arr[0].length, TestUtil.nextInt(random(), 1, maxLen));
+ for (int i = 1; i < arr.length; ++i) {
+ System.arraycopy(arr[0].bytes, arr[0].offset, arr[i].bytes, arr[i].offset, Math.min(sharedPrefixLength, arr[i].length));
+ }
+ doTest(arr, from, to, maxLen);
+ }
+
+ private void doTest(BytesRef[] arr, int from, int to, int maxLen) {
final int k = TestUtil.nextInt(random(), from, to - 1);
BytesRef[] expected = arr.clone();
Arrays.sort(expected, from, to);
BytesRef[] actual = arr.clone();
- RadixSelector selector = new RadixSelector(random().nextBoolean() ? maxLen : Integer.MAX_VALUE) {
+ final int enforcedMaxLen = random().nextBoolean() ? maxLen : Integer.MAX_VALUE;
+ RadixSelector selector = new RadixSelector(enforcedMaxLen) {
@Override
protected void swap(int i, int j) {
@@ -51,6 +79,7 @@ public class TestRadixSelector extends LuceneTestCase {
@Override
protected int byteAt(int i, int k) {
+ assertTrue(k < enforcedMaxLen);
BytesRef b = actual[i];
if (k >= b.length) {
return -1;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/33197ec9/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
index f1140d4..4616ce3 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
@@ -44,7 +44,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
Arrays.sort(points, new Comparator<Point>() {
@Override
public int compare(Point o1, Point o2) {
- int cmp = StringHelper.compare(bytesPerDim, o1.packedValue, 0, o2.packedValue, 0);
+ int cmp = o1.packedValue.compareTo(o2.packedValue);
if (cmp == 0) {
cmp = Integer.compare(o1.doc, o2.doc);
}
@@ -70,19 +70,25 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
for (int i = 0; i < commonPrefixLengths.length; ++i) {
commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, bytesPerDim);
}
+ BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
for (int dim = 0; dim < numDims; ++dim) {
int offset = dim * bytesPerDim;
- System.arraycopy(points[0].packedValue, offset, points[i].packedValue, offset, commonPrefixLengths[dim]);
+ BytesRef packedValue = points[i].packedValue;
+ System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLengths[dim]);
}
}
DummyPointsReader reader = new DummyPointsReader(points);
final int sortedDim = random().nextInt(numDims);
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths, reader, 0, points.length,
- new byte[numDims * bytesPerDim], new byte[numDims * bytesPerDim]);
+ new BytesRef(), new BytesRef());
for (int i = 1; i < points.length; ++i) {
final int offset = sortedDim * bytesPerDim;
- int cmp = StringHelper.compare(bytesPerDim, reader.points[i-1].packedValue, offset, reader.points[i].packedValue, offset);
+ BytesRef previousValue = reader.points[i-1].packedValue;
+ BytesRef currentValue = reader.points[i].packedValue;
+ int cmp = StringHelper.compare(bytesPerDim,
+ previousValue.bytes, previousValue.offset + offset,
+ currentValue.bytes, currentValue.offset + offset);
if (cmp == 0) {
cmp = reader.points[i - 1].doc - reader.points[i].doc;
}
@@ -103,17 +109,23 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
Point[] points = createRandomPoints(numDims, bytesPerDim, maxDoc);
int commonPrefixLength = TestUtil.nextInt(random(), 0, bytesPerDim);
final int splitDim = random().nextInt(numDims);
+ BytesRef firstValue = points[0].packedValue;
for (int i = 1; i < points.length; ++i) {
+ BytesRef packedValue = points[i].packedValue;
int offset = splitDim * bytesPerDim;
- System.arraycopy(points[0].packedValue, offset, points[i].packedValue, offset, commonPrefixLength);
+ System.arraycopy(firstValue.bytes, firstValue.offset + offset, packedValue.bytes, packedValue.offset + offset, commonPrefixLength);
}
DummyPointsReader reader = new DummyPointsReader(points);
final int pivot = TestUtil.nextInt(random(), 0, points.length - 1);
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLength, reader, 0, points.length, pivot,
- new byte[numDims * bytesPerDim], new byte[numDims * bytesPerDim]);
+ new BytesRef(), new BytesRef());
+ BytesRef pivotValue = reader.points[pivot].packedValue;
int offset = splitDim * bytesPerDim;
for (int i = 0; i < points.length; ++i) {
- int cmp = StringHelper.compare(bytesPerDim, reader.points[i].packedValue, offset, reader.points[pivot].packedValue, offset);
+ BytesRef value = reader.points[i].packedValue;
+ int cmp = StringHelper.compare(bytesPerDim,
+ value.bytes, value.offset + offset,
+ pivotValue.bytes, pivotValue.offset + offset);
if (cmp == 0) {
cmp = reader.points[i].doc - reader.points[pivot].doc;
}
@@ -140,11 +152,15 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
private static class Point {
- final byte[] packedValue;
+ final BytesRef packedValue;
final int doc;
Point(byte[] packedValue, int doc) {
- this.packedValue = packedValue;
+ // use a non-null offset to make sure MutablePointsReaderUtils does not ignore it
+ this.packedValue = new BytesRef(packedValue.length + 1);
+ this.packedValue.bytes[0] = (byte) random().nextInt(256);
+ this.packedValue.offset = 1;
+ this.packedValue.length = packedValue.length;
this.doc = doc;
}
@@ -154,17 +170,17 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
return false;
}
Point that = (Point) obj;
- return Arrays.equals(packedValue, that.packedValue) && doc == that.doc;
+ return packedValue.equals(that.packedValue) && doc == that.doc;
}
@Override
public int hashCode() {
- return 31 * Arrays.hashCode(packedValue) + doc;
+ return 31 * packedValue.hashCode() + doc;
}
@Override
public String toString() {
- return "value=" + new BytesRef(packedValue) + " doc=" + doc;
+ return "value=" + packedValue + " doc=" + doc;
}
}
@@ -187,13 +203,16 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
@Override
- public void getValue(int i, byte[] packedValue) {
- System.arraycopy(points[i].packedValue, 0, packedValue, 0, points[i].packedValue.length);
+ public void getValue(int i, BytesRef packedValue) {
+ packedValue.bytes = points[i].packedValue.bytes;
+ packedValue.offset = points[i].packedValue.offset;
+ packedValue.length = points[i].packedValue.length;
}
@Override
public byte getByteAt(int i, int k) {
- return points[i].packedValue[k];
+ BytesRef packedValue = points[i].packedValue;
+ return packedValue.bytes[packedValue.offset + k];
}
@Override