You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2021/05/14 08:10:52 UTC
[lucene-solr] 02/02: LUCENE-9932: Performance improvement for BKD
index building
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 86b6d35f7229010757924da3312ffb2fe72b17f4
Author: neoReMinD <xu...@gmail.com>
AuthorDate: Fri May 14 09:49:27 2021 +0200
LUCENE-9932: Performance improvement for BKD index building
---
lucene/CHANGES.txt | 2 +
.../apache/lucene/codecs/MutablePointValues.java | 5 +
.../org/apache/lucene/index/PointValuesWriter.java | 26 +++
.../org/apache/lucene/util/MSBRadixSorter.java | 10 +-
.../apache/lucene/util/StableMSBRadixSorter.java | 81 ++++++++
.../lucene/util/bkd/MutablePointsReaderUtils.java | 65 +++----
.../lucene/util/TestStableMSBRadixSorter.java | 211 +++++++++++++++++++++
.../test/org/apache/lucene/util/bkd/TestBKD.java | 22 +++
.../util/bkd/TestMutablePointsReaderUtils.java | 60 ++++--
9 files changed, 428 insertions(+), 54 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fb5c0ab..ac78502 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -58,6 +58,8 @@ Improvements
Optimizations
---------------------
+* LUCENE-9932: Performance improvement for BKD index building (neoremind)
+
* LUCENE-9827: Speed up merging of stored fields and term vectors for smaller segments.
(Daniel Mitterdorfer, Dimitrios Liapis, Adrien Grand, Robert Muir)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MutablePointValues.java b/lucene/core/src/java/org/apache/lucene/codecs/MutablePointValues.java
index 8f4d69c..2d38ac5 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/MutablePointValues.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MutablePointValues.java
@@ -39,4 +39,9 @@ public abstract class MutablePointValues extends PointValues {
/** Swap the i-th and j-th values. */
public abstract void swap(int i, int j);
+ /** Save the i-th value into the j-th position in temporary storage. */
+ public abstract void save(int i, int j);
+
+ /** Restore values between i-th and j-th(excluding) in temporary storage into original storage. */
+ public abstract void restore(int i, int j);
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
index d715872..ddbc6a6 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
@@ -72,6 +72,7 @@ class PointValuesWriter {
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, PointsWriter writer) throws IOException {
PointValues points = new MutablePointValues() {
final int[] ords = new int[numPoints];
+ int[] temp;
{
for (int i = 0; i < numPoints; ++i) {
ords[i] = i;
@@ -154,6 +155,21 @@ class PointValuesWriter {
final long offset = (long) packedBytesLength * ords[i] + k;
return bytes.readByte(offset);
}
+
+ @Override
+ public void save(int i, int j) {
+ if (temp == null) {
+ temp = new int[ords.length];
+ }
+ temp[j] = ords[i];
+ }
+
+ @Override
+ public void restore(int i, int j) {
+ if (temp != null) {
+ System.arraycopy(temp, i, ords, i, j - i);
+ }
+ }
};
final PointValues values;
@@ -277,5 +293,15 @@ class PointValuesWriter {
public void swap(int i, int j) {
in.swap(i, j);
}
+
+ @Override
+ public void save(int i, int j) {
+ in.save(i, j);
+ }
+
+ @Override
+ public void restore(int i, int j) {
+ in.restore(i, j);
+ }
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java b/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
index f0e2a61..f1059a9 100644
--- a/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
@@ -31,7 +31,7 @@ public abstract class MSBRadixSorter extends Sorter {
// locality)
private static final int LEVEL_THRESHOLD = 8;
// size of histograms: 256 + 1 to indicate that the string is finished
- private static final int HISTOGRAM_SIZE = 257;
+ protected static final int HISTOGRAM_SIZE = 257;
// buckets below this size will be sorted with introsort
private static final int LENGTH_THRESHOLD = 100;
@@ -40,7 +40,7 @@ public abstract class MSBRadixSorter extends Sorter {
private final int[] endOffsets = new int[HISTOGRAM_SIZE];
private final int[] commonPrefix;
- private final int maxLength;
+ protected final int maxLength;
/**
* Sole constructor.
@@ -121,7 +121,7 @@ public abstract class MSBRadixSorter extends Sorter {
sort(from, to, 0, 0);
}
- private void sort(int from, int to, int k, int l) {
+ protected void sort(int from, int to, int k, int l) {
if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
introSort(from, to, k);
} else {
@@ -195,7 +195,7 @@ public abstract class MSBRadixSorter extends Sorter {
}
/** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
- private int getBucket(int i, int k) {
+ protected int getBucket(int i, int k) {
return byteAt(i, k) + 1;
}
@@ -268,7 +268,7 @@ public abstract class MSBRadixSorter extends Sorter {
* @param startOffsets start offsets per bucket
* @param endOffsets end offsets per bucket
*/
- private void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
+ protected void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
// reorder in place, like the dutch flag problem
for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
final int limit = endOffsets[i];
diff --git a/lucene/core/src/java/org/apache/lucene/util/StableMSBRadixSorter.java b/lucene/core/src/java/org/apache/lucene/util/StableMSBRadixSorter.java
new file mode 100644
index 0000000..c6ec774
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/StableMSBRadixSorter.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util;
+
+/**
+ * Stable radix sorter for variable-length strings.
+ *
+ * @lucene.internal
+ */
+public abstract class StableMSBRadixSorter extends MSBRadixSorter {
+
+ private final int[] fixedStartOffsets;
+
+ public StableMSBRadixSorter(int maxLength) {
+ super(maxLength);
+ fixedStartOffsets = new int[HISTOGRAM_SIZE];
+ }
+
+ /** Save the i-th value into the j-th position in temporary storage. */
+ protected abstract void save(int i, int j);
+
+ /** Restore values between i-th and j-th(excluding) in temporary storage into original storage. */
+ protected abstract void restore(int i, int j);
+
+ @Override
+ protected Sorter getFallbackSorter(int k) {
+ return new InPlaceMergeSorter() {
+ @Override
+ protected void swap(int i, int j) {
+ StableMSBRadixSorter.this.swap(i, j);
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ for (int o = k; o < maxLength; ++o) {
+ final int b1 = byteAt(i, o);
+ final int b2 = byteAt(j, o);
+ if (b1 != b2) {
+ return b1 - b2;
+ } else if (b1 == -1) {
+ break;
+ }
+ }
+ return 0;
+ }
+ };
+ }
+
+ /**
+ * Reorder elements in stable way, since Dutch sort does not guarantee ordering for same values.
+ *
+ * <p>When this method returns, startOffsets and endOffsets are equal.
+ */
+ @Override
+ protected void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
+ System.arraycopy(startOffsets, 0, fixedStartOffsets, 0, startOffsets.length);
+ for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
+ final int limit = endOffsets[i];
+ for (int h1 = fixedStartOffsets[i]; h1 < limit; h1++) {
+ final int b = getBucket(from + h1, k);
+ final int h2 = startOffsets[b]++;
+ save(from + h1, from + h2);
+ }
+ }
+ restore(from, to);
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
index 3d54e69..e6bfd2a 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
@@ -21,9 +21,9 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FutureArrays;
import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntroSorter;
-import org.apache.lucene.util.MSBRadixSorter;
import org.apache.lucene.util.RadixSelector;
import org.apache.lucene.util.Selector;
+import org.apache.lucene.util.StableMSBRadixSorter;
import org.apache.lucene.util.packed.PackedInts;
/** Utility APIs for sorting and partitioning buffered points.
@@ -36,8 +36,22 @@ public final class MutablePointsReaderUtils {
/** Sort the given {@link MutablePointValues} based on its packed value then doc ID. */
public static void sort(BKDConfig config, int maxDoc,
MutablePointValues reader, int from, int to) {
- final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
- new MSBRadixSorter(config.packedBytesLength + (bitsPerDocId + 7) / 8) {
+ boolean sortedByDocID = true;
+ int prevDoc = 0;
+ for (int i = from; i < to; ++i) {
+ int doc = reader.getDocID(i);
+ if (doc < prevDoc) {
+ sortedByDocID = false;
+ break;
+ }
+ prevDoc = doc;
+ }
+
+ // No need to tie break on doc IDs if already sorted by doc ID, since we use a stable sort.
+ // This should be a common situation as IndexWriter accumulates data in doc ID order when
+ // index sorting is not enabled.
+ final int bitsPerDocId = sortedByDocID ? 0 : PackedInts.bitsRequired(maxDoc - 1);
+ new StableMSBRadixSorter(config.packedBytesLength + (bitsPerDocId + 7) / 8) {
@Override
protected void swap(int i, int j) {
@@ -45,6 +59,16 @@ public final class MutablePointsReaderUtils {
}
@Override
+ protected void save(int i, int j) {
+ reader.save(i, j);
+ }
+
+ @Override
+ protected void restore(int i, int j) {
+ reader.restore(i, j);
+ }
+
+ @Override
protected int byteAt(int i, int k) {
if (k < config.packedBytesLength) {
return Byte.toUnsignedInt(reader.getByteAt(i, k));
@@ -53,41 +77,6 @@ public final class MutablePointsReaderUtils {
return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff;
}
}
-
- @Override
- protected org.apache.lucene.util.Sorter getFallbackSorter(int k) {
- return new IntroSorter() {
-
- final BytesRef pivot = new BytesRef();
- final BytesRef scratch = new BytesRef();
- int pivotDoc;
-
- @Override
- protected void swap(int i, int j) {
- reader.swap(i, j);
- }
-
- @Override
- protected void setPivot(int i) {
- reader.getValue(i, pivot);
- pivotDoc = reader.getDocID(i);
- }
-
- @Override
- protected int comparePivot(int j) {
- if (k < config.packedBytesLength) {
- reader.getValue(j, scratch);
- int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + k, pivot.offset + k + config.packedBytesLength - k,
- scratch.bytes, scratch.offset + k, scratch.offset + k + config.packedBytesLength - k);
- if (cmp != 0) {
- return cmp;
- }
- }
- return pivotDoc - reader.getDocID(j);
- }
- };
- }
-
}.sort(from, to);
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestStableMSBRadixSorter.java b/lucene/core/src/test/org/apache/lucene/util/TestStableMSBRadixSorter.java
new file mode 100644
index 0000000..387b6b6
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/TestStableMSBRadixSorter.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+public class TestStableMSBRadixSorter extends LuceneTestCase {
+
+ private void test(BytesRef[] refs, int len) {
+ BytesRef[] expected = ArrayUtil.copyOfSubArray(refs, 0, len);
+ Arrays.sort(expected);
+
+ int maxLength = 0;
+ for (int i = 0; i < len; ++i) {
+ BytesRef ref = refs[i];
+ maxLength = Math.max(maxLength, ref.length);
+ }
+ switch (random().nextInt(3)) {
+ case 0:
+ maxLength += TestUtil.nextInt(random(), 1, 5);
+ break;
+ case 1:
+ maxLength = Integer.MAX_VALUE;
+ break;
+ default:
+ // leave unchanged
+ break;
+ }
+
+ final int finalMaxLength = maxLength;
+ new StableMSBRadixSorter(maxLength) {
+
+ private BytesRef[] temp;
+
+ @Override
+ protected int byteAt(int i, int k) {
+ assertTrue(k < finalMaxLength);
+ BytesRef ref = refs[i];
+ if (ref.length <= k) {
+ return -1;
+ }
+ return ref.bytes[ref.offset + k] & 0xff;
+ }
+
+ @Override
+ protected void swap(int i, int j) {
+ BytesRef tmp = refs[i];
+ refs[i] = refs[j];
+ refs[j] = tmp;
+ }
+
+ @Override
+ protected void save(int i, int j) {
+ if (temp == null) {
+ temp = new BytesRef[refs.length];
+ }
+ temp[j] = refs[i];
+ }
+
+ @Override
+ protected void restore(int i, int j) {
+ if (temp != null) {
+ System.arraycopy(temp, i, refs, i, j - i);
+ }
+ }
+ }.sort(0, len);
+ BytesRef[] actual = ArrayUtil.copyOfSubArray(refs, 0, len);
+ assertArrayEquals(expected, actual);
+ // Verify that the arrays are not only equal after sorting with Arrays#sort and
+ // StableMSBRadixSorter
+ // but also that they have the very same instance at every index.
+ // This is different from MSBRadixSorter which does not guarantee ordering of the same value.
+ assertEquals(expected.length, actual.length);
+ for (int i = 0; i < expected.length; i++) {
+ assertSame(expected[i].bytes, actual[i].bytes);
+ }
+ }
+
+ public void testEmpty() {
+ test(new BytesRef[random().nextInt(5)], 0);
+ }
+
+ public void testOneValue() {
+ BytesRef bytes = new BytesRef(TestUtil.randomSimpleString(random()));
+ test(new BytesRef[] {bytes}, 1);
+ }
+
+ public void testTwoValues() {
+ BytesRef bytes1 = new BytesRef(TestUtil.randomSimpleString(random()));
+ BytesRef bytes2 = new BytesRef(TestUtil.randomSimpleString(random()));
+ test(new BytesRef[] {bytes1, bytes2}, 2);
+ }
+
+ private void testRandom(int commonPrefixLen, int maxLen) {
+ byte[] commonPrefix = new byte[commonPrefixLen];
+ random().nextBytes(commonPrefix);
+ final int len = random().nextInt(100000);
+ BytesRef[] bytes = new BytesRef[len + random().nextInt(50)];
+ for (int i = 0; i < len; ++i) {
+ byte[] b = new byte[commonPrefixLen + random().nextInt(maxLen)];
+ random().nextBytes(b);
+ System.arraycopy(commonPrefix, 0, b, 0, commonPrefixLen);
+ bytes[i] = new BytesRef(b);
+ }
+ test(bytes, len);
+ }
+
+ public void testRandom() {
+ for (int iter = 0; iter < 10; ++iter) {
+ testRandom(0, 10);
+ }
+ }
+
+ public void testRandomWithLotsOfDuplicates() {
+ for (int iter = 0; iter < 10; ++iter) {
+ testRandom(0, 2);
+ }
+ }
+
+ public void testRandomWithSharedPrefix() {
+ for (int iter = 0; iter < 10; ++iter) {
+ testRandom(TestUtil.nextInt(random(), 1, 30), 10);
+ }
+ }
+
+ public void testRandomWithSharedPrefixAndLotsOfDuplicates() {
+ for (int iter = 0; iter < 10; ++iter) {
+ testRandom(TestUtil.nextInt(random(), 1, 30), 2);
+ }
+ }
+
+ public void testRandom2() {
+ // how large our alphabet is
+ int letterCount = TestUtil.nextInt(random(), 2, 10);
+
+ // how many substring fragments to use
+ int substringCount = TestUtil.nextInt(random(), 2, 10);
+ Set<BytesRef> substringsSet = new HashSet<>();
+
+ // how many strings to make
+ int stringCount = atLeast(10000);
+
+ // System.out.println("letterCount=" + letterCount + " substringCount=" + substringCount + "
+ // stringCount=" + stringCount);
+ while (substringsSet.size() < substringCount) {
+ int length = TestUtil.nextInt(random(), 2, 10);
+ byte[] bytes = new byte[length];
+ for (int i = 0; i < length; i++) {
+ bytes[i] = (byte) random().nextInt(letterCount);
+ }
+ BytesRef br = new BytesRef(bytes);
+ substringsSet.add(br);
+ // System.out.println("add substring count=" + substringsSet.size() + ": " + br);
+ }
+
+ BytesRef[] substrings = substringsSet.toArray(new BytesRef[substringsSet.size()]);
+ double[] chance = new double[substrings.length];
+ double sum = 0.0;
+ for (int i = 0; i < substrings.length; i++) {
+ chance[i] = random().nextDouble();
+ sum += chance[i];
+ }
+
+ // give each substring a random chance of occurring:
+ double accum = 0.0;
+ for (int i = 0; i < substrings.length; i++) {
+ accum += chance[i] / sum;
+ chance[i] = accum;
+ }
+
+ Set<BytesRef> stringsSet = new HashSet<>();
+ int iters = 0;
+ while (stringsSet.size() < stringCount && iters < stringCount * 5) {
+ int count = TestUtil.nextInt(random(), 1, 5);
+ BytesRefBuilder b = new BytesRefBuilder();
+ for (int i = 0; i < count; i++) {
+ double v = random().nextDouble();
+ accum = 0.0;
+ for (int j = 0; j < substrings.length; j++) {
+ accum += chance[j];
+ if (accum >= v) {
+ b.append(substrings[j]);
+ break;
+ }
+ }
+ }
+ BytesRef br = b.toBytesRef();
+ stringsSet.add(br);
+ // System.out.println("add string count=" + stringsSet.size() + ": " + br);
+ iters++;
+ }
+
+ test(stringsSet.toArray(new BytesRef[stringsSet.size()]), stringsSet.size());
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index f6eacc6..d37284a 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -1465,6 +1465,18 @@ public class TestBKD extends LuceneTestCase {
@Override
public byte getByteAt(int i, int k) {
+ BytesRef b = new BytesRef();
+ getValue(i, b);
+ return b.bytes[b.offset + k];
+ }
+
+ @Override
+ public void save(int i, int j) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void restore(int i, int j) {
throw new UnsupportedOperationException();
}
};
@@ -1582,6 +1594,16 @@ public class TestBKD extends LuceneTestCase {
public int getDocCount() {
return 11;
}
+
+ @Override
+ public void save(int i, int j) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void restore(int i, int j) {
+ throw new UnsupportedOperationException();
+ }
};
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
IllegalStateException ex = expectThrows(IllegalStateException.class, () -> { w.writeField(out, out, out, "", val);});
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
index e75ab24..d4869b0 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
@@ -30,16 +30,22 @@ import org.apache.lucene.util.TestUtil;
public class TestMutablePointsReaderUtils extends LuceneTestCase {
public void testSort() {
- for (int iter = 0; iter < 5; ++iter) {
- doTestSort();
+ for (int iter = 0; iter < 10; ++iter) {
+ doTestSort(false);
+ }
+ }
+
+ public void testSortWithIncrementalDocId() {
+ for (int iter = 0; iter < 10; ++iter) {
+ doTestSort(true);
}
}
- private void doTestSort() {
+ private void doTestSort(boolean isDocIdIncremental) {
final int bytesPerDim = TestUtil.nextInt(random(), 1, 16);
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
BKDConfig config = new BKDConfig(1, 1, bytesPerDim, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
- Point[] points = createRandomPoints(config, maxDoc, new int[1]);
+ Point[] points = createRandomPoints(config, maxDoc, new int[1], isDocIdIncremental);
DummyPointsReader reader = new DummyPointsReader(points);
MutablePointsReaderUtils.sort(config, maxDoc, reader, 0, points.length);
Arrays.sort(points, new Comparator<Point>() {
@@ -53,7 +59,23 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
}
});
assertNotSame(points, reader.points);
- assertArrayEquals(points, reader.points);
+ assertEquals(points.length, reader.points.length);
+
+ // Check doc IDs are in ascending order.
+ // If doc IDs are already increasing, StableMSBRadixSorter should keep doc ID's ordering.
+ // If doc IDs are not ordered, StableMSBRadixSorter should compare doc ID to guarantee the
+ // ordering.
+ Point prevPoint = null;
+ for (int i = 0; i < points.length; i++) {
+ assertEquals(points[i].packedValue, reader.points[i].packedValue);
+ assertSame(points[i].packedValue, reader.points[i].packedValue);
+ if (prevPoint != null) {
+ if (reader.points[i].packedValue.equals(prevPoint.packedValue)) {
+ assertTrue(reader.points[i].doc >= prevPoint.doc);
+ }
+ }
+ prevPoint = reader.points[i];
+ }
}
public void testSortByDim() {
@@ -66,7 +88,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
BKDConfig config = createRandomConfig();
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
int[] commonPrefixLengths = new int[config.numDims];
- Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths);
+ Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths, false);
DummyPointsReader reader = new DummyPointsReader(points);
final int sortedDim = random().nextInt(config.numIndexDims);
MutablePointsReaderUtils.sortByDim(config, sortedDim, commonPrefixLengths, reader, 0, points.length,
@@ -99,7 +121,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
BKDConfig config = createRandomConfig();
int[] commonPrefixLengths = new int[config.numDims];
final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
- Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths);
+ Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths, false);
final int splitDim = random().nextInt(config.numIndexDims);
DummyPointsReader reader = new DummyPointsReader(points);
final int pivot = TestUtil.nextInt(random(), 0, points.length - 1);
@@ -138,15 +160,15 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
return new BKDConfig(numDims, numIndexDims, bytesPerDim, maxPointsInLeafNode);
}
- private static Point[] createRandomPoints(BKDConfig config, int maxDoc, int[] commonPrefixLengths) {
+ private static Point[] createRandomPoints(BKDConfig config, int maxDoc, int[] commonPrefixLengths, boolean isDocIdIncremental) {
assertTrue(commonPrefixLengths.length == config.numDims);
final int numPoints = TestUtil.nextInt(random(), 1, 100000);
Point[] points = new Point[numPoints];
- if (random().nextInt(5) != 0) {
+ if (random().nextInt(10) != 0) {
for (int i = 0; i < numPoints; ++i) {
byte[] value = new byte[config.packedBytesLength];
random().nextBytes(value);
- points[i] = new Point(value, random().nextInt(maxDoc));
+ points[i] = new Point(value, isDocIdIncremental ? Math.min(i, maxDoc - 1) : random().nextInt(maxDoc));
}
for (int i = 0; i < config.numDims; ++i) {
commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, config.bytesPerDim);
@@ -170,7 +192,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
System.arraycopy(indexDims, 0, value, 0, config.packedIndexBytesLength);
random().nextBytes(dataDims);
System.arraycopy(dataDims, 0, value, config.packedIndexBytesLength, numDataDims * config.bytesPerDim);
- points[i] = new Point(value, random().nextInt(maxDoc));
+ points[i] = new Point(value, isDocIdIncremental ? Math.min(i, maxDoc - 1) : random().nextInt(maxDoc));
}
for (int i = 0; i < config.numIndexDims; ++i) {
commonPrefixLengths[i] = config.bytesPerDim;
@@ -228,6 +250,8 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
private final Point[] points;
+ private Point[] temp;
+
DummyPointsReader(Point[] points) {
this.points = points.clone();
}
@@ -300,6 +324,20 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
throw new UnsupportedOperationException();
}
+ @Override
+ public void save(int i, int j) {
+ if (temp == null) {
+ temp = new Point[points.length];
+ }
+ temp[j] = points[i];
+ }
+
+ @Override
+ public void restore(int i, int j) {
+ if (temp != null) {
+ System.arraycopy(temp, i, points, i, j - i);
+ }
+ }
}
}