You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by iv...@apache.org on 2019/06/26 07:45:50 UTC
[lucene-solr] branch master updated: LUCENE-8879: Improve
BKDRadixSelector tests
This is an automated email from the ASF dual-hosted git repository.
ivera pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 36eaf75 LUCENE-8879: Improve BKDRadixSelector tests
36eaf75 is described below
commit 36eaf75b1ff90dae78cf119179ac48cfb3aa3f8b
Author: Ignacio Vera <iv...@gmail.com>
AuthorDate: Wed Jun 26 09:45:44 2019 +0200
LUCENE-8879: Improve BKDRadixSelector tests
This change adds explicit test for the sorting capabilities.
---
lucene/CHANGES.txt | 4 +-
.../lucene/util/bkd/TestBKDRadixSelector.java | 65 +++++---
.../apache/lucene/util/bkd/TestBKDRadixSort.java | 163 +++++++++++++++++++++
3 files changed, 211 insertions(+), 21 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c557fa2..ba1fb2a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -145,7 +145,9 @@ Other
* LUCENE-8778: Define analyzer SPI names as static final fields and document the names in Javadocs.
(Tomoko Uchida, Uwe Schindler)
-* LUCENE-8838: Remove support for Steiner points on Tessellator. (Ignacio Vera)
+* LUCENE-8838: Remove support for Steiner points on Tessellator. (Ignacio Vera)
+
+* LUCENE-8879: Improve BKDRadixSelector tests. (Ignacio Vera)
======================= Lucene 8.1.1 =======================
(No Changes)
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java
index dfe654b..075cd1c 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java
@@ -48,7 +48,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.append(value, 3);
points.close();
PointWriter copy = copyPoints(dir,points, packedLength);
- verify(dir, copy, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0);
+ verify(dir, copy, dimensions, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0);
dir.close();
}
@@ -79,9 +79,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
int partitionPoint = TestUtil.nextInt(random(), start + 1, end - 1);
int sortedOnHeap = random().nextInt(5000);
- int dimensions = TestUtil.nextInt(random(), 1, 8);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
- int packedLength = dimensions * bytesPerDimensions;
+ int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
byte[] value = new byte[packedLength];
for (int i =0; i < values; i++) {
@@ -89,7 +90,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.append(value, i);
}
points.close();
- verify(dir, points, dimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+ verify(dir, points, dataDimensions, indexDimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
@@ -112,7 +113,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
}
points.close();
- verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+ verify(dir, points, dimensions, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
@@ -121,9 +122,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
Directory dir = getDirectory(values);
int partitionPoint = random().nextInt(values);
int sortedOnHeap = random().nextInt(5000);
- int dimensions = TestUtil.nextInt(random(), 1, 8);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
- int packedLength = dimensions * bytesPerDimensions;
+ int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
byte[] value = new byte[packedLength];
random().nextBytes(value);
@@ -135,7 +137,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
}
points.close();
- verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+ verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
@@ -144,9 +146,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
Directory dir = getDirectory(values);
int partitionPoint = random().nextInt(values);
int sortedOnHeap = random().nextInt(5000);
- int dimensions = TestUtil.nextInt(random(), 1, 8);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
- int packedLength = dimensions * bytesPerDimensions;
+ int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
byte[] value = new byte[packedLength];
random().nextBytes(value);
@@ -154,7 +157,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.append(value, 0);
}
points.close();
- verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+ verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
@@ -163,9 +166,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
Directory dir = getDirectory(values);
int partitionPoint = random().nextInt(values);
int sortedOnHeap = random().nextInt(5000);
- int dimensions = TestUtil.nextInt(random(), 1, 8);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
- int packedLength = dimensions * bytesPerDimensions;
+ int packedLength = dataDimensions * bytesPerDimensions;
PointWriter points = getRandomPointWriter(dir, values, packedLength);
int numberValues = random().nextInt(8) + 2;
byte[][] differentValues = new byte[numberValues][packedLength];
@@ -176,14 +180,37 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.append(differentValues[random().nextInt(numberValues)], i);
}
points.close();
- verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+ verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
dir.close();
}
- private void verify(Directory dir, PointWriter points, int dimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
- BKDRadixSelector radixSelector = new BKDRadixSelector(dimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
- //we check for each dimension
- for (int splitDim =0; splitDim < dimensions; splitDim++) {
+ public void testRandomDataDimDiffValues() throws IOException {
+ int values = atLeast(15000);
+ Directory dir = getDirectory(values);
+ int partitionPoint = random().nextInt(values);
+ int sortedOnHeap = random().nextInt(5000);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+ int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
+ int packedLength = dataDimensions * bytesPerDimensions;
+ PointWriter points = getRandomPointWriter(dir, values, packedLength);
+ byte[] value = new byte[packedLength];
+ byte[] dataValue = new byte[(dataDimensions - indexDimensions) * bytesPerDimensions];
+ random().nextBytes(value);
+ for (int i =0; i < values; i++) {
+ random().nextBytes(dataValue);
+ System.arraycopy(dataValue, 0, value, indexDimensions * bytesPerDimensions, (dataDimensions - indexDimensions) * bytesPerDimensions);
+ points.append(value, i);
+ }
+ points.close();
+ verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+ dir.close();
+ }
+
+ private void verify(Directory dir, PointWriter points, int dataDimensions, int indexDimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
+ BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
+ //we only split by indexed dimension so we check for each only those dimension
+ for (int splitDim = 0; splitDim < indexDimensions; splitDim++) {
//We need to make a copy of the data as it is deleted in the process
BKDRadixSelector.PathSlice inputSlice = new BKDRadixSelector.PathSlice(copyPoints(dir, points, packedLength), 0, points.count());
int commonPrefixLengthInput = getRandomCommonPrefix(inputSlice, bytesPerDimensions, splitDim);
@@ -208,7 +235,6 @@ public class TestBKDRadixSelector extends LuceneTestCase {
points.destroy();
}
-
private PointWriter copyPoints(Directory dir, PointWriter points, int packedLength) throws IOException {
try (PointWriter copy = getRandomPointWriter(dir, points.count(), packedLength);
PointReader reader = points.getReader(0, points.count())) {
@@ -318,5 +344,4 @@ public class TestBKDRadixSelector extends LuceneTestCase {
}
return docID;
}
-
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java
new file mode 100644
index 0000000..a7cebd0
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestBKDRadixSort extends LuceneTestCase {
+
+ public void testRandom() throws IOException {
+ int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+ int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+ int packedBytesLength = dataDimensions * bytesPerDim;
+ HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+ byte[] value = new byte[packedBytesLength];
+ for (int i = 0; i < numPoints; i++) {
+ random().nextBytes(value);
+ points.append(value, i);
+ }
+ verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+ }
+
+ public void testRandomAllEquals() throws IOException {
+ int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+ int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+ int packedBytesLength = dataDimensions * bytesPerDim;
+ HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+ byte[] value = new byte[packedBytesLength];
+ random().nextBytes(value);
+ for (int i = 0; i < numPoints; i++) {
+ points.append(value, random().nextInt(numPoints));
+ }
+ verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+ }
+
+ public void testRandomLastByteTwoValues() throws IOException {
+ int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+ int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+ int packedBytesLength = dataDimensions * bytesPerDim;
+ HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+ byte[] value = new byte[packedBytesLength];
+ random().nextBytes(value);
+ for (int i = 0; i < numPoints; i++) {
+ if (random().nextBoolean()) {
+ points.append(value, 1);
+ } else {
+ points.append(value, 2);
+ }
+ }
+ verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+ }
+
+ public void testRandomFewDifferentValues() throws IOException {
+ int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+ int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+ int packedBytesLength = dataDimensions * bytesPerDim;
+ HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+ int numberValues = random().nextInt(8) + 2;
+ byte[][] differentValues = new byte[numberValues][packedBytesLength];
+ for (int i = 0; i < numberValues; i++) {
+ random().nextBytes(differentValues[i]);
+ }
+ for (int i = 0; i < numPoints; i++) {
+ points.append(differentValues[random().nextInt(numberValues)], i);
+ }
+ verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+ }
+
+ public void testRandomDataDimDifferent() throws IOException {
+ int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+ int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+ int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+ int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+ int packedBytesLength = dataDimensions * bytesPerDim;
+ HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+ byte[] value = new byte[packedBytesLength];
+ int totalDataDimension = dataDimensions - indexDimensions;
+ byte[] dataDimensionValues = new byte[totalDataDimension * bytesPerDim];
+ random().nextBytes(value);
+ for (int i = 0; i < numPoints; i++) {
+ random().nextBytes(dataDimensionValues);
+ System.arraycopy(dataDimensionValues, 0, value, indexDimensions * bytesPerDim, totalDataDimension * bytesPerDim);
+ points.append(value, random().nextInt(numPoints));
+ }
+ verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+ }
+
+ private void verifySort(HeapPointWriter points, int dataDimensions, int indexDimensions, int start, int end, int bytesPerDim) throws IOException{
+ int packedBytesLength = dataDimensions * bytesPerDim;
+ Directory dir = newDirectory();
+ BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDim, 1000, dir, "test");
+ // we check for each dimension
+ for (int splitDim = 0; splitDim < dataDimensions; splitDim++) {
+ radixSelector.heapRadixSort(points, start, end, splitDim, getRandomCommonPrefix(points, start, end, bytesPerDim, splitDim));
+ byte[] previous = new byte[bytesPerDim * dataDimensions];
+ int previousDocId = -1;
+ Arrays.fill(previous, (byte) 0);
+ int dimOffset = splitDim * bytesPerDim;
+ for (int j = start; j < end; j++) {
+ PointValue pointValue = points.getPackedValueSlice(j);
+ BytesRef value = pointValue.packedValue();
+ int cmp = Arrays.compareUnsigned(value.bytes, value.offset + dimOffset, value.offset + dimOffset + bytesPerDim, previous, dimOffset, dimOffset + bytesPerDim);
+ assertTrue(cmp >= 0);
+ if (cmp == 0) {
+ assertTrue(pointValue.docID() >= previousDocId);
+ }
+ System.arraycopy(value.bytes, value.offset, previous, 0, packedBytesLength);
+ previousDocId = pointValue.docID();
+ }
+ }
+ dir.close();
+ }
+
+ /** returns a common prefix length equal or lower than the current one */
+ private int getRandomCommonPrefix(HeapPointWriter points, int start, int end, int bytesPerDimension, int sortDim) {
+ int commonPrefixLength = bytesPerDimension;
+ PointValue value = points.getPackedValueSlice(start);
+ BytesRef bytesRef = value.packedValue();
+ byte[] firstValue = new byte[bytesPerDimension];
+ int offset = sortDim * bytesPerDimension;
+ System.arraycopy(bytesRef.bytes, bytesRef.offset + offset, firstValue, 0, bytesPerDimension);
+ for (int i = start + 1; i < end; i++) {
+ value = points.getPackedValueSlice(i);
+ bytesRef = value.packedValue();
+ int diff = Arrays.mismatch(bytesRef.bytes, bytesRef.offset + offset, bytesRef.offset + offset + bytesPerDimension, firstValue, 0, bytesPerDimension);
+ if (diff != -1 && commonPrefixLength > diff) {
+ if (diff == 0) {
+ return diff;
+ }
+ commonPrefixLength = diff;
+ }
+ }
+ return (random().nextBoolean()) ? commonPrefixLength : random().nextInt(commonPrefixLength);
+ }
+}