You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by iv...@apache.org on 2019/06/26 07:50:44 UTC

[lucene-solr] 01/02: LUCENE-8879: Improve BKDRadixSelector tests

This is an automated email from the ASF dual-hosted git repository.

ivera pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit afb4b928bc2c3d352e1002217787323d54f51535
Author: Ignacio Vera <iv...@gmail.com>
AuthorDate: Wed Jun 26 09:45:44 2019 +0200

    LUCENE-8879: Improve BKDRadixSelector tests
    
    This change adds explicit test for the sorting capabilities.
---
 lucene/CHANGES.txt                                 |   4 +-
 .../lucene/util/bkd/TestBKDRadixSelector.java      |  65 +++++---
 .../apache/lucene/util/bkd/TestBKDRadixSort.java   | 163 +++++++++++++++++++++
 3 files changed, 211 insertions(+), 21 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bbe2ccc..6652c7c 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -121,7 +121,9 @@ Other
 * LUCENE-8778: Define analyzer SPI names as static final fields and document the names in Javadocs.
   (Tomoko Uchida, Uwe Schindler)
 
-* LUCENE-8838: Remove support for Steiner points on Tessellator.  (Ignacio Vera)
+* LUCENE-8838: Remove support for Steiner points on Tessellator. (Ignacio Vera)
+
+* LUCENE-8879: Improve BKDRadixSelector tests. (Ignacio Vera)
 
 ======================= Lucene 8.1.1 =======================
 (No Changes)
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java
index c290825..4566673 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSelector.java
@@ -49,7 +49,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
     points.append(value, 3);
     points.close();
     PointWriter copy = copyPoints(dir,points, packedLength);
-    verify(dir, copy, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0);
+    verify(dir, copy, dimensions, dimensions, 0, values, middle, packedLength, bytesPerDimensions, 0);
     dir.close();
   }
 
@@ -80,9 +80,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
     }
     int partitionPoint = TestUtil.nextInt(random(), start + 1, end - 1);
     int sortedOnHeap = random().nextInt(5000);
-    int dimensions =  TestUtil.nextInt(random(), 1, 8);
+    int indexDimensions =  TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions =  TestUtil.nextInt(random(), indexDimensions, 8);
     int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
-    int packedLength = dimensions * bytesPerDimensions;
+    int packedLength = dataDimensions * bytesPerDimensions;
     PointWriter points = getRandomPointWriter(dir, values, packedLength);
     byte[] value = new byte[packedLength];
     for (int i =0; i < values; i++) {
@@ -90,7 +91,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
       points.append(value, i);
     }
     points.close();
-    verify(dir, points, dimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+    verify(dir, points, dataDimensions, indexDimensions, start, end, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
     dir.close();
   }
 
@@ -113,7 +114,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
       }
     }
     points.close();
-    verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+    verify(dir, points, dimensions, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
     dir.close();
   }
 
@@ -122,9 +123,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
     Directory dir = getDirectory(values);
     int partitionPoint = random().nextInt(values);
     int sortedOnHeap = random().nextInt(5000);
-    int dimensions =  TestUtil.nextInt(random(), 1, 8);
+    int indexDimensions =  TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions =  TestUtil.nextInt(random(), indexDimensions, 8);
     int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
-    int packedLength = dimensions * bytesPerDimensions;
+    int packedLength = dataDimensions * bytesPerDimensions;
     PointWriter points = getRandomPointWriter(dir, values, packedLength);
     byte[] value = new byte[packedLength];
     random().nextBytes(value);
@@ -136,7 +138,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
       }
     }
     points.close();
-    verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+    verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
     dir.close();
   }
 
@@ -145,9 +147,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
     Directory dir = getDirectory(values);
     int partitionPoint = random().nextInt(values);
     int sortedOnHeap = random().nextInt(5000);
-    int dimensions =  TestUtil.nextInt(random(), 1, 8);
+    int indexDimensions =  TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions =  TestUtil.nextInt(random(), indexDimensions, 8);
     int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
-    int packedLength = dimensions * bytesPerDimensions;
+    int packedLength = dataDimensions * bytesPerDimensions;
     PointWriter points = getRandomPointWriter(dir, values, packedLength);
     byte[] value = new byte[packedLength];
     random().nextBytes(value);
@@ -155,7 +158,7 @@ public class TestBKDRadixSelector extends LuceneTestCase {
       points.append(value, 0);
     }
     points.close();
-    verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+    verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
     dir.close();
   }
 
@@ -164,9 +167,10 @@ public class TestBKDRadixSelector extends LuceneTestCase {
     Directory dir = getDirectory(values);
     int partitionPoint = random().nextInt(values);
     int sortedOnHeap = random().nextInt(5000);
-    int dimensions =  TestUtil.nextInt(random(), 1, 8);
+    int indexDimensions =  TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions =  TestUtil.nextInt(random(), indexDimensions, 8);
     int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
-    int packedLength = dimensions * bytesPerDimensions;
+    int packedLength = dataDimensions * bytesPerDimensions;
     PointWriter points = getRandomPointWriter(dir, values, packedLength);
     int numberValues = random().nextInt(8) + 2;
     byte[][] differentValues = new byte[numberValues][packedLength];
@@ -177,14 +181,37 @@ public class TestBKDRadixSelector extends LuceneTestCase {
       points.append(differentValues[random().nextInt(numberValues)], i);
     }
     points.close();
-    verify(dir, points, dimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+    verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
     dir.close();
   }
 
-  private void verify(Directory dir, PointWriter points, int dimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
-    BKDRadixSelector radixSelector = new BKDRadixSelector(dimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
-    //we check for each dimension
-    for (int splitDim =0; splitDim < dimensions; splitDim++) {
+  public void testRandomDataDimDiffValues() throws IOException {
+    int values = atLeast(15000);
+    Directory dir = getDirectory(values);
+    int partitionPoint = random().nextInt(values);
+    int sortedOnHeap = random().nextInt(5000);
+    int indexDimensions =  TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions =  TestUtil.nextInt(random(), indexDimensions, 8);
+    int bytesPerDimensions = TestUtil.nextInt(random(), 2, 30);
+    int packedLength = dataDimensions * bytesPerDimensions;
+    PointWriter points = getRandomPointWriter(dir, values, packedLength);
+    byte[] value = new byte[packedLength];
+    byte[] dataValue = new byte[(dataDimensions - indexDimensions) * bytesPerDimensions];
+    random().nextBytes(value);
+    for (int i =0; i < values; i++) {
+      random().nextBytes(dataValue);
+      System.arraycopy(dataValue, 0, value, indexDimensions * bytesPerDimensions, (dataDimensions - indexDimensions) * bytesPerDimensions);
+      points.append(value, i);
+    }
+    points.close();
+    verify(dir, points, dataDimensions, indexDimensions, 0, values, partitionPoint, packedLength, bytesPerDimensions, sortedOnHeap);
+    dir.close();
+  }
+
+  private void verify(Directory dir, PointWriter points, int dataDimensions, int indexDimensions, long start, long end, long middle, int packedLength, int bytesPerDimensions, int sortedOnHeap) throws IOException{
+    BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDimensions, sortedOnHeap, dir, "test");
+    //we only split by indexed dimension so we check for each only those dimension
+    for (int splitDim = 0; splitDim < indexDimensions; splitDim++) {
       //We need to make a copy of the data as it is deleted in the process
       BKDRadixSelector.PathSlice inputSlice = new BKDRadixSelector.PathSlice(copyPoints(dir, points, packedLength), 0, points.count());
       int commonPrefixLengthInput = getRandomCommonPrefix(inputSlice, bytesPerDimensions, splitDim);
@@ -209,7 +236,6 @@ public class TestBKDRadixSelector extends LuceneTestCase {
     points.destroy();
   }
 
-
   private PointWriter copyPoints(Directory dir, PointWriter points, int packedLength) throws IOException {
     try (PointWriter copy  = getRandomPointWriter(dir, points.count(), packedLength);
          PointReader reader = points.getReader(0, points.count())) {
@@ -319,5 +345,4 @@ public class TestBKDRadixSelector extends LuceneTestCase {
     }
     return docID;
   }
-
 }
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java
new file mode 100644
index 0000000..a7cebd0
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKDRadixSort.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestBKDRadixSort extends LuceneTestCase {
+
+  public void testRandom() throws IOException {
+    int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+    int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int packedBytesLength = dataDimensions * bytesPerDim;
+    HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+    byte[] value = new byte[packedBytesLength];
+    for (int i = 0; i < numPoints; i++) {
+      random().nextBytes(value);
+      points.append(value, i);
+    }
+    verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+  }
+
+  public void testRandomAllEquals() throws IOException {
+    int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+    int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int packedBytesLength = dataDimensions * bytesPerDim;
+    HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+    byte[] value = new byte[packedBytesLength];
+    random().nextBytes(value);
+    for (int i = 0; i < numPoints; i++) {
+      points.append(value, random().nextInt(numPoints));
+    }
+    verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+  }
+
+  public void testRandomLastByteTwoValues() throws IOException {
+    int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+    int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int packedBytesLength = dataDimensions * bytesPerDim;
+    HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+    byte[] value = new byte[packedBytesLength];
+    random().nextBytes(value);
+    for (int i = 0; i < numPoints; i++) {
+      if (random().nextBoolean()) {
+        points.append(value, 1);
+      } else {
+        points.append(value, 2);
+      }
+    }
+    verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+  }
+
+  public void testRandomFewDifferentValues() throws IOException {
+    int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+    int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int packedBytesLength = dataDimensions * bytesPerDim;
+    HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+    int numberValues = random().nextInt(8) + 2;
+    byte[][] differentValues = new byte[numberValues][packedBytesLength];
+    for (int i = 0; i < numberValues; i++) {
+      random().nextBytes(differentValues[i]);
+    }
+    for (int i = 0; i < numPoints; i++) {
+      points.append(differentValues[random().nextInt(numberValues)], i);
+    }
+    verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+  }
+
+  public void testRandomDataDimDifferent() throws IOException {
+    int numPoints = TestUtil.nextInt(random(), 1, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    int indexDimensions = TestUtil.nextInt(random(), 1, 8);
+    int dataDimensions = TestUtil.nextInt(random(), indexDimensions, 8);
+    int bytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int packedBytesLength = dataDimensions * bytesPerDim;
+    HeapPointWriter points = new HeapPointWriter(numPoints, packedBytesLength);
+    byte[] value = new byte[packedBytesLength];
+    int totalDataDimension = dataDimensions - indexDimensions;
+    byte[] dataDimensionValues = new byte[totalDataDimension * bytesPerDim];
+    random().nextBytes(value);
+    for (int i = 0; i < numPoints; i++) {
+      random().nextBytes(dataDimensionValues);
+      System.arraycopy(dataDimensionValues, 0, value, indexDimensions * bytesPerDim, totalDataDimension * bytesPerDim);
+      points.append(value, random().nextInt(numPoints));
+    }
+    verifySort(points, dataDimensions, indexDimensions, 0, numPoints, bytesPerDim);
+  }
+
+  private void verifySort(HeapPointWriter points, int dataDimensions, int indexDimensions, int start, int end, int bytesPerDim) throws IOException{
+    int packedBytesLength = dataDimensions * bytesPerDim;
+    Directory dir = newDirectory();
+    BKDRadixSelector radixSelector = new BKDRadixSelector(dataDimensions, bytesPerDim, 1000, dir, "test");
+    // we check for each dimension
+    for (int splitDim = 0; splitDim < dataDimensions; splitDim++) {
+      radixSelector.heapRadixSort(points, start, end, splitDim, getRandomCommonPrefix(points, start, end, bytesPerDim, splitDim));
+      byte[] previous = new byte[bytesPerDim * dataDimensions];
+      int previousDocId = -1;
+      Arrays.fill(previous, (byte) 0);
+      int dimOffset = splitDim * bytesPerDim;
+      for (int j = start; j < end; j++) {
+        PointValue pointValue = points.getPackedValueSlice(j);
+        BytesRef value = pointValue.packedValue();
+        int cmp = Arrays.compareUnsigned(value.bytes, value.offset + dimOffset, value.offset + dimOffset + bytesPerDim, previous, dimOffset, dimOffset + bytesPerDim);
+        assertTrue(cmp >= 0);
+        if (cmp == 0) {
+          assertTrue(pointValue.docID() >= previousDocId);
+        }
+        System.arraycopy(value.bytes, value.offset, previous, 0, packedBytesLength);
+        previousDocId = pointValue.docID();
+      }
+    }
+    dir.close();
+  }
+
+  /** returns a common prefix length equal or lower than the current one */
+  private int getRandomCommonPrefix(HeapPointWriter points, int start, int end, int bytesPerDimension, int sortDim)  {
+    int commonPrefixLength = bytesPerDimension;
+    PointValue value = points.getPackedValueSlice(start);
+    BytesRef bytesRef = value.packedValue();
+    byte[] firstValue = new byte[bytesPerDimension];
+    int offset = sortDim * bytesPerDimension;
+    System.arraycopy(bytesRef.bytes, bytesRef.offset + offset, firstValue, 0, bytesPerDimension);
+    for (int i = start + 1; i < end; i++) {
+      value = points.getPackedValueSlice(i);
+      bytesRef = value.packedValue();
+      int diff = Arrays.mismatch(bytesRef.bytes, bytesRef.offset + offset, bytesRef.offset + offset + bytesPerDimension, firstValue, 0, bytesPerDimension);
+      if (diff != -1 && commonPrefixLength > diff) {
+        if (diff == 0) {
+          return diff;
+        }
+        commonPrefixLength = diff;
+      }
+    }
+    return (random().nextBoolean()) ? commonPrefixLength : random().nextInt(commonPrefixLength);
+  }
+}