You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/12/26 09:46:27 UTC

lucene-solr:branch_6x: LUCENE-7401: Make sure BKD trees index all dimensions.

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x ba5b81845 -> 0c1cab719


LUCENE-7401: Make sure BKD trees index all dimensions.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/0c1cab71
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/0c1cab71
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/0c1cab71

Branch: refs/heads/branch_6x
Commit: 0c1cab71920a540807555501f7198ca402e16740
Parents: ba5b818
Author: Adrien Grand <jp...@gmail.com>
Authored: Mon Dec 26 10:10:03 2016 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Mon Dec 26 10:39:34 2016 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 ++
 .../org/apache/lucene/util/bkd/BKDWriter.java   | 52 +++++++++++++++++---
 .../org/apache/lucene/util/bkd/TestBKD.java     | 48 ++++++++++++++++--
 .../org/apache/lucene/index/RandomCodec.java    |  2 +-
 4 files changed, 92 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c1cab71/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5054702..33b0deb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -120,6 +120,9 @@ Improvements
   is no longer needed. Support for older Java 9 builds was removed.
   (Uwe Schindler)
 
+* LUCENE-7401: Changed the way BKD trees pick the split dimension in order to
+  ensure all dimensions are indexed. (Adrien Grand)
+
 Optimizations
 
 * LUCENE-7568: Optimize merging when index sorting is used but the

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c1cab71/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 61a7ae9..120bd65 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -479,9 +479,12 @@ public class BKDWriter implements Closeable {
       docsSeen.set(reader.getDocID(i));
     }
 
+    final int[] parentSplits = new int[numDims];
     build(1, numLeaves, reader, 0, Math.toIntExact(pointCount), out,
-          minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
+          minPackedValue, maxPackedValue, parentSplits,
+          splitPackedValues, leafBlockFPs,
           new int[maxPointsInLeafNode]);
+    assert Arrays.equals(parentSplits, new int[numDims]);
 
     long indexFP = out.getFilePointer();
     writeIndex(out, leafBlockFPs, splitPackedValues);
@@ -1001,12 +1004,15 @@ public class BKDWriter implements Closeable {
         heapPointWriter = null;
       }
 
+      final int[] parentSplits = new int[numDims];
       build(1, numLeaves, sortedPointWriters,
             ordBitSet, out,
             minPackedValue, maxPackedValue,
+            parentSplits,
             splitPackedValues,
             leafBlockFPs,
             toCloseHeroically);
+      assert Arrays.equals(parentSplits, new int[numDims]);
 
       for(PathSlice slice : sortedPointWriters) {
         slice.writer.destroy();
@@ -1413,7 +1419,29 @@ public class BKDWriter implements Closeable {
     return true;
   }
 
-  protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
+  /**
+   * Pick the next dimension to split.
+   * @param minPackedValue the min values for all dimensions
+   * @param maxPackedValue the max values for all dimensions
+   * @param parentSplits how many times each dim has been split on the parent levels
+   * @return the dimension to split
+   */
+  protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits) {
+    // First look at whether there is a dimension that has split less than 2x less than
+    // the dim that has most splits, and return it if there is such a dimension and it
+    // does not only have equals values. This helps ensure all dimensions are indexed.
+    int maxNumSplits = 0;
+    for (int numSplits : parentSplits) {
+      maxNumSplits = Math.max(maxNumSplits, numSplits);
+    }
+    for (int dim = 0; dim < numDims; ++dim) {
+      final int offset = dim * bytesPerDim;
+      if (parentSplits[dim] < maxNumSplits / 2 &&
+          StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) != 0) {
+        return dim;
+      }
+    }
+
     // Find which dim has the largest span so we can split on it:
     int splitDim = -1;
     for(int dim=0;dim<numDims;dim++) {
@@ -1454,6 +1482,7 @@ public class BKDWriter implements Closeable {
                      MutablePointsReader reader, int from, int to,
                      IndexOutput out,
                      byte[] minPackedValue, byte[] maxPackedValue,
+                     int[] parentSplits,
                      byte[] splitPackedValues,
                      long[] leafBlockFPs,
                      int[] spareDocIds) throws IOException {
@@ -1547,7 +1576,7 @@ public class BKDWriter implements Closeable {
       // inner node
 
       // compute the split dimension and partition around it
-      final int splitDim = split(minPackedValue, maxPackedValue);
+      final int splitDim = split(minPackedValue, maxPackedValue, parentSplits);
       final int mid = (from + to + 1) >>> 1;
 
       int commonPrefixLen = bytesPerDim;
@@ -1575,10 +1604,14 @@ public class BKDWriter implements Closeable {
           maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
 
       // recurse
+      parentSplits[splitDim]++;
       build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
-          minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
+          minPackedValue, maxSplitPackedValue, parentSplits,
+          splitPackedValues, leafBlockFPs, spareDocIds);
       build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out,
-          minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
+          minSplitPackedValue, maxPackedValue, parentSplits,
+          splitPackedValues, leafBlockFPs, spareDocIds);
+      parentSplits[splitDim]--;
     }
   }
 
@@ -1589,6 +1622,7 @@ public class BKDWriter implements Closeable {
                      LongBitSet ordBitSet,
                      IndexOutput out,
                      byte[] minPackedValue, byte[] maxPackedValue,
+                     int[] parentSplits,
                      byte[] splitPackedValues,
                      long[] leafBlockFPs,
                      List<Closeable> toCloseHeroically) throws IOException {
@@ -1699,7 +1733,7 @@ public class BKDWriter implements Closeable {
 
       int splitDim;
       if (numDims > 1) {
-        splitDim = split(minPackedValue, maxPackedValue);
+        splitDim = split(minPackedValue, maxPackedValue, parentSplits);
       } else {
         splitDim = 0;
       }
@@ -1767,10 +1801,11 @@ public class BKDWriter implements Closeable {
         }
       }
 
+      parentSplits[splitDim]++;
       // Recurse on left tree:
       build(2*nodeID, leafNodeOffset, leftSlices,
             ordBitSet, out,
-            minPackedValue, maxSplitPackedValue,
+            minPackedValue, maxSplitPackedValue, parentSplits,
             splitPackedValues, leafBlockFPs, toCloseHeroically);
       for(int dim=0;dim<numDims;dim++) {
         // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
@@ -1783,7 +1818,7 @@ public class BKDWriter implements Closeable {
       // Recurse on right tree:
       build(2*nodeID+1, leafNodeOffset, rightSlices,
             ordBitSet, out,
-            minSplitPackedValue, maxPackedValue,
+            minSplitPackedValue, maxPackedValue, parentSplits,
             splitPackedValues, leafBlockFPs, toCloseHeroically);
       for(int dim=0;dim<numDims;dim++) {
         // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
@@ -1791,6 +1826,7 @@ public class BKDWriter implements Closeable {
           rightSlices[dim].writer.destroy();
         }
       }
+      parentSplits[splitDim]--;
     }
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c1cab71/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index 8b9b7a5..c5c5c1f 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -505,7 +505,45 @@ public class TestBKD extends LuceneTestCase {
       }
     }
 
-    verify(docValues, null, numDims, numBytesPerDim);
+    // Use a small number of points in leaf blocks to trigger a lot of splitting
+    verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50));
+  }
+
+  // This triggers the logic that makes sure all dimensions get indexed
+  // by looking at how many times each dim has been split
+  public void testOneDimLowCard() throws Exception {
+    int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int numDims = TestUtil.nextInt(random(), 2, 5);
+
+    int numDocs = atLeast(10000);
+    int theLowCardDim = random().nextInt(numDims);
+
+    byte[] value1 = new byte[numBytesPerDim];
+    random().nextBytes(value1);
+    byte[] value2 = value1.clone();
+    if (value2[numBytesPerDim-1] == 0 || random().nextBoolean()) {
+      value2[numBytesPerDim-1]++;
+    } else {
+      value2[numBytesPerDim-1]--;
+    }
+
+    byte[][][] docValues = new byte[numDocs][][];
+
+    for(int docID=0;docID<numDocs;docID++) {
+      byte[][] values = new byte[numDims][];
+      for(int dim=0;dim<numDims;dim++) {
+        if (dim == theLowCardDim) {
+          values[dim] = random().nextBoolean() ? value1 : value2;
+        } else {
+          values[dim] = new byte[numBytesPerDim];
+          random().nextBytes(values[dim]);
+        }
+      }
+      docValues[docID] = values;
+    }
+
+    // Use a small number of points in leaf blocks to trigger a lot of splitting
+    verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50));
   }
 
   // this should trigger run-length compression with lengths that are greater than 255
@@ -567,12 +605,14 @@ public class TestBKD extends LuceneTestCase {
     verify(docValuesArray, docIDsArray, numDims, numBytesPerDim);
   }
 
-
-
   /** docIDs can be null, for the single valued case, else it maps value to docID */
   private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim) throws Exception {
+    verify(docValues, docIDs, numDims, numBytesPerDim, TestUtil.nextInt(random(), 50, 1000));
+  }
+
+  private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim,
+      int maxPointsInLeafNode) throws Exception {
     try (Directory dir = getDirectory(docValues.length)) {
-      int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
       double maxMB = (float) 3.0 + (3*random().nextDouble());
       verify(dir, docValues, docIDs, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0c1cab71/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
index 127549f..b23ab60 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
@@ -293,7 +293,7 @@ public class RandomCodec extends AssertingCodec {
     }
 
     @Override
-    protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
+    protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentDims) {
       // BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)
       return random.nextInt(numDims);
     }