Posted to commits@lucene.apache.org by iv...@apache.org on 2020/05/02 09:34:30 UTC

[lucene-solr] branch master updated: LUCENE-9087: Build always trees with full leaves and lower the default value for maxPointsPerLeafNode to 512

This is an automated email from the ASF dual-hosted git repository.

ivera pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 96c47bc  LUCENE-9087: Build always trees with full leaves and lower the default value for maxPointsPerLeafNode to 512
96c47bc is described below

commit 96c47bc8508142b5bd11d2cdc492df380801efec
Author: Ignacio Vera <iv...@apache.org>
AuthorDate: Sat May 2 11:34:19 2020 +0200

    LUCENE-9087: Build always trees with full leaves and lower the default value for maxPointsPerLeafNode to 512
---
 lucene/CHANGES.txt                                 |   3 +
 .../java/org/apache/lucene/util/bkd/BKDWriter.java | 184 +++++++++------------
 .../codecs/lucene60/TestLucene60PointsFormat.java  |  12 +-
 .../test/org/apache/lucene/util/bkd/TestBKD.java   |  16 +-
 4 files changed, 89 insertions(+), 126 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index eca9bb8..fe9bd82 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -193,6 +193,9 @@ Optimizations
   computed at deserialization time, therefore we can be more selective when decoding points of a triangle.
   (Ignacio Vera)
 
+* LUCENE-9087: Build always trees with full leaves and lower the default value for maxPointsPerLeafNode to 512.
+  (Ignacio Vera)
+
 Bug Fixes
 ---------------------
 * LUCENE-9259: Fix wrong NGramFilterFactory argument name for preserveOriginal option (Paul Pazderski)
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 727b824..3b46763 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -57,9 +57,10 @@ import org.apache.lucene.util.PriorityQueue;
  *  Recursively builds a block KD-tree to assign all incoming points in N-dim space to smaller
  *  and smaller N-dim rectangles (cells) until the number of points in a given
  *  rectangle is &lt;= <code>maxPointsInLeafNode</code>.  The tree is
- *  fully balanced, which means the leaf nodes will have between 50% and 100% of
- *  the requested <code>maxPointsInLeafNode</code>.  Values that fall exactly
- *  on a cell boundary may be in either cell.
+ *  partially balanced, which means the leaf nodes will have
+ *  the requested <code>maxPointsInLeafNode</code> except one that might have less.
+ *  Leaf nodes may straddle the two bottom levels of the binary tree.
+ *  Values that fall exactly on a cell boundary may be in either cell.
  *
  *  <p>The number of dimensions can be 1 to 8, but every byte[] value is fixed length.
  *
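
The reworded javadoc above captures the core behavioral change: instead of a fully balanced tree whose leaves hold between 50% and 100% of maxPointsInLeafNode, the writer now fills every leaf completely except possibly one, and leaf blocks may sit on the two bottom levels of the tree. As a rough illustration (numbers chosen for this note, not taken from the commit):

    1,300 points, maxPointsInLeafNode = 512
      before: 4 leaves -> 325 | 325 | 325 | 325   (fully balanced, leaves ~63% full)
      after : 3 leaves -> 512 | 512 | 276         (all leaves full except the last)
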
@@ -87,7 +88,7 @@ public class BKDWriter implements Closeable {
   private final int bytesPerDoc;
 
   /** Default maximum number of point in each leaf block */
-  public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024;
+  public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 512;
 
   /** Default maximum heap to use, before spilling to (slower) disk */
   public static final float DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0f;
@@ -420,15 +421,9 @@ public class BKDWriter implements Closeable {
     // Mark that we already finished:
     finished = true;
 
-    long countPerLeaf = pointCount = values.size();
-    long innerNodeCount = 1;
+    pointCount = values.size();
 
-    while (countPerLeaf > maxPointsInLeafNode) {
-      countPerLeaf = (countPerLeaf+1)/2;
-      innerNodeCount *= 2;
-    }
-
-    int numLeaves = Math.toIntExact(innerNodeCount);
+    final int numLeaves = Math.toIntExact((pointCount + maxPointsInLeafNode - 1) / maxPointsInLeafNode);
 
     checkMaxLeafNodeCount(numLeaves);
 
@@ -442,14 +437,14 @@ public class BKDWriter implements Closeable {
     }
 
     final int[] parentSplits = new int[numIndexDims];
-    build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out,
+    build(1, 0, numLeaves, values, 0, Math.toIntExact(pointCount), out,
           minPackedValue.clone(), maxPackedValue.clone(), parentSplits,
           splitPackedValues, leafBlockFPs,
           new int[maxPointsInLeafNode]);
     assert Arrays.equals(parentSplits, new int[numIndexDims]);
 
     long indexFP = out.getFilePointer();
-    writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
+    writeIndex(out, maxPointsInLeafNode, leafBlockFPs, splitPackedValues);
     return indexFP;
   }
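
In the one-dimension path above, the halving loop that derived countPerLeaf and innerNodeCount is replaced by a single ceiling division, so the leaf count is no longer rounded up to a power of two and writeIndex records maxPointsInLeafNode directly. A minimal standalone sketch of the two computations, assuming pointCount and maxPointsInLeafNode stand for the writer's fields (illustrative only, not part of the commit):

    // old: halve the per-leaf count until it fits, doubling the leaf count each time
    long countPerLeaf = pointCount;
    long innerNodeCount = 1;
    while (countPerLeaf > maxPointsInLeafNode) {
      countPerLeaf = (countPerLeaf + 1) / 2;
      innerNodeCount *= 2;
    }
    int oldNumLeaves = Math.toIntExact(innerNodeCount);   // always a power of two

    // new: one ceiling division, every leaf except possibly the last is full
    int newNumLeaves = Math.toIntExact((pointCount + maxPointsInLeafNode - 1) / maxPointsInLeafNode);

    // for the 1,300-point layout shown earlier: oldNumLeaves = 4, newNumLeaves = 3
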
 
@@ -665,55 +660,45 @@ public class BKDWriter implements Closeable {
     }
   }
 
-  // TODO: there must be a simpler way?
-  private void rotateToTree(int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
-    //System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + " bpd=" + bytesPerDim + " index.length=" + index.length);
-    if (count == 1) {
+  private void rotateToTree(int nodeID, int offset, int numNodes, byte[] index, List<byte[]> leafBlockStartValues) {
+    if (numNodes == 1) {
       // Leaf index node
-      //System.out.println("  leaf index node");
-      //System.out.println("  index[" + nodeID + "] = blockStartValues[" + offset + "]");
       System.arraycopy(leafBlockStartValues.get(offset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
-    } else if (count > 1) {
-      // Internal index node: binary partition of count
-      int countAtLevel = 1;
-      int totalCount = 0;
-      while (true) {
-        int countLeft = count - totalCount;
-        //System.out.println("    cycle countLeft=" + countLeft + " coutAtLevel=" + countAtLevel);
-        if (countLeft <= countAtLevel) {
-          // This is the last level, possibly partially filled:
-          int lastLeftCount = Math.min(countAtLevel/2, countLeft);
-          assert lastLeftCount >= 0;
-          int leftHalf = (totalCount-1)/2 + lastLeftCount;
-
-          int rootOffset = offset + leftHalf;
-          /*
-          System.out.println("  last left count " + lastLeftCount);
-          System.out.println("  leftHalf " + leftHalf + " rightHalf=" + (count-leftHalf-1));
-          System.out.println("  rootOffset=" + rootOffset);
-          */
-
-          System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
-          //System.out.println("  index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
-
-          // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
-          // under here, to save this while loop on each recursion
-
-          // Recurse left
-          rotateToTree(2*nodeID, offset, leftHalf, index, leafBlockStartValues);
-
-          // Recurse right
-          rotateToTree(2*nodeID+1, rootOffset+1, count-leftHalf-1, index, leafBlockStartValues);
-          return;
-        }
-        totalCount += countAtLevel;
-        countAtLevel *= 2;
-      }
+    } else if (numNodes > 1) {
+      // Internal index node
+      // numNodes + 1 is the number of leaves
+      // -1 because there is one less inner node
+      int leftHalf = getNumLeftLeafNodes(numNodes + 1) - 1;
+      int rootOffset = offset + leftHalf;
+
+      System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
+
+      // Recurse left
+      rotateToTree(2*nodeID, offset, leftHalf, index, leafBlockStartValues);
+      // Recurse right
+      rotateToTree(2*nodeID+1, rootOffset+1, numNodes-leftHalf-1, index, leafBlockStartValues);
     } else {
-      assert count == 0;
+      assert numNodes == 0;
     }
   }
 
+  private int getNumLeftLeafNodes(int numLeaves) {
+    assert numLeaves > 1: "getNumLeftLeaveNodes() called with " + numLeaves;
+    // return the level that can be filled with this number of leaves
+    int lastFullLevel = 31 - Integer.numberOfLeadingZeros(numLeaves);
+    // how many leaf nodes are in the full level
+    int leavesFullLevel = 1 << lastFullLevel;
+    // half of the leaf nodes from the full level goes to the left
+    int numLeftLeafNodes = leavesFullLevel / 2;
+    // leaf nodes that do not fit in the full level
+    int unbalancedLeafNodes = numLeaves - leavesFullLevel;
+    // distribute unbalanced leaf nodes
+    numLeftLeafNodes += Math.min(unbalancedLeafNodes, numLeftLeafNodes);
+    // we should always place unbalanced leaf nodes on the left
+    assert numLeftLeafNodes >= numLeaves - numLeftLeafNodes && numLeftLeafNodes <= 2L * (numLeaves - numLeftLeafNodes);
+    return numLeftLeafNodes;
+  }
+
   // TODO: if we fixed each partition step to just record the file offset at the "split point", we could probably handle variable length
   // encoding and not have our own ByteSequencesReader/Writer
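
The new getNumLeftLeafNodes helper above decides how many of the leaves go to the left subtree: it takes half of the last completely filled level and then assigns any leftover (bottom-level) leaves to the left, so the left side is always at least as full as the right. A worked trace for numLeaves = 5 (illustrative only):

    // numLeaves = 5
    //   lastFullLevel       = 31 - Integer.numberOfLeadingZeros(5) = 2
    //   leavesFullLevel     = 1 << 2 = 4      (leaves that fit on the last full level)
    //   numLeftLeafNodes    = 4 / 2  = 2      (half of that level goes left)
    //   unbalancedLeafNodes = 5 - 4  = 1      (leaves spilling onto the bottom level)
    //   numLeftLeafNodes   += Math.min(1, 2)  -> 3
    // result: 3 leaves on the left, 2 on the right, so leaf blocks
    // straddle the two bottom levels of the tree.

rotateToTree uses the same helper (passing numNodes + 1 as the leaf count) to pick the split value for each internal index node, replacing the old level-counting loop.
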
 
@@ -765,16 +750,7 @@ public class BKDWriter implements Closeable {
     tempInput = null;
     pointWriter = null;
 
-
-    long countPerLeaf = pointCount;
-    long innerNodeCount = 1;
-
-    while (countPerLeaf > maxPointsInLeafNode) {
-      countPerLeaf = (countPerLeaf+1)/2;
-      innerNodeCount *= 2;
-    }
-
-    int numLeaves = (int) innerNodeCount;
+    final int numLeaves = Math.toIntExact((pointCount + maxPointsInLeafNode - 1) / maxPointsInLeafNode);
 
     checkMaxLeafNodeCount(numLeaves);
 
@@ -797,7 +773,7 @@ public class BKDWriter implements Closeable {
     try {
 
       final int[] parentSplits = new int[numIndexDims];
-      build(1, numLeaves, points,
+      build(1, 0, numLeaves, points,
              out, radixSelector,
             minPackedValue.clone(), maxPackedValue.clone(),
             parentSplits,
@@ -822,39 +798,27 @@ public class BKDWriter implements Closeable {
 
     // Write index:
     long indexFP = out.getFilePointer();
-    writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
+    writeIndex(out, maxPointsInLeafNode, leafBlockFPs, splitPackedValues);
     return indexFP;
   }
 
-  /** Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure. */
+  /** Packs the two arrays, representing a semi-balanced binary tree, into a compact byte[] structure. */
   private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
-
     int numLeaves = leafBlockFPs.length;
-
-    // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
-    // if it was created by OneDimensionBKDWriter).  In this case the leaf nodes may straddle the two bottom
+    // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree.
+    // In this case the leaf nodes may straddle the two bottom
     // levels of the binary tree:
-    if (numIndexDims == 1 && numLeaves > 1) {
-      int levelCount = 2;
-      while (true) {
-        if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
-          int lastLevel = 2*(numLeaves - levelCount);
-          assert lastLevel >= 0;
-          if (lastLevel != 0) {
-            // Last level is partially filled, so we must rotate the leaf FPs to match.  We do this here, after loading
-            // at read-time, so that we can still delta code them on disk at write:
-            long[] newLeafBlockFPs = new long[numLeaves];
-            System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
-            System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
-            leafBlockFPs = newLeafBlockFPs;
-          }
-          break;
-        }
-
-        levelCount *= 2;
-      }
+    int lastFullLevel = 31 - Integer.numberOfLeadingZeros(numLeaves);
+    int leavesFullLevel = 1 << lastFullLevel;
+    int leavesPartialLevel = 2 * (numLeaves - leavesFullLevel);
+    if (leavesPartialLevel != 0) {
+      // Last level is partially filled, so we must rotate the leaf FPs to match.  We do this here, after loading
+      // at read-time, so that we can still delta code them on disk at write:
+      long[] newLeafBlockFPs = new long[numLeaves];
+      System.arraycopy(leafBlockFPs, leavesPartialLevel, newLeafBlockFPs, 0, numLeaves - leavesPartialLevel);
+      System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, numLeaves - leavesPartialLevel, leavesPartialLevel);
+      leafBlockFPs = newLeafBlockFPs;
     }
-
     /** Reused while packing the index */
     ByteBuffersDataOutput writeBuffer = ByteBuffersDataOutput.newResettableInstance();
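
packIndex now computes the rotation point directly from the last full level instead of searching level by level. When the bottom level is only partially filled, the leaf file pointers (written in value order) are rotated so they line up with the heap-style leaf numbering, under which the deepest leaves hold the lowest values. A worked trace for five leaves (illustrative only):

    // leafBlockFPs in value order = [f0, f1, f2, f3, f4], numLeaves = 5
    //   lastFullLevel      = 31 - Integer.numberOfLeadingZeros(5) = 2
    //   leavesFullLevel    = 1 << 2 = 4
    //   leavesPartialLevel = 2 * (5 - 4) = 2      (leaves on the deepest level)
    // rotate by leavesPartialLevel:
    //   newLeafBlockFPs    = [f2, f3, f4, f0, f1]
    // f0 and f1 now sit in the slots of the two deepest leaf nodes.
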
 
@@ -1319,7 +1283,7 @@ public class BKDWriter implements Closeable {
 
   /* Recursively reorders the provided reader and writes the bkd-tree on the fly; this method is used
    * when we are writing a new segment directly from IndexWriter's indexing buffer (MutablePointsReader). */
-  private void build(int nodeID, int leafNodeOffset,
+  private void build(int nodeID, int leavesOffset, int numLeaves,
                      MutablePointValues reader, int from, int to,
                      IndexOutput out,
                      byte[] minPackedValue, byte[] maxPackedValue,
@@ -1328,7 +1292,7 @@ public class BKDWriter implements Closeable {
                      long[] leafBlockFPs,
                      int[] spareDocIds) throws IOException {
 
-    if (nodeID >= leafNodeOffset) {
+    if (numLeaves == 1) {
       // leaf node
       final int count = to - from;
       assert count <= maxPointsInLeafNode;
@@ -1402,7 +1366,7 @@ public class BKDWriter implements Closeable {
         }
       }
       // Save the block file pointer:
-      leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
+      leafBlockFPs[leavesOffset] = out.getFilePointer();
 
       assert scratchOut.size() == 0;
 
@@ -1449,7 +1413,10 @@ public class BKDWriter implements Closeable {
         splitDim = split(minPackedValue, maxPackedValue, parentSplits);
       }
 
-      final int mid = (from + to + 1) >>> 1;
+      // How many leaves will be in the left tree:
+      int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
+      // How many points will be in the left tree:
+      final int mid = from + numLeftLeafNodes * maxPointsInLeafNode;
 
       int commonPrefixLen = Arrays.mismatch(minPackedValue, splitDim * bytesPerDim,
           splitDim * bytesPerDim + bytesPerDim, maxPackedValue, splitDim * bytesPerDim,
@@ -1476,10 +1443,10 @@ public class BKDWriter implements Closeable {
 
       // recurse
       parentSplits[splitDim]++;
-      build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
+      build(nodeID * 2, leavesOffset, numLeftLeafNodes, reader, from, mid, out,
           minPackedValue, maxSplitPackedValue, parentSplits,
           splitPackedValues, leafBlockFPs, spareDocIds);
-      build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out,
+      build(nodeID * 2 + 1, leavesOffset + numLeftLeafNodes, numLeaves - numLeftLeafNodes, reader, mid, to, out,
           minSplitPackedValue, maxPackedValue, parentSplits,
           splitPackedValues, leafBlockFPs, spareDocIds);
       parentSplits[splitDim]--;
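
With full leaves, the split point for an inner node is no longer the midpoint of the range; it is chosen so that the left subtree receives exactly numLeftLeafNodes completely filled leaves. Continuing the 1,300-point example from above (illustrative numbers):

    // from = 0, to = 1300, maxPointsInLeafNode = 512
    //   numLeaves        = 3
    //   numLeftLeafNodes = getNumLeftLeafNodes(3) = 2
    //   mid              = from + 2 * 512 = 1024
    // left subtree : points [0, 1024)    -> two full leaves of 512
    // right subtree: points [1024, 1300) -> one leaf of 276

The multi-dimension merge path further below applies the same idea through leftCount = numLeftLeafNodes * maxPointsInLeafNode.
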
@@ -1512,7 +1479,7 @@ public class BKDWriter implements Closeable {
 
   /** The point writer contains the data that is going to be splitted using radix selection.
   /*  This method is used when we are merging previously written segments, in the numDims > 1 case. */
-  private void build(int nodeID, int leafNodeOffset,
+  private void build(int nodeID, int leavesOffset, int numLeaves,
                      BKDRadixSelector.PathSlice points,
                      IndexOutput out,
                      BKDRadixSelector radixSelector,
@@ -1522,7 +1489,7 @@ public class BKDWriter implements Closeable {
                      long[] leafBlockFPs,
                      int[] spareDocIds) throws IOException {
 
-    if (nodeID >= leafNodeOffset) {
+    if (numLeaves == 1) {
 
       // Leaf node: write block
       // We can write the block in any order so by default we write it sorted by the dimension that has the
@@ -1573,13 +1540,13 @@ public class BKDWriter implements Closeable {
       int leafCardinality = heapSource.computeCardinality(from ,to, numDataDims, bytesPerDim, commonPrefixLengths);
 
       // Save the block file pointer:
-      leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
+      leafBlockFPs[leavesOffset] = out.getFilePointer();
       //System.out.println("  write leaf block @ fp=" + out.getFilePointer());
 
       // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
       // loading the values:
       int count = to - from;
-      assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
+      assert count > 0: "nodeID=" + nodeID + " leavesOffset=" + leavesOffset;
       assert count <= spareDocIds.length : "count=" + count + " > length=" + spareDocIds.length;
       // Write doc IDs
       int[] docIDs = spareDocIds;
@@ -1630,9 +1597,10 @@ public class BKDWriter implements Closeable {
 
       assert nodeID < splitPackedValues.length : "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;
 
+      // How many leaves will be in the left tree:
+      int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
       // How many points will be in the left tree:
-      long rightCount = points.count / 2;
-      long leftCount = points.count - rightCount;
+      final long leftCount = numLeftLeafNodes * maxPointsInLeafNode;
 
       BKDRadixSelector.PathSlice[] slices = new BKDRadixSelector.PathSlice[2];
 
@@ -1660,12 +1628,12 @@ public class BKDWriter implements Closeable {
 
       parentSplits[splitDim]++;
       // Recurse on left tree:
-      build(2 * nodeID, leafNodeOffset, slices[0],
+      build(2 * nodeID, leavesOffset, numLeftLeafNodes, slices[0],
           out, radixSelector, minPackedValue, maxSplitPackedValue,
           parentSplits, splitPackedValues, leafBlockFPs, spareDocIds);
 
       // Recurse on right tree:
-      build(2 * nodeID + 1, leafNodeOffset, slices[1],
+      build(2 * nodeID + 1, leavesOffset + numLeftLeafNodes, numLeaves - numLeftLeafNodes, slices[1],
           out, radixSelector, minSplitPackedValue, maxPackedValue
           , parentSplits, splitPackedValues, leafBlockFPs, spareDocIds);
 
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
index 4487ed0..f44a0d3 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
@@ -239,12 +239,6 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
     final LeafReader lr = getOnlyLeafReader(r);
     PointValues points = lr.getPointValues("f");
 
-    // With >1 dims, the tree is balanced
-    long actualMaxPointsInLeafNode = points.size();
-    while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
-      actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
-    }
-
     IntersectVisitor allPointsVisitor = new IntersectVisitor() {
       @Override
       public void visit(int docID, byte[] packedValue) throws IOException {}
@@ -259,9 +253,9 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
     };
 
     // If all points match, then the point count is numLeaves * maxPointsInLeafNode
-    final int numLeaves = (int) Math.max(Long.highestOneBit( ((points.size() - 1) / actualMaxPointsInLeafNode)) << 1, 1);
+    final int numLeaves = (int) Math.ceil((double) points.size() / maxPointsInLeafNode);
 
-    assertEquals(numLeaves * actualMaxPointsInLeafNode, points.estimatePointCount(allPointsVisitor));
+    assertEquals(numLeaves * maxPointsInLeafNode, points.estimatePointCount(allPointsVisitor));
     assertEquals(numDocs, points.estimateDocCount(allPointsVisitor));
 
     IntersectVisitor noPointsVisitor = new IntersectVisitor() {
@@ -302,7 +296,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
 
     final long pointCount = points.estimatePointCount(onePointMatchVisitor);
     // The number of matches needs to be multiple of count per leaf
-    final long countPerLeaf = (actualMaxPointsInLeafNode + 1) / 2;
+    final long countPerLeaf = (maxPointsInLeafNode + 1) / 2;
     assertTrue(""+pointCount, pointCount % countPerLeaf == 0);
     // in extreme cases, a point can be be shared by 4 leaves
     assertTrue(""+pointCount, pointCount / countPerLeaf <= 4 && pointCount / countPerLeaf >= 1);
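
Since every leaf except possibly the last is now full, the test derives the expected leaf count with a plain ceiling division and no longer reconstructs an "actual" per-leaf count. Note that numLeaves * maxPointsInLeafNode can slightly exceed points.size(), because the estimate counts each matching leaf as if it were full. Hypothetical numbers for illustration (not from the test):

    // points.size() = 10_000, maxPointsInLeafNode = 512
    //   numLeaves = (int) Math.ceil(10_000 / 512.0) = 20
    //   expected estimatePointCount(allPointsVisitor) = 20 * 512 = 10_240  (>= 10_000)
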
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index 38d91f2..dc94c1b 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -1308,14 +1308,12 @@ public class TestBKD extends LuceneTestCase {
     pointsIn.seek(indexFP);
     BKDReader points = new BKDReader(pointsIn);
 
-    int actualMaxPointsInLeafNode = numValues;
-    while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
-      actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
-    }
-
     // If all points match, then the point count is numLeaves * maxPointsInLeafNode
-    final int numLeaves = Integer.highestOneBit((numValues - 1) / actualMaxPointsInLeafNode) << 1;
-    assertEquals(numLeaves * actualMaxPointsInLeafNode,
+    int numLeaves = numValues / maxPointsInLeafNode;
+    if (numValues % maxPointsInLeafNode != 0) {
+      numLeaves++;
+    }
+    assertEquals(numLeaves * maxPointsInLeafNode,
         points.estimatePointCount(new IntersectVisitor() {
           @Override
           public void visit(int docID, byte[] packedValue) throws IOException {}
@@ -1363,8 +1361,8 @@ public class TestBKD extends LuceneTestCase {
       }
     });
     assertTrue(""+pointCount,
-        pointCount == (actualMaxPointsInLeafNode + 1) / 2 || // common case
-        pointCount == 2*((actualMaxPointsInLeafNode + 1) / 2)); // if the point is a split value
+        pointCount == (maxPointsInLeafNode + 1) / 2 || // common case
+        pointCount == 2*((maxPointsInLeafNode + 1) / 2)); // if the point is a split value
 
     pointsIn.close();
     dir.close();