You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by iv...@apache.org on 2022/01/20 06:50:36 UTC

[lucene] branch branch_9x updated: LUCENE-10288: Check BKD tree shape for lucene pre-8.6 1D indexes (#607)

This is an automated email from the ASF dual-hosted git repository.

ivera pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 73a3db9  LUCENE-10288: Check BKD tree shape for lucene pre-8.6 1D indexes (#607)
73a3db9 is described below

commit 73a3db90dd7e008f39f4af4486bf634baafde6bc
Author: Ignacio Vera <iv...@apache.org>
AuthorDate: Thu Jan 20 07:49:29 2022 +0100

    LUCENE-10288: Check BKD tree shape for lucene pre-8.6 1D indexes (#607)
    
    Adds efficient logic to compute if a tree is balanced or unbalanced for indexes
    created before Lucene 8.6
---
 .../java/org/apache/lucene/util/bkd/BKDReader.java | 70 +++++++++++++++++++---
 1 file changed, 63 insertions(+), 7 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index ff2584f..ac55f33 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -44,6 +44,8 @@ public class BKDReader extends PointValues {
   final long minLeafBlockFP;
 
   final IndexInput packedIndex;
+  // if true, the tree is a legacy balanced tree
+  private final boolean isTreeBalanced;
 
   /**
    * Caller must pre-seek the provided {@link IndexInput} to the index location that {@link
@@ -105,6 +107,52 @@ public class BKDReader extends PointValues {
     }
     this.packedIndex = indexIn.slice("packedIndex", indexStartPointer, numIndexBytes);
     this.in = dataIn;
+    // for only one leaf, balanced and unbalanced trees can be handled the same way
+    // we set it to unbalanced.
+    this.isTreeBalanced = numLeaves != 1 && isTreeBalanced();
+  }
+
+  private boolean isTreeBalanced() throws IOException {
+    if (version >= BKDWriter.VERSION_META_FILE) {
+      // since lucene 8.6 all trees are unbalanced.
+      return false;
+    }
+    if (config.numDims > 1) {
+      // high dimensional tree in pre-8.6 indices are balanced.
+      assert 1 << MathUtil.log(numLeaves, 2) == numLeaves;
+      return true;
+    }
+    if (1 << MathUtil.log(numLeaves, 2) != numLeaves) {
+      // if we don't have enough leaves to fill the last level then it is unbalanced
+      return false;
+    }
+    // count of the last node for unbalanced trees
+    final int lastLeafNodePointCount = Math.toIntExact(pointCount % config.maxPointsInLeafNode);
+    // navigate to last node
+    PointTree pointTree = getPointTree();
+    do {
+      while (pointTree.moveToSibling()) {}
+    } while (pointTree.moveToChild());
+    // count number of docs in the node
+    final int[] count = new int[] {0};
+    pointTree.visitDocIDs(
+        new IntersectVisitor() {
+          @Override
+          public void visit(int docID) {
+            count[0]++;
+          }
+
+          @Override
+          public void visit(int docID, byte[] packedValue) {
+            throw new AssertionError();
+          }
+
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            throw new AssertionError();
+          }
+        });
+    return count[0] != lastLeafNodePointCount;
   }
 
   @Override
@@ -117,7 +165,8 @@ public class BKDReader extends PointValues {
         version,
         pointCount,
         minPackedValue,
-        maxPackedValue);
+        maxPackedValue,
+        isTreeBalanced);
   }
 
   private static class BKDPointTree implements PointTree {
@@ -168,6 +217,8 @@ public class BKDReader extends PointValues {
         scratchMaxIndexPackedValue;
     private final int[] commonPrefixLengths;
     private final BKDReaderDocIDSetIterator scratchIterator;
+    // if true the tree is balanced, otherwise unbalanced
+    private final boolean isTreeBalanced;
 
     private BKDPointTree(
         IndexInput innerNodes,
@@ -177,7 +228,8 @@ public class BKDReader extends PointValues {
         int version,
         long pointCount,
         byte[] minPackedValue,
-        byte[] maxPackedValue)
+        byte[] maxPackedValue,
+        boolean isTreeBalanced)
         throws IOException {
       this(
           innerNodes,
@@ -194,7 +246,8 @@ public class BKDReader extends PointValues {
           new byte[config.packedBytesLength],
           new byte[config.packedIndexBytesLength],
           new byte[config.packedIndexBytesLength],
-          new int[config.numDims]);
+          new int[config.numDims],
+          isTreeBalanced);
       // read root node
       readNodeData(false);
     }
@@ -214,12 +267,14 @@ public class BKDReader extends PointValues {
         byte[] scratchDataPackedValue,
         byte[] scratchMinIndexPackedValue,
         byte[] scratchMaxIndexPackedValue,
-        int[] commonPrefixLengths) {
+        int[] commonPrefixLengths,
+        boolean isTreeBalanced) {
       this.config = config;
       this.version = version;
       this.nodeID = nodeID;
       this.nodeRoot = nodeID;
       this.level = level;
+      this.isTreeBalanced = isTreeBalanced;
       leafNodeOffset = numLeaves;
       this.innerNodes = innerNodes;
       this.leafNodes = leafNodes;
@@ -268,7 +323,8 @@ public class BKDReader extends PointValues {
               scratchDataPackedValue,
               scratchMinIndexPackedValue,
               scratchMaxIndexPackedValue,
-              commonPrefixLengths);
+              commonPrefixLengths,
+              isTreeBalanced);
       index.leafBlockFPStack[index.level] = leafBlockFPStack[level];
       if (isLeafNode() == false) {
         // copy node data
@@ -452,8 +508,8 @@ public class BKDReader extends PointValues {
         numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
       }
       assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
-      if (version < BKDWriter.VERSION_META_FILE && config.numDims > 1) {
-        // before lucene 8.6, high dimensional trees were constructed as fully balanced trees.
+      if (isTreeBalanced) {
+        // before lucene 8.6, trees might have been constructed as fully balanced trees.
         return sizeFromBalancedTree(leftMostLeafNode, rightMostLeafNode);
       }
       // size for an unbalanced tree.