You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@lucene.apache.org by GitBox <gi...@apache.org> on 2021/04/15 10:04:22 UTC
[GitHub] [lucene] jpountz commented on a change in pull request #7: LUCENE-9820: Separate logic for reading the BKD index from logic to intersecting it

jpountz commented on a change in pull request #7:
URL: https://github.com/apache/lucene/pull/7#discussion_r613927061



##########
File path: lucene/core/src/java/org/apache/lucene/util/bkd/BKDDefaultReader.java
##########
@@ -0,0 +1,899 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Arrays;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.MathUtil;
+
+/**
+ * Handles reading a block KD-tree previously written with {@link BKDWriter}.
+ *
+ * @lucene.experimental
+ */
+public class BKDDefaultReader implements BKDReader {
+
+  final BKDConfig config;
+  final int numLeaves;
+  // Packed array of byte[] holding all docs and values:
+  final IndexInput in;
+  final byte[] minPackedValue;
+  final byte[] maxPackedValue;
+  final long pointCount;
+  final int docCount;
+  final int version;
+  final long minLeafBlockFP;
+  // Packed array of byte[] holding all split values in the full binary tree:
+  private final IndexInput packedIndex;
+
+  /**
+   * Caller must pre-seek the provided {@link IndexInput} to the index location that {@link
+   * BKDWriter#finish} returned. BKD tree is always stored off-heap.
+   */
+  public BKDDefaultReader(IndexInput metaIn, IndexInput indexIn, IndexInput dataIn)
+      throws IOException {
+    version =
+        CodecUtil.checkHeader(
+            metaIn, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
+    final int numDims = metaIn.readVInt();
+    final int numIndexDims;
+    if (version >= BKDWriter.VERSION_SELECTIVE_INDEXING) {
+      numIndexDims = metaIn.readVInt();
+    } else {
+      numIndexDims = numDims;
+    }
+    final int maxPointsInLeafNode = metaIn.readVInt();
+    final int bytesPerDim = metaIn.readVInt();
+    config = new BKDConfig(numDims, numIndexDims, bytesPerDim, maxPointsInLeafNode);
+
+    // Read index:
+    numLeaves = metaIn.readVInt();
+    assert numLeaves > 0;
+
+    minPackedValue = new byte[config.packedIndexBytesLength];
+    maxPackedValue = new byte[config.packedIndexBytesLength];
+
+    metaIn.readBytes(minPackedValue, 0, config.packedIndexBytesLength);
+    metaIn.readBytes(maxPackedValue, 0, config.packedIndexBytesLength);
+
+    for (int dim = 0; dim < config.numIndexDims; dim++) {
+      if (Arrays.compareUnsigned(
+              minPackedValue,
+              dim * config.bytesPerDim,
+              dim * config.bytesPerDim + config.bytesPerDim,
+              maxPackedValue,
+              dim * config.bytesPerDim,
+              dim * config.bytesPerDim + config.bytesPerDim)
+          > 0) {
+        throw new CorruptIndexException(
+            "minPackedValue "
+                + new BytesRef(minPackedValue)
+                + " is > maxPackedValue "
+                + new BytesRef(maxPackedValue)
+                + " for dim="
+                + dim,
+            metaIn);
+      }
+    }
+
+    pointCount = metaIn.readVLong();
+    docCount = metaIn.readVInt();
+
+    int numIndexBytes = metaIn.readVInt();
+    long indexStartPointer;
+    if (version >= BKDWriter.VERSION_META_FILE) {
+      minLeafBlockFP = metaIn.readLong();
+      indexStartPointer = metaIn.readLong();
+    } else {
+      indexStartPointer = indexIn.getFilePointer();
+      minLeafBlockFP = indexIn.readVLong();
+      indexIn.seek(indexStartPointer);
+    }
+    this.packedIndex = indexIn.slice("packedIndex", indexStartPointer, numIndexBytes);
+    this.in = dataIn;
+  }
+
+  @Override
+  public BKDConfig getConfig() {
+    return config;
+  }
+
+  @Override
+  public byte[] getMinPackedValue() {
+    return minPackedValue;
+  }
+
+  @Override
+  public byte[] getMaxPackedValue() {
+    return maxPackedValue;
+  }

Review comment:
       Should we clone in the above two methods (`getMinPackedValue` and `getMaxPackedValue`) to make sure that the content of these arrays can never be changed by callers?

##########
File path: lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
##########
@@ -22,34 +22,31 @@
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.MathUtil;
 import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.bkd.BKDConfig;
+import org.apache.lucene.util.bkd.BKDPointValues;
 import org.apache.lucene.util.bkd.BKDReader;
 
-/** Forked from {@link BKDReader} and simplified/specialized for SimpleText's usage */
-final class SimpleTextBKDReader extends PointValues implements Accountable {
+/** Forked from {@link BKDPointValues} and simplified/specialized for SimpleText's usage */

Review comment:
       Should it be BKDDefaultReader?
   
   ```suggestion
   /** Forked from {@link BKDDefaultReader} and simplified/specialized for SimpleText's usage */
   ```

##########
File path: lucene/core/src/java/org/apache/lucene/util/bkd/BKDDefaultReader.java
##########
@@ -0,0 +1,899 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Arrays;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.MathUtil;
+
+/**
+ * Handles reading a block KD-tree previously written with {@link BKDWriter}.
+ *
+ * @lucene.experimental
+ */
+public class BKDDefaultReader implements BKDReader {
+
+  final BKDConfig config;
+  final int numLeaves;
+  // Packed array of byte[] holding all docs and values:
+  final IndexInput in;
+  final byte[] minPackedValue;
+  final byte[] maxPackedValue;
+  final long pointCount;
+  final int docCount;
+  final int version;
+  final long minLeafBlockFP;
+  // Packed array of byte[] holding all split values in the full binary tree:
+  private final IndexInput packedIndex;
+
+  /**
+   * Caller must pre-seek the provided {@link IndexInput} to the index location that {@link
+   * BKDWriter#finish} returned. BKD tree is always stored off-heap.
+   */
+  public BKDDefaultReader(IndexInput metaIn, IndexInput indexIn, IndexInput dataIn)
+      throws IOException {
+    version =
+        CodecUtil.checkHeader(
+            metaIn, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
+    final int numDims = metaIn.readVInt();
+    final int numIndexDims;
+    if (version >= BKDWriter.VERSION_SELECTIVE_INDEXING) {
+      numIndexDims = metaIn.readVInt();
+    } else {
+      numIndexDims = numDims;
+    }
+    final int maxPointsInLeafNode = metaIn.readVInt();
+    final int bytesPerDim = metaIn.readVInt();
+    config = new BKDConfig(numDims, numIndexDims, bytesPerDim, maxPointsInLeafNode);
+
+    // Read index:
+    numLeaves = metaIn.readVInt();
+    assert numLeaves > 0;
+
+    minPackedValue = new byte[config.packedIndexBytesLength];
+    maxPackedValue = new byte[config.packedIndexBytesLength];
+
+    metaIn.readBytes(minPackedValue, 0, config.packedIndexBytesLength);
+    metaIn.readBytes(maxPackedValue, 0, config.packedIndexBytesLength);
+
+    for (int dim = 0; dim < config.numIndexDims; dim++) {
+      if (Arrays.compareUnsigned(
+              minPackedValue,
+              dim * config.bytesPerDim,
+              dim * config.bytesPerDim + config.bytesPerDim,
+              maxPackedValue,
+              dim * config.bytesPerDim,
+              dim * config.bytesPerDim + config.bytesPerDim)
+          > 0) {
+        throw new CorruptIndexException(
+            "minPackedValue "
+                + new BytesRef(minPackedValue)
+                + " is > maxPackedValue "
+                + new BytesRef(maxPackedValue)
+                + " for dim="
+                + dim,
+            metaIn);
+      }
+    }
+
+    pointCount = metaIn.readVLong();
+    docCount = metaIn.readVInt();
+
+    int numIndexBytes = metaIn.readVInt();
+    long indexStartPointer;
+    if (version >= BKDWriter.VERSION_META_FILE) {
+      minLeafBlockFP = metaIn.readLong();
+      indexStartPointer = metaIn.readLong();
+    } else {
+      indexStartPointer = indexIn.getFilePointer();
+      minLeafBlockFP = indexIn.readVLong();
+      indexIn.seek(indexStartPointer);
+    }
+    this.packedIndex = indexIn.slice("packedIndex", indexStartPointer, numIndexBytes);
+    this.in = dataIn;
+  }
+
+  @Override
+  public BKDConfig getConfig() {
+    return config;
+  }
+
+  @Override
+  public byte[] getMinPackedValue() {
+    return minPackedValue;
+  }
+
+  @Override
+  public byte[] getMaxPackedValue() {
+    return maxPackedValue;
+  }
+
+  @Override
+  public long getPointCount() {
+    return pointCount;
+  }
+
+  @Override
+  public int getDocCount() {
+    return docCount;
+  }
+
+  @Override
+  public BKDReader.IndexTree getIndexTree() {
+    return new IndexTree(
+        packedIndex.clone(),
+        this.in.clone(),
+        config,
+        numLeaves,
+        version,
+        minPackedValue,
+        maxPackedValue);
+  }
+
+  private static class IndexTree implements BKDReader.IndexTree {
+    private int nodeID;
+    // during clone, the node root can be different to 1
+    private final int nodeRoot;
+    // level is 1-based so that we can do level-1 w/o checking each time:
+    private int level;
+    // used to read the packed tree off-heap
+    private final IndexInput innerNodes;
+    // used to read the packed leaves off-heap
+    private final IndexInput leafNodes;
+    // holds the minimum (left most) leaf block file pointer for each level we've recursed to:
+    private final long[] leafBlockFPStack;
+    // holds the address, in the off-heap index, of the right-node of each level:
+    private final int[] rightNodePositions;
+    // holds the splitDim for each level:
+    private final int[] splitDims;
+    // true if the per-dim delta we read for the node at this level is a negative offset vs. the
+    // last split on this dim; this is a packed
+    // 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim].
+    // this will be true if the last time we
+    // split on this dimension, we next pushed to the left sub-tree:
+    private final boolean[] negativeDeltas;
+    // holds the packed per-level split values
+    private final byte[][] splitValuesStack;
+    // holds the min / max value of the current node.
+    private final byte[] minPackedValue, maxPackedValue;
+    // holds the previous value of the split dimension
+    private final byte[][] splitDimValueStack;
+    // tree parameters
+    private final BKDConfig config;
+    // number of leaves
+    private final int leafNodeOffset;
+    // version of the index
+    private final int version;
+    // helper objects for reading doc values
+    private final byte[] scratchDataPackedValue,
+        scratchMinIndexPackedValue,
+        scratchMaxIndexPackedValue;
+    private final int[] commonPrefixLengths;
+    private final BKDReaderDocIDSetIterator scratchIterator;
+
+    private IndexTree(
+        IndexInput innerNodes,
+        IndexInput leafNodes,
+        BKDConfig config,
+        int numLeaves,
+        int version,
+        byte[] minPackedValue,
+        byte[] maxPackedValue) {
+      this(
+          innerNodes,
+          leafNodes,
+          config,
+          numLeaves,
+          version,
+          1,
+          1,
+          minPackedValue,
+          maxPackedValue,
+          new BKDReaderDocIDSetIterator(config.maxPointsInLeafNode),
+          new byte[config.packedBytesLength],
+          new byte[config.packedIndexBytesLength],
+          new byte[config.packedIndexBytesLength],
+          new int[config.numDims]);
+      // read root node
+      readNodeData(false);
+    }
+
+    private IndexTree(
+        IndexInput innerNodes,
+        IndexInput leafNodes,
+        BKDConfig config,
+        int numLeaves,
+        int version,
+        int nodeID,
+        int level,
+        byte[] minPackedValue,
+        byte[] maxPackedValue,
+        BKDReaderDocIDSetIterator scratchIterator,
+        byte[] scratchDataPackedValue,
+        byte[] scratchMinIndexPackedValue,
+        byte[] scratchMaxIndexPackedValue,
+        int[] commonPrefixLengths) {
+      this.config = config;
+      this.version = version;
+      this.nodeID = nodeID;
+      this.nodeRoot = nodeID;
+      this.level = level;
+      leafNodeOffset = numLeaves;
+      this.innerNodes = innerNodes;
+      this.leafNodes = leafNodes;
+      this.minPackedValue = minPackedValue.clone();
+      this.maxPackedValue = maxPackedValue.clone();
+      // stack arrays that keep information at different levels
+      int treeDepth = getTreeDepth(numLeaves);
+      splitDimValueStack = new byte[treeDepth + 1][];
+      splitValuesStack = new byte[treeDepth + 1][];
+      splitValuesStack[0] = new byte[config.packedIndexBytesLength];
+      leafBlockFPStack = new long[treeDepth + 1];
+      rightNodePositions = new int[treeDepth + 1];
+      splitDims = new int[treeDepth + 1];
+      negativeDeltas = new boolean[config.numIndexDims * (treeDepth + 1)];
+      // scratch objects, reused between clones

Review comment:
       Is it important to reuse these objects? It feels like not reusing them would reduce the potential for bugs without incurring major downsides.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org