You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2017/01/19 15:32:28 UTC
[19/38] lucene-solr:jira/solr-9857: LUCENE-7640: Speed up PointValues#estimatePointCount with Relation.CELL_INSIDE_QUERY.

LUCENE-7640: Speed up PointValues#estimatePointCount with Relation.CELL_INSIDE_QUERY.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/71aa463d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/71aa463d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/71aa463d

Branch: refs/heads/jira/solr-9857
Commit: 71aa463d4bbdfc03efb11b52ed2c4ce51d49bfb3
Parents: 3404677
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Jan 18 15:07:06 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Jan 18 15:07:06 2017 +0100

----------------------------------------------------------------------
 .../org/apache/lucene/util/bkd/BKDReader.java   |  45 ++++-
 .../lucene60/TestLucene60PointsFormat.java      | 192 ++++++++++++++++++-
 .../org/apache/lucene/util/bkd/TestBKD.java     |  90 +++++++++
 3 files changed, 319 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71aa463d/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 4089d82..e120435 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -223,6 +223,41 @@ public final class BKDReader extends PointValues implements Accountable {
     
     /** Only valid after pushLeft or pushRight, not pop! */
     public abstract long getLeafBlockFP();
+
+    /** Return the number of leaves below the current node. */
+    public int getNumLeaves() {
+      int leftMostLeafNode = nodeID;
+      while (leftMostLeafNode < leafNodeOffset) {
+        leftMostLeafNode = leftMostLeafNode * 2;
+      }
+      int rightMostLeafNode = nodeID;
+      while (rightMostLeafNode < leafNodeOffset) {
+        rightMostLeafNode = rightMostLeafNode * 2 + 1;
+      }
+      final int numLeaves;
+      if (rightMostLeafNode >= leftMostLeafNode) {
+        // both are on the same level
+        numLeaves = rightMostLeafNode - leftMostLeafNode + 1;
+      } else {
+        // left is one level deeper than right
+        numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
+      }
+      assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
+      return numLeaves;
+    }
+
+    // for assertions
+    private int getNumLeavesSlow(int node) {
+      if (node >= 2 * leafNodeOffset) {
+        return 0;
+      } else if (node >= leafNodeOffset) {
+        return 1;
+      } else {
+        final int leftCount = getNumLeavesSlow(node * 2);
+        final int rightCount = getNumLeavesSlow(node * 2 + 1);
+        return leftCount + rightCount;
+      }
+    }
   }
 
   /** Reads the original simple yet heap-heavy index format */
@@ -716,13 +751,11 @@ public final class BKDReader extends PointValues implements Accountable {
     if (r == Relation.CELL_OUTSIDE_QUERY) {
       // This cell is fully outside of the query shape: stop recursing
       return 0L;
+    } else if (r == Relation.CELL_INSIDE_QUERY) {
+      return (long) maxPointsInLeafNode * state.index.getNumLeaves();
     } else if (state.index.isLeafNode()) {
-      if (r == Relation.CELL_INSIDE_QUERY) {
-        return maxPointsInLeafNode;
-      } else {
-        // Assume half the points matched
-        return (maxPointsInLeafNode + 1) / 2;
-      }
+      // Assume half the points matched
+      return (maxPointsInLeafNode + 1) / 2;
     } else {
       
       // Non-leaf node: recurse on the split left and right nodes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71aa463d/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
index afa8ec4..3a08bfa 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
@@ -18,29 +18,43 @@ package org.apache.lucene.codecs.lucene60;
 
 
 import java.io.IOException;
+import java.util.Arrays;
 
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.FilterCodec;
 import org.apache.lucene.codecs.PointsFormat;
 import org.apache.lucene.codecs.PointsReader;
 import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.document.BinaryPoint;
+import org.apache.lucene.document.Document;
 import org.apache.lucene.index.BasePointsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;
 
 /**
  * Tests Lucene60PointsFormat
  */
 public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
   private final Codec codec;
+  private final int maxPointsInLeafNode;
   
   public TestLucene60PointsFormat() {
     // standard issue
     Codec defaultCodec = TestUtil.getDefaultCodec();
     if (random().nextBoolean()) {
       // randomize parameters
-      int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
+      maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
       double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
       if (VERBOSE) {
         System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
@@ -66,6 +80,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
     } else {
       // standard issue
       codec = defaultCodec;
+      maxPointsInLeafNode = BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
     }
   }
 
@@ -79,5 +94,178 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
     assumeFalse("TODO: mess with the parameters and test gets angry!", codec instanceof FilterCodec);
     super.testMergeStability();
   }
-  
+
+  public void testEstimatePointCount() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    byte[] pointValue = new byte[3];
+    byte[] uniquePointValue = new byte[3];
+    random().nextBytes(uniquePointValue);
+    final int numDocs = atLeast(10000); // make sure we have several leaves
+    for (int i = 0; i < numDocs; ++i) {
+      Document doc = new Document();
+      if (i == numDocs / 2) {
+        doc.add(new BinaryPoint("f", uniquePointValue));
+      } else {
+        do {
+          random().nextBytes(pointValue);
+        } while (Arrays.equals(pointValue, uniquePointValue));
+        doc.add(new BinaryPoint("f", pointValue));
+      }
+      w.addDocument(doc);
+    }
+    w.forceMerge(1);
+    final IndexReader r = DirectoryReader.open(w);
+    w.close();
+    final LeafReader lr = getOnlyLeafReader(r);
+    PointValues points = lr.getPointValues("f");
+
+    // If all points match, then the point count is numLeaves * maxPointsInLeafNode
+    final int numLeaves = (int) Math.ceil((double) numDocs / maxPointsInLeafNode);
+    assertEquals(numLeaves * maxPointsInLeafNode,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            return Relation.CELL_INSIDE_QUERY;
+          }
+        }));
+
+    // Return 0 if no points match
+    assertEquals(0,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            return Relation.CELL_OUTSIDE_QUERY;
+          }
+        }));
+
+    // If only one point matches, then the point count is (maxPointsInLeafNode + 1) / 2
+    assertEquals((maxPointsInLeafNode + 1) / 2,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            if (StringHelper.compare(3, uniquePointValue, 0, maxPackedValue, 0) > 0 ||
+                StringHelper.compare(3, uniquePointValue, 0, minPackedValue, 0) < 0) {
+              return Relation.CELL_OUTSIDE_QUERY;
+            }
+            return Relation.CELL_CROSSES_QUERY;
+          }
+        }));
+
+    r.close();
+    dir.close();
+  }
+
+  // The tree is always balanced in the N dims case, and leaves are
+  // not all full so things are a bit different
+  public void testEstimatePointCount2Dims() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    byte[][] pointValue = new byte[2][];
+    pointValue[0] = new byte[3];
+    pointValue[1] = new byte[3];
+    byte[][] uniquePointValue = new byte[2][];
+    uniquePointValue[0] = new byte[3];
+    uniquePointValue[1] = new byte[3];
+    random().nextBytes(uniquePointValue[0]);
+    random().nextBytes(uniquePointValue[1]);
+    final int numDocs = atLeast(10000); // make sure we have several leaves
+    for (int i = 0; i < numDocs; ++i) {
+      Document doc = new Document();
+      if (i == numDocs / 2) {
+        doc.add(new BinaryPoint("f", uniquePointValue));
+      } else {
+        do {
+          random().nextBytes(pointValue[0]);
+          random().nextBytes(pointValue[1]);
+        } while (Arrays.equals(pointValue[0], uniquePointValue[0]) || Arrays.equals(pointValue[1], uniquePointValue[1]));
+        doc.add(new BinaryPoint("f", pointValue));
+      }
+      w.addDocument(doc);
+    }
+    w.forceMerge(1);
+    final IndexReader r = DirectoryReader.open(w);
+    w.close();
+    final LeafReader lr = getOnlyLeafReader(r);
+    PointValues points = lr.getPointValues("f");
+
+    // With >1 dims, the tree is balanced
+    int actualMaxPointsInLeafNode = numDocs;
+    while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
+      actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
+    }
+
+    // If all points match, then the point count is numLeaves * maxPointsInLeafNode
+    final int numLeaves = Integer.highestOneBit((numDocs - 1) / actualMaxPointsInLeafNode) << 1;
+    assertEquals(numLeaves * actualMaxPointsInLeafNode,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            return Relation.CELL_INSIDE_QUERY;
+          }
+        }));
+
+    // Return 0 if no points match
+    assertEquals(0,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            return Relation.CELL_OUTSIDE_QUERY;
+          }
+        }));
+
+    // If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2
+    assertEquals((actualMaxPointsInLeafNode + 1) / 2,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            for (int dim = 0; dim < 2; ++dim) {
+              if (StringHelper.compare(3, uniquePointValue[0], 0, maxPackedValue, dim * 3) > 0 ||
+                  StringHelper.compare(3, uniquePointValue[0], 0, minPackedValue, dim * 3) < 0) {
+                return Relation.CELL_OUTSIDE_QUERY;
+              }
+            }
+            return Relation.CELL_CROSSES_QUERY;
+          }
+        }));
+
+    r.close();
+    dir.close();
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71aa463d/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index c5c5c1f..f01f058 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -1104,4 +1104,94 @@ public class TestBKD extends LuceneTestCase {
     in.close();
     dir.close();
   }
+
+  public void testEstimatePointCount() throws IOException {
+    Directory dir = newDirectory();
+    final int numValues = atLeast(10000); // make sure to have multiple leaves
+    final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
+    final int numBytesPerDim = TestUtil.nextInt(random(), 1, 4);
+    final byte[] pointValue = new byte[numBytesPerDim];
+    final byte[] uniquePointValue = new byte[numBytesPerDim];
+    random().nextBytes(uniquePointValue);
+
+    BKDWriter w = new BKDWriter(numValues, dir, "_temp", 1, numBytesPerDim, maxPointsInLeafNode,
+        BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, numValues, true);
+    for (int i = 0; i < numValues; ++i) {
+      if (i == numValues / 2) {
+        w.add(uniquePointValue, i);
+      } else {
+        do {
+          random().nextBytes(pointValue);
+        } while (Arrays.equals(pointValue, uniquePointValue));
+        w.add(pointValue, i);
+      }
+    }
+    final long indexFP;
+    try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+      indexFP = w.finish(out);
+      w.close();
+    }
+    
+    IndexInput pointsIn = dir.openInput("bkd", IOContext.DEFAULT);
+    pointsIn.seek(indexFP);
+    BKDReader points = new BKDReader(pointsIn);
+
+    int actualMaxPointsInLeafNode = numValues;
+    while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
+      actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
+    }
+
+    // If all points match, then the point count is numLeaves * maxPointsInLeafNode
+    final int numLeaves = Integer.highestOneBit((numValues - 1) / actualMaxPointsInLeafNode) << 1;
+    assertEquals(numLeaves * actualMaxPointsInLeafNode,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            return Relation.CELL_INSIDE_QUERY;
+          }
+        }));
+
+    // Return 0 if no points match
+    assertEquals(0,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            return Relation.CELL_OUTSIDE_QUERY;
+          }
+        }));
+
+    // If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2
+    assertEquals((actualMaxPointsInLeafNode + 1) / 2,
+        points.estimatePointCount(new IntersectVisitor() {
+          @Override
+          public void visit(int docID, byte[] packedValue) throws IOException {}
+          
+          @Override
+          public void visit(int docID) throws IOException {}
+          
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            if (StringHelper.compare(3, uniquePointValue, 0, maxPackedValue, 0) > 0 ||
+                StringHelper.compare(3, uniquePointValue, 0, minPackedValue, 0) < 0) {
+              return Relation.CELL_OUTSIDE_QUERY;
+            }
+            return Relation.CELL_CROSSES_QUERY;
+          }
+        }));
+
+    pointsIn.close();
+    dir.close();
+  }
 }