You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2017/01/19 15:32:28 UTC
[19/38] lucene-solr:jira/solr-9857: LUCENE-7640: Speed up
PointValues#estimatePointCount with Relation.CELL_INSIDE_QUERY.
LUCENE-7640: Speed up PointValues#estimatePointCount with Relation.CELL_INSIDE_QUERY.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/71aa463d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/71aa463d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/71aa463d
Branch: refs/heads/jira/solr-9857
Commit: 71aa463d4bbdfc03efb11b52ed2c4ce51d49bfb3
Parents: 3404677
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Jan 18 15:07:06 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Jan 18 15:07:06 2017 +0100
----------------------------------------------------------------------
.../org/apache/lucene/util/bkd/BKDReader.java | 45 ++++-
.../lucene60/TestLucene60PointsFormat.java | 192 ++++++++++++++++++-
.../org/apache/lucene/util/bkd/TestBKD.java | 90 +++++++++
3 files changed, 319 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71aa463d/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 4089d82..e120435 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -223,6 +223,41 @@ public final class BKDReader extends PointValues implements Accountable {
/** Only valid after pushLeft or pushRight, not pop! */
public abstract long getLeafBlockFP();
+
+ /** Return the number of leaves below the current node. */
+ public int getNumLeaves() {
+ int leftMostLeafNode = nodeID;
+ while (leftMostLeafNode < leafNodeOffset) {
+ leftMostLeafNode = leftMostLeafNode * 2;
+ }
+ int rightMostLeafNode = nodeID;
+ while (rightMostLeafNode < leafNodeOffset) {
+ rightMostLeafNode = rightMostLeafNode * 2 + 1;
+ }
+ final int numLeaves;
+ if (rightMostLeafNode >= leftMostLeafNode) {
+ // both are on the same level
+ numLeaves = rightMostLeafNode - leftMostLeafNode + 1;
+ } else {
+ // left is one level deeper than right
+ numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
+ }
+ assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
+ return numLeaves;
+ }
+
+ // for assertions
+ private int getNumLeavesSlow(int node) {
+ if (node >= 2 * leafNodeOffset) {
+ return 0;
+ } else if (node >= leafNodeOffset) {
+ return 1;
+ } else {
+ final int leftCount = getNumLeavesSlow(node * 2);
+ final int rightCount = getNumLeavesSlow(node * 2 + 1);
+ return leftCount + rightCount;
+ }
+ }
}
/** Reads the original simple yet heap-heavy index format */
@@ -716,13 +751,11 @@ public final class BKDReader extends PointValues implements Accountable {
if (r == Relation.CELL_OUTSIDE_QUERY) {
// This cell is fully outside of the query shape: stop recursing
return 0L;
+ } else if (r == Relation.CELL_INSIDE_QUERY) {
+ return (long) maxPointsInLeafNode * state.index.getNumLeaves();
} else if (state.index.isLeafNode()) {
- if (r == Relation.CELL_INSIDE_QUERY) {
- return maxPointsInLeafNode;
- } else {
- // Assume half the points matched
- return (maxPointsInLeafNode + 1) / 2;
- }
+ // Assume half the points matched
+ return (maxPointsInLeafNode + 1) / 2;
} else {
// Non-leaf node: recurse on the split left and right nodes
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71aa463d/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
index afa8ec4..3a08bfa 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
@@ -18,29 +18,43 @@ package org.apache.lucene.codecs.lucene60;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.document.BinaryPoint;
+import org.apache.lucene.document.Document;
import org.apache.lucene.index.BasePointsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;
/**
* Tests Lucene60PointsFormat
*/
public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
private final Codec codec;
+ private final int maxPointsInLeafNode;
public TestLucene60PointsFormat() {
// standard issue
Codec defaultCodec = TestUtil.getDefaultCodec();
if (random().nextBoolean()) {
// randomize parameters
- int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
+ maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
if (VERBOSE) {
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
@@ -66,6 +80,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
} else {
// standard issue
codec = defaultCodec;
+ maxPointsInLeafNode = BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
}
}
@@ -79,5 +94,178 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
assumeFalse("TODO: mess with the parameters and test gets angry!", codec instanceof FilterCodec);
super.testMergeStability();
}
-
+
+ public void testEstimatePointCount() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+ byte[] pointValue = new byte[3];
+ byte[] uniquePointValue = new byte[3];
+ random().nextBytes(uniquePointValue);
+ final int numDocs = atLeast(10000); // make sure we have several leaves
+ for (int i = 0; i < numDocs; ++i) {
+ Document doc = new Document();
+ if (i == numDocs / 2) {
+ doc.add(new BinaryPoint("f", uniquePointValue));
+ } else {
+ do {
+ random().nextBytes(pointValue);
+ } while (Arrays.equals(pointValue, uniquePointValue));
+ doc.add(new BinaryPoint("f", pointValue));
+ }
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ final IndexReader r = DirectoryReader.open(w);
+ w.close();
+ final LeafReader lr = getOnlyLeafReader(r);
+ PointValues points = lr.getPointValues("f");
+
+ // If all points match, then the point count is numLeaves * maxPointsInLeafNode
+ final int numLeaves = (int) Math.ceil((double) numDocs / maxPointsInLeafNode);
+ assertEquals(numLeaves * maxPointsInLeafNode,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_INSIDE_QUERY;
+ }
+ }));
+
+ // Return 0 if no points match
+ assertEquals(0,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ }));
+
+ // If only one point matches, then the point count is (maxPointsInLeafNode + 1) / 2
+ assertEquals((maxPointsInLeafNode + 1) / 2,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ if (StringHelper.compare(3, uniquePointValue, 0, maxPackedValue, 0) > 0 ||
+ StringHelper.compare(3, uniquePointValue, 0, minPackedValue, 0) < 0) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ return Relation.CELL_CROSSES_QUERY;
+ }
+ }));
+
+ r.close();
+ dir.close();
+ }
+
+ // The tree is always balanced in the N dims case, and leaves are
+ // not all full so things are a bit different
+ public void testEstimatePointCount2Dims() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+ byte[][] pointValue = new byte[2][];
+ pointValue[0] = new byte[3];
+ pointValue[1] = new byte[3];
+ byte[][] uniquePointValue = new byte[2][];
+ uniquePointValue[0] = new byte[3];
+ uniquePointValue[1] = new byte[3];
+ random().nextBytes(uniquePointValue[0]);
+ random().nextBytes(uniquePointValue[1]);
+ final int numDocs = atLeast(10000); // make sure we have several leaves
+ for (int i = 0; i < numDocs; ++i) {
+ Document doc = new Document();
+ if (i == numDocs / 2) {
+ doc.add(new BinaryPoint("f", uniquePointValue));
+ } else {
+ do {
+ random().nextBytes(pointValue[0]);
+ random().nextBytes(pointValue[1]);
+ } while (Arrays.equals(pointValue[0], uniquePointValue[0]) || Arrays.equals(pointValue[1], uniquePointValue[1]));
+ doc.add(new BinaryPoint("f", pointValue));
+ }
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ final IndexReader r = DirectoryReader.open(w);
+ w.close();
+ final LeafReader lr = getOnlyLeafReader(r);
+ PointValues points = lr.getPointValues("f");
+
+ // With >1 dims, the tree is balanced
+ int actualMaxPointsInLeafNode = numDocs;
+ while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
+ actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
+ }
+
+ // If all points match, then the point count is numLeaves * maxPointsInLeafNode
+ final int numLeaves = Integer.highestOneBit((numDocs - 1) / actualMaxPointsInLeafNode) << 1;
+ assertEquals(numLeaves * actualMaxPointsInLeafNode,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_INSIDE_QUERY;
+ }
+ }));
+
+ // Return 0 if no points match
+ assertEquals(0,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ }));
+
+ // If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2
+ assertEquals((actualMaxPointsInLeafNode + 1) / 2,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ for (int dim = 0; dim < 2; ++dim) {
+ if (StringHelper.compare(3, uniquePointValue[0], 0, maxPackedValue, dim * 3) > 0 ||
+ StringHelper.compare(3, uniquePointValue[0], 0, minPackedValue, dim * 3) < 0) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ }
+ return Relation.CELL_CROSSES_QUERY;
+ }
+ }));
+
+ r.close();
+ dir.close();
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71aa463d/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index c5c5c1f..f01f058 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -1104,4 +1104,94 @@ public class TestBKD extends LuceneTestCase {
in.close();
dir.close();
}
+
+ public void testEstimatePointCount() throws IOException {
+ Directory dir = newDirectory();
+ final int numValues = atLeast(10000); // make sure to have multiple leaves
+ final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
+ final int numBytesPerDim = TestUtil.nextInt(random(), 1, 4);
+ final byte[] pointValue = new byte[numBytesPerDim];
+ final byte[] uniquePointValue = new byte[numBytesPerDim];
+ random().nextBytes(uniquePointValue);
+
+ BKDWriter w = new BKDWriter(numValues, dir, "_temp", 1, numBytesPerDim, maxPointsInLeafNode,
+ BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, numValues, true);
+ for (int i = 0; i < numValues; ++i) {
+ if (i == numValues / 2) {
+ w.add(uniquePointValue, i);
+ } else {
+ do {
+ random().nextBytes(pointValue);
+ } while (Arrays.equals(pointValue, uniquePointValue));
+ w.add(pointValue, i);
+ }
+ }
+ final long indexFP;
+ try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+ indexFP = w.finish(out);
+ w.close();
+ }
+
+ IndexInput pointsIn = dir.openInput("bkd", IOContext.DEFAULT);
+ pointsIn.seek(indexFP);
+ BKDReader points = new BKDReader(pointsIn);
+
+ int actualMaxPointsInLeafNode = numValues;
+ while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
+ actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
+ }
+
+ // If all points match, then the point count is numLeaves * maxPointsInLeafNode
+ final int numLeaves = Integer.highestOneBit((numValues - 1) / actualMaxPointsInLeafNode) << 1;
+ assertEquals(numLeaves * actualMaxPointsInLeafNode,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_INSIDE_QUERY;
+ }
+ }));
+
+ // Return 0 if no points match
+ assertEquals(0,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ }));
+
+ // If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2
+ assertEquals((actualMaxPointsInLeafNode + 1) / 2,
+ points.estimatePointCount(new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ if (StringHelper.compare(3, uniquePointValue, 0, maxPackedValue, 0) > 0 ||
+ StringHelper.compare(3, uniquePointValue, 0, minPackedValue, 0) < 0) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ return Relation.CELL_CROSSES_QUERY;
+ }
+ }));
+
+ pointsIn.close();
+ dir.close();
+ }
}