You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by kr...@apache.org on 2017/01/18 15:46:19 UTC
[14/16] lucene-solr:jira/solr-8593: LUCENE-7641: Speed up range
queries that match most documents.
LUCENE-7641: Speed up range queries that match most documents.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/3404677e
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/3404677e
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/3404677e
Branch: refs/heads/jira/solr-8593
Commit: 3404677e57fcf7901813f7d7ccfc3e57db011993
Parents: 9ee48aa
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Jan 18 13:48:27 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Jan 18 13:48:27 2017 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 6 ++
.../org/apache/lucene/index/CheckIndex.java | 4 +-
.../apache/lucene/search/PointRangeQuery.java | 74 ++++++++++++++++++++
.../org/apache/lucene/util/bkd/BKDReader.java | 8 ++-
.../org/apache/lucene/util/bkd/BKDWriter.java | 14 ++--
.../apache/lucene/search/TestPointQueries.java | 35 +++++++++
6 files changed, 130 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4df7a67..cee0335 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -93,6 +93,12 @@ Improvements
should be run, eg. using points or doc values depending on costs of other
parts of the query. (Adrien Grand)
+Optimizations
+
+* LUCENE-7641: Optimized point range queries to compute documents that do not
+ match the range on single-valued fields when more than half the documents in
+ the index would match. (Adrien Grand)
+
======================= Lucene 6.4.0 =======================
API Changes
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index 7611a7f..f3bdfb0 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -1813,8 +1813,8 @@ public final class CheckIndex implements Closeable {
int docCount = values.getDocCount();
final long crossCost = values.estimatePointCount(new ConstantRelationIntersectVisitor(Relation.CELL_CROSSES_QUERY));
- if (crossCost < size) {
- throw new RuntimeException("estimatePointCount should return >= size when all cells match");
+ if (crossCost < size / 2) {
+ throw new RuntimeException("estimatePointCount should return >= size/2 when all cells match");
}
final long insideCost = values.estimatePointCount(new ConstantRelationIntersectVisitor(Relation.CELL_INSIDE_QUERY));
if (insideCost < size) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
index 29c6e7f..7c997ca 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
@@ -26,7 +26,9 @@ import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.document.IntPoint; // javadocs
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.StringHelper;
/**
@@ -163,6 +165,64 @@ public abstract class PointRangeQuery extends Query {
};
}
+ /**
+ * Create a visitor that clears documents that do NOT match the range.
+ */
+ private IntersectVisitor getInverseIntersectVisitor(FixedBitSet result, int[] cost) {
+ return new IntersectVisitor() {
+
+ @Override
+ public void visit(int docID) {
+ result.clear(docID);
+ cost[0]--;
+ }
+
+ @Override
+ public void visit(int docID, byte[] packedValue) {
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = dim*bytesPerDim;
+ if (StringHelper.compare(bytesPerDim, packedValue, offset, lowerPoint, offset) < 0) {
+ // Doc's value is too low, in this dimension
+ result.clear(docID);
+ cost[0]--;
+ return;
+ }
+ if (StringHelper.compare(bytesPerDim, packedValue, offset, upperPoint, offset) > 0) {
+ // Doc's value is too high, in this dimension
+ result.clear(docID);
+ cost[0]--;
+ return;
+ }
+ }
+ }
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+
+ boolean crosses = false;
+
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = dim*bytesPerDim;
+
+ if (StringHelper.compare(bytesPerDim, minPackedValue, offset, upperPoint, offset) > 0 ||
+ StringHelper.compare(bytesPerDim, maxPackedValue, offset, lowerPoint, offset) < 0) {
+ // This dim is not in the range
+ return Relation.CELL_INSIDE_QUERY;
+ }
+
+ crosses |= StringHelper.compare(bytesPerDim, minPackedValue, offset, lowerPoint, offset) < 0 ||
+ StringHelper.compare(bytesPerDim, maxPackedValue, offset, upperPoint, offset) > 0;
+ }
+
+ if (crosses) {
+ return Relation.CELL_CROSSES_QUERY;
+ } else {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ }
+ };
+ }
+
@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
@@ -221,6 +281,20 @@ public abstract class PointRangeQuery extends Query {
@Override
public Scorer get(boolean randomAccess) throws IOException {
+ if (values.getDocCount() == reader.maxDoc()
+ && values.getDocCount() == values.size()
+ && cost() > reader.maxDoc() / 2) {
+ // If all docs have exactly one value and the cost is greater
+ // than half the leaf size then maybe we can make things faster
+ // by computing the set of documents that do NOT match the range
+ final FixedBitSet result = new FixedBitSet(reader.maxDoc());
+ result.set(0, reader.maxDoc());
+ int[] cost = new int[] { reader.maxDoc() };
+ values.intersect(getInverseIntersectVisitor(result, cost));
+ final DocIdSetIterator iterator = new BitSetIterator(result, cost[0]);
+ return new ConstantScoreScorer(weight, score(), iterator);
+ }
+
values.intersect(visitor);
DocIdSetIterator iterator = result.build().iterator();
return new ConstantScoreScorer(weight, score(), iterator);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 14e1adb..4089d82 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -717,8 +717,12 @@ public final class BKDReader extends PointValues implements Accountable {
// This cell is fully outside of the query shape: stop recursing
return 0L;
} else if (state.index.isLeafNode()) {
- // Assume all points match and there are no dups
- return maxPointsInLeafNode;
+ if (r == Relation.CELL_INSIDE_QUERY) {
+ return maxPointsInLeafNode;
+ } else {
+ // Assume half the points matched
+ return (maxPointsInLeafNode + 1) / 2;
+ }
} else {
// Non-leaf node: recurse on the split left and right nodes
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 5e391f4..eeb40fa 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -487,7 +487,7 @@ public class BKDWriter implements Closeable {
assert Arrays.equals(parentSplits, new int[numDims]);
long indexFP = out.getFilePointer();
- writeIndex(out, leafBlockFPs, splitPackedValues);
+ writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
return indexFP;
}
@@ -645,7 +645,7 @@ public class BKDWriter implements Closeable {
for(int i=0;i<leafBlockFPs.size();i++) {
arr[i] = leafBlockFPs.get(i);
}
- writeIndex(out, arr, index);
+ writeIndex(out, maxPointsInLeafNode, arr, index);
return indexFP;
}
@@ -1035,7 +1035,7 @@ public class BKDWriter implements Closeable {
// Write index:
long indexFP = out.getFilePointer();
- writeIndex(out, leafBlockFPs, splitPackedValues);
+ writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
return indexFP;
}
@@ -1241,16 +1241,16 @@ public class BKDWriter implements Closeable {
return result;
}
- private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
+ private void writeIndex(IndexOutput out, int countPerLeaf, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
byte[] packedIndex = packIndex(leafBlockFPs, splitPackedValues);
- writeIndex(out, leafBlockFPs.length, packedIndex);
+ writeIndex(out, countPerLeaf, leafBlockFPs.length, packedIndex);
}
- private void writeIndex(IndexOutput out, int numLeaves, byte[] packedIndex) throws IOException {
+ private void writeIndex(IndexOutput out, int countPerLeaf, int numLeaves, byte[] packedIndex) throws IOException {
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
out.writeVInt(numDims);
- out.writeVInt(maxPointsInLeafNode);
+ out.writeVInt(countPerLeaf);
out.writeVInt(bytesPerDim);
assert numLeaves > 0;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
index 5c66478..8f7beaf 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
@@ -69,6 +69,7 @@ import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;
import org.junit.BeforeClass;
public class TestPointQueries extends LuceneTestCase {
@@ -2080,4 +2081,38 @@ public class TestPointQueries extends LuceneTestCase {
assertTrue(Float.compare(Float.NEGATIVE_INFINITY, FloatPoint.nextDown(Float.NEGATIVE_INFINITY)) == 0);
assertTrue(Float.compare(Float.MAX_VALUE, FloatPoint.nextDown(Float.POSITIVE_INFINITY)) == 0);
}
+
+ public void testInversePointRange() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+ final int numDims = TestUtil.nextInt(random(), 1, 3);
+ final int numDocs = atLeast(10 * BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); // we need multiple leaves to enable this optimization
+ for (int i = 0; i < numDocs; ++i) {
+ Document doc = new Document();
+ int[] values = new int[numDims];
+ Arrays.fill(values, i);
+ doc.add(new IntPoint("f", values));
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ IndexReader r = DirectoryReader.open(w);
+ w.close();
+
+ IndexSearcher searcher = newSearcher(r);
+ int[] low = new int[numDims];
+ int[] high = new int[numDims];
+ Arrays.fill(high, numDocs - 2);
+ assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+ Arrays.fill(low, 1);
+ assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+ Arrays.fill(high, numDocs - 1);
+ assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+ Arrays.fill(low, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE + 1);
+ assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+ Arrays.fill(high, numDocs - BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+ assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+
+ r.close();
+ dir.close();
+ }
}