You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2017/01/18 12:49:31 UTC

lucene-solr:master: LUCENE-7641: Speed up range queries that match most documents.

Repository: lucene-solr
Updated Branches:
  refs/heads/master 9ee48aa85 -> 3404677e5


LUCENE-7641: Speed up range queries that match most documents.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/3404677e
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/3404677e
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/3404677e

Branch: refs/heads/master
Commit: 3404677e57fcf7901813f7d7ccfc3e57db011993
Parents: 9ee48aa
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Jan 18 13:48:27 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Jan 18 13:48:27 2017 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  6 ++
 .../org/apache/lucene/index/CheckIndex.java     |  4 +-
 .../apache/lucene/search/PointRangeQuery.java   | 74 ++++++++++++++++++++
 .../org/apache/lucene/util/bkd/BKDReader.java   |  8 ++-
 .../org/apache/lucene/util/bkd/BKDWriter.java   | 14 ++--
 .../apache/lucene/search/TestPointQueries.java  | 35 +++++++++
 6 files changed, 130 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4df7a67..cee0335 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -93,6 +93,12 @@ Improvements
   should be run, eg. using points or doc values depending on costs of other
   parts of the query. (Adrien Grand)
 
+Optimizations
+
+* LUCENE-7641: Optimized point range queries to compute documents that do not
+  match the range on single-valued fields when more than half the documents in
+  the index would match. (Adrien Grand)
+
 ======================= Lucene 6.4.0 =======================
 
 API Changes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index 7611a7f..f3bdfb0 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -1813,8 +1813,8 @@ public final class CheckIndex implements Closeable {
             int docCount = values.getDocCount();
 
             final long crossCost = values.estimatePointCount(new ConstantRelationIntersectVisitor(Relation.CELL_CROSSES_QUERY));
-            if (crossCost < size) {
-              throw new RuntimeException("estimatePointCount should return >= size when all cells match");
+            if (crossCost < size / 2) {
+              throw new RuntimeException("estimatePointCount should return >= size/2 when all cells match");
             }
             final long insideCost = values.estimatePointCount(new ConstantRelationIntersectVisitor(Relation.CELL_INSIDE_QUERY));
             if (insideCost < size) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
index 29c6e7f..7c997ca 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
@@ -26,7 +26,9 @@ import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.document.IntPoint;    // javadocs
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.StringHelper;
 
 /** 
@@ -163,6 +165,64 @@ public abstract class PointRangeQuery extends Query {
         };
       }
 
+      /**
+       * Create a visitor that clears documents that do NOT match the range.
+       */
+      private IntersectVisitor getInverseIntersectVisitor(FixedBitSet result, int[] cost) {
+        return new IntersectVisitor() {
+
+          @Override
+          public void visit(int docID) {
+            result.clear(docID);
+            cost[0]--;
+          }
+
+          @Override
+          public void visit(int docID, byte[] packedValue) {
+            for(int dim=0;dim<numDims;dim++) {
+              int offset = dim*bytesPerDim;
+              if (StringHelper.compare(bytesPerDim, packedValue, offset, lowerPoint, offset) < 0) {
+                // Doc's value is too low, in this dimension
+                result.clear(docID);
+                cost[0]--;
+                return;
+              }
+              if (StringHelper.compare(bytesPerDim, packedValue, offset, upperPoint, offset) > 0) {
+                // Doc's value is too high, in this dimension
+                result.clear(docID);
+                cost[0]--;
+                return;
+              }
+            }
+          }
+
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+
+            boolean crosses = false;
+
+            for(int dim=0;dim<numDims;dim++) {
+              int offset = dim*bytesPerDim;
+
+              if (StringHelper.compare(bytesPerDim, minPackedValue, offset, upperPoint, offset) > 0 ||
+                  StringHelper.compare(bytesPerDim, maxPackedValue, offset, lowerPoint, offset) < 0) {
+                // This dim is not in the range
+                return Relation.CELL_INSIDE_QUERY;
+              }
+
+              crosses |= StringHelper.compare(bytesPerDim, minPackedValue, offset, lowerPoint, offset) < 0 ||
+                  StringHelper.compare(bytesPerDim, maxPackedValue, offset, upperPoint, offset) > 0;
+            }
+
+            if (crosses) {
+              return Relation.CELL_CROSSES_QUERY;
+            } else {
+              return Relation.CELL_OUTSIDE_QUERY;
+            }
+          }
+        };
+      }
+
       @Override
       public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
         LeafReader reader = context.reader();
@@ -221,6 +281,20 @@ public abstract class PointRangeQuery extends Query {
 
             @Override
             public Scorer get(boolean randomAccess) throws IOException {
+              if (values.getDocCount() == reader.maxDoc()
+                  && values.getDocCount() == values.size()
+                  && cost() > reader.maxDoc() / 2) {
+                // If all docs have exactly one value and the cost is greater
+                // than half the leaf size then maybe we can make things faster
+                // by computing the set of documents that do NOT match the range
+                final FixedBitSet result = new FixedBitSet(reader.maxDoc());
+                result.set(0, reader.maxDoc());
+                int[] cost = new int[] { reader.maxDoc() };
+                values.intersect(getInverseIntersectVisitor(result, cost));
+                final DocIdSetIterator iterator = new BitSetIterator(result, cost[0]);
+                return new ConstantScoreScorer(weight, score(), iterator);
+              }
+
               values.intersect(visitor);
               DocIdSetIterator iterator = result.build().iterator();
               return new ConstantScoreScorer(weight, score(), iterator);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 14e1adb..4089d82 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -717,8 +717,12 @@ public final class BKDReader extends PointValues implements Accountable {
       // This cell is fully outside of the query shape: stop recursing
       return 0L;
     } else if (state.index.isLeafNode()) {
-      // Assume all points match and there are no dups
-      return maxPointsInLeafNode;
+      if (r == Relation.CELL_INSIDE_QUERY) {
+        return maxPointsInLeafNode;
+      } else {
+        // Assume half the points matched
+        return (maxPointsInLeafNode + 1) / 2;
+      }
     } else {
       
       // Non-leaf node: recurse on the split left and right nodes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 5e391f4..eeb40fa 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -487,7 +487,7 @@ public class BKDWriter implements Closeable {
     assert Arrays.equals(parentSplits, new int[numDims]);
 
     long indexFP = out.getFilePointer();
-    writeIndex(out, leafBlockFPs, splitPackedValues);
+    writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
     return indexFP;
   }
 
@@ -645,7 +645,7 @@ public class BKDWriter implements Closeable {
       for(int i=0;i<leafBlockFPs.size();i++) {
         arr[i] = leafBlockFPs.get(i);
       }
-      writeIndex(out, arr, index);
+      writeIndex(out, maxPointsInLeafNode, arr, index);
       return indexFP;
     }
 
@@ -1035,7 +1035,7 @@ public class BKDWriter implements Closeable {
 
     // Write index:
     long indexFP = out.getFilePointer();
-    writeIndex(out, leafBlockFPs, splitPackedValues);
+    writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
     return indexFP;
   }
 
@@ -1241,16 +1241,16 @@ public class BKDWriter implements Closeable {
     return result;
   }
 
-  private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
+  private void writeIndex(IndexOutput out, int countPerLeaf, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
     byte[] packedIndex = packIndex(leafBlockFPs, splitPackedValues);
-    writeIndex(out, leafBlockFPs.length, packedIndex);
+    writeIndex(out, countPerLeaf, leafBlockFPs.length, packedIndex);
   }
   
-  private void writeIndex(IndexOutput out, int numLeaves, byte[] packedIndex) throws IOException {
+  private void writeIndex(IndexOutput out, int countPerLeaf, int numLeaves, byte[] packedIndex) throws IOException {
     
     CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
     out.writeVInt(numDims);
-    out.writeVInt(maxPointsInLeafNode);
+    out.writeVInt(countPerLeaf);
     out.writeVInt(bytesPerDim);
 
     assert numLeaves > 0;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3404677e/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
index 5c66478..8f7beaf 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
@@ -69,6 +69,7 @@ import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.NumericUtils;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.junit.BeforeClass;
 
 public class TestPointQueries extends LuceneTestCase {
@@ -2080,4 +2081,38 @@ public class TestPointQueries extends LuceneTestCase {
     assertTrue(Float.compare(Float.NEGATIVE_INFINITY, FloatPoint.nextDown(Float.NEGATIVE_INFINITY)) == 0);
     assertTrue(Float.compare(Float.MAX_VALUE, FloatPoint.nextDown(Float.POSITIVE_INFINITY)) == 0);
   }
+
+  public void testInversePointRange() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    final int numDims = TestUtil.nextInt(random(), 1, 3);
+    final int numDocs = atLeast(10 * BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); // we need multiple leaves to enable this optimization
+    for (int i = 0; i < numDocs; ++i) {
+      Document doc = new Document();
+      int[] values = new int[numDims];
+      Arrays.fill(values, i);
+      doc.add(new IntPoint("f", values));
+      w.addDocument(doc);
+    }
+    w.forceMerge(1);
+    IndexReader r = DirectoryReader.open(w);
+    w.close();
+
+    IndexSearcher searcher = newSearcher(r);
+    int[] low = new int[numDims];
+    int[] high = new int[numDims];
+    Arrays.fill(high, numDocs - 2);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+    Arrays.fill(low, 1);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+    Arrays.fill(high, numDocs - 1);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+    Arrays.fill(low, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE + 1);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+    Arrays.fill(high, numDocs - BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+
+    r.close();
+    dir.close();
+  }
 }