You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2017/01/18 13:55:34 UTC

lucene-solr:branch_6x: LUCENE-7641: Speed up range queries that match most documents.

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x 70f312943 -> be39d1fd0


LUCENE-7641: Speed up range queries that match most documents.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/be39d1fd
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/be39d1fd
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/be39d1fd

Branch: refs/heads/branch_6x
Commit: be39d1fd0971c211a05ee8ba6e7215e0c8cf5190
Parents: 70f3129
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Jan 18 13:48:27 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Jan 18 13:54:17 2017 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  6 ++
 .../org/apache/lucene/index/CheckIndex.java     |  4 +-
 .../apache/lucene/search/PointRangeQuery.java   | 74 ++++++++++++++++++++
 .../org/apache/lucene/util/bkd/BKDReader.java   |  8 ++-
 .../org/apache/lucene/util/bkd/BKDWriter.java   | 14 ++--
 .../apache/lucene/search/TestPointQueries.java  | 35 +++++++++
 6 files changed, 130 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be39d1fd/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 50dcdd8..7c5f956 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -35,6 +35,12 @@ Improvements
   should be run, eg. using points or doc values depending on costs of other
   parts of the query. (Adrien Grand)
 
+Optimizations
+
+* LUCENE-7641: Optimized point range queries to compute documents that do not
+  match the range on single-valued fields when more than half the documents in
+  the index would match. (Adrien Grand)
+
 ======================= Lucene 6.4.0 =======================
 
 API Changes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be39d1fd/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index 27e878b..1b23274 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -1808,8 +1808,8 @@ public final class CheckIndex implements Closeable {
             int docCount = values.getDocCount(fieldInfo.name);
 
             final long crossCost = values.estimatePointCount(fieldInfo.name, new ConstantRelationIntersectVisitor(Relation.CELL_CROSSES_QUERY));
-            if (crossCost < size) {
-              throw new RuntimeException("estimatePointCount should return >= size when all cells match");
+            if (crossCost < size / 2) {
+              throw new RuntimeException("estimatePointCount should return >= size/2 when all cells match");
             }
             final long insideCost = values.estimatePointCount(fieldInfo.name, new ConstantRelationIntersectVisitor(Relation.CELL_INSIDE_QUERY));
             if (insideCost < size) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be39d1fd/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
index 20e9f33..0f73c66 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
@@ -27,7 +27,9 @@ import org.apache.lucene.document.IntPoint;    // javadocs
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.StringHelper;
 
 /** 
@@ -164,6 +166,64 @@ public abstract class PointRangeQuery extends Query {
         };
       }
 
+      /**
+       * Create a visitor that clears documents that do NOT match the range.
+       */
+      private IntersectVisitor getInverseIntersectVisitor(FixedBitSet result, int[] cost) {
+        return new IntersectVisitor() {
+
+          @Override
+          public void visit(int docID) {
+            result.clear(docID);
+            cost[0]--;
+          }
+
+          @Override
+          public void visit(int docID, byte[] packedValue) {
+            for(int dim=0;dim<numDims;dim++) {
+              int offset = dim*bytesPerDim;
+              if (StringHelper.compare(bytesPerDim, packedValue, offset, lowerPoint, offset) < 0) {
+                // Doc's value is too low, in this dimension
+                result.clear(docID);
+                cost[0]--;
+                return;
+              }
+              if (StringHelper.compare(bytesPerDim, packedValue, offset, upperPoint, offset) > 0) {
+                // Doc's value is too high, in this dimension
+                result.clear(docID);
+                cost[0]--;
+                return;
+              }
+            }
+          }
+
+          @Override
+          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+
+            boolean crosses = false;
+
+            for(int dim=0;dim<numDims;dim++) {
+              int offset = dim*bytesPerDim;
+
+              if (StringHelper.compare(bytesPerDim, minPackedValue, offset, upperPoint, offset) > 0 ||
+                  StringHelper.compare(bytesPerDim, maxPackedValue, offset, lowerPoint, offset) < 0) {
+                // This dim is not in the range
+                return Relation.CELL_INSIDE_QUERY;
+              }
+
+              crosses |= StringHelper.compare(bytesPerDim, minPackedValue, offset, lowerPoint, offset) < 0 ||
+                  StringHelper.compare(bytesPerDim, maxPackedValue, offset, upperPoint, offset) > 0;
+            }
+
+            if (crosses) {
+              return Relation.CELL_CROSSES_QUERY;
+            } else {
+              return Relation.CELL_OUTSIDE_QUERY;
+            }
+          }
+        };
+      }
+
       @Override
       public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
         LeafReader reader = context.reader();
@@ -225,6 +285,20 @@ public abstract class PointRangeQuery extends Query {
 
             @Override
             public Scorer get(boolean randomAccess) throws IOException {
+              if (values.getDocCount(field) == reader.maxDoc()
+                  && values.getDocCount(field) == values.size(field)
+                  && cost() > reader.maxDoc() / 2) {
+                // If all docs have exactly one value and the cost is greater
+                // than half the leaf size then maybe we can make things faster
+                // by computing the set of documents that do NOT match the range
+                final FixedBitSet result = new FixedBitSet(reader.maxDoc());
+                result.set(0, reader.maxDoc());
+                int[] cost = new int[] { reader.maxDoc() };
+                values.intersect(field, getInverseIntersectVisitor(result, cost));
+                final DocIdSetIterator iterator = new BitSetIterator(result, cost[0]);
+                return new ConstantScoreScorer(weight, score(), iterator);
+              }
+
               values.intersect(field, visitor);
               DocIdSetIterator iterator = result.build().iterator();
               return new ConstantScoreScorer(weight, score(), iterator);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be39d1fd/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 7778472..5d2ac9a 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -716,8 +716,12 @@ public final class BKDReader implements Accountable {
       // This cell is fully outside of the query shape: stop recursing
       return 0L;
     } else if (state.index.isLeafNode()) {
-      // Assume all points match and there are no dups
-      return maxPointsInLeafNode;
+      if (r == Relation.CELL_INSIDE_QUERY) {
+        return maxPointsInLeafNode;
+      } else {
+        // Assume half the points matched
+        return (maxPointsInLeafNode + 1) / 2;
+      }
     } else {
       
       // Non-leaf node: recurse on the split left and right nodes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be39d1fd/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 120bd65..44d7e0c 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -487,7 +487,7 @@ public class BKDWriter implements Closeable {
     assert Arrays.equals(parentSplits, new int[numDims]);
 
     long indexFP = out.getFilePointer();
-    writeIndex(out, leafBlockFPs, splitPackedValues);
+    writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
     return indexFP;
   }
 
@@ -645,7 +645,7 @@ public class BKDWriter implements Closeable {
       for(int i=0;i<leafBlockFPs.size();i++) {
         arr[i] = leafBlockFPs.get(i);
       }
-      writeIndex(out, arr, index);
+      writeIndex(out, maxPointsInLeafNode, arr, index);
       return indexFP;
     }
 
@@ -1035,7 +1035,7 @@ public class BKDWriter implements Closeable {
 
     // Write index:
     long indexFP = out.getFilePointer();
-    writeIndex(out, leafBlockFPs, splitPackedValues);
+    writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
     return indexFP;
   }
 
@@ -1241,16 +1241,16 @@ public class BKDWriter implements Closeable {
     return result;
   }
 
-  private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
+  private void writeIndex(IndexOutput out, int countPerLeaf, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
     byte[] packedIndex = packIndex(leafBlockFPs, splitPackedValues);
-    writeIndex(out, leafBlockFPs.length, packedIndex);
+    writeIndex(out, countPerLeaf, leafBlockFPs.length, packedIndex);
   }
   
-  private void writeIndex(IndexOutput out, int numLeaves, byte[] packedIndex) throws IOException {
+  private void writeIndex(IndexOutput out, int countPerLeaf, int numLeaves, byte[] packedIndex) throws IOException {
     
     CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
     out.writeVInt(numDims);
-    out.writeVInt(maxPointsInLeafNode);
+    out.writeVInt(countPerLeaf);
     out.writeVInt(bytesPerDim);
 
     assert numLeaves > 0;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/be39d1fd/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
index 5518082..09f7596 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
@@ -69,6 +69,7 @@ import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.NumericUtils;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.junit.BeforeClass;
 
 public class TestPointQueries extends LuceneTestCase {
@@ -2077,4 +2078,38 @@ public class TestPointQueries extends LuceneTestCase {
     assertTrue(Float.compare(Float.NEGATIVE_INFINITY, FloatPoint.nextDown(Float.NEGATIVE_INFINITY)) == 0);
     assertTrue(Float.compare(Float.MAX_VALUE, FloatPoint.nextDown(Float.POSITIVE_INFINITY)) == 0);
   }
+
+  public void testInversePointRange() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    final int numDims = TestUtil.nextInt(random(), 1, 3);
+    final int numDocs = atLeast(10 * BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE); // we need multiple leaves to enable this optimization
+    for (int i = 0; i < numDocs; ++i) {
+      Document doc = new Document();
+      int[] values = new int[numDims];
+      Arrays.fill(values, i);
+      doc.add(new IntPoint("f", values));
+      w.addDocument(doc);
+    }
+    w.forceMerge(1);
+    IndexReader r = DirectoryReader.open(w);
+    w.close();
+
+    IndexSearcher searcher = newSearcher(r);
+    int[] low = new int[numDims];
+    int[] high = new int[numDims];
+    Arrays.fill(high, numDocs - 2);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+    Arrays.fill(low, 1);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+    Arrays.fill(high, numDocs - 1);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+    Arrays.fill(low, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE + 1);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+    Arrays.fill(high, numDocs - BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
+    assertEquals(high[0] - low[0] + 1, searcher.count(IntPoint.newRangeQuery("f", low, high)));
+
+    r.close();
+    dir.close();
+  }
 }