You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2017/01/30 10:20:51 UTC

[6/6] lucene-solr:branch_6x: LUCENE-7654: To-parent block joins should implement two-phase iteration.

LUCENE-7654: To-parent block joins should implement two-phase iteration.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b6b44d44
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b6b44d44
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b6b44d44

Branch: refs/heads/branch_6x
Commit: b6b44d44d8277c533c7df975e05dd1c313cf3f23
Parents: 1332f0f
Author: Adrien Grand <jp...@gmail.com>
Authored: Mon Jan 30 10:46:22 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Mon Jan 30 11:16:41 2017 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   4 +
 .../search/join/ToParentBlockJoinQuery.java     | 336 ++++++++++---------
 .../lucene/search/join/TestBlockJoin.java       |   5 +-
 .../search/join/TestBlockJoinValidation.java    |  19 --
 4 files changed, 189 insertions(+), 175 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b6b44d44/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index e1af086..c26b8b9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -77,6 +77,10 @@ Optimizations
   whether BKD cells are entirely within the distance close to the dateline.
   (Adrien Grand)
 
+* LUCENE-7654: ToParentBlockJoinQuery now implements two-phase iteration and
+  computes scores lazily in order to be faster when used in conjunctions.
+  (Adrien Grand)
+
 Build
 
 * LUCENE-7651: Fix Javadocs build for Java 8u121 by injecting "Google Code

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b6b44d44/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
----------------------------------------------------------------------
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
index 254917f..0e15ac2 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java
@@ -20,15 +20,18 @@ import java.io.IOException;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Locale;
+
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.search.FilterWeight;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.FilterWeight;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.ScorerSupplier;
+import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.BitSet;
 
@@ -74,7 +77,7 @@ public class ToParentBlockJoinQuery extends Query {
   private final ScoreMode scoreMode;
 
   /** Create a ToParentBlockJoinQuery.
-   * 
+   *
    * @param childQuery Query matching child documents.
    * @param parentsFilter Filter identifying the parent documents.
    * @param scoreMode How to aggregate multiple child scores
@@ -100,7 +103,7 @@ public class ToParentBlockJoinQuery extends Query {
   public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
     return new BlockJoinWeight(this, childQuery.createWeight(searcher, needsScores), parentsFilter, needsScores ? scoreMode : ScoreMode.None);
   }
-  
+
   /** Return our child query. */
   public Query getChildQuery() {
     return childQuery;
@@ -116,33 +119,44 @@ public class ToParentBlockJoinQuery extends Query {
       this.scoreMode = scoreMode;
     }
 
-    // NOTE: acceptDocs applies (and is checked) only in the
-    // parent document space
     @Override
-    public Scorer scorer(LeafReaderContext readerContext) throws IOException {
-
-      final Scorer childScorer = in.scorer(readerContext);
-      if (childScorer == null) {
-        // No matches
+    public Scorer scorer(LeafReaderContext context) throws IOException {
+      final ScorerSupplier scorerSupplier = scorerSupplier(context);
+      if (scorerSupplier == null) {
         return null;
       }
+      return scorerSupplier.get(false);
+    }
 
-      final int firstChildDoc = childScorer.iterator().nextDoc();
-      if (firstChildDoc == DocIdSetIterator.NO_MORE_DOCS) {
-        // No matches
+    // NOTE: acceptDocs applies (and is checked) only in the
+    // parent document space
+    @Override
+    public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+      final ScorerSupplier childScorerSupplier = in.scorerSupplier(context);
+      if (childScorerSupplier == null) {
         return null;
       }
 
       // NOTE: this does not take accept docs into account, the responsibility
       // to not match deleted docs is on the scorer
-      final BitSet parents = parentsFilter.getBitSet(readerContext);
-
+      final BitSet parents = parentsFilter.getBitSet(context);
       if (parents == null) {
         // No matches
         return null;
       }
 
-      return new BlockJoinScorer(this, childScorer, parents, firstChildDoc, scoreMode);
+      return new ScorerSupplier() {
+
+        @Override
+        public Scorer get(boolean randomAccess) throws IOException {
+          return new BlockJoinScorer(BlockJoinWeight.this, childScorerSupplier.get(randomAccess), parents, scoreMode);
+        }
+
+        @Override
+        public long cost() {
+          return childScorerSupplier.cost();
+        }
+      };
     }
 
     @Override
@@ -154,180 +168,191 @@ public class ToParentBlockJoinQuery extends Query {
       return Explanation.noMatch("Not a match");
     }
   }
-  
-  static class BlockJoinScorer extends Scorer {
-    private final Scorer childScorer;
+
+  private static class ParentApproximation extends DocIdSetIterator {
+
+    private final DocIdSetIterator childApproximation;
     private final BitSet parentBits;
-    private final ScoreMode scoreMode;
-    private int parentDoc = -1;
-    private int prevParentDoc;
-    private float parentScore;
-    private int parentFreq;
-    private int nextChildDoc;
-    private int childDocUpto;
-
-    public BlockJoinScorer(Weight weight, Scorer childScorer, BitSet parentBits, int firstChildDoc, ScoreMode scoreMode) {
-      super(weight);
-      //System.out.println("Q.init firstChildDoc=" + firstChildDoc);
+    private int doc = -1;
+
+    ParentApproximation(DocIdSetIterator childApproximation, BitSet parentBits) {
+      this.childApproximation = childApproximation;
       this.parentBits = parentBits;
-      this.childScorer = childScorer;
-      this.scoreMode = scoreMode;
-      nextChildDoc = firstChildDoc;
     }
 
     @Override
-    public Collection<ChildScorer> getChildren() {
-      return Collections.singleton(new ChildScorer(childScorer, "BLOCK_JOIN"));
+    public int docID() {
+      return doc;
     }
 
     @Override
-    public DocIdSetIterator iterator() {
-      return new DocIdSetIterator() {
-        final DocIdSetIterator childIt = childScorer.iterator();
-
-        @Override
-        public int nextDoc() throws IOException {
-          //System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc);
-          if (nextChildDoc == NO_MORE_DOCS) {
-            //System.out.println("  end");
-            return parentDoc = NO_MORE_DOCS;
-          }
-
-          // Gather all children sharing the same parent as
-          // nextChildDoc
-
-          parentDoc = parentBits.nextSetBit(nextChildDoc);
-
-          // Parent & child docs are supposed to be
-          // orthogonal:
-          checkOrthogonal(nextChildDoc, parentDoc);
-
-          //System.out.println("  parentDoc=" + parentDoc);
-          assert parentDoc != DocIdSetIterator.NO_MORE_DOCS;
-
-          float totalScore = 0;
-          float maxScore = Float.NEGATIVE_INFINITY;
-          float minScore = Float.POSITIVE_INFINITY;
-
-          childDocUpto = 0;
-          parentFreq = 0;
-          do {
-
-            //System.out.println("  c=" + nextChildDoc);
-            if (scoreMode != ScoreMode.None) {
-              // TODO: specialize this into dedicated classes per-scoreMode
-              final float childScore = childScorer.score();
-              final int childFreq = childScorer.freq();
-              maxScore = Math.max(childScore, maxScore);
-              minScore = Math.min(childScore, minScore);
-              totalScore += childScore;
-              parentFreq += childFreq;
-            }
-            childDocUpto++;
-            nextChildDoc = childIt.nextDoc();
-          } while (nextChildDoc < parentDoc);
-
-          // Parent & child docs are supposed to be
-          // orthogonal:
-          checkOrthogonal(nextChildDoc, parentDoc);
-
-          switch(scoreMode) {
-          case Avg:
-            parentScore = totalScore / childDocUpto;
-            break;
-          case Max:
-            parentScore = maxScore;
-            break;
-          case Min:
-            parentScore = minScore;
-            break;
-          case Total:
-            parentScore = totalScore;
-            break;
-          case None:
-            break;
-          }
-
-          //System.out.println("  return parentDoc=" + parentDoc + " childDocUpto=" + childDocUpto);
-          return parentDoc;
-        }
-
-        @Override
-        public int advance(int parentTarget) throws IOException {
+    public int nextDoc() throws IOException {
+      return advance(doc + 1);
+    }
 
-          //System.out.println("Q.advance parentTarget=" + parentTarget);
-          if (parentTarget == NO_MORE_DOCS) {
-            return parentDoc = NO_MORE_DOCS;
-          }
+    @Override
+    public int advance(int target) throws IOException {
+      if (target >= parentBits.length()) {
+        return doc = NO_MORE_DOCS;
+      }
+      final int firstChildTarget = target == 0 ? 0 : parentBits.prevSetBit(target - 1) + 1;
+      int childDoc = childApproximation.docID();
+      if (childDoc < firstChildTarget) {
+        childDoc = childApproximation.advance(firstChildTarget);
+      }
+      if (childDoc >= parentBits.length() - 1) {
+        return doc = NO_MORE_DOCS;
+      }
+      return doc = parentBits.nextSetBit(childDoc + 1);
+    }
 
-          if (parentTarget == 0) {
-            // Callers should only be passing in a docID from
-            // the parent space, so this means this parent
-            // has no children (it got docID 0), so it cannot
-            // possibly match.  We must handle this case
-            // separately otherwise we pass invalid -1 to
-            // prevSetBit below:
-            return nextDoc();
-          }
+    @Override
+    public long cost() {
+      return childApproximation.cost();
+    }
+  }
 
-          prevParentDoc = parentBits.prevSetBit(parentTarget-1);
+  private static class ParentTwoPhase extends TwoPhaseIterator {
 
-          //System.out.println("  rolled back to prevParentDoc=" + prevParentDoc + " vs parentDoc=" + parentDoc);
-          assert prevParentDoc >= parentDoc;
-          if (prevParentDoc > nextChildDoc) {
-            nextChildDoc = childIt.advance(prevParentDoc);
-            // System.out.println("  childScorer advanced to child docID=" + nextChildDoc);
-          //} else {
-            //System.out.println("  skip childScorer advance");
-          }
+    private final ParentApproximation parentApproximation;
+    private final DocIdSetIterator childApproximation;
+    private final TwoPhaseIterator childTwoPhase;
 
-          // Parent & child docs are supposed to be orthogonal:
-          checkOrthogonal(nextChildDoc, prevParentDoc);
+    ParentTwoPhase(ParentApproximation parentApproximation, TwoPhaseIterator childTwoPhase) {
+      super(parentApproximation);
+      this.parentApproximation = parentApproximation;
+      this.childApproximation = childTwoPhase.approximation();
+      this.childTwoPhase = childTwoPhase;
+    }
 
-          final int nd = nextDoc();
-          //System.out.println("  return nextParentDoc=" + nd);
-          return nd;
+    @Override
+    public boolean matches() throws IOException {
+      assert childApproximation.docID() < parentApproximation.docID();
+      do {
+        if (childTwoPhase.matches()) {
+          return true;
         }
+      } while (childApproximation.nextDoc() < parentApproximation.docID());
+      return false;
+    }
 
-        @Override
-        public int docID() {
-          return parentDoc;
-        }
+    @Override
+    public float matchCost() {
+      // TODO: how could we compute a match cost?
+      return childTwoPhase.matchCost() + 10;
+    }
+  }
 
-        @Override
-        public long cost() {
-          return childIt.cost();
-        }
-      };
+  static class BlockJoinScorer extends Scorer {
+    private final Scorer childScorer;
+    private final BitSet parentBits;
+    private final ScoreMode scoreMode;
+    private final DocIdSetIterator childApproximation;
+    private final TwoPhaseIterator childTwoPhase;
+    private final ParentApproximation parentApproximation;
+    private final ParentTwoPhase parentTwoPhase;
+    private float score;
+    private int freq;
+
+    public BlockJoinScorer(Weight weight, Scorer childScorer, BitSet parentBits, ScoreMode scoreMode) {
+      super(weight);
+      //System.out.println("Q.init firstChildDoc=" + firstChildDoc);
+      this.parentBits = parentBits;
+      this.childScorer = childScorer;
+      this.scoreMode = scoreMode;
+      childTwoPhase = childScorer.twoPhaseIterator();
+      if (childTwoPhase == null) {
+        childApproximation = childScorer.iterator();
+        parentApproximation = new ParentApproximation(childApproximation, parentBits);
+        parentTwoPhase = null;
+      } else {
+        childApproximation = childTwoPhase.approximation();
+        parentApproximation = new ParentApproximation(childTwoPhase.approximation(), parentBits);
+        parentTwoPhase = new ParentTwoPhase(parentApproximation, childTwoPhase);
+      }
     }
 
-    private void checkOrthogonal(int childDoc, int parentDoc) {
-      if (childDoc==parentDoc) {
-        throw new IllegalStateException("Child query must not match same docs with parent filter. "
-             + "Combine them as must clauses (+) to find a problem doc. "
-             + "docId=" + nextChildDoc + ", " + childScorer.getClass());
-        
+    @Override
+    public Collection<ChildScorer> getChildren() {
+      return Collections.singleton(new ChildScorer(childScorer, "BLOCK_JOIN"));
+    }
+
+    @Override
+    public DocIdSetIterator iterator() {
+      if (parentTwoPhase == null) {
+        // the approximation is exact
+        return parentApproximation;
+      } else {
+        return TwoPhaseIterator.asDocIdSetIterator(parentTwoPhase);
       }
     }
 
     @Override
+    public TwoPhaseIterator twoPhaseIterator() {
+      return parentTwoPhase;
+    }
+
+    @Override
     public int docID() {
-      return parentDoc;
+      return parentApproximation.docID();
     }
 
     @Override
     public float score() throws IOException {
-      return parentScore;
+      setScoreAndFreq();
+      return score;
     }
     
     @Override
-    public int freq() {
-      return parentFreq;
+    public int freq() throws IOException {
+      setScoreAndFreq();
+      return freq;
+    }
+
+    private void setScoreAndFreq() throws IOException {
+      if (childApproximation.docID() >= parentApproximation.docID()) {
+        return;
+      }
+      double score = scoreMode == ScoreMode.None ? 0 : childScorer.score();
+      int freq = 1;
+      while (childApproximation.nextDoc() < parentApproximation.docID()) {
+        if (childTwoPhase == null || childTwoPhase.matches()) {
+          final float childScore = childScorer.score();
+          freq += 1;
+          switch (scoreMode) {
+            case Total:
+            case Avg:
+              score += childScore;
+              break;
+            case Min:
+              score = Math.min(score, childScore);
+              break;
+            case Max:
+              score = Math.min(score, childScore);
+              break;
+            case None:
+              break;
+            default:
+              throw new AssertionError();
+          }
+        }
+      }
+      if (childApproximation.docID() == parentApproximation.docID() && (childTwoPhase == null || childTwoPhase.matches())) {
+        throw new IllegalStateException("Child query must not match same docs with parent filter. "
+            + "Combine them as must clauses (+) to find a problem doc. "
+            + "docId=" + parentApproximation.docID() + ", " + childScorer.getClass());
+      }
+      if (scoreMode == ScoreMode.Avg) {
+        score /= freq;
+      }
+      this.score = (float) score;
+      this.freq = freq;
     }
 
     public Explanation explain(LeafReaderContext context, Weight childWeight) throws IOException {
+      int prevParentDoc = parentBits.prevSetBit(parentApproximation.docID() - 1);
       int start = context.docBase + prevParentDoc + 1; // +1 b/c prevParentDoc is previous parent doc
-      int end = context.docBase + parentDoc - 1; // -1 b/c parentDoc is parent doc
+      int end = context.docBase + parentApproximation.docID() - 1; // -1 b/c parentDoc is parent doc
 
       Explanation bestChild = null;
       int matches = 0;
@@ -341,6 +366,7 @@ public class ToParentBlockJoinQuery extends Query {
         }
       }
 
+      assert freq() == matches;
       return Explanation.match(score(), String.format(Locale.ROOT,
           "Score based on %d child docs in range from %d to %d, best match:", matches, start, end), bestChild
       );

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b6b44d44/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
----------------------------------------------------------------------
diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
index 02f6b23..9a6ead4 100644
--- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
+++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
@@ -671,7 +671,7 @@ public class TestBlockJoin extends LuceneTestCase {
         System.out.println("TEST: iter=" + (1+iter) + " of " + iters);
       }
 
-      final Query childQuery;
+      Query childQuery;
       if (random().nextInt(3) == 2) {
         final int childFieldID = random().nextInt(childFields.length);
         childQuery = new TermQuery(new Term("child" + childFieldID,
@@ -706,6 +706,9 @@ public class TestBlockJoin extends LuceneTestCase {
                random().nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
         childQuery = bq.build();
       }
+      if (random().nextBoolean()) {
+        childQuery = new RandomApproximationQuery(childQuery, random());
+      }
 
 
       final ScoreMode agg = ScoreMode.values()[random().nextInt(ScoreMode.values().length)];

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b6b44d44/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java
----------------------------------------------------------------------
diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java
index aa68d09..cb3762c 100644
--- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java
+++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java
@@ -87,25 +87,6 @@ public class TestBlockJoinValidation extends LuceneTestCase {
     assertTrue(expected.getMessage() != null && expected.getMessage().contains("Child query must not match same docs with parent filter"));
   }
 
-  public void testAdvanceValidationForToParentBjq() throws Exception {
-    int randomChildNumber = getRandomChildNumber(0);
-    // we need to make advance method meet wrong document, so random child number
-    // in BJQ must be greater than child number in Boolean clause
-    int nextRandomChildNumber = getRandomChildNumber(randomChildNumber);
-    Query parentQueryWithRandomChild = createChildrenQueryWithOneParent(nextRandomChildNumber);
-    ToParentBlockJoinQuery blockJoinQuery = new ToParentBlockJoinQuery(parentQueryWithRandomChild, parentsFilter, ScoreMode.None);
-    // advance() method is used by ConjunctionScorer, so we need to create Boolean conjunction query
-    BooleanQuery.Builder conjunctionQuery = new BooleanQuery.Builder();
-    WildcardQuery childQuery = new WildcardQuery(new Term("child", createFieldValue(randomChildNumber)));
-    conjunctionQuery.add(new BooleanClause(childQuery, BooleanClause.Occur.MUST));
-    conjunctionQuery.add(new BooleanClause(blockJoinQuery, BooleanClause.Occur.MUST));
-    
-    IllegalStateException expected = expectThrows(IllegalStateException.class, () -> {
-      indexSearcher.search(conjunctionQuery.build(), 1);
-    });
-    assertTrue(expected.getMessage() != null && expected.getMessage().contains("Child query must not match same docs with parent filter"));
-  }
-
   public void testNextDocValidationForToChildBjq() throws Exception {
     Query parentQueryWithRandomChild = createParentsQueryWithOneChild(getRandomChildNumber(0));