You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2015/02/13 17:58:33 UTC

svn commit: r1659601 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/search/ lucene/core/src/test/org/apache/lucene/search/

Author: jpountz
Date: Fri Feb 13 16:58:33 2015
New Revision: 1659601

URL: http://svn.apache.org/r1659601
Log:
LUCENE-6198: Two-phase execution for phrase queries and conjunctions.

Added:
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java
      - copied unchanged from r1659599, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java
      - copied, changed from r1659599, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java
      - copied, changed from r1659599, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/core/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/Scorer.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1659601&r1=1659600&r2=1659601&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Fri Feb 13 16:58:33 2015
@@ -49,6 +49,11 @@ Optimizations
 * LUCENE-6233 Speed up CheckIndex when the index has term vectors
   (Robert Muir, Mike McCandless)
 
+* LUCENE-6198: Added the TwoPhaseDocIdSetIterator API, exposed on scorers which
+  is for now only used on phrase queries and conjunctions in order to check
+  positions lazily if the phrase query is in a conjunction with other queries.
+  (Robert Muir, Adrien Grand)
+
 API Changes
 
 * LUCENE-6204, LUCENE-6208: Simplify CompoundFormat: remove files()

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java?rev=1659601&r1=1659600&r2=1659601&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java Fri Feb 13 16:58:33 2015
@@ -20,18 +20,14 @@ package org.apache.lucene.search;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.Comparator;
 import java.util.List;
 
-import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
 /** Scorer for conjunctions, sets of queries, all of which are required. */
 class ConjunctionScorer extends Scorer {
 
-  protected int lastDoc = -1;
-  protected final DocsAndFreqs[] docsAndFreqs;
-  private final DocsAndFreqs lead;
+  private final ConjunctionDISI disi;
   private final Scorer[] scorers;
   private final float coord;
 
@@ -44,68 +40,28 @@ class ConjunctionScorer extends Scorer {
     super(weight);
     assert required.containsAll(scorers);
     this.coord = coord;
-    this.docsAndFreqs = new DocsAndFreqs[required.size()];
-    for (int i = 0; i < required.size(); ++i) {
-      docsAndFreqs[i] = new DocsAndFreqs(required.get(i));
-    }
-    // Sort the array the first time to allow the least frequent DocsEnum to
-    // lead the matching.
-    ArrayUtil.timSort(docsAndFreqs, new Comparator<DocsAndFreqs>() {
-      @Override
-      public int compare(DocsAndFreqs o1, DocsAndFreqs o2) {
-        return Long.compare(o1.cost, o2.cost);
-      }
-    });
-
-    lead = docsAndFreqs[0]; // least frequent DocsEnum leads the intersection
-
+    this.disi = ConjunctionDISI.intersect(required);
     this.scorers = scorers.toArray(new Scorer[scorers.size()]);
   }
 
-  private int doNext(int doc) throws IOException {
-    for(;;) {
-      // doc may already be NO_MORE_DOCS here, but we don't check explicitly
-      // since all scorers should advance to NO_MORE_DOCS, match, then
-      // return that value.
-      advanceHead: for(;;) {
-        for (int i = 1; i < docsAndFreqs.length; i++) {
-          // invariant: docsAndFreqs[i].doc <= doc at this point.
-
-          // docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead"
-          // on the previous iteration and the advance on the lead scorer exactly matched.
-          if (docsAndFreqs[i].doc < doc) {
-            docsAndFreqs[i].doc = docsAndFreqs[i].iterator.advance(doc);
-
-            if (docsAndFreqs[i].doc > doc) {
-              // DocsEnum beyond the current doc - break and advance lead to the new highest doc.
-              doc = docsAndFreqs[i].doc;
-              break advanceHead;
-            }
-          }
-        }
-        // success - all DocsEnums are on the same doc
-        return doc;
-      }
-      // advance head for next iteration
-      doc = lead.doc = lead.iterator.advance(doc);
-    }
+  @Override
+  public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
+    return disi.asTwoPhaseIterator();
   }
 
   @Override
   public int advance(int target) throws IOException {
-    lead.doc = lead.iterator.advance(target);
-    return lastDoc = doNext(lead.doc);
+    return disi.advance(target);
   }
 
   @Override
   public int docID() {
-    return lastDoc;
+    return disi.docID();
   }
 
   @Override
   public int nextDoc() throws IOException {
-    lead.doc = lead.iterator.nextDoc();
-    return lastDoc = doNext(lead.doc);
+    return disi.nextDoc();
   }
 
   @Override
@@ -120,7 +76,7 @@ class ConjunctionScorer extends Scorer {
 
   @Override
   public int freq() {
-    return docsAndFreqs.length;
+    return scorers.length;
   }
 
   @Override
@@ -145,12 +101,12 @@ class ConjunctionScorer extends Scorer {
 
   @Override
   public long cost() {
-    return lead.iterator.cost();
+    return disi.cost();
   }
 
   @Override
   public Collection<ChildScorer> getChildren() {
-    ArrayList<ChildScorer> children = new ArrayList<>(docsAndFreqs.length);
+    ArrayList<ChildScorer> children = new ArrayList<>();
     for (Scorer scorer : scorers) {
       children.add(new ChildScorer(scorer, "MUST"));
     }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java?rev=1659601&r1=1659600&r2=1659601&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java Fri Feb 13 16:58:33 2015
@@ -18,9 +18,11 @@ package org.apache.lucene.search;
  */
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 
-import org.apache.lucene.index.*;
+import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.BytesRef;
 
@@ -49,10 +51,11 @@ final class ExactPhraseScorer extends Sc
     }
   }
 
+  private final ConjunctionDISI conjunction;
+
   private final ChunkState[] chunkStates;
   private final PostingsEnum lead;
 
-  private int docID = -1;
   private int freq;
 
   private final Similarity.SimScorer docScorer;
@@ -72,49 +75,46 @@ final class ExactPhraseScorer extends Sc
     // min(cost)
     cost = lead.cost();
 
+    List<DocIdSetIterator> iterators = new ArrayList<>();
     for(int i=0;i<postings.length;i++) {
       chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position);
+      iterators.add(postings[i].postings);
     }
+    conjunction = ConjunctionDISI.intersect(iterators);
   }
-  
+
+  @Override
+  public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
+    return new TwoPhaseDocIdSetIterator() {
+
+      @Override
+      public boolean matches() throws IOException {
+        return phraseFreq() > 0;
+      }
+
+      @Override
+      public DocIdSetIterator approximation() {
+        return conjunction;
+      }
+    };
+  }
+
   private int doNext(int doc) throws IOException {
-    for(;;) {
-      // TODO: don't dup this logic from conjunctionscorer :)
-      advanceHead: for(;;) {
-        for (int i = 1; i < chunkStates.length; i++) {
-          final PostingsEnum de = chunkStates[i].posEnum;
-          if (de.docID() < doc) {
-            int d = de.advance(doc);
-
-            if (d > doc) {
-              // DocsEnum beyond the current doc - break and advance lead to the new highest doc.
-              doc = d;
-              break advanceHead;
-            }
-          }
-        }
-        // all DocsEnums are on the same doc
-        if (doc == NO_MORE_DOCS) {
-          return doc;
-        } else if (phraseFreq() > 0) {
-          return doc;            // success: matches phrase
-        } else {
-          doc = lead.nextDoc();  // doesn't match phrase
-        }
+    for (;; doc = conjunction.nextDoc()) {
+      if (doc == NO_MORE_DOCS || phraseFreq() > 0) {
+        return doc;
       }
-      // advance head for next iteration
-      doc = lead.advance(doc);
     }
   }
 
   @Override
   public int nextDoc() throws IOException {
-    return docID = doNext(lead.nextDoc());
+    return doNext(conjunction.nextDoc());
   }
 
   @Override
   public int advance(int target) throws IOException {
-    return docID = doNext(lead.advance(target));
+    return doNext(conjunction.advance(target));
   }
 
   @Override
@@ -149,12 +149,12 @@ final class ExactPhraseScorer extends Sc
 
   @Override
   public int docID() {
-    return docID;
+    return conjunction.docID();
   }
 
   @Override
   public float score() {
-    return docScorer.score(docID, freq);
+    return docScorer.score(docID(), freq);
   }
 
   private int phraseFreq() throws IOException {

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/Scorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/Scorer.java?rev=1659601&r1=1659600&r2=1659601&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/Scorer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/Scorer.java Fri Feb 13 16:58:33 2015
@@ -60,7 +60,7 @@ public abstract class Scorer extends Pos
    * {@link LeafCollector#collect}.
    */
   public abstract float score() throws IOException;
-  
+
   /** returns parent Weight
    * @lucene.experimental
    */
@@ -99,4 +99,23 @@ public abstract class Scorer extends Pos
       this.relationship = relationship;
     }
   }
+
+  /**
+   * Optional method: Return a {@link TwoPhaseDocIdSetIterator} view of this
+   * {@link Scorer}. A return value of {@code null} indicates that
+   * two-phase iteration is not supported.
+   *
+   * Note that the returned {@link TwoPhaseDocIdSetIterator}'s
+   * {@link TwoPhaseDocIdSetIterator#approximation() approximation} must
+   * advance synchronously with this iterator: advancing the approximation must
+   * advance this iterator and vice-versa.
+   *
+   * Implementing this method is typically useful on {@link Scorer}s
+   * that have a high per-document overhead in order to confirm matches.
+   *
+   * The default implementation returns {@code null}.
+   */
+  public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
+    return null;
+  }
 }

Copied: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java (from r1659599, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java?p2=lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java&p1=lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java&r1=1659599&r2=1659601&rev=1659601&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/TwoPhaseDocIdSetIterator.java Fri Feb 13 16:58:33 2015
@@ -30,7 +30,7 @@ public abstract class TwoPhaseDocIdSetIt
 
   /** Return a {@link DocIdSetIterator} view of the provided
    *  {@link TwoPhaseDocIdSetIterator}. */
-  public static DocIdSetIterator asDocIdSetIterator(TwoPhaseDocIdSetIterator twoPhaseIterator) {
+  public static DocIdSetIterator asDocIdSetIterator(final TwoPhaseDocIdSetIterator twoPhaseIterator) {
     final DocIdSetIterator approximation = twoPhaseIterator.approximation();
     return new DocIdSetIterator() {
 

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java?rev=1659601&r1=1659600&r2=1659601&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java Fri Feb 13 16:58:33 2015
@@ -590,4 +590,33 @@ public class TestBooleanQuery extends Lu
     w.close();
     dir.close();
   }
+
+  public void testConjunctionSupportsApproximations() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    Field f = newTextField("field", "a b c", Field.Store.NO);
+    doc.add(f);
+    w.addDocument(doc);
+    w.commit();
+
+    DirectoryReader reader = w.getReader();
+    final IndexSearcher searcher = new IndexSearcher(reader);
+
+    PhraseQuery pq = new PhraseQuery();
+    pq.add(new Term("field", "a"));
+    pq.add(new Term("field", "b"));
+
+    BooleanQuery q = new BooleanQuery();
+    q.add(pq, Occur.MUST);
+    q.add(new TermQuery(new Term("field", "c")), Occur.FILTER);
+
+    final Weight weight = searcher.createNormalizedWeight(q, random().nextBoolean());
+    final Scorer scorer = weight.scorer(reader.leaves().get(0), null);
+    assertNotNull(scorer.asTwoPhaseIterator());
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
 }

Copied: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java (from r1659599, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java?p2=lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java&p1=lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java&r1=1659599&r2=1659601&rev=1659601&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java Fri Feb 13 16:58:33 2015
@@ -55,7 +55,7 @@ public class TestConjunctionDISI extends
    * an exception in order to make sure that {@link ConjunctionDISI} takes
    * advantage of the {@link TwoPhaseDocIdSetIterator} view.
    */
-  private static Scorer scorer(DocIdSetIterator it, TwoPhaseDocIdSetIterator twoPhaseIterator) {
+  private static Scorer scorer(final DocIdSetIterator it, final TwoPhaseDocIdSetIterator twoPhaseIterator) {
     return new Scorer(null) {
 
       @Override