You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2012/03/09 23:24:28 UTC
svn commit: r1299077 - in /lucene/dev/branches/branch_3x/lucene: ./ core/src/java/org/apache/lucene/search/ core/src/test/org/apache/lucene/search/

Author: doronc
Date: Fri Mar  9 22:24:27 2012
New Revision: 1299077

URL: http://svn.apache.org/viewvc?rev=1299077&view=rev
Log:
LUCENE-3821: SloppyPhraseScorer missed documents that ExactPhraseScorer finds

Modified:
    lucene/dev/branches/branch_3x/lucene/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
    lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java
    lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
    lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java
    lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
    lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
    lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java

Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1299077&r1=1299076&r2=1299077&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Fri Mar  9 22:24:27 2012
@@ -222,7 +222,13 @@ Bug fixes
   from the delegate DocIdSet.iterator(), which is allowed to return
   null by DocIdSet specification when no documents match.
   (Shay Banon via Uwe Schindler)
-
+  
+* LUCENE-3821: SloppyPhraseScorer missed documents that ExactPhraseScorer finds
+  When phrase queru had repeating terms (e.g. "yes ho yes")  
+  sloppy query missed documents that exact query matched. 
+  Fixed except when for repeating multiterms (e.g. "yes ho yes|no").
+  (Robert Muir, Doron Cohen)
+    
 Optimizations
 
 * LUCENE-3653: Improve concurrency in VirtualMethod and AttributeSource by

Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java?rev=1299077&r1=1299076&r2=1299077&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java Fri Mar  9 22:24:27 2012
@@ -197,7 +197,7 @@ public class MultiPhraseQuery extends Qu
             return null;
         }
 
-        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(p, docFreq, positions.get(pos).intValue(), terms[0]);
+        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(p, docFreq, positions.get(pos).intValue(), terms);
       }
 
       // sort by increasing docFreq order
@@ -326,9 +326,19 @@ public class MultiPhraseQuery extends Qu
     }
 
     buffer.append("\"");
-    Iterator<Term[]> i = termArrays.iterator();
-    while (i.hasNext()) {
-      Term[] terms = i.next();
+    int lastPos = -1;
+    boolean first = true;
+    for (int i=0; i<termArrays.size(); i++) {
+      Term[] terms = termArrays.get(i);
+      int position = positions.get(i);
+      if (first) {
+        first = false;
+      } else {
+        buffer.append(" ");
+        for (int j=1; j<(position-lastPos); j++) {
+          buffer.append("? ");
+        }
+      }
       if (terms.length > 1) {
         buffer.append("(");
         for (int j = 0; j < terms.length; j++) {
@@ -340,8 +350,7 @@ public class MultiPhraseQuery extends Qu
       } else {
         buffer.append(terms[0].text());
       }
-      if (i.hasNext())
-        buffer.append(" ");
+      lastPos = position;
     }
     buffer.append("\"");
 

Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java?rev=1299077&r1=1299076&r2=1299077&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java Fri Mar  9 22:24:27 2012
@@ -31,12 +31,15 @@ final class PhrasePositions {
   final int ord;                                  // unique across all PhrasePositions instances
   TermPositions tp;				  // stream of positions
   PhrasePositions next;				  // used to make lists
-  PhrasePositions nextRepeating;	                // link to next repeating pp: standing for same term in different query offsets
-
-  PhrasePositions(TermPositions t, int o, int ord) {
+  int rptGroup = -1; // >=0 indicates that this is a repeating PP
+  int rptInd; // index in the rptGroup
+  final Term[] terms; // for repetitions initialization 
+  
+  PhrasePositions(TermPositions t, int o, int ord, Term[] terms) {
     tp = t;
     offset = o;
     this.ord = ord;
+    this.terms = terms;
   }
 
   final boolean next() throws IOException {	  // increments to next doc
@@ -85,8 +88,8 @@ final class PhrasePositions {
   @Override
   public String toString() {
     String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count;
-    if (nextRepeating!=null) {
-      s += " rpt[ "+nextRepeating+" ]";
+    if (rptGroup >=0 ) {
+      s += " rpt:"+rptGroup+",i"+rptInd;
     }
     return s;
   }

Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java?rev=1299077&r1=1299076&r2=1299077&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java Fri Mar  9 22:24:27 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.search;
  */
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Set;
 import java.util.ArrayList;
 
@@ -122,23 +123,46 @@ public class PhraseQuery extends Query {
     final TermPositions postings;
     final int docFreq;
     final int position;
-    final Term term;
+    final Term[] terms;
+    final int nTerms; // for faster comparisons 
 
-    public PostingsAndFreq(TermPositions postings, int docFreq, int position, Term term) {
+    public PostingsAndFreq(TermPositions postings, int docFreq, int position, Term... terms) {
       this.postings = postings;
       this.docFreq = docFreq;
       this.position = position;
-      this.term = term;
+      nTerms = terms==null ? 0 : terms.length;
+      if (nTerms>0) {
+        if (terms.length==1) {
+          this.terms = terms;
+        } else {
+          Term[] terms2 = new Term[terms.length];
+          System.arraycopy(terms, 0, terms2, 0, terms.length);
+          Arrays.sort(terms2);
+          this.terms = terms2;
+        }
+      } else {
+        this.terms = null;
+      }
     }
 
     public int compareTo(PostingsAndFreq other) {
-      if (docFreq == other.docFreq) {
-        if (position == other.position) {
-          return term.compareTo(other.term);
-        }
+      if (docFreq != other.docFreq) {
+        return docFreq - other.docFreq;
+      }
+      if (position != other.position) {
         return position - other.position;
       }
-      return docFreq - other.docFreq;
+      if (nTerms != other.nTerms) {
+        return nTerms - other.nTerms;
+      }
+      if (nTerms == 0) {
+        return 0;
+      }
+      for (int i=0; i<terms.length; i++) {
+        int res = terms[i].compareTo(other.terms[i]);
+        if (res!=0) return res;
+      }
+      return 0;
     }
 
     @Override
@@ -147,7 +171,9 @@ public class PhraseQuery extends Query {
       int result = 1;
       result = prime * result + docFreq;
       result = prime * result + position;
-      result = prime * result + ((term == null) ? 0 : term.hashCode());
+      for (int i=0; i<nTerms; i++) {
+        result = prime * result + terms[i].hashCode(); 
+      }
       return result;
     }
 
@@ -159,10 +185,8 @@ public class PhraseQuery extends Query {
       PostingsAndFreq other = (PostingsAndFreq) obj;
       if (docFreq != other.docFreq) return false;
       if (position != other.position) return false;
-      if (term == null) {
-        if (other.term != null) return false;
-      } else if (!term.equals(other.term)) return false;
-      return true;
+      if (terms == null) return other.terms == null;
+      return Arrays.equals(terms, other.terms);
     }
   }
 

Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java?rev=1299077&r1=1299076&r2=1299077&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java Fri Mar  9 22:24:27 2012
@@ -49,11 +49,11 @@ abstract class PhraseScorer extends Scor
     // this allows to easily identify a matching (exact) phrase 
     // when all PhrasePositions have exactly the same position.
     if (postings.length > 0) {
-      min = new PhrasePositions(postings[0].postings, postings[0].position, 0);
+      min = new PhrasePositions(postings[0].postings, postings[0].position, 0, postings[0].terms);
       max = min;
       max.doc = -1;
       for (int i = 1; i < postings.length; i++) {
-        PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
+        PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
         max.next = pp;
         max = pp;
         max.doc = -1;

Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java?rev=1299077&r1=1299076&r2=1299077&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java Fri Mar  9 22:24:27 2012
@@ -19,65 +19,78 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.util.OpenBitSet;
 
 final class SloppyPhraseScorer extends PhraseScorer {
-    private int slop;
-  private boolean checkedRepeats; // flag to only check in first candidate doc in case there are no repeats
-  private boolean hasRepeats; // flag indicating that there are repeats (already checked in first candidate doc)
-  private PhraseQueue pq; // for advancing min position
-  private PhrasePositions[] nrPps; // non repeating pps ordered by their query offset
-
-    SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
-                       int slop, byte[] norms) {
-        super(weight, postings, similarity, norms);
-        this.slop = slop;
-    }
-
-    /**
-     * Score a candidate doc for all slop-valid position-combinations (matches) 
-     * encountered while traversing/hopping the PhrasePositions.
-     * <br> The score contribution of a match depends on the distance: 
-     * <br> - highest score for distance=0 (exact match).
-     * <br> - score gets lower as distance gets higher.
-     * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice: 
-     * once for "a b" (distance=0), and once for "b a" (distance=2).
-     * <br>Possibly not all valid combinations are encountered, because for efficiency  
-     * we always propagate the least PhrasePosition. This allows to base on 
-     * PriorityQueue and move forward faster. 
-     * As result, for example, document "a b c b a"
-     * would score differently for queries "a b c"~4 and "c b a"~4, although 
-     * they really are equivalent. 
-     * Similarly, for doc "a b c b a f g", query "c b"~2 
-     * would get same score as "g f"~2, although "c b"~2 could be matched twice.
-     * We may want to fix this in the future (currently not, for performance reasons).
-     */
-    @Override
+  
+  private final int slop;
+  private final int numPostings;
+  private final PhraseQueue pq; // for advancing min position
+  
+  private int end; // current largest phrase position  
+
+  private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
+  private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
+  private boolean hasMultiTermRpts; //  
+  private PhrasePositions[][] rptGroups; // in each group are PPs that repeats each other (i.e. same term), sorted by (query) offset 
+  private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps 
+  
+  SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
+      int slop, byte[] norms) throws IOException {
+    super(weight, postings, similarity, norms);
+    this.slop = slop;
+    this.numPostings = postings==null ? 0 : postings.length;
+    pq = new PhraseQueue(postings.length);
+  }
+
+  /**
+   * Score a candidate doc for all slop-valid position-combinations (matches) 
+   * encountered while traversing/hopping the PhrasePositions.
+   * <br> The score contribution of a match depends on the distance: 
+   * <br> - highest score for distance=0 (exact match).
+   * <br> - score gets lower as distance gets higher.
+   * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice: 
+   * once for "a b" (distance=0), and once for "b a" (distance=2).
+   * <br>Possibly not all valid combinations are encountered, because for efficiency  
+   * we always propagate the least PhrasePosition. This allows to base on 
+   * PriorityQueue and move forward faster. 
+   * As result, for example, document "a b c b a"
+   * would score differently for queries "a b c"~4 and "c b a"~4, although 
+   * they really are equivalent. 
+   * Similarly, for doc "a b c b a f g", query "c b"~2 
+   * would get same score as "g f"~2, although "c b"~2 could be matched twice.
+   * We may want to fix this in the future (currently not, for performance reasons).
+   */
+  @Override
   protected float phraseFreq() throws IOException {
-    int end = initPhrasePositions();
-    //printPositions(System.err, "INIT DONE:");
-    if (end==Integer.MIN_VALUE) {
+    if (!initPhrasePositions()) {
       return 0.0f;
     }
-    
     float freq = 0.0f;
     PhrasePositions pp = pq.pop();
     int matchLength = end - pp.position;
-    int next = pq.size()>0 ? pq.top().position : pp.position;
-    //printQueue(System.err, pp, "Bef Loop: next="+next+" mlen="+end+"-"+pp.position+"="+matchLength);
-    while (pp.nextPosition() && (end=advanceRepeats(pp, end)) != Integer.MIN_VALUE)  {
-      if (pp.position > next) {
-        //printQueue(System.err, pp, "A: >next="+next+" matchLength="+matchLength);
+    int next = pq.top().position; 
+    while (advancePP(pp)) {
+      if (hasRpts && !advanceRpts(pp)) {
+        break; // pps exhausted
+      }
+      if (pp.position > next) { // done minimizing current match-length 
         if (matchLength <= slop) {
           freq += getSimilarity().sloppyFreq(matchLength); // score match
         }      
         pq.add(pp);
         pp = pq.pop();
-        next = pq.size()>0 ? pq.top().position : pp.position;
+        next = pq.top().position;
         matchLength = end - pp.position;
-        //printQueue(System.err, pp, "B: >next="+next+" matchLength="+matchLength);
       } else {
         int matchLength2 = end - pp.position;
-        //printQueue(System.err, pp, "C: mlen2<mlen: next="+next+" matchLength="+matchLength+" matchLength2="+matchLength2);
         if (matchLength2 < matchLength) {
           matchLength = matchLength2;
         }
@@ -89,53 +102,82 @@ final class SloppyPhraseScorer extends P
     return freq;
   }
 
-  /**
-   * Advance repeating pps of an input (non-repeating) pp.
-   * Return a modified 'end' in case pp or its repeats exceeds original 'end'.
-   * "Dirty" trick: when there are repeats, modifies pp's position to that of 
-   * least repeater of pp (needed when due to holes repeaters' positions are "back").
-   */
-  private int advanceRepeats(PhrasePositions pp, int end) throws IOException {
-    int repeatsEnd = end;
-    if (pp.position > repeatsEnd) {
-      repeatsEnd = pp.position;
+  /** advance a PhrasePosition and update 'end', return false if exhausted */
+  private boolean advancePP(PhrasePositions pp) throws IOException {
+    if (!pp.nextPosition()) {
+      return false;
     }
-    if (!hasRepeats) {
-      return repeatsEnd;
+    if (pp.position > end) {
+      end = pp.position;
     }
-    int tpPos = tpPos(pp);
-    for (PhrasePositions pp2=pp.nextRepeating; pp2!=null; pp2=pp2.nextRepeating) {
-      while (tpPos(pp2) <= tpPos) {
-        if (!pp2.nextPosition()) {
-          return Integer.MIN_VALUE;
-        }
+    return true;
+  }
+  
+  /** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
+   * of the two colliding pps. Note that there can only be one collision, as by the initialization
+   * there were no collisions before pp was advanced.  */
+  private boolean advanceRpts(PhrasePositions pp) throws IOException {
+    if (pp.rptGroup < 0) {
+      return true; // not a repeater
+    }
+    PhrasePositions[] rg = rptGroups[pp.rptGroup];
+    OpenBitSet bits = new OpenBitSet(rg.length); // for re-queuing after collisions are resolved
+    int k0 = pp.rptInd;
+    int k;
+    while((k=collide(pp)) >= 0) {
+      pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
+      if (!advancePP(pp)) {
+        return false; // exhausted
+      }
+      if (k != k0) { // careful: mark only those currently in the queue
+        bits.set(k); // mark that pp2 need to be re-queued
       }
-      tpPos = tpPos(pp2);
-      if (pp2.position > repeatsEnd) {
-        repeatsEnd = pp2.position;
-      }
-      // "dirty" trick: with holes, given a pp, its repeating pp2 might have smaller position.
-      // so in order to have the right "start" in matchLength computation we fake pp.position.
-      // this relies on pp.nextPosition() not using pp.position.
-      if (pp2.position < pp.position) { 
-        pp.position = pp2.position;     
+    }
+    // collisions resolved, now re-queue
+    // empty (partially) the queue until seeing all pps advanced for resolving collisions
+    int n = 0;
+    while (bits.cardinality() > 0) {
+      PhrasePositions pp2 = pq.pop();
+      rptStack[n++] = pp2;
+      if (pp2.rptGroup >= 0 && bits.get(pp2.rptInd)) {
+        bits.clear(pp2.rptInd);
       }
     }
-    return repeatsEnd;
+    // add back to queue
+    for (int i=n-1; i>=0; i--) {
+      pq.add(rptStack[i]);
+    }
+    return true;
+  }
+
+  /** compare two pps, but only by position and offset */
+  private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) {
+    if (pp.position < pp2.position ||
+        (pp.position == pp2.position && pp.offset < pp2.offset)) {
+      return pp;
+    }
+    return pp2;
+  }
+
+  /** index of a pp2 colliding with pp, or -1 if none */
+  private int collide(PhrasePositions pp) {
+    int tpPos = tpPos(pp);
+    PhrasePositions[] rg = rptGroups[pp.rptGroup];
+    for (int i=0; i<rg.length; i++) {
+      PhrasePositions pp2 = rg[i];
+      if (pp2 != pp && tpPos(pp2) == tpPos) {
+        return pp2.rptInd;
+      }
+    }
+    return -1;
   }
 
   /**
    * Initialize PhrasePositions in place.
-   * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
+   * A one time initialization for this scorer (on first doc matching all terms):
    * <ul>
-   *  <li>Detect groups of repeating pps: those with same tpPos (tpPos==position in the doc) but different offsets in query.
-   *  <li>For each such group:
-   *  <ul>
-   *   <li>form an inner linked list of the repeating ones.
-   *   <li>propagate all group members but first so that they land on different tpPos().
-   *  </ul>
-   *  <li>Mark whether there are repetitions at all, so that scoring queries with no repetitions has no overhead due to this computation.
-   *  <li>Insert to pq only non repeating PPs, or PPs that are the first in a repeating group.
+   *  <li>Check if there are repetitions
+   *  <li>If there are, find groups of repetitions.
    * </ul>
    * Examples:
    * <ol>
@@ -143,118 +185,305 @@ final class SloppyPhraseScorer extends P
    *  <li>repetitions: <b>"ho my my"~2</b>
    *  <li>repetitions: <b>"my ho my"~2</b>
    * </ol>
-   * @return end (max position), or Integer.MIN_VALUE if any term ran out (i.e. done) 
+   * @return false if PPs are exhausted (and so current doc will not be a match) 
    */
-  private int initPhrasePositions() throws IOException {
-    int end = Integer.MIN_VALUE;
-    
-    // no repeats at all (most common case is also the simplest one)
-    if (checkedRepeats && !hasRepeats) {
-      // build queue from list
-      pq.clear();
-      for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) {  // iterate cyclic list: done once handled max
-        pp.firstPosition();
-        if (pp.position > end) {
-          end = pp.position;
-        }
-        pq.add(pp);         // build pq from list
+  private boolean initPhrasePositions() throws IOException {
+    end = Integer.MIN_VALUE;
+    if (!checkedRpts) {
+      return initFirstTime();
+    }
+    if (!hasRpts) {
+      initSimple();
+      return true; // PPs available
+    }
+    return initComplex();
+  }
+  
+  /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */
+  private void initSimple() throws IOException {
+    //System.err.println("initSimple: doc: "+min.doc);
+    pq.clear();
+    // position pps and build queue from list
+    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) {  // iterate cyclic list: done once handled max
+      pp.firstPosition();
+      if (pp.position > end) {
+        end = pp.position;
       }
-      return end;
+      pq.add(pp);
     }
-    
-    //printPositions(System.err, "Init: 1: Bef position");
-    
-    // position the pp's
-    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) {  // iterate cyclic list: done once handled max  
+  }
+  
+  /** with repeats: not so simple. */
+  private boolean initComplex() throws IOException {
+    //System.err.println("initComplex: doc: "+min.doc);
+    placeFirstPositions();
+    if (!advanceRepeatGroups()) {
+      return false; // PPs exhausted
+    }
+    fillQueue();
+    return true; // PPs available
+  }
+
+  /** move all PPs to their first position */
+  private void placeFirstPositions() throws IOException {
+    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
       pp.firstPosition();
     }
-    
-    //printPositions(System.err, "Init: 2: Aft position");
-    
-    // one time initialization for this scorer (done only for the first candidate doc)
-    if (!checkedRepeats) {
-      checkedRepeats = true;
-      ArrayList<PhrasePositions> ppsA = new ArrayList<PhrasePositions>();
-      PhrasePositions dummyPP = new PhrasePositions(null, -1, -1);
-      // check for repeats
-      for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) {  // iterate cyclic list: done once handled max
-        if (pp.nextRepeating != null) {
-          continue; // a repetition of an earlier pp
+  }
+
+  /** Fill the queue (all pps are already placed */
+  private void fillQueue() {
+    pq.clear();
+    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) {  // iterate cyclic list: done once handled max
+      if (pp.position > end) {
+        end = pp.position;
+      }
+      pq.add(pp);
+    }
+  }
+
+  /** At initialization (each doc), each repetition group is sorted by (query) offset.
+   * This provides the start condition: no collisions.
+   * <p>Case 1: no multi-term repeats<br>
+   * It is sufficient to advance each pp in the group by one less than its group index.
+   * So lesser pp is not advanced, 2nd one advance once, 3rd one advanced twice, etc.
+   * <p>Case 2: multi-term repeats<br>
+   * 
+   * @return false if PPs are exhausted. 
+   */
+  private boolean advanceRepeatGroups() throws IOException {
+    for (PhrasePositions[] rg: rptGroups) { 
+      if (hasMultiTermRpts) {
+        // more involved, some may not collide
+        int incr;
+        for (int i=0; i<rg.length; i+=incr) {
+          incr = 1;
+          PhrasePositions pp = rg[i];
+          int k;
+          while((k=collide(pp)) >= 0) {
+            PhrasePositions pp2 = lesser(pp, rg[k]);
+            if (!advancePP(pp2)) {  // at initialization always advance pp with higher offset
+              return false; // exhausted
+            }
+            if (pp2.rptInd < i) { // should not happen?
+              incr = 0;
+              break;
+            }
+          }
+        }
+      } else {
+        // simpler, we know exactly how much to advance
+        for (int j=1; j<rg.length; j++) {
+          for (int k=0; k<j; k++) {
+            if (!rg[j].nextPosition()) {
+              return false; // PPs exhausted
+            }
+          }
         }
-        ppsA.add(pp);
+      }
+    }
+    return true; // PPs available
+  }
+  
+  /** initialize with checking for repeats. Heavy work, but done only for the first candidate doc.<p>
+   * If there are repetitions, check if multi-term postings (MTP) are involved.<p>
+   * Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.<br>
+   * With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".<br>
+   * For example P1 has {A,B}, P1 has {B,C}, and the first doc is: "A C B". At start, P1 would point
+   * to "A", p2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.<p>
+   * The more complex initialization has two parts:<br>
+   * (1) identification of repetition groups.<br>
+   * (2) advancing repeat groups at the start of the doc.<br>
+   * For (1), a possible solution is to just create a single repetition group, 
+   * made of all repeating pps. But this would slow down the check for collisions, 
+   * as all pps would need to be checked. Instead, we compute "connected regions" 
+   * on the bipartite graph of postings and terms.  
+   */
+  private boolean initFirstTime() throws IOException {
+    //System.err.println("initFirstTime: doc: "+min.doc);
+    checkedRpts = true;
+    placeFirstPositions();
+
+    LinkedHashMap<Term,Integer> rptTerms = repeatingTerms(); 
+    hasRpts = !rptTerms.isEmpty();
+
+    if (hasRpts) {
+      rptStack = new PhrasePositions[numPostings]; // needed with repetitions
+      ArrayList<ArrayList<PhrasePositions>> rgs = gatherRptGroups(rptTerms);
+      sortRptGroups(rgs);
+      if (!advanceRepeatGroups()) {
+        return false; // PPs exhausted
+      }
+    }
+    
+    fillQueue();
+    return true; // PPs available
+  }
+
+  /** sort each repetition group by (query) offset. 
+   * Done only once (at first doc) and allows to initialize faster for each doc. */
+  private void sortRptGroups(ArrayList<ArrayList<PhrasePositions>> rgs) {
+    rptGroups = new PhrasePositions[rgs.size()][];
+    Comparator<PhrasePositions> cmprtr = new Comparator<PhrasePositions>() {
+      public int compare(PhrasePositions pp1, PhrasePositions pp2) {
+        return pp1.offset - pp2.offset;
+      }
+    };
+    for (int i=0; i<rptGroups.length; i++) {
+      PhrasePositions[] rg = rgs.get(i).toArray(new PhrasePositions[0]);
+      Arrays.sort(rg, cmprtr);
+      rptGroups[i] = rg;
+      for (int j=0; j<rg.length; j++) {
+        rg[j].rptInd = j; // we use this index for efficient re-queuing
+      }
+    }
+  }
+
+  /** Detect repetition groups. Done once - for first doc */
+  private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term,Integer> rptTerms) throws IOException {
+    PhrasePositions[] rpp = repeatingPPs(rptTerms); 
+    ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<ArrayList<PhrasePositions>>();
+    if (!hasMultiTermRpts) {
+      // simpler - no multi-terms - can base on positions in first doc
+      for (int i=0; i<rpp.length; i++) {
+        PhrasePositions pp = rpp[i];
+        if (pp.rptGroup >=0) continue; // already marked as a repetition
         int tpPos = tpPos(pp);
-        for (PhrasePositions prevB=pp, pp2=pp.next; pp2!= min; pp2=pp2.next) {
+        for (int j=i+1; j<rpp.length; j++) {
+          PhrasePositions pp2 = rpp[j];
           if (
-              pp2.nextRepeating != null  // already detected as a repetition of an earlier pp
-              || pp.offset == pp2.offset // not a repetition: the two PPs are originally in same offset in the query! 
+              pp2.rptGroup >=0        // already marked as a repetition
+              || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query! 
               || tpPos(pp2) != tpPos) {  // not a repetition
             continue; 
           }
           // a repetition
-          hasRepeats = true;
-          prevB.nextRepeating = pp2;  // add pp2 to the repeats linked list
-          pp2.nextRepeating = dummyPP; // allows not to handle the last pp in a sub-list
-          prevB = pp2;
+          int g = pp.rptGroup;
+          if (g < 0) {
+            g = res.size();
+            pp.rptGroup = g;  
+            ArrayList<PhrasePositions> rl = new ArrayList<PhrasePositions>(2);
+            rl.add(pp);
+            res.add(rl); 
+          }
+          pp2.rptGroup = g;
+          res.get(g).add(pp2);
         }
       }
-      if (hasRepeats) {
-        // clean dummy markers
-        for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) {  // iterate cyclic list: done once handled max
-          if (pp.nextRepeating == dummyPP) {
-            pp.nextRepeating = null;
+    } else {
+      // more involved - has multi-terms
+      ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<HashSet<PhrasePositions>>();
+      ArrayList<OpenBitSet> bb = ppTermsBitSets(rpp, rptTerms);
+      unionTermGroups(bb);
+      HashMap<Term,Integer> tg = termGroups(rptTerms, bb);
+      HashSet<Integer> distinctGroupIDs = new HashSet<Integer>(tg.values());
+      for (int i=0; i<distinctGroupIDs.size(); i++) {
+        tmp.add(new HashSet<PhrasePositions>());
+      }
+      for (PhrasePositions pp : rpp) {
+        for (Term t: pp.terms) {
+          if (rptTerms.containsKey(t)) {
+            int g = tg.get(t);
+            tmp.get(g).add(pp);
+            assert pp.rptGroup==-1 || pp.rptGroup==g;  
+            pp.rptGroup = g;
           }
         }
       }
-      nrPps = ppsA.toArray(new PhrasePositions[0]);
-      pq = new PhraseQueue(nrPps.length);
+      for (HashSet<PhrasePositions> hs : tmp) {
+        res.add(new ArrayList<PhrasePositions>(hs));
+      }
     }
-    
-    //printPositions(System.err, "Init: 3: Aft check-repeats");
-    
-    // with repeats must advance some repeating pp's so they all start with differing tp's
-    if (hasRepeats) {
-      for (PhrasePositions pp: nrPps) {
-        if ((end=advanceRepeats(pp, end)) == Integer.MIN_VALUE) {
-          return Integer.MIN_VALUE; // ran out of a term -- done (no valid matches in current doc)
+    return res;
+  }
+
+  /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */
+  private final int tpPos(PhrasePositions pp) {
+    return pp.position + pp.offset;
+  }
+
+  /** find repeating terms and assign them ordinal values */
+  private LinkedHashMap<Term,Integer> repeatingTerms() {
+    LinkedHashMap<Term,Integer> tord = new LinkedHashMap<Term,Integer>();
+    HashMap<Term,Integer> tcnt = new HashMap<Term,Integer>();
+    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+      for (Term t : pp.terms) {
+        Integer cnt0 = tcnt.get(t);
+        Integer cnt = cnt0==null ? new Integer(1) : new Integer(1+cnt0.intValue());
+        tcnt.put(t, cnt);
+        if (cnt==2) {
+          tord.put(t,tord.size());
         }
       }
     }
-    
-    //printPositions(System.err, "Init: 4: Aft advance-repeats");
-    
-    // build queue from non repeating pps 
-    pq.clear();
-    for (PhrasePositions pp: nrPps) {
-      if (pp.position > end) {
-        end = pp.position;
+    return tord;
+  }
+
+  /** find repeating pps, and for each, if has multi-terms, update this.hasMultiTermRpts */
+  private PhrasePositions[] repeatingPPs(HashMap<Term,Integer> rptTerms) {
+    ArrayList<PhrasePositions> rp = new ArrayList<PhrasePositions>(); 
+    for (PhrasePositions pp=min,prev=null; prev!=max; pp=(prev=pp).next) { // iterate cyclic list: done once handled max
+      for (Term t : pp.terms) {
+        if (rptTerms.containsKey(t)) {
+          rp.add(pp);
+          hasMultiTermRpts |= (pp.terms.length > 1);
+          break;
+        }
       }
-      pq.add(pp);
     }
-    
-    return end;
+    return rp.toArray(new PhrasePositions[0]);
   }
   
-  /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */
-  private final int tpPos(PhrasePositions pp) {
-    return pp.position + pp.offset;
+  /** bit-sets - for each repeating pp, for each of its repeating terms, the term ordinal values is set */
+  private ArrayList<OpenBitSet> ppTermsBitSets(PhrasePositions[] rpp, HashMap<Term,Integer> tord) {
+    ArrayList<OpenBitSet> bb = new ArrayList<OpenBitSet>(rpp.length);
+    for (PhrasePositions pp : rpp) {
+      OpenBitSet b = new OpenBitSet(tord.size());
+      Integer ord;
+      for (Term t: pp.terms) {
+        if ((ord=tord.get(t))!=null) {
+          b.set(ord);
+        }
+      }
+      bb.add(b);
+    }
+    return bb;
+  }
+  
+  /** union (term group) bit-sets until they are disjoint (O(n^^2)), and each group have different terms */
+  private void unionTermGroups(ArrayList<OpenBitSet> bb) {
+    int incr;
+    for (int i=0; i<bb.size()-1; i+=incr) {
+      incr = 1;
+      int j = i+1;
+      while (j<bb.size()) {
+        if (bb.get(i).intersects(bb.get(j))) {
+          bb.get(i).union(bb.get(j));
+          bb.remove(j);
+          incr = 0;
+        } else {
+          ++j;
+        }
+      }
+    }
+  }
+  
+  /** map each term to the single group that contains it */ 
+  private HashMap<Term,Integer> termGroups(LinkedHashMap<Term,Integer> tord, ArrayList<OpenBitSet> bb) throws IOException {
+    HashMap<Term,Integer> tg = new HashMap<Term,Integer>();
+    Term[] t = tord.keySet().toArray(new Term[0]);
+    for (int i=0; i<bb.size(); i++) { // i is the group no.
+      DocIdSetIterator bits = bb.get(i).iterator();
+      int ord;
+      while ((ord=bits.nextDoc())!=NO_MORE_DOCS) {
+        tg.put(t[ord],i);
+      }
+    }
+    return tg;
   }
   
-//  private void printPositions(PrintStream ps, String title) {
-//    ps.println();
-//    ps.println("---- "+title);
-//    int k = 0;
-//    if (nrPps!=null) {
-//      for (PhrasePositions pp: nrPps) {
-//        ps.println("  " + k++ + "  " + pp);
-//      }
-//    } else {
-//      for (PhrasePositions pp=min; 0==k || pp!=min; pp = pp.next) {  
-//        ps.println("  " + k++ + "  " + pp);
-//      }
-//    }
-//  }
-
 //  private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
+//    //if (min.doc != ?) return;
 //    ps.println();
 //    ps.println("---- "+title);
 //    ps.println("EXT: "+ext);
@@ -264,7 +493,7 @@ final class SloppyPhraseScorer extends P
 //      ps.println("  " + 0 + "  " + t[0]);
 //      for (int i=1; i<t.length; i++) {
 //        t[i] = pq.pop();
-//        assert t[i-1].position <= t[i].position : "PQ is out of order: "+(i-1)+"::"+t[i-1]+" "+i+"::"+t[i];
+//        assert t[i-1].position <= t[i].position;
 //        ps.println("  " + i + "  " + t[i]);
 //      }
 //      // add them back
@@ -273,4 +502,5 @@ final class SloppyPhraseScorer extends P
 //      }
 //    }
 //  }
+  
 }

Modified: lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java?rev=1299077&r1=1299076&r2=1299077&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java Fri Mar  9 22:24:27 2012
@@ -27,10 +27,8 @@ import org.apache.lucene.queryParser.Que
 import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
@@ -39,6 +37,7 @@ import org.apache.lucene.index.IndexWrit
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Ignore;
 
 import java.io.IOException;
 import java.util.Collection;
@@ -158,6 +157,45 @@ public class TestMultiPhraseQuery extend
     r.close();
     indexStore.close();
   }
+
+  @Ignore //LUCENE-3821 fixes sloppy phrase scoring, except for this known problem 
+  public void testMultiSloppyWithRepeats() throws IOException {
+    Directory indexStore = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
+    add("a b c d e f g h i k", writer);
+    IndexReader r = writer.getReader();
+    writer.close();
+    
+    IndexSearcher searcher = newSearcher(r);
+    
+    MultiPhraseQuery q = new MultiPhraseQuery();
+    // this will fail, when the scorer would propagate [a] rather than [a,b],
+    q.add(new Term[] {new Term("body", "a"), new Term("body", "b")});
+    q.add(new Term[] {new Term("body", "a")});
+    q.setSlop(6);
+    assertEquals(1, searcher.search(q, 1).totalHits); // should match on "a b"
+    
+    searcher.close();
+    r.close();
+    indexStore.close();
+  }
+
+  public void testMultiExactWithRepeats() throws IOException {
+    Directory indexStore = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, indexStore);
+    add("a b c d e f g h i k", writer);
+    IndexReader r = writer.getReader();
+    writer.close();
+    
+    IndexSearcher searcher = newSearcher(r);
+    MultiPhraseQuery q = new MultiPhraseQuery();
+    q.add(new Term[] {new Term("body", "a"), new Term("body", "d")}, 0);
+    q.add(new Term[] {new Term("body", "a"), new Term("body", "f")}, 2);
+    assertEquals(1, searcher.search(q, 1).totalHits); // should match on "a b"
+    searcher.close();
+    r.close();
+    indexStore.close();
+  }
   
   private void add(String s, RandomIndexWriter writer) throws IOException {
     Document doc = new Document();

Modified: lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java?rev=1299077&r1=1299076&r2=1299077&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/test/org/apache/lucene/search/TestSloppyPhraseQuery2.java Fri Mar  9 22:24:27 2012
@@ -21,12 +21,10 @@ import java.util.Random;
 
 import org.apache.lucene.index.Term;
 import org.apache.lucene.util._TestUtil;
-import org.junit.Ignore;
 
 /**
  * random sloppy phrase query tests
  */
-@Ignore("Put this back when we fix LUCENE-3821")
 public class TestSloppyPhraseQuery2 extends SearchEquivalenceTestBase {
   /** "A B"~N â "A B"~N+1 */
   public void testIncreasingSloppiness() throws Exception {