You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/05/02 22:39:26 UTC
svn commit: r1098782 - in /lucene/dev/trunk: ./ lucene/ lucene/backwards/ lucene/src/java/org/apache/lucene/search/ lucene/src/test/org/apache/lucene/search/ solr/

Author: mikemccand
Date: Mon May  2 20:39:26 2011
New Revision: 1098782

URL: http://svn.apache.org/viewvc?rev=1098782&view=rev
Log:
LUCENE-3029: MultiPhraseQuery scores should not depend on docID

Modified:
    lucene/dev/trunk/   (props changed)
    lucene/dev/trunk/lucene/   (props changed)
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/backwards/   (props changed)
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
    lucene/dev/trunk/solr/   (props changed)

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon May  2 20:39:26 2011
@@ -1477,6 +1477,10 @@ Bug fixes
   that warming is free to do whatever it needs to.  (Earwin Burrfoot
   via Mike McCandless)
 
+* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
+  position-increment tokens that would sometimes assign different
+  scores to identical docs.  (Mike McCandless)
+
 * LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
   files when a mergedSegmentWarmer is set on IndexWriter.  (Mike
   McCandless)

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java Mon May  2 20:39:26 2011
@@ -28,13 +28,15 @@ final class PhrasePositions {
   int position;					  // position in doc
   int count;					  // remaining pos in this doc
   int offset;					  // position in phrase
+  final int ord;                                  // unique across all PhrasePositions instances
   final DocsAndPositionsEnum postings;  	  // stream of docs & positions
   PhrasePositions next;	                          // used to make lists
   boolean repeats;       // there's other pp for same term (e.g. query="1st word 2nd word"~1) 
 
-  PhrasePositions(DocsAndPositionsEnum postings, int o) {
+  PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
     this.postings = postings;
     offset = o;
+    this.ord = ord;
   }
 
   final boolean next() throws IOException {	  // increments to next doc

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java Mon May  2 20:39:26 2011
@@ -30,10 +30,16 @@ final class PhraseQueue extends Priority
       if (pp1.position == pp2.position)
         // same doc and pp.position, so decide by actual term positions. 
         // rely on: pp.position == tp.position - offset. 
-        return pp1.offset < pp2.offset;
-      else
+        if (pp1.offset == pp2.offset) {
+          return pp1.ord < pp2.ord;
+        } else {
+          return pp1.offset < pp2.offset;
+        }
+      else {
         return pp1.position < pp2.position;
-    else
+      }
+    else {
       return pp1.doc < pp2.doc;
+    }
   }
 }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java Mon May  2 20:39:26 2011
@@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scor
     // this allows to easily identify a matching (exact) phrase 
     // when all PhrasePositions have exactly the same position.
     for (int i = 0; i < postings.length; i++) {
-      PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position);
+      PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
       if (last != null) {			  // add next to end of list
         last.next = pp;
       } else {

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java Mon May  2 20:39:26 2011
@@ -25,14 +25,22 @@ import org.apache.lucene.index.MultiFiel
 import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;
 
 import java.io.IOException;
 import java.util.Collection;
 import java.util.LinkedList;
+import java.io.Reader;
 
 /**
  * This class tests the MultiPhraseQuery class.
@@ -333,4 +341,97 @@ public class TestMultiPhraseQuery extend
     reader.close();
     indexStore.close();
   }
+
+  private static class TokenAndPos {
+    public final String token;
+    public final int pos;
+    public TokenAndPos(String token, int pos) {
+      this.token = token;
+      this.pos = pos;
+    }
+  }
+
+  private static class CannedAnalyzer extends Analyzer {
+    private final TokenAndPos[] tokens;
+    
+    public CannedAnalyzer(TokenAndPos[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new CannedTokenizer(tokens);
+    }
+  }
+
+  private static class CannedTokenizer extends Tokenizer {
+    private final TokenAndPos[] tokens;
+    private int upto = 0;
+    private int lastPos = 0;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+    public CannedTokenizer(TokenAndPos[] tokens) {
+      this.tokens = tokens;
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      clearAttributes();      
+      if (upto < tokens.length) {
+        final TokenAndPos token = tokens[upto++];
+        termAtt.setEmpty();
+        termAtt.append(token.token);
+        posIncrAtt.setPositionIncrement(token.pos - lastPos);
+        lastPos = token.pos;
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  public void testZeroPosIncr() throws IOException {
+    Directory dir = new RAMDirectory();
+    final TokenAndPos[] tokens = new TokenAndPos[3];
+    tokens[0] = new TokenAndPos("a", 0);
+    tokens[1] = new TokenAndPos("b", 0);
+    tokens[2] = new TokenAndPos("c", 0);
+
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
+    Document doc = new Document();
+    doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    writer.addDocument(doc);
+    IndexReader r = writer.getReader();
+    writer.close();
+    IndexSearcher s = new IndexSearcher(r);
+    MultiPhraseQuery mpq = new MultiPhraseQuery();
+    //mpq.setSlop(1);
+
+    // NOTE: not great that if we do the else clause here we
+    // get different scores!  MultiPhraseQuery counts that
+    // phrase as occurring twice per doc (it should be 1, I
+    // think?).  This is because MultipleTermPositions is able to
+    // return the same position more than once (0, in this
+    // case):
+    if (true) {
+      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+      mpq.add(new Term[] {new Term("field", "a")}, 0);
+    } else {
+      mpq.add(new Term[] {new Term("field", "a")}, 0);
+      mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+    }
+    TopDocs hits = s.search(mpq, 2);
+    assert hits.totalHits == 2;
+    assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
+    /*
+    for(int hit=0;hit<hits.totalHits;hit++) {
+      ScoreDoc sd = hits.scoreDocs[hit];
+      System.out.println("  hit doc=" + sd.doc + " score=" + sd.score);
+    }
+    */
+    r.close();
+    dir.close();
+  }
 }