You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/05/02 22:39:26 UTC
svn commit: r1098782 - in /lucene/dev/trunk: ./ lucene/ lucene/backwards/
lucene/src/java/org/apache/lucene/search/
lucene/src/test/org/apache/lucene/search/ solr/
Author: mikemccand
Date: Mon May 2 20:39:26 2011
New Revision: 1098782
URL: http://svn.apache.org/viewvc?rev=1098782&view=rev
Log:
LUCENE-3029: MultiPhraseQuery scores should not depend on docID
Modified:
lucene/dev/trunk/ (props changed)
lucene/dev/trunk/lucene/ (props changed)
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/backwards/ (props changed)
lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
lucene/dev/trunk/solr/ (props changed)
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon May 2 20:39:26 2011
@@ -1477,6 +1477,10 @@ Bug fixes
that warming is free to do whatever it needs to. (Earwin Burrfoot
via Mike McCandless)
+* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
+ position-increment tokens that would sometimes assign different
+ scores to identical docs. (Mike McCandless)
+
* LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
files when a mergedSegmentWarmer is set on IndexWriter. (Mike
McCandless)
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhrasePositions.java Mon May 2 20:39:26 2011
@@ -28,13 +28,15 @@ final class PhrasePositions {
int position; // position in doc
int count; // remaining pos in this doc
int offset; // position in phrase
+ final int ord; // unique across all PhrasePositions instances
final DocsAndPositionsEnum postings; // stream of docs & positions
PhrasePositions next; // used to make lists
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
- PhrasePositions(DocsAndPositionsEnum postings, int o) {
+ PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
this.postings = postings;
offset = o;
+ this.ord = ord;
}
final boolean next() throws IOException { // increments to next doc
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseQueue.java Mon May 2 20:39:26 2011
@@ -30,10 +30,16 @@ final class PhraseQueue extends Priority
if (pp1.position == pp2.position)
// same doc and pp.position, so decide by actual term positions.
// rely on: pp.position == tp.position - offset.
- return pp1.offset < pp2.offset;
- else
+ if (pp1.offset == pp2.offset) {
+ return pp1.ord < pp2.ord;
+ } else {
+ return pp1.offset < pp2.offset;
+ }
+ else {
return pp1.position < pp2.position;
- else
+ }
+ else {
return pp1.doc < pp2.doc;
+ }
}
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/PhraseScorer.java Mon May 2 20:39:26 2011
@@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scor
// this allows to easily identify a matching (exact) phrase
// when all PhrasePositions have exactly the same position.
for (int i = 0; i < postings.length; i++) {
- PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position);
+ PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
if (last != null) { // add next to end of list
last.next = pp;
} else {
Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java?rev=1098782&r1=1098781&r2=1098782&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java Mon May 2 20:39:26 2011
@@ -25,14 +25,22 @@ import org.apache.lucene.index.MultiFiel
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
+import java.io.Reader;
/**
* This class tests the MultiPhraseQuery class.
@@ -333,4 +341,97 @@ public class TestMultiPhraseQuery extend
reader.close();
indexStore.close();
}
+
+ /** Immutable (term text, absolute position) pair describing one token to be emitted. */
+ private static class TokenAndPos {
+   /** Absolute position of the token; increments are derived from it by the consumer. */
+   public final int pos;
+   /** Text of the term. */
+   public final String token;
+
+   public TokenAndPos(String token, int pos) {
+     this.pos = pos;
+     this.token = token;
+   }
+ }
+
+ /** Analyzer that always replays the same fixed array of tokens. */
+ private static class CannedAnalyzer extends Analyzer {
+   private final TokenAndPos[] canned;
+
+   public CannedAnalyzer(TokenAndPos[] tokens) {
+     canned = tokens;
+   }
+
+   /**
+    * Returns a new {@link CannedTokenizer} over the fixed tokens; neither
+    * {@code fieldName} nor {@code reader} is consulted.
+    */
+   @Override
+   public TokenStream tokenStream(String fieldName, Reader reader) {
+     final TokenStream stream = new CannedTokenizer(canned);
+     return stream;
+   }
+ }
+
+ /**
+  * Tokenizer that emits a fixed sequence of tokens. Each position increment is
+  * the difference between the token's absolute position and the position of
+  * the token emitted before it (0 for the first token).
+  */
+ private static class CannedTokenizer extends Tokenizer {
+   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+   private final TokenAndPos[] tokens;
+   private int upto = 0;    // index of the next token to emit
+   private int lastPos = 0; // absolute position of the previously emitted token
+
+   public CannedTokenizer(TokenAndPos[] tokens) {
+     this.tokens = tokens;
+   }
+
+   /** Emits the next canned token, or returns false once all have been emitted. */
+   @Override
+   public final boolean incrementToken() throws IOException {
+     clearAttributes();
+     if (upto >= tokens.length) {
+       return false; // every canned token has been emitted
+     }
+     final TokenAndPos current = tokens[upto];
+     upto++;
+     termAtt.setEmpty().append(current.token);
+     final int increment = current.pos - lastPos;
+     posIncrAtt.setPositionIncrement(increment);
+     lastPos = current.pos;
+     return true;
+   }
+ }
+
+ /**
+  * LUCENE-3029: indexes the same document twice, with the tokens "a", "b" and
+  * "c" all at position 0, and verifies that a MultiPhraseQuery over those
+  * terms matches both documents with equal scores.
+  */
+ public void testZeroPosIncr() throws IOException {
+   Directory dir = new RAMDirectory();
+   final TokenAndPos[] tokens = new TokenAndPos[3];
+   tokens[0] = new TokenAndPos("a", 0);
+   tokens[1] = new TokenAndPos("b", 0);
+   tokens[2] = new TokenAndPos("c", 0);
+
+   RandomIndexWriter writer = new RandomIndexWriter(random, dir, new CannedAnalyzer(tokens));
+   Document doc = new Document();
+   doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+   // Two identical documents: any score difference can only come from the scorer.
+   writer.addDocument(doc);
+   writer.addDocument(doc);
+   IndexReader r = writer.getReader();
+   writer.close();
+   IndexSearcher s = new IndexSearcher(r);
+   MultiPhraseQuery mpq = new MultiPhraseQuery();
+   //mpq.setSlop(1);
+
+   // NOTE: not great that if we do the else clause here we
+   // get different scores! MultiPhraseQuery counts that
+   // phrase as occurring twice per doc (it should be 1, I
+   // think?). This is because MultipleTermPositions is able to
+   // return the same position more than once (0, in this
+   // case):
+   if (true) {
+     mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+     mpq.add(new Term[] {new Term("field", "a")}, 0);
+   } else {
+     mpq.add(new Term[] {new Term("field", "a")}, 0);
+     mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+   }
+   TopDocs hits = s.search(mpq, 2);
+   // Use a JUnit assertion rather than the `assert` keyword so the check
+   // still runs when the JVM is started without -ea.
+   assertEquals(2, hits.totalHits);
+   assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
+   /*
+   for(int hit=0;hit<hits.totalHits;hit++) {
+   ScoreDoc sd = hits.scoreDocs[hit];
+   System.out.println(" hit doc=" + sd.doc + " score=" + sd.score);
+   }
+   */
+   r.close();
+   dir.close();
+ }
}