You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2011/05/02 21:48:26 UTC
svn commit: r1098765 - in /lucene/java/branches/lucene_3_0: ./
src/java/org/apache/lucene/index/ src/java/org/apache/lucene/search/
src/test/org/apache/lucene/search/
Author: mikemccand
Date: Mon May 2 19:48:25 2011
New Revision: 1098765
URL: http://svn.apache.org/viewvc?rev=1098765&view=rev
Log:
LUCENE-3029: MultiPhraseQuery scores should not depend on docID
Modified:
lucene/java/branches/lucene_3_0/CHANGES.txt
lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/MultipleTermPositions.java
lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/ExactPhraseScorer.java
lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhrasePositions.java
lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseQueue.java
lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseScorer.java
lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
Modified: lucene/java/branches/lucene_3_0/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/CHANGES.txt?rev=1098765&r1=1098764&r2=1098765&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/CHANGES.txt (original)
+++ lucene/java/branches/lucene_3_0/CHANGES.txt Mon May 2 19:48:25 2011
@@ -251,6 +251,10 @@ Bug fixes
that warming is free to do whatever it needs to. (Earwin Burrfoot
via Mike McCandless)
+* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
+ position-increment tokens that would sometimes assign different
+ scores to identical docs. (Mike McCandless)
+
* LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
files when a mergedSegmentWarmer is set on IndexWriter. (Mike
McCandless)
Modified: lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/MultipleTermPositions.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/MultipleTermPositions.java?rev=1098765&r1=1098764&r2=1098765&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/MultipleTermPositions.java (original)
+++ lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/MultipleTermPositions.java Mon May 2 19:48:25 2011
@@ -121,8 +121,10 @@ public class MultipleTermPositions imple
do {
tp = _termPositionsQueue.peek();
- for (int i = 0; i < tp.freq(); i++)
+ for (int i = 0; i < tp.freq(); i++) {
+ // NOTE: this can result in dup positions being added!
_posList.add(tp.nextPosition());
+ }
if (tp.next())
_termPositionsQueue.updateTop();
@@ -139,6 +141,8 @@ public class MultipleTermPositions imple
}
public final int nextPosition() {
+ // NOTE: this may return the same position more than
+ // once (see TestMultiPhraseQuery.testZeroPosIncr)
return _posList.next();
}
Modified: lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/ExactPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/ExactPhraseScorer.java?rev=1098765&r1=1098764&r2=1098765&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/ExactPhraseScorer.java (original)
+++ lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/ExactPhraseScorer.java Mon May 2 19:48:25 2011
@@ -42,11 +42,12 @@ final class ExactPhraseScorer extends Ph
int freq = 0;
do { // find position w/ all terms
while (first.position < last.position) { // scan forward in first
- do {
- if (!first.nextPosition())
- return freq;
- } while (first.position < last.position);
- firstToLast();
+ do {
+ if (!first.nextPosition()) {
+ return freq;
+ }
+ } while (first.position < last.position);
+ firstToLast();
}
freq++; // all equal: a match
} while (last.nextPosition());
Modified: lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhrasePositions.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhrasePositions.java?rev=1098765&r1=1098764&r2=1098765&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhrasePositions.java (original)
+++ lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhrasePositions.java Mon May 2 19:48:25 2011
@@ -28,13 +28,15 @@ final class PhrasePositions {
int position; // position in doc
int count; // remaining pos in this doc
int offset; // position in phrase
+ final int ord; // unique across all PhrasePositions instances
TermPositions tp; // stream of positions
PhrasePositions next; // used to make lists
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
- PhrasePositions(TermPositions t, int o) {
+ PhrasePositions(TermPositions t, int o, int ord) {
tp = t;
offset = o;
+ this.ord = ord;
}
final boolean next() throws IOException { // increments to next doc
Modified: lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseQueue.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseQueue.java?rev=1098765&r1=1098764&r2=1098765&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseQueue.java (original)
+++ lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseQueue.java Mon May 2 19:48:25 2011
@@ -30,10 +30,16 @@ final class PhraseQueue extends Priority
if (pp1.position == pp2.position)
// same doc and pp.position, so decide by actual term positions.
// rely on: pp.position == tp.position - offset.
- return pp1.offset < pp2.offset;
- else
+ if (pp1.offset == pp2.offset) {
+ return pp1.ord < pp2.ord;
+ } else {
+ return pp1.offset < pp2.offset;
+ }
+ else {
return pp1.position < pp2.position;
- else
+ }
+ else {
return pp1.doc < pp2.doc;
+ }
}
}
Modified: lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseScorer.java?rev=1098765&r1=1098764&r2=1098765&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseScorer.java (original)
+++ lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/search/PhraseScorer.java Mon May 2 19:48:25 2011
@@ -56,7 +56,7 @@ abstract class PhraseScorer extends Scor
// this allows to easily identify a matching (exact) phrase
// when all PhrasePositions have exactly the same position.
for (int i = 0; i < tps.length; i++) {
- PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]);
+ PhrasePositions pp = new PhrasePositions(tps[i], offsets[i], i);
if (last != null) { // add next to end of list
last.next = pp;
} else {
Modified: lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java?rev=1098765&r1=1098764&r2=1098765&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (original)
+++ lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java Mon May 2 19:48:25 2011
@@ -17,24 +17,29 @@ package org.apache.lucene.search;
* limitations under the License.
*/
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.RAMDirectory;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.LinkedList;
+
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.Collections;
-
/**
* This class tests the MultiPhraseQuery class.
*
@@ -228,4 +233,95 @@ public class TestMultiPhraseQuery extend
writer.addDocument(doc);
}
+ private static class TokenAndPos {
+ public final String token;
+ public final int pos;
+ public TokenAndPos(String token, int pos) {
+ this.token = token;
+ this.pos = pos;
+ }
+ }
+
+ private static class CannedAnalyzer extends Analyzer {
+ private final TokenAndPos[] tokens;
+
+ public CannedAnalyzer(TokenAndPos[] tokens) {
+ this.tokens = tokens;
+ }
+
+ @Override
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new CannedTokenizer(tokens);
+ }
+ }
+
+ private static class CannedTokenizer extends Tokenizer {
+ private final TokenAndPos[] tokens;
+ private int upto = 0;
+ private int lastPos = 0;
+ private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+ public CannedTokenizer(TokenAndPos[] tokens) {
+ this.tokens = tokens;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ clearAttributes();
+ if (upto < tokens.length) {
+ final TokenAndPos token = tokens[upto++];
+ termAtt.setTermBuffer(token.token);
+ posIncrAtt.setPositionIncrement(token.pos - lastPos);
+ lastPos = token.pos;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ public void testZeroPosIncr() throws IOException {
+ Directory dir = new RAMDirectory();
+ final TokenAndPos[] tokens = new TokenAndPos[3];
+ tokens[0] = new TokenAndPos("a", 0);
+ tokens[1] = new TokenAndPos("b", 0);
+ tokens[2] = new TokenAndPos("c", 0);
+
+ IndexWriter writer = new IndexWriter(dir, new CannedAnalyzer(tokens), true, IndexWriter.MaxFieldLength.LIMITED);
+ Document doc = new Document();
+ doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ writer.addDocument(doc);
+ IndexReader r = writer.getReader();
+ writer.close();
+ IndexSearcher s = new IndexSearcher(r);
+ MultiPhraseQuery mpq = new MultiPhraseQuery();
+ //mpq.setSlop(1);
+
+ // NOTE: not great that if we do the else clause here we
+ // get different scores! MultiPhraseQuery counts that
+ // phrase as occurring twice per doc (it should be 1, I
+ // think?). This is because MultipleTermPositions is able to
+ // return the same position more than once (0, in this
+ // case):
+ if (true) {
+ mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+ mpq.add(new Term[] {new Term("field", "a")}, 0);
+ } else {
+ mpq.add(new Term[] {new Term("field", "a")}, 0);
+ mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
+ }
+ TopDocs hits = s.search(mpq, 2);
+ assert hits.totalHits == 2;
+ assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
+ /*
+ for(int hit=0;hit<hits.totalHits;hit++) {
+ ScoreDoc sd = hits.scoreDocs[hit];
+ System.out.println(" hit doc=" + sd.doc + " score=" + sd.score);
+ }
+ */
+ r.close();
+ dir.close();
+ }
}