You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2011/05/18 17:32:07 UTC
svn commit: r1124302 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/CHANGES.txt
lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java solr/
Author: doronc
Date: Wed May 18 15:32:06 2011
New Revision: 1124302
URL: http://svn.apache.org/viewvc?rev=1124302&view=rev
Log:
LUCENE-3068: sloppy phrase query failed to match valid documents when multiple
query terms had same position in the query.
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
lucene/dev/branches/branch_3x/solr/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1124302&r1=1124301&r2=1124302&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Wed May 18 15:32:06 2011
@@ -94,6 +94,9 @@ Bug fixes
PhraseQuery as term with lower doc freq will also have less positions.
(Uwe Schindler, Robert Muir, Otis Gospodnetic)
+* LUCENE-3068: sloppy phrase query failed to match valid documents when multiple
+ query terms had same position in the query. (Doron Cohen)
+
Test Cases
* LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java?rev=1124302&r1=1124301&r2=1124302&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java Wed May 18 15:32:06 2011
@@ -18,7 +18,7 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
-import java.util.HashMap;
+import java.util.HashSet;
final class SloppyPhraseScorer extends PhraseScorer {
private int slop;
@@ -109,8 +109,14 @@ final class SloppyPhraseScorer extends P
/**
* Init PhrasePositions in place.
- * There is a one time initialization for this scorer:
+ * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
* <br>- Put in repeats[] each pp that has another pp with same position in the doc.
+ * This relies on that the position in PP is computed as (TP.position - offset) and
+ * so by adding offset we actually compare positions and identify that the two are
+ * the same term.
+ * An exclusion to this is two distinct terms in the same offset in query and same
+ * position in doc. This case is detected by comparing just the (query) offsets,
+ * and two such PPs are not considered "repeating".
* <br>- Also mark each such pp by pp.repeats = true.
* <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
* In particular, this allows to score queries with no repetitions with no overhead due to this computation.
@@ -145,23 +151,26 @@ final class SloppyPhraseScorer extends P
if (!checkedRepeats) {
checkedRepeats = true;
// check for repeats
- HashMap<PhrasePositions, Object> m = null;
+ HashSet<PhrasePositions> m = null;
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
int tpPos = pp.position + pp.offset;
for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
+ if (pp.offset == pp2.offset) {
+ continue; // not a repetition: the two PPs are originally in same offset in the query!
+ }
int tpPos2 = pp2.position + pp2.offset;
if (tpPos2 == tpPos) {
if (m == null)
- m = new HashMap<PhrasePositions, Object>();
+ m = new HashSet<PhrasePositions>();
pp.repeats = true;
pp2.repeats = true;
- m.put(pp,null);
- m.put(pp2,null);
+ m.add(pp);
+ m.add(pp2);
}
}
}
if (m!=null)
- repeats = m.keySet().toArray(new PhrasePositions[0]);
+ repeats = m.toArray(new PhrasePositions[0]);
}
// with repeats must advance some repeating pp's so they all start with differing tp's
@@ -204,11 +213,16 @@ final class SloppyPhraseScorer extends P
int tpPos = pp.position + pp.offset;
for (int i = 0; i < repeats.length; i++) {
PhrasePositions pp2 = repeats[i];
- if (pp2 == pp)
+ if (pp2 == pp) {
continue;
+ }
+ if (pp.offset == pp2.offset) {
+ continue; // not a repetition: the two PPs are originally in same offset in the query!
+ }
int tpPos2 = pp2.position + pp2.offset;
- if (tpPos2 == tpPos)
+ if (tpPos2 == tpPos) {
return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
+ }
}
return null;
}
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java?rev=1124302&r1=1124301&r2=1124302&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java Wed May 18 15:32:06 2011
@@ -17,10 +17,13 @@ package org.apache.lucene.search;
* limitations under the License.
*/
+import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.Analyzer;
@@ -412,7 +415,7 @@ public class TestMultiPhraseQuery extend
mpq.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
}
TopDocs hits = s.search(mpq, 2);
- assert hits.totalHits == 2;
+ assertEquals(2, hits.totalHits);
assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
/*
for(int hit=0;hit<hits.totalHits;hit++) {
@@ -423,4 +426,156 @@ public class TestMultiPhraseQuery extend
r.close();
dir.close();
}
+
+ private final static TokenAndPos[] INCR_0_DOC_TOKENS = new TokenAndPos[] {
+ new TokenAndPos("x", 0),
+ new TokenAndPos("a", 1),
+ new TokenAndPos("1", 1),
+ new TokenAndPos("m", 2), // not existing, relying on slop=2
+ new TokenAndPos("b", 3),
+ new TokenAndPos("1", 3),
+ new TokenAndPos("n", 4), // not existing, relying on slop=2
+ new TokenAndPos("c", 5),
+ new TokenAndPos("y", 6)
+ };
+
+ private final static TokenAndPos[] INCR_0_QUERY_TOKENS_AND = new TokenAndPos[] {
+ new TokenAndPos("a", 0),
+ new TokenAndPos("1", 0),
+ new TokenAndPos("b", 1),
+ new TokenAndPos("1", 1),
+ new TokenAndPos("c", 2)
+ };
+
+ private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new TokenAndPos[][] {
+ { new TokenAndPos("a", 0) },
+ { new TokenAndPos("x", 0), new TokenAndPos("1", 0) },
+ { new TokenAndPos("b", 1) },
+ { new TokenAndPos("x", 1), new TokenAndPos("1", 1) },
+ { new TokenAndPos("c", 2) }
+ };
+
+ private final static TokenAndPos[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new TokenAndPos[][] {
+ { new TokenAndPos("x", 0) },
+ { new TokenAndPos("a", 0), new TokenAndPos("1", 0) },
+ { new TokenAndPos("x", 1) },
+ { new TokenAndPos("b", 1), new TokenAndPos("1", 1) },
+ { new TokenAndPos("c", 2) }
+ };
+
+ /**
+ * using query parser, MPQ will be created, and will not be strict about having all query terms
+ * in each position - one of each position is sufficient (OR logic)
+ */
+ public void testZeroPosIncrSloppyParsedAnd() throws IOException, ParseException {
+ QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new CannedAnalyzer(INCR_0_QUERY_TOKENS_AND));
+ final Query q = qp.parse("\"this text is acually ignored\"");
+ assertTrue("wrong query type!", q instanceof MultiPhraseQuery);
+ doTestZeroPosIncrSloppy(q, 0);
+ ((MultiPhraseQuery) q).setSlop(1);
+ doTestZeroPosIncrSloppy(q, 0);
+ ((MultiPhraseQuery) q).setSlop(2);
+ doTestZeroPosIncrSloppy(q, 1);
+ }
+
+ private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
+ Directory dir = newDirectory(); // random dir
+ IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT, new CannedAnalyzer(INCR_0_DOC_TOKENS));
+ IndexWriter writer = new IndexWriter(dir, cfg);
+ Document doc = new Document();
+ doc.add(new Field("field", "", Field.Store.NO, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ IndexReader r = IndexReader.open(writer,false);
+ writer.close();
+ IndexSearcher s = new IndexSearcher(r);
+
+ if (VERBOSE) {
+ System.out.println("QUERY=" + q);
+ }
+
+ TopDocs hits = s.search(q, 1);
+ assertEquals("wrong number of results", nExpected, hits.totalHits);
+
+ if (VERBOSE) {
+ for(int hit=0;hit<hits.totalHits;hit++) {
+ ScoreDoc sd = hits.scoreDocs[hit];
+ System.out.println(" hit doc=" + sd.doc + " score=" + sd.score);
+ }
+ }
+
+ r.close();
+ dir.close();
+ }
+
+ /**
+ * PQ AND Mode - Manually creating a phrase query
+ */
+ public void testZeroPosIncrSloppyPqAnd() throws IOException, ParseException {
+ final PhraseQuery pq = new PhraseQuery();
+ for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
+ pq.add(new Term("field",tap.token), tap.pos);
+ }
+ doTestZeroPosIncrSloppy(pq, 0);
+ pq.setSlop(1);
+ doTestZeroPosIncrSloppy(pq, 0);
+ pq.setSlop(2);
+ doTestZeroPosIncrSloppy(pq, 1);
+ }
+
+ /**
+ * MPQ AND Mode - Manually creating a multiple phrase query
+ */
+ public void testZeroPosIncrSloppyMpqAnd() throws IOException, ParseException {
+ final MultiPhraseQuery mpq = new MultiPhraseQuery();
+ for (TokenAndPos tap : INCR_0_QUERY_TOKENS_AND) {
+ mpq.add(new Term[]{new Term("field",tap.token)}, tap.pos); //AND logic
+ }
+ doTestZeroPosIncrSloppy(mpq, 0);
+ mpq.setSlop(1);
+ doTestZeroPosIncrSloppy(mpq, 0);
+ mpq.setSlop(2);
+ doTestZeroPosIncrSloppy(mpq, 1);
+ }
+
+ /**
+ * MPQ Combined AND OR Mode - Manually creating a multiple phrase query
+ */
+ public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException, ParseException {
+ final MultiPhraseQuery mpq = new MultiPhraseQuery();
+ for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
+ Term[] terms = tapTerms(tap);
+ final int pos = tap[0].pos;
+ mpq.add(terms, pos); //AND logic in pos, OR across lines
+ }
+ doTestZeroPosIncrSloppy(mpq, 0);
+ mpq.setSlop(1);
+ doTestZeroPosIncrSloppy(mpq, 0);
+ mpq.setSlop(2);
+ doTestZeroPosIncrSloppy(mpq, 1);
+ }
+
+ /**
+ * MPQ Combined AND OR Mode - Manually creating a multiple phrase query - with no match
+ */
+ public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException, ParseException {
+ final MultiPhraseQuery mpq = new MultiPhraseQuery();
+ for (TokenAndPos tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
+ Term[] terms = tapTerms(tap);
+ final int pos = tap[0].pos;
+ mpq.add(terms, pos); //AND logic in pos, OR across lines
+ }
+ doTestZeroPosIncrSloppy(mpq, 0);
+ mpq.setSlop(2);
+ doTestZeroPosIncrSloppy(mpq, 0);
+ }
+
+ private Term[] tapTerms(TokenAndPos[] tap) {
+ Term[] terms = new Term[tap.length];
+ for (int i=0; i<terms.length; i++) {
+ terms[i] = new Term("field",tap[i].token);
+ }
+ return terms;
+ }
+
}
+