You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by do...@apache.org on 2007/04/24 07:32:48 UTC
svn commit: r531733 - in /lucene/java/trunk: ./
src/java/org/apache/lucene/search/ src/test/org/apache/lucene/search/
Author: doronc
Date: Mon Apr 23 22:32:47 2007
New Revision: 531733
URL: http://svn.apache.org/viewvc?view=rev&rev=531733
Log:
LUCENE-736: Sloppy phrase query with repeating terms matches wrong docs.
For example query "B C B"~2 matches the doc "A B C D E".
Search time cost of this fix is 4%, for sloppy phrase search.
This fix also partially brings back the the tests checkSkipTo() in QueryUtils,
(which was disabled by LUCENE-730), but this is now invoked
only for non Boolean scorers.
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java
lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java
lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java
lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java
lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java
lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Apr 23 22:32:47 2007
@@ -98,6 +98,10 @@
the instance of IndexWriter (but, not the index itself) by
referencing already deleted segments. This bug was only present
in 2.2 (trunk), ie was never released. (Mike McCandless)
+
+13. LUCENE-736: Sloppy phrase query with repeating terms matches wrong docs.
+ For example query "B C B"~2 matches the doc "A B C D E". (Doron Cohen)
+
New features
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java Mon Apr 23 22:32:47 2007
@@ -22,27 +22,30 @@
final class ExactPhraseScorer extends PhraseScorer {
- ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+ ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
byte[] norms) {
- super(weight, tps, positions, similarity, norms);
+ super(weight, tps, offsets, similarity, norms);
}
protected final float phraseFreq() throws IOException {
// sort list with pq
+ pq.clear();
for (PhrasePositions pp = first; pp != null; pp = pp.next) {
pp.firstPosition();
pq.put(pp); // build pq from list
}
pqToList(); // rebuild list from pq
+ // for counting how many times the exact phrase is found in current document,
+ // just count how many times all PhrasePosition's have exactly the same position.
int freq = 0;
do { // find position w/ all terms
while (first.position < last.position) { // scan forward in first
- do {
- if (!first.nextPosition())
- return (float)freq;
- } while (first.position < last.position);
- firstToLast();
+ do {
+ if (!first.nextPosition())
+ return (float)freq;
+ } while (first.position < last.position);
+ firstToLast();
}
freq++; // all equal: a match
} while (last.nextPosition());
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java Mon Apr 23 22:32:47 2007
@@ -20,6 +20,9 @@
import java.io.IOException;
import org.apache.lucene.index.*;
+/**
+ * Position of a term in a document that takes into account the term offset within the phrase.
+ */
final class PhrasePositions {
int doc; // current doc
int position; // position in doc
@@ -27,6 +30,7 @@
int offset; // position in phrase
TermPositions tp; // stream of positions
PhrasePositions next; // used to make lists
+ boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
PhrasePositions(TermPositions t, int o) {
tp = t;
@@ -61,6 +65,12 @@
nextPosition();
}
+ /**
+ * Go to next location of this term current document, and set
+ * <code>position</code> as <code>location - offset</code>, so that a
+ * matching exact phrase is easily identified when all PhrasePositions
+ * have exactly the same <code>position</code>.
+ */
final boolean nextPosition() throws IOException {
if (count-- > 0) { // read subsequent pos's
position = tp.nextPosition() - offset;
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java Mon Apr 23 22:32:47 2007
@@ -28,7 +28,12 @@
PhrasePositions pp1 = (PhrasePositions)o1;
PhrasePositions pp2 = (PhrasePositions)o2;
if (pp1.doc == pp2.doc)
- return pp1.position < pp2.position;
+ if (pp1.position == pp2.position)
+ // same doc and pp.position, so decide by actual term positions.
+ // rely on: pp.position == tp.position - offset.
+ return pp1.offset < pp2.offset;
+ else
+ return pp1.position < pp2.position;
else
return pp1.doc < pp2.doc;
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java Mon Apr 23 22:32:47 2007
@@ -21,6 +21,16 @@
import org.apache.lucene.index.*;
+/** Expert: Scoring functionality for phrase queries.
+ * <br>A document is considered matching if it contains the phrase-query terms
+ * at "valid" positons. What "valid positions" are
+ * depends on the type of the phrase query: for an exact phrase query terms are required
+ * to appear in adjacent locations, while for a sloppy phrase query some distance between
+ * the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes
+ * is invoked for each document containing all the phrase query terms, in order to
+ * compute the frequency of the phrase query in that document. A non zero frequency
+ * means a match.
+ */
abstract class PhraseScorer extends Scorer {
private Weight weight;
protected byte[] norms;
@@ -31,19 +41,23 @@
protected PhraseQueue pq;
protected PhrasePositions first, last;
- private float freq;
+ private float freq; //prhase frequency in current doc as computed by phraseFreq().
- PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+ PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
byte[] norms) {
super(similarity);
this.norms = norms;
this.weight = weight;
this.value = weight.getValue();
- // convert tps to a list
+ // convert tps to a list of phrase positions.
+ // note: phrase-position differs from term-position in that its position
+ // reflects the phrase offset: pp.pos = tp.pos - offset.
+ // this allows to easily identify a matching (exact) phrase
+ // when all PhrasePositions have exactly the same position.
for (int i = 0; i < tps.length; i++) {
- PhrasePositions pp = new PhrasePositions(tps[i], positions[i]);
+ PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]);
if (last != null) { // add next to end of list
last.next = pp;
} else
@@ -94,6 +108,7 @@
}
public boolean skipTo(int target) throws IOException {
+ firstTime = false;
for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
more = pp.skipTo(target);
}
@@ -102,6 +117,13 @@
return doNext();
}
+ /**
+ * For a document containing all the phrase query terms, compute the
+ * frequency of the phrase in that document.
+ * A non zero frequency means a match.
+ * <br>Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations.
+ * @return frequency of the phrase in current doc, 0 if not found.
+ */
protected abstract float phraseFreq() throws IOException;
private void init() throws IOException {
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java Mon Apr 23 22:32:47 2007
@@ -20,38 +20,58 @@
import org.apache.lucene.index.TermPositions;
import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
final class SloppyPhraseScorer extends PhraseScorer {
private int slop;
+ private PhrasePositions repeats[];
+ private boolean checkedRepeats;
- SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+ SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
int slop, byte[] norms) {
- super(weight, tps, positions, similarity, norms);
+ super(weight, tps, offsets, similarity, norms);
this.slop = slop;
}
+ /**
+ * Score a candidate doc for all slop-valid position-combinations (matches)
+ * encountered while traversing/hopping the PhrasePositions.
+ * <br> The score contribution of a match depends on the distance:
+ * <br> - highest score for distance=0 (exact match).
+ * <br> - score gets lower as distance gets higher.
+ * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
+ * once for "a b" (distance=0), and once for "b a" (distance=2).
+ * <br>Pssibly not all valid combinations are encountered, because for efficiency
+ * we always propagate the least PhrasePosition. This allows to base on
+ * PriorityQueue and move forward faster.
+ * As result, for example, document "a b c b a"
+ * would score differently for queries "a b c"~4 and "c b a"~4, although
+ * they really are equivalent.
+ * Similarly, for doc "a b c b a f g", query "c b"~2
+ * would get same score as "g f"~2, although "c b"~2 could be matched twice.
+ * We may want to fix this in the future (currently not, for performance reasons).
+ */
protected final float phraseFreq() throws IOException {
- pq.clear();
- int end = 0;
- for (PhrasePositions pp = first; pp != null; pp = pp.next) {
- pp.firstPosition();
- if (pp.position > end)
- end = pp.position;
- pq.put(pp); // build pq from list
- }
-
+ int end = initPhrasePositions();
+
float freq = 0.0f;
- boolean done = false;
- do {
+ boolean done = (end<0);
+ while (!done) {
PhrasePositions pp = (PhrasePositions) pq.pop();
int start = pp.position;
int next = ((PhrasePositions) pq.top()).position;
- for (int pos = start; pos <= next; pos = pp.position) {
- start = pos; // advance pp to min window
+
+ boolean tpsDiffer = true;
+ for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) {
+ if (pos<=next && tpsDiffer)
+ start = pos; // advance pp to min window
if (!pp.nextPosition()) {
- done = true; // ran out of a term -- done
+ done = true; // ran out of a term -- done
break;
}
+ tpsDiffer = !pp.repeats || termPositionsDiffer(pp);
}
int matchLength = end - start;
@@ -61,8 +81,113 @@
if (pp.position > end)
end = pp.position;
pq.put(pp); // restore pq
- } while (!done);
+ }
return freq;
+ }
+
+
+ /**
+ * Init PhrasePositions in place.
+ * There is a one time initializatin for this scorer:
+ * <br>- Put in repeats[] each pp that has another pp with same position in the doc.
+ * <br>- Also mark each such pp by pp.repeats = true.
+ * <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
+ * In particular, this allows to score queries with no repetiotions with no overhead due to this computation.
+ * <br>- Example 1 - query with no repetitions: "ho my"~2
+ * <br>- Example 2 - query with repetitions: "ho my my"~2
+ * <br>- Example 3 - query with repetitions: "my ho my"~2
+ * <br>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
+ * @return end (max position), or -1 if any term ran out (i.e. done)
+ * @throws IOException
+ */
+ private int initPhrasePositions() throws IOException {
+ int end = 0;
+
+ // no repeats at all (most common case is also the simplest one)
+ if (checkedRepeats && repeats==null) {
+ // build queue from list
+ pq.clear();
+ for (PhrasePositions pp = first; pp != null; pp = pp.next) {
+ pp.firstPosition();
+ if (pp.position > end)
+ end = pp.position;
+ pq.put(pp); // build pq from list
+ }
+ return end;
+ }
+
+ // position the pp's
+ for (PhrasePositions pp = first; pp != null; pp = pp.next)
+ pp.firstPosition();
+
+ // one time initializatin for this scorer
+ if (!checkedRepeats) {
+ checkedRepeats = true;
+ // check for repeats
+ HashMap m = null;
+ for (PhrasePositions pp = first; pp != null; pp = pp.next) {
+ int tpPos = pp.position + pp.offset;
+ for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
+ int tpPos2 = pp2.position + pp2.offset;
+ if (tpPos2 == tpPos) {
+ if (m == null)
+ m = new HashMap();
+ pp.repeats = true;
+ pp2.repeats = true;
+ m.put(pp,null);
+ m.put(pp2,null);
+ }
+ }
+ }
+ if (m!=null)
+ repeats = (PhrasePositions[]) m.keySet().toArray(new PhrasePositions[0]);
+ }
+
+ // with repeats must advance some repeating pp's so they all start with differing tp's
+ if (repeats!=null) {
+ // must propagate higher offsets first (otherwise might miss matches).
+ Arrays.sort(repeats, new Comparator() {
+ public int compare(Object x, Object y) {
+ return ((PhrasePositions) y).offset - ((PhrasePositions) x).offset;
+ }});
+ // now advance them
+ for (int i = 0; i < repeats.length; i++) {
+ PhrasePositions pp = repeats[i];
+ while (!termPositionsDiffer(pp)) {
+ if (!pp.nextPosition())
+ return -1; // ran out of a term -- done
+ }
+ }
+ }
+
+ // build queue from list
+ pq.clear();
+ for (PhrasePositions pp = first; pp != null; pp = pp.next) {
+ if (pp.position > end)
+ end = pp.position;
+ pq.put(pp); // build pq from list
+ }
+
+ return end;
+ }
+
+ // disalow two pp's to have the same tp position, so that same word twice
+ // in query would go elswhere in the matched doc
+ private boolean termPositionsDiffer(PhrasePositions pp) {
+ // efficiency note: a more efficient implemention could keep a map between repeating
+ // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
+ // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
+ // However this would complicate code, for a rather rare case, so choice is to compromise here.
+ int tpPos = pp.position + pp.offset;
+ for (int i = 0; i < repeats.length; i++) {
+ PhrasePositions pp2 = repeats[i];
+ if (pp2 == pp)
+ continue;
+ int tpPos2 = pp2.position + pp2.offset;
+ if (tpPos2 == tpPos)
+ return false;
+ }
+ return true;
}
}
Modified: lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java Mon Apr 23 22:32:47 2007
@@ -68,18 +68,15 @@
/** various query sanity checks on a searcher */
public static void check(Query q1, Searcher s) {
-// Disabled because this started failing after LUCENE-730 patch was applied
-// try {
+ try {
check(q1);
-/* disabled for use of BooleanScorer in BooleanScorer2.
if (s!=null && s instanceof IndexSearcher) {
IndexSearcher is = (IndexSearcher)s;
-// checkSkipTo(q1,is);
+ checkSkipTo(q1,is);
}
} catch (IOException e) {
throw new RuntimeException(e);
}
- */
}
/** alternate scorer skipTo(),skipTo(),next(),next(),skipTo(),skipTo(), etc
@@ -87,42 +84,71 @@
*/
public static void checkSkipTo(final Query q, final IndexSearcher s) throws IOException {
//System.out.println("Checking "+q);
- final Weight w = q.weight(s);
- final Scorer scorer = w.scorer(s.getIndexReader());
-
- // FUTURE: ensure scorer.doc()==-1
-
+
if (BooleanQuery.getUseScorer14()) return; // 1.4 doesn't support skipTo
- final int[] which = new int[1];
- final int[] sdoc = new int[] {-1};
- final float maxDiff = 1e-5f;
- s.search(q,new HitCollector() {
- public void collect(int doc, float score) {
- try {
- boolean more = (which[0]++&0x02)==0 ? scorer.skipTo(sdoc[0]+1) : scorer.next();
- sdoc[0] = scorer.doc();
- float scorerScore = scorer.score();
- float scoreDiff = Math.abs(score-scorerScore);
- scoreDiff=0; // TODO: remove this go get LUCENE-697 failures
- if (more==false || doc != sdoc[0] || scoreDiff>maxDiff) {
- throw new RuntimeException("ERROR matching docs:"
- +"\n\tscorer.more=" + more + " doc="+sdoc[0] + " scorerScore="+scorerScore
- +" scoreDiff="+scoreDiff + " maxDiff="+maxDiff
- +"\n\thitCollector.doc=" + doc + " score="+score
- +"\n\t Scorer=" + scorer
- +"\n\t Query=" + q
- +"\n\t Searcher=" + s
- );
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
+ final int skip_op = 0;
+ final int next_op = 1;
+ final int orders [][] = {
+ {skip_op},
+ {next_op},
+ {skip_op, next_op},
+ {next_op, skip_op},
+ {skip_op, skip_op, next_op, next_op},
+ {next_op, next_op, skip_op, skip_op},
+ {skip_op, skip_op, skip_op, next_op, next_op},
+ };
+ for (int k = 0; k < orders.length; k++) {
+ final int order[] = orders[k];
+ //System.out.print("Order:");for (int i = 0; i < order.length; i++) System.out.print(order[i]==skip_op ? " skip()":" next()"); System.out.println();
+ final int opidx[] = {0};
+
+ final Weight w = q.weight(s);
+ final Scorer scorer = w.scorer(s.getIndexReader());
+
+ if (scorer instanceof BooleanScorer || scorer instanceof BooleanScorer2) {
+ return; // TODO change this if BooleanScorers would once again guarantee docs in order.
}
- });
- // make sure next call to scorer is false.
- TestCase.assertFalse((which[0]++&0x02)==0 ? scorer.skipTo(sdoc[0]+1) : scorer.next());
+ // FUTURE: ensure scorer.doc()==-1
+
+ final int[] sdoc = new int[] {-1};
+ final float maxDiff = 1e-5f;
+ s.search(q,new HitCollector() {
+ public void collect(int doc, float score) {
+ try {
+ int op = order[(opidx[0]++)%order.length];
+ //System.out.println(op==skip_op ? "skip("+(sdoc[0]+1)+")":"next()");
+ boolean more = op==skip_op ? scorer.skipTo(sdoc[0]+1) : scorer.next();
+ sdoc[0] = scorer.doc();
+ float scorerScore = scorer.score();
+ float scoreDiff = Math.abs(score-scorerScore);
+ if (more==false || doc != sdoc[0] || scoreDiff>maxDiff) {
+ StringBuffer sbord = new StringBuffer();
+ for (int i = 0; i < order.length; i++)
+ sbord.append(order[i]==skip_op ? " skip()":" next()");
+ throw new RuntimeException("ERROR matching docs:"
+ +"\n\tscorer.more=" + more + " doc="+sdoc[0] + " scorerScore="+scorerScore
+ +" scoreDiff="+scoreDiff + " maxDiff="+maxDiff
+ +"\n\thitCollector.doc=" + doc + " score="+score
+ +"\n\t Scorer=" + scorer
+ +"\n\t Query=" + q
+ +"\n\t Searcher=" + s
+ +"\n\t Order=" + sbord
+ );
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ });
+
+ // make sure next call to scorer is false.
+ int op = order[(opidx[0]++)%order.length];
+ //System.out.println(op==skip_op ? "last: skip()":"last: next()");
+ boolean more = op==skip_op ? scorer.skipTo(sdoc[0]+1) : scorer.next();
+ TestCase.assertFalse(more);
+ }
}
}
Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java Mon Apr 23 22:32:47 2007
@@ -35,6 +35,10 @@
* @author Erik Hatcher
*/
public class TestPhraseQuery extends TestCase {
+
+ /** threshold for comparing floats */
+ public static final float SCORE_COMP_THRESH = 1e-6f;
+
private IndexSearcher searcher;
private PhraseQuery query;
private RAMDirectory directory;
@@ -57,8 +61,17 @@
doc.add(new Field("repeated", "this is a repeated field - first part", Field.Store.YES, Field.Index.TOKENIZED));
Fieldable repeatedField = new Field("repeated", "second part of a repeated field", Field.Store.YES, Field.Index.TOKENIZED);
doc.add(repeatedField);
+ doc.add(new Field("palindrome", "one two three two one", Field.Store.YES, Field.Index.TOKENIZED));
+ writer.addDocument(doc);
+
+ doc = new Document();
+ doc.add(new Field("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.TOKENIZED));
writer.addDocument(doc);
+ doc = new Document();
+ doc.add(new Field("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.TOKENIZED));
+ writer.addDocument(doc);
+
writer.optimize();
writer.close();
@@ -341,12 +354,188 @@
query.add(new Term("repeated", "part"));
query.add(new Term("repeated", "second"));
query.add(new Term("repeated", "part"));
+ query.setSlop(100);
+
+ Hits hits = searcher.search(query);
+ assertEquals("slop of 100 just right", 1, hits.length());
+ QueryUtils.check(query,searcher);
+
query.setSlop(99);
+ hits = searcher.search(query);
+ assertEquals("slop of 99 not enough", 0, hits.length());
+ QueryUtils.check(query,searcher);
+ }
+
+ // work on two docs like this: "phrase exist notexist exist found"
+ public void testNonExistingPhrase() throws IOException {
+ // phrase without repetitions that exists in 2 docs
+ query.add(new Term("nonexist", "phrase"));
+ query.add(new Term("nonexist", "notexist"));
+ query.add(new Term("nonexist", "found"));
+ query.setSlop(2); // would be found this way
+
Hits hits = searcher.search(query);
- assertEquals(0, hits.length());
+ assertEquals("phrase without repetitions exists in 2 docs", 2, hits.length());
+ QueryUtils.check(query,searcher);
+
+ // phrase with repetitions that exists in 2 docs
+ query = new PhraseQuery();
+ query.add(new Term("nonexist", "phrase"));
+ query.add(new Term("nonexist", "exist"));
+ query.add(new Term("nonexist", "exist"));
+ query.setSlop(1); // would be found
+
+ hits = searcher.search(query);
+ assertEquals("phrase with repetitions exists in two docs", 2, hits.length());
+ QueryUtils.check(query,searcher);
+
+ // phrase I with repetitions that does not exist in any doc
+ query = new PhraseQuery();
+ query.add(new Term("nonexist", "phrase"));
+ query.add(new Term("nonexist", "notexist"));
+ query.add(new Term("nonexist", "phrase"));
+ query.setSlop(1000); // would not be found no matter how high the slop is
+
+ hits = searcher.search(query);
+ assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length());
+ QueryUtils.check(query,searcher);
+
+ // phrase II with repetitions that does not exist in any doc
+ query = new PhraseQuery();
+ query.add(new Term("nonexist", "phrase"));
+ query.add(new Term("nonexist", "exist"));
+ query.add(new Term("nonexist", "exist"));
+ query.add(new Term("nonexist", "exist"));
+ query.setSlop(1000); // would not be found no matter how high the slop is
+
+ hits = searcher.search(query);
+ assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length());
+ QueryUtils.check(query,searcher);
+
+ }
+
+ /**
+ * Working on a 2 fields like this:
+ * Field("field", "one two three four five")
+ * Field("palindrome", "one two three two one")
+ * Phrase of size 2 occuriong twice, once in order and once in reverse,
+ * because doc is a palyndrome, is counted twice.
+ * Also, in this case order in query does not matter.
+ * Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.
+ */
+ public void testPalyndrome2() throws Exception {
+
+ // search on non palyndrome, find phrase with no slop, using exact phrase scorer
+ query.setSlop(0); // to use exact phrase scorer
+ query.add(new Term("field", "two"));
+ query.add(new Term("field", "three"));
+ Hits hits = searcher.search(query);
+ assertEquals("phrase found with exact phrase scorer", 1, hits.length());
+ float score0 = hits.score(0);
+ //System.out.println("(exact) field: two three: "+score0);
+ QueryUtils.check(query,searcher);
+
+ // search on non palyndrome, find phrase with slop 2, though no slop required here.
+ query.setSlop(2); // to use sloppy scorer
+ hits = searcher.search(query);
+ assertEquals("just sloppy enough", 1, hits.length());
+ float score1 = hits.score(0);
+ //System.out.println("(sloppy) field: two three: "+score1);
+ assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH);
QueryUtils.check(query,searcher);
+ // search ordered in palyndrome, find it twice
+ query = new PhraseQuery();
+ query.setSlop(2); // must be at least two for both ordered and reversed to match
+ query.add(new Term("palindrome", "two"));
+ query.add(new Term("palindrome", "three"));
+ hits = searcher.search(query);
+ assertEquals("just sloppy enough", 1, hits.length());
+ float score2 = hits.score(0);
+ //System.out.println("palindrome: two three: "+score2);
+ QueryUtils.check(query,searcher);
+
+ //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
+ //assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
+
+ // search reveresed in palyndrome, find it twice
+ query = new PhraseQuery();
+ query.setSlop(2); // must be at least two for both ordered and reversed to match
+ query.add(new Term("palindrome", "three"));
+ query.add(new Term("palindrome", "two"));
+ hits = searcher.search(query);
+ assertEquals("just sloppy enough", 1, hits.length());
+ float score3 = hits.score(0);
+ //System.out.println("palindrome: three two: "+score3);
+ QueryUtils.check(query,searcher);
+
+ //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
+ //assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
+ //assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
}
+ /**
+ * Working on a 2 fields like this:
+ * Field("field", "one two three four five")
+ * Field("palindrome", "one two three two one")
+ * Phrase of size 3 occuriong twice, once in order and once in reverse,
+ * because doc is a palyndrome, is counted twice.
+ * Also, in this case order in query does not matter.
+ * Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.
+ */
+ public void testPalyndrome3() throws Exception {
+
+ // search on non palyndrome, find phrase with no slop, using exact phrase scorer
+ query.setSlop(0); // to use exact phrase scorer
+ query.add(new Term("field", "one"));
+ query.add(new Term("field", "two"));
+ query.add(new Term("field", "three"));
+ Hits hits = searcher.search(query);
+ assertEquals("phrase found with exact phrase scorer", 1, hits.length());
+ float score0 = hits.score(0);
+ //System.out.println("(exact) field: one two three: "+score0);
+ QueryUtils.check(query,searcher);
+
+ // search on non palyndrome, find phrase with slop 3, though no slop required here.
+ query.setSlop(4); // to use sloppy scorer
+ hits = searcher.search(query);
+ assertEquals("just sloppy enough", 1, hits.length());
+ float score1 = hits.score(0);
+ //System.out.println("(sloppy) field: one two three: "+score1);
+ assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH);
+ QueryUtils.check(query,searcher);
+
+ // search ordered in palyndrome, find it twice
+ query = new PhraseQuery();
+ query.setSlop(4); // must be at least four for both ordered and reversed to match
+ query.add(new Term("palindrome", "one"));
+ query.add(new Term("palindrome", "two"));
+ query.add(new Term("palindrome", "three"));
+ hits = searcher.search(query);
+ assertEquals("just sloppy enough", 1, hits.length());
+ float score2 = hits.score(0);
+ //System.out.println("palindrome: one two three: "+score2);
+ QueryUtils.check(query,searcher);
+
+ //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
+ //assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
+
+ // search reveresed in palyndrome, find it twice
+ query = new PhraseQuery();
+ query.setSlop(4); // must be at least four for both ordered and reversed to match
+ query.add(new Term("palindrome", "three"));
+ query.add(new Term("palindrome", "two"));
+ query.add(new Term("palindrome", "one"));
+ hits = searcher.search(query);
+ assertEquals("just sloppy enough", 1, hits.length());
+ float score3 = hits.score(0);
+ //System.out.println("palindrome: three two one: "+score3);
+ QueryUtils.check(query,searcher);
+
+ //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
+ //assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
+ //assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
+ }
+
}