You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mh...@apache.org on 2015/05/20 13:08:32 UTC
svn commit: r1680522 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/index/
core/src/java/org/apache/lucene/search/
core/src/test/org/apache/lucene/search/
Author: mharwood
Date: Wed May 20 11:08:32 2015
New Revision: 1680522
URL: http://svn.apache.org/r1680522
Log:
LUCENE-329: Fix FuzzyQuery defaults to rank exact matches highest
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TermContext.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1680522&r1=1680521&r2=1680522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed May 20 11:08:32 2015
@@ -135,6 +135,8 @@ Optimizations
like a disjunction. (Adrien Grand)
Bug Fixes
+* LUCENE-329: Fix FuzzyQuery defaults to rank exact matches highest.
+ (Mark Harwood, Adrien Grand)
* LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
(Varun Thacker, Adrien Grand, Mike McCandless)
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TermContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TermContext.java?rev=1680522&r1=1680521&r2=1680522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TermContext.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/TermContext.java Wed May 20 11:08:32 2015
@@ -117,16 +117,31 @@ public final class TermContext {
* should be derived from a {@link IndexReaderContext}'s leaf ord.
*/
public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) {
+ register(state, ord);
+ accumulateStatistics(docFreq, totalTermFreq);
+ }
+
+ /**
+ * Expert: Registers and associates a {@link TermState} with an leaf ordinal. The
+ * leaf ordinal should be derived from a {@link IndexReaderContext}'s leaf ord.
+ * On the contrary to {@link #register(TermState, int, int, long)} this method
+ * does NOT update term statistics.
+ */
+ public void register(TermState state, final int ord) {
assert state != null : "state must not be null";
assert ord >= 0 && ord < states.length;
assert states[ord] == null : "state for ord: " + ord
+ " already registered";
+ states[ord] = state;
+ }
+
+ /** Expert: Accumulate term statistics. */
+ public void accumulateStatistics(final int docFreq, final long totalTermFreq) {
this.docFreq += docFreq;
if (this.totalTermFreq >= 0 && totalTermFreq >= 0)
this.totalTermFreq += totalTermFreq;
else
this.totalTermFreq = -1;
- states[ord] = state;
}
/**
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java?rev=1680522&r1=1680521&r2=1680522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java Wed May 20 11:08:32 2015
@@ -98,7 +98,7 @@ public class FuzzyQuery extends MultiTer
this.prefixLength = prefixLength;
this.transpositions = transpositions;
this.maxExpansions = maxExpansions;
- setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
+ setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions));
}
/**
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java?rev=1680522&r1=1680521&r2=1680522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java Wed May 20 11:08:32 2015
@@ -18,13 +18,16 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
+import java.util.List;
import java.util.Objects;
import org.apache.lucene.index.FilteredTermsEnum; // javadocs
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SingleTermsEnum; // javadocs
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
@@ -113,7 +116,7 @@ public abstract class MultiTermQuery ext
*
* @see #setRewriteMethod */
public final static RewriteMethod SCORING_BOOLEAN_REWRITE = ScoringRewrite.SCORING_BOOLEAN_REWRITE;
-
+
/** Like {@link #SCORING_BOOLEAN_REWRITE} except
* scores are not computed. Instead, each matching
* document receives a constant score equal to the
@@ -171,6 +174,106 @@ public abstract class MultiTermQuery ext
/**
* A rewrite method that first translates each term into
+ * {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but adjusts
+ * the frequencies used for scoring to be blended across the terms, otherwise
+ * the rarest term typically ranks highest (often not useful eg in the set of
+ * expanded terms in a FuzzyQuery).
+ *
+ * <p>
+ * This rewrite method only uses the top scoring terms so it will not overflow
+ * the boolean max clause count.
+ *
+ * @see #setRewriteMethod
+ */
+ public static final class TopTermsBlendedFreqScoringRewrite extends
+ TopTermsRewrite<BooleanQuery> {
+
+ /**
+ * Create a TopTermsBlendedScoringBooleanQueryRewrite for at most
+ * <code>size</code> terms.
+ * <p>
+ * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
+ * <code>size</code>, then it will be used instead.
+ */
+ public TopTermsBlendedFreqScoringRewrite(int size) {
+ super(size);
+ }
+
+ @Override
+ protected int getMaxSize() {
+ return BooleanQuery.getMaxClauseCount();
+ }
+
+ @Override
+ protected BooleanQuery getTopLevelQuery() {
+ return new BooleanQuery(true);
+ }
+
+ @Override
+ protected void addClause(BooleanQuery topLevel, Term term, int docCount,
+ float boost, TermContext states) {
+ final TermQuery tq = new TermQuery(term, states);
+ tq.setBoost(boost);
+ topLevel.add(tq, BooleanClause.Occur.SHOULD);
+ }
+
+ @Override
+ void adjustScoreTerms(IndexReader reader, ScoreTerm[] scoreTerms) {
+ if (scoreTerms.length <= 1) {
+ return;
+ }
+ int maxDoc = reader.maxDoc();
+ int maxDf = 0;
+ long maxTtf = 0;
+ for (ScoreTerm scoreTerm : scoreTerms) {
+ TermContext ctx = scoreTerm.termState;
+ int df = ctx.docFreq();
+ maxDf = Math.max(df, maxDf);
+ long ttf = ctx.totalTermFreq();
+ maxTtf = ttf == -1 || maxTtf == -1 ? -1 : Math.max(ttf, maxTtf);
+ }
+
+ assert maxDf >= 0 : "DF must be >= 0";
+ if (maxDf == 0) {
+ return; // we are done that term doesn't exist at all
+ }
+ assert (maxTtf == -1) || (maxTtf >= maxDf);
+
+ for (int i = 0; i < scoreTerms.length; i++) {
+ TermContext ctx = scoreTerms[i].termState;
+ ctx = adjustFrequencies(ctx, maxDf, maxTtf);
+
+ ScoreTerm adjustedScoreTerm = new ScoreTerm(ctx);
+ adjustedScoreTerm.boost = scoreTerms[i].boost;
+ adjustedScoreTerm.bytes.copyBytes(scoreTerms[i].bytes);
+ scoreTerms[i] = adjustedScoreTerm;
+ }
+ }
+ }
+
+ private static TermContext adjustFrequencies(TermContext ctx, int artificialDf,
+ long artificialTtf) {
+ List<LeafReaderContext> leaves = ctx.topReaderContext.leaves();
+ final int len;
+ if (leaves == null) {
+ len = 1;
+ } else {
+ len = leaves.size();
+ }
+ TermContext newCtx = new TermContext(ctx.topReaderContext);
+ for (int i = 0; i < len; ++i) {
+ TermState termState = ctx.get(i);
+ if (termState == null) {
+ continue;
+ }
+ newCtx.register(termState, i);
+ }
+ newCtx.accumulateStatistics(artificialDf, artificialTtf);
+ return newCtx;
+ }
+
+ /**
+ * A rewrite method that first translates each term into
* {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but the scores
* are only computed as the boost.
* <p>
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java?rev=1680522&r1=1680521&r2=1680522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java Wed May 20 11:08:32 2015
@@ -18,10 +18,10 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
+import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
-import java.util.Comparator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
@@ -158,14 +158,19 @@ public abstract class TopTermsRewrite<Q
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
+ adjustScoreTerms(reader, scoreTerms);
+
for (final ScoreTerm st : scoreTerms) {
final Term term = new Term(query.field, st.bytes.toBytesRef());
- assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term;
addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
}
return q;
}
+ void adjustScoreTerms(IndexReader reader, ScoreTerm[] scoreTerms) {
+ //no-op but allows subclasses the ability to tweak the score terms used in ranking e.g. balancing IDF.
+ }
+
@Override
public int hashCode() {
return 31 * size;
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java?rev=1680522&r1=1680521&r2=1680522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java Wed May 20 11:08:32 2015
@@ -17,9 +17,9 @@ package org.apache.lucene.search;
* limitations under the License.
*/
-import java.util.List;
-import java.util.Arrays;
import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
@@ -28,7 +28,10 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@@ -241,6 +244,90 @@ public class TestFuzzyQuery extends Luce
directory.close();
}
+ public void testSingleQueryExactMatchScoresHighest() throws Exception {
+ //See issue LUCENE-329 - IDF shouldn't wreck similarity ranking
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ addDoc("smith", writer);
+ addDoc("smith", writer);
+ addDoc("smith", writer);
+ addDoc("smith", writer);
+ addDoc("smith", writer);
+ addDoc("smith", writer);
+ addDoc("smythe", writer);
+ addDoc("smdssasd", writer);
+
+ IndexReader reader = writer.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ searcher.setSimilarity(new DefaultSimilarity()); //avoid randomisation of similarity algo by test framework
+ writer.close();
+ String searchTerms[] = { "smith", "smythe", "smdssasd" };
+ for (String searchTerm : searchTerms) {
+ FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1);
+ ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
+ StoredDocument bestDoc = searcher.doc(hits[0].doc);
+ assertTrue(hits.length > 0);
+ String topMatch = bestDoc.get("field");
+ assertEquals(searchTerm, topMatch);
+ if (hits.length > 1) {
+ StoredDocument worstDoc = searcher.doc(hits[hits.length - 1].doc);
+ String worstMatch = worstDoc.get("field");
+ assertNotSame(searchTerm, worstMatch);
+ }
+ }
+ reader.close();
+ directory.close();
+ }
+
+ public void testMultipleQueriesIdfWorks() throws Exception {
+ // With issue LUCENE-329 - it could be argued a MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite
+ // is the solution as it disables IDF.
+ // However - IDF is still useful as in this case where there are multiple FuzzyQueries.
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+
+ addDoc("michael smith", writer);
+ addDoc("michael lucero", writer);
+ addDoc("doug cutting", writer);
+ addDoc("doug cuttin", writer);
+ addDoc("michael wardle", writer);
+ addDoc("micheal vegas", writer);
+ addDoc("michael lydon", writer);
+
+ IndexReader reader = writer.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ searcher.setSimilarity(new DefaultSimilarity()); //avoid randomisation of similarity algo by test framework
+
+ writer.close();
+
+ BooleanQuery query = new BooleanQuery();
+ String commonSearchTerm = "michael";
+ FuzzyQuery commonQuery = new FuzzyQuery(new Term("field", commonSearchTerm), 2, 1);
+ query.add(commonQuery, Occur.SHOULD);
+
+ String rareSearchTerm = "cutting";
+ FuzzyQuery rareQuery = new FuzzyQuery(new Term("field", rareSearchTerm), 2, 1);
+ query.add(rareQuery, Occur.SHOULD);
+ ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
+
+ // Matches on the rare surname should be worth more than matches on the common forename
+ assertEquals(7, hits.length);
+ StoredDocument bestDoc = searcher.doc(hits[0].doc);
+ String topMatch = bestDoc.get("field");
+ assertTrue(topMatch.contains(rareSearchTerm));
+
+ StoredDocument runnerUpDoc = searcher.doc(hits[1].doc);
+ String runnerUpMatch = runnerUpDoc.get("field");
+ assertTrue(runnerUpMatch.contains("cuttin"));
+
+ StoredDocument worstDoc = searcher.doc(hits[hits.length - 1].doc);
+ String worstMatch = worstDoc.get("field");
+ assertTrue(worstMatch.contains("micheal")); //misspelling of common name
+
+ reader.close();
+ directory.close();
+ }
+
/**
* MultiTermQuery provides (via attribute) information about which values
* must be competitive to enter the priority queue.