You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/21 08:27:46 UTC
svn commit: r912311 - in /lucene/java/trunk: ./
src/java/org/apache/lucene/search/ src/test/org/apache/lucene/search/
Author: rmuir
Date: Sun Feb 21 07:27:46 2010
New Revision: 912311
URL: http://svn.apache.org/viewvc?rev=912311&view=rev
Log:
LUCENE-2261: make TopTermsScoringBooleanRewrite's PQ size configurable
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java
lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java
lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java
lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Feb 21 07:27:46 2010
@@ -135,8 +135,11 @@
* LUCENE-2137: Switch to AtomicInteger for some ref counting (Earwin
Burrfoot via Mike McCandless)
-* LUCENE-2123: Move FuzzyQuery rewrite as separate RewriteMode into
- MTQ. (Uwe Schindler, Robert Muir, Mike McCandless)
+* LUCENE-2123, LUCENE-2261: Move FuzzyQuery rewrite to separate RewriteMode
+ into MultiTermQuery. The number of fuzzy expansions can be specified with
+ the maxExpansions parameter to FuzzyQuery, but the default is limited to
+ BooleanQuery.getMaxClauseCount() as before.
+ (Uwe Schindler, Robert Muir, Mike McCandless)
* LUCENE-2135: On IndexReader.close, forcefully evict any entries from
the FieldCache rather than waiting for the WeakHashMap to release
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java Sun Feb 21 07:27:46 2010
@@ -30,7 +30,7 @@
* length of 0 - in this case, *every* term will be enumerated and
* cause an edit score calculation.
*
- * <p>This query uses {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}
+ * <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
* as default. So terms will be collected and scored according to their
* edit distance. Only the top terms are used for building the {@link BooleanQuery}.
* It is not recommended to change the rewrite mode for fuzzy queries.
@@ -39,6 +39,7 @@
public final static float defaultMinSimilarity = 0.5f;
public final static int defaultPrefixLength = 0;
+ public final static int defaultMaxExpansions = Integer.MAX_VALUE;
private float minimumSimilarity;
private int prefixLength;
@@ -59,12 +60,15 @@
* as the query term is considered similar to the query term if the edit distance
* between both terms is less than <code>length(term)*0.5</code>
* @param prefixLength length of common (non-fuzzy) prefix
+ * @param maxExpansions the maximum number of terms to match. If this number is
+ * greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
+ * then the maxClauseCount will be used instead.
* @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0
* or if prefixLength < 0
*/
- public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
+ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
+ int maxExpansions) {
this.term = term;
- setRewriteMethod(TOP_TERMS_SCORING_BOOLEAN_REWRITE);
if (minimumSimilarity >= 1.0f)
throw new IllegalArgumentException("minimumSimilarity >= 1");
@@ -72,6 +76,10 @@
throw new IllegalArgumentException("minimumSimilarity < 0");
if (prefixLength < 0)
throw new IllegalArgumentException("prefixLength < 0");
+ if (maxExpansions < 0)
+ throw new IllegalArgumentException("maxExpansions < 0");
+
+ setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
if (term.text().length() > 1.0f / (1.0f - minimumSimilarity)) {
this.termLongEnough = true;
@@ -82,17 +90,24 @@
}
/**
- * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
+ * Calls {@link #FuzzyQuery(Term, float, int, int) FuzzyQuery(term, minimumSimilarity, prefixLength, Integer.MAX_VALUE)}.
+ */
+ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
+ this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
+ }
+
+ /**
+ * Calls {@link #FuzzyQuery(Term, float, int, int) FuzzyQuery(term, minimumSimilarity, 0, Integer.MAX_VALUE)}.
*/
- public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
- this(term, minimumSimilarity, defaultPrefixLength);
+ public FuzzyQuery(Term term, float minimumSimilarity) {
+ this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
}
/**
- * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
+ * Calls {@link #FuzzyQuery(Term, float, int, int) FuzzyQuery(term, 0.5f, 0, Integer.MAX_VALUE)}.
*/
public FuzzyQuery(Term term) {
- this(term, defaultMinSimilarity, defaultPrefixLength);
+ this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
}
/**
Modified: lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java Sun Feb 21 07:27:46 2010
@@ -20,7 +20,6 @@
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
-import java.util.Collection;
import java.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
@@ -53,7 +52,7 @@
* computing unhelpful scores, and it tries to pick the most
* performant rewrite method given the query. If you
* need scoring (like {@link FuzzyQuery}, use
- * {@link #TOP_TERMS_SCORING_BOOLEAN_REWRITE} which uses
+ * {@link TopTermsScoringBooleanQueryRewrite} which uses
* a priority queue to only collect competitive terms
* and not hit this limitation.
*
@@ -163,10 +162,41 @@
* @see #setRewriteMethod */
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
- private static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
+ /** A rewrite method that first translates each term into
+ * a {@link BooleanClause.Occur#SHOULD} clause in a
+ * BooleanQuery, and keeps the scores as computed by the
+ * query.
+ *
+ * <p>This rewrite mode only uses the top scoring terms
+ * so it will not overflow the boolean max clause count.
+ * It is the default rewrite mode for {@link FuzzyQuery}.
+ *
+ * @see #setRewriteMethod */
+ public static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
+ private final int size;
+
+ /**
+ * Create a TopTermsScoringBooleanQueryRewrite for
+ * at most <code>size</code> terms.
+ * <p>
+ * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
+ * <code>size</code>, then it will be used instead.
+ */
+ public TopTermsScoringBooleanQueryRewrite(int size) {
+ this.size = size;
+ }
+
+ /**
+ * Create a TopTermsScoringBooleanQueryRewrite that is limited
+ * to at most {@link BooleanQuery#getMaxClauseCount} terms.
+ */
+ public TopTermsScoringBooleanQueryRewrite() {
+ this(Integer.MAX_VALUE);
+ }
+
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
- final int maxSize = BooleanQuery.getMaxClauseCount();
+ final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
collectTerms(reader, query, new TermCollector() {
public boolean collect(Term t, float boost) {
@@ -195,10 +225,23 @@
query.incTotalNumberOfTerms(bq.clauses().size());
return bq;
}
+
+ @Override
+ public int hashCode() {
+ final int prime = 17;
+ int result = 1;
+ result = prime * result + size;
+ return result;
+ }
- // Make sure we are still a singleton even after deserializing
- protected Object readResolve() {
- return TOP_TERMS_SCORING_BOOLEAN_REWRITE;
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (obj == null) return false;
+ if (getClass() != obj.getClass()) return false;
+ TopTermsScoringBooleanQueryRewrite other = (TopTermsScoringBooleanQueryRewrite) obj;
+ if (size != other.size) return false;
+ return true;
}
private static class ScoreTerm implements Comparable<ScoreTerm> {
@@ -213,18 +256,6 @@
}
}
}
-
- /** A rewrite method that first translates each term into
- * {@link BooleanClause.Occur#SHOULD} clause in a
- * BooleanQuery, and keeps the scores as computed by the
- * query.
- *
- * <p>This rewrite mode only uses the top scoring terms
- * so it will not overflow the boolean max clause count.
- * It is the default rewrite mode for {@link FuzzyQuery}.
- *
- * @see #setRewriteMethod */
- public final static RewriteMethod TOP_TERMS_SCORING_BOOLEAN_REWRITE = new TopTermsScoringBooleanQueryRewrite();
private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
@Override
Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java Sun Feb 21 07:27:46 2010
@@ -89,22 +89,16 @@
assertEquals(order.get(i), term);
}
- // test BooleanQuery.maxClauseCount
- int savedClauseCount = BooleanQuery.getMaxClauseCount();
- try {
- BooleanQuery.setMaxClauseCount(2);
- // This query would normally return 3 documents, because 3 terms match (see above):
- query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("only 2 documents should match", 2, hits.length);
- order = Arrays.asList("bbbbb","abbbb");
- for (int i = 0; i < hits.length; i++) {
- final String term = searcher.doc(hits[i].doc).get("field");
- //System.out.println(hits[i].score);
- assertEquals(order.get(i), term);
- }
- } finally {
- BooleanQuery.setMaxClauseCount(savedClauseCount);
+ // test pq size by supplying maxExpansions=2
+ // This query would normally return 3 documents, because 3 terms match (see above):
+ query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals("only 2 documents should match", 2, hits.length);
+ order = Arrays.asList("bbbbb","abbbb");
+ for (int i = 0; i < hits.length; i++) {
+ final String term = searcher.doc(hits[i].doc).get("field");
+ //System.out.println(hits[i].score);
+ assertEquals(order.get(i), term);
}
// not similar enough:
Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java Sun Feb 21 07:27:46 2010
@@ -111,7 +111,7 @@
}
private void checkBooleanTerms(Searcher searcher, TermRangeQuery query, String... terms) throws IOException {
- query.setRewriteMethod(MultiTermQuery.TOP_TERMS_SCORING_BOOLEAN_REWRITE);
+ query.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite());
final BooleanQuery bq = (BooleanQuery) searcher.rewrite(query);
final Set<String> allowedTerms = new HashSet<String>(Arrays.asList(terms));
assertEquals(allowedTerms.size(), bq.clauses().size());