You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/21 08:27:46 UTC

svn commit: r912311 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/search/ src/test/org/apache/lucene/search/

Author: rmuir
Date: Sun Feb 21 07:27:46 2010
New Revision: 912311

URL: http://svn.apache.org/viewvc?rev=912311&view=rev
Log:
LUCENE-2261: make TopTermsScoringBooleanRewrite's PQ size configurable

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java
    lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Feb 21 07:27:46 2010
@@ -135,8 +135,11 @@
 * LUCENE-2137: Switch to AtomicInteger for some ref counting (Earwin
   Burrfoot via Mike McCandless)
 
-* LUCENE-2123: Move FuzzyQuery rewrite as separate RewriteMode into
-  MTQ. (Uwe Schindler, Robert Muir, Mike McCandless)
+* LUCENE-2123, LUCENE-2261: Move FuzzyQuery rewrite to separate RewriteMode 
+  into MultiTermQuery. The number of fuzzy expansions can be specified with
+  the maxExpansions parameter to FuzzyQuery, but the default is limited to
+  BooleanQuery.maxClauseCount() as before. 
+  (Uwe Schindler, Robert Muir, Mike McCandless)
 
 * LUCENE-2135: On IndexReader.close, forcefully evict any entries from
   the FieldCache rather than waiting for the WeakHashMap to release

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java Sun Feb 21 07:27:46 2010
@@ -30,7 +30,7 @@
  * length of 0 - in this case, *every* term will be enumerated and
  * cause an edit score calculation.
  * 
- * <p>This query uses {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}
+ * <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
  * as default. So terms will be collected and scored according to their
  * edit distance. Only the top terms are used for building the {@link BooleanQuery}.
  * It is not recommended to change the rewrite mode for fuzzy queries.
@@ -39,6 +39,7 @@
   
   public final static float defaultMinSimilarity = 0.5f;
   public final static int defaultPrefixLength = 0;
+  public final static int defaultMaxExpansions = Integer.MAX_VALUE;
   
   private float minimumSimilarity;
   private int prefixLength;
@@ -59,12 +60,15 @@
    *  as the query term is considered similar to the query term if the edit distance
    *  between both terms is less than <code>length(term)*0.5</code>
    * @param prefixLength length of common (non-fuzzy) prefix
+   * @param maxExpansions the maximum number of terms to match. If this number is
+   *  greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten, 
+   *  then the maxClauseCount will be used instead.
    * @throws IllegalArgumentException if minimumSimilarity is &gt;= 1 or &lt; 0
    * or if prefixLength &lt; 0
    */
-  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
+  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
+      int maxExpansions) {
     this.term = term;
-    setRewriteMethod(TOP_TERMS_SCORING_BOOLEAN_REWRITE);
     
     if (minimumSimilarity >= 1.0f)
       throw new IllegalArgumentException("minimumSimilarity >= 1");
@@ -72,6 +76,10 @@
       throw new IllegalArgumentException("minimumSimilarity < 0");
     if (prefixLength < 0)
       throw new IllegalArgumentException("prefixLength < 0");
+    if (maxExpansions < 0)
+      throw new IllegalArgumentException("maxExpansions < 0");
+    
+    setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
     
     if (term.text().length() > 1.0f / (1.0f - minimumSimilarity)) {
       this.termLongEnough = true;
@@ -82,17 +90,24 @@
   }
   
   /**
-   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
+   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, prefixLength, Integer.MAX_VALUE)}.
+   */
+  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
+    this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
+  }
+  
+  /**
+   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0, Integer.MAX_VALUE)}.
    */
-  public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
-    this(term, minimumSimilarity, defaultPrefixLength);
+  public FuzzyQuery(Term term, float minimumSimilarity) {
+    this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
   }
 
   /**
-   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
+   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0, Integer.MAX_VALUE)}.
    */
   public FuzzyQuery(Term term) {
-    this(term, defaultMinSimilarity, defaultPrefixLength);
+    this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
   }
   
   /**

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQuery.java Sun Feb 21 07:27:46 2010
@@ -20,7 +20,6 @@
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.PriorityQueue;
 
 import org.apache.lucene.index.IndexReader;
@@ -53,7 +52,7 @@
  * computing unhelpful scores, and it tries to pick the most
  * performant rewrite method given the query. If you
  * need scoring (like {@link FuzzyQuery}, use
- * {@link #TOP_TERMS_SCORING_BOOLEAN_REWRITE} which uses
+ * {@link TopTermsScoringBooleanQueryRewrite} which uses
  * a priority queue to only collect competitive terms
  * and not hit this limitation.
  *
@@ -163,10 +162,41 @@
    *  @see #setRewriteMethod */
   public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
 
-  private static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
+  /** A rewrite method that first translates each term into
+   *  {@link BooleanClause.Occur#SHOULD} clause in a
+   *  BooleanQuery, and keeps the scores as computed by the
+   *  query.
+   *
+   * <p>This rewrite mode only uses the top scoring terms
+   * so it will not overflow the boolean max clause count.
+   * It is the default rewrite mode for {@link FuzzyQuery}.
+   *
+   *  @see #setRewriteMethod */
+  public static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
+    private final int size;
+    
+    /** 
+     * Create a TopTermsScoringBooleanQueryRewrite for 
+     * at most <code>size</code> terms.
+     * <p>
+     * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than 
+     * <code>size</code>, then it will be used instead. 
+     */
+    public TopTermsScoringBooleanQueryRewrite(int size) {
+      this.size = size;
+    }
+    
+    /** 
+     * Create a TopTermsScoringBooleanQueryRewrite that is limited
+     * to at most {@link BooleanQuery#getMaxClauseCount} terms. 
+     */
+    public TopTermsScoringBooleanQueryRewrite() {
+      this(Integer.MAX_VALUE);
+    }
+    
     @Override
     public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
-      final int maxSize = BooleanQuery.getMaxClauseCount();
+      final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
       final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
       collectTerms(reader, query, new TermCollector() {
         public boolean collect(Term t, float boost) {
@@ -195,10 +225,23 @@
       query.incTotalNumberOfTerms(bq.clauses().size());
       return bq;
     }
+  
+    @Override
+    public int hashCode() {
+      final int prime = 17;
+      int result = 1;
+      result = prime * result + size;
+      return result;
+    }
 
-    // Make sure we are still a singleton even after deserializing
-    protected Object readResolve() {
-      return TOP_TERMS_SCORING_BOOLEAN_REWRITE;
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) return true;
+      if (obj == null) return false;
+      if (getClass() != obj.getClass()) return false;
+      TopTermsScoringBooleanQueryRewrite other = (TopTermsScoringBooleanQueryRewrite) obj;
+      if (size != other.size) return false;
+      return true;
     }
   
     private static class ScoreTerm implements Comparable<ScoreTerm> {
@@ -213,18 +256,6 @@
       }
     }
   }
-  
-  /** A rewrite method that first translates each term into
-   *  {@link BooleanClause.Occur#SHOULD} clause in a
-   *  BooleanQuery, and keeps the scores as computed by the
-   *  query.
-   *
-   * <p>This rewrite mode only uses the top scoring terms
-   * so it will not overflow the boolean max clause count.
-   * It is the default rewrite mode for {@link FuzzyQuery}.
-   *
-   *  @see #setRewriteMethod */
-  public final static RewriteMethod TOP_TERMS_SCORING_BOOLEAN_REWRITE = new TopTermsScoringBooleanQueryRewrite();
 
   private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
     @Override

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java Sun Feb 21 07:27:46 2010
@@ -89,22 +89,16 @@
       assertEquals(order.get(i), term);
     }
 
-    // test BooleanQuery.maxClauseCount
-    int savedClauseCount = BooleanQuery.getMaxClauseCount();
-    try {
-      BooleanQuery.setMaxClauseCount(2);
-      // This query would normally return 3 documents, because 3 terms match (see above):
-      query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);   
-      hits = searcher.search(query, null, 1000).scoreDocs;
-      assertEquals("only 2 documents should match", 2, hits.length);
-      order = Arrays.asList("bbbbb","abbbb");
-      for (int i = 0; i < hits.length; i++) {
-        final String term = searcher.doc(hits[i].doc).get("field");
-        //System.out.println(hits[i].score);
-        assertEquals(order.get(i), term);
-      }
-    } finally {
-      BooleanQuery.setMaxClauseCount(savedClauseCount);
+    // test pq size by supplying maxExpansions=2
+    // This query would normally return 3 documents, because 3 terms match (see above):
+    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2); 
+    hits = searcher.search(query, null, 1000).scoreDocs;
+    assertEquals("only 2 documents should match", 2, hits.length);
+    order = Arrays.asList("bbbbb","abbbb");
+    for (int i = 0; i < hits.length; i++) {
+      final String term = searcher.doc(hits[i].doc).get("field");
+      //System.out.println(hits[i].score);
+      assertEquals(order.get(i), term);
     }
 
     // not similar enough:

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java?rev=912311&r1=912310&r2=912311&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestTermRangeQuery.java Sun Feb 21 07:27:46 2010
@@ -111,7 +111,7 @@
   }
   
   private void checkBooleanTerms(Searcher searcher, TermRangeQuery query, String... terms) throws IOException {
-    query.setRewriteMethod(MultiTermQuery.TOP_TERMS_SCORING_BOOLEAN_REWRITE);
+    query.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite());
     final BooleanQuery bq = (BooleanQuery) searcher.rewrite(query);
     final Set<String> allowedTerms = new HashSet<String>(Arrays.asList(terms));
     assertEquals(allowedTerms.size(), bq.clauses().size());