You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/07/18 19:10:00 UTC

svn commit: r1504531 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/

Author: jpountz
Date: Thu Jul 18 17:09:59 2013
New Revision: 1504531

URL: http://svn.apache.org/r1504531
Log:
LUCENE-4542: Make HunspellStemFilter's maximum recursion level configurable.

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1504531&r1=1504530&r2=1504531&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Jul 18 17:09:59 2013
@@ -28,6 +28,9 @@ API Changes
 * LUCENE-5114: Remove unused boolean useCache parameter from
   TermsEnum.seekCeil and .seekExact (Mike McCandless)
 
+* LUCENE-4542: HunspellStemFilter's maximum recursion level is now configurable.
+  (Piotr, Rafał Kuć via Adrien Grand)
+
 Optimizations
 
 * LUCENE-5088: Added TermFilter to filter docs by a specific term.

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java?rev=1504531&r1=1504530&r2=1504531&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java Thu Jul 18 17:09:59 2013
@@ -55,17 +55,31 @@ public final class HunspellStemFilter ex
   
   private final boolean dedup;
 
+  /** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum
+   *  recursion level of 2. 
+   *  @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
+    this(input, dictionary, 2);
+  }
+
   /**
    * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
    * HunspellDictionary
    *
    * @param input TokenStream whose tokens will be stemmed
    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
+   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
    */
-  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
-    this(input, dictionary, true);
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
+    this(input, dictionary, true, recursionCap);
   }
-  
+
+  /** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2. 
+   *  @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+    this(input, dictionary, dedup, 2);
+  }
+
   /**
    * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
    * HunspellDictionary
@@ -73,11 +87,12 @@ public final class HunspellStemFilter ex
    * @param input TokenStream whose tokens will be stemmed
    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
    * @param dedup true if only unique terms should be output.
+   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
    */
-  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
     super(input);
     this.dedup = dedup;
-    this.stemmer = new HunspellStemmer(dictionary);
+    this.stemmer = new HunspellStemmer(dictionary, recursionCap);
   }
 
   /**

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java?rev=1504531&r1=1504530&r2=1504531&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java Thu Jul 18 17:09:59 2013
@@ -54,12 +54,14 @@ public class HunspellStemFilterFactory e
   private static final String PARAM_AFFIX = "affix";
   private static final String PARAM_IGNORE_CASE = "ignoreCase";
   private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
+  private static final String PARAM_RECURSION_CAP = "recursionCap";
 
   private final String dictionaryArg;
   private final String affixFile;
   private final boolean ignoreCase;
   private final boolean strictAffixParsing;
   private HunspellDictionary dictionary;
+  private int recursionCap;
   
   /** Creates a new HunspellStemFilterFactory */
   public HunspellStemFilterFactory(Map<String,String> args) {
@@ -69,6 +71,7 @@ public class HunspellStemFilterFactory e
     affixFile = get(args, PARAM_AFFIX);
     ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
     strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true);
+    recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -111,6 +114,6 @@ public class HunspellStemFilterFactory e
    */
   @Override
   public TokenStream create(TokenStream tokenStream) {
-    return new HunspellStemFilter(tokenStream, dictionary);
+    return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java?rev=1504531&r1=1504530&r2=1504531&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java Thu Jul 18 17:09:59 2013
@@ -17,16 +17,10 @@ package org.apache.lucene.analysis.hunsp
  * limitations under the License.
  */
 
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
-import java.util.Scanner;
 
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.CharacterUtils;
@@ -37,23 +31,33 @@ import org.apache.lucene.util.Version;
  * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
  */
 public class HunspellStemmer {
-
-  private static final int RECURSION_CAP = 2;
-  
+  private final int recursionCap;
   private final HunspellDictionary dictionary;
   private final StringBuilder segment = new StringBuilder();
   private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);
 
   /**
-   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
+   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the 
+   * default recursion cap of <code>2</code> (based on Hunspell documentation). 
    *
    * @param dictionary HunspellDictionary that will be used to create the stems
    */
   public HunspellStemmer(HunspellDictionary dictionary) {
-    this.dictionary = dictionary;
+    this(dictionary, 2);
   }
 
   /**
+   * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
+   *
+   * @param dictionary HunspellDictionary that will be used to create the stems
+   * @param recursionCap maximum level of recursion stemmer can go into
+   */
+  public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
+    this.dictionary = dictionary;
+    this.recursionCap = recursionCap;
+  } 
+  
+  /**
    * Find the stem(s) of the provided word
    * 
    * @param word Word to find the stems for
@@ -194,7 +198,7 @@ public class HunspellStemmer {
       }
     }
 
-    if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) {
+    if (affix.isCrossProduct() && recursionDepth < recursionCap) {
       stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
     }
 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java?rev=1504531&r1=1504530&r2=1504531&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java Thu Jul 18 17:09:59 2013
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Tokeni
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util._TestUtil;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 
@@ -57,13 +58,13 @@ public class HunspellStemFilterTest  ext
   public void testKeywordAttribute() throws IOException {
     MockTokenizer tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true);
     tokenizer.setEnableChecks(true);
-    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY);
+    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, _TestUtil.nextInt(random(), 1, 3));
     assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
     
     // assert with keywork marker
     tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true);
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
-    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY);
+    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY, _TestUtil.nextInt(random(), 1, 3));
     assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
   }
   
@@ -74,7 +75,7 @@ public class HunspellStemFilterTest  ext
       @Override
       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, _TestUtil.nextInt(random(), 1, 3)));
       }  
     };
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -85,7 +86,7 @@ public class HunspellStemFilterTest  ext
       @Override
       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         Tokenizer tokenizer = new KeywordTokenizer(reader);
-        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, _TestUtil.nextInt(random(), 1, 3)));
       }
     };
     checkOneTermReuse(a, "", "");