You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/02 03:08:17 UTC
svn commit: r1573251 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/

Author: rmuir
Date: Sun Mar  2 02:08:16 2014
New Revision: 1573251

URL: http://svn.apache.org/r1573251
Log:
LUCENE-5483: fix hunspell inaccuracies

Added:
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestComplexPrefix.java
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestComplexPrefix.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCondition.java
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCondition.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFlagLong.java
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFlagLong.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFlagNum.java
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFlagNum.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTwoSuffixes.java
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTwoSuffixes.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/complexprefix.aff
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/complexprefix.aff
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/complexprefix.dic
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/complexprefix.dic
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/condition.aff
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/condition.aff
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/condition.dic
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/condition.dic
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flaglong.aff
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flaglong.aff
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flaglong.dic
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flaglong.dic
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagnum.aff
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagnum.aff
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagnum.dic
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagnum.dic
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/twosuffixes.aff
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/twosuffixes.aff
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/twosuffixes.dic
      - copied unchanged from r1573248, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/twosuffixes.dic
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestMorph.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTwoFold.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Sun Mar  2 02:08:16 2014
@@ -56,6 +56,12 @@ Bug fixes
 * LUCENE-5481: IndexWriter.forceMerge used to run a merge even if there was a
   single segment in the index. (Adrien Grand, Mike McCandless)
 
+* LUCENE-5483: Fix inaccuracies in HunspellStemFilter. Multi-stage affix-stripping,
+  prefix-suffix dependencies, and COMPLEXPREFIXES now work correctly according
+  to the hunspell algorithm. Removed recursionCap parameter, as it's no longer needed: rules for
+  recursive affix application are driven correctly by continuation classes in the affix file.
+  (Robert Muir)
+
 Test Framework
 
 * LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Sun Mar  2 02:08:16 2014
@@ -65,6 +65,7 @@ public class Dictionary {
   private static final String PREFIX_KEY = "PFX";
   private static final String SUFFIX_KEY = "SFX";
   private static final String FLAG_KEY = "FLAG";
+  private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
 
   private static final String NUM_FLAG_TYPE = "num";
   private static final String UTF8_FLAG_TYPE = "UTF-8";
@@ -104,6 +105,7 @@ public class Dictionary {
   private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable?
   
   boolean ignoreCase;
+  boolean complexPrefixes;
   
   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
@@ -131,9 +133,10 @@ public class Dictionary {
    */
   public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
     this.ignoreCase = ignoreCase;
-    BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
-    buffered.mark(8192);
-    String encoding = getDictionaryEncoding(affix);
+    // hungarian has thousands of AF before the SET, so a 32k buffer is needed 
+    BufferedInputStream buffered = new BufferedInputStream(affix, 32768);
+    buffered.mark(32768);
+    String encoding = getDictionaryEncoding(buffered);
     buffered.reset();
     CharsetDecoder decoder = getJavaEncoding(encoding);
     readAffixFile(buffered, decoder);
@@ -235,6 +238,8 @@ public class Dictionary {
         // Assume that the FLAG line comes before any prefix or suffixes
         // Store the strategy so it can be used when parsing the dic file
         flagParsingStrategy = getFlagParsingStrategy(line);
+      } else if (line.equals(COMPLEXPREFIXES_KEY)) {
+        complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
       }
     }
     

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java Sun Mar  2 02:08:16 2014
@@ -58,29 +58,16 @@ public final class HunspellStemFilter ex
   private final boolean dedup;
   private final boolean longestOnly;
 
-  /** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum
-   *  recursion level of 2. 
-   *  @see #HunspellStemFilter(TokenStream, Dictionary, int) */
+  /** Create a {@link HunspellStemFilter} outputting all possible stems.
+   *  @see #HunspellStemFilter(TokenStream, Dictionary, boolean) */
   public HunspellStemFilter(TokenStream input, Dictionary dictionary) {
-    this(input, dictionary, 2);
+    this(input, dictionary, true);
   }
 
-  /**
-   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
-   * Dictionary
-   *
-   * @param input TokenStream whose tokens will be stemmed
-   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
-   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
-   */
-  public HunspellStemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
-    this(input, dictionary, true, recursionCap);
-  }
-
-  /** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2. 
-   *  @see #HunspellStemFilter(TokenStream, Dictionary, boolean, int) */
+  /** Create a {@link HunspellStemFilter} outputting all possible stems. 
+   *  @see #HunspellStemFilter(TokenStream, Dictionary, boolean, boolean) */
   public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
-    this(input, dictionary, dedup, 2);
+    this(input, dictionary, dedup, false);
   }
   
   /**
@@ -89,27 +76,12 @@ public final class HunspellStemFilter ex
    *
    * @param input TokenStream whose tokens will be stemmed
    * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
-   * @param dedup true if only unique terms should be output.
-   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
-   */
-  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
-    this(input, dictionary, dedup, recursionCap, false);
-  }
-
-  /**
-   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
-   * Dictionary
-   *
-   * @param input TokenStream whose tokens will be stemmed
-   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
-   * @param dedup true if only unique terms should be output.
-   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
    * @param longestOnly true if only the longest term should be output.
    */
-  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap, boolean longestOnly) {
+  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup,  boolean longestOnly) {
     super(input);
     this.dedup = dedup && longestOnly == false; // don't waste time deduping if longestOnly is set
-    this.stemmer = new Stemmer(dictionary, recursionCap);
+    this.stemmer = new Stemmer(dictionary);
     this.longestOnly = longestOnly;
   }
 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java Sun Mar  2 02:08:16 2014
@@ -57,7 +57,6 @@ public class HunspellStemFilterFactory e
   private final boolean ignoreCase;
   private final boolean longestOnly;
   private Dictionary dictionary;
-  private int recursionCap;
   
   /** Creates a new HunspellStemFilterFactory */
   public HunspellStemFilterFactory(Map<String,String> args) {
@@ -65,11 +64,14 @@ public class HunspellStemFilterFactory e
     dictionaryFiles = require(args, PARAM_DICTIONARY);
     affixFile = get(args, PARAM_AFFIX);
     ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
-    recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
     longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
     // this isnt necessary: we properly load all dictionaries.
     // but recognize and ignore for back compat
     getBoolean(args, "strictAffixParsing", true);
+    // this isn't necessary: multi-stage stripping is fixed and 
+    // flags like COMPLEXPREFIXES in the data itself control this.
+    // but recognize and ignore for back compat
+    getInt(args, "recursionCap", 0);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -100,6 +102,6 @@ public class HunspellStemFilterFactory e
 
   @Override
   public TokenStream create(TokenStream tokenStream) {
-    return new HunspellStemFilter(tokenStream, dictionary, true, recursionCap, longestOnly);
+    return new HunspellStemFilter(tokenStream, dictionary, true, longestOnly);
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Sun Mar  2 02:08:16 2014
@@ -36,7 +36,6 @@ import org.apache.lucene.util.Version;
  * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
  */
 final class Stemmer {
-  private final int recursionCap;
   private final Dictionary dictionary;
   private final BytesRef scratch = new BytesRef();
   private final StringBuilder segment = new StringBuilder();
@@ -44,25 +43,13 @@ final class Stemmer {
   private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
 
   /**
-   * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the 
-   * default recursion cap of <code>2</code> (based on Hunspell documentation). 
-   *
-   * @param dictionary Dictionary that will be used to create the stems
-   */
-  public Stemmer(Dictionary dictionary) {
-    this(dictionary, 2);
-  }
-
-  /**
    * Constructs a new Stemmer which will use the provided Dictionary to create its stems.
    *
    * @param dictionary Dictionary that will be used to create the stems
-   * @param recursionCap maximum level of recursion stemmer can go into
    */
-  public Stemmer(Dictionary dictionary, int recursionCap) {
+  public Stemmer(Dictionary dictionary) {
     this.dictionary = dictionary;
     this.affixReader = new ByteArrayDataInput(dictionary.affixData);
-    this.recursionCap = recursionCap;
   } 
   
   /**
@@ -94,7 +81,7 @@ final class Stemmer {
         stems.add(new CharsRef(word, 0, length));
       }
     }
-    stems.addAll(stem(word, length, Dictionary.NOFLAGS, 0));
+    stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false));
     return stems;
   }
   
@@ -126,62 +113,116 @@ final class Stemmer {
    * Generates a list of stems for the provided word
    *
    * @param word Word to generate the stems for
-   * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
-   * @param recursionDepth Level of recursion this stemming step is at
+   * @param previous previous affix that was removed (so we don't remove the same one twice)
+   * @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
+   * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix, it's also checked against the word
+   * @param recursionDepth current recursion depth
+   * @param doPrefix true if we should remove prefixes
+   * @param doSuffix true if we should remove suffixes
+   * @param previousWasPrefix true if the previous removal was a prefix:
+   *        if we are removing a suffix, and it has no continuation requirements, it's ok.
+   *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. 
    * @return List of stems, or empty list if no stems are found
    */
-  private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
+  private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix) {
+    
     // TODO: allow this stuff to be reused by tokenfilter
     List<CharsRef> stems = new ArrayList<CharsRef>();
-
-    for (int i = 0; i < length; i++) {
-      IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
-      if (suffixes == null) {
-        continue;
-      }
-
-      for (int j = 0; j < suffixes.length; j++) {
-        int suffix = suffixes.ints[suffixes.offset + j];
-        affixReader.setPosition(8 * suffix);
-        char flag = (char) (affixReader.readShort() & 0xffff);
-        if (hasCrossCheckedFlag(flag, flags)) {
-          int appendLength = length - i;
-          int deAffixedLength = length - appendLength;
-          // TODO: can we do this in-place?
+    
+    if (doPrefix) {
+      for (int i = length - 1; i >= 0; i--) {
+        IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
+        if (prefixes == null) {
+          continue;
+        }
+        
+        for (int j = 0; j < prefixes.length; j++) {
+          int prefix = prefixes.ints[prefixes.offset + j];
+          if (prefix == previous) {
+            continue;
+          }
+          affixReader.setPosition(8 * prefix);
+          char flag = (char) (affixReader.readShort() & 0xffff);
           char stripOrd = (char) (affixReader.readShort() & 0xffff);
-          dictionary.stripLookup.get(stripOrd, scratch);
-          String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
-
-          List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
-
-          stems.addAll(stemList);
+          int condition = (char) (affixReader.readShort() & 0xffff);
+          boolean crossProduct = (condition & 1) == 1;
+          condition >>>= 1;
+          char append = (char) (affixReader.readShort() & 0xffff);
+          
+          final boolean compatible;
+          if (recursionDepth == 0) {
+            compatible = true;
+          } else if (crossProduct) {
+            // cross check incoming continuation class (flag of previous affix) against list.
+            dictionary.flagLookup.get(append, scratch);
+            char appendFlags[] = Dictionary.decodeFlags(scratch);
+            assert prevFlag >= 0;
+            compatible = hasCrossCheckedFlag((char)prevFlag, appendFlags, false);
+          } else {
+            compatible = false;
+          }
+          
+          if (compatible) {
+            int deAffixedStart = i;
+            int deAffixedLength = length - deAffixedStart;
+            
+            dictionary.stripLookup.get(stripOrd, scratch);
+            String strippedWord = new StringBuilder().append(scratch.utf8ToString())
+                .append(word, deAffixedStart, deAffixedLength)
+                .toString();
+            
+            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true);
+            
+            stems.addAll(stemList);
+          }
         }
       }
-    }
-
-    for (int i = length - 1; i >= 0; i--) {
-      IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
-      if (prefixes == null) {
-        continue;
-      }
-
-      for (int j = 0; j < prefixes.length; j++) {
-        int prefix = prefixes.ints[prefixes.offset + j];
-        affixReader.setPosition(8 * prefix);
-        char flag = (char) (affixReader.readShort() & 0xffff);
-        if (hasCrossCheckedFlag(flag, flags)) {
-          int deAffixedStart = i;
-          int deAffixedLength = length - deAffixedStart;
+    } 
+    
+    if (doSuffix) {
+      for (int i = 0; i < length; i++) {
+        IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
+        if (suffixes == null) {
+          continue;
+        }
+        
+        for (int j = 0; j < suffixes.length; j++) {
+          int suffix = suffixes.ints[suffixes.offset + j];
+          if (suffix == previous) {
+            continue;
+          }
+          affixReader.setPosition(8 * suffix);
+          char flag = (char) (affixReader.readShort() & 0xffff);
           char stripOrd = (char) (affixReader.readShort() & 0xffff);
-
-          dictionary.stripLookup.get(stripOrd, scratch);
-          String strippedWord = new StringBuilder().append(scratch.utf8ToString())
-              .append(word, deAffixedStart, deAffixedLength)
-              .toString();
-
-          List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
-
-          stems.addAll(stemList);
+          int condition = (char) (affixReader.readShort() & 0xffff);
+          boolean crossProduct = (condition & 1) == 1;
+          condition >>>= 1;
+          char append = (char) (affixReader.readShort() & 0xffff);
+          
+          final boolean compatible;
+          if (recursionDepth == 0) {
+            compatible = true;
+          } else if (crossProduct) {
+            // cross check incoming continuation class (flag of previous affix) against list.
+            dictionary.flagLookup.get(append, scratch);
+            char appendFlags[] = Dictionary.decodeFlags(scratch);
+            assert prevFlag >= 0;
+            compatible = hasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
+          } else {
+            compatible = false;
+          }
+          
+          if (compatible) {
+            int appendLength = length - i;
+            int deAffixedLength = length - appendLength;
+            // TODO: can we do this in-place?
+            dictionary.stripLookup.get(stripOrd, scratch);
+            String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
+            
+            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false);
+            
+            stems.addAll(stemList);
+          }
         }
       }
     }
@@ -193,14 +234,19 @@ final class Stemmer {
    * Applies the affix rule to the given word, producing a list of stems if any are found
    *
    * @param strippedWord Word the affix has been removed and the strip added
+   * @param length valid length of stripped word
    * @param affix HunspellAffix representing the affix rule itself
-   * @param recursionDepth Level of recursion this stemming step is at
+   * @param prefixFlag when we already stripped a prefix, we can't simply recurse and check the suffix, unless both are compatible
+   *                   so we must check dictionary form against both to add it as a stem!
+   * @param recursionDepth current recursion depth
+   * @param prefix true if we are removing a prefix (false if it's a suffix)
    * @return List of stems for the word, or an empty list if none are found
    */
-  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
+  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     
+    // TODO: just pass this in from before, no need to decode it twice
     affixReader.setPosition(8 * affix);
     char flag = (char) (affixReader.readShort() & 0xffff);
     affixReader.skipBytes(2); // strip
@@ -221,16 +267,45 @@ final class Stemmer {
       for (int i = 0; i < forms.length; i++) {
         dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
         char wordFlags[] = Dictionary.decodeFlags(scratch);
-        if (wordFlags != null && Dictionary.hasFlag(wordFlags, flag)) {
+        if (Dictionary.hasFlag(wordFlags, flag)) {
+          // confusing: in this one exception, we already chained the first prefix against the second,
+          // so it doesn't need to be checked against the word
+          boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
+          if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.hasFlag(wordFlags, (char)prefixFlag)) {
+            // see if we can chain prefix thru the suffix continuation class (only if it has any!)
+            dictionary.flagLookup.get(append, scratch);
+            char appendFlags[] = Dictionary.decodeFlags(scratch);
+            if (!hasCrossCheckedFlag((char)prefixFlag, appendFlags, false)) {
+              continue;
+            }
+          }
           stems.add(new CharsRef(strippedWord, 0, length));
         }
       }
     }
 
-    if (crossProduct && recursionDepth < recursionCap) {
-      dictionary.flagLookup.get(append, scratch);
-      char appendFlags[] = Dictionary.decodeFlags(scratch);
-      stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth));
+    if (crossProduct) {
+      if (recursionDepth == 0) {
+        if (prefix) {
+          // we took away the first prefix.
+          // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix 
+          // COMPLEXPREFIXES = false: combine with another suffix
+          stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true));
+        } else if (!dictionary.complexPrefixes) {
+          // we took away a suffix.
+          // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
+          // COMPLEXPREFIXES = false: combine with another suffix
+          stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false));
+        }
+      } else if (recursionDepth == 1) {
+        if (prefix && dictionary.complexPrefixes) {
+          // we took away the second prefix: go look for another suffix
+          stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true));
+        } else if (prefix == false && dictionary.complexPrefixes == false) {
+          // we took away a prefix, then a suffix: go look for another suffix
+          stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false));
+        }
+      }
     }
 
     return stems;
@@ -243,7 +318,7 @@ final class Stemmer {
    * @param flags Array of flags to cross check against.  Can be {@code null}
    * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
    */
-  private boolean hasCrossCheckedFlag(char flag, char[] flags) {
-    return flags.length == 0 || Arrays.binarySearch(flags, flag) >= 0;
+  private boolean hasCrossCheckedFlag(char flag, char[] flags, boolean matchEmpty) {
+    return (flags.length == 0 && matchEmpty) || Arrays.binarySearch(flags, flag) >= 0;
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java Sun Mar  2 02:08:16 2014
@@ -38,7 +38,8 @@ public class TestCaseInsensitive extends
   }
 
   public void testRecursiveSuffix() {
-    assertStemsTo("abcd", "ab");
+    // we should not recurse here! as the suffix has no continuation!
+    assertStemsTo("abcd");
   }
 
   // all forms unmunched from dictionary

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java Sun Mar  2 02:08:16 2014
@@ -30,10 +30,10 @@ public class TestDependencies extends St
     assertStemsTo("drink", "drink", "drink");
     assertStemsTo("drinks", "drink", "drink");
     assertStemsTo("drinkable", "drink");
-    // TODO: BUG! assertStemsTo("drinkables", "drink");
+    assertStemsTo("drinkables", "drink");
     assertStemsTo("undrinkable", "drink");
-    // TODO: BUG! assertStemsTo("undrinkables", "drink");
+    assertStemsTo("undrinkables", "drink");
     assertStemsTo("undrink");
-    // TODO: BUG! assertStemsTo("undrinks");
+    assertStemsTo("undrinks");
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java Sun Mar  2 02:08:16 2014
@@ -83,7 +83,7 @@ public class TestDictionary extends Luce
       fail("didn't get expected exception");
     } catch (ParseException expected) {
       assertEquals("The affix file contains a rule with less than five elements", expected.getMessage());
-      assertEquals(23, expected.getErrorOffset());
+      assertEquals(24, expected.getErrorOffset());
     }
     
     affixStream.close();

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java Sun Mar  2 02:08:16 2014
@@ -60,13 +60,13 @@ public class TestHunspellStemFilter exte
   public void testKeywordAttribute() throws IOException {
     MockTokenizer tokenizer = new MockTokenizer(new StringReader("lucene is awesome"));
     tokenizer.setEnableChecks(true);
-    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
+    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary);
     assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
     
     // assert with keyword marker
     tokenizer = new MockTokenizer(new StringReader("lucene is awesome"));
     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
-    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
+    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary);
     assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
   }
   
@@ -74,7 +74,7 @@ public class TestHunspellStemFilter exte
   public void testLongestOnly() throws IOException {
     MockTokenizer tokenizer = new MockTokenizer(new StringReader("lucene is awesome"));
     tokenizer.setEnableChecks(true);
-    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, TestUtil.nextInt(random(), 1, 3), true);
+    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, true);
     assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
   }
   
@@ -84,7 +84,7 @@ public class TestHunspellStemFilter exte
       @Override
       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary));
       }
     };
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -95,7 +95,7 @@ public class TestHunspellStemFilter exte
       @Override
       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         Tokenizer tokenizer = new KeywordTokenizer(reader);
-        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary));
       }
     };
     checkOneTerm(a, "", "");

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestMorph.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestMorph.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestMorph.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestMorph.java Sun Mar  2 02:08:16 2014
@@ -29,5 +29,6 @@ public class TestMorph extends StemmerTe
   public void testExamples() {
     assertStemsTo("drink", "drink");
     assertStemsTo("drinkable", "drink");
+    assertStemsTo("drinkableable");
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java Sun Mar  2 02:08:16 2014
@@ -36,7 +36,8 @@ public class TestStemmer extends Stemmer
   }
 
   public void testRecursiveSuffix() {
-    assertStemsTo("abcd", "ab");
+    // we should not recurse here, as the suffix has no continuation!
+    assertStemsTo("abcd");
   }
 
   // all forms unmunched from dictionary

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTwoFold.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTwoFold.java?rev=1573251&r1=1573250&r2=1573251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTwoFold.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTwoFold.java Sun Mar  2 02:08:16 2014
@@ -31,5 +31,7 @@ public class TestTwoFold extends Stemmer
     assertStemsTo("drinkable", "drink");
     assertStemsTo("drinkables", "drink");
     assertStemsTo("drinksable");
+    assertStemsTo("drinkableable");
+    assertStemsTo("drinks");
   }
 }