You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/04 18:04:49 UTC
svn commit: r1574135 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/hunspell/ analysis/common/src/test/org/apache/lucene/analysis/hunspell/

Author: rmuir
Date: Tue Mar  4 17:04:48 2014
New Revision: 1574135

URL: http://svn.apache.org/r1574135
Log:
LUCENE-5224: Add iconv, oconv, and ignore support to HunspellStemFilter

Added:
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestConv.java   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestIgnore.java   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.dic   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.aff   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.dic   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1574135&r1=1574134&r2=1574135&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Mar  4 17:04:48 2014
@@ -89,6 +89,9 @@ New Features
 
 * LUCENE-5485: Add circumfix support to HunspellStemFilter. (Robert Muir)
 
+* LUCENE-5224: Add iconv, oconv, and ignore support to HunspellStemFilter.
+  (Robert Muir)
+
 API Changes
 
 * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1574135&r1=1574134&r2=1574135&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Tue Mar  4 17:04:48 2014
@@ -21,14 +21,17 @@ import org.apache.lucene.store.ByteArray
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
+import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;
 
 import java.io.BufferedInputStream;
@@ -67,6 +70,9 @@ public class Dictionary {
   private static final String FLAG_KEY = "FLAG";
   private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
   private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
+  private static final String IGNORE_KEY = "IGNORE";
+  private static final String ICONV_KEY = "ICONV";
+  private static final String OCONV_KEY = "OCONV";
 
   private static final String NUM_FLAG_TYPE = "num";
   private static final String UTF8_FLAG_TYPE = "UTF-8";
@@ -110,6 +116,16 @@ public class Dictionary {
   
   int circumfix = -1; // circumfix flag, or -1 if one is not defined
   
+  // ignored characters (dictionary, affix, inputs)
+  private char[] ignore;
+  
+  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
+  FST<CharsRef> iconv;
+  FST<CharsRef> oconv;
+  
+  boolean needsInputCleaning;
+  boolean needsOutputCleaning;
+  
   /**
    * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
    * and dictionary files.
@@ -136,6 +152,8 @@ public class Dictionary {
    */
   public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
     this.ignoreCase = ignoreCase;
+    this.needsInputCleaning = ignoreCase;
+    this.needsOutputCleaning = false; // set if we have an OCONV
     // hungarian has thousands of AF before the SET, so a 32k buffer is needed 
     BufferedInputStream buffered = new BufferedInputStream(affix, 32768);
     buffered.mark(32768);
@@ -249,6 +267,29 @@ public class Dictionary {
           throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
         }
         circumfix = flagParsingStrategy.parseFlag(parts[1]);
+      } else if (line.startsWith(IGNORE_KEY)) {
+        String parts[] = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
+        }
+        ignore = parts[1].toCharArray();
+        Arrays.sort(ignore);
+        needsInputCleaning = true;
+      } else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
+        String parts[] = line.split("\\s+");
+        String type = parts[0];
+        if (parts.length != 2) {
+          throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
+        }
+        int num = Integer.parseInt(parts[1]);
+        FST<CharsRef> res = parseConversions(reader, num);
+        if (type.equals("ICONV")) {
+          iconv = res;
+          needsInputCleaning |= iconv != null;
+        } else {
+          oconv = res;
+          needsOutputCleaning |= oconv != null;
+        }
       }
     }
     
@@ -291,6 +332,7 @@ public class Dictionary {
                           Map<String,Integer> seenPatterns) throws IOException, ParseException {
     
     BytesRef scratch = new BytesRef();
+    StringBuilder sb = new StringBuilder();
     String args[] = header.split("\\s+");
 
     boolean crossProduct = args[2].equals("Y");
@@ -300,9 +342,6 @@ public class Dictionary {
     ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
     
     for (int i = 0; i < numLines; i++) {
-      if (currentAffix > Short.MAX_VALUE) {
-        throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org");
-      }
       assert affixWriter.getPosition() == currentAffix << 3;
       String line = reader.readLine();
       String ruleArgs[] = line.split("\\s+");
@@ -345,6 +384,9 @@ public class Dictionary {
       Integer patternIndex = seenPatterns.get(regex);
       if (patternIndex == null) {
         patternIndex = patterns.size();
+        if (patternIndex > Short.MAX_VALUE) {
+          throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");          
+        }
         seenPatterns.put(regex, patternIndex);
         Pattern pattern = Pattern.compile(regex);
         patterns.add(pattern);
@@ -355,6 +397,8 @@ public class Dictionary {
       if (stripOrd < 0) {
         // already exists in our hash
         stripOrd = (-stripOrd)-1;
+      } else if (stripOrd > Character.MAX_VALUE) {
+        throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
       }
 
       if (appendFlags == null) {
@@ -368,7 +412,7 @@ public class Dictionary {
         appendFlagsOrd = (-appendFlagsOrd)-1;
       } else if (appendFlagsOrd > Short.MAX_VALUE) {
         // this limit is probably flexible, but its a good sanity check too
-        throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org");
+        throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
       }
       
       affixWriter.writeShort((short)flag);
@@ -378,6 +422,11 @@ public class Dictionary {
       affixWriter.writeShort((short)patternOrd);
       affixWriter.writeShort((short)appendFlagsOrd);
       
+      if (needsInputCleaning) {
+        CharSequence cleaned = cleanInput(affixArg, sb);
+        affixArg = cleaned.toString();
+      }
+      
       List<Character> list = affixes.get(affixArg);
       if (list == null) {
         list = new ArrayList<Character>();
@@ -388,6 +437,31 @@ public class Dictionary {
       currentAffix++;
     }
   }
+  
+  private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
+    Map<String,String> mappings = new TreeMap<>();
+    
+    for (int i = 0; i < num; i++) {
+      String line = reader.readLine();
+      String parts[] = line.split("\\s+");
+      if (parts.length != 3) {
+        throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
+      }
+      if (mappings.put(parts[1], parts[2]) != null) {
+        throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
+      }
+    }
+    
+    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
+    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+    IntsRef scratchInts = new IntsRef();
+    for (Map.Entry<String,String> entry : mappings.entrySet()) {
+      Util.toUTF16(entry.getKey(), scratchInts);
+      builder.add(scratchInts, new CharsRef(entry.getValue()));
+    }
+    
+    return builder.finish();
+  }
 
   /**
    * Parses the encoding specified in the affix file readable through the provided InputStream
@@ -485,6 +559,8 @@ public class Dictionary {
     BytesRef flagsScratch = new BytesRef();
     IntsRef scratchInts = new IntsRef();
     
+    StringBuilder sb = new StringBuilder();
+    
     File unsorted = File.createTempFile("unsorted", "dat", tempDir);
     try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
       for (InputStream dictionary : dictionaries) {
@@ -492,16 +568,19 @@ public class Dictionary {
         String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
         
         while ((line = lines.readLine()) != null) {
-          if (ignoreCase) {
+          if (needsInputCleaning) {
             int flagSep = line.lastIndexOf('/');
             if (flagSep == -1) {
-              writer.write(line.toLowerCase(Locale.ROOT).getBytes(IOUtils.CHARSET_UTF_8));
+              CharSequence cleansed = cleanInput(line, sb);
+              writer.write(cleansed.toString().getBytes(IOUtils.CHARSET_UTF_8));
             } else {
-              StringBuilder sb = new StringBuilder();
-              sb.append(line.substring(0, flagSep).toLowerCase(Locale.ROOT));
-              if (flagSep < line.length()) {
-                sb.append(line.substring(flagSep, line.length()));
+              String text = line.substring(0, flagSep);
+              CharSequence cleansed = cleanInput(text, sb);
+              if (cleansed != sb) {
+                sb.setLength(0);
+                sb.append(cleansed);
               }
+              sb.append(line.substring(flagSep));
               writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8));
             }
           } else {
@@ -761,4 +840,76 @@ public class Dictionary {
   static boolean hasFlag(char flags[], char flag) {
     return Arrays.binarySearch(flags, flag) >= 0;
   }
+  
+  CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
+    reuse.setLength(0);
+    
+    for (int i = 0; i < input.length(); i++) {
+      char ch = input.charAt(i);
+      
+      if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
+        continue;
+      }
+      
+      if (ignoreCase && iconv == null) {
+        // if we have no input conversion mappings, do this on-the-fly
+        ch = Character.toLowerCase(ch);
+      }
+      
+      reuse.append(ch);
+    }
+    
+    if (iconv != null) {
+      try {
+        applyMappings(iconv, reuse);
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
+      if (ignoreCase) {
+        for (int i = 0; i < reuse.length(); i++) {
+          reuse.setCharAt(i, Character.toLowerCase(reuse.charAt(i)));
+        }
+      }
+    }
+    
+    return reuse;
+  }
+  
+  // TODO: this could be more efficient!
+  static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
+    final FST.BytesReader bytesReader = fst.getBytesReader();
+    final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
+    final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
+    
+    // temporary stuff
+    final FST.Arc<CharsRef> arc = new FST.Arc<>();
+    int longestMatch;
+    CharsRef longestOutput;
+    
+    for (int i = 0; i < sb.length(); i++) {
+      arc.copyFrom(firstArc);
+      CharsRef output = NO_OUTPUT;
+      longestMatch = -1;
+      longestOutput = null;
+      
+      for (int j = i; j < sb.length(); j++) {
+        char ch = sb.charAt(j);
+        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
+          break;
+        } else {
+          output = fst.outputs.add(output, arc.output);
+        }
+        if (arc.isFinal()) {
+          longestOutput = fst.outputs.add(output, arc.nextFinalOutput);
+          longestMatch = j;
+        }
+      }
+      
+      if (longestMatch >= 0) {
+        sb.delete(i, longestMatch+1);
+        sb.insert(i, longestOutput);
+        i += (longestOutput.length - 1);
+      }
+    }
+  }
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1574135&r1=1574134&r2=1574135&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Tue Mar  4 17:04:48 2014
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunsp
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -24,8 +25,8 @@ import java.util.List;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -40,8 +41,11 @@ final class Stemmer {
   private final BytesRef scratch = new BytesRef();
   private final StringBuilder segment = new StringBuilder();
   private final ByteArrayDataInput affixReader;
-  private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
-
+  
+  // used for normalization
+  private final StringBuilder scratchSegment = new StringBuilder();
+  private char scratchBuffer[] = new char[32];
+  
   /**
    * Constructs a new Stemmer which will use the provided Dictionary to create its stems.
    *
@@ -68,17 +72,25 @@ final class Stemmer {
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
-  public List<CharsRef> stem(char word[], int length) {
-    if (dictionary.ignoreCase) {
-      charUtils.toLowerCase(word, 0, length);
+  public List<CharsRef> stem(char word[], int length) {    
+
+    if (dictionary.needsInputCleaning) {
+      scratchSegment.setLength(0);
+      scratchSegment.append(word, 0, length);
+      CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
+      scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
+      length = segment.length();
+      segment.getChars(0, length, scratchBuffer, 0);
+      word = scratchBuffer;
     }
+    
     List<CharsRef> stems = new ArrayList<CharsRef>();
     IntsRef forms = dictionary.lookupWord(word, 0, length);
     if (forms != null) {
       // TODO: some forms should not be added, e.g. ONLYINCOMPOUND
       // just because it exists, does not make it valid...
       for (int i = 0; i < forms.length; i++) {
-        stems.add(new CharsRef(word, 0, length));
+        stems.add(newStem(word, length));
       }
     }
     stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
@@ -106,6 +118,23 @@ final class Stemmer {
     }
     return deduped;
   }
+  
+  private CharsRef newStem(char buffer[], int length) {
+    if (dictionary.needsOutputCleaning) {
+      scratchSegment.setLength(0);
+      scratchSegment.append(buffer, 0, length);
+      try {
+        Dictionary.applyMappings(dictionary.oconv, scratchSegment);
+      } catch (IOException bogus) {
+        throw new RuntimeException(bogus);
+      }
+      char cleaned[] = new char[scratchSegment.length()];
+      scratchSegment.getChars(0, cleaned.length, cleaned, 0);
+      return new CharsRef(cleaned, 0, cleaned.length);
+    } else {
+      return new CharsRef(buffer, 0, length);
+    }
+  }
 
   // ================================================= Helper Methods ================================================
 
@@ -292,7 +321,7 @@ final class Stemmer {
               continue;
             }
           }
-          stems.add(new CharsRef(strippedWord, 0, length));
+          stems.add(newStem(strippedWord, length));
         }
       }
     }

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestConv.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestConv.java?rev=1574135&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestConv.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestConv.java Tue Mar  4 17:04:48 2014
@@ -0,0 +1,36 @@
+package org.apache.lucene.analysis.hunspell;
+
+import org.junit.BeforeClass;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestConv extends StemmerTestBase {
+  
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    init("conv.aff", "conv.dic");
+  }
+  
+  public void testConversion() {
+    assertStemsTo("drink", "drInk");
+    assertStemsTo("drInk", "drInk");
+    assertStemsTo("drInkAble", "drInk");
+    assertStemsTo("drInkABle", "drInk");
+    assertStemsTo("drinkABle", "drInk");
+  }
+}

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java?rev=1574135&r1=1574134&r2=1574135&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java Tue Mar  4 17:04:48 2014
@@ -22,10 +22,15 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
 
-import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.CharSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.Util;
 
 public class TestDictionary extends LuceneTestCase {
 
@@ -123,4 +128,54 @@ public class TestDictionary extends Luce
     assertTrue(affixStream.isClosed());
     assertTrue(dictStream.isClosed());
   }
+  
+  
+  
+  public void testReplacements() throws Exception {
+    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
+    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
+    IntsRef scratchInts = new IntsRef();
+    
+    // a -> b
+    Util.toUTF16("a", scratchInts);
+    builder.add(scratchInts, new CharsRef("b"));
+    
+    // ab -> c
+    Util.toUTF16("ab", scratchInts);
+    builder.add(scratchInts, new CharsRef("c"));
+    
+    // c -> de
+    Util.toUTF16("c", scratchInts);
+    builder.add(scratchInts, new CharsRef("de"));
+    
+    // def -> gh
+    Util.toUTF16("def", scratchInts);
+    builder.add(scratchInts, new CharsRef("gh"));
+    
+    FST<CharsRef> fst = builder.finish();
+    
+    StringBuilder sb = new StringBuilder("atestanother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("btestbnother", sb.toString());
+    
+    sb = new StringBuilder("abtestanother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("ctestbnother", sb.toString());
+    
+    sb = new StringBuilder("atestabnother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("btestcnother", sb.toString());
+    
+    sb = new StringBuilder("abtestabnother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("ctestcnother", sb.toString());
+    
+    sb = new StringBuilder("abtestabcnother");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("ctestcdenother", sb.toString());
+    
+    sb = new StringBuilder("defdefdefc");
+    Dictionary.applyMappings(fst, sb);
+    assertEquals("ghghghde", sb.toString());
+  }
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java?rev=1574135&r1=1574134&r2=1574135&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java Tue Mar  4 17:04:48 2014
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.hunsp
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
+import java.util.Collections;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -30,7 +31,6 @@ import org.apache.lucene.analysis.hunspe
 import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.TestUtil;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 
@@ -94,4 +94,20 @@ public class TestHunspellStemFilter exte
     };
     checkOneTerm(a, "", "");
   }
+  
+  public void testIgnoreCaseNoSideEffects() throws Exception {
+    final Dictionary d;
+    try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff");
+        InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) {
+      d = new Dictionary(affixStream, Collections.singletonList(dictStream), true);
+    }
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, d));
+      }
+    };
+    checkOneTerm(a, "NoChAnGy", "NoChAnGy");
+  }
 }

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestIgnore.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestIgnore.java?rev=1574135&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestIgnore.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestIgnore.java Tue Mar  4 17:04:48 2014
@@ -0,0 +1,36 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.junit.BeforeClass;
+
+public class TestIgnore extends StemmerTestBase {
+  
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    init("ignore.aff", "ignore.dic");
+  }
+  
+  public void testExamples() {
+    assertStemsTo("drink", "drink");
+    assertStemsTo("drinkable", "drink");
+    assertStemsTo("dr'ink-able", "drink");
+    assertStemsTo("drank-able", "drank");
+    assertStemsTo("'-'-'-");
+  }
+}

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff?rev=1574135&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff Tue Mar  4 17:04:48 2014
@@ -0,0 +1,16 @@
+SET UTF-8
+
+ICONV 4
+ICONV A a
+ICONV B b
+ICONV C c
+ICONV I i
+
+OCONV 4
+OCONV a A
+OCONV b B
+OCONV c C
+OCONV i I
+
+SFX X Y 1
+SFX X 0 able . +ABLE
\ No newline at end of file

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.dic
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.dic?rev=1574135&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.dic (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.dic Tue Mar  4 17:04:48 2014
@@ -0,0 +1,2 @@
+1
+drink/X   [VERB]

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.aff
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.aff?rev=1574135&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.aff (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.aff Tue Mar  4 17:04:48 2014
@@ -0,0 +1,6 @@
+SET UTF-8
+
+IGNORE '-
+
+SFX X Y 1
+SFX X 0 able . +ABLE
\ No newline at end of file

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.dic
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.dic?rev=1574135&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.dic (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ignore.dic Tue Mar  4 17:04:48 2014
@@ -0,0 +1,3 @@
+1
+drink/X   [VERB]
+dr-ank/X  [VERB]
\ No newline at end of file