You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/06 16:04:33 UTC
[lucene-solr] branch master updated: LUCENE-9734: Hunspell: support
suggestions based on "ph" morphological data (#2308)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 1852d7a LUCENE-9734: Hunspell: support suggestions based on "ph" morphological data (#2308)
1852d7a is described below
commit 1852d7ad5aea0f4900550f6f0853b3adc4c49124
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Sat Feb 6 17:04:12 2021 +0100
LUCENE-9734: Hunspell: support suggestions based on "ph" morphological data (#2308)
---
.../lucene/analysis/hunspell/Dictionary.java | 129 +++++++++++++++++----
.../analysis/hunspell/ModifyingSuggester.java | 79 ++++++++-----
.../lucene/analysis/hunspell/SpellChecker.java | 11 +-
.../lucene/analysis/hunspell/SpellCheckerTest.java | 10 +-
.../org/apache/lucene/analysis/hunspell/ph.aff | 30 +++++
.../org/apache/lucene/analysis/hunspell/ph.dic | 11 ++
.../org/apache/lucene/analysis/hunspell/ph.sug | 11 ++
.../org/apache/lucene/analysis/hunspell/ph.wrong | 11 ++
.../org/apache/lucene/analysis/hunspell/ph2.aff | 32 +++++
.../org/apache/lucene/analysis/hunspell/ph2.dic | 11 ++
.../org/apache/lucene/analysis/hunspell/ph2.good | 9 ++
.../org/apache/lucene/analysis/hunspell/ph2.sug | 14 +++
.../org/apache/lucene/analysis/hunspell/ph2.wrong | 15 +++
.../org/apache/lucene/analysis/hunspell/rep.aff | 2 +-
14 files changed, 321 insertions(+), 54 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 048f9c6..d71e714 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -44,6 +44,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -80,6 +82,7 @@ public class Dictionary {
// TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+ private static final Pattern MORPH_KEY_PATTERN = Pattern.compile("\\s+(?=\\p{Alpha}{2}:)");
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
@@ -386,8 +389,7 @@ public class Dictionary {
fullStrip = true;
} else if ("LANG".equals(firstWord)) {
language = singleArgument(reader, line);
- String langCode = extractLanguageCode(language);
- alternateCasing = langCode.equals("tr") || langCode.equals("az");
+ this.alternateCasing = hasLanguage("tr", "az");
} else if ("BREAK".equals(firstWord)) {
breaks = parseBreaks(reader, line);
} else if ("WORDCHARS".equals(firstWord)) {
@@ -463,6 +465,17 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
+ private boolean hasLanguage(String... langCodes) {
+ if (language == null) return false;
+ String langCode = extractLanguageCode(language);
+ for (String code : langCodes) {
+ if (langCode.equals(code)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
static String extractLanguageCode(String isoCode) {
int underscore = isoCode.indexOf("_");
return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
@@ -910,7 +923,7 @@ public class Dictionary {
if (!hasStemExceptions) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0 && morphStart < line.length()) {
- hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
+ hasStemExceptions = hasStemException(line.substring(morphStart + 1));
}
}
@@ -963,6 +976,23 @@ public class Dictionary {
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
}
+ String toLowerCase(String word) {
+ char[] chars = new char[word.length()];
+ for (int i = 0; i < word.length(); i++) {
+ chars[i] = caseFold(word.charAt(i));
+ }
+ return new String(chars);
+ }
+
+ String toTitleCase(String word) {
+ char[] chars = new char[word.length()];
+ chars[0] = Character.toUpperCase(word.charAt(0));
+ for (int i = 1; i < word.length(); i++) {
+ chars[i] = caseFold(word.charAt(i));
+ }
+ return new String(chars);
+ }
+
private String sortWordsOffline(
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
OfflineSorter sorter =
@@ -1062,13 +1092,14 @@ public class Dictionary {
}
// we possibly have morphological data
int stemExceptionID = 0;
- if (hasStemExceptions && end + 1 < line.length()) {
- String stemException = parseStemException(line.substring(end + 1));
- if (stemException != null) {
- stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
- stemExceptionID =
- stemExceptionCount + 1; // we use '0' to indicate no exception for the form
- stemExceptions[stemExceptionCount++] = stemException;
+ if (end + 1 < line.length()) {
+ String morphData = line.substring(end + 1);
+ for (String datum : splitMorphData(morphData)) {
+ if (datum.startsWith("st:")) {
+ stemExceptionID = addStemException(datum.substring(3));
+ } else if (datum.startsWith("ph:") && datum.length() > 3) {
+ addPhoneticRepEntries(entry, datum.substring(3));
+ }
}
}
@@ -1088,6 +1119,52 @@ public class Dictionary {
}
}
+ private int addStemException(String stemException) {
+ stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
+ stemExceptions[stemExceptionCount++] = stemException;
+ return stemExceptionCount; // we use '0' to indicate no exception for the form
+ }
+
+ private void addPhoneticRepEntries(String word, String ph) {
+ // e.g. "pretty ph:prity ph:priti->pretti" to suggest both prity->pretty and pritiest->prettiest
+ int arrow = ph.indexOf("->");
+ String pattern;
+ String replacement;
+ if (arrow > 0) {
+ pattern = ph.substring(0, arrow);
+ replacement = ph.substring(arrow + 2);
+ } else {
+ pattern = ph;
+ replacement = word;
+ }
+
+ // when the ph: field ends with *, strip last character of pattern and replacement
+ // e.g., "pretty ph:prity*" results in "prit->prett" replacement instead of "prity->pretty",
+ // to get both prity->pretty and pritiest->prettiest suggestions.
+ if (pattern.endsWith("*") && pattern.length() > 2 && replacement.length() > 1) {
+ pattern = pattern.substring(0, pattern.length() - 2);
+ replacement = replacement.substring(0, replacement.length() - 1);
+ }
+
+ // capitalize lowercase pattern for capitalized words to support
+ // good suggestions also for capitalized misspellings,
+ // e.g. Wednesday ph:wendsay results in wendsay -> Wednesday and Wendsay -> Wednesday.
+ if (WordCase.caseOf(word) == WordCase.TITLE && WordCase.caseOf(pattern) == WordCase.LOWER) {
+ // add also lowercase word in the case of German or
+ // Hungarian to support lowercase suggestions lowercased by
+ // compound word generation or derivational suffixes
+ // for example by adjectival suffix "-i" of geographical names in Hungarian:
+ // Massachusetts ph:messzecsuzec
+ // messzecsuzeci -> massachusettsi (adjective)
+ // For lowercasing by conditional PFX rules, see e.g. germancompounding test
+ if (hasLanguage("de", "hu")) {
+ repTable.add(new RepEntry(pattern, toLowerCase(replacement)));
+ }
+ repTable.add(new RepEntry(toTitleCase(pattern), replacement));
+ }
+ repTable.add(new RepEntry(pattern, replacement));
+ }
+
boolean isDotICaseChangeDisallowed(char[] word) {
return word[0] == 'İ' && !alternateCasing;
}
@@ -1220,29 +1297,31 @@ public class Dictionary {
}
}
- private String parseStemException(String morphData) {
+ private boolean hasStemException(String morphData) {
+ for (String datum : splitMorphData(morphData)) {
+ if (datum.startsWith("st:")) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private List<String> splitMorphData(String morphData) {
// first see if it's an alias
if (morphAliasCount > 0) {
try {
int alias = Integer.parseInt(morphData.trim());
morphData = morphAliases[alias - 1];
- } catch (NumberFormatException e) {
- // fine
+ } catch (NumberFormatException ignored) {
}
}
- // try to parse morph entry
- int index = morphData.indexOf(" st:");
- if (index < 0) {
- index = morphData.indexOf("\tst:");
- }
- if (index >= 0) {
- int endIndex = indexOfSpaceOrTab(morphData, index + 1);
- if (endIndex < 0) {
- endIndex = morphData.length();
- }
- return morphData.substring(index + 4, endIndex);
+ if (morphData.isBlank()) {
+ return Collections.emptyList();
}
- return null;
+ return Arrays.stream(MORPH_KEY_PATTERN.split(morphData))
+ .map(String::trim)
+ .filter(s -> !s.isBlank())
+ .collect(Collectors.toList());
}
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 4dd91c0..0c60e1b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -16,9 +16,12 @@
*/
package org.apache.lucene.analysis.hunspell;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
+import java.util.List;
import java.util.Locale;
+import java.util.stream.Collectors;
class ModifyingSuggester {
private static final int MAX_CHAR_DISTANCE = 4;
@@ -36,6 +39,14 @@ class ModifyingSuggester {
WordCase wc = WordCase.caseOf(word);
+ if (wc == WordCase.UPPER) {
+ tryVariationsOf(speller.dictionary.toLowerCase(word));
+ tryVariationsOf(speller.dictionary.toTitleCase(word));
+ return result.stream()
+ .map(this::tryUpperCase)
+ .collect(Collectors.toCollection(LinkedHashSet::new));
+ }
+
if (wc == WordCase.MIXED) {
int dot = word.indexOf('.');
if (dot > 0
@@ -44,27 +55,24 @@ class ModifyingSuggester {
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
}
- tryVariationsOf(toLowerCase(word));
+ tryVariationsOf(speller.dictionary.toLowerCase(word));
}
return result;
}
- private String toLowerCase(String word) {
- char[] chars = new char[word.length()];
- for (int i = 0; i < word.length(); i++) {
- chars[i] = speller.dictionary.caseFold(word.charAt(i));
+ private String tryUpperCase(String candidate) {
+ String upper = candidate.toUpperCase(Locale.ROOT);
+ if (upper.contains(" ") || speller.spell(upper)) {
+ return upper;
}
- return new String(chars);
+ String title = speller.dictionary.toTitleCase(candidate);
+ return speller.spell(title) ? title : candidate;
}
private void tryVariationsOf(String word) {
- trySuggestion(word.toUpperCase(Locale.ROOT));
- if (checkDictionaryForSplitSuggestions(word)) {
- return;
- }
-
- tryRep(word);
+ boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
+ hasGoodSuggestions |= tryRep(word);
trySwappingChars(word);
tryLongSwap(word);
@@ -75,12 +83,24 @@ class ModifyingSuggester {
tryReplacingChar(word);
tryTwoDuplicateChars(word);
- if (speller.dictionary.enableSplitSuggestions) {
+ List<String> goodSplit = checkDictionaryForSplitSuggestions(word);
+ if (!goodSplit.isEmpty()) {
+ List<String> copy = new ArrayList<>(result);
+ result.clear();
+ result.addAll(goodSplit);
+ if (hasGoodSuggestions) {
+ result.addAll(copy);
+ }
+ hasGoodSuggestions = true;
+ }
+
+ if (!hasGoodSuggestions && speller.dictionary.enableSplitSuggestions) {
trySplitting(word);
}
}
- private void tryRep(String word) {
+ private boolean tryRep(String word) {
+ int before = result.size();
for (RepEntry entry : speller.dictionary.repTable) {
for (String candidate : entry.substitute(word)) {
if (trySuggestion(candidate)) {
@@ -88,11 +108,16 @@ class ModifyingSuggester {
}
if (candidate.contains(" ")
- && Arrays.stream(candidate.split(" ")).allMatch(speller::checkWord)) {
+ && Arrays.stream(candidate.split(" ")).allMatch(this::checkSimpleWord)) {
result.add(candidate);
}
}
}
+ return result.size() > before;
+ }
+
+ private boolean checkSimpleWord(String part) {
+ return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
}
private void trySwappingChars(String word) {
@@ -213,24 +238,30 @@ class ModifyingSuggester {
}
}
- private boolean checkDictionaryForSplitSuggestions(String word) {
- boolean found = false;
+ private List<String> checkDictionaryForSplitSuggestions(String word) {
+ List<String> result = new ArrayList<>();
for (int i = 1; i < word.length() - 1; i++) {
String w1 = word.substring(0, i);
String w2 = word.substring(i);
- found |= trySuggestion(w1 + " " + w2);
+ String spaced = w1 + " " + w2;
+ if (speller.checkWord(spaced)) {
+ result.add(spaced);
+ }
if (shouldSplitByDash()) {
- found |= trySuggestion(w1 + "-" + w2);
+ String dashed = w1 + "-" + w2;
+ if (speller.checkWord(dashed)) {
+ result.add(dashed);
+ }
}
}
- return found;
+ return result;
}
private void trySplitting(String word) {
for (int i = 1; i < word.length() - 1; i++) {
String w1 = word.substring(0, i);
String w2 = word.substring(i);
- if (speller.checkWord(w1) && speller.checkWord(w2)) {
+ if (checkSimpleWord(w1) && checkSimpleWord(w2)) {
result.add(w1 + " " + w2);
if (shouldSplitByDash()) {
result.add(w1 + "-" + w2);
@@ -244,10 +275,6 @@ class ModifyingSuggester {
}
private boolean trySuggestion(String candidate) {
- if (speller.checkWord(candidate)) {
- result.add(candidate);
- return true;
- }
- return false;
+ return speller.checkWord(candidate) && result.add(candidate);
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 1b0c2d3..53bf53e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -134,7 +134,7 @@ public class SpellChecker {
return checkWord(word.toCharArray(), word.length(), null);
}
- private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
+ Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
return false;
}
@@ -143,6 +143,15 @@ public class SpellChecker {
return true;
}
+ return null;
+ }
+
+ private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
+ Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase);
+ if (simpleResult != null) {
+ return simpleResult;
+ }
+
if (dictionary.compoundRules != null
&& checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
return true;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index a2399af..f4ca6b5 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -44,10 +44,18 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps");
}
- public void rep() throws Exception {
+ public void testRepSuggestions() throws Exception {
doTest("rep");
}
+ public void testPhSuggestions() throws Exception {
+ doTest("ph");
+ }
+
+ public void testPhSuggestions2() throws Exception {
+ doTest("ph2");
+ }
+
public void testForceUCase() throws Exception {
doTest("forceucase");
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.aff
new file mode 100644
index 0000000..c7d26bb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.aff
@@ -0,0 +1,30 @@
+# new suggestion methods of Hunspell 1.7:
+# ph: for dictionary-based suggestions.
+#
+# For example, suggestions for "wich"
+# with this test dictionary:
+#
+# Hunspell 1.3.3
+# wich
+# & wich 4 0: winch, witch, which, wish
+#
+# Hunspell 1.6.2
+# wich
+# & wich 4 0: which, witch, winch, wish
+#
+# Suggestions will be limited for
+# the dictionary words with the same ph: field,
+# and for non-ngram suggestions.
+#
+# Order of the ph: suggestions for the
+# same misspelling, e.g. wich -> which, witch
+# follows the order of the words in the dictionary:
+#
+# which ph:wich
+# witch ph:witch
+#
+# switch off ngram suggestions to check only
+# ph: based suggestions
+MAXNGRAMSUGS 0
+
+TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'-
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.dic
new file mode 100644
index 0000000..e9462d5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.dic
@@ -0,0 +1,11 @@
+8
+a lot ph:alot
+in spite ph:inspite
+inspire
+what ph:whta ph:waht
+Wednesday ph:wendsay ph:wensday
+which ph:wich
+witch ph:wich
+winch
+wish
+Oh, my gosh! ph:omg
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.sug
new file mode 100644
index 0000000..8daee56
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.sug
@@ -0,0 +1,11 @@
+a lot
+in spite, inspire
+what
+what
+Wednesday
+Wednesday
+Wednesday
+Wednesday
+which, witch, winch, wish
+Oh, my gosh!
+OH, MY GOSH!
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.wrong
new file mode 100644
index 0000000..f51b31a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.wrong
@@ -0,0 +1,11 @@
+alot
+inspite
+whta
+waht
+wensday
+wendsay
+Wensday
+Wendsay
+wich
+omg
+OMG
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.aff
new file mode 100644
index 0000000..d9d4288
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.aff
@@ -0,0 +1,32 @@
+# switch off ngram suggestion to test
+# usage of ph: dictionary fields in REP
+# suggestions
+SET UTF-8
+
+MAXNGRAMSUGS 0
+
+# test in compounds, too
+COMPOUNDFLAG Y
+
+# test also dictionary items with space,
+# and forbidden compounding, if there is
+# a ph: field with that compound as
+# misspelling in the dictionary
+CHECKCOMPOUNDREP
+
+# test in compound word with affixes
+SFX A Y 1
+SFX A 0 's .
+
+# when the ph: field ends with the character *,
+# strip last character of the pattern and the replacement
+# to match in REP suggestions also at character changes,
+# for example, "pretty ph:prity*" results "prit->prett"
+# REP replacement instead of "prity->pretty", to get
+# prity->pretty and pritiest->prettiest suggestions.
+
+SFX B Y 2
+SFX B y iest [^aeiou]y
+SFX B ö őt ö
+
+WORDCHARS '
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.dic
new file mode 100644
index 0000000..34482bb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.dic
@@ -0,0 +1,11 @@
+9
+foo ph:bar ph:baz
+foo bar ph:foobar
+word/Y ph:baz
+stem/Y ph: ph:
+forbidden/Y
+root/YA
+forbidden root/A ph:forbiddenroot
+pretty/B ph:prity*
+foobarö/B ph:fubarő*
+happy/B ph:hepy ph:hepi->happi
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.good
new file mode 100644
index 0000000..c471b4e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.good
@@ -0,0 +1,9 @@
+foo
+word
+stem
+wordstem
+stemword
+rootforbidden
+root's
+foobarö
+foobarőt
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.sug
new file mode 100644
index 0000000..916a607
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.sug
@@ -0,0 +1,14 @@
+foo
+foo, word
+foo bar
+wordstem
+stemword
+stemwordstem
+forbidden root
+forbidden root's
+pretty
+prettiest
+foobarö
+foobarőt
+happy
+happiest
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.wrong
new file mode 100644
index 0000000..74055eb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.wrong
@@ -0,0 +1,15 @@
+bar
+baz
+foobar
+bazstem
+stembaz
+stembazstem
+forbiddenroot
+forbiddenroot's
+rootforbiddenroot
+prity
+pritiest
+fubarö
+fubarőt
+hepy
+hepiest
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff
index 4ccc6e2..485755c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff
@@ -10,7 +10,7 @@ REP shun$ tion
REP ^alot$ a_lot # add the highest priority for "a lot" suggestion to "alot"
REP ^foo$ bar
REP ' _ # "un'alunno" -> "un alunno"
-REP ^vinteún$ vinte_e_un
+REP ^vinteún$ vinte_e_un
REP s 's