You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/06 16:04:33 UTC
[lucene-solr] branch master updated: LUCENE-9734: Hunspell: support
suggestions based on "ph" morphological data (#2308)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 1852d7a LUCENE-9734: Hunspell: support suggestions based on "ph" morphological data (#2308)
1852d7a is described below
commit 1852d7ad5aea0f4900550f6f0853b3adc4c49124
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Sat Feb 6 17:04:12 2021 +0100
LUCENE-9734: Hunspell: support suggestions based on "ph" morphological data (#2308)
---
.../lucene/analysis/hunspell/Dictionary.java | 129 +++++++++++++++++----
.../analysis/hunspell/ModifyingSuggester.java | 79 ++++++++-----
.../lucene/analysis/hunspell/SpellChecker.java | 11 +-
.../lucene/analysis/hunspell/SpellCheckerTest.java | 10 +-
.../org/apache/lucene/analysis/hunspell/ph.aff | 30 +++++
.../org/apache/lucene/analysis/hunspell/ph.dic | 11 ++
.../org/apache/lucene/analysis/hunspell/ph.sug | 11 ++
.../org/apache/lucene/analysis/hunspell/ph.wrong | 11 ++
.../org/apache/lucene/analysis/hunspell/ph2.aff | 32 +++++
.../org/apache/lucene/analysis/hunspell/ph2.dic | 11 ++
.../org/apache/lucene/analysis/hunspell/ph2.good | 9 ++
.../org/apache/lucene/analysis/hunspell/ph2.sug | 14 +++
.../org/apache/lucene/analysis/hunspell/ph2.wrong | 15 +++
.../org/apache/lucene/analysis/hunspell/rep.aff | 2 +-
14 files changed, 321 insertions(+), 54 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 048f9c6..d71e714 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -44,6 +44,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -80,6 +82,7 @@ public class Dictionary {
// TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+ private static final Pattern MORPH_KEY_PATTERN = Pattern.compile("\\s+(?=\\p{Alpha}{2}:)");
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
@@ -386,8 +389,7 @@ public class Dictionary {
fullStrip = true;
} else if ("LANG".equals(firstWord)) {
language = singleArgument(reader, line);
- String langCode = extractLanguageCode(language);
- alternateCasing = langCode.equals("tr") || langCode.equals("az");
+ this.alternateCasing = hasLanguage("tr", "az");
} else if ("BREAK".equals(firstWord)) {
breaks = parseBreaks(reader, line);
} else if ("WORDCHARS".equals(firstWord)) {
@@ -463,6 +465,17 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
+ private boolean hasLanguage(String... langCodes) {
+ if (language == null) return false;
+ String langCode = extractLanguageCode(language);
+ for (String code : langCodes) {
+ if (langCode.equals(code)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
static String extractLanguageCode(String isoCode) {
int underscore = isoCode.indexOf("_");
return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
@@ -910,7 +923,7 @@ public class Dictionary {
if (!hasStemExceptions) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0 && morphStart < line.length()) {
- hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
+ hasStemExceptions = hasStemException(line.substring(morphStart + 1));
}
}
@@ -963,6 +976,23 @@ public class Dictionary {
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
}
+ String toLowerCase(String word) {
+ char[] chars = new char[word.length()];
+ for (int i = 0; i < word.length(); i++) {
+ chars[i] = caseFold(word.charAt(i));
+ }
+ return new String(chars);
+ }
+
+ String toTitleCase(String word) {
+ char[] chars = new char[word.length()];
+ chars[0] = Character.toUpperCase(word.charAt(0));
+ for (int i = 1; i < word.length(); i++) {
+ chars[i] = caseFold(word.charAt(i));
+ }
+ return new String(chars);
+ }
+
private String sortWordsOffline(
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
OfflineSorter sorter =
@@ -1062,13 +1092,14 @@ public class Dictionary {
}
// we possibly have morphological data
int stemExceptionID = 0;
- if (hasStemExceptions && end + 1 < line.length()) {
- String stemException = parseStemException(line.substring(end + 1));
- if (stemException != null) {
- stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
- stemExceptionID =
- stemExceptionCount + 1; // we use '0' to indicate no exception for the form
- stemExceptions[stemExceptionCount++] = stemException;
+ if (end + 1 < line.length()) {
+ String morphData = line.substring(end + 1);
+ for (String datum : splitMorphData(morphData)) {
+ if (datum.startsWith("st:")) {
+ stemExceptionID = addStemException(datum.substring(3));
+ } else if (datum.startsWith("ph:") && datum.length() > 3) {
+ addPhoneticRepEntries(entry, datum.substring(3));
+ }
}
}
@@ -1088,6 +1119,52 @@ public class Dictionary {
}
}
+ private int addStemException(String stemException) {
+ stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
+ stemExceptions[stemExceptionCount++] = stemException;
+ return stemExceptionCount; // we use '0' to indicate no exception for the form
+ }
+
+ private void addPhoneticRepEntries(String word, String ph) {
+ // e.g. "pretty ph:prity ph:priti->pretti" to suggest both prity->pretty and pritiest->prettiest
+ int arrow = ph.indexOf("->");
+ String pattern;
+ String replacement;
+ if (arrow > 0) {
+ pattern = ph.substring(0, arrow);
+ replacement = ph.substring(arrow + 2);
+ } else {
+ pattern = ph;
+ replacement = word;
+ }
+
+ // when the ph: field ends with *, strip last character of pattern and replacement
+ // e.g., "pretty ph:prity*" results in "prit->prett" replacement instead of "prity->pretty",
+ // to get both prity->pretty and pritiest->prettiest suggestions.
+ if (pattern.endsWith("*") && pattern.length() > 2 && replacement.length() > 1) {
+ pattern = pattern.substring(0, pattern.length() - 2);
+ replacement = replacement.substring(0, replacement.length() - 1);
+ }
+
+ // capitalize lowercase pattern for capitalized words to support
+ // good suggestions also for capitalized misspellings,
+ // e.g. Wednesday ph:wendsay results in wendsay -> Wednesday and Wendsay -> Wednesday.
+ if (WordCase.caseOf(word) == WordCase.TITLE && WordCase.caseOf(pattern) == WordCase.LOWER) {
+ // add also lowercase word in the case of German or
+ // Hungarian to support lowercase suggestions lowercased by
+ // compound word generation or derivational suffixes
+ // for example by adjectival suffix "-i" of geographical names in Hungarian:
+ // Massachusetts ph:messzecsuzec
+ // messzecsuzeci -> massachusettsi (adjective)
+ // For lowercasing by conditional PFX rules, see e.g. germancompounding test
+ if (hasLanguage("de", "hu")) {
+ repTable.add(new RepEntry(pattern, toLowerCase(replacement)));
+ }
+ repTable.add(new RepEntry(toTitleCase(pattern), replacement));
+ }
+ repTable.add(new RepEntry(pattern, replacement));
+ }
+
boolean isDotICaseChangeDisallowed(char[] word) {
return word[0] == 'İ' && !alternateCasing;
}
@@ -1220,29 +1297,31 @@ public class Dictionary {
}
}
- private String parseStemException(String morphData) {
+ private boolean hasStemException(String morphData) {
+ for (String datum : splitMorphData(morphData)) {
+ if (datum.startsWith("st:")) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private List<String> splitMorphData(String morphData) {
// first see if it's an alias
if (morphAliasCount > 0) {
try {
int alias = Integer.parseInt(morphData.trim());
morphData = morphAliases[alias - 1];
- } catch (NumberFormatException e) {
- // fine
+ } catch (NumberFormatException ignored) {
}
}
- // try to parse morph entry
- int index = morphData.indexOf(" st:");
- if (index < 0) {
- index = morphData.indexOf("\tst:");
- }
- if (index >= 0) {
- int endIndex = indexOfSpaceOrTab(morphData, index + 1);
- if (endIndex < 0) {
- endIndex = morphData.length();
- }
- return morphData.substring(index + 4, endIndex);
+ if (morphData.isBlank()) {
+ return Collections.emptyList();
}
- return null;
+ return Arrays.stream(MORPH_KEY_PATTERN.split(morphData))
+ .map(String::trim)
+ .filter(s -> !s.isBlank())
+ .collect(Collectors.toList());
}
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 4dd91c0..0c60e1b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -16,9 +16,12 @@
*/
package org.apache.lucene.analysis.hunspell;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
+import java.util.List;
import java.util.Locale;
+import java.util.stream.Collectors;
class ModifyingSuggester {
private static final int MAX_CHAR_DISTANCE = 4;
@@ -36,6 +39,14 @@ class ModifyingSuggester {
WordCase wc = WordCase.caseOf(word);
+ if (wc == WordCase.UPPER) {
+ tryVariationsOf(speller.dictionary.toLowerCase(word));
+ tryVariationsOf(speller.dictionary.toTitleCase(word));
+ return result.stream()
+ .map(this::tryUpperCase)
+ .collect(Collectors.toCollection(LinkedHashSet::new));
+ }
+
if (wc == WordCase.MIXED) {
int dot = word.indexOf('.');
if (dot > 0
@@ -44,27 +55,24 @@ class ModifyingSuggester {
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
}
- tryVariationsOf(toLowerCase(word));
+ tryVariationsOf(speller.dictionary.toLowerCase(word));
}
return result;
}
- private String toLowerCase(String word) {
- char[] chars = new char[word.length()];
- for (int i = 0; i < word.length(); i++) {
- chars[i] = speller.dictionary.caseFold(word.charAt(i));
+ private String tryUpperCase(String candidate) {
+ String upper = candidate.toUpperCase(Locale.ROOT);
+ if (upper.contains(" ") || speller.spell(upper)) {
+ return upper;
}
- return new String(chars);
+ String title = speller.dictionary.toTitleCase(candidate);
+ return speller.spell(title) ? title : candidate;
}
private void tryVariationsOf(String word) {
- trySuggestion(word.toUpperCase(Locale.ROOT));
- if (checkDictionaryForSplitSuggestions(word)) {
- return;
- }
-
- tryRep(word);
+ boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
+ hasGoodSuggestions |= tryRep(word);
trySwappingChars(word);
tryLongSwap(word);
@@ -75,12 +83,24 @@ class ModifyingSuggester {
tryReplacingChar(word);
tryTwoDuplicateChars(word);
- if (speller.dictionary.enableSplitSuggestions) {
+ List<String> goodSplit = checkDictionaryForSplitSuggestions(word);
+ if (!goodSplit.isEmpty()) {
+ List<String> copy = new ArrayList<>(result);
+ result.clear();
+ result.addAll(goodSplit);
+ if (hasGoodSuggestions) {
+ result.addAll(copy);
+ }
+ hasGoodSuggestions = true;
+ }
+
+ if (!hasGoodSuggestions && speller.dictionary.enableSplitSuggestions) {
trySplitting(word);
}
}
- private void tryRep(String word) {
+ private boolean tryRep(String word) {
+ int before = result.size();
for (RepEntry entry : speller.dictionary.repTable) {
for (String candidate : entry.substitute(word)) {
if (trySuggestion(candidate)) {
@@ -88,11 +108,16 @@ class ModifyingSuggester {
}
if (candidate.contains(" ")
- && Arrays.stream(candidate.split(" ")).allMatch(speller::checkWord)) {
+ && Arrays.stream(candidate.split(" ")).allMatch(this::checkSimpleWord)) {
result.add(candidate);
}
}
}
+ return result.size() > before;
+ }
+
+ private boolean checkSimpleWord(String part) {
+ return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
}
private void trySwappingChars(String word) {
@@ -213,24 +238,30 @@ class ModifyingSuggester {
}
}
- private boolean checkDictionaryForSplitSuggestions(String word) {
- boolean found = false;
+ private List<String> checkDictionaryForSplitSuggestions(String word) {
+ List<String> result = new ArrayList<>();
for (int i = 1; i < word.length() - 1; i++) {
String w1 = word.substring(0, i);
String w2 = word.substring(i);
- found |= trySuggestion(w1 + " " + w2);
+ String spaced = w1 + " " + w2;
+ if (speller.checkWord(spaced)) {
+ result.add(spaced);
+ }
if (shouldSplitByDash()) {
- found |= trySuggestion(w1 + "-" + w2);
+ String dashed = w1 + "-" + w2;
+ if (speller.checkWord(dashed)) {
+ result.add(dashed);
+ }
}
}
- return found;
+ return result;
}
private void trySplitting(String word) {
for (int i = 1; i < word.length() - 1; i++) {
String w1 = word.substring(0, i);
String w2 = word.substring(i);
- if (speller.checkWord(w1) && speller.checkWord(w2)) {
+ if (checkSimpleWord(w1) && checkSimpleWord(w2)) {
result.add(w1 + " " + w2);
if (shouldSplitByDash()) {
result.add(w1 + "-" + w2);
@@ -244,10 +275,6 @@ class ModifyingSuggester {
}
private boolean trySuggestion(String candidate) {
- if (speller.checkWord(candidate)) {
- result.add(candidate);
- return true;
- }
- return false;
+ return speller.checkWord(candidate) && result.add(candidate);
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 1b0c2d3..53bf53e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -134,7 +134,7 @@ public class SpellChecker {
return checkWord(word.toCharArray(), word.length(), null);
}
- private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
+ Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
return false;
}
@@ -143,6 +143,15 @@ public class SpellChecker {
return true;
}
+ return null;
+ }
+
+ private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
+ Boolean simpleResult = checkSimpleWord(wordChars, length, originalCase);
+ if (simpleResult != null) {
+ return simpleResult;
+ }
+
if (dictionary.compoundRules != null
&& checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
return true;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index a2399af..f4ca6b5 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -44,10 +44,18 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps");
}
- public void rep() throws Exception {
+ public void testRepSuggestions() throws Exception {
doTest("rep");
}
+ public void testPhSuggestions() throws Exception {
+ doTest("ph");
+ }
+
+ public void testPhSuggestions2() throws Exception {
+ doTest("ph2");
+ }
+
public void testForceUCase() throws Exception {
doTest("forceucase");
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.aff
new file mode 100644
index 0000000..c7d26bb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.aff
@@ -0,0 +1,30 @@
+# new suggestion methods of Hunspell 1.7:
+# ph: for dictionary-based suggestions.
+#
+# For example, suggestions for "wich"
+# with this test dictionary:
+#
+# Hunspell 1.3.3
+# wich
+# & wich 4 0: winch, witch, which, wish
+#
+# Hunspell 1.6.2
+# wich
+# & wich 4 0: which, witch, winch, wish
+#
+# Suggestions will be limited for
+# the dictionary words with the same ph: field,
+# and for non-ngram suggestions.
+#
+# Order of the ph: suggestions for the
+# same misspelling, e.g. wich -> which, witch
+# follows the order of the words in the dictionary:
+#
+# which ph:wich
+# witch ph:witch
+#
+# switch off ngram suggestions to check only
+# ph: based suggestions
+MAXNGRAMSUGS 0
+
+TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'-
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.dic
new file mode 100644
index 0000000..e9462d5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.dic
@@ -0,0 +1,11 @@
+8
+a lot ph:alot
+in spite ph:inspite
+inspire
+what ph:whta ph:waht
+Wednesday ph:wendsay ph:wensday
+which ph:wich
+witch ph:wich
+winch
+wish
+Oh, my gosh! ph:omg
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.sug
new file mode 100644
index 0000000..8daee56
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.sug
@@ -0,0 +1,11 @@
+a lot
+in spite, inspire
+what
+what
+Wednesday
+Wednesday
+Wednesday
+Wednesday
+which, witch, winch, wish
+Oh, my gosh!
+OH, MY GOSH!
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.wrong
new file mode 100644
index 0000000..f51b31a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph.wrong
@@ -0,0 +1,11 @@
+alot
+inspite
+whta
+waht
+wensday
+wendsay
+Wensday
+Wendsay
+wich
+omg
+OMG
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.aff
new file mode 100644
index 0000000..d9d4288
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.aff
@@ -0,0 +1,32 @@
+# switch off ngram suggestion to test
+# usage of ph: dictionary fields in REP
+# suggestions
+SET UTF-8
+
+MAXNGRAMSUGS 0
+
+# test in compounds, too
+COMPOUNDFLAG Y
+
+# test also dictionary items with space,
+# and forbidden compounding, if there is
+# a ph: field with that compound as
+# misspelling in the dictionary
+CHECKCOMPOUNDREP
+
+# test in compound word with affixes
+SFX A Y 1
+SFX A 0 's .
+
+# when the ph: field ends with the character *,
+# strip last character of the pattern and the replacement
+# to match in REP suggestions also at character changes,
+# for example, "pretty ph:prity*" results "prit->prett"
+# REP replacement instead of "prity->pretty", to get
+# prity->pretty and pritiest->prettiest suggestions.
+
+SFX B Y 2
+SFX B y iest [^aeiou]y
+SFX B ö őt ö
+
+WORDCHARS '
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.dic
new file mode 100644
index 0000000..34482bb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.dic
@@ -0,0 +1,11 @@
+9
+foo ph:bar ph:baz
+foo bar ph:foobar
+word/Y ph:baz
+stem/Y ph: ph:
+forbidden/Y
+root/YA
+forbidden root/A ph:forbiddenroot
+pretty/B ph:prity*
+foobarö/B ph:fubarő*
+happy/B ph:hepy ph:hepi->happi
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.good
new file mode 100644
index 0000000..c471b4e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.good
@@ -0,0 +1,9 @@
+foo
+word
+stem
+wordstem
+stemword
+rootforbidden
+root's
+foobarö
+foobarőt
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.sug
new file mode 100644
index 0000000..916a607
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.sug
@@ -0,0 +1,14 @@
+foo
+foo, word
+foo bar
+wordstem
+stemword
+stemwordstem
+forbidden root
+forbidden root's
+pretty
+prettiest
+foobarö
+foobarőt
+happy
+happiest
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.wrong
new file mode 100644
index 0000000..74055eb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ph2.wrong
@@ -0,0 +1,15 @@
+bar
+baz
+foobar
+bazstem
+stembaz
+stembazstem
+forbiddenroot
+forbiddenroot's
+rootforbiddenroot
+prity
+pritiest
+fubarö
+fubarőt
+hepy
+hepiest
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff
index 4ccc6e2..485755c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/rep.aff
@@ -10,7 +10,7 @@ REP shun$ tion
REP ^alot$ a_lot # add the highest priority for "a lot" suggestion to "alot"
REP ^foo$ bar
REP ' _ # "un'alunno" -> "un alunno"
-REP ^vinteún$ vinte_e_un
+REP ^vinteún$ vinte_e_un
REP s 's