You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/10 08:21:15 UTC
[lucene-solr] branch master updated: LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new c3166e1 LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332)
c3166e1 is described below
commit c3166e1dc355e827b19067f037b3a127b2ef79fa
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Wed Feb 10 09:21:01 2021 +0100
LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332)
---
.../lucene/analysis/hunspell/Dictionary.java | 2 +-
.../analysis/hunspell/ModifyingSuggester.java | 38 ++++++++++++++++++++--
.../lucene/analysis/hunspell/SpellChecker.java | 14 ++++----
.../lucene/analysis/hunspell/SpellCheckerTest.java | 4 +++
.../org/apache/lucene/analysis/hunspell/i58202.aff | 4 +++
.../org/apache/lucene/analysis/hunspell/i58202.dic | 5 +++
.../apache/lucene/analysis/hunspell/i58202.good | 10 ++++++
.../org/apache/lucene/analysis/hunspell/i58202.sug | 13 ++++++++
.../apache/lucene/analysis/hunspell/i58202.wrong | 13 ++++++++
9 files changed, 93 insertions(+), 10 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index f39575c..99f60b6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -70,7 +70,7 @@ import org.apache.lucene.util.fst.Util;
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
public class Dictionary {
- // Derived from woorm/ openoffice dictionaries.
+ // Derived from woorm/LibreOffice dictionaries.
// See TestAllDictionaries.testMaxPrologueNeeded.
static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 08dd018..50c5dce 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -21,6 +21,7 @@ import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
+import java.util.stream.Collectors;
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
class ModifyingSuggester {
@@ -36,12 +37,17 @@ class ModifyingSuggester {
}
LinkedHashSet<String> suggest(String word, WordCase wordCase) {
+ String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
+ if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
+ trySuggestion(low);
+ }
+
tryVariationsOf(word);
if (wordCase == WordCase.TITLE) {
- tryVariationsOf(speller.dictionary.toLowerCase(word));
+ tryVariationsOf(low);
} else if (wordCase == WordCase.UPPER) {
- tryVariationsOf(speller.dictionary.toLowerCase(word));
+ tryVariationsOf(low);
tryVariationsOf(speller.dictionary.toTitleCase(word));
} else if (wordCase == WordCase.MIXED) {
int dot = word.indexOf('.');
@@ -51,12 +57,38 @@ class ModifyingSuggester {
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
}
- tryVariationsOf(speller.dictionary.toLowerCase(word));
+ boolean capitalized = Character.isUpperCase(word.charAt(0));
+ if (capitalized) {
+ tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
+ }
+
+ tryVariationsOf(low);
+
+ if (capitalized) {
+ tryVariationsOf(speller.dictionary.toTitleCase(low));
+ }
+
+ return result.stream()
+ .map(s -> capitalizeAfterSpace(low, s))
+ .collect(Collectors.toCollection(LinkedHashSet::new));
}
return result;
}
+ // aNew -> "a New" (instead of "a new")
+ private String capitalizeAfterSpace(String lowMisspelled, String candidate) {
+ int space = candidate.indexOf(' ');
+ int tail = candidate.length() - space - 1;
+ if (space > 0
+ && lowMisspelled.regionMatches(lowMisspelled.length() - tail, candidate, space + 1, tail)) {
+ return candidate.substring(0, space + 1)
+ + Character.toUpperCase(candidate.charAt(space + 1))
+ + candidate.substring(space + 2);
+ }
+ return candidate;
+ }
+
private void tryVariationsOf(String word) {
hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT));
hasGoodSuggestions |= tryRep(word);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 0c7aaa0..482697f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -435,7 +435,7 @@ public class SpellChecker {
Set<String> result = new LinkedHashSet<>();
for (String candidate : suggestions) {
- result.add(adjustSuggestionCase(candidate, wordCase));
+ result.add(adjustSuggestionCase(candidate, wordCase, word));
if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
result.add(candidate);
}
@@ -443,16 +443,18 @@ public class SpellChecker {
return result.stream().map(this::cleanOutput).collect(Collectors.toList());
}
- private String adjustSuggestionCase(String candidate, WordCase original) {
- if (original == WordCase.UPPER) {
+ private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) {
+ if (originalCase == WordCase.UPPER) {
String upper = candidate.toUpperCase(Locale.ROOT);
if (upper.contains(" ") || spell(upper)) {
return upper;
}
}
- if (original == WordCase.UPPER || original == WordCase.TITLE) {
- String title = dictionary.toTitleCase(candidate);
- return spell(title) ? title : candidate;
+ if (Character.isUpperCase(original.charAt(0))) {
+ String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
+ if (title.contains(" ") || spell(title)) {
+ return title;
+ }
}
return candidate;
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 6ee3994..441e5d8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -184,6 +184,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("sug2");
}
+ public void testMixedCaseSuggestionHeuristics() throws Exception {
+ doTest("i58202");
+ }
+
public void testMapSuggestions() throws Exception {
doTest("map");
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff
new file mode 100644
index 0000000..11249d4
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff
@@ -0,0 +1,4 @@
+# case suggestions
+MAXNGRAMSUGS 0
+# capitalise baz->Baz
+TRY B
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic
new file mode 100644
index 0000000..19e1980
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic
@@ -0,0 +1,5 @@
+4
+foo
+bar
+Baz
+Boo
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good
new file mode 100644
index 0000000..88a079a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good
@@ -0,0 +1,10 @@
+foo
+bar
+Foo
+Bar
+Baz
+Boo
+FOO
+BAR
+BAZ
+BOO
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug
new file mode 100644
index 0000000..bc784ac
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug
@@ -0,0 +1,13 @@
+foo, Boo
+Bar
+Baz
+Boo
+foo bar
+foo Bar
+Foo bar
+Foo Bar
+foo Baz
+Foo Baz
+Baz foo
+Baz Foo
+Baz Boo
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong
new file mode 100644
index 0000000..886584d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong
@@ -0,0 +1,13 @@
+fOO
+BAr
+baz
+BOo
+foobar
+fooBar
+Foobar
+FooBar
+fooBaz
+FooBaz
+Bazfoo
+BazFoo
+BazBoo