You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/10 08:21:15 UTC

[lucene-solr] branch master updated: LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new c3166e1  LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332)
c3166e1 is described below

commit c3166e1dc355e827b19067f037b3a127b2ef79fa
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Wed Feb 10 09:21:01 2021 +0100

    LUCENE-9750: Hunspell: improve suggestions for mixed-case misspelled words (#2332)
---
 .../lucene/analysis/hunspell/Dictionary.java       |  2 +-
 .../analysis/hunspell/ModifyingSuggester.java      | 38 ++++++++++++++++++++--
 .../lucene/analysis/hunspell/SpellChecker.java     | 14 ++++----
 .../lucene/analysis/hunspell/SpellCheckerTest.java |  4 +++
 .../org/apache/lucene/analysis/hunspell/i58202.aff |  4 +++
 .../org/apache/lucene/analysis/hunspell/i58202.dic |  5 +++
 .../apache/lucene/analysis/hunspell/i58202.good    | 10 ++++++
 .../org/apache/lucene/analysis/hunspell/i58202.sug | 13 ++++++++
 .../apache/lucene/analysis/hunspell/i58202.wrong   | 13 ++++++++
 9 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index f39575c..99f60b6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -70,7 +70,7 @@ import org.apache.lucene.util.fst.Util;
 
 /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
 public class Dictionary {
-  // Derived from woorm/ openoffice dictionaries.
+  // Derived from woorm/LibreOffice dictionaries.
   // See TestAllDictionaries.testMaxPrologueNeeded.
   static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024;
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 08dd018..50c5dce 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -21,6 +21,7 @@ import java.util.Arrays;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.stream.Collectors;
 
 /** A class that modifies the given misspelled word in various ways to get correct suggestions */
 class ModifyingSuggester {
@@ -36,12 +37,17 @@ class ModifyingSuggester {
   }
 
   LinkedHashSet<String> suggest(String word, WordCase wordCase) {
+    String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
+    if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
+      trySuggestion(low);
+    }
+
     tryVariationsOf(word);
 
     if (wordCase == WordCase.TITLE) {
-      tryVariationsOf(speller.dictionary.toLowerCase(word));
+      tryVariationsOf(low);
     } else if (wordCase == WordCase.UPPER) {
-      tryVariationsOf(speller.dictionary.toLowerCase(word));
+      tryVariationsOf(low);
       tryVariationsOf(speller.dictionary.toTitleCase(word));
     } else if (wordCase == WordCase.MIXED) {
       int dot = word.indexOf('.');
@@ -51,12 +57,38 @@ class ModifyingSuggester {
         result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
       }
 
-      tryVariationsOf(speller.dictionary.toLowerCase(word));
+      boolean capitalized = Character.isUpperCase(word.charAt(0));
+      if (capitalized) {
+        tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
+      }
+
+      tryVariationsOf(low);
+
+      if (capitalized) {
+        tryVariationsOf(speller.dictionary.toTitleCase(low));
+      }
+
+      return result.stream()
+          .map(s -> capitalizeAfterSpace(low, s))
+          .collect(Collectors.toCollection(LinkedHashSet::new));
     }
 
     return result;
   }
 
+  // aNew -> "a New" (instead of "a new")
+  private String capitalizeAfterSpace(String lowMisspelled, String candidate) {
+    int space = candidate.indexOf(' ');
+    int tail = candidate.length() - space - 1;
+    if (space > 0
+        && lowMisspelled.regionMatches(lowMisspelled.length() - tail, candidate, space + 1, tail)) {
+      return candidate.substring(0, space + 1)
+          + Character.toUpperCase(candidate.charAt(space + 1))
+          + candidate.substring(space + 2);
+    }
+    return candidate;
+  }
+
   private void tryVariationsOf(String word) {
     hasGoodSuggestions |= trySuggestion(word.toUpperCase(Locale.ROOT));
     hasGoodSuggestions |= tryRep(word);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 0c7aaa0..482697f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -435,7 +435,7 @@ public class SpellChecker {
 
     Set<String> result = new LinkedHashSet<>();
     for (String candidate : suggestions) {
-      result.add(adjustSuggestionCase(candidate, wordCase));
+      result.add(adjustSuggestionCase(candidate, wordCase, word));
       if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
         result.add(candidate);
       }
@@ -443,16 +443,18 @@ public class SpellChecker {
     return result.stream().map(this::cleanOutput).collect(Collectors.toList());
   }
 
-  private String adjustSuggestionCase(String candidate, WordCase original) {
-    if (original == WordCase.UPPER) {
+  private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) {
+    if (originalCase == WordCase.UPPER) {
       String upper = candidate.toUpperCase(Locale.ROOT);
       if (upper.contains(" ") || spell(upper)) {
         return upper;
       }
     }
-    if (original == WordCase.UPPER || original == WordCase.TITLE) {
-      String title = dictionary.toTitleCase(candidate);
-      return spell(title) ? title : candidate;
+    if (Character.isUpperCase(original.charAt(0))) {
+      String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
+      if (title.contains(" ") || spell(title)) {
+        return title;
+      }
     }
     return candidate;
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 6ee3994..441e5d8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -184,6 +184,10 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("sug2");
   }
 
+  public void testMixedCaseSuggestionHeuristics() throws Exception {
+    doTest("i58202");
+  }
+
   public void testMapSuggestions() throws Exception {
     doTest("map");
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff
new file mode 100644
index 0000000..11249d4
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.aff
@@ -0,0 +1,4 @@
+# case suggestions
+MAXNGRAMSUGS 0
+# capitalise baz->Baz
+TRY B
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic
new file mode 100644
index 0000000..19e1980
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.dic
@@ -0,0 +1,5 @@
+4
+foo
+bar
+Baz
+Boo
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good
new file mode 100644
index 0000000..88a079a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.good
@@ -0,0 +1,10 @@
+foo
+bar
+Foo
+Bar
+Baz
+Boo
+FOO
+BAR
+BAZ
+BOO
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug
new file mode 100644
index 0000000..bc784ac
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.sug
@@ -0,0 +1,13 @@
+foo, Boo
+Bar
+Baz
+Boo
+foo bar
+foo Bar
+Foo bar
+Foo Bar
+foo Baz
+Foo Baz
+Baz foo
+Baz Foo
+Baz Boo
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong
new file mode 100644
index 0000000..886584d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i58202.wrong
@@ -0,0 +1,13 @@
+fOO
+BAr
+baz
+BOo
+foobar
+fooBar
+Foobar
+FooBar
+fooBaz
+FooBaz
+Bazfoo
+BazFoo
+BazBoo