You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/08 10:00:07 UTC
[lucene-solr] branch master updated: LUCENE-9736: Hunspell: support MAP-based suggestions for groups of similar letters (#2314)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 6536263  LUCENE-9736: Hunspell: support MAP-based suggestions for groups of similar letters (#2314)
6536263 is described below

commit 653626399f1d50cb8b90769ba38bed54a86e9352
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Feb 8 10:59:53 2021 +0100

    LUCENE-9736: Hunspell: support MAP-based suggestions for groups of similar letters (#2314)
---
 .../lucene/analysis/hunspell/Dictionary.java       | 25 ++++++++++++++++++++++
 .../analysis/hunspell/ModifyingSuggester.java      | 25 ++++++++++++++++++++++
 .../lucene/analysis/hunspell/SpellCheckerTest.java |  4 ++++
 .../org/apache/lucene/analysis/hunspell/map.aff    |  9 ++++++++
 .../org/apache/lucene/analysis/hunspell/map.dic    |  4 ++++
 .../org/apache/lucene/analysis/hunspell/map.sug    |  3 +++
 .../org/apache/lucene/analysis/hunspell/map.wrong  |  3 +++
 7 files changed, 73 insertions(+)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 95a4b83..557037c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -168,6 +168,7 @@ public class Dictionary {
   String[] neighborKeyGroups = new String[0];
   boolean enableSplitSuggestions = true;
   List<RepEntry> repTable = new ArrayList<>();
+  List<List<String>> mapTable = new ArrayList<>();
 
   // FSTs used for ICONV/OCONV, output ord pointing to replacement text
   FST<CharsRef> iconv;
@@ -399,6 +400,11 @@ public class Dictionary {
           String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE);
           repTable.add(new RepEntry(parts[1], parts[2]));
         }
+      } else if ("MAP".equals(firstWord)) {
+        int count = parseNum(reader, line);
+        for (int i = 0; i < count; i++) {
+          mapTable.add(parseMapEntry(reader, reader.readLine()));
+        }
       } else if ("KEY".equals(firstWord)) {
         neighborKeyGroups = singleArgument(reader, line).split("\\|");
       } else if ("NOSPLITSUGS".equals(firstWord)) {
@@ -462,6 +468,25 @@ public class Dictionary {
     stripOffsets[currentIndex] = currentOffset;
   }
 
+  private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException {
+    String unparsed = singleArgument(reader, line);
+    List<String> mapEntry = new ArrayList<>();
+    for (int j = 0; j < unparsed.length(); j++) {
+      if (unparsed.charAt(j) == '(') {
+        int closing = unparsed.indexOf(')', j);
+        if (closing < 0) {
+          throw new ParseException("Unclosed parenthesis: " + line, reader.getLineNumber());
+        }
+
+        mapEntry.add(unparsed.substring(j + 1, closing));
+        j = closing;
+      } else {
+        mapEntry.add(String.valueOf(unparsed.charAt(j)));
+      }
+    }
+    return mapEntry;
+  }
+
   private boolean hasLanguage(String... langCodes) {
     if (language == null) return false;
     String langCode = extractLanguageCode(language);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 0c60e1b..cc763e2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -74,6 +74,10 @@ class ModifyingSuggester {
     boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
     hasGoodSuggestions |= tryRep(word);
 
+    if (!speller.dictionary.mapTable.isEmpty()) {
+      enumerateMapReplacements(word, "", 0);
+    }
+
     trySwappingChars(word);
     tryLongSwap(word);
     tryNeighborKeys(word);
@@ -116,6 +120,27 @@ class ModifyingSuggester {
     return result.size() > before;
   }
 
+  private void enumerateMapReplacements(String word, String accumulated, int offset) {
+    if (offset == word.length()) {
+      trySuggestion(accumulated);
+      return;
+    }
+
+    for (List<String> entries : speller.dictionary.mapTable) {
+      for (String entry : entries) {
+        if (word.regionMatches(offset, entry, 0, entry.length())) {
+          for (String replacement : entries) {
+            if (!entry.equals(replacement)) {
+              enumerateMapReplacements(word, accumulated + replacement, offset + entry.length());
+            }
+          }
+        }
+      }
+    }
+
+    enumerateMapReplacements(word, accumulated + word.charAt(offset), offset + 1);
+  }
+
   private boolean checkSimpleWord(String part) {
     return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index f4ca6b5..f216cb0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -180,6 +180,10 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("sug2");
   }
 
+  public void testMapSuggestions() throws Exception {
+    doTest("map");
+  }
+
   protected void doTest(String name) throws Exception {
     checkSpellCheckerExpectations(
         Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.aff
new file mode 100644
index 0000000..3e78bab
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.aff
@@ -0,0 +1,9 @@
+# With MAP suggestion, Hunspell can add missing accents to a word.
+
+# switch off ngram suggestion for testing
+MAXNGRAMSUGS 0
+
+MAP 3
+MAP uúü
+MAP oóö
+MAP ß(ss)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.dic
new file mode 100644
index 0000000..744394f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.dic
@@ -0,0 +1,4 @@
+3
+Frühstück
+tükörfúró
+groß
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.sug
new file mode 100644
index 0000000..81d09e0
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.sug
@@ -0,0 +1,3 @@
+Frühstück
+tükörfúró
+groß
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.wrong
new file mode 100644
index 0000000..251c8a1
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.wrong
@@ -0,0 +1,3 @@
+Fruhstuck
+tukorfuro
+gross