You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/08 10:00:07 UTC
[lucene-solr] branch master updated: LUCENE-9736: Hunspell: support
MAP-based suggestions for groups of similar letters (#2314)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 6536263 LUCENE-9736: Hunspell: support MAP-based suggestions for groups of similar letters (#2314)
6536263 is described below
commit 653626399f1d50cb8b90769ba38bed54a86e9352
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Feb 8 10:59:53 2021 +0100
LUCENE-9736: Hunspell: support MAP-based suggestions for groups of similar letters (#2314)
---
.../lucene/analysis/hunspell/Dictionary.java | 25 ++++++++++++++++++++++
.../analysis/hunspell/ModifyingSuggester.java | 25 ++++++++++++++++++++++
.../lucene/analysis/hunspell/SpellCheckerTest.java | 4 ++++
.../org/apache/lucene/analysis/hunspell/map.aff | 9 ++++++++
.../org/apache/lucene/analysis/hunspell/map.dic | 4 ++++
.../org/apache/lucene/analysis/hunspell/map.sug | 3 +++
.../org/apache/lucene/analysis/hunspell/map.wrong | 3 +++
7 files changed, 73 insertions(+)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 95a4b83..557037c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -168,6 +168,7 @@ public class Dictionary {
String[] neighborKeyGroups = new String[0];
boolean enableSplitSuggestions = true;
List<RepEntry> repTable = new ArrayList<>();
+ List<List<String>> mapTable = new ArrayList<>();
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
FST<CharsRef> iconv;
@@ -399,6 +400,11 @@ public class Dictionary {
String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE);
repTable.add(new RepEntry(parts[1], parts[2]));
}
+ } else if ("MAP".equals(firstWord)) {
+ int count = parseNum(reader, line);
+ for (int i = 0; i < count; i++) {
+ mapTable.add(parseMapEntry(reader, reader.readLine()));
+ }
} else if ("KEY".equals(firstWord)) {
neighborKeyGroups = singleArgument(reader, line).split("\\|");
} else if ("NOSPLITSUGS".equals(firstWord)) {
@@ -462,6 +468,25 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
+ private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException { // Splits one MAP line's argument into its alternatives: each parenthesized group becomes one multi-char entry, every other char becomes its own entry.
+ String unparsed = singleArgument(reader, line); // singleArgument is defined elsewhere; presumably yields the text after the MAP keyword - confirm
+ List<String> mapEntry = new ArrayList<>();
+ for (int j = 0; j < unparsed.length(); j++) {
+ if (unparsed.charAt(j) == '(') {
+ int closing = unparsed.indexOf(')', j); // search starts at j, i.e. at the '(' itself
+ if (closing < 0) {
+ throw new ParseException("Unclosed parenthesis: " + line, reader.getLineNumber()); // message carries the whole original line; offset is the reader's current line number
+ }
+
+ mapEntry.add(unparsed.substring(j + 1, closing)); // text strictly between '(' and ')'. NOTE(review): "()" yields an empty entry, which matches at every offset in enumerateMapReplacements without advancing - consider rejecting it here
+ j = closing; // the loop's j++ then steps past ')'
+ } else {
+ mapEntry.add(String.valueOf(unparsed.charAt(j))); // a single char (UTF-16 code unit) outside parentheses is its own entry
+ }
+ }
+ return mapEntry;
+ }
+
private boolean hasLanguage(String... langCodes) {
if (language == null) return false;
String langCode = extractLanguageCode(language);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 0c60e1b..cc763e2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -74,6 +74,10 @@ class ModifyingSuggester {
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
hasGoodSuggestions |= tryRep(word);
+ if (!speller.dictionary.mapTable.isEmpty()) {
+ enumerateMapReplacements(word, "", 0);
+ }
+
trySwappingChars(word);
tryLongSwap(word);
tryNeighborKeys(word);
@@ -116,6 +120,27 @@ class ModifyingSuggester {
return result.size() > before;
}
+ private void enumerateMapReplacements(String word, String accumulated, int offset) { // Recursively builds variants of word in which substrings from offset onward are swapped for other members of their MAP group; accumulated is the variant's prefix built so far.
+ if (offset == word.length()) { // whole word consumed: the candidate is complete
+ trySuggestion(accumulated); // also reached with the unchanged word when no substitution was taken on this path
+ return;
+ }
+
+ for (List<String> entries : speller.dictionary.mapTable) {
+ for (String entry : entries) {
+ if (word.regionMatches(offset, entry, 0, entry.length())) { // entry occurs in word exactly at offset
+ for (String replacement : entries) {
+ if (!entry.equals(replacement)) { // skip the identity substitution
+ enumerateMapReplacements(word, accumulated + replacement, offset + entry.length()); // substitute and continue after the matched entry
+ }
+ }
+ }
+ }
+ }
+
+ enumerateMapReplacements(word, accumulated + word.charAt(offset), offset + 1); // always also branch on keeping the original char, so branches multiply at every matching position
+ }
+
private boolean checkSimpleWord(String part) {
return Boolean.TRUE.equals(speller.checkSimpleWord(part.toCharArray(), part.length(), null));
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index f4ca6b5..f216cb0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -180,6 +180,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("sug2");
}
+ public void testMapSuggestions() throws Exception {
+ doTest("map"); // doTest resolves map.aff via getResource and passes the "map" base path to checkSpellCheckerExpectations
+ }
+
protected void doTest(String name) throws Exception {
checkSpellCheckerExpectations(
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.aff
new file mode 100644
index 0000000..3e78bab
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.aff
@@ -0,0 +1,9 @@
+# With MAP suggestion, Hunspell can add missing accents to a word.
+
+# switch off ngram suggestion for testing
+MAXNGRAMSUGS 0
+
+MAP 3
+MAP uúü
+MAP oóö
+MAP ß(ss)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.dic
new file mode 100644
index 0000000..744394f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.dic
@@ -0,0 +1,4 @@
+3
+Frühstück
+tükörfúró
+groß
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.sug
new file mode 100644
index 0000000..81d09e0
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.sug
@@ -0,0 +1,3 @@
+Frühstück
+tükörfúró
+groß
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.wrong
new file mode 100644
index 0000000..251c8a1
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/map.wrong
@@ -0,0 +1,3 @@
+Fruhstuck
+tukorfuro
+gross