Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/20 09:57:45 UTC

[lucene-solr] branch master updated: LUCENE-9667: Hunspell: add spellchecker API, support BREAK and FORBIDDENWORD affix rules (#2207)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 939699f  LUCENE-9667: Hunspell: add spellchecker API, support BREAK and FORBIDDENWORD affix rules (#2207)
939699f is described below

commit 939699f5509673c5d7d3e650ebda3d240b259044
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Wed Jan 20 10:57:27 2021 +0100

    LUCENE-9667: Hunspell: add spellchecker API, support BREAK and FORBIDDENWORD affix rules (#2207)
---
 gradle/validation/rat-sources.gradle               |   2 +
 lucene/CHANGES.txt                                 |   4 +-
 .../lucene/analysis/hunspell/Dictionary.java       |  76 +++++++++++++++
 .../lucene/analysis/hunspell/SpellChecker.java     | 104 +++++++++++++++++++++
 .../apache/lucene/analysis/hunspell/Stemmer.java   |   2 +-
 .../lucene/analysis/hunspell/SpellCheckerTest.java |  71 ++++++++++++++
 .../org/apache/lucene/analysis/hunspell/break.aff  |  10 ++
 .../org/apache/lucene/analysis/hunspell/break.dic  |   7 ++
 .../org/apache/lucene/analysis/hunspell/break.good |  12 +++
 .../apache/lucene/analysis/hunspell/break.wrong    |  13 +++
 .../lucene/analysis/hunspell/breakdefault.aff      |   6 ++
 .../lucene/analysis/hunspell/breakdefault.dic      |   6 ++
 .../lucene/analysis/hunspell/breakdefault.good     |   7 ++
 .../lucene/analysis/hunspell/breakdefault.wrong    |   6 ++
 .../apache/lucene/analysis/hunspell/breakoff.aff   |   7 ++
 .../apache/lucene/analysis/hunspell/breakoff.dic   |   6 ++
 .../apache/lucene/analysis/hunspell/breakoff.good  |   3 +
 .../apache/lucene/analysis/hunspell/breakoff.wrong |   5 +
 18 files changed, 344 insertions(+), 3 deletions(-)

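The most user-visible piece of this change is the new SpellChecker class. Below is a minimal usage sketch assembled from the SpellCheckerTest code further down in this patch; the Dictionary constructor, SpellChecker, and spell(String) appear in the diff, while the HunspellSpellDemo class name and the dictionary.aff/dictionary.dic file names are placeholders for illustration only.

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import org.apache.lucene.analysis.hunspell.Dictionary;
    import org.apache.lucene.analysis.hunspell.SpellChecker;
    import org.apache.lucene.store.ByteBuffersDirectory;

    public class HunspellSpellDemo {
      public static void main(String[] args) throws Exception {
        // load a Hunspell affix/dictionary pair; the file names here are placeholders
        try (InputStream affix = Files.newInputStream(Path.of("dictionary.aff"));
            InputStream dic = Files.newInputStream(Path.of("dictionary.dic"))) {
          // the Directory and name prefix are used for temporary files while the dictionary is loaded
          Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "dictionary", affix, dic);
          // SpellChecker instances are not thread-safe; the underlying Dictionary can be
          // shared across threads (see the class javadoc in the diff below)
          SpellChecker speller = new SpellChecker(dictionary);
          System.out.println(speller.spell("foo-bar")); // true or false, depending on the dictionary and BREAK rules
        }
      }
    }
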
diff --git a/gradle/validation/rat-sources.gradle b/gradle/validation/rat-sources.gradle
index 3298b95..9454f0f 100644
--- a/gradle/validation/rat-sources.gradle
+++ b/gradle/validation/rat-sources.gradle
@@ -54,6 +54,8 @@ configure(project(":lucene:analysis:common")) {
         srcExcludes += [
             "**/*.aff",
             "**/*.dic",
+            "**/*.wrong",
+            "**/*.good",
             "**/charfilter/*.htm*",
             "**/*LuceneResourcesWikiPage.html"
         ]
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 92c0c32..fa1d09c 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -86,8 +86,8 @@ API Changes
 
 Improvements
 
-* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
-  (Peter Gromov)
+* LUCENE-9665 LUCENE-9676 LUCENE-9667: Hunspell improvements: add SpellChecker API, support default encoding and
+  BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
 
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
   (Dawid Weiss)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 34edb73..bf4b1d0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -34,13 +34,16 @@ import java.nio.file.Paths;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -87,6 +90,8 @@ public class Dictionary {
   private static final String OCONV_KEY = "OCONV";
   private static final String FULLSTRIP_KEY = "FULLSTRIP";
   private static final String LANG_KEY = "LANG";
+  private static final String BREAK_KEY = "BREAK";
+  private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
   private static final String KEEPCASE_KEY = "KEEPCASE";
   private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
   private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
@@ -103,6 +108,7 @@ public class Dictionary {
 
   FST<IntsRef> prefixes;
   FST<IntsRef> suffixes;
+  Breaks breaks = Breaks.DEFAULT;
 
   // all condition checks used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
@@ -155,6 +161,7 @@ public class Dictionary {
   int circumfix = -1; // circumfix flag, or -1 if one is not defined
   int keepcase = -1; // keepcase flag, or -1 if one is not defined
   int needaffix = -1; // needaffix flag, or -1 if one is not defined
+  int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
   int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
 
   // ignored characters (dictionary, affix, inputs)
@@ -256,6 +263,10 @@ public class Dictionary {
     }
   }
 
+  int formStep() {
+    return hasStemExceptions ? 2 : 1;
+  }
+
   /** Looks up Hunspell word forms from the dictionary */
   IntsRef lookupWord(char[] word, int offset, int length) {
     return lookup(words, word, offset, length);
@@ -400,6 +411,14 @@ public class Dictionary {
       } else if (line.startsWith(LANG_KEY)) {
         language = line.substring(LANG_KEY.length()).trim();
         alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
+      } else if (line.startsWith(BREAK_KEY)) {
+        breaks = parseBreaks(reader, line);
+      } else if (line.startsWith(FORBIDDENWORD_KEY)) {
+        String[] parts = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
+        }
+        forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
       }
     }
 
@@ -423,6 +442,30 @@ public class Dictionary {
     stripOffsets[currentIndex] = currentOffset;
   }
 
+  private Breaks parseBreaks(LineNumberReader reader, String line)
+      throws IOException, ParseException {
+    Set<String> starting = new LinkedHashSet<>();
+    Set<String> ending = new LinkedHashSet<>();
+    Set<String> middle = new LinkedHashSet<>();
+    int num = Integer.parseInt(line.substring(BREAK_KEY.length()).trim());
+    for (int i = 0; i < num; i++) {
+      line = reader.readLine();
+      String[] parts = line.split("\\s+");
+      if (!line.startsWith(BREAK_KEY) || parts.length != 2) {
+        throw new ParseException("BREAK chars expected", reader.getLineNumber());
+      }
+      String breakStr = parts[1];
+      if (breakStr.startsWith("^")) {
+        starting.add(breakStr.substring(1));
+      } else if (breakStr.endsWith("$")) {
+        ending.add(breakStr.substring(0, breakStr.length() - 1));
+      } else {
+        middle.add(breakStr);
+      }
+    }
+    return new Breaks(starting, ending, middle);
+  }
+
   private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
     IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
     FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
@@ -1143,6 +1186,22 @@ public class Dictionary {
     return null;
   }
 
+  boolean isForbiddenWord(char[] word, BytesRef scratch) {
+    if (forbiddenword != -1) {
+      IntsRef forms = lookupWord(word, 0, word.length);
+      if (forms != null) {
+        int formStep = formStep();
+        for (int i = 0; i < forms.length; i += formStep) {
+          flagLookup.get(forms.ints[forms.offset + i], scratch);
+          if (hasFlag(Dictionary.decodeFlags(scratch), (char) forbiddenword)) {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
   /** Abstraction of the process of parsing flags taken from the affix and dic files */
   abstract static class FlagParsingStrategy {
 
@@ -1371,4 +1430,21 @@ public class Dictionary {
 
     return DEFAULT_TEMP_DIR;
   }
+
+  /** Possible word breaks according to BREAK directives */
+  static class Breaks {
+    private static final Set<String> MINUS = Collections.singleton("-");
+    static final Breaks DEFAULT = new Breaks(MINUS, MINUS, MINUS);
+    final String[] starting, ending, middle;
+
+    Breaks(Collection<String> starting, Collection<String> ending, Collection<String> middle) {
+      this.starting = starting.toArray(new String[0]);
+      this.ending = ending.toArray(new String[0]);
+      this.middle = middle.toArray(new String[0]);
+    }
+
+    boolean isNotEmpty() {
+      return middle.length > 0 || starting.length > 0 || ending.length > 0;
+    }
+  }
 }
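
One small detail worth calling out in parseBreaks above: following Hunspell conventions, a leading "^" anchors a break string to the start of the word and a trailing "$" to its end (the anchor character itself is stripped before storing); everything else is a middle break. A standalone sketch of that classification, with hypothetical names used only for illustration:

    class BreakClassifierSketch {
      enum Kind { STARTING, ENDING, MIDDLE }

      // mirrors the branch in Dictionary.parseBreaks; names are illustrative only
      static Kind classify(String breakStr) {
        if (breakStr.startsWith("^")) {
          return Kind.STARTING; // "BREAK ^-": a word may start with "-", the remainder is spell-checked on its own
        }
        if (breakStr.endsWith("$")) {
          return Kind.ENDING;   // "BREAK -$": a word may end with "-", the part before it is spell-checked on its own
        }
        return Kind.MIDDLE;     // "BREAK -":  the word may be split at an inner "-", both halves are spell-checked
      }
    }
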
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
new file mode 100644
index 0000000..741fdc4
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
+ * (but a single underlying Dictionary can be shared by multiple spell-checkers in different
+ * threads). Not all Hunspell features are supported yet.
+ */
+public class SpellChecker {
+  private final Dictionary dictionary;
+  private final BytesRef scratch = new BytesRef();
+  private final Stemmer stemmer;
+
+  public SpellChecker(Dictionary dictionary) {
+    this.dictionary = dictionary;
+    stemmer = new Stemmer(dictionary);
+  }
+
+  /** @return whether the given word's spelling is considered correct according to Hunspell rules */
+  public boolean spell(String word) {
+    char[] wordChars = word.toCharArray();
+    if (dictionary.isForbiddenWord(wordChars, scratch)) {
+      return false;
+    }
+
+    if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
+      return true;
+    }
+
+    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
+      return tryBreaks(word);
+    }
+
+    return false;
+  }
+
+  private boolean tryBreaks(String word) {
+    for (String br : dictionary.breaks.starting) {
+      if (word.length() > br.length() && word.startsWith(br)) {
+        if (spell(word.substring(br.length()))) {
+          return true;
+        }
+      }
+    }
+
+    for (String br : dictionary.breaks.ending) {
+      if (word.length() > br.length() && word.endsWith(br)) {
+        if (spell(word.substring(0, word.length() - br.length()))) {
+          return true;
+        }
+      }
+    }
+
+    for (String br : dictionary.breaks.middle) {
+      int pos = word.indexOf(br);
+      if (canBeBrokenAt(word, br, pos)) {
+        return true;
+      }
+
+      // try to break at the second occurrence
+      // to recognize dictionary words with a word break
+      if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private boolean hasTooManyBreakOccurrences(String word) {
+    int occurrences = 0;
+    for (String br : dictionary.breaks.middle) {
+      int pos = 0;
+      while ((pos = word.indexOf(br, pos)) >= 0) {
+        if (++occurrences >= 10) return true;
+        pos += br.length();
+      }
+    }
+    return false;
+  }
+
+  private boolean canBeBrokenAt(String word, String breakStr, int breakPos) {
+    return breakPos > 0
+        && breakPos < word.length() - breakStr.length()
+        && spell(word.substring(0, breakPos))
+        && spell(word.substring(breakPos + breakStr.length()));
+  }
+}
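
The recursion here is what lets a word break more than once: canBeBrokenAt calls spell() on both halves, and each half may in turn be broken again, while hasTooManyBreakOccurrences bounds the work at ten middle-break occurrences. A deliberately simplified sketch of the same idea, assuming a plain word set, a single "-" middle break, and only its first occurrence (the real code also handles starting/ending breaks and a second occurrence):

    import java.util.Set;

    class RecursiveBreakSketch {
      // simplified stand-in for SpellChecker.spell + tryBreaks + canBeBrokenAt;
      // a plain word set replaces the Dictionary/Stemmer lookup
      static boolean spell(Set<String> dic, String word) {
        if (dic.contains(word)) {
          return true;
        }
        int pos = word.indexOf('-');
        // neither fragment may be empty, mirroring the breakPos > 0 and length checks above
        return pos > 0
            && pos < word.length() - 1
            && spell(dic, word.substring(0, pos))
            && spell(dic, word.substring(pos + 1)); // the right half may itself break again
      }

      public static void main(String[] args) {
        Set<String> dic = Set.of("foo", "bar");
        System.out.println(spell(dic, "foo-bar-foo-bar")); // true: breaks recursively at each "-"
        System.out.println(spell(dic, "foo-bax"));         // false: "bax" is not in the word set
      }
    }
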
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 0e06d3d..10ae992 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -64,7 +64,7 @@ final class Stemmer {
         suffixReaders[level] = dictionary.suffixes.getBytesReader();
       }
     }
-    formStep = dictionary.hasStemExceptions ? 2 : 1;
+    formStep = dictionary.formStep();
   }
 
   /**
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
new file mode 100644
index 0000000..7be4eaf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.io.InputStream;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Objects;
+import org.apache.lucene.store.ByteBuffersDirectory;
+import org.apache.lucene.util.IOUtils;
+
+public class SpellCheckerTest extends StemmerTestBase {
+
+  public void testBreak() throws Exception {
+    doTest("break");
+  }
+
+  public void testBreakDefault() throws Exception {
+    doTest("breakdefault");
+  }
+
+  public void testBreakOff() throws Exception {
+    doTest("breakoff");
+  }
+
+  protected void doTest(String name) throws Exception {
+    InputStream affixStream =
+        Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
+    InputStream dictStream =
+        Objects.requireNonNull(getClass().getResourceAsStream(name + ".dic"), name);
+
+    SpellChecker speller;
+    try {
+      Dictionary dictionary =
+          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
+      speller = new SpellChecker(dictionary);
+    } finally {
+      IOUtils.closeWhileHandlingException(affixStream);
+      IOUtils.closeWhileHandlingException(dictStream);
+    }
+
+    URL good = StemmerTestBase.class.getResource(name + ".good");
+    if (good != null) {
+      for (String word : Files.readAllLines(Path.of(good.toURI()))) {
+        assertTrue("Unexpectedly considered misspelled: " + word, speller.spell(word));
+      }
+    }
+
+    URL wrong = StemmerTestBase.class.getResource(name + ".wrong");
+    if (wrong != null) {
+      for (String word : Files.readAllLines(Path.of(wrong.toURI()))) {
+        assertFalse("Unexpectedly considered correct: " + word, speller.spell(word));
+      }
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff
new file mode 100644
index 0000000..55d0609
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff
@@ -0,0 +1,10 @@
+# word break points test, recursive break at dash and n-dash
+SET UTF-8
+
+BREAK 2
+BREAK -
+BREAK –
+
+WORDCHARS -–
+
+FORBIDDENWORD !
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic
new file mode 100644
index 0000000..b2c5741
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic
@@ -0,0 +1,7 @@
+6
+foo
+bar
+baz
+fox-bax
+foo-baz/!
+e-mail
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good
new file mode 100644
index 0000000..d651a63
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good
@@ -0,0 +1,12 @@
+foo
+bar
+fox-bax
+foo-bar
+foo–bar
+foo-bar-foo-bar
+foo-bar–foo-bar
+bar-baz
+baz-foo
+foo-bar-foo-bar-foo-bar-foo-bar-foo-bar
+e-mail
+e-mail-foo
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong
new file mode 100644
index 0000000..d03b402
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong
@@ -0,0 +1,13 @@
+fox
+bax
+-foo
+bar-
+fox-bar
+foo-bax
+foo–bax
+fox–bar
+foo-bar-fox-bar
+foo-bax-foo-bar
+foo-bar–fox-bar
+foo-bax–foo-bar
+foo-baz
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff
new file mode 100644
index 0000000..a13f464
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff
@@ -0,0 +1,6 @@
+# default word break at hyphens and n-dashes
+
+SET UTF-8
+MAXNGRAMSUGS 0
+WORDCHARS -
+TRY ot
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic
new file mode 100644
index 0000000..bf29960
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic
@@ -0,0 +1,6 @@
+3
+foo
+bar
+free
+scott
+scot-free
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good
new file mode 100644
index 0000000..8d81254
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good
@@ -0,0 +1,7 @@
+foo
+bar
+foo-
+-foo
+scot-free
+foo-bar
+foo-bar-foo-bar
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong
new file mode 100644
index 0000000..e070c5c
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong
@@ -0,0 +1,6 @@
+scot
+sco-free
+fo-bar
+foo-fo-bar
+foo-foo-fo
+-
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff
new file mode 100644
index 0000000..2e83d38
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff
@@ -0,0 +1,7 @@
+# switch off default word break at hyphens and n-dashes by BREAK 0
+SET UTF-8
+MAXNGRAMSUGS 0
+WORDCHARS -
+TRY ot
+
+BREAK 0
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic
new file mode 100644
index 0000000..bf29960
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic
@@ -0,0 +1,6 @@
+3
+foo
+bar
+free
+scott
+scot-free
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good
new file mode 100644
index 0000000..854b39e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good
@@ -0,0 +1,3 @@
+foo
+bar
+scot-free
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong
new file mode 100644
index 0000000..a6fcf7f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong
@@ -0,0 +1,5 @@
+foo-
+-foo
+foo-bar
+foo-bar-foo-bar
+scot