You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/21 11:33:23 UTC

[lucene-solr] branch master updated: LUCENE-9681: Hunspell spellchecker: support numbers with separators (#2224)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new fdf04d8  LUCENE-9681: Hunspell spellchecker: support numbers with separators (#2224)
fdf04d8 is described below

commit fdf04d8c630c49b1be17109abb3c7a350228bac2
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Thu Jan 21 12:33:03 2021 +0100

    LUCENE-9681: Hunspell spellchecker: support numbers with separators (#2224)
---
 .../lucene/analysis/hunspell/SpellChecker.java     | 28 ++++++++++++++++++++++
 .../lucene/analysis/hunspell/SpellCheckerTest.java |  5 ++++
 .../org/apache/lucene/analysis/hunspell/i53643.aff |  2 ++
 .../org/apache/lucene/analysis/hunspell/i53643.dic |  2 ++
 .../apache/lucene/analysis/hunspell/i53643.good    | 21 ++++++++++++++++
 .../apache/lucene/analysis/hunspell/i53643.wrong   |  4 ++++
 6 files changed, 62 insertions(+)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 741fdc4..a3e765b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -35,11 +35,17 @@ public class SpellChecker {
 
   /** @return whether the given word's spelling is considered correct according to Hunspell rules */
   public boolean spell(String word) {
+    if (word.isEmpty()) return true;
+
     char[] wordChars = word.toCharArray();
     if (dictionary.isForbiddenWord(wordChars, scratch)) {
       return false;
     }
 
+    if (isNumber(word)) {
+      return true;
+    }
+
     if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
       return true;
     }
@@ -51,6 +57,28 @@ public class SpellChecker {
     return false;
   }
 
+  private static boolean isNumber(String s) {
+    int i = 0;
+    while (i < s.length()) {
+      char c = s.charAt(i);
+      if (isDigit(c)) {
+        i++;
+      } else if (c == '.' || c == ',' || c == '-') {
+        if (i == 0 || i >= s.length() - 1 || !isDigit(s.charAt(i + 1))) {
+          return false;
+        }
+        i += 2;
+      } else {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private static boolean isDigit(char c) {
+    return c >= '0' && c <= '9';
+  }
+
   private boolean tryBreaks(String word) {
     for (String br : dictionary.breaks.starting) {
       if (word.length() > br.length() && word.startsWith(br)) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 7be4eaf..a478dda 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -23,8 +23,13 @@ import java.nio.file.Path;
 import java.util.Objects;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.util.IOUtils;
+import org.junit.Test;
 
 public class SpellCheckerTest extends StemmerTestBase {
+  @Test
+  public void i53643_numbersWithSeparators() throws Exception {
+    doTest("i53643");
+  }
 
   public void testBreak() throws Exception {
     doTest("break");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.aff
new file mode 100644
index 0000000..9fac6d8
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.aff
@@ -0,0 +1,2 @@
+# check numbers with separators
+WORDCHARS 0123456789.-,
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.dic
new file mode 100644
index 0000000..aec5d50
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.dic
@@ -0,0 +1,2 @@
+1
+foo
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.good
new file mode 100644
index 0000000..a387a36
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.good
@@ -0,0 +1,21 @@
+1
+12
+123
+1234
+12345
+123456
+1234567
+1.1
+1.12
+1.123
+1.1234
+1.12345
+1.123456
+12.1
+123.12
+1234.123
+12345.1234
+123456.12345
+1234567.123456
+4,2
+42-42
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.wrong
new file mode 100644
index 0000000..2eab32c
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.wrong
@@ -0,0 +1,4 @@
+1..2
+1,,2
+1.,2
+1,.2
\ No newline at end of file