You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/21 11:33:23 UTC
[lucene-solr] branch master updated: LUCENE-9681: Hunspell spellchecker: support numbers with separators (#2224)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new fdf04d8 LUCENE-9681: Hunspell spellchecker: support numbers with separators (#2224)
fdf04d8 is described below
commit fdf04d8c630c49b1be17109abb3c7a350228bac2
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Thu Jan 21 12:33:03 2021 +0100
LUCENE-9681: Hunspell spellchecker: support numbers with separators (#2224)
---
.../lucene/analysis/hunspell/SpellChecker.java | 28 ++++++++++++++++++++++
.../lucene/analysis/hunspell/SpellCheckerTest.java | 5 ++++
.../org/apache/lucene/analysis/hunspell/i53643.aff | 2 ++
.../org/apache/lucene/analysis/hunspell/i53643.dic | 2 ++
.../apache/lucene/analysis/hunspell/i53643.good | 21 ++++++++++++++++
.../apache/lucene/analysis/hunspell/i53643.wrong | 4 ++++
6 files changed, 62 insertions(+)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 741fdc4..a3e765b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -35,11 +35,17 @@ public class SpellChecker {
/** @return whether the given word's spelling is considered correct according to Hunspell rules */
public boolean spell(String word) {
+ if (word.isEmpty()) return true;
+
char[] wordChars = word.toCharArray();
if (dictionary.isForbiddenWord(wordChars, scratch)) {
return false;
}
+ if (isNumber(word)) {
+ return true;
+ }
+
if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
return true;
}
@@ -51,6 +57,28 @@ public class SpellChecker {
return false;
}
+ private static boolean isNumber(String s) { // true when s is only ASCII digits joined by single '.', ',' or '-' separators; an empty s also yields true
+ int i = 0;
+ while (i < s.length()) { // scan left to right; i always points at the next unchecked character
+ char c = s.charAt(i);
+ if (isDigit(c)) {
+ i++;
+ } else if (c == '.' || c == ',' || c == '-') { // the three accepted separator characters
+ if (i == 0 || i >= s.length() - 1 || !isDigit(s.charAt(i + 1))) { // a separator may not be first, last, or followed by a non-digit
+ return false;
+ }
+ i += 2; // skip the separator and the digit already verified right after it
+ } else {
+ return false; // any other character disqualifies the whole string
+ }
+ }
+ return true; // every character was accepted
+ }
+
+ private static boolean isDigit(char c) { // plain range check: only the characters '0' through '9' count as digits here
+ return c >= '0' && c <= '9';
+ }
+
private boolean tryBreaks(String word) {
for (String br : dictionary.breaks.starting) {
if (word.length() > br.length() && word.startsWith(br)) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 7be4eaf..a478dda 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -23,8 +23,13 @@ import java.nio.file.Path;
import java.util.Objects;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.IOUtils;
+import org.junit.Test;
public class SpellCheckerTest extends StemmerTestBase {
+ @Test
+ public void i53643_numbersWithSeparators() throws Exception { // covers the numbers-with-separators support added by this commit
+ doTest("i53643"); // same base name as the i53643.aff/.dic/.good/.wrong fixtures added in this commit; presumably doTest loads them — verify in the base class
+ }
public void testBreak() throws Exception {
doTest("break");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.aff
new file mode 100644
index 0000000..9fac6d8
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.aff
@@ -0,0 +1,2 @@
+# check numbers with separators
+WORDCHARS 0123456789.-,
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.dic
new file mode 100644
index 0000000..aec5d50
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.dic
@@ -0,0 +1,2 @@
+1
+foo
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.good
new file mode 100644
index 0000000..a387a36
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.good
@@ -0,0 +1,21 @@
+1
+12
+123
+1234
+12345
+123456
+1234567
+1.1
+1.12
+1.123
+1.1234
+1.12345
+1.123456
+12.1
+123.12
+1234.123
+12345.1234
+123456.12345
+1234567.123456
+4,2
+42-42
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.wrong
new file mode 100644
index 0000000..2eab32c
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/i53643.wrong
@@ -0,0 +1,4 @@
+1..2
+1,,2
+1.,2
+1,.2
\ No newline at end of file