You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/29 07:23:51 UTC
[lucene-solr] branch master updated: LUCENE-9700: Hunspell: support words with trailing dots (#2249)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 800f4d0 LUCENE-9700: Hunspell: support words with trailing dots (#2249)
800f4d0 is described below
commit 800f4d0919f567dd9878e9cfa9a545afbf7080d9
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Jan 29 08:23:03 2021 +0100
LUCENE-9700: Hunspell: support words with trailing dots (#2249)
---
.../lucene/analysis/hunspell/SpellChecker.java | 16 ++
.../lucene/analysis/hunspell/SpellCheckerTest.java | 10 ++
.../lucene/analysis/hunspell/TestKeepCase.java | 6 +
.../org/apache/lucene/analysis/hunspell/base.aff | 192 +++++++++++++++++++++
.../org/apache/lucene/analysis/hunspell/base.dic | 29 ++++
.../org/apache/lucene/analysis/hunspell/base.good | 28 +++
.../org/apache/lucene/analysis/hunspell/base.wrong | 11 ++
.../apache/lucene/analysis/hunspell/keepcase.dic | 6 +-
.../apache/lucene/analysis/hunspell/keepcase.good | 4 +
.../apache/lucene/analysis/hunspell/keepcase.wrong | 8 +
10 files changed, 309 insertions(+), 1 deletion(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 4056db6..e32a805 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -44,6 +44,14 @@ public class SpellChecker {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
+ if (word.endsWith(".")) {
+ return spellWithTrailingDots(word);
+ }
+
+ return spellClean(word);
+ }
+
+ private boolean spellClean(String word) {
if (isNumber(word)) {
return true;
}
@@ -67,6 +75,14 @@ public class SpellChecker {
return false;
}
+ private boolean spellWithTrailingDots(String word) {
+ int length = word.length() - 1;
+ while (length > 0 && word.charAt(length - 1) == '.') {
+ length--;
+ }
+ return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1));
+ }
+
private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
char[] caseVariant = wordChars;
if (wordCase == WordCase.UPPER) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 75b7639..30ceb58 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -27,6 +27,16 @@ import org.junit.Test;
public class SpellCheckerTest extends StemmerTestBase {
@Test
+ public void base() throws Exception {
+ doTest("base");
+ }
+
+ @Test
+ public void keepcase() throws Exception {
+ doTest("keepcase");
+ }
+
+ @Test
public void allcaps() throws Exception {
doTest("allcaps");
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
index 62f3381..63f9cc1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
@@ -40,5 +40,11 @@ public class TestKeepCase extends StemmerTestBase {
assertStemsTo("test", "test");
assertStemsTo("Test");
assertStemsTo("TEST");
+
+ assertStemsTo("baz.", "baz.");
+ assertStemsTo("Baz.");
+
+ assertStemsTo("Quux.", "Quux.");
+ assertStemsTo("QUUX.");
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.aff
new file mode 100644
index 0000000..632f04b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.aff
@@ -0,0 +1,192 @@
+# OpenOffice.org's en_US.aff file
+
+SET ISO8859-1
+TRY esianrtolcdugmphbyfvkwz'
+
+WORDCHARS .'
+
+PFX A Y 1
+PFX A 0 re .
+
+PFX I Y 1
+PFX I 0 in .
+
+PFX U Y 1
+PFX U 0 un .
+
+PFX C Y 1
+PFX C 0 de .
+
+PFX E Y 1
+PFX E 0 dis .
+
+PFX F Y 1
+PFX F 0 con .
+
+PFX K Y 1
+PFX K 0 pro .
+
+SFX V N 2
+SFX V e ive e
+SFX V 0 ive [^e]
+
+SFX N Y 3
+SFX N e ion e
+SFX N y ication y
+SFX N 0 en [^ey]
+
+SFX X Y 3
+SFX X e ions e
+SFX X y ications y
+SFX X 0 ens [^ey]
+
+SFX H N 2
+SFX H y ieth y
+SFX H 0 th [^y]
+
+SFX Y Y 1
+SFX Y 0 ly .
+
+SFX G Y 2
+SFX G e ing e
+SFX G 0 ing [^e]
+
+SFX J Y 2
+SFX J e ings e
+SFX J 0 ings [^e]
+
+SFX D Y 4
+SFX D 0 d e
+SFX D y ied [^aeiou]y
+SFX D 0 ed [^ey]
+SFX D 0 ed [aeiou]y
+
+SFX T N 4
+SFX T 0 st e
+SFX T y iest [^aeiou]y
+SFX T 0 est [aeiou]y
+SFX T 0 est [^ey]
+
+SFX R Y 4
+SFX R 0 r e
+SFX R y ier [^aeiou]y
+SFX R 0 er [aeiou]y
+SFX R 0 er [^ey]
+
+SFX Z Y 4
+SFX Z 0 rs e
+SFX Z y iers [^aeiou]y
+SFX Z 0 ers [aeiou]y
+SFX Z 0 ers [^ey]
+
+SFX S Y 4
+SFX S y ies [^aeiou]y
+SFX S 0 s [aeiou]y
+SFX S 0 es [sxzh]
+SFX S 0 s [^sxzhy]
+
+SFX P Y 3
+SFX P y iness [^aeiou]y
+SFX P 0 ness [aeiou]y
+SFX P 0 ness [^y]
+
+SFX M Y 1
+SFX M 0 's .
+
+SFX B Y 3
+SFX B 0 able [^aeiou]
+SFX B 0 able ee
+SFX B e able [^aeiou]e
+
+SFX L Y 1
+SFX L 0 ment .
+
+REP 88
+REP a ei
+REP ei a
+REP a ey
+REP ey a
+REP ai ie
+REP ie ai
+REP are air
+REP are ear
+REP are eir
+REP air are
+REP air ere
+REP ere air
+REP ere ear
+REP ere eir
+REP ear are
+REP ear air
+REP ear ere
+REP eir are
+REP eir ere
+REP ch te
+REP te ch
+REP ch ti
+REP ti ch
+REP ch tu
+REP tu ch
+REP ch s
+REP s ch
+REP ch k
+REP k ch
+REP f ph
+REP ph f
+REP gh f
+REP f gh
+REP i igh
+REP igh i
+REP i uy
+REP uy i
+REP i ee
+REP ee i
+REP j di
+REP di j
+REP j gg
+REP gg j
+REP j ge
+REP ge j
+REP s ti
+REP ti s
+REP s ci
+REP ci s
+REP k cc
+REP cc k
+REP k qu
+REP qu k
+REP kw qu
+REP o eau
+REP eau o
+REP o ew
+REP ew o
+REP oo ew
+REP ew oo
+REP ew ui
+REP ui ew
+REP oo ui
+REP ui oo
+REP ew u
+REP u ew
+REP oo u
+REP u oo
+REP u oe
+REP oe u
+REP u ieu
+REP ieu u
+REP ue ew
+REP ew ue
+REP uff ough
+REP oo ieu
+REP ieu oo
+REP ier ear
+REP ear ier
+REP ear air
+REP air ear
+REP w qu
+REP qu w
+REP z ss
+REP ss z
+REP shun tion
+REP shun sion
+REP shun cion
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.dic
new file mode 100644
index 0000000..5d9b8a2
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.dic
@@ -0,0 +1,29 @@
+28
+created/U
+create/XKVNGADS
+imply/GNSDX
+natural/PUY
+like/USPBY
+convey/BDGS
+look/GZRDS
+text
+hello
+said
+sawyer
+NASA
+rotten
+day
+tomorrow
+seven
+FAQ/SM
+can't
+doesn't
+etc
+won't
+lip
+text
+horrifying
+speech
+suggest
+uncreate/V
+Hunspell
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.good
new file mode 100644
index 0000000..2e5439a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.good
@@ -0,0 +1,28 @@
+created
+uncreate
+uncreated
+imply
+implied
+unnatural
+conveyed
+sawyer
+NASA
+FAQs
+can't
+doesn't
+won't
+Created
+Hello
+HELLO
+NASA
+etc.
+etc
+HELLO
+lip.
+text.
+NASA.
+Text.
+TEXT.
+Hunspell.
+HUNSPELL.
+HUNSPELL...
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.wrong
new file mode 100644
index 0000000..88a6e25
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.wrong
@@ -0,0 +1,11 @@
+loooked
+texxt
+hlelo
+seid
+rottenday
+tomorow
+seeeven
+Nasa
+horrorfying
+peech
+sugesst
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
index 8678796..48d88a7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
@@ -1,4 +1,8 @@
-3
+7
drink/X
walk/XZ
test/Z
+foo/Z
+Bar/Z
+baz./Z
+Quux./Z
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
new file mode 100644
index 0000000..e6ff181
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
@@ -0,0 +1,4 @@
+foo
+Bar
+baz.
+Quux.
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
new file mode 100644
index 0000000..3b79142
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
@@ -0,0 +1,8 @@
+Foo
+FOO
+BAR
+bar
+Baz.
+BAZ.
+quux.
+QUUX.