You are viewing a plain text version of this content. The canonical link for it was a hyperlink that is not preserved in this plain-text version.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/29 07:23:51 UTC

[lucene-solr] branch master updated: LUCENE-9700: Hunspell: support words with trailing dots (#2249)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 800f4d0  LUCENE-9700: Hunspell: support words with trailing dots (#2249)
800f4d0 is described below

commit 800f4d0919f567dd9878e9cfa9a545afbf7080d9
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Jan 29 08:23:03 2021 +0100

    LUCENE-9700: Hunspell: support words with trailing dots (#2249)
---
 .../lucene/analysis/hunspell/SpellChecker.java     |  16 ++
 .../lucene/analysis/hunspell/SpellCheckerTest.java |  10 ++
 .../lucene/analysis/hunspell/TestKeepCase.java     |   6 +
 .../org/apache/lucene/analysis/hunspell/base.aff   | 192 +++++++++++++++++++++
 .../org/apache/lucene/analysis/hunspell/base.dic   |  29 ++++
 .../org/apache/lucene/analysis/hunspell/base.good  |  28 +++
 .../org/apache/lucene/analysis/hunspell/base.wrong |  11 ++
 .../apache/lucene/analysis/hunspell/keepcase.dic   |   6 +-
 .../apache/lucene/analysis/hunspell/keepcase.good  |   4 +
 .../apache/lucene/analysis/hunspell/keepcase.wrong |   8 +
 10 files changed, 309 insertions(+), 1 deletion(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 4056db6..e32a805 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -44,6 +44,14 @@ public class SpellChecker {
       word = dictionary.cleanInput(word, new StringBuilder()).toString();
     }
 
+    if (word.endsWith(".")) {
+      return spellWithTrailingDots(word);
+    }
+
+    return spellClean(word);
+  }
+
+  private boolean spellClean(String word) {
     if (isNumber(word)) {
       return true;
     }
@@ -67,6 +75,14 @@ public class SpellChecker {
     return false;
   }
 
+  private boolean spellWithTrailingDots(String word) {
+    int length = word.length() - 1;
+    while (length > 0 && word.charAt(length - 1) == '.') {
+      length--;
+    }
+    return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1));
+  }
+
   private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
     char[] caseVariant = wordChars;
     if (wordCase == WordCase.UPPER) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 75b7639..30ceb58 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -27,6 +27,16 @@ import org.junit.Test;
 
 public class SpellCheckerTest extends StemmerTestBase {
   @Test
+  public void base() throws Exception {
+    doTest("base");
+  }
+
+  @Test
+  public void keepcase() throws Exception {
+    doTest("keepcase");
+  }
+
+  @Test
   public void allcaps() throws Exception {
     doTest("allcaps");
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
index 62f3381..63f9cc1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
@@ -40,5 +40,11 @@ public class TestKeepCase extends StemmerTestBase {
     assertStemsTo("test", "test");
     assertStemsTo("Test");
     assertStemsTo("TEST");
+
+    assertStemsTo("baz.", "baz.");
+    assertStemsTo("Baz.");
+
+    assertStemsTo("Quux.", "Quux.");
+    assertStemsTo("QUUX.");
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.aff
new file mode 100644
index 0000000..632f04b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.aff
@@ -0,0 +1,192 @@
+# OpenOffice.org's en_US.aff file
+
+SET ISO8859-1
+TRY esianrtolcdugmphbyfvkwz'
+
+WORDCHARS .'
+
+PFX A Y 1
+PFX A   0     re         .
+
+PFX I Y 1
+PFX I   0     in         .
+
+PFX U Y 1
+PFX U   0     un         .
+
+PFX C Y 1
+PFX C   0     de          .
+
+PFX E Y 1
+PFX E   0     dis         .
+
+PFX F Y 1
+PFX F   0     con         .
+
+PFX K Y 1
+PFX K   0     pro         .
+
+SFX V N 2
+SFX V   e     ive        e
+SFX V   0     ive        [^e]
+
+SFX N Y 3
+SFX N   e     ion        e
+SFX N   y     ication    y 
+SFX N   0     en         [^ey] 
+
+SFX X Y 3
+SFX X   e     ions       e
+SFX X   y     ications   y
+SFX X   0     ens        [^ey]
+
+SFX H N 2
+SFX H   y     ieth       y
+SFX H   0     th         [^y] 
+
+SFX Y Y 1
+SFX Y   0     ly         .
+
+SFX G Y 2
+SFX G   e     ing        e
+SFX G   0     ing        [^e] 
+
+SFX J Y 2
+SFX J   e     ings       e
+SFX J   0     ings       [^e]
+
+SFX D Y 4
+SFX D   0     d          e
+SFX D   y     ied        [^aeiou]y
+SFX D   0     ed         [^ey]
+SFX D   0     ed         [aeiou]y
+
+SFX T N 4
+SFX T   0     st         e
+SFX T   y     iest       [^aeiou]y
+SFX T   0     est        [aeiou]y
+SFX T   0     est        [^ey]
+
+SFX R Y 4
+SFX R   0     r          e
+SFX R   y     ier        [^aeiou]y
+SFX R   0     er         [aeiou]y
+SFX R   0     er         [^ey]
+
+SFX Z Y 4
+SFX Z   0     rs         e
+SFX Z   y     iers       [^aeiou]y
+SFX Z   0     ers        [aeiou]y
+SFX Z   0     ers        [^ey]
+
+SFX S Y 4
+SFX S   y     ies        [^aeiou]y
+SFX S   0     s          [aeiou]y
+SFX S   0     es         [sxzh]
+SFX S   0     s          [^sxzhy]
+
+SFX P Y 3
+SFX P   y     iness      [^aeiou]y
+SFX P   0     ness       [aeiou]y
+SFX P   0     ness       [^y]
+
+SFX M Y 1
+SFX M   0     's         .
+
+SFX B Y 3
+SFX B   0     able       [^aeiou]
+SFX B   0     able       ee
+SFX B   e     able       [^aeiou]e
+
+SFX L Y 1
+SFX L   0     ment       .
+
+REP 88
+REP a ei
+REP ei a
+REP a ey
+REP ey a
+REP ai ie
+REP ie ai
+REP are air
+REP are ear
+REP are eir
+REP air are
+REP air ere
+REP ere air
+REP ere ear
+REP ere eir
+REP ear are
+REP ear air
+REP ear ere
+REP eir are
+REP eir ere
+REP ch te
+REP te ch
+REP ch ti
+REP ti ch
+REP ch tu
+REP tu ch
+REP ch s
+REP s ch
+REP ch k
+REP k ch
+REP f ph
+REP ph f
+REP gh f
+REP f gh
+REP i igh
+REP igh i
+REP i uy
+REP uy i
+REP i ee
+REP ee i
+REP j di
+REP di j
+REP j gg
+REP gg j
+REP j ge
+REP ge j
+REP s ti
+REP ti s
+REP s ci
+REP ci s
+REP k cc
+REP cc k
+REP k qu
+REP qu k
+REP kw qu
+REP o eau
+REP eau o
+REP o ew
+REP ew o
+REP oo ew
+REP ew oo
+REP ew ui
+REP ui ew
+REP oo ui
+REP ui oo
+REP ew u
+REP u ew
+REP oo u
+REP u oo
+REP u oe
+REP oe u
+REP u ieu
+REP ieu u
+REP ue ew
+REP ew ue
+REP uff ough
+REP oo ieu
+REP ieu oo
+REP ier ear
+REP ear ier
+REP ear air
+REP air ear
+REP w qu
+REP qu w
+REP z ss
+REP ss z
+REP shun tion
+REP shun sion
+REP shun cion
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.dic
new file mode 100644
index 0000000..5d9b8a2
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.dic
@@ -0,0 +1,29 @@
+28
+created/U
+create/XKVNGADS
+imply/GNSDX
+natural/PUY
+like/USPBY
+convey/BDGS
+look/GZRDS
+text
+hello
+said
+sawyer
+NASA
+rotten
+day
+tomorrow
+seven
+FAQ/SM
+can't
+doesn't
+etc
+won't
+lip
+text
+horrifying
+speech
+suggest
+uncreate/V
+Hunspell
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.good
new file mode 100644
index 0000000..2e5439a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.good
@@ -0,0 +1,28 @@
+created
+uncreate
+uncreated
+imply
+implied
+unnatural
+conveyed
+sawyer
+NASA
+FAQs
+can't
+doesn't
+won't
+Created
+Hello
+HELLO
+NASA
+etc.
+etc
+HELLO
+lip.
+text.
+NASA.
+Text.
+TEXT.
+Hunspell.
+HUNSPELL.
+HUNSPELL...
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.wrong
new file mode 100644
index 0000000..88a6e25
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/base.wrong
@@ -0,0 +1,11 @@
+loooked
+texxt
+hlelo
+seid
+rottenday
+tomorow
+seeeven
+Nasa
+horrorfying
+peech
+sugesst
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
index 8678796..48d88a7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
@@ -1,4 +1,8 @@
-3
+7
 drink/X
 walk/XZ
 test/Z
+foo/Z
+Bar/Z
+baz./Z
+Quux./Z
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
new file mode 100644
index 0000000..e6ff181
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
@@ -0,0 +1,4 @@
+foo
+Bar
+baz.
+Quux.
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
new file mode 100644
index 0000000..3b79142
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
@@ -0,0 +1,8 @@
+Foo
+FOO
+BAR
+bar
+Baz.
+BAZ.
+quux.
+QUUX.