You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/10 08:23:31 UTC

[lucene-solr] branch master updated: LUCENE-9753: Hunspell: disallow compounds with parts present in dictionary, space-separated (#2335)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 6f52530  LUCENE-9753: Hunspell: disallow compounds with parts present in dictionary, space-separated (#2335)
6f52530 is described below

commit 6f525302dd683fcbbab78c07c5a104b6101c5f4c
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Wed Feb 10 09:23:15 2021 +0100

    LUCENE-9753: Hunspell: disallow compounds with parts present in dictionary, space-separated (#2335)
---
 .../java/org/apache/lucene/analysis/hunspell/SpellChecker.java    | 8 ++++++--
 .../org/apache/lucene/analysis/hunspell/SpellCheckerTest.java     | 4 ++++
 .../src/test/org/apache/lucene/analysis/hunspell/wordpair.aff     | 4 ++++
 .../src/test/org/apache/lucene/analysis/hunspell/wordpair.dic     | 4 ++++
 .../src/test/org/apache/lucene/analysis/hunspell/wordpair.good    | 3 +++
 .../src/test/org/apache/lucene/analysis/hunspell/wordpair.wrong   | 1 +
 6 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 482697f..790eca8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -261,12 +261,16 @@ public class SpellChecker {
         return false;
       }
 
-      //noinspection RedundantIfStatement
       if (dictionary.checkCompoundRep
           && isMisspelledSimpleWord(length + nextPartLength, originalCase)) {
         return false;
       }
-      return true;
+
+      String spaceSeparated =
+          new String(tail.chars, tail.offset, length)
+              + " "
+              + new String(tail.chars, tail.offset + length, nextPartLength);
+      return !checkWord(spaceSeparated);
     }
 
     private boolean isMisspelledSimpleWord(int length, WordCase originalCase) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 441e5d8..b71b6e3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -132,6 +132,10 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("checkcompoundrep");
   }
 
+  public void testDisallowCompoundsWhenDictionaryContainsSeparatedWordPair() throws Exception {
+    doTest("wordpair");
+  }
+
   public void testCompoundrule() throws Exception {
     doTest("compoundrule");
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.aff
new file mode 100644
index 0000000..e788b17
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.aff
@@ -0,0 +1,4 @@
+# when the dictionary contains a word pair separated by a space,
+# the same pair written without the space must not be
+# accepted as a compound word
+COMPOUNDFLAG Y
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.dic
new file mode 100644
index 0000000..96fc77f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.dic
@@ -0,0 +1,4 @@
+3
+word/Y
+compound/Y
+compound word
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.good
new file mode 100644
index 0000000..d868fce
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.good
@@ -0,0 +1,3 @@
+word
+compound
+wordcompound
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.wrong
new file mode 100644
index 0000000..04ca38b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.wrong
@@ -0,0 +1 @@
+compoundword