You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/10 08:23:31 UTC
[lucene-solr] branch master updated: LUCENE-9753: Hunspell:
disallow compounds with parts present in dictionary, space-separated (#2335)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 6f52530 LUCENE-9753: Hunspell: disallow compounds with parts present in dictionary, space-separated (#2335)
6f52530 is described below
commit 6f525302dd683fcbbab78c07c5a104b6101c5f4c
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Wed Feb 10 09:23:15 2021 +0100
LUCENE-9753: Hunspell: disallow compounds with parts present in dictionary, space-separated (#2335)
---
.../java/org/apache/lucene/analysis/hunspell/SpellChecker.java | 8 ++++++--
.../org/apache/lucene/analysis/hunspell/SpellCheckerTest.java | 4 ++++
.../src/test/org/apache/lucene/analysis/hunspell/wordpair.aff | 4 ++++
.../src/test/org/apache/lucene/analysis/hunspell/wordpair.dic | 4 ++++
.../src/test/org/apache/lucene/analysis/hunspell/wordpair.good | 3 +++
.../src/test/org/apache/lucene/analysis/hunspell/wordpair.wrong | 1 +
6 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index 482697f..790eca8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -261,12 +261,16 @@ public class SpellChecker {
return false;
}
- //noinspection RedundantIfStatement
if (dictionary.checkCompoundRep
&& isMisspelledSimpleWord(length + nextPartLength, originalCase)) {
return false;
}
- return true;
+
+ String spaceSeparated =
+ new String(tail.chars, tail.offset, length)
+ + " "
+ + new String(tail.chars, tail.offset + length, nextPartLength);
+ return !checkWord(spaceSeparated);
}
private boolean isMisspelledSimpleWord(int length, WordCase originalCase) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 441e5d8..b71b6e3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -132,6 +132,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("checkcompoundrep");
}
+ public void testDisallowCompoundsWhenDictionaryContainsSeparatedWordPair() throws Exception {
+ doTest("wordpair");
+ }
+
public void testCompoundrule() throws Exception {
doTest("compoundrule");
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.aff
new file mode 100644
index 0000000..e788b17
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.aff
@@ -0,0 +1,4 @@
+# a space-separated word pair listed in the dictionary
+# prevents the same pair, written without the space,
+# from being accepted as a compound word
+COMPOUNDFLAG Y
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.dic
new file mode 100644
index 0000000..96fc77f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.dic
@@ -0,0 +1,4 @@
+3
+word/Y
+compound/Y
+compound word
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.good
new file mode 100644
index 0000000..d868fce
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.good
@@ -0,0 +1,3 @@
+word
+compound
+wordcompound
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.wrong
new file mode 100644
index 0000000..04ca38b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/wordpair.wrong
@@ -0,0 +1 @@
+compoundword