You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/18 08:30:26 UTC
[lucene-solr] branch master updated: LUCENE-9769: Hunspell:
KEEPCASE should take precedence over affixed forms (#2374)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 5e834b3 LUCENE-9769: Hunspell: KEEPCASE should take precedence over affixed forms (#2374)
5e834b3 is described below
commit 5e834b39eb23bdc4c2b6b80a782efb6ff9d397dd
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Thu Feb 18 09:30:09 2021 +0100
LUCENE-9769: Hunspell: KEEPCASE should take precedence over affixed forms (#2374)
and disregard KEEPCASE in Stemmer to make it more consistent with "hunspell -s"
---
.../apache/lucene/analysis/hunspell/Hunspell.java | 27 +++++++++-
.../apache/lucene/analysis/hunspell/Stemmer.java | 63 ++--------------------
.../lucene/analysis/hunspell/TestAllCaps.java | 7 +--
.../lucene/analysis/hunspell/TestKeepCase.java | 19 ++++---
.../lucene/analysis/hunspell/forbiddenword.dic | 6 ++-
.../lucene/analysis/hunspell/forbiddenword.good | 2 +-
.../lucene/analysis/hunspell/forbiddenword.wrong | 2 +
.../apache/lucene/analysis/hunspell/keepcase.dic | 4 +-
.../apache/lucene/analysis/hunspell/keepcase.good | 4 ++
.../apache/lucene/analysis/hunspell/keepcase.sug | 1 +
.../apache/lucene/analysis/hunspell/keepcase.wrong | 1 +
11 files changed, 62 insertions(+), 74 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index 082076a..ea486da 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -165,9 +165,11 @@ public class Hunspell {
wordChars,
offset,
length,
- originalCase,
context,
(stem, formID, morphDataId) -> {
+ if (!acceptCase(originalCase, formID, stem)) {
+ return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
+ }
if (acceptsStem(formID)) {
result[0] = new Root<>(stem, formID);
}
@@ -176,6 +178,29 @@ public class Hunspell {
return result[0];
}
+ private boolean acceptCase(WordCase originalCase, int entryId, CharsRef root) {
+ boolean keepCase = dictionary.hasFlag(entryId, dictionary.keepcase);
+ if (originalCase != null) {
+ if (keepCase
+ && dictionary.checkSharpS
+ && originalCase == WordCase.TITLE
+ && containsSharpS(root.chars, root.offset, root.length)) {
+ return true;
+ }
+ return !keepCase;
+ }
+ return !dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG);
+ }
+
+ private boolean containsSharpS(char[] word, int offset, int length) {
+ for (int i = 0; i < length; i++) {
+ if (word[i + offset] == 'ß') {
+ return true;
+ }
+ }
+ return false;
+ }
+
boolean acceptsStem(int formID) {
return true;
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 012b764..b84050e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -101,8 +101,7 @@ final class Stemmer {
list.add(newStem(stem, stemException));
return true;
};
-
- if (!doStem(word, 0, length, null, WordContext.SIMPLE_WORD, processor)) {
+ if (!doStem(word, 0, length, WordContext.SIMPLE_WORD, processor)) {
return list;
}
@@ -110,7 +109,7 @@ final class Stemmer {
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
CaseVariationProcessor variationProcessor =
(variant, varLength, originalCase) ->
- doStem(variant, 0, varLength, originalCase, WordContext.SIMPLE_WORD, processor);
+ doStem(variant, 0, varLength, WordContext.SIMPLE_WORD, processor);
varyCase(word, length, wordCase, variationProcessor);
}
return list;
@@ -239,19 +238,11 @@ final class Stemmer {
}
boolean doStem(
- char[] word,
- int offset,
- int length,
- WordCase originalCase,
- WordContext context,
- RootProcessor processor) {
+ char[] word, int offset, int length, WordContext context, RootProcessor processor) {
IntsRef forms = dictionary.lookupWord(word, offset, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
int entryId = forms.ints[forms.offset + i];
- if (!acceptCase(originalCase, entryId, word, offset, length)) {
- continue;
- }
// we can't add this form, it's a pseudostem requiring an affix
if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
continue;
@@ -277,47 +268,12 @@ final class Stemmer {
}
try {
return stem(
- word,
- offset,
- length,
- context,
- -1,
- Dictionary.FLAG_UNSET,
- -1,
- 0,
- true,
- false,
- originalCase,
- processor);
+ word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
- private boolean acceptCase(
- WordCase originalCase, int entryId, char[] word, int offset, int length) {
- boolean keepCase = dictionary.hasFlag(entryId, dictionary.keepcase);
- if (originalCase != null) {
- if (keepCase
- && dictionary.checkSharpS
- && originalCase == WordCase.TITLE
- && containsSharpS(word, offset, length)) {
- return true;
- }
- return !keepCase;
- }
- return !dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG);
- }
-
- private boolean containsSharpS(char[] word, int offset, int length) {
- for (int i = 0; i < length; i++) {
- if (word[i + offset] == 'ß') {
- return true;
- }
- }
- return false;
- }
-
/**
* Find the unique stem(s) of the provided word
*
@@ -411,8 +367,6 @@ final class Stemmer {
* @param previousWasPrefix true if the previous removal was a prefix: if we are removing a
* suffix, and it has no continuation requirements, it's ok. but two prefixes
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
- * @param originalCase if non-null, represents original word case to disallow case variations of
- * word with KEEPCASE flags
* @return whether the processing should be continued
*/
private boolean stem(
@@ -426,7 +380,6 @@ final class Stemmer {
int recursionDepth,
boolean doPrefix,
boolean previousWasPrefix,
- WordCase originalCase,
RootProcessor processor)
throws IOException {
if (doPrefix && dictionary.prefixes != null) {
@@ -473,7 +426,6 @@ final class Stemmer {
-1,
recursionDepth,
true,
- originalCase,
processor)) {
return false;
}
@@ -527,7 +479,6 @@ final class Stemmer {
prefixId,
recursionDepth,
false,
- originalCase,
processor)) {
return false;
}
@@ -667,7 +618,6 @@ final class Stemmer {
int prefixId,
int recursionDepth,
boolean prefix,
- WordCase originalCase,
RootProcessor processor)
throws IOException {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
@@ -690,10 +640,6 @@ final class Stemmer {
}
}
- // we are looking for a case variant, but this word does not allow it
- if (!acceptCase(originalCase, entryId, strippedWord, offset, length)) {
- continue;
- }
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
continue;
}
@@ -752,7 +698,6 @@ final class Stemmer {
recursionDepth + 1,
doPrefix,
prefix,
- originalCase,
processor);
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
index 33f132f..84867ae 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
@@ -37,8 +37,9 @@ public class TestAllCaps extends StemmerTestBase {
}
public void testWrong() {
- assertStemsTo("Openoffice.org");
- assertStemsTo("Unicef");
- assertStemsTo("Unicef's");
+ // "hunspell -s" still stems them
+ assertStemsTo("Openoffice.org", "Openoffice.org");
+ assertStemsTo("Unicef", "Unicef");
+ assertStemsTo("Unicef's", "Unicef");
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
index 63f9cc1..f67133a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
@@ -34,17 +34,22 @@ public class TestKeepCase extends StemmerTestBase {
assertStemsTo("DRINKS", "drink");
assertStemsTo("walk", "walk");
assertStemsTo("walks", "walk");
- assertStemsTo("Walk");
- assertStemsTo("Walks");
- assertStemsTo("WALKS");
+ assertStemsTo("Walk", "walk");
+ assertStemsTo("Walks", "walk");
+ assertStemsTo("WALKS", "walk");
assertStemsTo("test", "test");
- assertStemsTo("Test");
- assertStemsTo("TEST");
+ assertStemsTo("Test", "test");
+ assertStemsTo("TEST", "test");
+ // dotted stems differ from "hunspell -s", but in a controversial way,
+ // and most importantly stem presence is the same
assertStemsTo("baz.", "baz.");
- assertStemsTo("Baz.");
+ assertStemsTo("Baz.", "baz.");
assertStemsTo("Quux.", "Quux.");
- assertStemsTo("QUUX.");
+ assertStemsTo("QUUX.", "Quux.");
+
+ assertStemsTo("Ways", "way", "ways");
+ assertStemsTo("WAYS", "way", "ways");
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
index cb63592..b012cc8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
@@ -1,4 +1,4 @@
-10
+11
foo/S
foo/YX
bar/YS
@@ -8,4 +8,6 @@ kg
Kg/X
KG/X
cm
-Cm/X
\ No newline at end of file
+Cm/X
+SIPS/X
+Sip/A
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good
index 7bd112e..1f73e52 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good
@@ -1,3 +1,3 @@
foo
bar
-
+Sips
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong
index 5752c1e..3279626 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong
@@ -2,3 +2,5 @@ bars
foos
foobar
barfoo
+SIPS
+sips
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
index 48d88a7..4d497dc 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
@@ -1,4 +1,4 @@
-7
+9
drink/X
walk/XZ
test/Z
@@ -6,3 +6,5 @@ foo/Z
Bar/Z
baz./Z
Quux./Z
+way/X
+ways/Z
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
index e6ff181..795112e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
@@ -2,3 +2,7 @@ foo
Bar
baz.
Quux.
+way
+Way
+WAY
+ways
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug
index 69e80dd..e4cb7fa 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug
@@ -6,3 +6,4 @@ baz.
baz.
Quux.
Quux.
+Way
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
index 3b79142..1986f6d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
@@ -6,3 +6,4 @@ Baz.
BAZ.
quux.
QUUX.
+Ways