You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/18 08:30:26 UTC

[lucene-solr] branch master updated: LUCENE-9769: Hunspell: KEEPCASE should take precedence over affixed forms (#2374)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 5e834b3  LUCENE-9769: Hunspell: KEEPCASE should take precedence over affixed forms (#2374)
5e834b3 is described below

commit 5e834b39eb23bdc4c2b6b80a782efb6ff9d397dd
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Thu Feb 18 09:30:09 2021 +0100

    LUCENE-9769: Hunspell: KEEPCASE should take precedence over affixed forms (#2374)
    
    and disregard KEEPCASE in Stemmer to make it more consistent with "hunspell -s"
---
 .../apache/lucene/analysis/hunspell/Hunspell.java  | 27 +++++++++-
 .../apache/lucene/analysis/hunspell/Stemmer.java   | 63 ++--------------------
 .../lucene/analysis/hunspell/TestAllCaps.java      |  7 +--
 .../lucene/analysis/hunspell/TestKeepCase.java     | 19 ++++---
 .../lucene/analysis/hunspell/forbiddenword.dic     |  6 ++-
 .../lucene/analysis/hunspell/forbiddenword.good    |  2 +-
 .../lucene/analysis/hunspell/forbiddenword.wrong   |  2 +
 .../apache/lucene/analysis/hunspell/keepcase.dic   |  4 +-
 .../apache/lucene/analysis/hunspell/keepcase.good  |  4 ++
 .../apache/lucene/analysis/hunspell/keepcase.sug   |  1 +
 .../apache/lucene/analysis/hunspell/keepcase.wrong |  1 +
 11 files changed, 62 insertions(+), 74 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index 082076a..ea486da 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -165,9 +165,11 @@ public class Hunspell {
         wordChars,
         offset,
         length,
-        originalCase,
         context,
         (stem, formID, morphDataId) -> {
+          if (!acceptCase(originalCase, formID, stem)) {
+            return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
+          }
           if (acceptsStem(formID)) {
             result[0] = new Root<>(stem, formID);
           }
@@ -176,6 +178,29 @@ public class Hunspell {
     return result[0];
   }
 
+  private boolean acceptCase(WordCase originalCase, int entryId, CharsRef root) {
+    boolean keepCase = dictionary.hasFlag(entryId, dictionary.keepcase);
+    if (originalCase != null) {
+      if (keepCase
+          && dictionary.checkSharpS
+          && originalCase == WordCase.TITLE
+          && containsSharpS(root.chars, root.offset, root.length)) {
+        return true;
+      }
+      return !keepCase;
+    }
+    return !dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG);
+  }
+
+  private boolean containsSharpS(char[] word, int offset, int length) {
+    for (int i = 0; i < length; i++) {
+      if (word[i + offset] == 'ß') {
+        return true;
+      }
+    }
+    return false;
+  }
+
   boolean acceptsStem(int formID) {
     return true;
   }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 012b764..b84050e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -101,8 +101,7 @@ final class Stemmer {
           list.add(newStem(stem, stemException));
           return true;
         };
-
-    if (!doStem(word, 0, length, null, WordContext.SIMPLE_WORD, processor)) {
+    if (!doStem(word, 0, length, WordContext.SIMPLE_WORD, processor)) {
       return list;
     }
 
@@ -110,7 +109,7 @@ final class Stemmer {
     if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
       CaseVariationProcessor variationProcessor =
           (variant, varLength, originalCase) ->
-              doStem(variant, 0, varLength, originalCase, WordContext.SIMPLE_WORD, processor);
+              doStem(variant, 0, varLength, WordContext.SIMPLE_WORD, processor);
       varyCase(word, length, wordCase, variationProcessor);
     }
     return list;
@@ -239,19 +238,11 @@ final class Stemmer {
   }
 
   boolean doStem(
-      char[] word,
-      int offset,
-      int length,
-      WordCase originalCase,
-      WordContext context,
-      RootProcessor processor) {
+      char[] word, int offset, int length, WordContext context, RootProcessor processor) {
     IntsRef forms = dictionary.lookupWord(word, offset, length);
     if (forms != null) {
       for (int i = 0; i < forms.length; i += formStep) {
         int entryId = forms.ints[forms.offset + i];
-        if (!acceptCase(originalCase, entryId, word, offset, length)) {
-          continue;
-        }
         // we can't add this form, it's a pseudostem requiring an affix
         if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
           continue;
@@ -277,47 +268,12 @@ final class Stemmer {
     }
     try {
       return stem(
-          word,
-          offset,
-          length,
-          context,
-          -1,
-          Dictionary.FLAG_UNSET,
-          -1,
-          0,
-          true,
-          false,
-          originalCase,
-          processor);
+          word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
   }
 
-  private boolean acceptCase(
-      WordCase originalCase, int entryId, char[] word, int offset, int length) {
-    boolean keepCase = dictionary.hasFlag(entryId, dictionary.keepcase);
-    if (originalCase != null) {
-      if (keepCase
-          && dictionary.checkSharpS
-          && originalCase == WordCase.TITLE
-          && containsSharpS(word, offset, length)) {
-        return true;
-      }
-      return !keepCase;
-    }
-    return !dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG);
-  }
-
-  private boolean containsSharpS(char[] word, int offset, int length) {
-    for (int i = 0; i < length; i++) {
-      if (word[i + offset] == 'ß') {
-        return true;
-      }
-    }
-    return false;
-  }
-
   /**
    * Find the unique stem(s) of the provided word
    *
@@ -411,8 +367,6 @@ final class Stemmer {
    * @param previousWasPrefix true if the previous removal was a prefix: if we are removing a
    *     suffix, and it has no continuation requirements, it's ok. but two prefixes
    *     (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
-   * @param originalCase if non-null, represents original word case to disallow case variations of
-   *     word with KEEPCASE flags
    * @return whether the processing should be continued
    */
   private boolean stem(
@@ -426,7 +380,6 @@ final class Stemmer {
       int recursionDepth,
       boolean doPrefix,
       boolean previousWasPrefix,
-      WordCase originalCase,
       RootProcessor processor)
       throws IOException {
     if (doPrefix && dictionary.prefixes != null) {
@@ -473,7 +426,6 @@ final class Stemmer {
                 -1,
                 recursionDepth,
                 true,
-                originalCase,
                 processor)) {
               return false;
             }
@@ -527,7 +479,6 @@ final class Stemmer {
                 prefixId,
                 recursionDepth,
                 false,
-                originalCase,
                 processor)) {
               return false;
             }
@@ -667,7 +618,6 @@ final class Stemmer {
       int prefixId,
       int recursionDepth,
       boolean prefix,
-      WordCase originalCase,
       RootProcessor processor)
       throws IOException {
     char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
@@ -690,10 +640,6 @@ final class Stemmer {
             }
           }
 
-          // we are looking for a case variant, but this word does not allow it
-          if (!acceptCase(originalCase, entryId, strippedWord, offset, length)) {
-            continue;
-          }
           if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
             continue;
           }
@@ -752,7 +698,6 @@ final class Stemmer {
           recursionDepth + 1,
           doPrefix,
           prefix,
-          originalCase,
           processor);
     }
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
index 33f132f..84867ae 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java
@@ -37,8 +37,9 @@ public class TestAllCaps extends StemmerTestBase {
   }
 
   public void testWrong() {
-    assertStemsTo("Openoffice.org");
-    assertStemsTo("Unicef");
-    assertStemsTo("Unicef's");
+    // "hunspell -s" still stems them
+    assertStemsTo("Openoffice.org", "Openoffice.org");
+    assertStemsTo("Unicef", "Unicef");
+    assertStemsTo("Unicef's", "Unicef");
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
index 63f9cc1..f67133a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestKeepCase.java
@@ -34,17 +34,22 @@ public class TestKeepCase extends StemmerTestBase {
     assertStemsTo("DRINKS", "drink");
     assertStemsTo("walk", "walk");
     assertStemsTo("walks", "walk");
-    assertStemsTo("Walk");
-    assertStemsTo("Walks");
-    assertStemsTo("WALKS");
+    assertStemsTo("Walk", "walk");
+    assertStemsTo("Walks", "walk");
+    assertStemsTo("WALKS", "walk");
     assertStemsTo("test", "test");
-    assertStemsTo("Test");
-    assertStemsTo("TEST");
+    assertStemsTo("Test", "test");
+    assertStemsTo("TEST", "test");
 
+    // dotted stems differ from "hunspell -s", but in a controversial way,
+    // and most importantly stem presence is the same
     assertStemsTo("baz.", "baz.");
-    assertStemsTo("Baz.");
+    assertStemsTo("Baz.", "baz.");
 
     assertStemsTo("Quux.", "Quux.");
-    assertStemsTo("QUUX.");
+    assertStemsTo("QUUX.", "Quux.");
+
+    assertStemsTo("Ways", "way", "ways");
+    assertStemsTo("WAYS", "way", "ways");
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
index cb63592..b012cc8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
@@ -1,4 +1,4 @@
-10
+11
 foo/S
 foo/YX
 bar/YS
@@ -8,4 +8,6 @@ kg
 Kg/X
 KG/X
 cm
-Cm/X
\ No newline at end of file
+Cm/X
+SIPS/X
+Sip/A
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good
index 7bd112e..1f73e52 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good
@@ -1,3 +1,3 @@
 foo
 bar
-
+Sips
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong
index 5752c1e..3279626 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong
@@ -2,3 +2,5 @@ bars
 foos
 foobar
 barfoo
+SIPS
+sips
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
index 48d88a7..4d497dc 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.dic
@@ -1,4 +1,4 @@
-7
+9
 drink/X
 walk/XZ
 test/Z
@@ -6,3 +6,5 @@ foo/Z
 Bar/Z
 baz./Z
 Quux./Z
+way/X
+ways/Z
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
index e6ff181..795112e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.good
@@ -2,3 +2,7 @@ foo
 Bar
 baz.
 Quux.
+way
+Way
+WAY
+ways
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug
index 69e80dd..e4cb7fa 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.sug
@@ -6,3 +6,4 @@ baz.
 baz.
 Quux.
 Quux.
+Way
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
index 3b79142..1986f6d 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/keepcase.wrong
@@ -6,3 +6,4 @@ Baz.
 BAZ.
 quux.
 QUUX.
+Ways