You are viewing a plain text version of this content. The canonical link for it is not included in this plain text copy.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/29 07:24:38 UTC

[lucene-solr] branch master updated: LUCENE-9706: Hunspell: support NEEDAFFIX flag on affixes (#2262)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 4ba78f2  LUCENE-9706: Hunspell: support NEEDAFFIX flag on affixes (#2262)
4ba78f2 is described below

commit 4ba78f2ab25442c026623b62dc27c60347fd99d6
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Jan 29 08:24:23 2021 +0100

    LUCENE-9706: Hunspell: support NEEDAFFIX flag on affixes (#2262)
---
 .../org/apache/lucene/analysis/hunspell/Dictionary.java |  2 +-
 .../org/apache/lucene/analysis/hunspell/Stemmer.java    | 17 +++++++++++++++--
 .../lucene/analysis/hunspell/SpellCheckerTest.java      |  5 +++++
 .../org/apache/lucene/analysis/hunspell/needaffix5.aff  | 13 +++++++++++++
 .../org/apache/lucene/analysis/hunspell/needaffix5.dic  |  2 ++
 .../org/apache/lucene/analysis/hunspell/needaffix5.good | 11 +++++++++++
 .../apache/lucene/analysis/hunspell/needaffix5.wrong    |  3 +++
 7 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index d9473a9..6d7638b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -76,7 +76,7 @@ public class Dictionary {
 
   static final char[] NOFLAGS = new char[0];
 
-  static final int FLAG_UNSET = 0;
+  static final char FLAG_UNSET = (char) 0;
   private static final int DEFAULT_FLAGS = 65510;
   private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 6b6fb80..572473c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -191,7 +191,7 @@ final class Stemmer {
               length,
               context,
               -1,
-              (char) 0,
+              Dictionary.FLAG_UNSET,
               -1,
               0,
               true,
@@ -361,6 +361,7 @@ final class Stemmer {
                     pureAffix ? length - i : strippedWord.length,
                     context,
                     prefix,
+                    previous,
                     -1,
                     recursionDepth,
                     true,
@@ -413,6 +414,7 @@ final class Stemmer {
                     pureAffix ? i : strippedWord.length,
                     context,
                     suffix,
+                    previous,
                     prefixId,
                     recursionDepth,
                     false,
@@ -543,6 +545,7 @@ final class Stemmer {
       int length,
       WordContext context,
       int affix,
+      int previousAffix,
       int prefixId,
       int recursionDepth,
       boolean prefix,
@@ -553,7 +556,8 @@ final class Stemmer {
 
     List<CharsRef> stems = new ArrayList<>();
 
-    IntsRef forms = dictionary.lookupWord(strippedWord, offset, length);
+    boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix);
+    IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
     if (forms != null) {
       for (int i = 0; i < forms.length; i += formStep) {
         char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
@@ -651,6 +655,15 @@ final class Stemmer {
     return stems;
   }
 
+  private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix) {
+    if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
+      return !isSuffix
+          || previousAffix < 0
+          || isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
+    }
+    return false;
+  }
+
   private boolean isFlagAppendedByAffix(int affixId, char flag) {
     if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false;
     int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index 30ceb58..dbfbbec 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -46,6 +46,11 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("i53643");
   }
 
+  @Test
+  public void needAffixOnAffixes() throws Exception {
+    doTest("needaffix5");
+  }
+
   public void testBreak() throws Exception {
     doTest("break");
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.aff
new file mode 100644
index 0000000..6399a3e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.aff
@@ -0,0 +1,13 @@
+# on affixes
+NEEDAFFIX X
+
+SFX A Y 2
+SFX A 0 suf/B .
+SFX A 0 pseudosuf/XB .
+
+SFX B Y 1
+SFX B 0 bar .
+
+PFX C Y 2
+PFX C 0 pre .
+PFX C 0 pseudopre/X .
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.dic
new file mode 100644
index 0000000..83131e2
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.dic
@@ -0,0 +1,2 @@
+1
+foo/AC
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.good
new file mode 100644
index 0000000..d1b86bf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.good
@@ -0,0 +1,11 @@
+foo
+prefoo
+foosuf
+prefoosuf
+foosufbar
+prefoosufbar
+pseudoprefoosuf
+pseudoprefoosufbar
+pseudoprefoopseudosufbar
+prefoopseudosuf
+prefoopseudosufbar
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.wrong
new file mode 100644
index 0000000..fdd1797
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.wrong
@@ -0,0 +1,3 @@
+pseudoprefoo
+foopseudosuf
+pseudoprefoopseudosuf