You are viewing a plain text version of this content. The canonical link to the original was not preserved in this plain text copy.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/01/22 11:02:14 UTC

[lucene-solr] branch master updated: LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new d796813  LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)
d796813 is described below

commit d7968130c3f5d7166c10756c37b5ed644414cd1d
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Jan 22 12:01:53 2021 +0100

    LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)
---
 lucene/CHANGES.txt                                 |   4 +-
 .../lucene/analysis/hunspell/CompoundRule.java     | 105 +++++++++++++++++++++
 .../lucene/analysis/hunspell/Dictionary.java       |  81 +++++++++++++---
 .../lucene/analysis/hunspell/SpellChecker.java     |  87 ++++++++++++++++-
 .../apache/lucene/analysis/hunspell/Stemmer.java   |  12 ++-
 .../apache/lucene/analysis/hunspell/WordCase.java  |  12 +--
 .../lucene/analysis/hunspell/SpellCheckerTest.java |  32 +++++++
 .../lucene/analysis/hunspell/TestDictionary.java   |  23 +++++
 .../lucene/analysis/hunspell/compoundrule.aff      |   3 +
 .../lucene/analysis/hunspell/compoundrule.dic      |   5 +
 .../lucene/analysis/hunspell/compoundrule.good     |   2 +
 .../lucene/analysis/hunspell/compoundrule.wrong    |  39 ++++++++
 .../lucene/analysis/hunspell/compoundrule2.aff     |   3 +
 .../lucene/analysis/hunspell/compoundrule2.dic     |   5 +
 .../lucene/analysis/hunspell/compoundrule2.good    |  37 ++++++++
 .../lucene/analysis/hunspell/compoundrule2.wrong   |   8 ++
 .../lucene/analysis/hunspell/compoundrule3.aff     |   3 +
 .../lucene/analysis/hunspell/compoundrule3.dic     |   5 +
 .../lucene/analysis/hunspell/compoundrule3.good    |   7 ++
 .../lucene/analysis/hunspell/compoundrule3.wrong   |  41 ++++++++
 .../lucene/analysis/hunspell/compoundrule4.aff     |   7 ++
 .../lucene/analysis/hunspell/compoundrule4.dic     |  24 +++++
 .../lucene/analysis/hunspell/compoundrule4.good    |  31 ++++++
 .../lucene/analysis/hunspell/compoundrule4.wrong   |   5 +
 .../lucene/analysis/hunspell/compoundrule5.aff     |   7 ++
 .../lucene/analysis/hunspell/compoundrule5.dic     |  14 +++
 .../lucene/analysis/hunspell/compoundrule5.good    |   7 ++
 .../lucene/analysis/hunspell/compoundrule5.wrong   |   1 +
 .../lucene/analysis/hunspell/compoundrule6.aff     |   4 +
 .../lucene/analysis/hunspell/compoundrule6.dic     |   5 +
 .../lucene/analysis/hunspell/compoundrule6.good    |   4 +
 .../lucene/analysis/hunspell/compoundrule6.wrong   |   4 +
 .../lucene/analysis/hunspell/compoundrule7.aff     |   8 ++
 .../lucene/analysis/hunspell/compoundrule7.dic     |  24 +++++
 .../lucene/analysis/hunspell/compoundrule7.good    |  29 ++++++
 .../lucene/analysis/hunspell/compoundrule7.wrong   |   5 +
 .../lucene/analysis/hunspell/compoundrule8.aff     |   8 ++
 .../lucene/analysis/hunspell/compoundrule8.dic     |  24 +++++
 .../lucene/analysis/hunspell/compoundrule8.good    |  29 ++++++
 .../lucene/analysis/hunspell/compoundrule8.wrong   |   5 +
 40 files changed, 730 insertions(+), 29 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fce70e9..f99553b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -86,8 +86,8 @@ API Changes
 
 Improvements
 
-* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
-  BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
+* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
+  BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
 
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
   (Dawid Weiss)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java
new file mode 100644
index 0000000..0f89de8
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.List;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
+class CompoundRule {
+  private final char[] data;
+  private final Dictionary dictionary;
+
+  CompoundRule(String rule, Dictionary dictionary) {
+    this.dictionary = dictionary;
+    StringBuilder parsedFlags = new StringBuilder();
+    int pos = 0;
+    while (pos < rule.length()) {
+      int lParen = rule.indexOf("(", pos);
+      if (lParen < 0) {
+        parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos)));
+        break;
+      }
+
+      parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos, lParen)));
+      int rParen = rule.indexOf(')', lParen + 1);
+      if (rParen < 0) {
+        throw new IllegalArgumentException("Unmatched parentheses: " + rule);
+      }
+
+      parsedFlags.append(
+          dictionary.flagParsingStrategy.parseFlags(rule.substring(lParen + 1, rParen)));
+      pos = rParen + 1;
+      if (pos < rule.length() && (rule.charAt(pos) == '?' || rule.charAt(pos) == '*')) {
+        parsedFlags.append(rule.charAt(pos++));
+      }
+    }
+    data = parsedFlags.toString().toCharArray();
+  }
+
+  boolean mayMatch(List<IntsRef> words, BytesRef scratch) {
+    return match(words, 0, 0, scratch, false);
+  }
+
+  boolean fullyMatches(List<IntsRef> words, BytesRef scratch) {
+    return match(words, 0, 0, scratch, true);
+  }
+
+  private boolean match(
+      List<IntsRef> words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) {
+    if (patternIndex >= data.length) {
+      return wordIndex >= words.size();
+    }
+    if (wordIndex >= words.size() && !fully) {
+      return true;
+    }
+
+    char flag = data[patternIndex];
+    if (patternIndex < data.length - 1 && data[patternIndex + 1] == '*') {
+      int startWI = wordIndex;
+      while (wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch)) {
+        wordIndex++;
+      }
+
+      while (wordIndex >= startWI) {
+        if (match(words, patternIndex + 2, wordIndex, scratch, fully)) {
+          return true;
+        }
+
+        wordIndex--;
+      }
+      return false;
+    }
+
+    boolean currentWordMatches =
+        wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch);
+
+    if (patternIndex < data.length - 1 && data[patternIndex + 1] == '?') {
+      if (currentWordMatches && match(words, patternIndex + 2, wordIndex + 1, scratch, fully)) {
+        return true;
+      }
+      return match(words, patternIndex + 2, wordIndex, scratch, fully);
+    }
+
+    return currentWordMatches && match(words, patternIndex + 1, wordIndex + 1, scratch, fully);
+  }
+
+  @Override
+  public String toString() {
+    return new String(data);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 19cfaa3..2c620a2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -92,6 +92,8 @@ public class Dictionary {
   private static final String LANG_KEY = "LANG";
   private static final String BREAK_KEY = "BREAK";
   private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
+  private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
+  private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
   private static final String KEEPCASE_KEY = "KEEPCASE";
   private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
   private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
@@ -136,7 +138,7 @@ public class Dictionary {
   static final int AFFIX_APPEND = 3;
 
   // Default flag parsing strategy
-  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
+  FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
 
   // AF entries
   private String[] aliases;
@@ -163,6 +165,8 @@ public class Dictionary {
   int needaffix = -1; // needaffix flag, or -1 if one is not defined
   int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
   int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
+  int compoundMin = 3;
+  List<CompoundRule> compoundRules; // nullable
 
   // ignored characters (dictionary, affix, inputs)
   private char[] ignore;
@@ -419,6 +423,18 @@ public class Dictionary {
           throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
         }
         forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
+      } else if (line.startsWith(COMPOUNDMIN_KEY)) {
+        String[] parts = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
+        }
+        compoundMin = Math.max(1, Integer.parseInt(parts[1]));
+      } else if (line.startsWith(COMPOUNDRULE_KEY)) {
+        String[] parts = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
+        }
+        this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
       }
     }
 
@@ -442,6 +458,21 @@ public class Dictionary {
     stripOffsets[currentIndex] = currentOffset;
   }
 
+  private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
+      throws IOException, ParseException {
+    String line;
+    List<CompoundRule> compoundRules = new ArrayList<>();
+    for (int i = 0; i < num; i++) {
+      line = reader.readLine();
+      String[] parts = line.split("\\s+");
+      if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
+        throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
+      }
+      compoundRules.add(new CompoundRule(parts[1], this));
+    }
+    return compoundRules;
+  }
+
   private Breaks parseBreaks(LineNumberReader reader, String line)
       throws IOException, ParseException {
     Set<String> starting = new LinkedHashSet<>();
@@ -910,7 +941,7 @@ public class Dictionary {
       reuse.append(caseFold(word.charAt(i)));
     }
     reuse.append(FLAG_SEPARATOR);
-    reuse.append(HIDDEN_FLAG);
+    flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
     reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
     writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
   }
@@ -1188,16 +1219,19 @@ public class Dictionary {
     return null;
   }
 
-  boolean isForbiddenWord(char[] word, BytesRef scratch) {
+  boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
     if (forbiddenword != -1) {
-      IntsRef forms = lookupWord(word, 0, word.length);
-      if (forms != null) {
-        int formStep = formStep();
-        for (int i = 0; i < forms.length; i += formStep) {
-          if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) {
-            return true;
-          }
-        }
+      IntsRef forms = lookupWord(word, 0, length);
+      return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
+    }
+    return false;
+  }
+
+  boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) {
+    int formStep = formStep();
+    for (int i = 0; i < forms.length; i += formStep) {
+      if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) {
+        return true;
       }
     }
     return false;
@@ -1227,6 +1261,8 @@ public class Dictionary {
      * @return Parsed flags
      */
     abstract char[] parseFlags(String rawFlags);
+
+    abstract void appendFlag(char flag, StringBuilder to);
   }
 
   /**
@@ -1238,6 +1274,11 @@ public class Dictionary {
     public char[] parseFlags(String rawFlags) {
       return rawFlags.toCharArray();
     }
+
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      to.append(flag);
+    }
   }
 
   /**
@@ -1266,6 +1307,14 @@ public class Dictionary {
       }
       return flags;
     }
+
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      if (to.length() > 0) {
+        to.append(",");
+      }
+      to.append((int) flag);
+    }
   }
 
   /**
@@ -1300,6 +1349,16 @@ public class Dictionary {
       builder.getChars(0, builder.length(), flags, 0);
       return flags;
     }
+
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      to.append((char) (flag >> 8));
+      to.append((char) (flag & 0xff));
+    }
+  }
+
+  boolean hasCompounding() {
+    return compoundRules != null;
   }
 
   boolean hasFlag(int entryId, char flag, BytesRef scratch) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index a3e765b..66e21a1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -16,7 +16,10 @@
  */
 package org.apache.lucene.analysis.hunspell;
 
+import java.util.ArrayList;
+import java.util.List;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /**
  * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
@@ -37,26 +40,100 @@ public class SpellChecker {
   public boolean spell(String word) {
     if (word.isEmpty()) return true;
 
-    char[] wordChars = word.toCharArray();
-    if (dictionary.isForbiddenWord(wordChars, scratch)) {
-      return false;
+    if (dictionary.needsInputCleaning) {
+      word = dictionary.cleanInput(word, new StringBuilder()).toString();
     }
 
     if (isNumber(word)) {
       return true;
     }
 
-    if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
+    char[] wordChars = word.toCharArray();
+    if (checkWord(wordChars, wordChars.length, false)) {
       return true;
     }
 
-    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
+    WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
+    if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
+      return true;
+    }
+
+    if (dictionary.breaks.isNotEmpty()
+        && !hasTooManyBreakOccurrences(word)
+        && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
       return tryBreaks(word);
     }
 
     return false;
   }
 
+  private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
+    char[] caseVariant = wordChars;
+    if (wordCase == WordCase.UPPER) {
+      caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
+      if (checkWord(caseVariant, wordChars.length, true)) {
+        return true;
+      }
+    }
+    return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
+  }
+
+  private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
+    if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
+      return false;
+    }
+
+    if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) {
+      return true;
+    }
+
+    if (dictionary.hasCompounding()) {
+      return checkCompounds(wordChars, 0, length, new ArrayList<>());
+    }
+
+    return false;
+  }
+
+  private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
+    if (words.size() >= 100) return false;
+
+    int limit = length - dictionary.compoundMin + 1;
+    for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
+      IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
+      if (forms != null) {
+        words.add(forms);
+
+        if (dictionary.compoundRules != null
+            && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words, scratch))) {
+          if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
+            return true;
+          }
+
+          if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
+            return true;
+          }
+        }
+
+        words.remove(words.size() - 1);
+      }
+    }
+
+    return false;
+  }
+
+  private boolean checkLastCompoundPart(
+      char[] wordChars, int start, int length, List<IntsRef> words) {
+    IntsRef forms = dictionary.lookupWord(wordChars, start, length);
+    if (forms == null) return false;
+
+    words.add(forms);
+    boolean result =
+        dictionary.compoundRules != null
+            && dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
+    words.remove(words.size() - 1);
+    return result;
+  }
+
   private static boolean isNumber(String s) {
     int i = 0;
     while (i < s.length()) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 1355627..3bb46a7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -112,8 +112,8 @@ final class Stemmer {
   private char[] titleBuffer = new char[8];
 
   /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
-  private WordCase caseOf(char[] word, int length) {
-    if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
+  WordCase caseOf(char[] word, int length) {
+    if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
       return WordCase.MIXED;
     }
 
@@ -121,22 +121,24 @@ final class Stemmer {
   }
 
   /** folds titlecase variant of word to titleBuffer */
-  private void caseFoldTitle(char[] word, int length) {
+  char[] caseFoldTitle(char[] word, int length) {
     titleBuffer = ArrayUtil.grow(titleBuffer, length);
     System.arraycopy(word, 0, titleBuffer, 0, length);
     for (int i = 1; i < length; i++) {
       titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
     }
+    return titleBuffer;
   }
 
   /** folds lowercase variant of word (title cased) to lowerBuffer */
-  private void caseFoldLower(char[] word, int length) {
+  char[] caseFoldLower(char[] word, int length) {
     lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
     System.arraycopy(word, 0, lowerBuffer, 0, length);
     lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
+    return lowerBuffer;
   }
 
-  private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
+  List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
     List<CharsRef> stems = new ArrayList<>();
     IntsRef forms = dictionary.lookupWord(word, 0, length);
     if (forms != null) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
index 7d9e2e7..04adf7a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@@ -23,7 +23,7 @@ enum WordCase {
   MIXED;
 
   static WordCase caseOf(char[] word, int length) {
-    boolean capitalized = Character.isUpperCase(word[0]);
+    boolean startsWithLower = Character.isLowerCase(word[0]);
 
     boolean seenUpper = false;
     boolean seenLower = false;
@@ -34,11 +34,11 @@ enum WordCase {
       if (seenUpper && seenLower) break;
     }
 
-    return get(capitalized, seenUpper, seenLower);
+    return get(startsWithLower, seenUpper, seenLower);
   }
 
   static WordCase caseOf(CharSequence word, int length) {
-    boolean capitalized = Character.isUpperCase(word.charAt(0));
+    boolean startsWithLower = Character.isLowerCase(word.charAt(0));
 
     boolean seenUpper = false;
     boolean seenLower = false;
@@ -49,11 +49,11 @@ enum WordCase {
       if (seenUpper && seenLower) break;
     }
 
-    return get(capitalized, seenUpper, seenLower);
+    return get(startsWithLower, seenUpper, seenLower);
   }
 
-  private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
-    if (capitalized) {
+  private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
+    if (!startsWithLower) {
       return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
     }
     return seenUpper ? MIXED : LOWER;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index a478dda..cfa1719 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -43,6 +43,38 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("breakoff");
   }
 
+  public void testCompoundrule() throws Exception {
+    doTest("compoundrule");
+  }
+
+  public void testCompoundrule2() throws Exception {
+    doTest("compoundrule2");
+  }
+
+  public void testCompoundrule3() throws Exception {
+    doTest("compoundrule3");
+  }
+
+  public void testCompoundrule4() throws Exception {
+    doTest("compoundrule4");
+  }
+
+  public void testCompoundrule5() throws Exception {
+    doTest("compoundrule5");
+  }
+
+  public void testCompoundrule6() throws Exception {
+    doTest("compoundrule6");
+  }
+
+  public void testCompoundrule7() throws Exception {
+    doTest("compoundrule7");
+  }
+
+  public void testCompoundrule8() throws Exception {
+    doTest("compoundrule8");
+  }
+
   protected void doTest(String name) throws Exception {
     InputStream affixStream =
         Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 5e8fdff..a0ece78 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
+import java.util.Random;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
@@ -33,6 +34,7 @@ import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;
+import org.junit.Test;
 
 public class TestDictionary extends LuceneTestCase {
 
@@ -268,6 +270,27 @@ public class TestDictionary extends LuceneTestCase {
     assertNotNull(Dictionary.getFlagParsingStrategy("FLAG    UTF-8"));
   }
 
+  @Test
+  public void testFlagSerialization() {
+    Random r = random();
+    char[] flags = new char[r.nextInt(10)];
+    for (int i = 0; i < flags.length; i++) {
+      flags[i] = (char) r.nextInt(Character.MAX_VALUE);
+    }
+
+    String[] flagLines = {"FLAG long", "FLAG UTF-8", "FLAG num"};
+    for (String flagLine : flagLines) {
+      Dictionary.FlagParsingStrategy strategy = Dictionary.getFlagParsingStrategy(flagLine);
+      StringBuilder serialized = new StringBuilder();
+      for (char flag : flags) {
+        strategy.appendFlag(flag, serialized);
+      }
+
+      char[] deserialized = strategy.parseFlags(serialized.toString());
+      assertEquals(new String(flags), new String(deserialized));
+    }
+  }
+
   private Directory getDirectory() {
     return newDirectory();
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff
new file mode 100644
index 0000000..09309e0
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff
@@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE ABC
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic
new file mode 100644
index 0000000..b11e829
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic
@@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/BC
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good
new file mode 100644
index 0000000..c7a0763
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good
@@ -0,0 +1,2 @@
+abc
+acc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong
new file mode 100644
index 0000000..bc151ea
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong
@@ -0,0 +1,39 @@
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+aa
+aaa
+aaaa
+ab
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+ac
+aac
+aacc
+aaaccc
+bc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff
new file mode 100644
index 0000000..e4b86a5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff
@@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE A*B*C*
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic
new file mode 100644
index 0000000..7d07bbc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic
@@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good
new file mode 100644
index 0000000..de743bb
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good
@@ -0,0 +1,37 @@
+aa
+aaa
+aaaa
+ab
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abc
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+ac
+acc
+aac
+aacc
+aaaccc
+bc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
+abcc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong
new file mode 100644
index 0000000..9e5d38d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong
@@ -0,0 +1,8 @@
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+cba
+cab
+acb
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff
new file mode 100644
index 0000000..0053145
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff
@@ -0,0 +1,3 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 1
+COMPOUNDRULE A?B?C?
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic
new file mode 100644
index 0000000..7d07bbc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic
@@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good
new file mode 100644
index 0000000..7f51889
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good
@@ -0,0 +1,7 @@
+a
+b
+c
+ab
+abc
+ac
+bc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong
new file mode 100644
index 0000000..6bd1d80
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong
@@ -0,0 +1,41 @@
+aa
+aaa
+aaaa
+aab
+aaab
+aaaab
+abb
+aabb
+aaabbb
+bb
+bbb
+bbbb
+aaab
+abcc
+abbc
+abbcc
+aabc
+aabcc
+aabbc
+aabbcc
+aaabbbccc
+acc
+aac
+aacc
+aaaccc
+bcc
+bbc
+bbcc
+bbbccc
+cc
+ccc
+cccccc
+abcc
+ba
+aaabaaa
+bbaaa
+aaaaba
+bbbbbaa
+cba
+cab
+acb
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
new file mode 100644
index 0000000..8a9996c
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
@@ -0,0 +1,7 @@
+# English ordinal numbers
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND c
+COMPOUNDRULE 2
+COMPOUNDRULE n*1t
+COMPOUNDRULE n*mp
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
new file mode 100644
index 0000000..ced0735
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
@@ -0,0 +1,24 @@
+22
+0/nm
+1/n1
+2/nm
+3/nm
+4/nm
+5/nm
+6/nm
+7/nm
+8/nm
+9/nm
+0th/pt
+1st/p
+1th/tc
+2nd/p
+2th/tc
+3rd/p
+3th/tc
+4th/pt
+5th/pt
+6th/pt
+7th/pt
+8th/pt
+9th/pt
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
new file mode 100644
index 0000000..8694943
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
@@ -0,0 +1,31 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
+1ST
+42ND
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong
new file mode 100644
index 0000000..99f28e7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong
@@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff
new file mode 100644
index 0000000..4650246
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff
@@ -0,0 +1,7 @@
+# number + percent
+SET UTF-8
+COMPOUNDMIN 1
+COMPOUNDRULE 2
+COMPOUNDRULE N*%?
+COMPOUNDRULE NN*.NN*%?
+WORDCHARS 0123456789‰.
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic
new file mode 100644
index 0000000..eeeffda
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic
@@ -0,0 +1,14 @@
+13
+0/N	po:num
+1/N	po:num
+2/N	po:num
+3/N	po:num
+4/N	po:num
+5/N	po:num
+6/N	po:num
+7/N	po:num
+8/N	po:num
+9/N	po:num
+./.	po:sign_dot
+%/%	po:sign_percent
+‰/%	po:sign_per_mille
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good
new file mode 100644
index 0000000..691fca1
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good
@@ -0,0 +1,7 @@
+10%
+0.2%
+0.20%
+123.4561‰
+10
+0000
+10.25
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong
new file mode 100644
index 0000000..ba1fe32
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong
@@ -0,0 +1 @@
+.25
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff
new file mode 100644
index 0000000..e8a088d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff
@@ -0,0 +1,4 @@
+COMPOUNDMIN 1
+COMPOUNDRULE 2
+COMPOUNDRULE A*A
+COMPOUNDRULE A*AAB*BBBC*C
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic
new file mode 100644
index 0000000..7d07bbc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic
@@ -0,0 +1,5 @@
+3
+a/A
+b/B
+c/C
+
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good
new file mode 100644
index 0000000..55a8f8b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good
@@ -0,0 +1,4 @@
+aa
+aaaaaa
+aabbbc
+aaaaabbbbbbcccccc
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong
new file mode 100644
index 0000000..48b376d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong
@@ -0,0 +1,4 @@
+abc
+abbbbbccccccc
+aabbccccccc
+aabbbbbbb
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff
new file mode 100644
index 0000000..3ae1fc7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff
@@ -0,0 +1,8 @@
+# English ordinal numbers (parenthesized long flags)
+FLAG long
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND cc
+COMPOUNDRULE 2
+COMPOUNDRULE (nn)*(11)(tt)
+COMPOUNDRULE (nn)*(mm)(pp)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic
new file mode 100644
index 0000000..ad4bb4d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic
@@ -0,0 +1,24 @@
+22
+0/nnmm
+1/nn11
+2/nnmm
+3/nnmm
+4/nnmm
+5/nnmm
+6/nnmm
+7/nnmm
+8/nnmm
+9/nnmm
+0th/pptt
+1st/pp
+1th/ttcc
+2nd/pp
+2th/ttcc
+3rd/pp
+3th/ttcc
+4th/pptt
+5th/pptt
+6th/pptt
+7th/pptt
+8th/pptt
+9th/pptt
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good
new file mode 100644
index 0000000..fafe64a5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good
@@ -0,0 +1,29 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong
new file mode 100644
index 0000000..99f28e7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong
@@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff
new file mode 100644
index 0000000..03a423d
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff
@@ -0,0 +1,8 @@
+# English ordinal numbers (parenthesized numerical flags)
+FLAG num
+WORDCHARS 0123456789
+COMPOUNDMIN 1
+ONLYINCOMPOUND 1000
+COMPOUNDRULE 2
+COMPOUNDRULE (1001)*(1002)(2001)
+COMPOUNDRULE (1001)*(2002)(2000)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic
new file mode 100644
index 0000000..e156e95
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic
@@ -0,0 +1,24 @@
+22
+0/1001,2002
+1/1001,1002
+2/1001,2002
+3/1001,2002
+4/1001,2002
+5/1001,2002
+6/1001,2002
+7/1001,2002
+8/1001,2002
+9/1001,2002
+0th/2000,2001
+1st/2000
+1th/2001,1000
+2nd/2000
+2th/2001,1000
+3rd/2000
+3th/2001,1000
+4th/2000,2001
+5th/2000,2001
+6th/2000,2001
+7th/2000,2001
+8th/2000,2001
+9th/2000,2001
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good
new file mode 100644
index 0000000..fafe64a5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good
@@ -0,0 +1,29 @@
+1st
+2nd
+3rd
+4th
+5th
+6th
+7th
+8th
+9th
+10th
+11th
+12th
+13th
+14th
+15th
+16th
+17th
+18th
+19th
+20th
+21st
+22nd
+23rd
+24th
+25th
+100th
+1000th
+10001st
+10011th
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong
new file mode 100644
index 0000000..99f28e7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong
@@ -0,0 +1,5 @@
+1th
+2th
+3th
+10001th
+10011st