Posted to commits@lucene.apache.org by do...@apache.org on 2023/01/13 11:48:54 UTC
[lucene] 01/09: LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (#975)
This is an automated email from the ASF dual-hosted git repository.
donnerpeter pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit 3b763af12f576b8b8120e98ae38b7a67461a99e8
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Tue Jul 5 21:38:03 2022 +0200
LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (#975)
---
lucene/CHANGES.txt | 3 +
.../lucene/analysis/hunspell/AffixedWord.java | 119 +++++
.../lucene/analysis/hunspell/DictEntries.java | 24 +-
.../apache/lucene/analysis/hunspell/DictEntry.java | 109 +++++
.../lucene/analysis/hunspell/Dictionary.java | 91 +++-
.../lucene/analysis/hunspell/EntrySuggestion.java | 65 +++
.../apache/lucene/analysis/hunspell/Hunspell.java | 50 ++-
.../apache/lucene/analysis/hunspell/Stemmer.java | 297 +++++++------
.../analysis/hunspell/WordFormGenerator.java | 487 +++++++++++++++++++++
.../lucene/analysis/hunspell/TestDictionary.java | 5 +-
.../lucene/analysis/hunspell/TestHunspell.java | 134 +++++-
.../analysis/hunspell/TestSpellChecking.java | 87 +++-
.../apache/lucene/analysis/hunspell/compress.aff | 14 +
.../apache/lucene/analysis/hunspell/compress.dic | 2 +
.../apache/lucene/analysis/hunspell/flagutf8.aff | 15 +
.../apache/lucene/analysis/hunspell/flagutf8.dic | 2 +
.../apache/lucene/analysis/hunspell/flagutf8.good | 8 +
.../lucene/analysis/hunspell/forbiddenword.aff | 3 +
.../lucene/analysis/hunspell/forbiddenword.dic | 6 +-
19 files changed, 1351 insertions(+), 170 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 210ff7b56b4..dc471f0e0e6 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -83,6 +83,9 @@ New Features
* GITHUB#11869: RangeOnRangeFacetCounts added, supporting numeric range "relationship" faceting over docvalue-stored
ranges. (Marc D'Mello)
+* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
+ analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)
+
Improvements
---------------------
* GITHUB#11785: Improve Tessellator performance by delaying calls to the method
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java
new file mode 100644
index 00000000000..f0b8b1b58a4
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/** An object representing the analysis result of a simple (non-compound) word */
+public final class AffixedWord {
+ private final String word;
+ private final DictEntry entry;
+ private final List<Affix> prefixes;
+ private final List<Affix> suffixes;
+
+ AffixedWord(String word, DictEntry entry, List<Affix> prefixes, List<Affix> suffixes) {
+ this.word = word;
+ this.entry = entry;
+ this.prefixes = Collections.unmodifiableList(prefixes);
+ this.suffixes = Collections.unmodifiableList(suffixes);
+ }
+
+ /** @return the word being analyzed */
+ public String getWord() {
+ return word;
+ }
+
+ /** @return the dictionary entry for the stem in this analysis */
+ public DictEntry getDictEntry() {
+ return entry;
+ }
+
+ /** @return the list of prefixes applied to the stem, at most two, outermost first */
+ public List<Affix> getPrefixes() {
+ return prefixes;
+ }
+
+ /** @return the list of suffixes applied to the stem, at most two, outermost first */
+ public List<Affix> getSuffixes() {
+ return suffixes;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof AffixedWord that)) return false;
+ return word.equals(that.word)
+ && entry.equals(that.entry)
+ && prefixes.equals(that.prefixes)
+ && suffixes.equals(that.suffixes);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(word, entry, prefixes, suffixes);
+ }
+
+ @Override
+ public String toString() {
+ return "AffixedWord["
+ + ("word=" + word + ", ")
+ + ("entry=" + entry + ", ")
+ + ("prefixes=" + prefixes + ", ")
+ + ("suffixes=" + suffixes)
+ + "]";
+ }
+
+ /** An object representing a prefix or a suffix applied to a word stem */
+ public static final class Affix {
+ final int affixId;
+ private final String presentableFlag;
+
+ Affix(Dictionary dictionary, int affixId) {
+ this.affixId = affixId;
+ char encodedFlag = dictionary.affixData(affixId, AFFIX_FLAG);
+ presentableFlag = dictionary.flagParsingStrategy.printFlag(encodedFlag);
+ }
+
+ /**
+ * @return the corresponding affix flag as it appears in the *.aff file. Depending on the
+ * format, it could be a Unicode character, two ASCII characters, or an integer in decimal
+ * form
+ */
+ public String getFlag() {
+ return presentableFlag;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ return this == o || o instanceof Affix a && affixId == a.affixId;
+ }
+
+ @Override
+ public int hashCode() {
+ return affixId;
+ }
+
+ @Override
+ public String toString() {
+ return presentableFlag + "(id=" + affixId + ")";
+ }
+ }
+}
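For context, a minimal illustrative sketch (not part of this commit) of inspecting the analysis results exposed by this class, assuming a Hunspell instance `hunspell` built over a loaded Dictionary; the "recreated"/"create" example mirrors the new TestHunspell case below:

    List<AffixedWord> analyses = hunspell.analyzeSimpleWord("recreated");
    for (AffixedWord aw : analyses) {
      // the *.dic entry the surface form was derived from, e.g. "create"
      System.out.println("stem: " + aw.getDictEntry().getStem());
      // at most two prefixes and two suffixes, outermost first; getFlag() prints the *.aff flag
      aw.getPrefixes().forEach(p -> System.out.println("prefix flag: " + p.getFlag()));
      aw.getSuffixes().forEach(s -> System.out.println("suffix flag: " + s.getFlag()));
    }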
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java
index d9174dcbc7e..42dadc101b7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java
@@ -24,26 +24,22 @@ import java.util.List;
*
* @see Dictionary#lookupEntries
*/
-public interface DictEntries {
+public interface DictEntries extends List<DictEntry> {
/**
* @return a positive number of dictionary entries with the same word. Most often it's 1 (unless
* there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can
* be passed into other methods of this class.
*/
+ @Override
int size();
- /**
- * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
- * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
- * {@code ph:}) associated with the homonym at the given entry index, or an empty string
- */
- String getMorphologicalData(int entryIndex);
+ /** Same as {@code get(entryIndex).getMorphologicalData()} */
+ default String getMorphologicalData(int entryIndex) {
+ return get(entryIndex).getMorphologicalData();
+ }
- /**
- * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
- * @param key the key in the form {@code kk:} by which to filter the morphological fields
- * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
- * associated with the homonym at the given entry index
- */
- List<String> getMorphologicalValues(int entryIndex, String key);
+ /** Same as {@code get(entryIndex).getMorphologicalValues(key)} */
+ default List<String> getMorphologicalValues(int entryIndex, String key) {
+ return get(entryIndex).getMorphologicalValues(key);
+ }
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java
new file mode 100644
index 00000000000..ee76e77c39a
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/** An object representing a *.dic file entry with its word, flags and morphological data. */
+public abstract class DictEntry {
+ private final String stem;
+
+ DictEntry(String stem) {
+ this.stem = stem;
+ }
+
+ @Override
+ public String toString() {
+ String result = stem;
+ String flags = getFlags();
+ if (!flags.isEmpty()) {
+ result += "/" + flags;
+ }
+ String morph = getMorphologicalData();
+ if (!morph.isEmpty()) {
+ result += " " + morph;
+ }
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof DictEntry that)) return false;
+ return stem.equals(that.stem)
+ && getMorphologicalData().equals(that.getMorphologicalData())
+ && getFlags().equals(that.getFlags());
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(stem, getFlags(), getMorphologicalData());
+ }
+
+ /** @return the stem word in the dictionary */
+ public String getStem() {
+ return stem;
+ }
+
+ /**
+ * @return the flags associated with the dictionary entry, encoded in the same format as in the
+ * *.dic file, but possibly in a different order
+ */
+ public abstract String getFlags();
+
+ /**
+ * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
+ * {@code ph:}) associated with this dictionary entry, or an empty string
+ */
+ public abstract String getMorphologicalData();
+
+ /**
+ * @param key the key in the form {@code kk:} by which to filter the morphological fields
+ * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
+ * associated with this dictionary entry
+ */
+ public List<String> getMorphologicalValues(String key) {
+ assert key.length() == 3 && key.charAt(2) == ':'
+ : "A morphological data key should consist of two letters followed by a semicolon, found: "
+ + key;
+
+ String data = getMorphologicalData();
+ if (data.isEmpty() || !data.contains(key)) return Collections.emptyList();
+
+ return Arrays.stream(data.split(" "))
+ .filter(s -> s.startsWith(key))
+ .map(s -> s.substring(3))
+ .toList();
+ }
+
+ static DictEntry create(String stem, String flags) {
+ return new DictEntry(stem) {
+ @Override
+ public String getFlags() {
+ return flags;
+ }
+
+ @Override
+ public String getMorphologicalData() {
+ return "";
+ }
+ };
+ }
+}
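For context, an illustrative sketch (not part of this commit) of reading a DictEntry via the reworked DictEntries API, assuming a Dictionary `dictionary` loaded from the test data exercised in TestDictionary below, where "simplenoun" carries flag A and an fr:42 morphological field (java.util imports assumed):

    DictEntries entries = dictionary.lookupEntries("simplenoun");
    if (entries != null) {
      for (DictEntry entry : entries) {        // DictEntries now extends List<DictEntry>
        System.out.println(entry.getStem());   // "simplenoun"
        System.out.println(entry.getFlags());  // flags as encoded in the *.dic file, e.g. "A"
        // values of the morphological fields with the given key, e.g. [42] for "fr:"
        System.out.println(entry.getMorphologicalValues("fr:"));
      }
    }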
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 2fb73d01ed6..af894d35a99 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -34,6 +34,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
+import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -537,31 +538,33 @@ public class Dictionary {
IntsRef forms = lookupWord(root.toCharArray(), 0, root.length());
if (forms == null) return null;
- return new DictEntries() {
+ class DictEntriesImpl extends AbstractList<DictEntry> implements DictEntries {
@Override
public int size() {
- return forms.length / (hasCustomMorphData ? 2 : 1);
+ return forms.length / formStep();
}
@Override
- public String getMorphologicalData(int entryIndex) {
- if (!hasCustomMorphData) return "";
- return morphData.get(forms.ints[forms.offset + entryIndex * 2 + 1]);
+ public DictEntry get(int entryIndex) {
+ return dictEntry(
+ root,
+ forms.ints[forms.offset + (entryIndex * formStep())],
+ hasCustomMorphData ? forms.ints[forms.offset + entryIndex * 2 + 1] : 0);
+ }
+ }
+ return new DictEntriesImpl();
+ }
+
+ DictEntry dictEntry(String root, int flagId, int morphDataId) {
+ return new DictEntry(root) {
+ @Override
+ public String getFlags() {
+ return Dictionary.this.flagParsingStrategy.printFlags(flagLookup.getFlags(flagId));
}
@Override
- public List<String> getMorphologicalValues(int entryIndex, String key) {
- assert key.length() == 3 && key.charAt(2) == ':'
- : "A morphological data key should consist of two letters followed by a semicolon, found: "
- + key;
-
- String fields = getMorphologicalData(entryIndex);
- if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList();
-
- return Arrays.stream(fields.split(" "))
- .filter(s -> s.startsWith(key))
- .map(s -> s.substring(3))
- .collect(Collectors.toList());
+ public String getMorphologicalData() {
+ return morphDataId == 0 ? "" : morphData.get(morphDataId);
}
};
}
@@ -1155,7 +1158,7 @@ public class Dictionary {
} else {
end = line.indexOf(MORPH_SEPARATOR);
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
- String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end);
+ String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
if (aliasCount > 0 && !flagPart.isEmpty()) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
@@ -1329,6 +1332,12 @@ public class Dictionary {
return false;
}
+ boolean isFlagAppendedByAffix(int affixId, char flag) {
+ if (affixId < 0 || flag == FLAG_UNSET) return false;
+ int appendId = affixData(affixId, AFFIX_APPEND);
+ return hasFlag(appendId, flag);
+ }
+
/** Abstraction of the process of parsing flags taken from the affix and dic files */
abstract static class FlagParsingStrategy {
// we don't check the flag count, as Hunspell accepts longer sequences
@@ -1356,6 +1365,27 @@ public class Dictionary {
* @return Parsed flags
*/
abstract char[] parseFlags(String rawFlags);
+
+ /**
+ * @return the original string representation of the given flag encoded by {@link #parseFlags}.
+ */
+ abstract String printFlag(char flag);
+
+ /** @return a presentable sorted concatenation of {@link #printFlag} results */
+ String printFlags(char[] encodedFlags) {
+ List<String> printed = new ArrayList<>();
+ for (char c : encodedFlags) {
+ if (c >= DEFAULT_FLAGS) continue;
+ printed.add(printFlag(c));
+ }
+ String delimiter = this instanceof NumFlagParsingStrategy ? "," : "";
+ return printed.stream().sorted().collect(Collectors.joining(delimiter));
+ }
+
+ /** Parse flags from a string resulting from {@link #printFlags} */
+ char[] parseUtfFlags(String flagsInUtf) {
+ return parseFlags(flagsInUtf);
+ }
}
/**
@@ -1367,6 +1397,11 @@ public class Dictionary {
public char[] parseFlags(String rawFlags) {
return rawFlags.toCharArray();
}
+
+ @Override
+ String printFlag(char flag) {
+ return String.valueOf(flag);
+ }
}
/** Used to read flags as UTF-8 even if the rest of the file is in the default (8-bit) encoding */
@@ -1375,6 +1410,16 @@ public class Dictionary {
public char[] parseFlags(String rawFlags) {
return new String(rawFlags.getBytes(DEFAULT_CHARSET), StandardCharsets.UTF_8).toCharArray();
}
+
+ @Override
+ String printFlag(char flag) {
+ return String.valueOf(flag);
+ }
+
+ @Override
+ char[] parseUtfFlags(String flagsInUtf) {
+ return flagsInUtf.toCharArray();
+ }
}
/**
@@ -1405,6 +1450,11 @@ public class Dictionary {
return result.toString().toCharArray();
}
+
+ @Override
+ String printFlag(char flag) {
+ return String.valueOf((int) flag);
+ }
}
/**
@@ -1432,6 +1482,11 @@ public class Dictionary {
}
return flags;
}
+
+ @Override
+ String printFlag(char flag) {
+ return new String(new char[] {(char) ((flag & 0xff00) >>> 8), (char) (flag & 0xff)});
+ }
}
boolean hasFlag(int entryId, char flag) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/EntrySuggestion.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/EntrySuggestion.java
new file mode 100644
index 00000000000..3fe489dcb22
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/EntrySuggestion.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Suggestion to add/edit dictionary entries to generate a given list of words created by {@link
+ * WordFormGenerator#compress}.
+ */
+public class EntrySuggestion {
+ private final List<DictEntry> toEdit, toAdd;
+ private final List<String> extraGenerated;
+
+ EntrySuggestion(List<DictEntry> toEdit, List<DictEntry> toAdd, List<String> extraGenerated) {
+ this.toEdit = Collections.unmodifiableList(toEdit);
+ this.toAdd = Collections.unmodifiableList(toAdd);
+ this.extraGenerated = Collections.unmodifiableList(extraGenerated);
+ }
+
+ /**
+ * @return the existing dictionary entries whose flags would need changing to accommodate the
+ * given word list
+ */
+ public List<DictEntry> getEntriesToEdit() {
+ return toEdit;
+ }
+
+ /** @return new dictionary entries to be added to accommodate the given word list */
+ public List<DictEntry> getEntriesToAdd() {
+ return toAdd;
+ }
+
+ /**
+ * @return additional words generated by union of {@link #getEntriesToAdd()} and {@link
+ * #getEntriesToEdit()} which weren't in the given list of words
+ */
+ public List<String> getExtraGeneratedWords() {
+ return extraGenerated;
+ }
+
+ @Override
+ public String toString() {
+ return "EntrySuggestion{" + internalsToString() + '}';
+ }
+
+ String internalsToString() {
+ return "toEdit=" + toEdit + ", toAdd=" + toAdd + ", extra=" + extraGenerated;
+ }
+}
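For context, an illustrative sketch (not part of this commit) of consuming an EntrySuggestion, assuming a Hunspell instance `hunspell`; the word list is hypothetical:

    // "munch"-like: suggest dictionary entries that would generate the given words
    EntrySuggestion suggestion = hunspell.compress(List.of("create", "created", "creating"));
    if (suggestion != null) {                       // null when nothing could be generated
      suggestion.getEntriesToEdit().forEach(e -> System.out.println("edit: " + e));
      suggestion.getEntriesToAdd().forEach(e -> System.out.println("add:  " + e));
      // words the suggested entries would produce beyond the requested list
      System.out.println("extra: " + suggestion.getExtraGeneratedWords());
    }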
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index 90e35930442..4123bcc28e1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -178,7 +178,7 @@ public class Hunspell {
offset,
length,
context,
- (stem, formID, morphDataId) -> {
+ (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
if (checkCase && !acceptCase(originalCase, formID, stem)) {
return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
}
@@ -316,6 +316,52 @@ public class Hunspell {
.collect(Collectors.toList());
}
+ /**
+ * @return all possible analyses of the given word with stems, prefixes, suffixes and
+ * morphological data. Note that the order of the returned objects might not correspond to the
+ * *.dic file order!
+ */
+ public List<AffixedWord> analyzeSimpleWord(String word) {
+ List<AffixedWord> result = new ArrayList<>();
+ stemmer.analyze(
+ word.toCharArray(),
+ word.length(),
+ (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
+ List<AffixedWord.Affix> prefixes = new ArrayList<>();
+ List<AffixedWord.Affix> suffixes = new ArrayList<>();
+ if (outerPrefix >= 0) prefixes.add(new AffixedWord.Affix(dictionary, outerPrefix));
+ if (innerPrefix >= 0) prefixes.add(new AffixedWord.Affix(dictionary, innerPrefix));
+ if (outerSuffix >= 0) suffixes.add(new AffixedWord.Affix(dictionary, outerSuffix));
+ if (innerSuffix >= 0) suffixes.add(new AffixedWord.Affix(dictionary, innerSuffix));
+
+ DictEntry entry = dictionary.dictEntry(stem.toString(), formID, morphDataId);
+ result.add(new AffixedWord(word, entry, prefixes, suffixes));
+ return true;
+ });
+ return result;
+ }
+
+ /**
+ * Generate all word forms for all dictionary entries with the given root word. The result order
+ * is stable but not specified. This is equivalent to "unmunch" from the "hunspell-tools" package.
+ *
+ * @see WordFormGenerator for finer-grained APIs
+ */
+ public List<AffixedWord> getAllWordForms(String root) {
+ return new WordFormGenerator(dictionary).getAllWordForms(root, checkCanceled);
+ }
+
+ /**
+ * Given a list of words, try to produce a smaller set of dictionary entries (with some flags)
+ * that would generate these words. This is equivalent to "munch" from the "hunspell-tools"
+ * package.
+ *
+ * @see WordFormGenerator#compress for more details and control
+ */
+ public EntrySuggestion compress(List<String> words) {
+ return new WordFormGenerator(dictionary).compress(words, Set.of(), checkCanceled);
+ }
+
private class CompoundPart {
final CompoundPart prev;
final int index, length;
@@ -433,7 +479,7 @@ public class Hunspell {
words.add(ref);
Stemmer.RootProcessor stopOnMatching =
- (stem, formID, morphDataId) -> {
+ (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
ref.ints[0] = formID;
return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
};
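For orientation, an illustrative sketch (not part of this commit) tying together the three new Hunspell entry points added above; it assumes a Dictionary `dictionary` already loaded from *.aff/*.dic files, and the constructor call matches the one previously used in TestHunspell below:

    Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});

    // analysis introspection: stems plus the applied prefixes/suffixes
    List<AffixedWord> analyses = hunspell.analyzeSimpleWord("recreated");
    // stem expansion ("unmunch"): every form generated by entries with this root
    List<AffixedWord> forms = hunspell.getAllWordForms("create");
    // stem/flag suggestion ("munch"): a hypothetical word list to compress into entries
    EntrySuggestion suggestion = hunspell.compress(List.of("work", "works", "worked"));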
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 012f8bb6696..0e11d457c3c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.List;
-import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
@@ -65,7 +64,18 @@ final class Stemmer {
* @return List of stems for the word
*/
public List<CharsRef> stem(char[] word, int length) {
+ List<CharsRef> list = new ArrayList<>();
+ analyze(
+ word,
+ length,
+ (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
+ list.add(newStem(stem, morphDataId));
+ return true;
+ });
+ return list;
+ }
+ void analyze(char[] word, int length, RootProcessor processor) {
if (dictionary.mayNeedInputCleaning()) {
CharsRef scratchSegment = new CharsRef(word, 0, length);
if (dictionary.needsInputCleaning(scratchSegment)) {
@@ -77,19 +87,12 @@ final class Stemmer {
word = scratchBuffer;
}
}
-
- List<CharsRef> list = new ArrayList<>();
if (length == 0) {
- return list;
+ return;
}
- RootProcessor processor =
- (stem, formID, stemException) -> {
- list.add(newStem(stem, stemException));
- return true;
- };
if (!doStem(word, 0, length, WordContext.SIMPLE_WORD, processor)) {
- return list;
+ return;
}
WordCase wordCase = caseOf(word, length);
@@ -99,7 +102,6 @@ final class Stemmer {
doStem(variant, 0, varLength, WordContext.SIMPLE_WORD, processor);
varyCase(word, length, wordCase, variationProcessor);
}
- return list;
}
interface CaseVariationProcessor {
@@ -214,7 +216,7 @@ final class Stemmer {
if (result == null) return true;
String src = new String(word, 0, length);
- for (String s : result.collect(Collectors.toList())) {
+ for (String s : result.toList()) {
if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
return false;
}
@@ -239,13 +241,61 @@ final class Stemmer {
if (!isRootCompatibleWithContext(context, -1, entryId)) {
continue;
}
- if (!callProcessor(word, offset, length, processor, forms, i)) {
+ CharsRef charsRef = new CharsRef(word, offset, length);
+ if (!processor.processRoot(charsRef, entryId, morphDataId(forms, i), -1, -1, -1, -1)) {
return false;
}
}
}
- return stem(
- word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
+ StemCandidateProcessor stemProcessor =
+ new StemCandidateProcessor(context) {
+ @Override
+ boolean processStemCandidate(
+ char[] word,
+ int offset,
+ int length,
+ int lastAffix,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix) {
+ IntsRef forms = dictionary.lookupWord(word, offset, length);
+ if (forms == null) return true;
+
+ char flag = dictionary.affixData(lastAffix, Dictionary.AFFIX_FLAG);
+ int prefixId = innerPrefix >= 0 ? innerPrefix : outerPrefix;
+ for (int i = 0; i < forms.length; i += formStep) {
+ int entryId = forms.ints[forms.offset + i];
+ if (dictionary.hasFlag(entryId, flag)
+ || dictionary.isFlagAppendedByAffix(prefixId, flag)) {
+ if (innerPrefix < 0 && outerPrefix >= 0) {
+ char prefixFlag = dictionary.affixData(outerPrefix, Dictionary.AFFIX_FLAG);
+ if (!dictionary.hasFlag(entryId, prefixFlag)
+ && !dictionary.isFlagAppendedByAffix(lastAffix, prefixFlag)) {
+ continue;
+ }
+ }
+
+ if (!isRootCompatibleWithContext(context, lastAffix, entryId)) {
+ continue;
+ }
+
+ if (!processor.processRoot(
+ new CharsRef(word, offset, length),
+ entryId,
+ morphDataId(forms, i),
+ outerPrefix,
+ innerPrefix,
+ outerSuffix,
+ innerSuffix)) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+ };
+ return removeAffixes(word, offset, length, true, -1, -1, -1, stemProcessor);
}
/**
@@ -277,9 +327,20 @@ final class Stemmer {
* Dictionary#hasFlag(int, char)}
* @param morphDataId the id of the custom morphological data (0 if none), to be used with
* {@link Dictionary#morphData}
+ * @param outerPrefix the id of the outer prefix applied to the stem, or -1 if none
+ * @param innerPrefix the id of the inner prefix applied to the stem, or -1 if none
+ * @param outerSuffix the id of the outer suffix applied to the stem, or -1 if none
+ * @param innerSuffix the id of the inner suffix applied to the stem, or -1 if none
* @return whether the processing should be continued
*/
- boolean processRoot(CharsRef stem, int formID, int morphDataId);
+ boolean processRoot(
+ CharsRef stem,
+ int formID,
+ int morphDataId,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix);
}
private String stemException(int morphDataId) {
@@ -318,33 +379,23 @@ final class Stemmer {
}
/**
- * Generates a list of stems for the provided word
+ * Generates a list of stems for the provided word. It's called recursively when applying affixes
+ * one by one, setting {@code (inner/outer)(Suffix/Prefix)} parameters to non-negative values as
+ * that happens.
*
* @param word Word to generate the stems for
- * @param previous previous affix that was removed (so we dont remove same one twice)
- * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any
- * affixes in this recursive step
- * @param prefixId ID of the most inner removed prefix, so that when removing a suffix, it's also
- * checked against the word
- * @param recursionDepth current recursiondepth
* @param doPrefix true if we should remove prefixes
- * @param previousWasPrefix true if the previous removal was a prefix: if we are removing a
- * suffix, and it has no continuation requirements, it's ok. but two prefixes
- * (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @return whether the processing should be continued
*/
- private boolean stem(
+ boolean removeAffixes(
char[] word,
int offset,
int length,
- WordContext context,
- int previous,
- char prevFlag,
- int prefixId,
- int recursionDepth,
boolean doPrefix,
- boolean previousWasPrefix,
- RootProcessor processor) {
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ StemCandidateProcessor processor) {
FST.Arc<IntsRef> arc = new FST.Arc<>();
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
@@ -366,11 +417,11 @@ final class Stemmer {
for (int j = 0; j < prefixes.length; j++) {
int prefix = prefixes.ints[prefixes.offset + j];
- if (prefix == previous) {
+ if (prefix == outerPrefix) {
continue;
}
- if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
+ if (isAffixCompatible(prefix, true, outerPrefix, outerSuffix, processor.context)) {
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
if (strippedWord == null) {
continue;
@@ -381,12 +432,11 @@ final class Stemmer {
strippedWord,
pureAffix ? offset + i : 0,
pureAffix ? length - i : strippedWord.length,
- context,
prefix,
- previous,
- -1,
- recursionDepth,
true,
+ outerPrefix,
+ innerPrefix,
+ outerSuffix,
processor)) {
return false;
}
@@ -415,12 +465,11 @@ final class Stemmer {
for (int j = 0; j < suffixes.length; j++) {
int suffix = suffixes.ints[suffixes.offset + j];
- if (suffix == previous) {
+ if (suffix == outerSuffix) {
continue;
}
- if (isAffixCompatible(
- suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
+ if (isAffixCompatible(suffix, false, outerPrefix, outerSuffix, processor.context)) {
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
if (strippedWord == null) {
continue;
@@ -431,12 +480,11 @@ final class Stemmer {
strippedWord,
pureAffix ? offset : 0,
pureAffix ? i : strippedWord.length,
- context,
suffix,
- previous,
- prefixId,
- recursionDepth,
false,
+ outerPrefix,
+ innerPrefix,
+ outerSuffix,
processor)) {
return false;
}
@@ -487,14 +535,10 @@ final class Stemmer {
}
private boolean isAffixCompatible(
- int affix,
- char prevFlag,
- int recursionDepth,
- boolean isPrefix,
- boolean previousWasPrefix,
- WordContext context) {
+ int affix, boolean isPrefix, int outerPrefix, int outerSuffix, WordContext context) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
+ boolean previousWasPrefix = outerSuffix < 0 && outerPrefix >= 0;
if (context.isCompound()) {
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
return false;
@@ -513,79 +557,70 @@ final class Stemmer {
return false;
}
- if (recursionDepth == 0) {
+ if (outerPrefix == -1 && outerSuffix == -1) {
return true;
}
if (dictionary.isCrossProduct(affix)) {
- // cross check incoming continuation class (flag of previous affix) against list.
- return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
+ // cross-check incoming continuation class (flag of previous affix) against this affix's flags
+ if (previousWasPrefix) return true;
+ if (outerSuffix >= 0) {
+ char prevFlag = dictionary.affixData(outerSuffix, Dictionary.AFFIX_FLAG);
+ return dictionary.hasFlag(append, prevFlag);
+ }
}
return false;
}
/**
- * Applies the affix rule to the given word, producing a list of stems if any are found
+ * Applies the affix rule to the given word, producing a list of stems if any are found.
+ * Non-negative {@code (inner/outer)(Suffix/Prefix)} parameters indicate the already applied
+ * affixes.
*
- * @param strippedWord Char array containing the word with the affix removed and the strip added
+ * @param word Char array containing the word with the affix removed and the strip added
* @param offset where the word actually starts in the array
* @param length the length of the stripped word
- * @param affix HunspellAffix representing the affix rule itself
- * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
- * suffix, unless both are compatible so we must check dictionary form against both to add it
- * as a stem!
- * @param recursionDepth current recursion depth
+ * @param affix the id of the affix in {@link Dictionary#affixData}
* @param prefix true if we are removing a prefix (false if it's a suffix)
* @return whether the processing should be continued
*/
private boolean applyAffix(
- char[] strippedWord,
+ char[] word,
int offset,
int length,
- WordContext context,
int affix,
- int previousAffix,
- int prefixId,
- int recursionDepth,
boolean prefix,
- RootProcessor processor) {
- char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ StemCandidateProcessor processor) {
+ int prefixId = innerPrefix >= 0 ? innerPrefix : outerPrefix;
+ int previousAffix = outerSuffix >= 0 ? outerSuffix : prefixId;
+
+ int innerSuffix = -1;
+ if (prefix) {
+ if (outerPrefix < 0) outerPrefix = affix;
+ else innerPrefix = affix;
+ } else {
+ if (outerSuffix < 0) outerSuffix = affix;
+ else innerSuffix = affix;
+ }
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
- IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
- if (forms != null) {
- for (int i = 0; i < forms.length; i += formStep) {
- int entryId = forms.ints[forms.offset + i];
- if (dictionary.hasFlag(entryId, flag) || isFlagAppendedByAffix(prefixId, flag)) {
- // confusing: in this one exception, we already chained the first prefix against the
- // second,
- // so it doesnt need to be checked against the word
- boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
- if (!chainedPrefix && prefixId >= 0) {
- char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
- if (!dictionary.hasFlag(entryId, prefixFlag)
- && !isFlagAppendedByAffix(affix, prefixFlag)) {
- continue;
- }
- }
-
- if (!isRootCompatibleWithContext(context, affix, entryId)) {
- continue;
- }
-
- if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
- return false;
- }
- }
- }
+ if (!skipLookup
+ && !processor.processStemCandidate(
+ word, offset, length, affix, outerPrefix, innerPrefix, outerSuffix, innerSuffix)) {
+ return false;
}
+ int recursionDepth =
+ (outerSuffix >= 0 ? 1 : 0) + (innerPrefix >= 0 ? 2 : outerPrefix >= 0 ? 1 : 0) - 1;
if (dictionary.isCrossProduct(affix) && recursionDepth <= 1) {
+ char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
boolean doPrefix;
if (recursionDepth == 0) {
if (prefix) {
- prefixId = affix;
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStagePrefix(flag);
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
@@ -599,33 +634,42 @@ final class Stemmer {
return true;
}
} else {
- doPrefix = false;
if (prefix && dictionary.complexPrefixes) {
- prefixId = affix;
+ doPrefix = true;
// we took away the second prefix: go look for another suffix
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageSuffix(flag)) {
return true;
+ } else {
+ // we took away a prefix, then a suffix: go look for another suffix
+ doPrefix = false;
}
- // we took away a prefix, then a suffix: go look for another suffix
}
- return stem(
- strippedWord,
- offset,
- length,
- context,
- affix,
- flag,
- prefixId,
- recursionDepth + 1,
- doPrefix,
- prefix,
- processor);
+ return removeAffixes(
+ word, offset, length, doPrefix, outerPrefix, innerPrefix, outerSuffix, processor);
}
return true;
}
+ abstract static class StemCandidateProcessor {
+ private final WordContext context;
+
+ StemCandidateProcessor(WordContext context) {
+ this.context = context;
+ }
+
+ abstract boolean processStemCandidate(
+ char[] word,
+ int offset,
+ int length,
+ int lastAffix,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix);
+ }
+
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
return false;
@@ -633,39 +677,32 @@ final class Stemmer {
if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
char cFlag = context.requiredFlag(dictionary);
return dictionary.hasFlag(entryId, cFlag)
- || isFlagAppendedByAffix(lastAffix, cFlag)
+ || dictionary.isFlagAppendedByAffix(lastAffix, cFlag)
|| dictionary.hasFlag(entryId, dictionary.compoundFlag)
- || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
+ || dictionary.isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
}
return true;
}
- private boolean callProcessor(
- char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
- CharsRef stem = new CharsRef(word, offset, length);
- int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
- return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId);
+ private int morphDataId(IntsRef forms, int i) {
+ return dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
}
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
char circumfix = dictionary.circumfix;
// if circumfix was previously set by a prefix, we must check this suffix,
// to ensure it has it, and vice versa
- if (isSuffix
- && isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
- return true;
+ if (isSuffix) {
+ if (dictionary.isFlagAppendedByAffix(prefixId, circumfix)
+ != dictionary.isFlagAppendedByAffix(affix, circumfix)) {
+ return true;
+ }
}
- if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
+ if (dictionary.isFlagAppendedByAffix(affix, dictionary.needaffix)) {
return !isSuffix
|| previousAffix < 0
- || isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
+ || dictionary.isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
}
return false;
}
-
- private boolean isFlagAppendedByAffix(int affixId, char flag) {
- if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false;
- int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
- return dictionary.hasFlag(appendId, flag);
- }
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
new file mode 100644
index 00000000000..043df4fbb1d
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
@@ -0,0 +1,487 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
+import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
+import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.lucene.analysis.hunspell.AffixedWord.Affix;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.IntsRefFSTEnum;
+
+/**
+ * A utility class used for generating possible word forms by adding affixes to stems ({@link
+ * #getAllWordForms(String, String, Runnable)}), and suggesting stems and flags to generate the
+ * given set of words ({@link #compress(List, Set, Runnable)}).
+ */
+public class WordFormGenerator {
+ private final Dictionary dictionary;
+ private final Map<Character, List<AffixEntry>> affixes = new HashMap<>();
+ private final Stemmer stemmer;
+
+ public WordFormGenerator(Dictionary dictionary) {
+ this.dictionary = dictionary;
+ fillAffixMap(dictionary.prefixes, AffixKind.PREFIX);
+ fillAffixMap(dictionary.suffixes, AffixKind.SUFFIX);
+ stemmer = new Stemmer(dictionary);
+ }
+
+ private void fillAffixMap(FST<IntsRef> fst, AffixKind kind) {
+ if (fst == null) return;
+
+ IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(fst);
+ try {
+ while (true) {
+ IntsRefFSTEnum.InputOutput<IntsRef> io = fstEnum.next();
+ if (io == null) break;
+
+ IntsRef affixIds = io.output;
+ for (int j = 0; j < affixIds.length; j++) {
+ int id = affixIds.ints[affixIds.offset + j];
+ char flag = dictionary.affixData(id, AFFIX_FLAG);
+ var entry =
+ new AffixEntry(id, flag, kind, toString(kind, io.input), strip(id), condition(id));
+ affixes.computeIfAbsent(flag, __ -> new ArrayList<>()).add(entry);
+ }
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ }
+
+ private String toString(AffixKind kind, IntsRef input) {
+ char[] affixChars = new char[input.length];
+ for (int i = 0; i < affixChars.length; i++) {
+ affixChars[kind == AffixKind.PREFIX ? i : affixChars.length - i - 1] =
+ (char) input.ints[input.offset + i];
+ }
+ return new String(affixChars);
+ }
+
+ private AffixCondition condition(int affixId) {
+ int condition = dictionary.getAffixCondition(affixId);
+ return condition == 0 ? AffixCondition.ALWAYS_TRUE : dictionary.patterns.get(condition);
+ }
+
+ private String strip(int affixId) {
+ int stripOrd = dictionary.affixData(affixId, Dictionary.AFFIX_STRIP_ORD);
+ int stripStart = dictionary.stripOffsets[stripOrd];
+ int stripEnd = dictionary.stripOffsets[stripOrd + 1];
+ return new String(dictionary.stripData, stripStart, stripEnd - stripStart);
+ }
+
+ /**
+ * Generate all word forms for all dictionary entries with the given root word. The result order
+ * is stable but not specified. This is equivalent to "unmunch" from the "hunspell-tools" package.
+ *
+ * @param checkCanceled an object that's periodically called, allowing the generation to be
+ * interrupted by throwing an exception
+ */
+ public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
+ Set<AffixedWord> result = new LinkedHashSet<>();
+ DictEntries entries = dictionary.lookupEntries(root);
+ if (entries != null) {
+ for (DictEntry entry : entries) {
+ result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
+ }
+ }
+ return new ArrayList<>(result);
+ }
+
+ /**
+ * Generate all word forms for the given root pretending it has the given flags (in the same
+ * format as the dictionary uses). The result order is stable but not specified. This is
+ * equivalent to "unmunch" from the "hunspell-tools" package.
+ *
+ * @param checkCanceled an object that's periodically called, allowing the generation to be
+ * interrupted by throwing an exception
+ */
+ public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
+ var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags));
+ if (!shouldConsiderAtAll(encodedFlags)) return List.of();
+
+ LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
+ AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
+ checkCanceled.run();
+ if (!encodedFlags.contains(dictionary.needaffix)) {
+ result.add(bare);
+ }
+ result.addAll(expand(bare, encodedFlags, checkCanceled));
+ return new ArrayList<>(result);
+ }
+
+ private boolean canStemToOriginal(AffixedWord derived) {
+ String word = derived.getWord();
+ char[] chars = word.toCharArray();
+ if (isForbiddenWord(chars, 0, chars.length)) {
+ return false;
+ }
+
+ String stem = derived.getDictEntry().getStem();
+ var processor =
+ new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
+ boolean foundStem = false;
+ boolean foundForbidden = false;
+
+ @Override
+ boolean processStemCandidate(
+ char[] chars,
+ int offset,
+ int length,
+ int lastAffix,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix) {
+ if (isForbiddenWord(chars, offset, length)) {
+ foundForbidden = true;
+ return false;
+ }
+ foundStem |= length == stem.length() && stem.equals(new String(chars, offset, length));
+ return !foundStem;
+ }
+ };
+ stemmer.removeAffixes(chars, 0, chars.length, true, -1, -1, -1, processor);
+ return processor.foundStem && !processor.foundForbidden;
+ }
+
+ private boolean isForbiddenWord(char[] chars, int offset, int length) {
+ if (dictionary.forbiddenword != FLAG_UNSET) {
+ IntsRef forms = dictionary.lookupWord(chars, offset, length);
+ if (forms != null) {
+ for (int i = 0; i < forms.length; i += dictionary.formStep()) {
+ if (dictionary.hasFlag(forms.ints[forms.offset + i], dictionary.forbiddenword)) {
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+ private static LinkedHashSet<Character> toSet(char[] flags) {
+ LinkedHashSet<Character> set = new LinkedHashSet<>();
+ for (char c : flags) {
+ set.add(c);
+ }
+ return set;
+ }
+
+ private LinkedHashSet<AffixedWord> expand(
+ AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
+ LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
+ for (Character flag : flags) {
+ List<AffixEntry> entries = affixes.get(flag);
+ if (entries == null) continue;
+
+ for (AffixEntry affix : entries) {
+ checkCanceled.run();
+ AffixedWord derived = affix.apply(stem, dictionary);
+ if (derived != null) {
+ LinkedHashSet<Character> append = appendFlags(affix);
+ if (shouldConsiderAtAll(append)) {
+ if (canStemToOriginal(derived)) {
+ result.add(derived);
+ }
+ if (dictionary.isCrossProduct(affix.id)) {
+ result.addAll(expand(derived, updateFlags(flags, flag, append), checkCanceled));
+ }
+ }
+ }
+ }
+ }
+ return result;
+ }
+
+ private boolean shouldConsiderAtAll(Set<Character> flags) {
+ return !flags.contains(dictionary.compoundBegin)
+ && !flags.contains(dictionary.compoundMiddle)
+ && !flags.contains(dictionary.compoundEnd)
+ && !flags.contains(dictionary.forbiddenword)
+ && !flags.contains(dictionary.onlyincompound);
+ }
+
+ private LinkedHashSet<Character> updateFlags(
+ Set<Character> flags, Character toRemove, Set<Character> toAppend) {
+ LinkedHashSet<Character> copy = new LinkedHashSet<>(flags);
+ copy.remove(toRemove);
+ copy.addAll(toAppend);
+ return copy;
+ }
+
+ private LinkedHashSet<Character> appendFlags(AffixEntry affix) {
+ char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
+ return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId));
+ }
+
+ /**
+ * Given a list of words, try to produce a smaller set of dictionary entries (with some flags)
+ * that would generate these words. This is equivalent to "munch" from the "hunspell-tools"
+ * package. The algorithm tries to minimize the number of dictionary entries to add or change,
+ * the number of flags involved, and the number of non-requested additionally generated words. All
+ * the mentioned words are in the dictionary format and case: no ICONV/OCONV/IGNORE conversions
+ * are applied.
+ *
+ * @param words the list of words to generate
+ * @param forbidden the set of words to avoid generating
+ * @param checkCanceled an object that's periodically called, allowing the generation to be
+ * interrupted by throwing an exception
+ * @return the information about suggested dictionary entries and overgenerated words, or {@code
+ * null} if the algorithm couldn't generate anything
+ */
+ public EntrySuggestion compress(
+ List<String> words, Set<String> forbidden, Runnable checkCanceled) {
+ if (words.isEmpty()) return null;
+ if (words.stream().anyMatch(forbidden::contains)) {
+ throw new IllegalArgumentException("'words' and 'forbidden' shouldn't intersect");
+ }
+
+ return new WordCompressor(words, forbidden, checkCanceled).compress();
+ }
+
+ private record AffixEntry(
+ int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
+ AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
+ if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null;
+
+ String word = stem.getWord();
+ boolean isPrefix = kind == AffixKind.PREFIX;
+ if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
+
+ String stripped =
+ isPrefix
+ ? word.substring(strip.length())
+ : word.substring(0, word.length() - strip.length());
+ if (!condition.acceptsStem(stripped)) return null;
+
+ String applied = isPrefix ? affix + stripped : stripped + affix;
+ List<Affix> prefixes = new ArrayList<>(stem.getPrefixes());
+ List<Affix> suffixes = new ArrayList<>(stem.getSuffixes());
+ (isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
+ return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
+ }
+
+ private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) {
+ boolean isPrefix = kind == AffixKind.PREFIX;
+ List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
+ if (sameAffixes.size() == 2) return false;
+ if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false;
+ if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
+ if (sameAffixes.size() == 1
+ && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
+ return false;
+ }
+ return true;
+ }
+ }
+
+ private class WordCompressor {
+ private final Comparator<State> solutionFitness =
+ Comparator.comparingInt((State s) -> s.forbidden)
+ .thenComparingInt(s -> s.underGenerated)
+ .thenComparingInt(s -> s.stemToFlags.size())
+ .thenComparingInt(s -> s.overGenerated);
+ private final Set<String> forbidden;
+ private final Runnable checkCanceled;
+ private final Set<String> wordSet;
+ private final Set<String> existingStems;
+ private final Map<String, Set<FlagSet>> stemToPossibleFlags = new HashMap<>();
+ private final Map<String, Integer> stemCounts = new LinkedHashMap<>();
+
+ WordCompressor(List<String> words, Set<String> forbidden, Runnable checkCanceled) {
+ this.forbidden = forbidden;
+ this.checkCanceled = checkCanceled;
+ wordSet = new HashSet<>(words);
+
+ Stemmer.StemCandidateProcessor processor =
+ new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
+ @Override
+ boolean processStemCandidate(
+ char[] word,
+ int offset,
+ int length,
+ int lastAffix,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix) {
+ String candidate = new String(word, offset, length);
+ stemCounts.merge(candidate, 1, Integer::sum);
+ Set<Character> flags = new LinkedHashSet<>();
+ if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
+ if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
+ if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
+ if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG));
+ stemToPossibleFlags
+ .computeIfAbsent(candidate, __ -> new LinkedHashSet<>())
+ .add(new FlagSet(flags, dictionary));
+ return true;
+ }
+ };
+
+ for (String word : words) {
+ checkCanceled.run();
+ stemCounts.merge(word, 1, Integer::sum);
+ stemToPossibleFlags.computeIfAbsent(word, __ -> new LinkedHashSet<>());
+ stemmer.removeAffixes(word.toCharArray(), 0, word.length(), true, -1, -1, -1, processor);
+ }
+
+ existingStems =
+ stemCounts.keySet().stream()
+ .filter(stem -> dictionary.lookupEntries(stem) != null)
+ .collect(Collectors.toSet());
+ }
+
+ EntrySuggestion compress() {
+ Comparator<String> stemSorter =
+ Comparator.comparing((String s) -> existingStems.contains(s))
+ .thenComparing(stemCounts::get)
+ .reversed();
+ List<String> sortedStems = stemCounts.keySet().stream().sorted(stemSorter).toList();
+ PriorityQueue<State> queue = new PriorityQueue<>(solutionFitness);
+ queue.offer(new State(Map.of(), wordSet.size(), 0, 0));
+ State result = null;
+ while (!queue.isEmpty()) {
+ State state = queue.poll();
+ if (state.underGenerated == 0) {
+ if (result == null || solutionFitness.compare(state, result) < 0) result = state;
+ if (state.forbidden == 0) break;
+ continue;
+ }
+
+ for (String stem : sortedStems) {
+ if (!state.stemToFlags.containsKey(stem)) {
+ queue.offer(addStem(state, stem));
+ }
+ }
+
+ for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
+ for (FlagSet flags : stemToPossibleFlags.get(entry.getKey())) {
+ if (!entry.getValue().contains(flags)) {
+ queue.offer(addFlags(state, entry.getKey(), flags));
+ }
+ }
+ }
+ }
+ return result == null ? null : toSuggestion(result);
+ }
+
+ EntrySuggestion toSuggestion(State state) {
+ List<DictEntry> toEdit = new ArrayList<>();
+ List<DictEntry> toAdd = new ArrayList<>();
+ for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
+ addEntry(toEdit, toAdd, entry.getKey(), FlagSet.flatten(entry.getValue()));
+ }
+
+ List<String> extraGenerated = new ArrayList<>();
+ for (String extra : allGenerated(state.stemToFlags).distinct().sorted().toList()) {
+ if (wordSet.contains(extra)) continue;
+
+ if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) {
+ addEntry(toEdit, toAdd, extra, Set.of(dictionary.forbiddenword));
+ } else {
+ extraGenerated.add(extra);
+ }
+ }
+
+ return new EntrySuggestion(toEdit, toAdd, extraGenerated);
+ }
+
+ private void addEntry(
+ List<DictEntry> toEdit, List<DictEntry> toAdd, String stem, Set<Character> flags) {
+ String flagString = toFlagString(flags);
+ (existingStems.contains(stem) ? toEdit : toAdd).add(DictEntry.create(stem, flagString));
+ }
+
+ private State addStem(State state, String stem) {
+ LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
+ stemToFlags.put(stem, Set.of());
+ return newState(stemToFlags);
+ }
+
+ private State addFlags(State state, String stem, FlagSet flags) {
+ LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
+ Set<FlagSet> flagSets = new LinkedHashSet<>(stemToFlags.get(stem));
+ flagSets.add(flags);
+ stemToFlags.put(stem, flagSets);
+ return newState(stemToFlags);
+ }
+
+ private State newState(Map<String, Set<FlagSet>> stemToFlags) {
+ Set<String> allGenerated = allGenerated(stemToFlags).collect(Collectors.toSet());
+ return new State(
+ stemToFlags,
+ (int) wordSet.stream().filter(s -> !allGenerated.contains(s)).count(),
+ (int) allGenerated.stream().filter(s -> !wordSet.contains(s)).count(),
+ (int) allGenerated.stream().filter(s -> forbidden.contains(s)).count());
+ }
+
+ private final Map<StemWithFlags, List<String>> expansionCache = new HashMap<>();
+
+ private record StemWithFlags(String stem, Set<FlagSet> flags) {}
+
+ private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
+ Function<StemWithFlags, List<String>> expandToWords =
+ e -> expand(e.stem, FlagSet.flatten(e.flags)).stream().map(w -> w.getWord()).toList();
+ return stemToFlags.entrySet().stream()
+ .map(e -> new StemWithFlags(e.getKey(), e.getValue()))
+ .flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream());
+ }
+
+ private List<AffixedWord> expand(String stem, Set<Character> flagSet) {
+ return getAllWordForms(stem, toFlagString(flagSet), checkCanceled);
+ }
+
+ private String toFlagString(Set<Character> flagSet) {
+ return dictionary.flagParsingStrategy.printFlags(Dictionary.toSortedCharArray(flagSet));
+ }
+ }
+
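+ // A set of affix flags; the dictionary is kept around to print the flags in its flag format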
+ private record FlagSet(Set<Character> flags, Dictionary dictionary) {
+ static Set<Character> flatten(Set<FlagSet> flagSets) {
+ return flagSets.stream().flatMap(f -> f.flags.stream()).collect(Collectors.toSet());
+ }
+
+ @Override
+ public String toString() {
+ return dictionary.flagParsingStrategy.printFlags(Dictionary.toSortedCharArray(flags));
+ }
+ }
+
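+ // A search state: the chosen stems with their flag sets, plus the counts of requested words
+ // not yet generated, of generated words outside the requested set, and of forbidden words produced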
+ private record State(
+ Map<String, Set<FlagSet>> stemToFlags,
+ int underGenerated,
+ int overGenerated,
+ int forbidden) {}
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 3ba00941109..b18b2a4015e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -28,6 +28,7 @@ import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
@@ -275,7 +276,9 @@ public class TestDictionary extends LuceneTestCase {
DictEntries simpleNoun = dic.lookupEntries("simplenoun");
assertEquals(1, simpleNoun.size());
assertEquals(Collections.emptyList(), simpleNoun.getMorphologicalValues(0, "aa:"));
- assertEquals(Collections.singletonList("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
+ assertEquals(List.of("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
+ assertEquals(List.of("42"), simpleNoun.get(0).getMorphologicalValues("fr:"));
+ assertEquals("A", simpleNoun.get(0).getFlags());
DictEntries lay = dic.lookupEntries("lay");
String actual =
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
index 3da1f114a16..9524f682546 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
@@ -24,8 +24,13 @@ import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
import java.io.IOException;
import java.text.ParseException;
import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;
@@ -72,9 +77,134 @@ public class TestHunspell extends LuceneTestCase {
@Test
public void testStemmingApi() throws Exception {
- Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
- Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
+ Hunspell hunspell = loadNoTimeout("simple");
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
}
+
+ @Test
+ public void testAnalysisApi() throws Exception {
+ Hunspell hunspell = loadNoTimeout("base");
+ assertEquals(hunspell.analyzeSimpleWord("nonexistent"), List.of());
+ AffixedWord word = hunspell.analyzeSimpleWord("recreated").get(0);
+ checkAffixedWord(word, "create", List.of("A"), List.of("D"));
+ }
+
+ @Test
+ public void testAnalysisSeveralSuffixes() throws Exception {
+ Hunspell hunspell = loadNoTimeout("needaffix5");
+ AffixedWord word = hunspell.analyzeSimpleWord("pseudoprefoopseudosufbar").get(0);
+ checkAffixedWord(word, "foo", List.of("C"), List.of("B", "A"));
+ }
+
+ @Test
+ public void testAnalysisFlagLong() throws Exception {
+ AffixedWord word = loadNoTimeout("flaglong").analyzeSimpleWord("foos").get(0);
+ checkAffixedWord(word, "foo", List.of(), List.of("Y1"));
+ }
+
+ @Test
+ public void testAnalysisFlagNum() throws Exception {
+ AffixedWord word = loadNoTimeout("flagnum").analyzeSimpleWord("foos").get(0);
+ checkAffixedWord(word, "foo", List.of(), List.of("65000"));
+ }
+
+ @Test
+ public void testAnalysisMorphData() throws Exception {
+ List<AffixedWord> words = loadNoTimeout("morphdata").analyzeSimpleWord("works");
+ assertEquals(2, words.size());
+ AffixedWord verb =
+ words.get(words.get(0).getDictEntry().getMorphologicalData().contains("verb") ? 0 : 1);
+ AffixedWord noun = words.get(words.get(0) != verb ? 0 : 1);
+ assertNotNull(verb);
+ assertNotNull(noun);
+ checkAffixedWord(verb, "work", List.of(), List.of("A"));
+ checkAffixedWord(noun, "work", List.of(), List.of("B"));
+
+ assertEquals(List.of("worknoun"), noun.getDictEntry().getMorphologicalValues("st:"));
+ assertEquals(List.of("workverb"), verb.getDictEntry().getMorphologicalValues("st:"));
+ assertEquals("st:worknoun", noun.getDictEntry().getMorphologicalData());
+ assertEquals("st:workverb", verb.getDictEntry().getMorphologicalData());
+ }
+
+ private void checkAffixedWord(
+ AffixedWord word, String stem, List<String> prefixFlags, List<String> suffixFlags) {
+ assertEquals(stem, word.getDictEntry().getStem());
+ assertEquals(prefixFlags, word.getPrefixes().stream().map(AffixedWord.Affix::getFlag).toList());
+ assertEquals(suffixFlags, word.getSuffixes().stream().map(AffixedWord.Affix::getFlag).toList());
+ }
+
+ private Hunspell loadNoTimeout(String name) throws Exception {
+ Dictionary dictionary = loadDictionary(false, name + ".aff", name + ".dic");
+ return new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
+ }
+
+ @Test
+ public void testExpandRootApi() throws Exception {
+ Hunspell h = loadNoTimeout("base");
+ String[] createFormsBase = {
+ "create", "created", "creates", "creating", "creation", "creations"
+ };
+ List<String> expected =
+ Stream.concat(
+ Stream.of(createFormsBase).flatMap(s -> Stream.of(s, "pro" + s, "re" + s)),
+ Stream.of("creative"))
+ .sorted()
+ .toList();
+
+ Map<String, AffixedWord> expanded =
+ TestSpellChecking.checkExpansionGeneratesCorrectWords(h, "create", "base").stream()
+ .collect(Collectors.toMap(w -> w.getWord(), w -> w));
+ assertEquals(expected, expanded.keySet().stream().sorted().toList());
+
+ checkAffixedWord(expanded.get("created"), "create", List.of(), List.of("D"));
+ checkAffixedWord(expanded.get("recreated"), "create", List.of("A"), List.of("D"));
+
+ WordFormGenerator generator = new WordFormGenerator(h.dictionary);
+ List<AffixedWord> overrideFlag = generator.getAllWordForms("create", "U", () -> {});
+ assertEquals(
+ Set.of("create", "uncreate"),
+ overrideFlag.stream().map(w -> w.getWord()).collect(Collectors.toSet()));
+
+ List<AffixedWord> nonExistentRoot = generator.getAllWordForms("form", "S", () -> {});
+ assertEquals(
+ Set.of("form", "forms"),
+ nonExistentRoot.stream().map(w -> w.getWord()).collect(Collectors.toSet()));
+ }
+
+ @Test
+ public void testCompressingApi() throws Exception {
+ Hunspell h = loadNoTimeout("base");
+ String[] createQuery = {"create", "created", "creates", "creating", "creation"};
+ checkCompression(h, "toEdit=[create/DGNS], toAdd=[], extra=[]", createQuery);
+ checkCompression(h, "toEdit=[created], toAdd=[creates], extra=[]", "creates", "created");
+ checkCompression(h, "toEdit=[], toAdd=[creation/S], extra=[]", "creation", "creations");
+ checkCompression(h, "toEdit=[], toAdd=[abc, def], extra=[]", "abc", "def");
+ checkCompression(h, "toEdit=[], toAdd=[form/S], extra=[]", "form", "forms");
+
+ checkCompression(
+ loadNoTimeout("compress"), "toEdit=[], toAdd=[form/X], extra=[forms]", "form", "formx");
+ }
+
+ @Test
+ public void testCompressingIsMinimal() throws Exception {
+ Hunspell h = loadNoTimeout("compress");
+ checkCompression(
+ h, "toEdit=[], toAdd=[form/GS], extra=[]", "formings", "forming", "form", "forms");
+ }
+
+ @Test
+ public void testCompressingWithProhibition() throws Exception {
+ WordFormGenerator gen = new WordFormGenerator(loadNoTimeout("compress").dictionary);
+ assertEquals(
+ "toEdit=[], toAdd=[form/S], extra=[]",
+ gen.compress(List.of("form", "forms"), Set.of("formx"), () -> {}).internalsToString());
+ assertEquals(
+ "toEdit=[], toAdd=[form, formx], extra=[]",
+ gen.compress(List.of("form", "formx"), Set.of("forms"), () -> {}).internalsToString());
+ }
+
+ private void checkCompression(Hunspell h, String expected, String... words) {
+ assertEquals(expected, h.compress(List.of(words)).internalsToString());
+ }
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
index 15e452ce133..21927aa231a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@@ -21,8 +21,12 @@ import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
+import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
+import java.util.Set;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.IOUtils;
@@ -97,6 +101,10 @@ public class TestSpellChecking extends LuceneTestCase {
doTest("compoundflag");
}
+ public void testFlagUtf8() throws Exception {
+ doTest("flagutf8");
+ }
+
public void testCheckCompoundCase() throws Exception {
doTest("checkcompoundcase");
}
@@ -230,13 +238,15 @@ public class TestSpellChecking extends LuceneTestCase {
}
protected void doTest(String name) throws Exception {
+ //noinspection ConstantConditions
checkSpellCheckerExpectations(
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name));
}
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
- InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));
+ Path dicFile = Path.of(basePath + ".dic");
+ InputStream dictStream = Files.newInputStream(dicFile);
Hunspell speller;
try {
@@ -273,5 +283,80 @@ public class TestSpellChecking extends LuceneTestCase {
} else {
assertFalse(".sug file without .wrong file!", Files.exists(sug));
}
+
+ Set<String> everythingGenerated = expandWholeDictionary(dicFile, speller);
+ if (everythingGenerated != null && !speller.dictionary.mayNeedInputCleaning()) {
+ checkGoodSugWordsAreGenerated(speller, good, sug, everythingGenerated);
+ }
+ }
+
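+ // Expand every entry of the .dic file and collect all generated forms,
+ // or return null if some entries had to be skipped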
+ private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
+ Set<String> everythingGenerated = new HashSet<>();
+ boolean generatedEverything = true;
+ try (Stream<String> lines = Files.lines(dic, speller.dictionary.decoder.charset())) {
+ for (String line : lines.skip(1).toList()) {
+ int len = (int) line.chars().takeWhile(c -> !Character.isWhitespace(c) && c != '/').count();
+ String word = line.substring(0, len).trim();
+ if (word.isEmpty() || word.contains("\\")) {
+ generatedEverything = false;
+ continue;
+ }
+
+ List<AffixedWord> expanded =
+ checkExpansionGeneratesCorrectWords(speller, word, dic.toString());
+ expanded.forEach(w -> everythingGenerated.add(w.getWord().toLowerCase(Locale.ROOT)));
+ }
+ }
+ return generatedEverything ? everythingGenerated : null;
+ }
+
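+ // Check that all lowercase words from the *.good/*.sug files appear among the generated forms
+ // (words the speller cannot even analyze are ignored)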
+ private static void checkGoodSugWordsAreGenerated(
+ Hunspell speller, Path good, Path sug, Set<String> everythingGenerated) throws IOException {
+ Set<String> goodWords = new HashSet<>();
+ if (Files.exists(good)) {
+ Files.readAllLines(good).stream().map(String::trim).forEach(goodWords::add);
+ }
+ if (Files.exists(sug)) {
+ Files.readAllLines(sug).stream()
+ .flatMap(line -> Stream.of(line.split(", ")))
+ .map(String::trim)
+ .filter(s -> !s.contains(" "))
+ .forEach(goodWords::add);
+ }
+
+ goodWords.removeAll(everythingGenerated);
+ goodWords.removeIf(s -> !s.equals(s.toLowerCase(Locale.ROOT)));
+ goodWords.removeIf(s -> speller.analyzeSimpleWord(s).isEmpty());
+
+ assertTrue("Some *.good/sug words weren't generated: " + goodWords, goodWords.isEmpty());
+ }
+
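+ // Expand the stem, assert that every generated form is spelled correctly,
+ // and that compressing the forms back suggests the original stem among the entries to edit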
+ static List<AffixedWord> checkExpansionGeneratesCorrectWords(
+ Hunspell hunspell, String stem, String baseName) {
+ List<AffixedWord> expanded = hunspell.getAllWordForms(stem);
+ Set<AffixedWord> misspelled = new HashSet<>();
+ for (AffixedWord word : expanded) {
+ if (!hunspell.spell(word.getWord()) || hunspell.analyzeSimpleWord(word.getWord()).isEmpty()) {
+ misspelled.add(word);
+ }
+ }
+ if (!misspelled.isEmpty()) {
+ fail("Misspelled words generated in " + baseName + ": " + misspelled);
+ }
+
+ if (expanded.stream().anyMatch(e -> e.getWord().equals(stem))) {
+ EntrySuggestion suggestion =
+ hunspell.compress(expanded.stream().map(AffixedWord::getWord).toList());
+ if (suggestion != null) {
+ String message =
+ ("Compression suggests a different stem from the original " + stem)
+ + (" in " + baseName + ":" + suggestion);
+ assertTrue(
+ message,
+ suggestion.getEntriesToEdit().stream().anyMatch(e -> e.getStem().equals(stem)));
+ }
+ }
+
+ return expanded;
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff
new file mode 100644
index 00000000000..70642d81ebf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff
@@ -0,0 +1,14 @@
+FORBIDDENWORD *
+
+SFX G Y 1
+SFX G 0 ing/S .
+
+SFX J Y 1
+SFX J 0 ings .
+
+SFX S Y 1
+SFX S 0 s .
+
+SFX X Y 2
+SFX X 0 s .
+SFX X 0 x .
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.dic
new file mode 100644
index 00000000000..262348bee18
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.dic
@@ -0,0 +1,2 @@
+1
+word
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.aff
new file mode 100644
index 00000000000..d0f75c18580
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.aff
@@ -0,0 +1,15 @@
+# UTF-8 flags
+FLAG UTF-8
+
+SFX A Y 1
+SFX A 0 s/ÖüÜ .
+#SFX A 0 s/ÖüÖÜ .
+
+SFX Ö Y 1
+SFX Ö 0 bar .
+
+SFX ü Y 1
+SFX ü 0 baz .
+
+PFX Ü Y 1
+PFX Ü 0 un .
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.dic
new file mode 100644
index 00000000000..2944490c901
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.dic
@@ -0,0 +1,2 @@
+1
+foo/AÜ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.good
new file mode 100644
index 00000000000..d5c27b1a677
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.good
@@ -0,0 +1,8 @@
+foo
+foos
+foosbar
+foosbaz
+unfoo
+unfoos
+unfoosbar
+unfoosbaz
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff
index de7f8ad9a42..a01d19d3502 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff
@@ -9,3 +9,6 @@ COMPOUNDFLAG Y
SFX A Y 1
SFX A 0 s .
+
+SFX s N 1
+SFX s 0 os .
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
index b012cc8a5a0..c5c19307b33 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
@@ -1,4 +1,4 @@
-11
+14
foo/S
foo/YX
bar/YS
@@ -10,4 +10,6 @@ KG/X
cm
Cm/X
SIPS/X
-Sip/A
\ No newline at end of file
+Sip/A
+iPod/s
+iPodos/X
\ No newline at end of file