Posted to commits@lucene.apache.org by do...@apache.org on 2023/01/13 11:48:54 UTC
[lucene] 01/09: LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (#975)
This is an automated email from the ASF dual-hosted git repository.
donnerpeter pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit 3b763af12f576b8b8120e98ae38b7a67461a99e8
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Tue Jul 5 21:38:03 2022 +0200
LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (#975)
---
lucene/CHANGES.txt | 3 +
.../lucene/analysis/hunspell/AffixedWord.java | 119 +++++
.../lucene/analysis/hunspell/DictEntries.java | 24 +-
.../apache/lucene/analysis/hunspell/DictEntry.java | 109 +++++
.../lucene/analysis/hunspell/Dictionary.java | 91 +++-
.../lucene/analysis/hunspell/EntrySuggestion.java | 65 +++
.../apache/lucene/analysis/hunspell/Hunspell.java | 50 ++-
.../apache/lucene/analysis/hunspell/Stemmer.java | 297 +++++++------
.../analysis/hunspell/WordFormGenerator.java | 487 +++++++++++++++++++++
.../lucene/analysis/hunspell/TestDictionary.java | 5 +-
.../lucene/analysis/hunspell/TestHunspell.java | 134 +++++-
.../analysis/hunspell/TestSpellChecking.java | 87 +++-
.../apache/lucene/analysis/hunspell/compress.aff | 14 +
.../apache/lucene/analysis/hunspell/compress.dic | 2 +
.../apache/lucene/analysis/hunspell/flagutf8.aff | 15 +
.../apache/lucene/analysis/hunspell/flagutf8.dic | 2 +
.../apache/lucene/analysis/hunspell/flagutf8.good | 8 +
.../lucene/analysis/hunspell/forbiddenword.aff | 3 +
.../lucene/analysis/hunspell/forbiddenword.dic | 6 +-
19 files changed, 1351 insertions(+), 170 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 210ff7b56b4..dc471f0e0e6 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -83,6 +83,9 @@ New Features
* GITHUB#11869: RangeOnRangeFacetCounts added, supporting numeric range "relationship" faceting over docvalue-stored
ranges. (Marc D'Mello)
+* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
+ analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)
+
Improvements
---------------------
* GITHUB#11785: Improve Tessellator performance by delaying calls to the method
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java
new file mode 100644
index 00000000000..f0b8b1b58a4
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/** An object representing the analysis result of a simple (non-compound) word */
+public final class AffixedWord {
+ private final String word;
+ private final DictEntry entry;
+ private final List<Affix> prefixes;
+ private final List<Affix> suffixes;
+
+ AffixedWord(String word, DictEntry entry, List<Affix> prefixes, List<Affix> suffixes) {
+ this.word = word;
+ this.entry = entry;
+ this.prefixes = Collections.unmodifiableList(prefixes);
+ this.suffixes = Collections.unmodifiableList(suffixes);
+ }
+
+ /** @return the word being analyzed */
+ public String getWord() {
+ return word;
+ }
+
+ /** @return the dictionary entry for the stem in this analysis */
+ public DictEntry getDictEntry() {
+ return entry;
+ }
+
+ /** @return the list of prefixes applied to the stem, at most two, outermost first */
+ public List<Affix> getPrefixes() {
+ return prefixes;
+ }
+
+ /** @return the list of suffixes applied to the stem, at most two, outermost first */
+ public List<Affix> getSuffixes() {
+ return suffixes;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof AffixedWord that)) return false;
+ return word.equals(that.word)
+ && entry.equals(that.entry)
+ && prefixes.equals(that.prefixes)
+ && suffixes.equals(that.suffixes);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(word, entry, prefixes, suffixes);
+ }
+
+ @Override
+ public String toString() {
+ return "AffixedWord["
+ + ("word=" + word + ", ")
+ + ("entry=" + entry + ", ")
+ + ("prefixes=" + prefixes + ", ")
+ + ("suffixes=" + suffixes)
+ + "]";
+ }
+
+ /** An object representing a prefix or a suffix applied to a word stem */
+ public static final class Affix {
+ final int affixId;
+ private final String presentableFlag;
+
+ Affix(Dictionary dictionary, int affixId) {
+ this.affixId = affixId;
+ char encodedFlag = dictionary.affixData(affixId, AFFIX_FLAG);
+ presentableFlag = dictionary.flagParsingStrategy.printFlag(encodedFlag);
+ }
+
+ /**
+ * @return the corresponding affix flag as it appears in the *.aff file. Depending on the
+ * format, it could be a Unicode character, two ASCII characters, or an integer in decimal
+ * form
+ */
+ public String getFlag() {
+ return presentableFlag;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ return this == o || o instanceof Affix a && affixId == a.affixId;
+ }
+
+ @Override
+ public int hashCode() {
+ return affixId;
+ }
+
+ @Override
+ public String toString() {
+ return presentableFlag + "(id=" + affixId + ")";
+ }
+ }
+}
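For context, a minimal illustrative sketch (not part of this commit) of inspecting the analysis results exposed by this class, assuming a Hunspell instance `hunspell` built over a loaded Dictionary; the "recreated"/"create" example mirrors the new TestHunspell case below:

    List<AffixedWord> analyses = hunspell.analyzeSimpleWord("recreated");
    for (AffixedWord aw : analyses) {
      // the *.dic entry the surface form was derived from, e.g. "create"
      System.out.println("stem: " + aw.getDictEntry().getStem());
      // at most two prefixes and two suffixes, outermost first; getFlag() prints the *.aff flag
      aw.getPrefixes().forEach(p -> System.out.println("prefix flag: " + p.getFlag()));
      aw.getSuffixes().forEach(s -> System.out.println("suffix flag: " + s.getFlag()));
    }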
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java
index d9174dcbc7e..42dadc101b7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java
@@ -24,26 +24,22 @@ import java.util.List;
*
* @see Dictionary#lookupEntries
*/
-public interface DictEntries {
+public interface DictEntries extends List<DictEntry> {
/**
* @return a positive number of dictionary entries with the same word. Most often it's 1 (unless
* there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can
* be passed into other methods of this class.
*/
+ @Override
int size();
- /**
- * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
- * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
- * {@code ph:}) associated with the homonym at the given entry index, or an empty string
- */
- String getMorphologicalData(int entryIndex);
+ /** Same as {@code get(entryIndex).getMorphologicalData()} */
+ default String getMorphologicalData(int entryIndex) {
+ return get(entryIndex).getMorphologicalData();
+ }
- /**
- * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
- * @param key the key in the form {@code kk:} by which to filter the morphological fields
- * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
- * associated with the homonym at the given entry index
- */
- List<String> getMorphologicalValues(int entryIndex, String key);
+ /** Same as {@code get(entryIndex).getMorphologicalValues(key)} */
+ default List<String> getMorphologicalValues(int entryIndex, String key) {
+ return get(entryIndex).getMorphologicalValues(key);
+ }
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java
new file mode 100644
index 00000000000..ee76e77c39a
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/** An object representing a *.dic file entry with its word, flags and morphological data. */
+public abstract class DictEntry {
+ private final String stem;
+
+ DictEntry(String stem) {
+ this.stem = stem;
+ }
+
+ @Override
+ public String toString() {
+ String result = stem;
+ String flags = getFlags();
+ if (!flags.isEmpty()) {
+ result += "/" + flags;
+ }
+ String morph = getMorphologicalData();
+ if (!morph.isEmpty()) {
+ result += " " + morph;
+ }
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof DictEntry that)) return false;
+ return stem.equals(that.stem)
+ && getMorphologicalData().equals(that.getMorphologicalData())
+ && getFlags().equals(that.getFlags());
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(stem, getFlags(), getMorphologicalData());
+ }
+
+ /** @return the stem word in the dictionary */
+ public String getStem() {
+ return stem;
+ }
+
+ /**
+ * @return the flags associated with the dictionary entry, encoded in the same format as in the
+ * *.dic file, but possibly in a different order
+ */
+ public abstract String getFlags();
+
+ /**
+ * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
+ * {@code ph:}) associated with this dictionary entry, or an empty string
+ */
+ public abstract String getMorphologicalData();
+
+ /**
+ * @param key the key in the form {@code kk:} by which to filter the morphological fields
+ * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
+ * associated with this dictionary entry
+ */
+ public List<String> getMorphologicalValues(String key) {
+ assert key.length() == 3 && key.charAt(2) == ':'
+ : "A morphological data key should consist of two letters followed by a semicolon, found: "
+ + key;
+
+ String data = getMorphologicalData();
+ if (data.isEmpty() || !data.contains(key)) return Collections.emptyList();
+
+ return Arrays.stream(data.split(" "))
+ .filter(s -> s.startsWith(key))
+ .map(s -> s.substring(3))
+ .toList();
+ }
+
+ static DictEntry create(String stem, String flags) {
+ return new DictEntry(stem) {
+ @Override
+ public String getFlags() {
+ return flags;
+ }
+
+ @Override
+ public String getMorphologicalData() {
+ return "";
+ }
+ };
+ }
+}
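For context, an illustrative sketch (not part of this commit) of reading a DictEntry via the reworked DictEntries API, assuming a Dictionary `dictionary` loaded from the test data exercised in TestDictionary below, where "simplenoun" carries flag A and an fr:42 morphological field (java.util imports assumed):

    DictEntries entries = dictionary.lookupEntries("simplenoun");
    if (entries != null) {
      for (DictEntry entry : entries) {        // DictEntries now extends List<DictEntry>
        System.out.println(entry.getStem());   // "simplenoun"
        System.out.println(entry.getFlags());  // flags as encoded in the *.dic file, e.g. "A"
        // values of the morphological fields with the given key, e.g. [42] for "fr:"
        System.out.println(entry.getMorphologicalValues("fr:"));
      }
    }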
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 2fb73d01ed6..af894d35a99 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -34,6 +34,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
+import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -537,31 +538,33 @@ public class Dictionary {
IntsRef forms = lookupWord(root.toCharArray(), 0, root.length());
if (forms == null) return null;
- return new DictEntries() {
+ class DictEntriesImpl extends AbstractList<DictEntry> implements DictEntries {
@Override
public int size() {
- return forms.length / (hasCustomMorphData ? 2 : 1);
+ return forms.length / formStep();
}
@Override
- public String getMorphologicalData(int entryIndex) {
- if (!hasCustomMorphData) return "";
- return morphData.get(forms.ints[forms.offset + entryIndex * 2 + 1]);
+ public DictEntry get(int entryIndex) {
+ return dictEntry(
+ root,
+ forms.ints[forms.offset + (entryIndex * formStep())],
+ hasCustomMorphData ? forms.ints[forms.offset + entryIndex * 2 + 1] : 0);
+ }
+ }
+ return new DictEntriesImpl();
+ }
+
+ DictEntry dictEntry(String root, int flagId, int morphDataId) {
+ return new DictEntry(root) {
+ @Override
+ public String getFlags() {
+ return Dictionary.this.flagParsingStrategy.printFlags(flagLookup.getFlags(flagId));
}
@Override
- public List<String> getMorphologicalValues(int entryIndex, String key) {
- assert key.length() == 3 && key.charAt(2) == ':'
- : "A morphological data key should consist of two letters followed by a semicolon, found: "
- + key;
-
- String fields = getMorphologicalData(entryIndex);
- if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList();
-
- return Arrays.stream(fields.split(" "))
- .filter(s -> s.startsWith(key))
- .map(s -> s.substring(3))
- .collect(Collectors.toList());
+ public String getMorphologicalData() {
+ return morphDataId == 0 ? "" : morphData.get(morphDataId);
}
};
}
@@ -1155,7 +1158,7 @@ public class Dictionary {
} else {
end = line.indexOf(MORPH_SEPARATOR);
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
- String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end);
+ String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
if (aliasCount > 0 && !flagPart.isEmpty()) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
@@ -1329,6 +1332,12 @@ public class Dictionary {
return false;
}
+ boolean isFlagAppendedByAffix(int affixId, char flag) {
+ if (affixId < 0 || flag == FLAG_UNSET) return false;
+ int appendId = affixData(affixId, AFFIX_APPEND);
+ return hasFlag(appendId, flag);
+ }
+
/** Abstraction of the process of parsing flags taken from the affix and dic files */
abstract static class FlagParsingStrategy {
// we don't check the flag count, as Hunspell accepts longer sequences
@@ -1356,6 +1365,27 @@ public class Dictionary {
* @return Parsed flags
*/
abstract char[] parseFlags(String rawFlags);
+
+ /**
+ * @return the original string representation of the given flag encoded by {@link #parseFlags}.
+ */
+ abstract String printFlag(char flag);
+
+ /** @return a presentable sorted concatenation of {@link #printFlag} results */
+ String printFlags(char[] encodedFlags) {
+ List<String> printed = new ArrayList<>();
+ for (char c : encodedFlags) {
+ if (c >= DEFAULT_FLAGS) continue;
+ printed.add(printFlag(c));
+ }
+ String delimiter = this instanceof NumFlagParsingStrategy ? "," : "";
+ return printed.stream().sorted().collect(Collectors.joining(delimiter));
+ }
+
+ /** Parse flags from a string resulting from {@link #printFlags} */
+ char[] parseUtfFlags(String flagsInUtf) {
+ return parseFlags(flagsInUtf);
+ }
}
/**
@@ -1367,6 +1397,11 @@ public class Dictionary {
public char[] parseFlags(String rawFlags) {
return rawFlags.toCharArray();
}
+
+ @Override
+ String printFlag(char flag) {
+ return String.valueOf(flag);
+ }
}
/** Used to read flags as UTF-8 even if the rest of the file is in the default (8-bit) encoding */
@@ -1375,6 +1410,16 @@ public class Dictionary {
public char[] parseFlags(String rawFlags) {
return new String(rawFlags.getBytes(DEFAULT_CHARSET), StandardCharsets.UTF_8).toCharArray();
}
+
+ @Override
+ String printFlag(char flag) {
+ return String.valueOf(flag);
+ }
+
+ @Override
+ char[] parseUtfFlags(String flagsInUtf) {
+ return flagsInUtf.toCharArray();
+ }
}
/**
@@ -1405,6 +1450,11 @@ public class Dictionary {
return result.toString().toCharArray();
}
+
+ @Override
+ String printFlag(char flag) {
+ return String.valueOf((int) flag);
+ }
}
/**
@@ -1432,6 +1482,11 @@ public class Dictionary {
}
return flags;
}
+
+ @Override
+ String printFlag(char flag) {
+ return new String(new char[] {(char) ((flag & 0xff00) >>> 8), (char) (flag & 0xff)});
+ }
}
boolean hasFlag(int entryId, char flag) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/EntrySuggestion.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/EntrySuggestion.java
new file mode 100644
index 00000000000..3fe489dcb22
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/EntrySuggestion.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Suggestion to add/edit dictionary entries to generate a given list of words created by {@link
+ * WordFormGenerator#compress}.
+ */
+public class EntrySuggestion {
+ private final List<DictEntry> toEdit, toAdd;
+ private final List<String> extraGenerated;
+
+ EntrySuggestion(List<DictEntry> toEdit, List<DictEntry> toAdd, List<String> extraGenerated) {
+ this.toEdit = Collections.unmodifiableList(toEdit);
+ this.toAdd = Collections.unmodifiableList(toAdd);
+ this.extraGenerated = Collections.unmodifiableList(extraGenerated);
+ }
+
+ /**
+ * @return the existing dictionary entries whose flags would need changing to accommodate the
+ * given word list
+ */
+ public List<DictEntry> getEntriesToEdit() {
+ return toEdit;
+ }
+
+ /** @return new dictionary entries to be added to accommodate the given word list */
+ public List<DictEntry> getEntriesToAdd() {
+ return toAdd;
+ }
+
+ /**
+ * @return additional words generated by union of {@link #getEntriesToAdd()} and {@link
+ * #getEntriesToEdit()} which weren't in the given list of words
+ */
+ public List<String> getExtraGeneratedWords() {
+ return extraGenerated;
+ }
+
+ @Override
+ public String toString() {
+ return "EntrySuggestion{" + internalsToString() + '}';
+ }
+
+ String internalsToString() {
+ return "toEdit=" + toEdit + ", toAdd=" + toAdd + ", extra=" + extraGenerated;
+ }
+}
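For context, an illustrative sketch (not part of this commit) of consuming an EntrySuggestion, assuming a Hunspell instance `hunspell`; the word list is hypothetical:

    // "munch"-like: suggest dictionary entries that would generate the given words
    EntrySuggestion suggestion = hunspell.compress(List.of("create", "created", "creating"));
    if (suggestion != null) {                       // null when nothing could be generated
      suggestion.getEntriesToEdit().forEach(e -> System.out.println("edit: " + e));
      suggestion.getEntriesToAdd().forEach(e -> System.out.println("add:  " + e));
      // words the suggested entries would produce beyond the requested list
      System.out.println("extra: " + suggestion.getExtraGeneratedWords());
    }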
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index 90e35930442..4123bcc28e1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -178,7 +178,7 @@ public class Hunspell {
offset,
length,
context,
- (stem, formID, morphDataId) -> {
+ (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
if (checkCase && !acceptCase(originalCase, formID, stem)) {
return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
}
@@ -316,6 +316,52 @@ public class Hunspell {
.collect(Collectors.toList());
}
+ /**
+ * @return all possible analyses of the given word with stems, prefixes, suffixes and
+ * morphological data. Note that the order of the returned objects might not correspond to the
+ * *.dic file order!
+ */
+ public List<AffixedWord> analyzeSimpleWord(String word) {
+ List<AffixedWord> result = new ArrayList<>();
+ stemmer.analyze(
+ word.toCharArray(),
+ word.length(),
+ (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
+ List<AffixedWord.Affix> prefixes = new ArrayList<>();
+ List<AffixedWord.Affix> suffixes = new ArrayList<>();
+ if (outerPrefix >= 0) prefixes.add(new AffixedWord.Affix(dictionary, outerPrefix));
+ if (innerPrefix >= 0) prefixes.add(new AffixedWord.Affix(dictionary, innerPrefix));
+ if (outerSuffix >= 0) suffixes.add(new AffixedWord.Affix(dictionary, outerSuffix));
+ if (innerSuffix >= 0) suffixes.add(new AffixedWord.Affix(dictionary, innerSuffix));
+
+ DictEntry entry = dictionary.dictEntry(stem.toString(), formID, morphDataId);
+ result.add(new AffixedWord(word, entry, prefixes, suffixes));
+ return true;
+ });
+ return result;
+ }
+
+ /**
+ * Generate all word forms for all dictionary entries with the given root word. The result order
+ * is stable but not specified. This is equivalent to "unmunch" from the "hunspell-tools" package.
+ *
+ * @see WordFormGenerator for finer-grained APIs
+ */
+ public List<AffixedWord> getAllWordForms(String root) {
+ return new WordFormGenerator(dictionary).getAllWordForms(root, checkCanceled);
+ }
+
+ /**
+ * Given a list of words, try to produce a smaller set of dictionary entries (with some flags)
+ * that would generate these words. This is equivalent to "munch" from the "hunspell-tools"
+ * package.
+ *
+ * @see WordFormGenerator#compress for more details and control
+ */
+ public EntrySuggestion compress(List<String> words) {
+ return new WordFormGenerator(dictionary).compress(words, Set.of(), checkCanceled);
+ }
+
private class CompoundPart {
final CompoundPart prev;
final int index, length;
@@ -433,7 +479,7 @@ public class Hunspell {
words.add(ref);
Stemmer.RootProcessor stopOnMatching =
- (stem, formID, morphDataId) -> {
+ (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
ref.ints[0] = formID;
return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
};
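For orientation, an illustrative sketch (not part of this commit) tying together the three new Hunspell entry points added above; it assumes a Dictionary `dictionary` already loaded from *.aff/*.dic files, and the constructor call matches the one previously used in TestHunspell below:

    Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});

    // analysis introspection: stems plus the applied prefixes/suffixes
    List<AffixedWord> analyses = hunspell.analyzeSimpleWord("recreated");
    // stem expansion ("unmunch"): every form generated by entries with this root
    List<AffixedWord> forms = hunspell.getAllWordForms("create");
    // stem/flag suggestion ("munch"): a hypothetical word list to compress into entries
    EntrySuggestion suggestion = hunspell.compress(List.of("work", "works", "worked"));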
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 012f8bb6696..0e11d457c3c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.List;
-import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
@@ -65,7 +64,18 @@ final class Stemmer {
* @return List of stems for the word
*/
public List<CharsRef> stem(char[] word, int length) {
+ List<CharsRef> list = new ArrayList<>();
+ analyze(
+ word,
+ length,
+ (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
+ list.add(newStem(stem, morphDataId));
+ return true;
+ });
+ return list;
+ }
+ void analyze(char[] word, int length, RootProcessor processor) {
if (dictionary.mayNeedInputCleaning()) {
CharsRef scratchSegment = new CharsRef(word, 0, length);
if (dictionary.needsInputCleaning(scratchSegment)) {
@@ -77,19 +87,12 @@ final class Stemmer {
word = scratchBuffer;
}
}
-
- List<CharsRef> list = new ArrayList<>();
if (length == 0) {
- return list;
+ return;
}
- RootProcessor processor =
- (stem, formID, stemException) -> {
- list.add(newStem(stem, stemException));
- return true;
- };
if (!doStem(word, 0, length, WordContext.SIMPLE_WORD, processor)) {
- return list;
+ return;
}
WordCase wordCase = caseOf(word, length);
@@ -99,7 +102,6 @@ final class Stemmer {
doStem(variant, 0, varLength, WordContext.SIMPLE_WORD, processor);
varyCase(word, length, wordCase, variationProcessor);
}
- return list;
}
interface CaseVariationProcessor {
@@ -214,7 +216,7 @@ final class Stemmer {
if (result == null) return true;
String src = new String(word, 0, length);
- for (String s : result.collect(Collectors.toList())) {
+ for (String s : result.toList()) {
if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
return false;
}
@@ -239,13 +241,61 @@ final class Stemmer {
if (!isRootCompatibleWithContext(context, -1, entryId)) {
continue;
}
- if (!callProcessor(word, offset, length, processor, forms, i)) {
+ CharsRef charsRef = new CharsRef(word, offset, length);
+ if (!processor.processRoot(charsRef, entryId, morphDataId(forms, i), -1, -1, -1, -1)) {
return false;
}
}
}
- return stem(
- word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
+ StemCandidateProcessor stemProcessor =
+ new StemCandidateProcessor(context) {
+ @Override
+ boolean processStemCandidate(
+ char[] word,
+ int offset,
+ int length,
+ int lastAffix,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix) {
+ IntsRef forms = dictionary.lookupWord(word, offset, length);
+ if (forms == null) return true;
+
+ char flag = dictionary.affixData(lastAffix, Dictionary.AFFIX_FLAG);
+ int prefixId = innerPrefix >= 0 ? innerPrefix : outerPrefix;
+ for (int i = 0; i < forms.length; i += formStep) {
+ int entryId = forms.ints[forms.offset + i];
+ if (dictionary.hasFlag(entryId, flag)
+ || dictionary.isFlagAppendedByAffix(prefixId, flag)) {
+ if (innerPrefix < 0 && outerPrefix >= 0) {
+ char prefixFlag = dictionary.affixData(outerPrefix, Dictionary.AFFIX_FLAG);
+ if (!dictionary.hasFlag(entryId, prefixFlag)
+ && !dictionary.isFlagAppendedByAffix(lastAffix, prefixFlag)) {
+ continue;
+ }
+ }
+
+ if (!isRootCompatibleWithContext(context, lastAffix, entryId)) {
+ continue;
+ }
+
+ if (!processor.processRoot(
+ new CharsRef(word, offset, length),
+ entryId,
+ morphDataId(forms, i),
+ outerPrefix,
+ innerPrefix,
+ outerSuffix,
+ innerSuffix)) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+ };
+ return removeAffixes(word, offset, length, true, -1, -1, -1, stemProcessor);
}
/**
@@ -277,9 +327,20 @@ final class Stemmer {
* Dictionary#hasFlag(int, char)}
* @param morphDataId the id of the custom morphological data (0 if none), to be used with
* {@link Dictionary#morphData}
+ * @param outerPrefix the id of the outer prefix applied to the stem, or -1 if none
+ * @param innerPrefix the id of the inner prefix applied to the stem, or -1 if none
+ * @param outerSuffix the id of the outer suffix applied to the stem, or -1 if none
+ * @param innerSuffix the id of the inner suffix applied to the stem, or -1 if none
* @return whether the processing should be continued
*/
- boolean processRoot(CharsRef stem, int formID, int morphDataId);
+ boolean processRoot(
+ CharsRef stem,
+ int formID,
+ int morphDataId,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix);
}
private String stemException(int morphDataId) {
@@ -318,33 +379,23 @@ final class Stemmer {
}
/**
- * Generates a list of stems for the provided word
+ * Generates a list of stems for the provided word. It's called recursively when applying affixes
+ * one by one, setting {@code (inner/outer)(Suffix/Prefix)} parameters to non-negative values as
+ * that happens.
*
* @param word Word to generate the stems for
- * @param previous previous affix that was removed (so we dont remove same one twice)
- * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any
- * affixes in this recursive step
- * @param prefixId ID of the most inner removed prefix, so that when removing a suffix, it's also
- * checked against the word
- * @param recursionDepth current recursiondepth
* @param doPrefix true if we should remove prefixes
- * @param previousWasPrefix true if the previous removal was a prefix: if we are removing a
- * suffix, and it has no continuation requirements, it's ok. but two prefixes
- * (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @return whether the processing should be continued
*/
- private boolean stem(
+ boolean removeAffixes(
char[] word,
int offset,
int length,
- WordContext context,
- int previous,
- char prevFlag,
- int prefixId,
- int recursionDepth,
boolean doPrefix,
- boolean previousWasPrefix,
- RootProcessor processor) {
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ StemCandidateProcessor processor) {
FST.Arc<IntsRef> arc = new FST.Arc<>();
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
@@ -366,11 +417,11 @@ final class Stemmer {
for (int j = 0; j < prefixes.length; j++) {
int prefix = prefixes.ints[prefixes.offset + j];
- if (prefix == previous) {
+ if (prefix == outerPrefix) {
continue;
}
- if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
+ if (isAffixCompatible(prefix, true, outerPrefix, outerSuffix, processor.context)) {
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
if (strippedWord == null) {
continue;
@@ -381,12 +432,11 @@ final class Stemmer {
strippedWord,
pureAffix ? offset + i : 0,
pureAffix ? length - i : strippedWord.length,
- context,
prefix,
- previous,
- -1,
- recursionDepth,
true,
+ outerPrefix,
+ innerPrefix,
+ outerSuffix,
processor)) {
return false;
}
@@ -415,12 +465,11 @@ final class Stemmer {
for (int j = 0; j < suffixes.length; j++) {
int suffix = suffixes.ints[suffixes.offset + j];
- if (suffix == previous) {
+ if (suffix == outerSuffix) {
continue;
}
- if (isAffixCompatible(
- suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
+ if (isAffixCompatible(suffix, false, outerPrefix, outerSuffix, processor.context)) {
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
if (strippedWord == null) {
continue;
@@ -431,12 +480,11 @@ final class Stemmer {
strippedWord,
pureAffix ? offset : 0,
pureAffix ? i : strippedWord.length,
- context,
suffix,
- previous,
- prefixId,
- recursionDepth,
false,
+ outerPrefix,
+ innerPrefix,
+ outerSuffix,
processor)) {
return false;
}
@@ -487,14 +535,10 @@ final class Stemmer {
}
private boolean isAffixCompatible(
- int affix,
- char prevFlag,
- int recursionDepth,
- boolean isPrefix,
- boolean previousWasPrefix,
- WordContext context) {
+ int affix, boolean isPrefix, int outerPrefix, int outerSuffix, WordContext context) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
+ boolean previousWasPrefix = outerSuffix < 0 && outerPrefix >= 0;
if (context.isCompound()) {
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
return false;
@@ -513,79 +557,70 @@ final class Stemmer {
return false;
}
- if (recursionDepth == 0) {
+ if (outerPrefix == -1 && outerSuffix == -1) {
return true;
}
if (dictionary.isCrossProduct(affix)) {
- // cross check incoming continuation class (flag of previous affix) against list.
- return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
+ // cross-check incoming continuation class (flag of previous affix) against this affix's flags
+ if (previousWasPrefix) return true;
+ if (outerSuffix >= 0) {
+ char prevFlag = dictionary.affixData(outerSuffix, Dictionary.AFFIX_FLAG);
+ return dictionary.hasFlag(append, prevFlag);
+ }
}
return false;
}
/**
- * Applies the affix rule to the given word, producing a list of stems if any are found
+ * Applies the affix rule to the given word, producing a list of stems if any are found.
+ * Non-negative {@code (inner/outer)(Suffix/Prefix)} parameters indicate the already applied
+ * affixes.
*
- * @param strippedWord Char array containing the word with the affix removed and the strip added
+ * @param word Char array containing the word with the affix removed and the strip added
* @param offset where the word actually starts in the array
* @param length the length of the stripped word
- * @param affix HunspellAffix representing the affix rule itself
- * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
- * suffix, unless both are compatible so we must check dictionary form against both to add it
- * as a stem!
- * @param recursionDepth current recursion depth
+ * @param affix the id of the affix in {@link Dictionary#affixData}
* @param prefix true if we are removing a prefix (false if it's a suffix)
* @return whether the processing should be continued
*/
private boolean applyAffix(
- char[] strippedWord,
+ char[] word,
int offset,
int length,
- WordContext context,
int affix,
- int previousAffix,
- int prefixId,
- int recursionDepth,
boolean prefix,
- RootProcessor processor) {
- char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ StemCandidateProcessor processor) {
+ int prefixId = innerPrefix >= 0 ? innerPrefix : outerPrefix;
+ int previousAffix = outerSuffix >= 0 ? outerSuffix : prefixId;
+
+ int innerSuffix = -1;
+ if (prefix) {
+ if (outerPrefix < 0) outerPrefix = affix;
+ else innerPrefix = affix;
+ } else {
+ if (outerSuffix < 0) outerSuffix = affix;
+ else innerSuffix = affix;
+ }
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
- IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
- if (forms != null) {
- for (int i = 0; i < forms.length; i += formStep) {
- int entryId = forms.ints[forms.offset + i];
- if (dictionary.hasFlag(entryId, flag) || isFlagAppendedByAffix(prefixId, flag)) {
- // confusing: in this one exception, we already chained the first prefix against the
- // second,
- // so it doesnt need to be checked against the word
- boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
- if (!chainedPrefix && prefixId >= 0) {
- char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
- if (!dictionary.hasFlag(entryId, prefixFlag)
- && !isFlagAppendedByAffix(affix, prefixFlag)) {
- continue;
- }
- }
-
- if (!isRootCompatibleWithContext(context, affix, entryId)) {
- continue;
- }
-
- if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
- return false;
- }
- }
- }
+ if (!skipLookup
+ && !processor.processStemCandidate(
+ word, offset, length, affix, outerPrefix, innerPrefix, outerSuffix, innerSuffix)) {
+ return false;
}
+ int recursionDepth =
+ (outerSuffix >= 0 ? 1 : 0) + (innerPrefix >= 0 ? 2 : outerPrefix >= 0 ? 1 : 0) - 1;
if (dictionary.isCrossProduct(affix) && recursionDepth <= 1) {
+ char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
boolean doPrefix;
if (recursionDepth == 0) {
if (prefix) {
- prefixId = affix;
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStagePrefix(flag);
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
@@ -599,33 +634,42 @@ final class Stemmer {
return true;
}
} else {
- doPrefix = false;
if (prefix && dictionary.complexPrefixes) {
- prefixId = affix;
+ doPrefix = true;
// we took away the second prefix: go look for another suffix
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageSuffix(flag)) {
return true;
+ } else {
+ // we took away a prefix, then a suffix: go look for another suffix
+ doPrefix = false;
}
- // we took away a prefix, then a suffix: go look for another suffix
}
- return stem(
- strippedWord,
- offset,
- length,
- context,
- affix,
- flag,
- prefixId,
- recursionDepth + 1,
- doPrefix,
- prefix,
- processor);
+ return removeAffixes(
+ word, offset, length, doPrefix, outerPrefix, innerPrefix, outerSuffix, processor);
}
return true;
}
+ abstract static class StemCandidateProcessor {
+ private final WordContext context;
+
+ StemCandidateProcessor(WordContext context) {
+ this.context = context;
+ }
+
+ abstract boolean processStemCandidate(
+ char[] word,
+ int offset,
+ int length,
+ int lastAffix,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix);
+ }
+
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
return false;
@@ -633,39 +677,32 @@ final class Stemmer {
if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
char cFlag = context.requiredFlag(dictionary);
return dictionary.hasFlag(entryId, cFlag)
- || isFlagAppendedByAffix(lastAffix, cFlag)
+ || dictionary.isFlagAppendedByAffix(lastAffix, cFlag)
|| dictionary.hasFlag(entryId, dictionary.compoundFlag)
- || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
+ || dictionary.isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
}
return true;
}
- private boolean callProcessor(
- char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
- CharsRef stem = new CharsRef(word, offset, length);
- int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
- return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId);
+ private int morphDataId(IntsRef forms, int i) {
+ return dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
}
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
char circumfix = dictionary.circumfix;
// if circumfix was previously set by a prefix, we must check this suffix,
// to ensure it has it, and vice versa
- if (isSuffix
- && isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
- return true;
+ if (isSuffix) {
+ if (dictionary.isFlagAppendedByAffix(prefixId, circumfix)
+ != dictionary.isFlagAppendedByAffix(affix, circumfix)) {
+ return true;
+ }
}
- if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
+ if (dictionary.isFlagAppendedByAffix(affix, dictionary.needaffix)) {
return !isSuffix
|| previousAffix < 0
- || isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
+ || dictionary.isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
}
return false;
}
-
- private boolean isFlagAppendedByAffix(int affixId, char flag) {
- if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false;
- int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
- return dictionary.hasFlag(appendId, flag);
- }
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
new file mode 100644
index 00000000000..043df4fbb1d
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
@@ -0,0 +1,487 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
+import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
+import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.lucene.analysis.hunspell.AffixedWord.Affix;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.IntsRefFSTEnum;
+
+/**
+ * A utility class used for generating possible word forms by adding affixes to stems ({@link
+ * #getAllWordForms(String, String, Runnable)}), and suggesting stems and flags to generate the
+ * given set of words ({@link #compress(List, Set, Runnable)}).
+ */
+public class WordFormGenerator {
+ private final Dictionary dictionary;
+ private final Map<Character, List<AffixEntry>> affixes = new HashMap<>();
+ private final Stemmer stemmer;
+
+ public WordFormGenerator(Dictionary dictionary) {
+ this.dictionary = dictionary;
+ fillAffixMap(dictionary.prefixes, AffixKind.PREFIX);
+ fillAffixMap(dictionary.suffixes, AffixKind.SUFFIX);
+ stemmer = new Stemmer(dictionary);
+ }
+
+ private void fillAffixMap(FST<IntsRef> fst, AffixKind kind) {
+ if (fst == null) return;
+
+ IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(fst);
+ try {
+ while (true) {
+ IntsRefFSTEnum.InputOutput<IntsRef> io = fstEnum.next();
+ if (io == null) break;
+
+ IntsRef affixIds = io.output;
+ for (int j = 0; j < affixIds.length; j++) {
+ int id = affixIds.ints[affixIds.offset + j];
+ char flag = dictionary.affixData(id, AFFIX_FLAG);
+ var entry =
+ new AffixEntry(id, flag, kind, toString(kind, io.input), strip(id), condition(id));
+ affixes.computeIfAbsent(flag, __ -> new ArrayList<>()).add(entry);
+ }
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ }
+
+ private String toString(AffixKind kind, IntsRef input) {
+ char[] affixChars = new char[input.length];
+ for (int i = 0; i < affixChars.length; i++) {
+ affixChars[kind == AffixKind.PREFIX ? i : affixChars.length - i - 1] =
+ (char) input.ints[input.offset + i];
+ }
+ return new String(affixChars);
+ }
+
+ private AffixCondition condition(int affixId) {
+ int condition = dictionary.getAffixCondition(affixId);
+ return condition == 0 ? AffixCondition.ALWAYS_TRUE : dictionary.patterns.get(condition);
+ }
+
+ private String strip(int affixId) {
+ int stripOrd = dictionary.affixData(affixId, Dictionary.AFFIX_STRIP_ORD);
+ int stripStart = dictionary.stripOffsets[stripOrd];
+ int stripEnd = dictionary.stripOffsets[stripOrd + 1];
+ return new String(dictionary.stripData, stripStart, stripEnd - stripStart);
+ }
+
+ /**
+ * Generate all word forms for all dictionary entries with the given root word. The result order
+ * is stable but not specified. This is equivalent to "unmunch" from the "hunspell-tools" package.
+ *
+ * @param checkCanceled an object that's periodically called, allowing the generation to be
+ * interrupted by throwing an exception
+ */
+ public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
+ Set<AffixedWord> result = new LinkedHashSet<>();
+ DictEntries entries = dictionary.lookupEntries(root);
+ if (entries != null) {
+ for (DictEntry entry : entries) {
+ result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
+ }
+ }
+ return new ArrayList<>(result);
+ }
+
+ /**
+ * Generate all word forms for the given root pretending it has the given flags (in the same
+ * format as the dictionary uses). The result order is stable but not specified. This is
+ * equivalent to "unmunch" from the "hunspell-tools" package.
+ *
+ * @param checkCanceled an object that's periodically called, allowing the generation to be
+ * interrupted by throwing an exception
+ */
+ public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
+ var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags));
+ if (!shouldConsiderAtAll(encodedFlags)) return List.of();
+
+ LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
+ AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
+ checkCanceled.run();
+ if (!encodedFlags.contains(dictionary.needaffix)) {
+ result.add(bare);
+ }
+ result.addAll(expand(bare, encodedFlags, checkCanceled));
+ return new ArrayList<>(result);
+ }
+
+ private boolean canStemToOriginal(AffixedWord derived) {
+ String word = derived.getWord();
+ char[] chars = word.toCharArray();
+ if (isForbiddenWord(chars, 0, chars.length)) {
+ return false;
+ }
+
+ String stem = derived.getDictEntry().getStem();
+ var processor =
+ new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
+ boolean foundStem = false;
+ boolean foundForbidden = false;
+
+ @Override
+ boolean processStemCandidate(
+ char[] chars,
+ int offset,
+ int length,
+ int lastAffix,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix) {
+ if (isForbiddenWord(chars, offset, length)) {
+ foundForbidden = true;
+ return false;
+ }
+ foundStem |= length == stem.length() && stem.equals(new String(chars, offset, length));
+ return !foundStem;
+ }
+ };
+ stemmer.removeAffixes(chars, 0, chars.length, true, -1, -1, -1, processor);
+ return processor.foundStem && !processor.foundForbidden;
+ }
+
+ private boolean isForbiddenWord(char[] chars, int offset, int length) {
+ if (dictionary.forbiddenword != FLAG_UNSET) {
+ IntsRef forms = dictionary.lookupWord(chars, offset, length);
+ if (forms != null) {
+ for (int i = 0; i < forms.length; i += dictionary.formStep()) {
+ if (dictionary.hasFlag(forms.ints[forms.offset + i], dictionary.forbiddenword)) {
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+ private static LinkedHashSet<Character> toSet(char[] flags) {
+ LinkedHashSet<Character> set = new LinkedHashSet<>();
+ for (char c : flags) {
+ set.add(c);
+ }
+ return set;
+ }
+
+ private LinkedHashSet<AffixedWord> expand(
+ AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
+ LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
+ for (Character flag : flags) {
+ List<AffixEntry> entries = affixes.get(flag);
+ if (entries == null) continue;
+
+ for (AffixEntry affix : entries) {
+ checkCanceled.run();
+ AffixedWord derived = affix.apply(stem, dictionary);
+ if (derived != null) {
+ LinkedHashSet<Character> append = appendFlags(affix);
+ if (shouldConsiderAtAll(append)) {
+ if (canStemToOriginal(derived)) {
+ result.add(derived);
+ }
+ if (dictionary.isCrossProduct(affix.id)) {
+ result.addAll(expand(derived, updateFlags(flags, flag, append), checkCanceled));
+ }
+ }
+ }
+ }
+ }
+ return result;
+ }
+
+ private boolean shouldConsiderAtAll(Set<Character> flags) {
+ return !flags.contains(dictionary.compoundBegin)
+ && !flags.contains(dictionary.compoundMiddle)
+ && !flags.contains(dictionary.compoundEnd)
+ && !flags.contains(dictionary.forbiddenword)
+ && !flags.contains(dictionary.onlyincompound);
+ }
+
+ private LinkedHashSet<Character> updateFlags(
+ Set<Character> flags, Character toRemove, Set<Character> toAppend) {
+ LinkedHashSet<Character> copy = new LinkedHashSet<>(flags);
+ copy.remove(toRemove);
+ copy.addAll(toAppend);
+ return copy;
+ }
+
+ private LinkedHashSet<Character> appendFlags(AffixEntry affix) {
+ char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
+ return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId));
+ }
+
+ /**
+ * Given a list of words, try to produce a smaller set of dictionary entries (with some flags)
+ * that would generate these words. This is equivalent to "munch" from the "hunspell-tools"
+ * package. The algorithm tries to minimize the number of dictionary entries to add or change,
+ * the number of flags involved, and the number of non-requested additionally generated words. All
+ * the mentioned words are in the dictionary format and case: no ICONV/OCONV/IGNORE conversions
+ * are applied.
+ *
+ * @param words the list of words to generate
+ * @param forbidden the set of words to avoid generating
+ * @param checkCanceled an object that's periodically called, allowing the generation to be
+ * interrupted by throwing an exception
+ * @return the information about suggested dictionary entries and overgenerated words, or {@code
+ * null} if the algorithm couldn't generate anything
+ */
+ public EntrySuggestion compress(
+ List<String> words, Set<String> forbidden, Runnable checkCanceled) {
+ if (words.isEmpty()) return null;
+ if (words.stream().anyMatch(forbidden::contains)) {
+ throw new IllegalArgumentException("'words' and 'forbidden' shouldn't intersect");
+ }
+
+ return new WordCompressor(words, forbidden, checkCanceled).compress();
+ }
+
+ private record AffixEntry(
+ int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
+ AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
+ if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null;
+
+ String word = stem.getWord();
+ boolean isPrefix = kind == AffixKind.PREFIX;
+ if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
+
+ String stripped =
+ isPrefix
+ ? word.substring(strip.length())
+ : word.substring(0, word.length() - strip.length());
+ if (!condition.acceptsStem(stripped)) return null;
+
+ String applied = isPrefix ? affix + stripped : stripped + affix;
+ List<Affix> prefixes = new ArrayList<>(stem.getPrefixes());
+ List<Affix> suffixes = new ArrayList<>(stem.getSuffixes());
+ (isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
+ return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
+ }
+
+ private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) {
+ boolean isPrefix = kind == AffixKind.PREFIX;
+ List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
+ if (sameAffixes.size() == 2) return false;
+ if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false;
+ if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
+ if (sameAffixes.size() == 1
+ && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
+ return false;
+ }
+ return true;
+ }
+ }
+
+ private class WordCompressor {
+ private final Comparator<State> solutionFitness =
+ Comparator.comparingInt((State s) -> s.forbidden)
+ .thenComparingInt(s -> s.underGenerated)
+ .thenComparingInt(s -> s.stemToFlags.size())
+ .thenComparingInt(s -> s.overGenerated);
+ private final Set<String> forbidden;
+ private final Runnable checkCanceled;
+ private final Set<String> wordSet;
+ private final Set<String> existingStems;
+ private final Map<String, Set<FlagSet>> stemToPossibleFlags = new HashMap<>();
+ private final Map<String, Integer> stemCounts = new LinkedHashMap<>();
+
+ WordCompressor(List<String> words, Set<String> forbidden, Runnable checkCanceled) {
+ this.forbidden = forbidden;
+ this.checkCanceled = checkCanceled;
+ wordSet = new HashSet<>(words);
+
+ Stemmer.StemCandidateProcessor processor =
+ new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
+ @Override
+ boolean processStemCandidate(
+ char[] word,
+ int offset,
+ int length,
+ int lastAffix,
+ int outerPrefix,
+ int innerPrefix,
+ int outerSuffix,
+ int innerSuffix) {
+ String candidate = new String(word, offset, length);
+ stemCounts.merge(candidate, 1, Integer::sum);
+ Set<Character> flags = new LinkedHashSet<>();
+ if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
+ if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
+ if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
+ if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG));
+ stemToPossibleFlags
+ .computeIfAbsent(candidate, __ -> new LinkedHashSet<>())
+ .add(new FlagSet(flags, dictionary));
+ return true;
+ }
+ };
+
+ for (String word : words) {
+ checkCanceled.run();
+ stemCounts.merge(word, 1, Integer::sum);
+ stemToPossibleFlags.computeIfAbsent(word, __ -> new LinkedHashSet<>());
+ stemmer.removeAffixes(word.toCharArray(), 0, word.length(), true, -1, -1, -1, processor);
+ }
+
+ existingStems =
+ stemCounts.keySet().stream()
+ .filter(stem -> dictionary.lookupEntries(stem) != null)
+ .collect(Collectors.toSet());
+ }
+
+ EntrySuggestion compress() {
+ Comparator<String> stemSorter =
+ Comparator.comparing((String s) -> existingStems.contains(s))
+ .thenComparing(stemCounts::get)
+ .reversed();
+ List<String> sortedStems = stemCounts.keySet().stream().sorted(stemSorter).toList();
+ PriorityQueue<State> queue = new PriorityQueue<>(solutionFitness);
+ queue.offer(new State(Map.of(), wordSet.size(), 0, 0));
+ State result = null;
+ while (!queue.isEmpty()) {
+ State state = queue.poll();
+ if (state.underGenerated == 0) {
+ if (result == null || solutionFitness.compare(state, result) < 0) result = state;
+ if (state.forbidden == 0) break;
+ continue;
+ }
+
+ for (String stem : sortedStems) {
+ if (!state.stemToFlags.containsKey(stem)) {
+ queue.offer(addStem(state, stem));
+ }
+ }
+
+ for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
+ for (FlagSet flags : stemToPossibleFlags.get(entry.getKey())) {
+ if (!entry.getValue().contains(flags)) {
+ queue.offer(addFlags(state, entry.getKey(), flags));
+ }
+ }
+ }
+ }
+ return result == null ? null : toSuggestion(result);
+ }
+
+ EntrySuggestion toSuggestion(State state) {
+ List<DictEntry> toEdit = new ArrayList<>();
+ List<DictEntry> toAdd = new ArrayList<>();
+ for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
+ addEntry(toEdit, toAdd, entry.getKey(), FlagSet.flatten(entry.getValue()));
+ }
+
+ List<String> extraGenerated = new ArrayList<>();
+ for (String extra : allGenerated(state.stemToFlags).distinct().sorted().toList()) {
+ if (wordSet.contains(extra)) continue;
+
+ if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) {
+ addEntry(toEdit, toAdd, extra, Set.of(dictionary.forbiddenword));
+ } else {
+ extraGenerated.add(extra);
+ }
+ }
+
+ return new EntrySuggestion(toEdit, toAdd, extraGenerated);
+ }
+
+ private void addEntry(
+ List<DictEntry> toEdit, List<DictEntry> toAdd, String stem, Set<Character> flags) {
+ String flagString = toFlagString(flags);
+ (existingStems.contains(stem) ? toEdit : toAdd).add(DictEntry.create(stem, flagString));
+ }
+
+ private State addStem(State state, String stem) {
+ LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
+ stemToFlags.put(stem, Set.of());
+ return newState(stemToFlags);
+ }
+
+ private State addFlags(State state, String stem, FlagSet flags) {
+ LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
+ Set<FlagSet> flagSets = new LinkedHashSet<>(stemToFlags.get(stem));
+ flagSets.add(flags);
+ stemToFlags.put(stem, flagSets);
+ return newState(stemToFlags);
+ }
+
+ private State newState(Map<String, Set<FlagSet>> stemToFlags) {
+ Set<String> allGenerated = allGenerated(stemToFlags).collect(Collectors.toSet());
+ return new State(
+ stemToFlags,
+ (int) wordSet.stream().filter(s -> !allGenerated.contains(s)).count(),
+ (int) allGenerated.stream().filter(s -> !wordSet.contains(s)).count(),
+ (int) allGenerated.stream().filter(s -> forbidden.contains(s)).count());
+ }
+
+ private final Map<StemWithFlags, List<String>> expansionCache = new HashMap<>();
+
+ private record StemWithFlags(String stem, Set<FlagSet> flags) {}
+
+ private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
+ Function<StemWithFlags, List<String>> expandToWords =
+ e -> expand(e.stem, FlagSet.flatten(e.flags)).stream().map(w -> w.getWord()).toList();
+ return stemToFlags.entrySet().stream()
+ .map(e -> new StemWithFlags(e.getKey(), e.getValue()))
+ .flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream());
+ }
+
+ private List<AffixedWord> expand(String stem, Set<Character> flagSet) {
+ return getAllWordForms(stem, toFlagString(flagSet), checkCanceled);
+ }
+
+ private String toFlagString(Set<Character> flagSet) {
+ return dictionary.flagParsingStrategy.printFlags(Dictionary.toSortedCharArray(flagSet));
+ }
+ }
+
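+ // A set of affix flags; the dictionary is kept around to print the flags in its flag format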
+ private record FlagSet(Set<Character> flags, Dictionary dictionary) {
+ static Set<Character> flatten(Set<FlagSet> flagSets) {
+ return flagSets.stream().flatMap(f -> f.flags.stream()).collect(Collectors.toSet());
+ }
+
+ @Override
+ public String toString() {
+ return dictionary.flagParsingStrategy.printFlags(Dictionary.toSortedCharArray(flags));
+ }
+ }
+
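+ // A search state: the chosen stems with their flag sets, plus the counts of requested words
+ // not yet generated, of generated words outside the requested set, and of forbidden words produced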
+ private record State(
+ Map<String, Set<FlagSet>> stemToFlags,
+ int underGenerated,
+ int overGenerated,
+ int forbidden) {}
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index 3ba00941109..b18b2a4015e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -28,6 +28,7 @@ import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
@@ -275,7 +276,9 @@ public class TestDictionary extends LuceneTestCase {
DictEntries simpleNoun = dic.lookupEntries("simplenoun");
assertEquals(1, simpleNoun.size());
assertEquals(Collections.emptyList(), simpleNoun.getMorphologicalValues(0, "aa:"));
- assertEquals(Collections.singletonList("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
+ assertEquals(List.of("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
+ assertEquals(List.of("42"), simpleNoun.get(0).getMorphologicalValues("fr:"));
+ assertEquals("A", simpleNoun.get(0).getFlags());
DictEntries lay = dic.lookupEntries("lay");
String actual =
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
index 3da1f114a16..9524f682546 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
@@ -24,8 +24,13 @@ import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
import java.io.IOException;
import java.text.ParseException;
import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;
@@ -72,9 +77,134 @@ public class TestHunspell extends LuceneTestCase {
@Test
public void testStemmingApi() throws Exception {
- Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
- Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
+ Hunspell hunspell = loadNoTimeout("simple");
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
}
+
+ @Test
+ public void testAnalysisApi() throws Exception {
+ Hunspell hunspell = loadNoTimeout("base");
+ assertEquals(hunspell.analyzeSimpleWord("nonexistent"), List.of());
+ AffixedWord word = hunspell.analyzeSimpleWord("recreated").get(0);
+ checkAffixedWord(word, "create", List.of("A"), List.of("D"));
+ }
+
+ @Test
+ public void testAnalysisSeveralSuffixes() throws Exception {
+ Hunspell hunspell = loadNoTimeout("needaffix5");
+ AffixedWord word = hunspell.analyzeSimpleWord("pseudoprefoopseudosufbar").get(0);
+ checkAffixedWord(word, "foo", List.of("C"), List.of("B", "A"));
+ }
+
+ @Test
+ public void testAnalysisFlagLong() throws Exception {
+ AffixedWord word = loadNoTimeout("flaglong").analyzeSimpleWord("foos").get(0);
+ checkAffixedWord(word, "foo", List.of(), List.of("Y1"));
+ }
+
+ @Test
+ public void testAnalysisFlagNum() throws Exception {
+ AffixedWord word = loadNoTimeout("flagnum").analyzeSimpleWord("foos").get(0);
+ checkAffixedWord(word, "foo", List.of(), List.of("65000"));
+ }
+
+ @Test
+ public void testAnalysisMorphData() throws Exception {
+ List<AffixedWord> words = loadNoTimeout("morphdata").analyzeSimpleWord("works");
+ assertEquals(2, words.size());
+ AffixedWord verb =
+ words.get(words.get(0).getDictEntry().getMorphologicalData().contains("verb") ? 0 : 1);
+ AffixedWord noun = words.get(words.get(0) != verb ? 0 : 1);
+ assertNotNull(verb);
+ assertNotNull(noun);
+ checkAffixedWord(verb, "work", List.of(), List.of("A"));
+ checkAffixedWord(noun, "work", List.of(), List.of("B"));
+
+ assertEquals(List.of("worknoun"), noun.getDictEntry().getMorphologicalValues("st:"));
+ assertEquals(List.of("workverb"), verb.getDictEntry().getMorphologicalValues("st:"));
+ assertEquals("st:worknoun", noun.getDictEntry().getMorphologicalData());
+ assertEquals("st:workverb", verb.getDictEntry().getMorphologicalData());
+ }
+
+ private void checkAffixedWord(
+ AffixedWord word, String stem, List<String> prefixFlags, List<String> suffixFlags) {
+ assertEquals(stem, word.getDictEntry().getStem());
+ assertEquals(prefixFlags, word.getPrefixes().stream().map(AffixedWord.Affix::getFlag).toList());
+ assertEquals(suffixFlags, word.getSuffixes().stream().map(AffixedWord.Affix::getFlag).toList());
+ }
+
+ private Hunspell loadNoTimeout(String name) throws Exception {
+ Dictionary dictionary = loadDictionary(false, name + ".aff", name + ".dic");
+ return new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
+ }
+
+ @Test
+ public void testExpandRootApi() throws Exception {
+ Hunspell h = loadNoTimeout("base");
+ String[] createFormsBase = {
+ "create", "created", "creates", "creating", "creation", "creations"
+ };
+ List<String> expected =
+ Stream.concat(
+ Stream.of(createFormsBase).flatMap(s -> Stream.of(s, "pro" + s, "re" + s)),
+ Stream.of("creative"))
+ .sorted()
+ .toList();
+
+ Map<String, AffixedWord> expanded =
+ TestSpellChecking.checkExpansionGeneratesCorrectWords(h, "create", "base").stream()
+ .collect(Collectors.toMap(w -> w.getWord(), w -> w));
+ assertEquals(expected, expanded.keySet().stream().sorted().toList());
+
+ checkAffixedWord(expanded.get("created"), "create", List.of(), List.of("D"));
+ checkAffixedWord(expanded.get("recreated"), "create", List.of("A"), List.of("D"));
+
+ WordFormGenerator generator = new WordFormGenerator(h.dictionary);
+ List<AffixedWord> overrideFlag = generator.getAllWordForms("create", "U", () -> {});
+ assertEquals(
+ Set.of("create", "uncreate"),
+ overrideFlag.stream().map(w -> w.getWord()).collect(Collectors.toSet()));
+
+ List<AffixedWord> nonExistentRoot = generator.getAllWordForms("form", "S", () -> {});
+ assertEquals(
+ Set.of("form", "forms"),
+ nonExistentRoot.stream().map(w -> w.getWord()).collect(Collectors.toSet()));
+ }
+
+ @Test
+ public void testCompressingApi() throws Exception {
+ Hunspell h = loadNoTimeout("base");
+ String[] createQuery = {"create", "created", "creates", "creating", "creation"};
+ checkCompression(h, "toEdit=[create/DGNS], toAdd=[], extra=[]", createQuery);
+ checkCompression(h, "toEdit=[created], toAdd=[creates], extra=[]", "creates", "created");
+ checkCompression(h, "toEdit=[], toAdd=[creation/S], extra=[]", "creation", "creations");
+ checkCompression(h, "toEdit=[], toAdd=[abc, def], extra=[]", "abc", "def");
+ checkCompression(h, "toEdit=[], toAdd=[form/S], extra=[]", "form", "forms");
+
+ checkCompression(
+ loadNoTimeout("compress"), "toEdit=[], toAdd=[form/X], extra=[forms]", "form", "formx");
+ }
+
+ @Test
+ public void testCompressingIsMinimal() throws Exception {
+ Hunspell h = loadNoTimeout("compress");
+ checkCompression(
+ h, "toEdit=[], toAdd=[form/GS], extra=[]", "formings", "forming", "form", "forms");
+ }
+
+ @Test
+ public void testCompressingWithProhibition() throws Exception {
+ WordFormGenerator gen = new WordFormGenerator(loadNoTimeout("compress").dictionary);
+ assertEquals(
+ "toEdit=[], toAdd=[form/S], extra=[]",
+ gen.compress(List.of("form", "forms"), Set.of("formx"), () -> {}).internalsToString());
+ assertEquals(
+ "toEdit=[], toAdd=[form, formx], extra=[]",
+ gen.compress(List.of("form", "formx"), Set.of("forms"), () -> {}).internalsToString());
+ }
+
+ private void checkCompression(Hunspell h, String expected, String... words) {
+ assertEquals(expected, h.compress(List.of(words)).internalsToString());
+ }
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
index 15e452ce133..21927aa231a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@@ -21,8 +21,12 @@ import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
+import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
+import java.util.Set;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.IOUtils;
@@ -97,6 +101,10 @@ public class TestSpellChecking extends LuceneTestCase {
doTest("compoundflag");
}
+ public void testFlagUtf8() throws Exception {
+ doTest("flagutf8");
+ }
+
public void testCheckCompoundCase() throws Exception {
doTest("checkcompoundcase");
}
@@ -230,13 +238,15 @@ public class TestSpellChecking extends LuceneTestCase {
}
protected void doTest(String name) throws Exception {
+ //noinspection ConstantConditions
checkSpellCheckerExpectations(
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name));
}
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
- InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));
+ Path dicFile = Path.of(basePath + ".dic");
+ InputStream dictStream = Files.newInputStream(dicFile);
Hunspell speller;
try {
@@ -273,5 +283,80 @@ public class TestSpellChecking extends LuceneTestCase {
} else {
assertFalse(".sug file without .wrong file!", Files.exists(sug));
}
+
+ Set<String> everythingGenerated = expandWholeDictionary(dicFile, speller);
+ if (everythingGenerated != null && !speller.dictionary.mayNeedInputCleaning()) {
+ checkGoodSugWordsAreGenerated(speller, good, sug, everythingGenerated);
+ }
+ }
+
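+ // Expand every entry of the .dic file and collect all generated forms,
+ // or return null if some entries had to be skipped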
+ private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
+ Set<String> everythingGenerated = new HashSet<>();
+ boolean generatedEverything = true;
+ try (Stream<String> lines = Files.lines(dic, speller.dictionary.decoder.charset())) {
+ for (String line : lines.skip(1).toList()) {
+ int len = (int) line.chars().takeWhile(c -> !Character.isWhitespace(c) && c != '/').count();
+ String word = line.substring(0, len).trim();
+ if (word.isEmpty() || word.contains("\\")) {
+ generatedEverything = false;
+ continue;
+ }
+
+ List<AffixedWord> expanded =
+ checkExpansionGeneratesCorrectWords(speller, word, dic.toString());
+ expanded.forEach(w -> everythingGenerated.add(w.getWord().toLowerCase(Locale.ROOT)));
+ }
+ }
+ return generatedEverything ? everythingGenerated : null;
+ }
+
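+ // Check that all lowercase words from the *.good/*.sug files appear among the generated forms
+ // (words the speller cannot even analyze are ignored)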
+ private static void checkGoodSugWordsAreGenerated(
+ Hunspell speller, Path good, Path sug, Set<String> everythingGenerated) throws IOException {
+ Set<String> goodWords = new HashSet<>();
+ if (Files.exists(good)) {
+ Files.readAllLines(good).stream().map(String::trim).forEach(goodWords::add);
+ }
+ if (Files.exists(sug)) {
+ Files.readAllLines(sug).stream()
+ .flatMap(line -> Stream.of(line.split(", ")))
+ .map(String::trim)
+ .filter(s -> !s.contains(" "))
+ .forEach(goodWords::add);
+ }
+
+ goodWords.removeAll(everythingGenerated);
+ goodWords.removeIf(s -> !s.equals(s.toLowerCase(Locale.ROOT)));
+ goodWords.removeIf(s -> speller.analyzeSimpleWord(s).isEmpty());
+
+ assertTrue("Some *.good/sug words weren't generated: " + goodWords, goodWords.isEmpty());
+ }
+
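+ // Expand the stem, assert that every generated form is spelled correctly,
+ // and that compressing the forms back suggests the original stem among the entries to edit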
+ static List<AffixedWord> checkExpansionGeneratesCorrectWords(
+ Hunspell hunspell, String stem, String baseName) {
+ List<AffixedWord> expanded = hunspell.getAllWordForms(stem);
+ Set<AffixedWord> misspelled = new HashSet<>();
+ for (AffixedWord word : expanded) {
+ if (!hunspell.spell(word.getWord()) || hunspell.analyzeSimpleWord(word.getWord()).isEmpty()) {
+ misspelled.add(word);
+ }
+ }
+ if (!misspelled.isEmpty()) {
+ fail("Misspelled words generated in " + baseName + ": " + misspelled);
+ }
+
+ if (expanded.stream().anyMatch(e -> e.getWord().equals(stem))) {
+ EntrySuggestion suggestion =
+ hunspell.compress(expanded.stream().map(AffixedWord::getWord).toList());
+ if (suggestion != null) {
+ String message =
+ ("Compression suggests a different stem from the original " + stem)
+ + (" in " + baseName + ":" + suggestion);
+ assertTrue(
+ message,
+ suggestion.getEntriesToEdit().stream().anyMatch(e -> e.getStem().equals(stem)));
+ }
+ }
+
+ return expanded;
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff
new file mode 100644
index 00000000000..70642d81ebf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.aff
@@ -0,0 +1,14 @@
+FORBIDDENWORD *
+
+SFX G Y 1
+SFX G 0 ing/S .
+
+SFX J Y 1
+SFX J 0 ings .
+
+SFX S Y 1
+SFX S 0 s .
+
+SFX X Y 2
+SFX X 0 s .
+SFX X 0 x .
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.dic
new file mode 100644
index 00000000000..262348bee18
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compress.dic
@@ -0,0 +1,2 @@
+1
+word
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.aff
new file mode 100644
index 00000000000..d0f75c18580
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.aff
@@ -0,0 +1,15 @@
+# UTF-8 flags
+FLAG UTF-8
+
+SFX A Y 1
+SFX A 0 s/ÖüÜ .
+#SFX A 0 s/ÖüÖÜ .
+
+SFX Ö Y 1
+SFX Ö 0 bar .
+
+SFX ü Y 1
+SFX ü 0 baz .
+
+PFX Ü Y 1
+PFX Ü 0 un .
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.dic
new file mode 100644
index 00000000000..2944490c901
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.dic
@@ -0,0 +1,2 @@
+1
+foo/AÜ
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.good
new file mode 100644
index 00000000000..d5c27b1a677
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/flagutf8.good
@@ -0,0 +1,8 @@
+foo
+foos
+foosbar
+foosbaz
+unfoo
+unfoos
+unfoosbar
+unfoosbaz
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff
index de7f8ad9a42..a01d19d3502 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff
@@ -9,3 +9,6 @@ COMPOUNDFLAG Y
SFX A Y 1
SFX A 0 s .
+
+SFX s N 1
+SFX s 0 os .
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
index b012cc8a5a0..c5c19307b33 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic
@@ -1,4 +1,4 @@
-11
+14
foo/S
foo/YX
bar/YS
@@ -10,4 +10,6 @@ KG/X
cm
Cm/X
SIPS/X
-Sip/A
\ No newline at end of file
+Sip/A
+iPod/s
+iPodos/X
\ No newline at end of file