You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2023/01/13 11:48:58 UTC
[lucene] 05/09: [hunspell] speed up WordFormGenerator (#11904)
This is an automated email from the ASF dual-hosted git repository.
donnerpeter pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit b9ff38e37d9f5ddf39c467a0a49f684d87966425
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Nov 7 19:41:17 2022 +0100
[hunspell] speed up WordFormGenerator (#11904)
---
.../lucene/analysis/hunspell/AffixCondition.java | 26 ++++-
.../lucene/analysis/hunspell/FlagEnumerator.java | 21 ++--
.../analysis/hunspell/WordFormGenerator.java | 125 +++++++++++++--------
3 files changed, 112 insertions(+), 60 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
index 25cb2933990..6a00d925c62 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
@@ -30,8 +30,30 @@ import org.apache.lucene.util.automaton.RegExp;
*/
interface AffixCondition {
String ALWAYS_TRUE_KEY = ".*";
- AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;
- AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;
+ AffixCondition ALWAYS_TRUE =
+ new AffixCondition() {
+ @Override
+ public boolean acceptsStem(String stem) {
+ return true;
+ }
+
+ @Override
+ public boolean acceptsStem(char[] word, int offset, int length) {
+ return true;
+ }
+ };
+ AffixCondition ALWAYS_FALSE =
+ new AffixCondition() {
+ @Override
+ public boolean acceptsStem(String stem) {
+ return false;
+ }
+
+ @Override
+ public boolean acceptsStem(char[] word, int offset, int length) {
+ return false;
+ }
+ };
default boolean acceptsStem(String stem) {
return acceptsStem(stem.toCharArray(), 0, stem.length());
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
index bb31b5ba3da..a7cd45c4a14 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
@@ -60,6 +60,17 @@ class FlagEnumerator {
return new Lookup(result);
}
+ static boolean hasFlagInSortedArray(char flag, char[] array, int start, int length) {
+ if (flag == Dictionary.FLAG_UNSET) return false;
+
+ for (int i = start; i < start + length; i++) {
+ char c = array[i];
+ if (c == flag) return true;
+ if (c > flag) return false;
+ }
+ return false;
+ }
+
static class Lookup {
private final char[] data;
@@ -68,15 +79,7 @@ class FlagEnumerator {
}
boolean hasFlag(int entryId, char flag) {
- if (entryId < 0 || flag == Dictionary.FLAG_UNSET) return false;
-
- int length = data[entryId];
- for (int i = entryId + 1; i < entryId + 1 + length; i++) {
- char c = data[i];
- if (c == flag) return true;
- if (c > flag) return false;
- }
- return false;
+ return entryId >= 0 && hasFlagInSortedArray(flag, data, entryId + 1, data[entryId]);
}
boolean hasAnyFlag(int entryId, char[] sortedFlags) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
index 043df4fbb1d..bc0531edb89 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
@@ -19,10 +19,12 @@ package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
+import static org.apache.lucene.analysis.hunspell.Dictionary.toSortedCharArray;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
@@ -109,14 +111,14 @@ public class WordFormGenerator {
* by throwing an exception
*/
public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
- Set<AffixedWord> result = new LinkedHashSet<>();
+ List<AffixedWord> result = new ArrayList<>();
DictEntries entries = dictionary.lookupEntries(root);
if (entries != null) {
for (DictEntry entry : entries) {
result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
}
}
- return new ArrayList<>(result);
+ return result;
}
/**
@@ -128,20 +130,40 @@ public class WordFormGenerator {
* by throwing an exception
*/
public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
- var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags));
+ var encodedFlags = dictionary.flagParsingStrategy.parseUtfFlags(flags);
if (!shouldConsiderAtAll(encodedFlags)) return List.of();
- LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
+ encodedFlags = sortAndDeduplicate(encodedFlags);
+ List<AffixedWord> result = new ArrayList<>();
AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
checkCanceled.run();
- if (!encodedFlags.contains(dictionary.needaffix)) {
+ if (!FlagEnumerator.hasFlagInSortedArray(
+ dictionary.needaffix, encodedFlags, 0, encodedFlags.length)) {
result.add(bare);
}
result.addAll(expand(bare, encodedFlags, checkCanceled));
- return new ArrayList<>(result);
+ return result;
+ }
+
+ private static char[] sortAndDeduplicate(char[] flags) {
+ Arrays.sort(flags);
+ for (int i = 1; i < flags.length; i++) {
+ if (flags[i] == flags[i - 1]) {
+ return deduplicate(flags);
+ }
+ }
+ return flags;
+ }
+
+ private static char[] deduplicate(char[] flags) {
+ Set<Character> set = new HashSet<>();
+ for (char flag : flags) {
+ set.add(flag);
+ }
+ return toSortedCharArray(set);
}
- private boolean canStemToOriginal(AffixedWord derived) {
+ protected boolean canStemToOriginal(AffixedWord derived) {
String word = derived.getWord();
char[] chars = word.toCharArray();
if (isForbiddenWord(chars, 0, chars.length)) {
@@ -190,26 +212,20 @@ public class WordFormGenerator {
return false;
}
- private static LinkedHashSet<Character> toSet(char[] flags) {
- LinkedHashSet<Character> set = new LinkedHashSet<>();
- for (char c : flags) {
- set.add(c);
- }
- return set;
- }
-
- private LinkedHashSet<AffixedWord> expand(
- AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
- LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
- for (Character flag : flags) {
+ private List<AffixedWord> expand(AffixedWord stem, char[] flags, Runnable checkCanceled) {
+ List<AffixedWord> result = new ArrayList<>();
+ for (char flag : flags) {
List<AffixEntry> entries = affixes.get(flag);
if (entries == null) continue;
+ AffixKind kind = entries.get(0).kind;
+ if (!isCompatibleWithPreviousAffixes(stem, kind, flag)) continue;
+
for (AffixEntry affix : entries) {
checkCanceled.run();
AffixedWord derived = affix.apply(stem, dictionary);
if (derived != null) {
- LinkedHashSet<Character> append = appendFlags(affix);
+ char[] append = appendFlags(affix);
if (shouldConsiderAtAll(append)) {
if (canStemToOriginal(derived)) {
result.add(derived);
@@ -224,25 +240,37 @@ public class WordFormGenerator {
return result;
}
- private boolean shouldConsiderAtAll(Set<Character> flags) {
- return !flags.contains(dictionary.compoundBegin)
- && !flags.contains(dictionary.compoundMiddle)
- && !flags.contains(dictionary.compoundEnd)
- && !flags.contains(dictionary.forbiddenword)
- && !flags.contains(dictionary.onlyincompound);
+ private boolean shouldConsiderAtAll(char[] flags) {
+ for (char flag : flags) {
+ if (flag == dictionary.compoundBegin
+ || flag == dictionary.compoundMiddle
+ || flag == dictionary.compoundEnd
+ || flag == dictionary.forbiddenword
+ || flag == dictionary.onlyincompound) {
+ return false;
+ }
+ }
+
+ return true;
}
- private LinkedHashSet<Character> updateFlags(
- Set<Character> flags, Character toRemove, Set<Character> toAppend) {
- LinkedHashSet<Character> copy = new LinkedHashSet<>(flags);
- copy.remove(toRemove);
- copy.addAll(toAppend);
- return copy;
+ private char[] updateFlags(char[] flags, char toRemove, char[] toAppend) {
+ char[] result = new char[flags.length + toAppend.length - 1];
+ int index = 0;
+ for (char flag : flags) {
+ if (flag != toRemove && flag != dictionary.needaffix) {
+ result[index++] = flag;
+ }
+ }
+ for (char flag : toAppend) {
+ result[index++] = flag;
+ }
+ return sortAndDeduplicate(result);
}
- private LinkedHashSet<Character> appendFlags(AffixEntry affix) {
+ private char[] appendFlags(AffixEntry affix) {
char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
- return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId));
+ return appendId == 0 ? new char[0] : dictionary.flagLookup.getFlags(appendId);
}
/**
@@ -272,9 +300,8 @@ public class WordFormGenerator {
private record AffixEntry(
int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
- AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
- if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null;
+ AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
String word = stem.getWord();
boolean isPrefix = kind == AffixKind.PREFIX;
if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
@@ -286,24 +313,24 @@ public class WordFormGenerator {
if (!condition.acceptsStem(stripped)) return null;
String applied = isPrefix ? affix + stripped : stripped + affix;
- List<Affix> prefixes = new ArrayList<>(stem.getPrefixes());
- List<Affix> suffixes = new ArrayList<>(stem.getSuffixes());
+ List<Affix> prefixes = isPrefix ? new ArrayList<>(stem.getPrefixes()) : stem.getPrefixes();
+ List<Affix> suffixes = isPrefix ? stem.getSuffixes() : new ArrayList<>(stem.getSuffixes());
(isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
}
+ }
- private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) {
- boolean isPrefix = kind == AffixKind.PREFIX;
- List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
- if (sameAffixes.size() == 2) return false;
- if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false;
- if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
- if (sameAffixes.size() == 1
- && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
- return false;
- }
- return true;
+ private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind, char flag) {
+ boolean isPrefix = kind == AffixKind.PREFIX;
+ List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
+ int size = sameAffixes.size();
+ if (size == 2) return false;
+ if (isPrefix && size == 1 && !dictionary.complexPrefixes) return false;
+ if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
+ if (size == 1 && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
+ return false;
}
+ return true;
}
private class WordCompressor {