You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/02/23 04:25:40 UTC
[lucene-solr] branch master updated: LUCENE-9801: Hunspell
suggestions: speed up expandWord by enumerating only applicable affixes
(#2416)
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 34993c2 LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416)
34993c2 is described below
commit 34993c22ddbac702bfad321a04bbd753e9ade1c4
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Tue Feb 23 05:25:21 2021 +0100
LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416)
---
.../lucene/analysis/hunspell/Dictionary.java | 30 +++---
.../analysis/hunspell/GeneratingSuggester.java | 106 ++++++++++++---------
.../apache/lucene/analysis/hunspell/Stemmer.java | 33 ++-----
.../lucene/analysis/hunspell/TestPerformance.java | 2 +-
4 files changed, 88 insertions(+), 83 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index a3cbe39..115af24 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -297,29 +297,29 @@ public class Dictionary {
final FST.BytesReader bytesReader = fst.getBytesReader();
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
// Accumulate output as we go
- final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
- IntsRef output = NO_OUTPUT;
+ IntsRef output = fst.outputs.getNoOutput();
int l = offset + length;
- try {
- for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
- cp = Character.codePointAt(word, i, l);
- if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
- return null;
- } else if (arc.output() != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output());
- }
+ for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
+ cp = Character.codePointAt(word, i, l);
+ output = nextArc(fst, arc, bytesReader, output, cp);
+ if (output == null) {
+ return null;
}
- if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
+ }
+ return nextArc(fst, arc, bytesReader, output, FST.END_LABEL);
+ }
+
+ static IntsRef nextArc(
+ FST<IntsRef> fst, FST.Arc<IntsRef> arc, FST.BytesReader reader, IntsRef output, int ch) {
+ try {
+ if (fst.findTargetArc(ch, arc, arc, reader) == null) {
return null;
- } else if (arc.output() != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output());
- } else {
- return output;
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
+ return fst.outputs.add(output, arc.output());
}
/**
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index e211111..500ae15 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -175,67 +175,85 @@ class GeneratingSuggester {
}
// suffixes
- processFST(
- dictionary.suffixes,
- (key, ids) -> {
- String suffix = new StringBuilder(toString(key)).reverse().toString();
- if (misspelled.length() <= suffix.length() || !misspelled.endsWith(suffix)) return;
-
- for (int i = 0; i < ids.length; i++) {
- int suffixId = ids.ints[ids.offset + i];
- if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
- continue;
- }
+ processAffixes(
+ false,
+ misspelled,
+ (suffixLength, suffixId) -> {
+ if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
+ return;
+ }
- String withSuffix =
- root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
- result.add(withSuffix);
- if (dictionary.isCrossProduct(suffixId)) {
- crossProducts.add(withSuffix);
- }
+ String suffix = misspelled.substring(misspelled.length() - suffixLength);
+ String withSuffix =
+ root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
+ result.add(withSuffix);
+ if (dictionary.isCrossProduct(suffixId)) {
+ crossProducts.add(withSuffix);
}
});
// cross-product prefixes
- processFST(
- dictionary.prefixes,
- (key, ids) -> {
- String prefix = toString(key);
- if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
-
- for (int i = 0; i < ids.length; i++) {
- int prefixId = ids.ints[ids.offset + i];
- if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
- || !dictionary.isCrossProduct(prefixId)) {
- continue;
- }
+ processAffixes(
+ true,
+ misspelled,
+ (prefixLength, prefixId) -> {
+ if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
+ || !dictionary.isCrossProduct(prefixId)) {
+ return;
+ }
- for (String suffixed : crossProducts) {
- if (checkAffixCondition(prefixId, suffixed)) {
- result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
- }
+ String prefix = misspelled.substring(0, prefixLength);
+ for (String suffixed : crossProducts) {
+ if (checkAffixCondition(prefixId, suffixed)) {
+ result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
}
}
});
// pure prefixes
- processFST(
- dictionary.prefixes,
- (key, ids) -> {
- String prefix = toString(key);
- if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
-
- for (int i = 0; i < ids.length; i++) {
- int prefixId = ids.ints[ids.offset + i];
- if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
- result.add(prefix + root.word.substring(affixStripLength(prefixId)));
- }
+ processAffixes(
+ true,
+ misspelled,
+ (prefixLength, prefixId) -> {
+ if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
+ String prefix = misspelled.substring(0, prefixLength);
+ result.add(prefix + root.word.substring(affixStripLength(prefixId)));
}
});
return result.stream().limit(MAX_WORDS).collect(Collectors.toList());
}
+ private void processAffixes(boolean prefixes, String word, AffixProcessor processor) {
+ FST<IntsRef> fst = prefixes ? dictionary.prefixes : dictionary.suffixes;
+ if (fst == null) return;
+
+ FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
+ FST.BytesReader reader = fst.getBytesReader();
+
+ IntsRef output = fst.outputs.getNoOutput();
+ int length = word.length();
+ int step = prefixes ? 1 : -1;
+ int limit = prefixes ? length : -1;
+ for (int i = prefixes ? 0 : length - 1; i != limit; i += step) {
+ output = Dictionary.nextArc(fst, arc, reader, output, word.charAt(i));
+ if (output == null) {
+ break;
+ }
+
+ if (arc.isFinal()) {
+ IntsRef affixIds = fst.outputs.add(output, arc.nextFinalOutput());
+ for (int j = 0; j < affixIds.length; j++) {
+ processor.processAffix(prefixes ? i + 1 : length - i, affixIds.ints[affixIds.offset + j]);
+ }
+ }
+ }
+ }
+
+ private interface AffixProcessor {
+ void processAffix(int affixLength, int affixId);
+ }
+
private boolean hasCompatibleFlags(Root<?> root, int affixId) {
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
return false;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 2a75f45..66fac29 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.analysis.hunspell;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
@@ -259,12 +258,8 @@ final class Stemmer {
}
}
}
- try {
- return stem(
- word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
- } catch (IOException bogus) {
- throw new RuntimeException(bogus);
- }
+ return stem(
+ word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
}
/**
@@ -373,22 +368,18 @@ final class Stemmer {
int recursionDepth,
boolean doPrefix,
boolean previousWasPrefix,
- RootProcessor processor)
- throws IOException {
+ RootProcessor processor) {
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
fst.getFirstArc(arc);
- IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
- IntsRef output = NO_OUTPUT;
+ IntsRef output = fst.outputs.getNoOutput();
int limit = dictionary.fullStrip ? length + 1 : length;
for (int i = 0; i < limit; i++) {
if (i > 0) {
- char ch = word[offset + i - 1];
- if (fst.findTargetArc(ch, arc, arc, prefixReader) == null) {
+ output = Dictionary.nextArc(fst, arc, prefixReader, output, word[offset + i - 1]);
+ if (output == null) {
break;
- } else if (arc.output() != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output());
}
}
if (!arc.isFinal()) {
@@ -431,16 +422,13 @@ final class Stemmer {
FST<IntsRef> fst = dictionary.suffixes;
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
fst.getFirstArc(arc);
- IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
- IntsRef output = NO_OUTPUT;
+ IntsRef output = fst.outputs.getNoOutput();
int limit = dictionary.fullStrip ? 0 : 1;
for (int i = length; i >= limit; i--) {
if (i < length) {
- char ch = word[offset + i];
- if (fst.findTargetArc(ch, arc, arc, suffixReader) == null) {
+ output = Dictionary.nextArc(fst, arc, suffixReader, output, word[offset + i]);
+ if (output == null) {
break;
- } else if (arc.output() != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output());
}
}
if (!arc.isFinal()) {
@@ -610,8 +598,7 @@ final class Stemmer {
int prefixId,
int recursionDepth,
boolean prefix,
- RootProcessor processor)
- throws IOException {
+ RootProcessor processor) {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index ca38ca8..174ec44 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -82,7 +82,7 @@ public class TestPerformance extends LuceneTestCase {
@Test
public void fr_suggest() throws Exception {
- checkSuggestionPerformance("fr", 100);
+ checkSuggestionPerformance("fr", 120);
}
private Dictionary loadDictionary(String code) throws IOException, ParseException {