You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2023/01/13 11:48:55 UTC
[lucene] 02/09: hunspell: speedup suggestions by caching speller and compound stemming requests (#11857)
This is an automated email from the ASF dual-hosted git repository.
donnerpeter pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
commit 552f651c1a648770bd7d0f1922332e510e5f3771
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Oct 17 21:25:12 2022 +0200
hunspell: speedup suggestions by caching speller and compound stemming requests (#11857)
hunspell: speed up suggestions by caching speller and compound stemming requests
---
lucene/CHANGES.txt | 2 ++
.../apache/lucene/analysis/hunspell/Hunspell.java | 24 ++++++++++++++++++++-
.../analysis/hunspell/ModifyingSuggester.java | 8 ++++++-
.../lucene/analysis/hunspell/TestPerformance.java | 25 +++++++++++-----------
4 files changed, 44 insertions(+), 15 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index dc471f0e0e6..09465c63dc6 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -194,6 +194,8 @@ Optimizations
* GITHUB#12078: Enhance XXXField#newRangeQuery. (Lu Xugang)
+* GITHUB#11857: Hunspell: improved suggestion performance (Peter Gromov)
+
Other
---------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index 4123bcc28e1..b850402bfb4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -29,8 +29,11 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
+import java.util.Map;
+import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
@@ -167,7 +170,7 @@ public class Hunspell {
return false;
}
- private Root<CharsRef> findStem(
+ Root<CharsRef> findStem(
char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
checkCanceled.run();
boolean checkCase = context != COMPOUND_MIDDLE && context != COMPOUND_END;
@@ -614,11 +617,30 @@ public class Hunspell {
Runnable checkCanceled) {
Hunspell suggestionSpeller =
new Hunspell(dictionary, policy, checkCanceled) {
+ // Cache for expensive "findStem" requests issued when trying to split a compound word.
+ // The suggestion algorithm issues many of them, often with the same text.
+ // The cache can be large, but will be GC-ed after the "suggest" call.
+ final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();
+
@Override
boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest)
&& !dictionary.hasFlag(formID, dictionary.subStandard);
}
+
+ @Override
+ Root<CharsRef> findStem(
+ char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
+ if (context == COMPOUND_BEGIN && originalCase == null) {
+ return compoundCache
+ .computeIfAbsent(
+ new String(chars, offset, length),
+ __ ->
+ Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
+ .orElse(null);
+ }
+ return super.findStem(chars, offset, length, originalCase, context);
+ }
};
boolean hasGoodSuggestions =
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 5a4f25a6e9a..300e2f79fe3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -18,9 +18,11 @@ package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
+import java.util.Set;
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
class ModifyingSuggester {
@@ -349,7 +351,11 @@ class ModifyingSuggester {
return speller.dictionary.tryChars.contains("-") || speller.dictionary.tryChars.contains("a");
}
+ private final Set<String> tried = new HashSet<>();
+
private boolean trySuggestion(String candidate) {
- return speller.checkWord(candidate) && result.add(createSuggestion(candidate));
+ return tried.add(candidate)
+ && speller.checkWord(candidate)
+ && result.add(createSuggestion(candidate));
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index 7073d2a5ed1..190f3cab8c4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -86,7 +86,7 @@ public class TestPerformance extends LuceneTestCase {
@Test
public void de_suggest() throws Exception {
- checkSuggestionPerformance("de", 60);
+ checkSuggestionPerformance("de", 100);
}
@Test
@@ -119,10 +119,11 @@ public class TestPerformance extends LuceneTestCase {
Executors.newFixedThreadPool(cpus, new NamedThreadFactory("hunspellStemming-"));
try {
- measure("Stemming " + code, blackHole -> stemWords(words, stemmer, blackHole));
+ measure("Stemming " + code, words.size(), blackHole -> stemWords(words, stemmer, blackHole));
measure(
"Multi-threaded stemming " + code,
+ words.size(),
blackHole -> {
List<Future<?>> futures = new ArrayList<>();
for (int i = 0; i < cpus; i++) {
@@ -140,6 +141,7 @@ public class TestPerformance extends LuceneTestCase {
measure(
"Spellchecking " + code,
+ words.size(),
blackHole -> {
for (String word : words) {
blackHole.accept(speller.spell(word));
@@ -169,11 +171,13 @@ public class TestPerformance extends LuceneTestCase {
.collect(Collectors.toList());
System.out.println("Checking " + words.size() + " misspelled words");
+ Hunspell fullSpeller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
measure(
"Suggestions for " + code,
+ words.size(),
blackHole -> {
for (String word : words) {
- blackHole.accept(speller.suggest(word));
+ blackHole.accept(fullSpeller.suggest(word));
}
});
System.out.println();
@@ -184,7 +188,6 @@ public class TestPerformance extends LuceneTestCase {
return false;
}
- long start = System.nanoTime();
try {
speller.suggest(word);
} catch (
@@ -193,10 +196,6 @@ public class TestPerformance extends LuceneTestCase {
System.out.println("Timeout happened for " + word + ", skipping");
return false;
}
- long elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
- if (elapsed > Hunspell.SUGGEST_TIME_LIMIT * 4 / 5) {
- System.out.println(elapsed + "ms for " + word + ", too close to time limit, skipping");
- }
return true;
}
@@ -242,7 +241,7 @@ public class TestPerformance extends LuceneTestCase {
return words;
}
- private void measure(String what, Iteration iteration) {
+ private void measure(String what, int wordCount, Iteration iteration) {
Consumer<Object> consumer =
o -> {
if (o == null) {
@@ -261,12 +260,12 @@ public class TestPerformance extends LuceneTestCase {
iteration.run(consumer);
times.add(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
}
+ double average = times.stream().mapToLong(Long::longValue).average().orElseThrow();
System.out.println(
what
- + ": average "
- + times.stream().mapToLong(Long::longValue).average().orElseThrow()
- + ", all times = "
- + times);
+ + (": average " + average)
+ + (" (" + (average / wordCount) + " per word)")
+ + (", all times = " + times));
}
private interface Iteration {