You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/19 19:13:31 UTC
[lucene-solr] branch master updated: LUCENE-9787: Hunspell: speed
up suggesting a bit by not creating a huge TreeSet (#2400)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 3ddc3c0 LUCENE-9787: Hunspell: speed up suggesting a bit by not creating a huge TreeSet (#2400)
3ddc3c0 is described below
commit 3ddc3c04a5d42165fb54b1542a3c0c5e187be5e3
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Feb 19 20:13:19 2021 +0100
LUCENE-9787: Hunspell: speed up suggesting a bit by not creating a huge TreeSet (#2400)
---
.../analysis/hunspell/GeneratingSuggester.java | 12 +++++-
.../lucene/analysis/hunspell/TestPerformance.java | 50 +++++++++++++++++++---
2 files changed, 55 insertions(+), 7 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index b8a9a67..e211111 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -26,6 +26,7 @@ import java.util.EnumSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
+import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.BiConsumer;
@@ -59,7 +60,7 @@ class GeneratingSuggester {
private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
String word, WordCase originalCase) {
- TreeSet<Weighted<Root<String>>> roots = new TreeSet<>();
+ PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>();
processFST(
dictionary.words,
(key, forms) -> {
@@ -80,9 +81,16 @@ class GeneratingSuggester {
ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE))
+ commonPrefix(word, root);
+ if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
+ return;
+ }
+
entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
+ while (roots.size() > MAX_ROOTS) {
+ roots.poll();
+ }
});
- return roots.stream().limit(MAX_ROOTS).collect(Collectors.toList());
+ return roots.stream().sorted().collect(Collectors.toList());
}
private void processFST(FST<IntsRef> fst, BiConsumer<IntsRef, IntsRef> keyValueConsumer) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index 8ae5642..4cd54ba 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -25,10 +25,12 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.regex.Pattern;
+import java.util.stream.Collectors;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.junit.AssumptionViolatedException;
@@ -54,24 +56,43 @@ public class TestPerformance extends LuceneTestCase {
@Test
public void en() throws Exception {
- checkPerformance("en", 500_000);
+ checkAnalysisPerformance("en", 1_000_000);
+ }
+
+ @Test
+ public void en_suggest() throws Exception {
+ checkSuggestionPerformance("en", 1_000);
}
@Test
public void de() throws Exception {
- checkPerformance("de", 200_000);
+ checkAnalysisPerformance("de", 200_000);
+ }
+
+ @Test
+ public void de_suggest() throws Exception {
+ checkSuggestionPerformance("de", 30);
}
@Test
public void fr() throws Exception {
- checkPerformance("fr", 40_000);
+ checkAnalysisPerformance("fr", 40_000);
}
- private void checkPerformance(String code, int wordCount) throws Exception {
- Path aff = findAffFile(code);
+ @Test
+ public void fr_suggest() throws Exception {
+ checkSuggestionPerformance("fr", 10);
+ }
+ private Dictionary loadDictionary(String code) throws IOException, ParseException {
+ Path aff = findAffFile(code);
Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
System.out.println("Loaded " + aff);
+ return dictionary;
+ }
+
+ private void checkAnalysisPerformance(String code, int wordCount) throws Exception {
+ Dictionary dictionary = loadDictionary(code);
List<String> words = loadWords(code, wordCount, dictionary);
@@ -94,6 +115,25 @@ public class TestPerformance extends LuceneTestCase {
System.out.println();
}
+ private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
+ Dictionary dictionary = loadDictionary(code);
+ Hunspell speller = new Hunspell(dictionary);
+ List<String> words =
+ loadWords(code, wordCount, dictionary).stream()
+ .filter(w -> !speller.spell(w))
+ .collect(Collectors.toList());
+ System.out.println("Checking " + words.size() + " misspelled words");
+
+ measure(
+ "Suggestions for " + code,
+ blackHole -> {
+ for (String word : words) {
+ blackHole.accept(speller.suggest(word));
+ }
+ });
+ System.out.println();
+ }
+
private Path findAffFile(String code) throws IOException {
return TestAllDictionaries.findAllAffixFiles()
.filter(