You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/02/19 19:13:31 UTC
[lucene-solr] branch master updated: LUCENE-9787: Hunspell: speed up suggesting a bit by not creating a huge TreeSet (#2400)

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 3ddc3c0  LUCENE-9787: Hunspell: speed up suggesting a bit by not creating a huge TreeSet (#2400)
3ddc3c0 is described below

commit 3ddc3c04a5d42165fb54b1542a3c0c5e187be5e3
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Feb 19 20:13:19 2021 +0100

    LUCENE-9787: Hunspell: speed up suggesting a bit by not creating a huge TreeSet (#2400)
---
 .../analysis/hunspell/GeneratingSuggester.java     | 12 +++++-
 .../lucene/analysis/hunspell/TestPerformance.java  | 50 +++++++++++++++++++---
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index b8a9a67..e211111 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -26,6 +26,7 @@ import java.util.EnumSet;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Objects;
+import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.function.BiConsumer;
@@ -59,7 +60,7 @@ class GeneratingSuggester {
 
   private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
       String word, WordCase originalCase) {
-    TreeSet<Weighted<Root<String>>> roots = new TreeSet<>();
+    PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>();
     processFST(
         dictionary.words,
         (key, forms) -> {
@@ -80,9 +81,16 @@ class GeneratingSuggester {
               ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE))
                   + commonPrefix(word, root);
 
+          if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
+            return;
+          }
+
           entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
+          while (roots.size() > MAX_ROOTS) {
+            roots.poll();
+          }
         });
-    return roots.stream().limit(MAX_ROOTS).collect(Collectors.toList());
+    return roots.stream().sorted().collect(Collectors.toList());
   }
 
   private void processFST(FST<IntsRef> fst, BiConsumer<IntsRef, IntsRef> keyValueConsumer) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index 8ae5642..4cd54ba 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -25,10 +25,12 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.function.Consumer;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Assume;
 import org.junit.AssumptionViolatedException;
@@ -54,24 +56,43 @@ public class TestPerformance extends LuceneTestCase {
 
   @Test
   public void en() throws Exception {
-    checkPerformance("en", 500_000);
+    checkAnalysisPerformance("en", 1_000_000);
+  }
+
+  @Test
+  public void en_suggest() throws Exception {
+    checkSuggestionPerformance("en", 1_000);
   }
 
   @Test
   public void de() throws Exception {
-    checkPerformance("de", 200_000);
+    checkAnalysisPerformance("de", 200_000);
+  }
+
+  @Test
+  public void de_suggest() throws Exception {
+    checkSuggestionPerformance("de", 30);
   }
 
   @Test
   public void fr() throws Exception {
-    checkPerformance("fr", 40_000);
+    checkAnalysisPerformance("fr", 40_000);
   }
 
-  private void checkPerformance(String code, int wordCount) throws Exception {
-    Path aff = findAffFile(code);
+  @Test
+  public void fr_suggest() throws Exception {
+    checkSuggestionPerformance("fr", 10);
+  }
 
+  private Dictionary loadDictionary(String code) throws IOException, ParseException {
+    Path aff = findAffFile(code);
     Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
     System.out.println("Loaded " + aff);
+    return dictionary;
+  }
+
+  private void checkAnalysisPerformance(String code, int wordCount) throws Exception {
+    Dictionary dictionary = loadDictionary(code);
 
     List<String> words = loadWords(code, wordCount, dictionary);
 
@@ -94,6 +115,25 @@ public class TestPerformance extends LuceneTestCase {
     System.out.println();
   }
 
+  private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
+    Dictionary dictionary = loadDictionary(code);
+    Hunspell speller = new Hunspell(dictionary);
+    List<String> words =
+        loadWords(code, wordCount, dictionary).stream()
+            .filter(w -> !speller.spell(w))
+            .collect(Collectors.toList());
+    System.out.println("Checking " + words.size() + " misspelled words");
+
+    measure(
+        "Suggestions for " + code,
+        blackHole -> {
+          for (String word : words) {
+            blackHole.accept(speller.suggest(word));
+          }
+        });
+    System.out.println();
+  }
+
   private Path findAffFile(String code) throws IOException {
     return TestAllDictionaries.findAllAffixFiles()
         .filter(