You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2023/01/13 11:48:55 UTC
[lucene] 02/09: hunspell: speedup suggestions by caching speller and compound stemming requests (#11857)

This is an automated email from the ASF dual-hosted git repository.

donnerpeter pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit 552f651c1a648770bd7d0f1922332e510e5f3771
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Oct 17 21:25:12 2022 +0200

    hunspell: speedup suggestions by caching speller and compound stemming requests (#11857)
    
    hunspell: speed up suggestions by caching speller and compound stemming requests
---
 lucene/CHANGES.txt                                 |  2 ++
 .../apache/lucene/analysis/hunspell/Hunspell.java  | 24 ++++++++++++++++++++-
 .../analysis/hunspell/ModifyingSuggester.java      |  8 ++++++-
 .../lucene/analysis/hunspell/TestPerformance.java  | 25 +++++++++++-----------
 4 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index dc471f0e0e6..09465c63dc6 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -194,6 +194,8 @@ Optimizations
 
 * GITHUB#12078: Enhance XXXField#newRangeQuery. (Lu Xugang)
 
+* GITHUB#11857: Hunspell: improved suggestion performance (Peter Gromov)
+
 Other
 ---------------------
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index 4123bcc28e1..b850402bfb4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -29,8 +29,11 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
+import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
@@ -167,7 +170,7 @@ public class Hunspell {
     return false;
   }
 
-  private Root<CharsRef> findStem(
+  Root<CharsRef> findStem(
       char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
     checkCanceled.run();
     boolean checkCase = context != COMPOUND_MIDDLE && context != COMPOUND_END;
@@ -614,11 +617,30 @@ public class Hunspell {
       Runnable checkCanceled) {
     Hunspell suggestionSpeller =
         new Hunspell(dictionary, policy, checkCanceled) {
+          // Cache for expensive "findStem" requests issued when trying to split a compound word.
+          // The suggestion algorithm issues many of them, often with the same text.
+          // The cache can be large, but will be GC-ed after the "suggest" call.
+          final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();
+
           @Override
           boolean acceptsStem(int formID) {
             return !dictionary.hasFlag(formID, dictionary.noSuggest)
                 && !dictionary.hasFlag(formID, dictionary.subStandard);
           }
+
+          @Override
+          Root<CharsRef> findStem(
+              char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
+            if (context == COMPOUND_BEGIN && originalCase == null) {
+              return compoundCache
+                  .computeIfAbsent(
+                      new String(chars, offset, length),
+                      __ ->
+                          Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
+                  .orElse(null);
+            }
+            return super.findStem(chars, offset, length, originalCase, context);
+          }
         };
     boolean hasGoodSuggestions =
         new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
index 5a4f25a6e9a..300e2f79fe3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@@ -18,9 +18,11 @@ package org.apache.lucene.analysis.hunspell;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Set;
 
 /** A class that modifies the given misspelled word in various ways to get correct suggestions */
 class ModifyingSuggester {
@@ -349,7 +351,11 @@ class ModifyingSuggester {
     return speller.dictionary.tryChars.contains("-") || speller.dictionary.tryChars.contains("a");
   }
 
+  private final Set<String> tried = new HashSet<>();
+
   private boolean trySuggestion(String candidate) {
-    return speller.checkWord(candidate) && result.add(createSuggestion(candidate));
+    return tried.add(candidate)
+        && speller.checkWord(candidate)
+        && result.add(createSuggestion(candidate));
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index 7073d2a5ed1..190f3cab8c4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -86,7 +86,7 @@ public class TestPerformance extends LuceneTestCase {
 
   @Test
   public void de_suggest() throws Exception {
-    checkSuggestionPerformance("de", 60);
+    checkSuggestionPerformance("de", 100);
   }
 
   @Test
@@ -119,10 +119,11 @@ public class TestPerformance extends LuceneTestCase {
         Executors.newFixedThreadPool(cpus, new NamedThreadFactory("hunspellStemming-"));
 
     try {
-      measure("Stemming " + code, blackHole -> stemWords(words, stemmer, blackHole));
+      measure("Stemming " + code, words.size(), blackHole -> stemWords(words, stemmer, blackHole));
 
       measure(
           "Multi-threaded stemming " + code,
+          words.size(),
           blackHole -> {
             List<Future<?>> futures = new ArrayList<>();
             for (int i = 0; i < cpus; i++) {
@@ -140,6 +141,7 @@ public class TestPerformance extends LuceneTestCase {
 
       measure(
           "Spellchecking " + code,
+          words.size(),
           blackHole -> {
             for (String word : words) {
               blackHole.accept(speller.spell(word));
@@ -169,11 +171,13 @@ public class TestPerformance extends LuceneTestCase {
             .collect(Collectors.toList());
     System.out.println("Checking " + words.size() + " misspelled words");
 
+    Hunspell fullSpeller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
     measure(
         "Suggestions for " + code,
+        words.size(),
         blackHole -> {
           for (String word : words) {
-            blackHole.accept(speller.suggest(word));
+            blackHole.accept(fullSpeller.suggest(word));
           }
         });
     System.out.println();
@@ -184,7 +188,6 @@ public class TestPerformance extends LuceneTestCase {
       return false;
     }
 
-    long start = System.nanoTime();
     try {
       speller.suggest(word);
     } catch (
@@ -193,10 +196,6 @@ public class TestPerformance extends LuceneTestCase {
       System.out.println("Timeout happened for " + word + ", skipping");
       return false;
     }
-    long elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
-    if (elapsed > Hunspell.SUGGEST_TIME_LIMIT * 4 / 5) {
-      System.out.println(elapsed + "ms for " + word + ", too close to time limit, skipping");
-    }
     return true;
   }
 
@@ -242,7 +241,7 @@ public class TestPerformance extends LuceneTestCase {
     return words;
   }
 
-  private void measure(String what, Iteration iteration) {
+  private void measure(String what, int wordCount, Iteration iteration) {
     Consumer<Object> consumer =
         o -> {
           if (o == null) {
@@ -261,12 +260,12 @@ public class TestPerformance extends LuceneTestCase {
       iteration.run(consumer);
       times.add(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
     }
+    double average = times.stream().mapToLong(Long::longValue).average().orElseThrow();
     System.out.println(
         what
-            + ": average "
-            + times.stream().mapToLong(Long::longValue).average().orElseThrow()
-            + ", all times = "
-            + times);
+            + (": average " + average)
+            + (" (" + (average / wordCount) + " per word)")
+            + (", all times = " + times));
   }
 
   private interface Iteration {