You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/02/23 04:25:40 UTC

[lucene-solr] branch master updated: LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 34993c2  LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416)
34993c2 is described below

commit 34993c22ddbac702bfad321a04bbd753e9ade1c4
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Tue Feb 23 05:25:21 2021 +0100

    LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416)
---
 .../lucene/analysis/hunspell/Dictionary.java       |  30 +++---
 .../analysis/hunspell/GeneratingSuggester.java     | 106 ++++++++++++---------
 .../apache/lucene/analysis/hunspell/Stemmer.java   |  33 ++-----
 .../lucene/analysis/hunspell/TestPerformance.java  |   2 +-
 4 files changed, 88 insertions(+), 83 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index a3cbe39..115af24 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -297,29 +297,29 @@ public class Dictionary {
     final FST.BytesReader bytesReader = fst.getBytesReader();
     final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
     // Accumulate output as we go
-    final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
-    IntsRef output = NO_OUTPUT;
+    IntsRef output = fst.outputs.getNoOutput();
 
     int l = offset + length;
-    try {
-      for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
-        cp = Character.codePointAt(word, i, l);
-        if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
-          return null;
-        } else if (arc.output() != NO_OUTPUT) {
-          output = fst.outputs.add(output, arc.output());
-        }
+    for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
+      cp = Character.codePointAt(word, i, l);
+      output = nextArc(fst, arc, bytesReader, output, cp);
+      if (output == null) {
+        return null;
       }
-      if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
+    }
+    return nextArc(fst, arc, bytesReader, output, FST.END_LABEL);
+  }
+
+  static IntsRef nextArc(
+      FST<IntsRef> fst, FST.Arc<IntsRef> arc, FST.BytesReader reader, IntsRef output, int ch) {
+    try {
+      if (fst.findTargetArc(ch, arc, arc, reader) == null) {
         return null;
-      } else if (arc.output() != NO_OUTPUT) {
-        return fst.outputs.add(output, arc.output());
-      } else {
-        return output;
       }
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
+    return fst.outputs.add(output, arc.output());
   }
 
   /**
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index e211111..500ae15 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -175,67 +175,85 @@ class GeneratingSuggester {
     }
 
     // suffixes
-    processFST(
-        dictionary.suffixes,
-        (key, ids) -> {
-          String suffix = new StringBuilder(toString(key)).reverse().toString();
-          if (misspelled.length() <= suffix.length() || !misspelled.endsWith(suffix)) return;
-
-          for (int i = 0; i < ids.length; i++) {
-            int suffixId = ids.ints[ids.offset + i];
-            if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
-              continue;
-            }
+    processAffixes(
+        false,
+        misspelled,
+        (suffixLength, suffixId) -> {
+          if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
+            return;
+          }
 
-            String withSuffix =
-                root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
-            result.add(withSuffix);
-            if (dictionary.isCrossProduct(suffixId)) {
-              crossProducts.add(withSuffix);
-            }
+          String suffix = misspelled.substring(misspelled.length() - suffixLength);
+          String withSuffix =
+              root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
+          result.add(withSuffix);
+          if (dictionary.isCrossProduct(suffixId)) {
+            crossProducts.add(withSuffix);
           }
         });
 
     // cross-product prefixes
-    processFST(
-        dictionary.prefixes,
-        (key, ids) -> {
-          String prefix = toString(key);
-          if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
-
-          for (int i = 0; i < ids.length; i++) {
-            int prefixId = ids.ints[ids.offset + i];
-            if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
-                || !dictionary.isCrossProduct(prefixId)) {
-              continue;
-            }
+    processAffixes(
+        true,
+        misspelled,
+        (prefixLength, prefixId) -> {
+          if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
+              || !dictionary.isCrossProduct(prefixId)) {
+            return;
+          }
 
-            for (String suffixed : crossProducts) {
-              if (checkAffixCondition(prefixId, suffixed)) {
-                result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
-              }
+          String prefix = misspelled.substring(0, prefixLength);
+          for (String suffixed : crossProducts) {
+            if (checkAffixCondition(prefixId, suffixed)) {
+              result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
             }
           }
         });
 
     // pure prefixes
-    processFST(
-        dictionary.prefixes,
-        (key, ids) -> {
-          String prefix = toString(key);
-          if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
-
-          for (int i = 0; i < ids.length; i++) {
-            int prefixId = ids.ints[ids.offset + i];
-            if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
-              result.add(prefix + root.word.substring(affixStripLength(prefixId)));
-            }
+    processAffixes(
+        true,
+        misspelled,
+        (prefixLength, prefixId) -> {
+          if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
+            String prefix = misspelled.substring(0, prefixLength);
+            result.add(prefix + root.word.substring(affixStripLength(prefixId)));
           }
         });
 
     return result.stream().limit(MAX_WORDS).collect(Collectors.toList());
   }
 
+  private void processAffixes(boolean prefixes, String word, AffixProcessor processor) {
+    FST<IntsRef> fst = prefixes ? dictionary.prefixes : dictionary.suffixes;
+    if (fst == null) return;
+
+    FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
+    FST.BytesReader reader = fst.getBytesReader();
+
+    IntsRef output = fst.outputs.getNoOutput();
+    int length = word.length();
+    int step = prefixes ? 1 : -1;
+    int limit = prefixes ? length : -1;
+    for (int i = prefixes ? 0 : length - 1; i != limit; i += step) {
+      output = Dictionary.nextArc(fst, arc, reader, output, word.charAt(i));
+      if (output == null) {
+        break;
+      }
+
+      if (arc.isFinal()) {
+        IntsRef affixIds = fst.outputs.add(output, arc.nextFinalOutput());
+        for (int j = 0; j < affixIds.length; j++) {
+          processor.processAffix(prefixes ? i + 1 : length - i, affixIds.ints[affixIds.offset + j]);
+        }
+      }
+    }
+  }
+
+  private interface AffixProcessor {
+    void processAffix(int affixLength, int affixId);
+  }
+
   private boolean hasCompatibleFlags(Root<?> root, int affixId) {
     if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
       return false;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index 2a75f45..66fac29 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -16,7 +16,6 @@
  */
 package org.apache.lucene.analysis.hunspell;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
@@ -259,12 +258,8 @@ final class Stemmer {
         }
       }
     }
-    try {
-      return stem(
-          word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
-    } catch (IOException bogus) {
-      throw new RuntimeException(bogus);
-    }
+    return stem(
+        word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
   }
 
   /**
@@ -373,22 +368,18 @@ final class Stemmer {
       int recursionDepth,
       boolean doPrefix,
       boolean previousWasPrefix,
-      RootProcessor processor)
-      throws IOException {
+      RootProcessor processor) {
     if (doPrefix && dictionary.prefixes != null) {
       FST<IntsRef> fst = dictionary.prefixes;
       FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
       fst.getFirstArc(arc);
-      IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
-      IntsRef output = NO_OUTPUT;
+      IntsRef output = fst.outputs.getNoOutput();
       int limit = dictionary.fullStrip ? length + 1 : length;
       for (int i = 0; i < limit; i++) {
         if (i > 0) {
-          char ch = word[offset + i - 1];
-          if (fst.findTargetArc(ch, arc, arc, prefixReader) == null) {
+          output = Dictionary.nextArc(fst, arc, prefixReader, output, word[offset + i - 1]);
+          if (output == null) {
             break;
-          } else if (arc.output() != NO_OUTPUT) {
-            output = fst.outputs.add(output, arc.output());
           }
         }
         if (!arc.isFinal()) {
@@ -431,16 +422,13 @@ final class Stemmer {
       FST<IntsRef> fst = dictionary.suffixes;
       FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
       fst.getFirstArc(arc);
-      IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
-      IntsRef output = NO_OUTPUT;
+      IntsRef output = fst.outputs.getNoOutput();
       int limit = dictionary.fullStrip ? 0 : 1;
       for (int i = length; i >= limit; i--) {
         if (i < length) {
-          char ch = word[offset + i];
-          if (fst.findTargetArc(ch, arc, arc, suffixReader) == null) {
+          output = Dictionary.nextArc(fst, arc, suffixReader, output, word[offset + i]);
+          if (output == null) {
             break;
-          } else if (arc.output() != NO_OUTPUT) {
-            output = fst.outputs.add(output, arc.output());
           }
         }
         if (!arc.isFinal()) {
@@ -610,8 +598,7 @@ final class Stemmer {
       int prefixId,
       int recursionDepth,
       boolean prefix,
-      RootProcessor processor)
-      throws IOException {
+      RootProcessor processor) {
     char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
 
     boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
index ca38ca8..174ec44 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -82,7 +82,7 @@ public class TestPerformance extends LuceneTestCase {
 
   @Test
   public void fr_suggest() throws Exception {
-    checkSuggestionPerformance("fr", 100);
+    checkSuggestionPerformance("fr", 120);
   }
 
   private Dictionary loadDictionary(String code) throws IOException, ParseException {