You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/03/05 15:00:25 UTC
[lucene-solr] branch master updated: LUCENE-9824: Hunspell suggestions: speed up ngram score calculation for each dictionary entry (#2457)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 99a4bbf  LUCENE-9824: Hunspell suggestions: speed up ngram score calculation for each dictionary entry (#2457)
99a4bbf is described below

commit 99a4bbf3a0ab937cc66a9f256223f7b7bfaf9427
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Fri Mar 5 16:00:02 2021 +0100

    LUCENE-9824: Hunspell suggestions: speed up ngram score calculation for each dictionary entry (#2457)
---
 .../lucene/analysis/hunspell/Dictionary.java       |   1 +
 .../analysis/hunspell/GeneratingSuggester.java     |  54 ++++-----
 .../lucene/analysis/hunspell/TrigramAutomaton.java | 122 +++++++++++++++++++++
 .../analysis/hunspell/TestTrigramAutomaton.java    |  67 +++++++++++
 4 files changed, 212 insertions(+), 32 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index bea4cf9..450c77d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -1025,6 +1025,7 @@ public class Dictionary {
     assert morphSep > 0;
     assert morphSep > flagSep;
     int sep = flagSep < 0 ? morphSep : flagSep;
+    if (sep == 0) return 0;
 
     CharSequence toWrite;
     String beforeSep = line.substring(0, sep);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index 7a5cde7..0b68bce 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -23,7 +23,6 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
-import java.util.EnumSet;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Objects;
@@ -67,7 +66,7 @@ class GeneratingSuggester {
     PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
     List<Root<String>> entries = new ArrayList<>();
     boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
-    EnumSet<NGramOptions> options = EnumSet.of(NGramOptions.LONGER_WORSE);
+    TrigramAutomaton automaton = new TrigramAutomaton(word);
 
     IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(dictionary.words);
     InputOutput<IntsRef> mapping;
@@ -75,6 +74,7 @@ class GeneratingSuggester {
       speller.checkCanceled.run();
 
       IntsRef key = mapping.input;
+      assert key.length > 0;
       if (Math.abs(key.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
         assert key.length < word.length(); // nextKey takes care of longer keys
         continue;
@@ -89,7 +89,8 @@ class GeneratingSuggester {
       }
 
       String lower = dictionary.toLowerCase(root);
-      int sc = ngram(3, word, lower, options) + commonPrefix(word, root);
+      int sc =
+          automaton.ngramScore(lower) - longerWorsePenalty(word, lower) + commonPrefix(word, root);
 
       if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
         continue;
@@ -152,7 +153,7 @@ class GeneratingSuggester {
       for (String guess : expandRoot(weighted.word, misspelled)) {
         String lower = dictionary.toLowerCase(guess);
         int sc =
-            ngram(misspelled.length(), misspelled, lower, EnumSet.of(NGramOptions.ANY_MISMATCH))
+            anyMismatchNgram(misspelled.length(), misspelled, lower, false)
                 + commonPrefix(misspelled, guess);
         if (sc > thresh) {
           expanded.add(new Weighted<>(guess, sc));
@@ -173,7 +174,7 @@ class GeneratingSuggester {
         mw[k] = '*';
       }
 
-      thresh += ngram(word.length(), word, new String(mw), EnumSet.of(NGramOptions.ANY_MISMATCH));
+      thresh += anyMismatchNgram(word.length(), word, new String(mw), false);
     }
     return thresh / 3 - 1;
   }
@@ -314,16 +315,14 @@ class GeneratingSuggester {
         break;
       }
 
-      int re =
-          ngram(2, word, lower, EnumSet.of(NGramOptions.ANY_MISMATCH, NGramOptions.WEIGHTED))
-              + ngram(2, lower, word, EnumSet.of(NGramOptions.ANY_MISMATCH, NGramOptions.WEIGHTED));
+      int re = anyMismatchNgram(2, word, lower, true) + anyMismatchNgram(2, lower, word, true);
 
       int score =
           2 * lcs(word, lower)
               - Math.abs(word.length() - lower.length())
               + commonCharacterPositionScore(word, lower)
               + commonPrefix(word, lower)
-              + ngram(4, word, lower, EnumSet.of(NGramOptions.ANY_MISMATCH))
+              + anyMismatchNgram(4, word, lower, false)
               + re
               + (re < (word.length() + lower.length()) * fact ? -1000 : 0);
       bySimilarity.add(new Weighted<>(guess, score));
@@ -374,14 +373,9 @@ class GeneratingSuggester {
   }
 
   // generate an n-gram score comparing s1 and s2
-  private static int ngram(int n, String s1, String s2, EnumSet<NGramOptions> opt) {
-    int score = 0;
+  static int ngramScore(int n, String s1, String s2, boolean weighted) {
     int l1 = s1.length();
-    int l2 = s2.length();
-    if (l2 == 0) {
-      return 0;
-    }
-
+    int score = 0;
     int[] lastStarts = new int[l1];
     for (int j = 1; j <= n; j++) {
       int ns = 0;
@@ -394,7 +388,7 @@ class GeneratingSuggester {
             continue;
           }
         }
-        if (opt.contains(NGramOptions.WEIGHTED)) {
+        if (weighted) {
           ns--;
           if (i == 0 || i == l1 - j) {
             ns--; // side weight
@@ -402,19 +396,21 @@ class GeneratingSuggester {
         }
       }
       score = score + ns;
-      if (ns < 2 && !opt.contains(NGramOptions.WEIGHTED)) {
+      if (ns < 2 && !weighted) {
         break;
       }
     }
+    return score;
+  }
 
-    int ns = 0;
-    if (opt.contains(NGramOptions.LONGER_WORSE)) {
-      ns = (l2 - l1) - 2;
-    }
-    if (opt.contains(NGramOptions.ANY_MISMATCH)) {
-      ns = Math.abs(l2 - l1) - 2;
-    }
-    return score - Math.max(ns, 0);
+  // NGRAM_LONGER_WORSE flag in Hunspell
+  private static int longerWorsePenalty(String s1, String s2) {
+    return Math.max((s2.length() - s1.length()) - 2, 0);
+  }
+
+  // NGRAM_ANY_MISMATCH flag in Hunspell
+  private static int anyMismatchNgram(int n, String s1, String s2, boolean weighted) {
+    return ngramScore(n, s1, s2, weighted) - Math.max(Math.abs(s2.length() - s1.length()) - 2, 0);
   }
 
   private static int indexOfSubstring(
@@ -471,12 +467,6 @@ class GeneratingSuggester {
     return commonScore;
   }
 
-  private enum NGramOptions {
-    WEIGHTED,
-    LONGER_WORSE,
-    ANY_MISMATCH
-  }
-
   private static class Weighted<T extends Comparable<T>> implements Comparable<Weighted<T>> {
     final T word;
     final int score;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
new file mode 100644
index 0000000..4235a1b
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Operations;
+
+/**
+ * An automaton allowing to achieve the same results as non-weighted {@link
+ * GeneratingSuggester#ngramScore}, but faster (in O(s2.length) time).
+ */
+class TrigramAutomaton {
+  private static final int N = 3;
+  private final CharacterRunAutomaton automaton;
+  private final int[] state2Score;
+  private final FixedBitSet countedSubstrings;
+
+  TrigramAutomaton(String s1) {
+    Map<String, Integer> substringCounts = new HashMap<>();
+
+    Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N);
+    int initialState = builder.createState();
+
+    for (int start = 0; start < s1.length(); start++) {
+      int limit = Math.min(s1.length(), start + N);
+      for (int end = start + 1; end <= limit; end++) {
+        substringCounts.merge(s1.substring(start, end), 1, Integer::sum);
+      }
+
+      int state = initialState;
+      for (int i = start; i < limit; i++) {
+        int next = builder.createState();
+        builder.addTransition(state, next, s1.charAt(i));
+        state = next;
+      }
+    }
+
+    automaton =
+        new CharacterRunAutomaton(
+            Operations.determinize(builder.finish(), Operations.DEFAULT_MAX_DETERMINIZED_STATES));
+
+    state2Score = new int[automaton.getSize()];
+    for (Map.Entry<String, Integer> entry : substringCounts.entrySet()) {
+      int state = runAutomatonOnStringChars(entry.getKey());
+      assert state2Score[state] == 0;
+      state2Score[state] = entry.getValue();
+    }
+    countedSubstrings = new FixedBitSet(state2Score.length);
+  }
+
+  private int runAutomatonOnStringChars(String s) {
+    int state = 0;
+    for (int i = 0; i < s.length(); i++) {
+      state = automaton.step(state, s.charAt(i));
+    }
+    return state;
+  }
+
+  int ngramScore(String s2) {
+    countedSubstrings.clear(0, countedSubstrings.length());
+
+    int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3
+
+    // states of running the automaton on substrings [i-1, i) and [i-2, i)
+    int state1 = -1, state2 = -1;
+
+    int length = s2.length();
+    for (int i = 0; i < length; i++) {
+      char c = s2.charAt(i);
+
+      int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
+      if (state3 > 0) {
+        score3 += substringScore(state3, countedSubstrings);
+      }
+
+      state2 = state1 <= 0 ? 0 : automaton.step(state1, c);
+      if (state2 > 0) {
+        score2 += substringScore(state2, countedSubstrings);
+      }
+
+      state1 = automaton.step(0, c);
+      if (state1 > 0) {
+        score1 += substringScore(state1, countedSubstrings);
+      }
+    }
+
+    int score = score1;
+    if (score1 >= 2) {
+      score += score2;
+      if (score2 >= 2) {
+        score += score3;
+      }
+    }
+    return score;
+  }
+
+  private int substringScore(int state, FixedBitSet countedSubstrings) {
+    if (countedSubstrings.getAndSet(state)) return 0;
+
+    int score = state2Score[state];
+    assert score > 0;
+    return score;
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java
new file mode 100644
index 0000000..60e708e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.stream.Collectors;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestTrigramAutomaton extends LuceneTestCase {
+  public void testSameScore() {
+    checkScores("look", "looked");
+    checkScores("look", "cool");
+    checkScores("abracadabra", "abraham");
+  }
+
+  public void testRandomized() {
+    String[] alphabet = {
+      "a",
+      "b",
+      "c",
+      "aa",
+      "ab",
+      "abc",
+      "ccc",
+      "\uD800\uDFD1",
+      "\uD800\uDFD2",
+      "\uD800\uDFD2\uD800\uDFD1",
+      "\uD800\uDFD2\uD800\uDFD2"
+    };
+    for (int i = 0; i < 100; i++) {
+      checkScores(randomConcatenation(alphabet), randomConcatenation(alphabet));
+    }
+  }
+
+  private String randomConcatenation(String[] alphabet) {
+    return random()
+        .ints(0, alphabet.length)
+        .limit(random().nextInt(20) + 1)
+        .mapToObj(i -> alphabet[i])
+        .collect(Collectors.joining());
+  }
+
+  private void checkScores(String s1, String s2) {
+    String message = "Fails: checkScores(\"" + s1 + "\", \"" + s2 + "\")";
+    assertEquals(
+        message,
+        GeneratingSuggester.ngramScore(3, s1, s2, false),
+        new TrigramAutomaton(s1).ngramScore(s2));
+    assertEquals(
+        message,
+        GeneratingSuggester.ngramScore(3, s2, s1, false),
+        new TrigramAutomaton(s2).ngramScore(s1));
+  }
+}