You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/03/15 04:32:14 UTC
[lucene] branch main updated: LUCENE-9831: Hunspell
GeneratingSuggester: faster flag & case checks, less allocations (#4)
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 42c6f78 LUCENE-9831: Hunspell GeneratingSuggester: faster flag & case checks, less allocations (#4)
42c6f78 is described below
commit 42c6f780bf01f45012fefc7e682ea7a0f4d5b648
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Mar 15 05:32:08 2021 +0100
LUCENE-9831: Hunspell GeneratingSuggester: faster flag & case checks, less allocations (#4)
---
.../lucene/analysis/hunspell/Dictionary.java | 2 +-
.../lucene/analysis/hunspell/FlagEnumerator.java | 30 +++++++++
.../analysis/hunspell/GeneratingSuggester.java | 72 ++++++++++++++--------
.../lucene/analysis/hunspell/TrigramAutomaton.java | 13 ++--
.../analysis/hunspell/TestTrigramAutomaton.java | 5 +-
5 files changed, 91 insertions(+), 31 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 2672add..2be8188 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -1479,7 +1479,7 @@ public class Dictionary {
return reuse;
}
- private static char[] toSortedCharArray(Set<Character> set) {
+ static char[] toSortedCharArray(Set<Character> set) {
char[] chars = new char[set.size()];
int i = 0;
for (Character c : set) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
index 57aac40..bb31b5b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
@@ -79,6 +79,36 @@ class FlagEnumerator {
return false;
}
+ boolean hasAnyFlag(int entryId, char[] sortedFlags) {
+ int length = data[entryId];
+ if (length == 0) return false;
+
+ int pos1 = entryId + 1;
+ int limit1 = entryId + 1 + length;
+
+ int pos2 = 0;
+ int limit2 = sortedFlags.length;
+
+ char c1 = data[pos1];
+ char c2 = sortedFlags[pos2];
+ while (true) {
+ if (c1 == c2) {
+ return true;
+ }
+ if (c1 < c2) {
+ if (++pos1 >= limit1) {
+ return false;
+ }
+ c1 = data[pos1];
+ } else {
+ if (++pos2 >= limit2) {
+ return false;
+ }
+ c2 = sortedFlags[pos2];
+ }
+ }
+ }
+
char[] getFlags(int entryId) {
return ArrayUtil.copyOfSubArray(data, entryId + 1, entryId + 1 + data[entryId]);
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index 1770b03..b37b55d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
+import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
+import static org.apache.lucene.analysis.hunspell.Dictionary.HIDDEN_FLAG;
import java.util.ArrayList;
import java.util.Comparator;
@@ -29,6 +31,7 @@ import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
@@ -60,9 +63,15 @@ class GeneratingSuggester {
String word, WordCase originalCase) {
Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
- List<Root<String>> entries = new ArrayList<>();
+ EntryFilter filter = new EntryFilter(dictionary);
boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
- TrigramAutomaton automaton = new TrigramAutomaton(word);
+ TrigramAutomaton automaton =
+ new TrigramAutomaton(word) {
+ @Override
+ char transformChar(char c) {
+ return dictionary.caseFold(c);
+ }
+ };
dictionary.words.processAllWords(
word.length() + 4,
@@ -75,25 +84,29 @@ class GeneratingSuggester {
return;
}
- String root = rootChars.toString();
- filterSuitableEntries(root, forms, entries);
- if (entries.isEmpty()) return;
+ int suitable = filter.findSuitableFormIndex(forms, 0);
+ if (suitable < 0) return;
- if (ignoreTitleCaseRoots && WordCase.caseOf(rootChars) == WordCase.TITLE) {
+ if (ignoreTitleCaseRoots
+ && Character.isUpperCase(rootChars.charAt(0))
+ && WordCase.caseOf(rootChars) == WordCase.TITLE) {
return;
}
- String lower = dictionary.toLowerCase(root);
int sc =
- automaton.ngramScore(lower)
- - longerWorsePenalty(word, lower)
- + commonPrefix(word, root);
+ automaton.ngramScore(rootChars)
+ - longerWorsePenalty(word.length(), rootChars.length)
+ + commonPrefix(word, rootChars);
if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
return;
}
- entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
+ String root = rootChars.toString();
+ do {
+ roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
+ suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
+ } while (suitable > 0);
while (roots.size() > MAX_ROOTS) {
roots.poll();
}
@@ -102,17 +115,28 @@ class GeneratingSuggester {
return roots.stream().sorted().collect(Collectors.toList());
}
- private void filterSuitableEntries(String word, IntsRef forms, List<Root<String>> result) {
- result.clear();
- for (int i = 0; i < forms.length; i += dictionary.formStep()) {
- int entryId = forms.ints[forms.offset + i];
- if (dictionary.hasFlag(entryId, dictionary.forbiddenword)
- || dictionary.hasFlag(entryId, dictionary.noSuggest)
- || dictionary.hasFlag(entryId, Dictionary.HIDDEN_FLAG)
- || dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
- continue;
+ private static class EntryFilter {
+ private final int formStep;
+ private final FlagEnumerator.Lookup flagLookup;
+ private final char[] excludeFlags;
+
+ EntryFilter(Dictionary dic) {
+ formStep = dic.formStep();
+ flagLookup = dic.flagLookup;
+
+ Character[] flags = {HIDDEN_FLAG, dic.noSuggest, dic.forbiddenword, dic.onlyincompound};
+ excludeFlags =
+ Dictionary.toSortedCharArray(
+ Stream.of(flags).filter(c -> c != FLAG_UNSET).collect(Collectors.toSet()));
+ }
+
+ int findSuitableFormIndex(IntsRef forms, int start) {
+ for (int i = start; i < forms.length; i += formStep) {
+ if (!flagLookup.hasAnyFlag(forms.ints[forms.offset + i], excludeFlags)) {
+ return i;
+ }
}
- result.add(new Root<>(word, entryId));
+ return -1;
}
}
@@ -336,7 +360,7 @@ class GeneratingSuggester {
return result;
}
- static int commonPrefix(String s1, String s2) {
+ static int commonPrefix(CharSequence s1, CharSequence s2) {
int i = 0;
int limit = Math.min(s1.length(), s2.length());
while (i < limit && s1.charAt(i) == s2.charAt(i)) {
@@ -377,8 +401,8 @@ class GeneratingSuggester {
}
// NGRAM_LONGER_WORSE flag in Hunspell
- private static int longerWorsePenalty(String s1, String s2) {
- return Math.max((s2.length() - s1.length()) - 2, 0);
+ private static int longerWorsePenalty(int length1, int length2) {
+ return Math.max((length2 - length1) - 2, 0);
}
// NGRAM_ANY_MISMATCH flag in Hunspell
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
index 4235a1b..a83505e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.hunspell;
import java.util.HashMap;
import java.util.Map;
+import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -74,7 +75,7 @@ class TrigramAutomaton {
return state;
}
- int ngramScore(String s2) {
+ int ngramScore(CharsRef s2) {
countedSubstrings.clear(0, countedSubstrings.length());
int score1 = 0, score2 = 0, score3 = 0; // scores for substrings of length 1, 2 and 3
@@ -82,9 +83,9 @@ class TrigramAutomaton {
// states of running the automaton on substrings [i-1, i) and [i-2, i)
int state1 = -1, state2 = -1;
- int length = s2.length();
- for (int i = 0; i < length; i++) {
- char c = s2.charAt(i);
+ int limit = s2.length + s2.offset;
+ for (int i = s2.offset; i < limit; i++) {
+ char c = transformChar(s2.chars[i]);
int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
if (state3 > 0) {
@@ -112,6 +113,10 @@ class TrigramAutomaton {
return score;
}
+ char transformChar(char c) {
+ return c;
+ }
+
private int substringScore(int state, FixedBitSet countedSubstrings) {
if (countedSubstrings.getAndSet(state)) return 0;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java
index 60e708e..a424d23 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestTrigramAutomaton.java
@@ -17,6 +17,7 @@
package org.apache.lucene.analysis.hunspell;
import java.util.stream.Collectors;
+import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
public class TestTrigramAutomaton extends LuceneTestCase {
@@ -58,10 +59,10 @@ public class TestTrigramAutomaton extends LuceneTestCase {
assertEquals(
message,
GeneratingSuggester.ngramScore(3, s1, s2, false),
- new TrigramAutomaton(s1).ngramScore(s2));
+ new TrigramAutomaton(s1).ngramScore(new CharsRef(s2)));
assertEquals(
message,
GeneratingSuggester.ngramScore(3, s2, s1, false),
- new TrigramAutomaton(s2).ngramScore(s1));
+ new TrigramAutomaton(s2).ngramScore(new CharsRef(s1)));
}
}