You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@lucene.apache.org by GitBox <gi...@apache.org> on 2021/02/09 10:00:55 UTC

[GitHub] [lucene-solr] dweiss commented on a change in pull request #2330: LUCENE-9748: Hunspell: suggest inflected dictionary entries similar t…

dweiss commented on a change in pull request #2330:
URL: https://github.com/apache/lucene-solr/pull/2330#discussion_r572744570



##########
File path: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
##########
@@ -33,44 +40,59 @@
  */
 class GeneratingSuggester {
   private static final int MAX_ROOTS = 100;
-  private static final int MAX_GUESSES = 100;
+  private static final int MAX_WORDS = 100;
+  private static final int MAX_GUESSES = 200;
   private final Dictionary dictionary;
+  private final SpellChecker speller;
 
-  GeneratingSuggester(Dictionary dictionary) {
-    this.dictionary = dictionary;
+  GeneratingSuggester(SpellChecker speller) {
+    this.dictionary = speller.dictionary;
+    this.speller = speller;
   }
 
   List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggestions) {
-    List<WeightedWord> roots = findSimilarDictionaryEntries(word, originalCase);
-    List<WeightedWord> expanded = expandRoots(word, roots);
-    TreeSet<WeightedWord> bySimilarity = rankBySimilarity(word, expanded);
+    List<Weighted<DictEntry>> roots = findSimilarDictionaryEntries(word, originalCase);
+    List<Weighted<String>> expanded = expandRoots(word, roots);
+    TreeSet<Weighted<String>> bySimilarity = rankBySimilarity(word, expanded);
     return getMostRelevantSuggestions(bySimilarity, prevSuggestions);
   }
 
-  private List<WeightedWord> findSimilarDictionaryEntries(String word, WordCase originalCase) {
-    try {
-      IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(dictionary.words);
-      TreeSet<WeightedWord> roots = new TreeSet<>();
+  private List<Weighted<DictEntry>> findSimilarDictionaryEntries(
+      String word, WordCase originalCase) {
+    TreeSet<Weighted<DictEntry>> roots = new TreeSet<>();
+    processFST(
+        dictionary.words,
+        (key, forms) -> {
+          if (Math.abs(key.length - word.length()) > 4) return;
+
+          String root = toString(key);
+          List<DictEntry> entries = filterSuitableEntries(root, forms);
+          if (entries.isEmpty()) return;
+
+          if (originalCase == WordCase.LOWER
+              && WordCase.caseOf(root) == WordCase.TITLE
+              && !dictionary.hasLanguage("de")) {
+            return;
+          }
 
-      IntsRefFSTEnum.InputOutput<IntsRef> mapping;
-      while ((mapping = fstEnum.next()) != null) {
-        IntsRef key = mapping.input;
-        if (Math.abs(key.length - word.length()) > 4 || !isSuitableRoot(mapping.output)) continue;
-
-        String root = toString(key);
-        if (originalCase == WordCase.LOWER
-            && WordCase.caseOf(root) == WordCase.TITLE
-            && !dictionary.hasLanguage("de")) {
-          continue;
-        }
+          String lower = dictionary.toLowerCase(root);
+          int sc =
+              ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE))
+                  + commonPrefix(word, root);
 
-        String lower = dictionary.toLowerCase(root);
-        int sc =
-            ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE)) + commonPrefix(word, root);
+          entries.forEach(e -> roots.add(new Weighted<>(e, sc)));
+        });
+    return roots.stream().limit(MAX_ROOTS).collect(Collectors.toList());
+  }
 
-        roots.add(new WeightedWord(root, sc));
+  private void processFST(FST<IntsRef> fst, BiConsumer<IntsRef, IntsRef> keyValueConsumer) {

Review comment:
       Maybe add a `forEach` method to the FST enum (`IntsRefFSTEnum`)? It would then correspond to the `forEach` convention of Java collections.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org