You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2023/01/13 11:48:58 UTC
[lucene] 05/09: [hunspell] speed up WordFormGenerator (#11904)

This is an automated email from the ASF dual-hosted git repository.

donnerpeter pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit b9ff38e37d9f5ddf39c467a0a49f684d87966425
Author: Peter Gromov <pe...@jetbrains.com>
AuthorDate: Mon Nov 7 19:41:17 2022 +0100

    [hunspell] speed up WordFormGenerator (#11904)
---
 .../lucene/analysis/hunspell/AffixCondition.java   |  26 ++++-
 .../lucene/analysis/hunspell/FlagEnumerator.java   |  21 ++--
 .../analysis/hunspell/WordFormGenerator.java       | 125 +++++++++++++--------
 3 files changed, 112 insertions(+), 60 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
index 25cb2933990..6a00d925c62 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
@@ -30,8 +30,30 @@ import org.apache.lucene.util.automaton.RegExp;
  */
 interface AffixCondition {
   String ALWAYS_TRUE_KEY = ".*";
-  AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;
-  AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;
+  AffixCondition ALWAYS_TRUE =
+      new AffixCondition() {
+        @Override
+        public boolean acceptsStem(String stem) {
+          return true;
+        }
+
+        @Override
+        public boolean acceptsStem(char[] word, int offset, int length) {
+          return true;
+        }
+      };
+  AffixCondition ALWAYS_FALSE =
+      new AffixCondition() {
+        @Override
+        public boolean acceptsStem(String stem) {
+          return false;
+        }
+
+        @Override
+        public boolean acceptsStem(char[] word, int offset, int length) {
+          return false;
+        }
+      };
 
   default boolean acceptsStem(String stem) {
     return acceptsStem(stem.toCharArray(), 0, stem.length());
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
index bb31b5ba3da..a7cd45c4a14 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/FlagEnumerator.java
@@ -60,6 +60,17 @@ class FlagEnumerator {
     return new Lookup(result);
   }
 
+  static boolean hasFlagInSortedArray(char flag, char[] array, int start, int length) {
+    if (flag == Dictionary.FLAG_UNSET) return false;
+
+    for (int i = start; i < start + length; i++) {
+      char c = array[i];
+      if (c == flag) return true;
+      if (c > flag) return false;
+    }
+    return false;
+  }
+
   static class Lookup {
     private final char[] data;
 
@@ -68,15 +79,7 @@ class FlagEnumerator {
     }
 
     boolean hasFlag(int entryId, char flag) {
-      if (entryId < 0 || flag == Dictionary.FLAG_UNSET) return false;
-
-      int length = data[entryId];
-      for (int i = entryId + 1; i < entryId + 1 + length; i++) {
-        char c = data[i];
-        if (c == flag) return true;
-        if (c > flag) return false;
-      }
-      return false;
+      return entryId >= 0 && hasFlagInSortedArray(flag, data, entryId + 1, data[entryId]);
     }
 
     boolean hasAnyFlag(int entryId, char[] sortedFlags) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
index 043df4fbb1d..bc0531edb89 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java
@@ -19,10 +19,12 @@ package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
 import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
+import static org.apache.lucene.analysis.hunspell.Dictionary.toSortedCharArray;
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -109,14 +111,14 @@ public class WordFormGenerator {
    *     by throwing an exception
    */
   public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
-    Set<AffixedWord> result = new LinkedHashSet<>();
+    List<AffixedWord> result = new ArrayList<>();
     DictEntries entries = dictionary.lookupEntries(root);
     if (entries != null) {
       for (DictEntry entry : entries) {
         result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
       }
     }
-    return new ArrayList<>(result);
+    return result;
   }
 
   /**
@@ -128,20 +130,40 @@ public class WordFormGenerator {
    *     by throwing an exception
    */
   public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
-    var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags));
+    var encodedFlags = dictionary.flagParsingStrategy.parseUtfFlags(flags);
     if (!shouldConsiderAtAll(encodedFlags)) return List.of();
 
-    LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
+    encodedFlags = sortAndDeduplicate(encodedFlags);
+    List<AffixedWord> result = new ArrayList<>();
     AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
     checkCanceled.run();
-    if (!encodedFlags.contains(dictionary.needaffix)) {
+    if (!FlagEnumerator.hasFlagInSortedArray(
+        dictionary.needaffix, encodedFlags, 0, encodedFlags.length)) {
       result.add(bare);
     }
     result.addAll(expand(bare, encodedFlags, checkCanceled));
-    return new ArrayList<>(result);
+    return result;
+  }
+
+  private static char[] sortAndDeduplicate(char[] flags) {
+    Arrays.sort(flags);
+    for (int i = 1; i < flags.length; i++) {
+      if (flags[i] == flags[i - 1]) {
+        return deduplicate(flags);
+      }
+    }
+    return flags;
+  }
+
+  private static char[] deduplicate(char[] flags) {
+    Set<Character> set = new HashSet<>();
+    for (char flag : flags) {
+      set.add(flag);
+    }
+    return toSortedCharArray(set);
   }
 
-  private boolean canStemToOriginal(AffixedWord derived) {
+  protected boolean canStemToOriginal(AffixedWord derived) {
     String word = derived.getWord();
     char[] chars = word.toCharArray();
     if (isForbiddenWord(chars, 0, chars.length)) {
@@ -190,26 +212,20 @@ public class WordFormGenerator {
     return false;
   }
 
-  private static LinkedHashSet<Character> toSet(char[] flags) {
-    LinkedHashSet<Character> set = new LinkedHashSet<>();
-    for (char c : flags) {
-      set.add(c);
-    }
-    return set;
-  }
-
-  private LinkedHashSet<AffixedWord> expand(
-      AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
-    LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
-    for (Character flag : flags) {
+  private List<AffixedWord> expand(AffixedWord stem, char[] flags, Runnable checkCanceled) {
+    List<AffixedWord> result = new ArrayList<>();
+    for (char flag : flags) {
       List<AffixEntry> entries = affixes.get(flag);
       if (entries == null) continue;
 
+      AffixKind kind = entries.get(0).kind;
+      if (!isCompatibleWithPreviousAffixes(stem, kind, flag)) continue;
+
       for (AffixEntry affix : entries) {
         checkCanceled.run();
         AffixedWord derived = affix.apply(stem, dictionary);
         if (derived != null) {
-          LinkedHashSet<Character> append = appendFlags(affix);
+          char[] append = appendFlags(affix);
           if (shouldConsiderAtAll(append)) {
             if (canStemToOriginal(derived)) {
               result.add(derived);
@@ -224,25 +240,37 @@ public class WordFormGenerator {
     return result;
   }
 
-  private boolean shouldConsiderAtAll(Set<Character> flags) {
-    return !flags.contains(dictionary.compoundBegin)
-        && !flags.contains(dictionary.compoundMiddle)
-        && !flags.contains(dictionary.compoundEnd)
-        && !flags.contains(dictionary.forbiddenword)
-        && !flags.contains(dictionary.onlyincompound);
+  private boolean shouldConsiderAtAll(char[] flags) {
+    for (char flag : flags) {
+      if (flag == dictionary.compoundBegin
+          || flag == dictionary.compoundMiddle
+          || flag == dictionary.compoundEnd
+          || flag == dictionary.forbiddenword
+          || flag == dictionary.onlyincompound) {
+        return false;
+      }
+    }
+
+    return true;
   }
 
-  private LinkedHashSet<Character> updateFlags(
-      Set<Character> flags, Character toRemove, Set<Character> toAppend) {
-    LinkedHashSet<Character> copy = new LinkedHashSet<>(flags);
-    copy.remove(toRemove);
-    copy.addAll(toAppend);
-    return copy;
+  private char[] updateFlags(char[] flags, char toRemove, char[] toAppend) {
+    char[] result = new char[flags.length + toAppend.length - 1];
+    int index = 0;
+    for (char flag : flags) {
+      if (flag != toRemove && flag != dictionary.needaffix) {
+        result[index++] = flag;
+      }
+    }
+    for (char flag : toAppend) {
+      result[index++] = flag;
+    }
+    return sortAndDeduplicate(result);
   }
 
-  private LinkedHashSet<Character> appendFlags(AffixEntry affix) {
+  private char[] appendFlags(AffixEntry affix) {
     char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
-    return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId));
+    return appendId == 0 ? new char[0] : dictionary.flagLookup.getFlags(appendId);
   }
 
   /**
@@ -272,9 +300,8 @@ public class WordFormGenerator {
 
   private record AffixEntry(
       int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
-    AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
-      if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null;
 
+    AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
       String word = stem.getWord();
       boolean isPrefix = kind == AffixKind.PREFIX;
       if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
@@ -286,24 +313,24 @@ public class WordFormGenerator {
       if (!condition.acceptsStem(stripped)) return null;
 
       String applied = isPrefix ? affix + stripped : stripped + affix;
-      List<Affix> prefixes = new ArrayList<>(stem.getPrefixes());
-      List<Affix> suffixes = new ArrayList<>(stem.getSuffixes());
+      List<Affix> prefixes = isPrefix ? new ArrayList<>(stem.getPrefixes()) : stem.getPrefixes();
+      List<Affix> suffixes = isPrefix ? stem.getSuffixes() : new ArrayList<>(stem.getSuffixes());
       (isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
       return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
     }
+  }
 
-    private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) {
-      boolean isPrefix = kind == AffixKind.PREFIX;
-      List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
-      if (sameAffixes.size() == 2) return false;
-      if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false;
-      if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
-      if (sameAffixes.size() == 1
-          && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
-        return false;
-      }
-      return true;
+  private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, AffixKind kind, char flag) {
+    boolean isPrefix = kind == AffixKind.PREFIX;
+    List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
+    int size = sameAffixes.size();
+    if (size == 2) return false;
+    if (isPrefix && size == 1 && !dictionary.complexPrefixes) return false;
+    if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
+    if (size == 1 && !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
+      return false;
     }
+    return true;
   }
 
   private class WordCompressor {