You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/04/14 10:14:26 UTC

svn commit: r1587163 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/

Author: rmuir
Date: Mon Apr 14 08:14:26 2014
New Revision: 1587163

URL: http://svn.apache.org/r1587163
Log:
LUCENE-5603: fix hunspell to use FST efficiently

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1587163&r1=1587162&r2=1587163&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Apr 14 08:14:26 2014
@@ -16,6 +16,11 @@ API Changes
   IndexOutput.getFilePointer instead) and IndexOutput.setLength.
   (Mike McCandless)
 
+Optimizations
+
+* LUCENE-5603: hunspell stemmer more efficiently strips prefixes
+  and suffixes.  (Robert Muir)
+
 ======================= Lucene 4.8.0 =======================
 
 System Requirements

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1587163&r1=1587162&r2=1587163&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Mon Apr 14 08:14:26 2014
@@ -207,32 +207,16 @@ public class Dictionary {
     return lookup(words, word, offset, length);
   }
 
-  /**
-   * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
-   *
-   * @param word Char array to generate the String from
-   * @param offset Offset in the char array that the String starts at
-   * @param length Length from the offset that the String is
-   * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
-   */
+  // only for testing
   IntsRef lookupPrefix(char word[], int offset, int length) {
     return lookup(prefixes, word, offset, length);
   }
 
-  /**
-   * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
-   *
-   * @param word Char array to generate the String from
-   * @param offset Offset in the char array that the String starts at
-   * @param length Length from the offset that the String is
-   * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
-   */
+  // only for testing
   IntsRef lookupSuffix(char word[], int offset, int length) {
     return lookup(suffixes, word, offset, length);
   }
   
-  // TODO: this is pretty stupid, considering how the stemming algorithm works
-  // we can speed it up to be significantly faster!
   IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
     if (fst == null) {
       return null;
@@ -396,6 +380,7 @@ public class Dictionary {
     String args[] = header.split("\\s+");
 
     boolean crossProduct = args[2].equals("Y");
+    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
     
     int numLines = Integer.parseInt(args[3]);
     affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
@@ -501,6 +486,10 @@ public class Dictionary {
         affixArg = cleaned.toString();
       }
       
+      if (isSuffix) {
+        affixArg = new StringBuilder(affixArg).reverse().toString();
+      }
+      
       List<Character> list = affixes.get(affixArg);
       if (list == null) {
         list = new ArrayList<>();

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1587163&r1=1587162&r2=1587163&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Mon Apr 14 08:14:26 2014
@@ -31,6 +31,8 @@ import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.Version;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Outputs;
 
 /**
  * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.  It
@@ -54,6 +56,16 @@ final class Stemmer {
   public Stemmer(Dictionary dictionary) {
     this.dictionary = dictionary;
     this.affixReader = new ByteArrayDataInput(dictionary.affixData);
+    for (int level = 0; level < 3; level++) {
+      if (dictionary.prefixes != null) {
+        prefixArcs[level] = new FST.Arc<>();
+        prefixReaders[level] = dictionary.prefixes.getBytesReader();
+      }
+      if (dictionary.suffixes != null) {
+        suffixArcs[level] = new FST.Arc<>();
+        suffixReaders[level] = dictionary.suffixes.getBytesReader();
+      }
+    }
   } 
   
   /**
@@ -93,7 +105,11 @@ final class Stemmer {
         stems.add(newStem(word, length));
       }
     }
-    stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
+    try {
+      stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
+    } catch (IOException bogus) {
+      throw new RuntimeException(bogus);
+    }
     return stems;
   }
   
@@ -138,6 +154,16 @@ final class Stemmer {
 
   // ================================================= Helper Methods ================================================
 
+  // some state for traversing FSTs
+  final FST.BytesReader prefixReaders[] = new FST.BytesReader[3];
+  @SuppressWarnings("unchecked")
+  final FST.Arc<IntsRef> prefixArcs[] = new FST.Arc[3];
+  
+  final FST.BytesReader suffixReaders[] = new FST.BytesReader[3];
+  @SuppressWarnings("unchecked")
+  final FST.Arc<IntsRef> suffixArcs[] = new FST.Arc[3];
+
+  
   /**
    * Generates a list of stems for the provided word
    *
@@ -155,16 +181,33 @@ final class Stemmer {
    *        this means inner most suffix must also contain circumfix flag.
    * @return List of stems, or empty list if no stems are found
    */
-  private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) {
+  private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) throws IOException {
     
     // TODO: allow this stuff to be reused by tokenfilter
     List<CharsRef> stems = new ArrayList<>();
     
     if (doPrefix && dictionary.prefixes != null) {
-      for (int i = length - 1; i >= 0; i--) {
-        IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
-        if (prefixes == null) {
+      FST<IntsRef> fst = dictionary.prefixes;
+      Outputs<IntsRef> outputs = fst.outputs;
+      FST.BytesReader bytesReader = prefixReaders[recursionDepth];
+      FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
+      fst.getFirstArc(arc);
+      IntsRef NO_OUTPUT = outputs.getNoOutput();
+      IntsRef output = NO_OUTPUT;
+      for (int i = 0; i < length; i++) {
+        if (i > 0) {
+          int ch = word[i-1];
+          if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
+            break;
+          } else if (arc.output != NO_OUTPUT) {
+            output = fst.outputs.add(output, arc.output);
+          }
+        }
+        IntsRef prefixes = null;
+        if (!arc.isFinal()) {
           continue;
+        } else {
+          prefixes = fst.outputs.add(output, arc.nextFinalOutput);
         }
         
         for (int j = 0; j < prefixes.length; j++) {
@@ -218,10 +261,27 @@ final class Stemmer {
     } 
     
     if (doSuffix && dictionary.suffixes != null) {
-      for (int i = 0; i < length; i++) {
-        IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
-        if (suffixes == null) {
+      FST<IntsRef> fst = dictionary.suffixes;
+      Outputs<IntsRef> outputs = fst.outputs;
+      FST.BytesReader bytesReader = suffixReaders[recursionDepth];
+      FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
+      fst.getFirstArc(arc);
+      IntsRef NO_OUTPUT = outputs.getNoOutput();
+      IntsRef output = NO_OUTPUT;
+      for (int i = length; i >= 0; i--) {
+        if (i < length) {
+          int ch = word[i];
+          if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
+            break;
+          } else if (arc.output != NO_OUTPUT) {
+            output = fst.outputs.add(output, arc.output);
+          }
+        }
+        IntsRef suffixes = null;
+        if (!arc.isFinal()) {
           continue;
+        } else {
+          suffixes = fst.outputs.add(output, arc.nextFinalOutput);
         }
         
         for (int j = 0; j < suffixes.length; j++) {
@@ -313,7 +373,7 @@ final class Stemmer {
    * @param prefix true if we are removing a prefix (false if its a suffix)
    * @return List of stems for the word, or an empty list if none are found
    */
-  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {    
+  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) throws IOException {    
     // TODO: just pass this in from before, no need to decode it twice
     affixReader.setPosition(8 * affix);
     char flag = (char) (affixReader.readShort() & 0xffff);