You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/12 14:50:49 UTC

svn commit: r1576739 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/

Author: rmuir
Date: Wed Mar 12 13:50:49 2014
New Revision: 1576739

URL: http://svn.apache.org/r1576739
Log:
LUCENE-5518: minor hunspell optimizations

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1576739&r1=1576738&r2=1576739&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Wed Mar 12 13:50:49 2014
@@ -27,6 +27,8 @@ import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
@@ -54,6 +56,7 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -83,17 +86,16 @@ public class Dictionary {
   private static final String UTF8_FLAG_TYPE = "UTF-8";
   private static final String LONG_FLAG_TYPE = "long";
   
+  // TODO: really for suffixes we should reverse the automaton and run them backwards
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
   FST<IntsRef> prefixes;
   FST<IntsRef> suffixes;
   
-  // all Patterns used by prefixes and suffixes. these are typically re-used across
+  // all condition checks used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
-  // TODO: maybe don't use Pattern for the condition check...
-  // TODO: when we cut over Affix to FST, just store integer index to this.
-  ArrayList<Pattern> patterns = new ArrayList<Pattern>();
+  ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();
   
   // the entries in the .dic file, mapping to their set of flags.
   // the fst output is the ordinal list for flagLookup
@@ -103,7 +105,8 @@ public class Dictionary {
   BytesRefHash flagLookup = new BytesRefHash();
   
   // the list of unique strip affixes.
-  BytesRefHash stripLookup = new BytesRefHash();
+  char[] stripData;
+  int[] stripOffsets;
   
   // 8 bytes per affix
   byte[] affixData = new byte[64];
@@ -118,6 +121,7 @@ public class Dictionary {
   
   boolean ignoreCase;
   boolean complexPrefixes;
+  boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
   
   int circumfix = -1; // circumfix flag, or -1 if one is not defined
   
@@ -160,7 +164,6 @@ public class Dictionary {
     this.needsInputCleaning = ignoreCase;
     this.needsOutputCleaning = false; // set if we have an OCONV
     flagLookup.add(new BytesRef()); // no flags -> ord 0
-    stripLookup.add(new BytesRef()); // no strip -> ord 0
 
     File aff = File.createTempFile("affix", "aff", tempDir);
     OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
@@ -272,6 +275,14 @@ public class Dictionary {
     TreeMap<String, List<Character>> prefixes = new TreeMap<String, List<Character>>();
     TreeMap<String, List<Character>> suffixes = new TreeMap<String, List<Character>>();
     Map<String,Integer> seenPatterns = new HashMap<String,Integer>();
+    
+    // zero condition -> 0 ord
+    seenPatterns.put(".*", 0);
+    patterns.add(null);
+    
+    // zero strip -> 0 ord
+    Map<String,Integer> seenStrips = new LinkedHashMap<>();
+    seenStrips.put("", 0);
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
     String line = null;
@@ -283,9 +294,9 @@ public class Dictionary {
       if (line.startsWith(ALIAS_KEY)) {
         parseAlias(line);
       } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
+        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
       } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
+        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
       } else if (line.startsWith(FLAG_KEY)) {
         // Assume that the FLAG line comes before any prefix or suffixes
         // Store the strategy so it can be used when parsing the dic file
@@ -326,6 +337,22 @@ public class Dictionary {
     
     this.prefixes = affixFST(prefixes);
     this.suffixes = affixFST(suffixes);
+    
+    int totalChars = 0;
+    for (String strip : seenStrips.keySet()) {
+      totalChars += strip.length();
+    }
+    stripData = new char[totalChars];
+    stripOffsets = new int[seenStrips.size()+1];
+    int currentOffset = 0;
+    int currentIndex = 0;
+    for (String strip : seenStrips.keySet()) {
+      stripOffsets[currentIndex++] = currentOffset;
+      strip.getChars(0, strip.length(), stripData, currentOffset);
+      currentOffset += strip.length();
+    }
+    assert currentIndex == seenStrips.size();
+    stripOffsets[currentIndex] = currentOffset;
   }
   
   private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
@@ -360,7 +387,8 @@ public class Dictionary {
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
-                          Map<String,Integer> seenPatterns) throws IOException, ParseException {
+                          Map<String,Integer> seenPatterns,
+                          Map<String,Integer> seenStrips) throws IOException, ParseException {
     
     BytesRef scratch = new BytesRef();
     StringBuilder sb = new StringBuilder();
@@ -399,7 +427,10 @@ public class Dictionary {
         
         appendFlags = flagParsingStrategy.parseFlags(flagPart);
         Arrays.sort(appendFlags);
+        twoStageAffix = true;
       }
+      
+      // TODO: add test and fix zero-affix handling!
 
       String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
       // at least the gascon affix file has this issue
@@ -411,7 +442,16 @@ public class Dictionary {
         condition = condition.replace("-", "\\-");
       }
 
-      String regex = String.format(Locale.ROOT, conditionPattern, condition);
+      final String regex;
+      if (".".equals(condition)) {
+        regex = ".*"; // Zero condition is indicated by dot
+      } else if (condition.equals(strip)) {
+        regex = ".*"; // TODO: optimize this better:
+                      // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
+                      // but this is complicated...
+      } else {
+        regex = String.format(Locale.ROOT, conditionPattern, condition);
+      }
       
       // deduplicate patterns
       Integer patternIndex = seenPatterns.get(regex);
@@ -421,17 +461,17 @@ public class Dictionary {
           throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");          
         }
         seenPatterns.put(regex, patternIndex);
-        Pattern pattern = Pattern.compile(regex);
+        CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
         patterns.add(pattern);
       }
       
-      scratch.copyChars(strip);
-      int stripOrd = stripLookup.add(scratch);
-      if (stripOrd < 0) {
-        // already exists in our hash
-        stripOrd = (-stripOrd)-1;
-      } else if (stripOrd > Character.MAX_VALUE) {
-        throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
+      Integer stripOrd = seenStrips.get(strip);
+      if (stripOrd == null) {
+        stripOrd = seenStrips.size();
+        seenStrips.put(strip, stripOrd);
+        if (stripOrd > Character.MAX_VALUE) {
+          throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
+        }
       }
 
       if (appendFlags == null) {
@@ -449,7 +489,7 @@ public class Dictionary {
       }
       
       affixWriter.writeShort((short)flag);
-      affixWriter.writeShort((short)stripOrd);
+      affixWriter.writeShort((short)stripOrd.intValue());
       // encode crossProduct into patternIndex
       int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
       affixWriter.writeShort((short)patternOrd);
@@ -774,6 +814,9 @@ public class Dictionary {
   }
   
   static char[] decodeFlags(BytesRef b) {
+    if (b.length == 0) {
+      return CharsRef.EMPTY_CHARS;
+    }
     int len = b.length >>> 1;
     char flags[] = new char[len];
     int upto = 0;

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java?rev=1576739&r1=1576738&r2=1576739&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java Wed Mar 12 13:50:49 2014
@@ -22,7 +22,6 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
-import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.store.ByteArrayDataInput;
@@ -31,6 +30,7 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 
 /**
  * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.  It
@@ -160,7 +160,7 @@ final class Stemmer {
     // TODO: allow this stuff to be reused by tokenfilter
     List<CharsRef> stems = new ArrayList<CharsRef>();
     
-    if (doPrefix) {
+    if (doPrefix && dictionary.prefixes != null) {
       for (int i = length - 1; i >= 0; i--) {
         IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
         if (prefixes == null) {
@@ -197,12 +197,19 @@ final class Stemmer {
             int deAffixedStart = i;
             int deAffixedLength = length - deAffixedStart;
             
-            dictionary.stripLookup.get(stripOrd, scratch);
-            String strippedWord = new StringBuilder().append(scratch.utf8ToString())
-                .append(word, deAffixedStart, deAffixedLength)
-                .toString();
+            int stripStart = dictionary.stripOffsets[stripOrd];
+            int stripEnd = dictionary.stripOffsets[stripOrd+1];
+            int stripLength = stripEnd - stripStart;
             
-            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix);
+            if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
+              continue;
+            }
+            
+            char strippedWord[] = new char[stripLength + deAffixedLength];
+            System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
+            System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
+
+            List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix);
             
             stems.addAll(stemList);
           }
@@ -210,7 +217,7 @@ final class Stemmer {
       }
     } 
     
-    if (doSuffix) {
+    if (doSuffix && dictionary.suffixes != null) {
       for (int i = 0; i < length; i++) {
         IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
         if (suffixes == null) {
@@ -246,11 +253,20 @@ final class Stemmer {
           if (compatible) {
             int appendLength = length - i;
             int deAffixedLength = length - appendLength;
-            // TODO: can we do this in-place?
-            dictionary.stripLookup.get(stripOrd, scratch);
-            String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
             
-            List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix);
+            int stripStart = dictionary.stripOffsets[stripOrd];
+            int stripEnd = dictionary.stripOffsets[stripOrd+1];
+            int stripLength = stripEnd - stripStart;
+            
+            if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
+              continue;
+            }
+
+            char strippedWord[] = new char[stripLength + deAffixedLength];
+            System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
+            System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
+            
+            List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix);
             
             stems.addAll(stemList);
           }
@@ -260,6 +276,30 @@ final class Stemmer {
 
     return stems;
   }
+  
+  /** checks condition of the concatenation of two strings */
+  // note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
+  // but this is a little bit more complicated.
+  private boolean checkCondition(int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
+    if (condition != 0) {
+      CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
+      int state = pattern.getInitialState();
+      for (int i = c1off; i < c1off + c1len; i++) {
+        state = pattern.step(state, c1[i]);
+        if (state == -1) {
+          return false;
+        }
+      }
+      for (int i = c2off; i < c2off + c2len; i++) {
+        state = pattern.step(state, c2[i]);
+        if (state == -1) {
+          return false;
+        }
+      }
+      return pattern.isAccept(state);
+    }
+    return true;
+  }
 
   /**
    * Applies the affix rule to the given word, producing a list of stems if any are found
@@ -273,10 +313,7 @@ final class Stemmer {
    * @param prefix true if we are removing a prefix (false if its a suffix)
    * @return List of stems for the word, or an empty list if none are found
    */
-  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
-    segment.setLength(0);
-    segment.append(strippedWord, 0, length);
-    
+  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {    
     // TODO: just pass this in from before, no need to decode it twice
     affixReader.setPosition(8 * affix);
     char flag = (char) (affixReader.readShort() & 0xffff);
@@ -285,11 +322,6 @@ final class Stemmer {
     boolean crossProduct = (condition & 1) == 1;
     condition >>>= 1;
     char append = (char) (affixReader.readShort() & 0xffff);
-    
-    Pattern pattern = dictionary.patterns.get(condition);
-    if (!pattern.matcher(segment).matches()) {
-      return Collections.emptyList();
-    }
 
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
@@ -338,9 +370,9 @@ final class Stemmer {
         if (prefix) {
           // we took away the first prefix.
           // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix 
-          // COMPLEXPREFIXES = false: combine with another suffix
-          stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix));
-        } else if (!dictionary.complexPrefixes) {
+          // COMPLEXPREFIXES = false: combine with a suffix
+          stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
+        } else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
           // we took away a suffix.
           // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
           // COMPLEXPREFIXES = false: combine with another suffix
@@ -350,7 +382,7 @@ final class Stemmer {
         if (prefix && dictionary.complexPrefixes) {
           // we took away the second prefix: go look for another suffix
           stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
-        } else if (prefix == false && dictionary.complexPrefixes == false) {
+        } else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
           // we took away a prefix, then a suffix: go look for another suffix
           stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
         }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java?rev=1576739&r1=1576738&r2=1576739&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java Wed Mar 12 13:50:49 2014
@@ -171,7 +171,7 @@ public class TestAllDictionaries extends
           System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                              "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                              "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
-                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
                              "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
                              "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
                              "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java?rev=1576739&r1=1576738&r2=1576739&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java Wed Mar 12 13:50:49 2014
@@ -187,7 +187,7 @@ public class TestAllDictionaries2 extend
           System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                              "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                              "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
-                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
                              "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
                              "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
                              "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +