Posted to commits@lucene.apache.org by mi...@apache.org on 2012/10/30 17:58:06 UTC

svn commit: r1403780 - in /lucene/dev/branches/branch_4x: ./ dev-tools/ lucene/ lucene/analysis/ lucene/benchmark/ lucene/codecs/ lucene/core/ lucene/core/src/java/org/apache/lucene/analysis/ lucene/core/src/java/org/apache/lucene/util/automaton/ lucen...

Author: mikemccand
Date: Tue Oct 30 16:58:05 2012
New Revision: 1403780

URL: http://svn.apache.org/viewvc?rev=1403780&view=rev
Log:
LUCENE-3846: add new FuzzySuggester

Added:
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
      - copied unchanged from r1403779, lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
      - copied unchanged from r1403779, lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/dev-tools/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/benchmark/   (props changed)
    lucene/dev/branches/branch_4x/lucene/build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/codecs/   (props changed)
    lucene/dev/branches/branch_4x/lucene/common-build.xml   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
    lucene/dev/branches/branch_4x/lucene/facet/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/memory/   (props changed)
    lucene/dev/branches/branch_4x/lucene/queries/   (props changed)
    lucene/dev/branches/branch_4x/lucene/site/   (props changed)
    lucene/dev/branches/branch_4x/lucene/spatial/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/example/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/   (props changed)
    lucene/dev/branches/branch_4x/solr/site/   (props changed)
    lucene/dev/branches/branch_4x/solr/solrj/   (props changed)
    lucene/dev/branches/branch_4x/solr/test-framework/   (props changed)

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Oct 30 16:58:05 2012
@@ -39,6 +39,10 @@ New Features
   for better search performance. 
   (Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)
 
+* LUCENE-3846: New FuzzySuggester, like AnalyzingSuggester except it
+  also finds completions allowing for fuzzy edits in the input string.
+  (Robert Muir, Simon Willnauer, Mike McCandless)
+
 API Changes
 
 * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
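
The commit log and CHANGES entry describe the feature; for orientation, here is a minimal usage sketch. It assumes the TermFreq/TermFreqArrayIterator helpers from the suggest module and the single-Analyzer FuzzySuggester constructor, so treat the exact names and overloads as assumptions rather than the committed API:

    import java.util.List;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.search.suggest.Lookup.LookupResult;
    import org.apache.lucene.search.suggest.TermFreq;
    import org.apache.lucene.search.suggest.TermFreqArrayIterator;
    import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
    import org.apache.lucene.util.Version;

    public class FuzzySuggesterDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new WhitespaceAnalyzer(Version.LUCENE_40);
        FuzzySuggester suggester = new FuzzySuggester(a);

        // Build the suggester from weighted surface forms:
        suggester.build(new TermFreqArrayIterator(new TermFreq[] {
            new TermFreq("lucene", 50),
            new TermFreq("lucid", 10)}));

        // "lucne" is one edit away from "lucene", so it still completes:
        List<LookupResult> results = suggester.lookup("lucne", false, 5);
        for (LookupResult result : results) {
          System.out.println(result.key + " weight=" + result.value);
        }
      }
    }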

Modified: lucene/dev/branches/branch_4x/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/common-build.xml?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/common-build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/common-build.xml Tue Oct 30 16:58:05 2012
@@ -833,7 +833,7 @@
             <assertions>
               <enable package="org.apache.lucene"/>
               <enable package="org.apache.solr"/>
-            </assertions>
+            </assertions>  
 
             <!-- JVM arguments and system properties. -->
             <jvmarg line="${args}"/>

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java Tue Oct 30 16:58:05 2012
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
 
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
@@ -88,6 +89,7 @@ public class TokenStreamToAutomaton {
     final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
     final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
     final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+
     final BytesRef term = termBytesAtt.getBytesRef();
 
     in.reset();

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java Tue Oct 30 16:58:05 2012
@@ -240,6 +240,20 @@ final public class BasicAutomata {
     a.deterministic = true;
     return a;
   }
+  
+  public static Automaton makeString(int[] word, int offset, int length) {
+    Automaton a = new Automaton();
+    a.setDeterministic(true);
+    State s = new State();
+    a.initial = s;
+    for (int i = offset; i < offset+length; i++) {
+      State s2 = new State();
+      s.addTransition(new Transition(word[i], s2));
+      s = s2;
+    }
+    s.accept = true;
+    return a;
+  }
 
   /**
    * Returns a new (deterministic and minimal) automaton that accepts the union
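
The new makeString(int[], int, int) overload above builds a linear chain automaton straight from a slice of symbols, skipping the String round trip; the n == 0 case in LevenshteinAutomata (below) needs this because its symbols may be UTF-8 bytes rather than code points. A small hedged sketch of using it, assuming BasicOperations.run for verification:

    // Accepts exactly the symbol sequence 'f','o','o'; symbols may be
    // code points or, for byte-based automata, values 0..255:
    int[] word = { 'f', 'o', 'o' };
    Automaton a = BasicAutomata.makeString(word, 0, word.length);
    assert BasicOperations.run(a, "foo");   // accepted
    assert !BasicOperations.run(a, "fo");   // rejected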

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java Tue Oct 30 16:58:05 2012
@@ -33,12 +33,13 @@ public class LevenshteinAutomata {
   /** @lucene.internal */
   public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
   /* input word */
-  final String input;
   final int word[];
   /* the automata alphabet. */
   final int alphabet[];
+  /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
+  final int alphaMax;
 
-  /* the unicode ranges outside of alphabet */
+  /* the ranges outside of alphabet */
   final int rangeLower[];
   final int rangeUpper[];
   int numRanges = 0;
@@ -50,17 +51,26 @@ public class LevenshteinAutomata {
    * Optionally count transpositions as a primitive edit.
    */
   public LevenshteinAutomata(String input, boolean withTranspositions) {
-    this.input = input;
-    int length = Character.codePointCount(input, 0, input.length());
-    word = new int[length];
-    for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
-      word[j++] = cp = input.codePointAt(i);
-    }
-    
+    this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions);
+  }
+
+  /**
+   * Expert: specify a custom maximum possible symbol
+   * (alphaMax); default is Character.MAX_CODE_POINT.
+   */
+  public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
+    this.word = word;
+    this.alphaMax = alphaMax;
+
     // calculate the alphabet
     SortedSet<Integer> set = new TreeSet<Integer>();
-    for (int i = 0; i < word.length; i++)
-      set.add(word[i]);
+    for (int i = 0; i < word.length; i++) {
+      int v = word[i];
+      if (v > alphaMax) {
+        throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
+      }
+      set.add(v);
+    }
     alphabet = new int[set.size()];
     Iterator<Integer> iterator = set.iterator();
     for (int i = 0; i < alphabet.length; i++)
@@ -81,9 +91,9 @@ public class LevenshteinAutomata {
       lower = higher + 1;
     }
     /* add the final endpoint */
-    if (lower <= Character.MAX_CODE_POINT) {
+    if (lower <= alphaMax) {
       rangeLower[numRanges] = lower;
-      rangeUpper[numRanges] = Character.MAX_CODE_POINT;
+      rangeUpper[numRanges] = alphaMax;
       numRanges++;
     }
 
@@ -94,6 +104,15 @@ public class LevenshteinAutomata {
     };
   }
   
+  private static int[] codePoints(String input) {
+    int length = Character.codePointCount(input, 0, input.length());
+    int word[] = new int[length];
+    for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
+      word[j++] = cp = input.codePointAt(i);
+    }
+    return word;
+  }
+  
   /**
    * Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
    * <p>
@@ -106,8 +125,9 @@ public class LevenshteinAutomata {
    * </p>
    */
   public Automaton toAutomaton(int n) {
-    if (n == 0)
-      return BasicAutomata.makeString(input);
+    if (n == 0) {
+      return BasicAutomata.makeString(word, 0, word.length);
+    }
     
     if (n >= descriptions.length)
       return null;
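
Threading alphaMax through the construction lets the same tables drive a byte-level automaton: FuzzySuggester intersects against an FST whose arc labels are UTF-8 bytes, so it caps the alphabet at 255 instead of Character.MAX_CODE_POINT. A sketch under that assumption (converting the word to UTF-8 byte labels is the caller's responsibility):

    // Byte-level Levenshtein DFA: labels are UTF-8 bytes, so alphaMax = 0xFF.
    int[] utf8 = { 'l', 'u', 'c', 'e', 'n', 'e' };  // ASCII, so bytes == code points
    LevenshteinAutomata builder = new LevenshteinAutomata(utf8, 0xFF, true);
    Automaton within1 = builder.toAutomaton(1);  // null if n exceeds the supported distance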

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Tue Oct 30 16:58:05 2012
@@ -22,6 +22,8 @@ import java.util.*;
 
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.FST.Arc;
+import org.apache.lucene.util.fst.FST.BytesReader;
 
 /** Static helper methods.
  *
@@ -304,7 +306,10 @@ public final class Util {
           path.input.ints[path.input.length++] = path.arc.label;
           final int cmp = bottom.input.compareTo(path.input);
           path.input.length--;
+
+          // We should never see dups:
           assert cmp != 0;
+
           if (cmp < 0) {
             // Doesn't compete
             return;
@@ -846,4 +851,93 @@ public final class Util {
     w.close();
   }
   */
+
+  /**
+   * Reads the first arc whose label is greater than or equal to the given
+   * label into the provided arc in place, and returns it if found; otherwise
+   * returns <code>null</code>.
+   * 
+   * @param label the label to ceil on
+   * @param fst the fst to operate on
+   * @param follow the arc to follow reading the label from
+   * @param arc the arc to read into in place
+   * @param in the fst's {@link BytesReader}
+   */
+  public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow,
+      Arc<T> arc, BytesReader in) throws IOException {
+    // TODO: maybe this would be useful in the FST class - we could simplify some other code like FSTEnum?
+    if (label == FST.END_LABEL) {
+      if (follow.isFinal()) {
+        if (follow.target <= 0) {
+          arc.flags = FST.BIT_LAST_ARC;
+        } else {
+          arc.flags = 0;
+          // NOTE: nextArc is a node (not an address!) in this case:
+          arc.nextArc = follow.target;
+          arc.node = follow.target;
+        }
+        arc.output = follow.nextFinalOutput;
+        arc.label = FST.END_LABEL;
+        return arc;
+      } else {
+        return null;
+      }
+    }
+
+    if (!FST.targetHasArcs(follow)) {
+      return null;
+    }
+    fst.readFirstTargetArc(follow, arc, in);
+    if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
+      // Arcs are fixed array -- use binary search to find
+      // the target.
+
+      int low = arc.arcIdx;
+      int high = arc.numArcs - 1;
+      int mid = 0;
+      // System.out.println("do arc array low=" + low + " high=" + high +
+      // " targetLabel=" + targetLabel);
+      while (low <= high) {
+        mid = (low + high) >>> 1;
+        in.pos = arc.posArcsStart;
+        in.skip(arc.bytesPerArc * mid + 1);
+        final int midLabel = fst.readLabel(in);
+        final int cmp = midLabel - label;
+        // System.out.println("  cycle low=" + low + " high=" + high + " mid=" +
+        // mid + " midLabel=" + midLabel + " cmp=" + cmp);
+        if (cmp < 0) {
+          low = mid + 1;
+        } else if (cmp > 0) {
+          high = mid - 1;
+        } else {
+          arc.arcIdx = mid-1;
+          return fst.readNextRealArc(arc, in);
+        }
+      }
+      if (low == arc.numArcs) {
+        // DEAD END!
+        return null;
+      }
+      
+      arc.arcIdx = (low > high ? high : low);
+      return fst.readNextRealArc(arc, in);
+    }
+
+    // Linear scan
+    fst.readFirstRealTargetArc(follow.target, arc, in);
+
+    while (true) {
+      // System.out.println("  non-bs cycle");
+      // TODO: we should fix this code to not have to create
+      // object for the output of every arc we scan... only
+      // for the matching arc, if found
+      if (arc.label >= label) {
+        // System.out.println("    found!");
+        return arc;
+      } else if (arc.isLast()) {
+        return null;
+      } else {
+        fst.readNextRealArc(arc, in);
+      }
+    }
+  }
 }
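
readCeilArc is the FST counterpart of SortedMap.ceilingKey: starting from follow's target node, it positions arc on the first outgoing arc whose label is greater than or equal to label, binary-searching fixed arc arrays and falling back to a linear scan otherwise. A hedged sketch of a caller, using only methods that appear in this patch (the FST<Long> shape is illustrative):

    FST.Arc<Long> follow = fst.getFirstArc(new FST.Arc<Long>());
    FST.Arc<Long> scratch = new FST.Arc<Long>();
    FST.BytesReader reader = fst.getBytesReader(0);

    // First arc leaving the root whose label is >= 'm', or null if none:
    FST.Arc<Long> ceil = Util.readCeilArc('m', fst, follow, scratch, reader);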

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Tue Oct 30 16:58:05 2012
@@ -31,6 +31,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.search.spell.TermFreqIterator;
 import org.apache.lucene.search.suggest.Lookup;
 import org.apache.lucene.search.suggest.fst.Sort;
@@ -310,7 +311,7 @@ public class AnalyzingSuggester extends 
     }
   }
 
-  private TokenStreamToAutomaton getTokenStreamToAutomaton() {
+  TokenStreamToAutomaton getTokenStreamToAutomaton() {
     if (preserveSep) {
       return new EscapingTokenStreamToAutomaton();
     } else {
@@ -332,6 +333,7 @@ public class AnalyzingSuggester extends 
     BytesRef scratch = new BytesRef();
 
     TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
+
     // analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short) 
     boolean success = false;
     byte buffer[] = new byte[8];
@@ -339,29 +341,8 @@ public class AnalyzingSuggester extends 
       ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
       BytesRef surfaceForm;
       while ((surfaceForm = iterator.next()) != null) {
-
-        // Analyze surface form:
-        TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
-
-        // Create corresponding automaton: labels are bytes
-        // from each analyzed token, with byte 0 used as
-        // separator between tokens:
-        Automaton automaton = ts2a.toAutomaton(ts);
-        ts.end();
-        ts.close();
-
-        replaceSep(automaton);
-
-        assert SpecialOperations.isFinite(automaton);
-
-        // Get all paths from the automaton (there can be
-        // more than one path, eg if the analyzer created a
-        // graph using SynFilter or WDF):
-
-        // TODO: we could walk & add simultaneously, so we
-        // don't have to alloc [possibly biggish]
-        // intermediate HashSet in RAM:
-        Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
+        
         maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
 
         for (IntsRef path : paths) {
@@ -510,27 +491,10 @@ public class AnalyzingSuggester extends 
     }
 
     //System.out.println("lookup key=" + key + " num=" + num);
-
+    final BytesRef utf8Key = new BytesRef(key);
     try {
 
-      // TODO: is there a Reader from a CharSequence?
-      // Turn tokenstream into automaton:
-      TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
-      Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts);
-      ts.end();
-      ts.close();
-
-      // TODO: we could use the end offset to "guess"
-      // whether the final token was a partial token; this
-      // would only be a heuristic ... but maybe an OK one.
-      // This way we could eg differentiate "net" from "net ",
-      // which we can't today...
-
-      replaceSep(automaton);
-
-      // TODO: we can optimize this somewhat by determinizing
-      // while we convert
-      BasicOperations.determinize(automaton);
+      Automaton lookupAutomaton = toLookupAutomaton(key);
 
       final CharsRef spare = new CharsRef();
 
@@ -538,8 +502,7 @@ public class AnalyzingSuggester extends 
     
       // Intersect automaton w/ suggest wFST and get all
       // prefix starting nodes & their outputs:
-      final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths;
-      prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
+      //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
 
       //System.out.println("  prefixPaths: " + prefixPaths.size());
 
@@ -549,6 +512,8 @@ public class AnalyzingSuggester extends 
 
       final List<LookupResult> results = new ArrayList<LookupResult>();
 
+      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
+
       if (exactFirst) {
 
         int count = 0;
@@ -593,9 +558,9 @@ public class AnalyzingSuggester extends 
         // nodes we have and the
         // maxSurfaceFormsPerAnalyzedForm:
         for(MinResult<Pair<Long,BytesRef>> completion : completions) {
-          spare.grow(completion.output.output2.length);
-          UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
-          if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+          if (utf8Key.bytesEquals(completion.output.output2)) {
+            spare.grow(completion.output.output2.length);
+            UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
             results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
             break;
           }
@@ -630,9 +595,7 @@ public class AnalyzingSuggester extends 
             // In exactFirst mode, don't accept any paths
             // matching the surface form since that will
             // create duplicate results:
-            spare.grow(output.output2.length);
-            UnicodeUtil.UTF8toUTF16(output.output2, spare);
-            if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+            if (utf8Key.bytesEquals(output.output2)) {
               // We found exact match, which means we should
               // have already found it in the first search:
               assert results.size() == 1;
@@ -644,6 +607,8 @@ public class AnalyzingSuggester extends 
         }
       };
 
+      prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
+      
       for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
         searcher.addStartPaths(path.fstNode, path.output, true, path.input);
       }
@@ -654,6 +619,10 @@ public class AnalyzingSuggester extends 
         spare.grow(completion.output.output2.length);
         UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
         LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
+
+        // TODO: for fuzzy case would be nice to return
+        // how many edits were required
+
         //System.out.println("    result=" + result);
         results.add(result);
 
@@ -670,6 +639,63 @@ public class AnalyzingSuggester extends 
     }
   }
 
+  /** Returns all prefix paths to initialize the search. */
+  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
+                                                                       Automaton lookupAutomaton,
+                                                                       FST<Pair<Long,BytesRef>> fst)
+    throws IOException {
+    return prefixPaths;
+  }
+  
+  final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
+    // Analyze surface form:
+    TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
+
+    // Create corresponding automaton: labels are bytes
+    // from each analyzed token, with byte 0 used as
+    // separator between tokens:
+    Automaton automaton = ts2a.toAutomaton(ts);
+    ts.end();
+    ts.close();
+
+    replaceSep(automaton);
+
+    assert SpecialOperations.isFinite(automaton);
+
+    // Get all paths from the automaton (there can be
+    // more than one path, eg if the analyzer created a
+    // graph using SynFilter or WDF):
+
+    // TODO: we could walk & add simultaneously, so we
+    // don't have to alloc [possibly biggish]
+    // intermediate HashSet in RAM:
+    return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+  }
+
+  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
+    // TODO: is there a Reader from a CharSequence?
+    // Turn tokenstream into automaton:
+    TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
+    Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts);
+    ts.end();
+    ts.close();
+
+    // TODO: we could use the end offset to "guess"
+    // whether the final token was a partial token; this
+    // would only be a heuristic ... but maybe an OK one.
+    // This way we could eg differentiate "net" from "net ",
+    // which we can't today...
+
+    replaceSep(automaton);
+
+    // TODO: we can optimize this somewhat by determinizing
+    // while we convert
+    BasicOperations.determinize(automaton);
+    return automaton;
+  }
+
   /**
    * Returns the weight associated with an input string,
    * or null if it does not exist.
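
This refactoring is what makes FuzzySuggester possible as a subclass: toFiniteStrings and toLookupAutomaton factor out the analysis steps, and getFullPrefixPaths is the override hook. Per the commit log, FuzzySuggester widens the start paths by intersecting the FST with a Levenshtein automaton derived from the lookup automaton; a rough sketch of the override's shape, where toLevenshteinAutomata is a stand-in for that expansion, not the committed code:

    @Override
    protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(
        List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
        Automaton lookupAutomaton,
        FST<Pair<Long,BytesRef>> fst) throws IOException {
      // Expand the exact-match automaton to also accept anything within
      // the configured edit distance, then redo the intersection:
      Automaton levAutomaton = toLevenshteinAutomata(lookupAutomaton);
      return FSTUtil.intersectPrefixPaths(levAutomaton, fst);
    }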

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java Tue Oct 30 16:58:05 2012
@@ -26,6 +26,7 @@ import org.apache.lucene.util.automaton.
 import org.apache.lucene.util.automaton.State;
 import org.apache.lucene.util.automaton.Transition;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Util;
 
 // TODO: move to core?  nobody else uses it yet though...
 
@@ -62,57 +63,78 @@ public class FSTUtil {
     }
   }
 
-  /** Enumerates all paths in the automaton that also
-   *  intersect the FST, accumulating the FST end node and
-   *  output for each path. */
-  public static<T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
+  /**
+   * Enumerates all minimal prefix paths in the automaton that also intersect the FST,
+   * accumulating the FST end node and output for each path.
+   */
+  public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
+      throws IOException {
+    assert a.isDeterministic();
     final List<Path<T>> queue = new ArrayList<Path<T>>();
     final List<Path<T>> endNodes = new ArrayList<Path<T>>();
-
-    queue.add(new Path<T>(a.getInitialState(),
-                          fst.getFirstArc(new FST.Arc<T>()),       
-                          fst.outputs.getNoOutput(),
-                          new IntsRef()));
-
+    queue.add(new Path<T>(a.getInitialState(),
+        fst.getFirstArc(new FST.Arc<T>()),
+        fst.outputs.getNoOutput(), new IntsRef()));
+    
     final FST.Arc<T> scratchArc = new FST.Arc<T>();
     final FST.BytesReader fstReader = fst.getBytesReader(0);
-
-    //System.out.println("fst/a intersect");
-
+    
     while (queue.size() != 0) {
-      final Path<T> path = queue.remove(queue.size()-1);
-      //System.out.println("  cycle path=" + path);
+      final Path<T> path = queue.remove(queue.size() - 1);
       if (path.state.isAccept()) {
         endNodes.add(path);
+        // We can stop expanding this path here: once a path reaches
+        // an accept state, all of its extensions are accepted too.
+        continue;
       }
-
+      
       IntsRef currentInput = path.input;
-      for(Transition t : path.state.getTransitions()) {
-        
-        // TODO: we can fix this if necessary:
-        if (t.getMin() != t.getMax()) {
-          throw new IllegalStateException("can only handle Transitions that match one character");
-        }
-
-        //System.out.println("    t=" + (char) t.getMin());
-
-        final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(), path.fstNode, scratchArc, fstReader);
-        if (nextArc != null) {
-          //System.out.println("      fst matches");
-          // Path continues:
-          IntsRef newInput = new IntsRef(currentInput.length + 1);
-          newInput.copyInts(currentInput);
-          newInput.ints[currentInput.length] = t.getMin();
-          newInput.length = currentInput.length + 1;
-
-          queue.add(new Path<T>(t.getDest(),
-                                new FST.Arc<T>().copyFrom(nextArc),
-                                fst.outputs.add(path.output, nextArc.output),
-                                newInput));
+      for (Transition t : path.state.getTransitions()) {
+        final int min = t.getMin();
+        final int max = t.getMax();
+        if (min == max) {
+          final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
+              path.fstNode, scratchArc, fstReader);
+          if (nextArc != null) {
+            final IntsRef newInput = new IntsRef(currentInput.length + 1);
+            newInput.copyInts(currentInput);
+            newInput.ints[currentInput.length] = t.getMin();
+            newInput.length = currentInput.length + 1;
+            queue.add(new Path<T>(t.getDest(),
+                new FST.Arc<T>().copyFrom(nextArc),
+                fst.outputs.add(path.output, nextArc.output), newInput));
+          }
+        } else {
+          // TODO: if this transition's TO state is accepting, and
+          // it accepts the entire range possible in the FST (ie. 0 to 255),
+          // we can simply use the prefix as the accepted state instead of
+          // looking up all the ranges and terminate early
+          // here.  This just shifts the work from one queue
+          // (this one) to another (the completion search
+          // done in AnalyzingSuggester).
+          FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode,
+              scratchArc, fstReader);
+          while (nextArc != null && nextArc.label <= max) {
+            assert nextArc.label <= max;
+            assert nextArc.label >= min : nextArc.label + " " + min;
+            final IntsRef newInput = new IntsRef(currentInput.length + 1);
+            newInput.copyInts(currentInput);
+            newInput.ints[currentInput.length] = nextArc.label;
+            newInput.length = currentInput.length + 1;
+            queue.add(new Path<T>(t.getDest(),
+                new FST.Arc<T>().copyFrom(nextArc),
+                fst.outputs.add(path.output, nextArc.output), newInput));
+            final int label = nextArc.label; // used in assert
+            nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc,
+                fstReader);
+            assert nextArc == null || label < nextArc.label : "last: " + label
+                + " next: " + nextArc.label;
+          }
         }
       }
     }
-
     return endNodes;
   }
 }
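
The range branch above is the piece FuzzySuggester depends on: a Levenshtein automaton's transitions cover intervals such as 0..255, and instead of probing every label in the interval, the loop visits only the arcs that actually exist, starting from the ceiling of min. The walk pattern in isolation (visit is a hypothetical stand-in for enqueuing a new Path):

    // Visit every arc out of the current FST node whose label lies in [min, max]:
    FST.Arc<T> arc = Util.readCeilArc(min, fst, path.fstNode, scratchArc, fstReader);
    while (arc != null && arc.label <= max) {
      visit(arc);  // the real code enqueues a new Path here
      arc = arc.isLast() ? null : fst.readNextRealArc(arc, fstReader);
    }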

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java Tue Oct 30 16:58:05 2012
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.MockAn
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.search.suggest.Lookup; // javadocs
 import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
+import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
 import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
 import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup;
 import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
@@ -51,17 +52,20 @@ import org.junit.Ignore;
 public class LookupBenchmarkTest extends LuceneTestCase {
   @SuppressWarnings("unchecked")
   private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
+      FuzzySuggester.class,
+      AnalyzingSuggester.class,
       JaspellLookup.class, 
       TSTLookup.class,
       FSTCompletionLookup.class,
-      WFSTCompletionLookup.class,
-      AnalyzingSuggester.class);
+      WFSTCompletionLookup.class);
 
   private final static int rounds = 15;
   private final static int warmup = 5;
 
   private final int num = 7;
-  private final boolean onlyMorePopular = true;
+  private final boolean onlyMorePopular = false;
 
   private final static Random random = new Random(0xdeadbeef);
 
@@ -212,8 +216,9 @@ public class LookupBenchmarkTest extends
       final List<String> input = new ArrayList<String>(benchmarkInput.size());
       for (TermFreq tf : benchmarkInput) {
         String s = tf.term.utf8ToString();
-        input.add(s.substring(0, Math.min(s.length(), 
-              minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))));
+        String sub = s.substring(0, Math.min(s.length(), 
+            minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
+        input.add(sub);
       }
 
       BenchmarkResult result = measure(new Callable<Integer>() {
@@ -250,7 +255,9 @@ public class LookupBenchmarkTest extends
       }
       return new BenchmarkResult(times, warmup, rounds);
     } catch (Exception e) {
+      e.printStackTrace();
       throw new RuntimeException(e);
     }
   }