You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/10/30 17:58:06 UTC
svn commit: r1403780 - in /lucene/dev/branches/branch_4x: ./ dev-tools/
lucene/ lucene/analysis/ lucene/benchmark/ lucene/codecs/ lucene/core/
lucene/core/src/java/org/apache/lucene/analysis/
lucene/core/src/java/org/apache/lucene/util/automaton/ lucen...
Author: mikemccand
Date: Tue Oct 30 16:58:05 2012
New Revision: 1403780
URL: http://svn.apache.org/viewvc?rev=1403780&view=rev
Log:
LUCENE-3846: add new FuzzySuggester
Added:
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
- copied unchanged from r1403779, lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
- copied unchanged from r1403779, lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/dev-tools/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/benchmark/ (props changed)
lucene/dev/branches/branch_4x/lucene/build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/codecs/ (props changed)
lucene/dev/branches/branch_4x/lucene/common-build.xml (contents, props changed)
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
lucene/dev/branches/branch_4x/lucene/facet/ (props changed)
lucene/dev/branches/branch_4x/lucene/highlighter/ (props changed)
lucene/dev/branches/branch_4x/lucene/memory/ (props changed)
lucene/dev/branches/branch_4x/lucene/queries/ (props changed)
lucene/dev/branches/branch_4x/lucene/site/ (props changed)
lucene/dev/branches/branch_4x/lucene/spatial/ (props changed)
lucene/dev/branches/branch_4x/lucene/suggest/ (props changed)
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
lucene/dev/branches/branch_4x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/CHANGES.txt (props changed)
lucene/dev/branches/branch_4x/solr/build.xml (props changed)
lucene/dev/branches/branch_4x/solr/contrib/ (props changed)
lucene/dev/branches/branch_4x/solr/core/ (props changed)
lucene/dev/branches/branch_4x/solr/example/ (props changed)
lucene/dev/branches/branch_4x/solr/licenses/ (props changed)
lucene/dev/branches/branch_4x/solr/site/ (props changed)
lucene/dev/branches/branch_4x/solr/solrj/ (props changed)
lucene/dev/branches/branch_4x/solr/test-framework/ (props changed)
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Oct 30 16:58:05 2012
@@ -39,6 +39,10 @@ New Features
for better search performance.
(Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)
+* LUCENE-3846: New FuzzySuggester, like AnalyzingSuggester except it
+ also finds completions allowing for fuzzy edits in the input string.
+ (Robert Muir, Simon Willnauer, Mike McCandless)
+
API Changes
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
Modified: lucene/dev/branches/branch_4x/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/common-build.xml?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/common-build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/common-build.xml Tue Oct 30 16:58:05 2012
@@ -833,7 +833,7 @@
<assertions>
<enable package="org.apache.lucene"/>
<enable package="org.apache.solr"/>
- </assertions>
+ </assertions>
<!-- JVM arguments and system properties. -->
<jvmarg line="${args}"/>
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java Tue Oct 30 16:58:05 2012
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
@@ -88,6 +89,7 @@ public class TokenStreamToAutomaton {
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+
final BytesRef term = termBytesAtt.getBytesRef();
in.reset();
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java Tue Oct 30 16:58:05 2012
@@ -240,6 +240,20 @@ final public class BasicAutomata {
a.deterministic = true;
return a;
}
+
+ public static Automaton makeString(int[] word, int offset, int length) {
+ Automaton a = new Automaton();
+ a.setDeterministic(true);
+ State s = new State();
+ a.initial = s;
+ for (int i = offset; i < offset+length; i++) {
+ State s2 = new State();
+ s.addTransition(new Transition(word[i], s2));
+ s = s2;
+ }
+ s.accept = true;
+ return a;
+ }
/**
* Returns a new (deterministic and minimal) automaton that accepts the union
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java Tue Oct 30 16:58:05 2012
@@ -33,12 +33,13 @@ public class LevenshteinAutomata {
/** @lucene.internal */
public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
/* input word */
- final String input;
final int word[];
/* the automata alphabet. */
final int alphabet[];
+ /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
+ final int alphaMax;
- /* the unicode ranges outside of alphabet */
+ /* the ranges outside of alphabet */
final int rangeLower[];
final int rangeUpper[];
int numRanges = 0;
@@ -50,17 +51,26 @@ public class LevenshteinAutomata {
* Optionally count transpositions as a primitive edit.
*/
public LevenshteinAutomata(String input, boolean withTranspositions) {
- this.input = input;
- int length = Character.codePointCount(input, 0, input.length());
- word = new int[length];
- for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
- word[j++] = cp = input.codePointAt(i);
- }
-
+ this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions);
+ }
+
+ /**
+ * Expert: specify a custom maximum possible symbol
+ * (alphaMax); default is Character.MAX_CODE_POINT.
+ */
+ public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
+ this.word = word;
+ this.alphaMax = alphaMax;
+
// calculate the alphabet
SortedSet<Integer> set = new TreeSet<Integer>();
- for (int i = 0; i < word.length; i++)
- set.add(word[i]);
+ for (int i = 0; i < word.length; i++) {
+ int v = word[i];
+ if (v > alphaMax) {
+ throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
+ }
+ set.add(v);
+ }
alphabet = new int[set.size()];
Iterator<Integer> iterator = set.iterator();
for (int i = 0; i < alphabet.length; i++)
@@ -81,9 +91,9 @@ public class LevenshteinAutomata {
lower = higher + 1;
}
/* add the final endpoint */
- if (lower <= Character.MAX_CODE_POINT) {
+ if (lower <= alphaMax) {
rangeLower[numRanges] = lower;
- rangeUpper[numRanges] = Character.MAX_CODE_POINT;
+ rangeUpper[numRanges] = alphaMax;
numRanges++;
}
@@ -94,6 +104,15 @@ public class LevenshteinAutomata {
};
}
+ private static int[] codePoints(String input) {
+ int length = Character.codePointCount(input, 0, input.length());
+ int word[] = new int[length];
+ for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
+ word[j++] = cp = input.codePointAt(i);
+ }
+ return word;
+ }
+
/**
* Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
* <p>
@@ -106,8 +125,9 @@ public class LevenshteinAutomata {
* </p>
*/
public Automaton toAutomaton(int n) {
- if (n == 0)
- return BasicAutomata.makeString(input);
+ if (n == 0) {
+ return BasicAutomata.makeString(word, 0, word.length);
+ }
if (n >= descriptions.length)
return null;
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Tue Oct 30 16:58:05 2012
@@ -22,6 +22,8 @@ import java.util.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.FST.Arc;
+import org.apache.lucene.util.fst.FST.BytesReader;
/** Static helper methods.
*
@@ -304,7 +306,10 @@ public final class Util {
path.input.ints[path.input.length++] = path.arc.label;
final int cmp = bottom.input.compareTo(path.input);
path.input.length--;
+
+ // We should never see dups:
assert cmp != 0;
+
if (cmp < 0) {
// Doesn't compete
return;
@@ -846,4 +851,93 @@ public final class Util {
w.close();
}
*/
+
+ /**
+ * Reads the first arc greater or equal that the given label into the provided
+ * arc in place and returns it iff found, otherwise return <code>null</code>.
+ *
+ * @param label the label to ceil on
+ * @param fst the fst to operate on
+ * @param follow the arc to follow reading the label from
+ * @param arc the arc to read into in place
+ * @param in the fst's {@link BytesReader}
+ */
+ public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow,
+ Arc<T> arc, BytesReader in) throws IOException {
+ // TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum?
+ if (label == FST.END_LABEL) {
+ if (follow.isFinal()) {
+ if (follow.target <= 0) {
+ arc.flags = FST.BIT_LAST_ARC;
+ } else {
+ arc.flags = 0;
+ // NOTE: nextArc is a node (not an address!) in this case:
+ arc.nextArc = follow.target;
+ arc.node = follow.target;
+ }
+ arc.output = follow.nextFinalOutput;
+ arc.label = FST.END_LABEL;
+ return arc;
+ } else {
+ return null;
+ }
+ }
+
+ if (!FST.targetHasArcs(follow)) {
+ return null;
+ }
+ fst.readFirstTargetArc(follow, arc, in);
+ if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
+ // Arcs are fixed array -- use binary search to find
+ // the target.
+
+ int low = arc.arcIdx;
+ int high = arc.numArcs - 1;
+ int mid = 0;
+ // System.out.println("do arc array low=" + low + " high=" + high +
+ // " targetLabel=" + targetLabel);
+ while (low <= high) {
+ mid = (low + high) >>> 1;
+ in.pos = arc.posArcsStart;
+ in.skip(arc.bytesPerArc * mid + 1);
+ final int midLabel = fst.readLabel(in);
+ final int cmp = midLabel - label;
+ // System.out.println(" cycle low=" + low + " high=" + high + " mid=" +
+ // mid + " midLabel=" + midLabel + " cmp=" + cmp);
+ if (cmp < 0) {
+ low = mid + 1;
+ } else if (cmp > 0) {
+ high = mid - 1;
+ } else {
+ arc.arcIdx = mid-1;
+ return fst.readNextRealArc(arc, in);
+ }
+ }
+ if (low == arc.numArcs) {
+ // DEAD END!
+ return null;
+ }
+
+ arc.arcIdx = (low > high ? high : low);
+ return fst.readNextRealArc(arc, in);
+ }
+
+ // Linear scan
+ fst.readFirstRealTargetArc(follow.target, arc, in);
+
+ while (true) {
+ // System.out.println(" non-bs cycle");
+ // TODO: we should fix this code to not have to create
+ // object for the output of every arc we scan... only
+ // for the matching arc, if found
+ if (arc.label >= label) {
+ // System.out.println(" found!");
+ return arc;
+ } else if (arc.isLast()) {
+ return null;
+ } else {
+ fst.readNextRealArc(arc, in);
+ }
+ }
+ }
}
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Tue Oct 30 16:58:05 2012
@@ -31,6 +31,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.Sort;
@@ -310,7 +311,7 @@ public class AnalyzingSuggester extends
}
}
- private TokenStreamToAutomaton getTokenStreamToAutomaton() {
+ TokenStreamToAutomaton getTokenStreamToAutomaton() {
if (preserveSep) {
return new EscapingTokenStreamToAutomaton();
} else {
@@ -332,6 +333,7 @@ public class AnalyzingSuggester extends
BytesRef scratch = new BytesRef();
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
+
// analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short)
boolean success = false;
byte buffer[] = new byte[8];
@@ -339,29 +341,8 @@ public class AnalyzingSuggester extends
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
BytesRef surfaceForm;
while ((surfaceForm = iterator.next()) != null) {
-
- // Analyze surface form:
- TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
-
- // Create corresponding automaton: labels are bytes
- // from each analyzed token, with byte 0 used as
- // separator between tokens:
- Automaton automaton = ts2a.toAutomaton(ts);
- ts.end();
- ts.close();
-
- replaceSep(automaton);
-
- assert SpecialOperations.isFinite(automaton);
-
- // Get all paths from the automaton (there can be
- // more than one path, eg if the analyzer created a
- // graph using SynFilter or WDF):
-
- // TODO: we could walk & add simultaneously, so we
- // don't have to alloc [possibly biggish]
- // intermediate HashSet in RAM:
- Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+ Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
+
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
for (IntsRef path : paths) {
@@ -510,27 +491,10 @@ public class AnalyzingSuggester extends
}
//System.out.println("lookup key=" + key + " num=" + num);
-
+ final BytesRef utf8Key = new BytesRef(key);
try {
- // TODO: is there a Reader from a CharSequence?
- // Turn tokenstream into automaton:
- TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
- Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts);
- ts.end();
- ts.close();
-
- // TODO: we could use the end offset to "guess"
- // whether the final token was a partial token; this
- // would only be a heuristic ... but maybe an OK one.
- // This way we could eg differentiate "net" from "net ",
- // which we can't today...
-
- replaceSep(automaton);
-
- // TODO: we can optimize this somewhat by determinizing
- // while we convert
- BasicOperations.determinize(automaton);
+ Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRef spare = new CharsRef();
@@ -538,8 +502,7 @@ public class AnalyzingSuggester extends
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
- final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths;
- prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
+ //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size());
@@ -549,6 +512,8 @@ public class AnalyzingSuggester extends
final List<LookupResult> results = new ArrayList<LookupResult>();
+ List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
+
if (exactFirst) {
int count = 0;
@@ -593,9 +558,9 @@ public class AnalyzingSuggester extends
// nodes we have and the
// maxSurfaceFormsPerAnalyzedForm:
for(MinResult<Pair<Long,BytesRef>> completion : completions) {
- spare.grow(completion.output.output2.length);
- UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
- if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+ if (utf8Key.bytesEquals(completion.output.output2)) {
+ spare.grow(completion.output.output2.length);
+ UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
break;
}
@@ -630,9 +595,7 @@ public class AnalyzingSuggester extends
// In exactFirst mode, don't accept any paths
// matching the surface form since that will
// create duplicate results:
- spare.grow(output.output2.length);
- UnicodeUtil.UTF8toUTF16(output.output2, spare);
- if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+ if (utf8Key.bytesEquals(output.output2)) {
// We found exact match, which means we should
// have already found it in the first search:
assert results.size() == 1;
@@ -644,6 +607,8 @@ public class AnalyzingSuggester extends
}
};
+ prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
+
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
@@ -654,6 +619,10 @@ public class AnalyzingSuggester extends
spare.grow(completion.output.output2.length);
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
+
+ // TODO: for fuzzy case would be nice to return
+ // how many edits were required
+
//System.out.println(" result=" + result);
results.add(result);
@@ -670,6 +639,63 @@ public class AnalyzingSuggester extends
}
}
+ /** Returns all prefix paths to initialize the search. */
+ protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
+ Automaton lookupAutomaton,
+ FST<Pair<Long,BytesRef>> fst)
+ throws IOException {
+ return prefixPaths;
+ }
+
+ final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
+ // Analyze surface form:
+ TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
+
+ // Create corresponding automaton: labels are bytes
+ // from each analyzed token, with byte 0 used as
+ // separator between tokens:
+ Automaton automaton = ts2a.toAutomaton(ts);
+ ts.end();
+ ts.close();
+
+ replaceSep(automaton);
+
+ assert SpecialOperations.isFinite(automaton);
+
+ // Get all paths from the automaton (there can be
+ // more than one path, eg if the analyzer created a
+ // graph using SynFilter or WDF):
+
+ // TODO: we could walk & add simultaneously, so we
+ // don't have to alloc [possibly biggish]
+ // intermediate HashSet in RAM:
+ return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+ }
+
+ final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
+ // TODO: is there a Reader from a CharSequence?
+ // Turn tokenstream into automaton:
+ TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
+ Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
+ ts.end();
+ ts.close();
+
+ // TODO: we could use the end offset to "guess"
+ // whether the final token was a partial token; this
+ // would only be a heuristic ... but maybe an OK one.
+ // This way we could eg differentiate "net" from "net ",
+ // which we can't today...
+
+ replaceSep(automaton);
+
+ // TODO: we can optimize this somewhat by determinizing
+ // while we convert
+ BasicOperations.determinize(automaton);
+ return automaton;
+ }
+
+
+
/**
* Returns the weight associated with an input string,
* or null if it does not exist.
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java Tue Oct 30 16:58:05 2012
@@ -26,6 +26,7 @@ import org.apache.lucene.util.automaton.
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Util;
// TODO: move to core? nobody else uses it yet though...
@@ -62,57 +63,78 @@ public class FSTUtil {
}
}
- /** Enumerates all paths in the automaton that also
- * intersect the FST, accumulating the FST end node and
- * output for each path. */
- public static<T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
+ /**
+ * Enumerates all minimal prefix paths in the automaton that also intersect the FST,
+ * accumulating the FST end node and output for each path.
+ */
+ public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
+ throws IOException {
+ assert a.isDeterministic();
final List<Path<T>> queue = new ArrayList<Path<T>>();
final List<Path<T>> endNodes = new ArrayList<Path<T>>();
-
- queue.add(new Path<T>(a.getInitialState(),
- fst.getFirstArc(new FST.Arc<T>()),
- fst.outputs.getNoOutput(),
- new IntsRef()));
-
+ queue.add(new Path<T>(a.getInitialState(), fst
+ .getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
+ new IntsRef()));
+
final FST.Arc<T> scratchArc = new FST.Arc<T>();
final FST.BytesReader fstReader = fst.getBytesReader(0);
-
- //System.out.println("fst/a intersect");
-
+
while (queue.size() != 0) {
- final Path<T> path = queue.remove(queue.size()-1);
- //System.out.println(" cycle path=" + path);
+ final Path<T> path = queue.remove(queue.size() - 1);
if (path.state.isAccept()) {
endNodes.add(path);
+ // we can stop here if we accept this path,
+ // we accept all further paths too
+ continue;
}
-
+
IntsRef currentInput = path.input;
- for(Transition t : path.state.getTransitions()) {
-
- // TODO: we can fix this if necessary:
- if (t.getMin() != t.getMax()) {
- throw new IllegalStateException("can only handle Transitions that match one character");
- }
-
- //System.out.println(" t=" + (char) t.getMin());
-
- final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(), path.fstNode, scratchArc, fstReader);
- if (nextArc != null) {
- //System.out.println(" fst matches");
- // Path continues:
- IntsRef newInput = new IntsRef(currentInput.length + 1);
- newInput.copyInts(currentInput);
- newInput.ints[currentInput.length] = t.getMin();
- newInput.length = currentInput.length + 1;
-
- queue.add(new Path<T>(t.getDest(),
- new FST.Arc<T>().copyFrom(nextArc),
- fst.outputs.add(path.output, nextArc.output),
- newInput));
+ for (Transition t : path.state.getTransitions()) {
+ final int min = t.getMin();
+ final int max = t.getMax();
+ if (min == max) {
+ final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
+ path.fstNode, scratchArc, fstReader);
+ if (nextArc != null) {
+ final IntsRef newInput = new IntsRef(currentInput.length + 1);
+ newInput.copyInts(currentInput);
+ newInput.ints[currentInput.length] = t.getMin();
+ newInput.length = currentInput.length + 1;
+ queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
+ .copyFrom(nextArc), fst.outputs
+ .add(path.output, nextArc.output), newInput));
+ }
+ } else {
+ // TODO: if this transition's TO state is accepting, and
+ // it accepts the entire range possible in the FST (ie. 0 to 255),
+ // we can simply use the prefix as the accepted state instead of
+ // looking up all the ranges and terminate early
+ // here. This just shifts the work from one queue
+ // (this one) to another (the completion search
+ // done in AnalyzingSuggester).
+ FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode,
+ scratchArc, fstReader);
+ while (nextArc != null && nextArc.label <= max) {
+ assert nextArc.label <= max;
+ assert nextArc.label >= min : nextArc.label + " "
+ + min;
+ final IntsRef newInput = new IntsRef(currentInput.length + 1);
+ newInput.copyInts(currentInput);
+ newInput.ints[currentInput.length] = nextArc.label;
+ newInput.length = currentInput.length + 1;
+ queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
+ .copyFrom(nextArc), fst.outputs
+ .add(path.output, nextArc.output), newInput));
+ final int label = nextArc.label; // used in assert
+ nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc,
+ fstReader);
+ assert nextArc == null || label < nextArc.label : "last: " + label
+ + " next: " + nextArc.label;
+ }
}
}
}
-
return endNodes;
}
+
}
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java?rev=1403780&r1=1403779&r2=1403780&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java Tue Oct 30 16:58:05 2012
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.MockAn
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.search.suggest.Lookup; // javadocs
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
+import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
@@ -51,17 +52,20 @@ import org.junit.Ignore;
public class LookupBenchmarkTest extends LuceneTestCase {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
+ FuzzySuggester.class,
+ AnalyzingSuggester.class,
JaspellLookup.class,
TSTLookup.class,
FSTCompletionLookup.class,
- WFSTCompletionLookup.class,
- AnalyzingSuggester.class);
+ WFSTCompletionLookup.class
+
+ );
private final static int rounds = 15;
private final static int warmup = 5;
private final int num = 7;
- private final boolean onlyMorePopular = true;
+ private final boolean onlyMorePopular = false;
private final static Random random = new Random(0xdeadbeef);
@@ -212,8 +216,9 @@ public class LookupBenchmarkTest extends
final List<String> input = new ArrayList<String>(benchmarkInput.size());
for (TermFreq tf : benchmarkInput) {
String s = tf.term.utf8ToString();
- input.add(s.substring(0, Math.min(s.length(),
- minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))));
+ String sub = s.substring(0, Math.min(s.length(),
+ minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
+ input.add(sub);
}
BenchmarkResult result = measure(new Callable<Integer>() {
@@ -250,7 +255,9 @@ public class LookupBenchmarkTest extends
}
return new BenchmarkResult(times, warmup, rounds);
} catch (Exception e) {
+ e.printStackTrace();
throw new RuntimeException(e);
+
}
}