You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/06/11 23:56:22 UTC
svn commit: r1602031 - in /lucene/dev/branches/lucene5752/lucene:
core/src/java/org/apache/lucene/analysis/
core/src/java/org/apache/lucene/util/automaton/
core/src/test/org/apache/lucene/analysis/
suggest/src/java/org/apache/lucene/search/suggest/anal...
Author: mikemccand
Date: Wed Jun 11 21:56:21 2014
New Revision: 1602031
URL: http://svn.apache.org/r1602031
Log:
LUCENE-5752: cutover suggesters
Modified:
lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/LightAutomaton.java
lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8Light.java
lucene/dev/branches/lucene5752/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java
lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
Modified: lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (original)
+++ lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java Wed Jun 11 21:56:21 2014
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
@@ -61,15 +62,15 @@ public class TokenStreamToAutomaton {
private static class Position implements RollingBuffer.Resettable {
// Any tokens that ended at our position arrive to this state:
- State arriving;
+ int arriving = -1;
// Any tokens that start at our position leave from this state:
- State leaving;
+ int leaving = -1;
@Override
public void reset() {
- arriving = null;
- leaving = null;
+ arriving = -1;
+ leaving = -1;
}
}
@@ -98,9 +99,9 @@ public class TokenStreamToAutomaton {
* TokenStream}, and creates the corresponding
* automaton where arcs are bytes (or Unicode code points
* if unicodeArcs = true) from each term. */
- public Automaton toAutomaton(TokenStream in) throws IOException {
- final Automaton a = new Automaton();
- boolean deterministic = true;
+ public LightAutomaton toAutomaton(TokenStream in) throws IOException {
+ final LightAutomaton.Builder builder = new LightAutomaton.Builder();
+ builder.createState();
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
@@ -132,34 +133,29 @@ public class TokenStreamToAutomaton {
pos += posInc;
posData = positions.get(pos);
- assert posData.leaving == null;
+ assert posData.leaving == -1;
- if (posData.arriving == null) {
+ if (posData.arriving == -1) {
// No token ever arrived to this position
if (pos == 0) {
// OK: this is the first token
- posData.leaving = a.getInitialState();
+ posData.leaving = 0;
} else {
// This means there's a hole (eg, StopFilter
// does this):
- posData.leaving = new State();
- addHoles(a.getInitialState(), positions, pos);
+ posData.leaving = builder.createState();
+ addHoles(builder, positions, pos);
}
} else {
- posData.leaving = new State();
- posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
+ posData.leaving = builder.createState();
+ builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
if (posInc > 1) {
// A token spanned over a hole; add holes
// "under" it:
- addHoles(a.getInitialState(), positions, pos);
+ addHoles(builder, positions, pos);
}
}
positions.freeBefore(pos);
- } else {
- // note: this isn't necessarily true. its just that we aren't surely det.
- // we could optimize this further (e.g. buffer and sort synonyms at a position)
- // but thats probably overkill. this is cheap and dirty
- deterministic = false;
}
final int endPos = pos + posLengthAtt.getPositionLength();
@@ -168,31 +164,33 @@ public class TokenStreamToAutomaton {
final BytesRef termUTF8 = changeToken(term);
int[] termUnicode = null;
final Position endPosData = positions.get(endPos);
- if (endPosData.arriving == null) {
- endPosData.arriving = new State();
+ if (endPosData.arriving == -1) {
+ endPosData.arriving = builder.createState();
}
- State state = posData.leaving;
int termLen;
if (unicodeArcs) {
final String utf16 = termUTF8.utf8ToString();
termUnicode = new int[utf16.codePointCount(0, utf16.length())];
termLen = termUnicode.length;
- for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
+ for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
termUnicode[j++] = cp = utf16.codePointAt(i);
+ }
} else {
termLen = termUTF8.length;
}
+ int state = posData.leaving;
+
for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
- final State nextState = byteIDX == termLen-1 ? endPosData.arriving : new State();
+ final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
int c;
if (unicodeArcs) {
c = termUnicode[byteIDX];
} else {
c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
}
- state.addTransition(new Transition(c, nextState));
+ builder.addTransition(state, nextState, c);
state = nextState;
}
@@ -200,28 +198,26 @@ public class TokenStreamToAutomaton {
}
in.end();
- State endState = null;
+ int endState = -1;
if (offsetAtt.endOffset() > maxOffset) {
- endState = new State();
- endState.setAccept(true);
+ endState = builder.createState();
+ builder.setAccept(endState, true);
}
pos++;
while (pos <= positions.getMaxPos()) {
posData = positions.get(pos);
- if (posData.arriving != null) {
- if (endState != null) {
- posData.arriving.addTransition(new Transition(POS_SEP, endState));
+ if (posData.arriving != -1) {
+ if (endState != -1) {
+ builder.addTransition(posData.arriving, endState, POS_SEP);
} else {
- posData.arriving.setAccept(true);
+ builder.setAccept(posData.arriving, true);
}
}
pos++;
}
- //toDot(a);
- a.setDeterministic(deterministic);
- return a;
+ return builder.finish();
}
// for debugging!
@@ -235,26 +231,26 @@ public class TokenStreamToAutomaton {
}
*/
- private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
+ private static void addHoles(LightAutomaton.Builder builder, RollingBuffer<Position> positions, int pos) {
Position posData = positions.get(pos);
Position prevPosData = positions.get(pos-1);
- while(posData.arriving == null || prevPosData.leaving == null) {
- if (posData.arriving == null) {
- posData.arriving = new State();
- posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
+ while(posData.arriving == -1 || prevPosData.leaving == -1) {
+ if (posData.arriving == -1) {
+ posData.arriving = builder.createState();
+ builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
}
- if (prevPosData.leaving == null) {
+ if (prevPosData.leaving == -1) {
if (pos == 1) {
- prevPosData.leaving = startState;
+ prevPosData.leaving = 0;
} else {
- prevPosData.leaving = new State();
+ prevPosData.leaving = builder.createState();
}
- if (prevPosData.arriving != null) {
- prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
+ if (prevPosData.arriving != -1) {
+ builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
}
}
- prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
+ builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
pos--;
if (pos <= 0) {
break;
Modified: lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (original)
+++ lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java Wed Jun 11 21:56:21 2014
@@ -483,6 +483,21 @@ final public class BasicAutomata {
return a;
}
+ public static LightAutomaton makeStringLight(int[] word, int offset, int length) {
+ LightAutomaton a = new LightAutomaton();
+ a.createState();
+ int s = 0;
+ for (int i = offset; i < offset+length; i++) {
+ int s2 = a.createState();
+ a.addTransition(s, s2, word[i]);
+ s = s2;
+ }
+ a.setAccept(s, true);
+ a.finish();
+
+ return a;
+ }
+
/**
* Returns a new (deterministic and minimal) automaton that accepts the union
* of the given collection of {@link BytesRef}s representing UTF-8 encoded
Modified: lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/LightAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/LightAutomaton.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/LightAutomaton.java (original)
+++ lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/LightAutomaton.java Wed Jun 11 21:56:21 2014
@@ -35,7 +35,7 @@ import org.apache.lucene.util.Sorter;
// - could use packed int arrays instead
// - could encode dest w/ delta from to?
-// nocommit should we keep determinized bit?
+// nocommit should we keep determinized bit? it could be entirely privately computed now?
/** Uses only int[]s to represent the automaton, but requires that all
* transitions for each state are added at once. If this is too restrictive,
Modified: lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8Light.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8Light.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8Light.java (original)
+++ lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8Light.java Wed Jun 11 21:56:21 2014
@@ -274,6 +274,9 @@ public final class UTF32ToUTF8Light {
//if (utf32.isSingleton()) {
//utf32 = utf32.cloneExpanded();
//}
+ if (utf32.getNumStates() == 0) {
+ return utf32;
+ }
int[] map = new int[utf32.getNumStates()];
Arrays.fill(map, -1);
Modified: lucene/dev/branches/lucene5752/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (original)
+++ lucene/dev/branches/lucene5752/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java Wed Jun 11 21:56:21 2014
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis;
*/
import java.io.IOException;
-import java.io.StringWriter;
import java.io.PrintWriter;
+import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LightAutomaton;
public class TestGraphTokenizers extends BaseTokenStreamTestCase {
@@ -409,9 +410,10 @@ public class TestGraphTokenizers extends
new Token[] {
token("abc", 1, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = BasicAutomata.makeString("abc");
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton expected = s2a("abc");
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testMultipleHoles() throws Exception {
@@ -420,9 +422,10 @@ public class TestGraphTokenizers extends
token("a", 1, 1),
token("b", 3, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testSynOverMultipleHoles() throws Exception {
@@ -432,11 +435,12 @@ public class TestGraphTokenizers extends
token("x", 0, 3),
token("b", 3, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
- final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
- final Automaton expected = BasicOperations.union(a1, a2);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
+ final LightAutomaton a2 = join(s2a("x"), SEP_A, s2a("b"));
+ final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
// for debugging!
@@ -450,25 +454,25 @@ public class TestGraphTokenizers extends
}
*/
- private static final Automaton SEP_A = BasicAutomata.makeChar(TokenStreamToAutomaton.POS_SEP);
- private static final Automaton HOLE_A = BasicAutomata.makeChar(TokenStreamToAutomaton.HOLE);
+ private static final LightAutomaton SEP_A = BasicAutomata.makeCharLight(TokenStreamToAutomaton.POS_SEP);
+ private static final LightAutomaton HOLE_A = BasicAutomata.makeCharLight(TokenStreamToAutomaton.HOLE);
- private Automaton join(String ... strings) {
- List<Automaton> as = new ArrayList<>();
+ private LightAutomaton join(String ... strings) {
+ List<LightAutomaton> as = new ArrayList<>();
for(String s : strings) {
- as.add(BasicAutomata.makeString(s));
+ as.add(s2a(s));
as.add(SEP_A);
}
as.remove(as.size()-1);
- return BasicOperations.concatenate(as);
+ return BasicOperations.concatenateLight(as);
}
- private Automaton join(Automaton ... as) {
- return BasicOperations.concatenate(Arrays.asList(as));
+ private LightAutomaton join(LightAutomaton ... as) {
+ return BasicOperations.concatenateLight(Arrays.asList(as));
}
- private Automaton s2a(String s) {
- return BasicAutomata.makeString(s);
+ private LightAutomaton s2a(String s) {
+ return BasicAutomata.makeStringLight(s);
}
public void testTwoTokens() throws Exception {
@@ -478,11 +482,12 @@ public class TestGraphTokenizers extends
token("abc", 1, 1),
token("def", 1, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = join("abc", "def");
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton expected = join("abc", "def");
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testHole() throws Exception {
@@ -492,12 +497,13 @@ public class TestGraphTokenizers extends
token("abc", 1, 1),
token("def", 2, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
+ final LightAutomaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testOverlappedTokensSausage() throws Exception {
@@ -508,11 +514,12 @@ public class TestGraphTokenizers extends
token("abc", 1, 1),
token("xyz", 0, 1)
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = BasicAutomata.makeString("abc");
- final Automaton a2 = BasicAutomata.makeString("xyz");
- final Automaton expected = BasicOperations.union(a1, a2);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton a1 = s2a("abc");
+ final LightAutomaton a2 = s2a("xyz");
+ final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testOverlappedTokensLattice() throws Exception {
@@ -523,13 +530,14 @@ public class TestGraphTokenizers extends
token("xyz", 0, 2),
token("def", 1, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = BasicAutomata.makeString("xyz");
- final Automaton a2 = join("abc", "def");
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton a1 = s2a("xyz");
+ final LightAutomaton a2 = join("abc", "def");
- final Automaton expected = BasicOperations.union(a1, a2);
+ final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testSynOverHole() throws Exception {
@@ -540,14 +548,15 @@ public class TestGraphTokenizers extends
token("X", 0, 2),
token("b", 2, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = BasicOperations.union(
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton a1 = BasicOperations.unionLight(
join(s2a("a"), SEP_A, HOLE_A),
- BasicAutomata.makeString("X"));
- final Automaton expected = BasicOperations.concatenate(a1,
+ s2a("X"));
+ final LightAutomaton expected = BasicOperations.concatenateLight(a1,
join(SEP_A, s2a("b")));
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testSynOverHole2() throws Exception {
@@ -558,11 +567,12 @@ public class TestGraphTokenizers extends
token("abc", 0, 3),
token("def", 2, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = BasicOperations.union(
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton expected = BasicOperations.unionLight(
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
- BasicAutomata.makeString("abc"));
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ s2a("abc"));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testOverlappedTokensLattice2() throws Exception {
@@ -574,12 +584,13 @@ public class TestGraphTokenizers extends
token("def", 1, 1),
token("ghi", 1, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = BasicAutomata.makeString("xyz");
- final Automaton a2 = join("abc", "def", "ghi");
- final Automaton expected = BasicOperations.union(a1, a2);
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton a1 = s2a("xyz");
+ final LightAutomaton a2 = join("abc", "def", "ghi");
+ final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
public void testToDot() throws Exception {
@@ -594,10 +605,11 @@ public class TestGraphTokenizers extends
new Token[] {
token("abc", 2, 1),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton expected = join(HOLE_A, SEP_A, s2a("abc"));
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
// TODO: testEndsWithHole... but we need posInc to set in TS.end()
@@ -608,9 +620,10 @@ public class TestGraphTokenizers extends
token("a", 1, 1),
token("X", 0, 10),
});
- final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = BasicOperations.union(BasicAutomata.makeString("a"),
- BasicAutomata.makeString("X"));
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
+ final LightAutomaton expected = BasicOperations.unionLight(s2a("a"),
+ s2a("X"));
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
+ BasicOperations.determinize(actual)));
}
}
Modified: lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Wed Jun 11 21:56:21 2014
@@ -17,6 +17,16 @@ package org.apache.lucene.search.suggest
* limitations under the License.
*/
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
@@ -35,28 +45,20 @@ import org.apache.lucene.util.OfflineSor
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
-import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.BytesReader;
-import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
-import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import org.apache.lucene.util.fst.Util;
/**
* Suggester that first analyzes the surface form, adds the
@@ -255,37 +257,65 @@ public class AnalyzingSuggester extends
return fst == null ? 0 : fst.ramBytesUsed();
}
- private void copyDestTransitions(State from, State to, List<Transition> transitions) {
- if (to.isAccept()) {
- from.setAccept(true);
- }
- for(Transition t : to.getTransitions()) {
- transitions.add(t);
+ private int[] topoSortStates(LightAutomaton a) {
+ int[] states = new int[a.getNumStates()];
+ final Set<Integer> visited = new HashSet<>();
+ final LinkedList<Integer> worklist = new LinkedList<>();
+ worklist.add(0);
+ visited.add(0);
+ int upto = 0;
+ states[upto] = 0;
+ upto++;
+ LightAutomaton.Transition t = new LightAutomaton.Transition();
+ while (worklist.size() > 0) {
+ int s = worklist.removeFirst();
+ int count = a.initTransition(s, t);
+ for (int i=0;i<count;i++) {
+ a.getNextTransition(t);
+ if (!visited.contains(t.dest)) {
+ visited.add(t.dest);
+ worklist.add(t.dest);
+ states[upto++] = t.dest;
+ }
+ }
}
+ return states;
}
+
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
- private void replaceSep(Automaton a) {
+ private LightAutomaton replaceSep(LightAutomaton a) {
- State[] states = a.getNumberedStates();
+ LightAutomaton result = new LightAutomaton();
+
+ // Copy all states over
+ int numStates = a.getNumStates();
+ for(int s=0;s<numStates;s++) {
+ result.createState();
+ result.setAccept(s, a.isAccept(s));
+ }
// Go in reverse topo sort so we know we only have to
// make one pass:
- for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {
- final State state = states[stateNumber];
+ LightAutomaton.Transition t = new LightAutomaton.Transition();
+ int[] topoSortStates = topoSortStates(a);
+ for(int i=0;i<topoSortStates.length;i++) {
+ int state = topoSortStates[topoSortStates.length-1-i];
List<Transition> newTransitions = new ArrayList<>();
- for(Transition t : state.getTransitions()) {
- assert t.getMin() == t.getMax();
- if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
+ int count = a.initTransition(state, t);
+ for(int j=0;j<count;j++) {
+ a.getNextTransition(t);
+ if (t.min == TokenStreamToAutomaton.POS_SEP) {
+ assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
- newTransitions.add(new Transition(SEP_LABEL, t.getDest()));
+ result.addTransition(state, t.dest, SEP_LABEL);
} else {
- copyDestTransitions(state, t.getDest(), newTransitions);
- a.setDeterministic(false);
+ result.addEpsilon(state, t.dest);
}
- } else if (t.getMin() == TokenStreamToAutomaton.HOLE) {
+ } else if (t.min == TokenStreamToAutomaton.HOLE) {
+ assert t.max == TokenStreamToAutomaton.HOLE;
// Just remove the hole: there will then be two
// SEP tokens next to each other, which will only
@@ -294,19 +324,21 @@ public class AnalyzingSuggester extends
// that's somehow a problem we can always map HOLE
// to a dedicated byte (and escape it in the
// input).
- copyDestTransitions(state, t.getDest(), newTransitions);
- a.setDeterministic(false);
+ result.addEpsilon(state, t.dest);
} else {
- newTransitions.add(t);
+ result.addTransition(state, t.dest, t.min, t.max);
}
}
- state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));
}
+
+ result.finish();
+
+ return result;
}
/** Used by subclass to change the lookup automaton, if
* necessary. */
- protected Automaton convertAutomaton(Automaton a) {
+ protected LightAutomaton convertAutomaton(LightAutomaton a) {
return a;
}
@@ -665,8 +697,7 @@ public class AnalyzingSuggester extends
}
final BytesRef utf8Key = new BytesRef(key);
try {
-
- Automaton lookupAutomaton = toLookupAutomaton(key);
+ LightAutomaton lookupAutomaton = toLookupAutomaton(key);
final CharsRef spare = new CharsRef();
@@ -818,7 +849,7 @@ public class AnalyzingSuggester extends
/** Returns all prefix paths to initialize the search. */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
- Automaton lookupAutomaton,
+ LightAutomaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
return prefixPaths;
@@ -826,7 +857,7 @@ public class AnalyzingSuggester extends
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
- Automaton automaton = null;
+ LightAutomaton automaton = null;
try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
// Create corresponding automaton: labels are bytes
@@ -835,7 +866,7 @@ public class AnalyzingSuggester extends
automaton = ts2a.toAutomaton(ts);
}
- replaceSep(automaton);
+ automaton = replaceSep(automaton);
automaton = convertAutomaton(automaton);
// TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
@@ -848,32 +879,25 @@ public class AnalyzingSuggester extends
// TODO: we could walk & add simultaneously, so we
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
+
return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
}
- final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
+ final LightAutomaton toLookupAutomaton(final CharSequence key) throws IOException {
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
- Automaton automaton = null;
+ LightAutomaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
- automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
+ automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
- // TODO: we could use the end offset to "guess"
- // whether the final token was a partial token; this
- // would only be a heuristic ... but maybe an OK one.
- // This way we could eg differentiate "net" from "net ",
- // which we can't today...
-
- replaceSep(automaton);
+ automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
- BasicOperations.determinize(automaton);
+ automaton = BasicOperations.determinize(automaton);
return automaton;
}
-
-
/**
* Returns the weight associated with an input string,
Modified: lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (original)
+++ lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java Wed Jun 11 21:56:21 2014
@@ -17,12 +17,14 @@ package org.apache.lucene.search.suggest
* limitations under the License.
*/
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import java.io.IOException;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.FST;
@@ -43,7 +45,7 @@ public class FSTUtil {
public static final class Path<T> {
/** Node in the automaton where path ends: */
- public final State state;
+ public final int state;
/** Node in the FST where path ends: */
public final FST.Arc<T> fstNode;
@@ -55,7 +57,7 @@ public class FSTUtil {
public final IntsRef input;
/** Sole constructor. */
- public Path(State state, FST.Arc<T> fstNode, T output, IntsRef input) {
+ public Path(int state, FST.Arc<T> fstNode, T output, IntsRef input) {
this.state = state;
this.fstNode = fstNode;
this.output = output;
@@ -67,21 +69,23 @@ public class FSTUtil {
* Enumerates all minimal prefix paths in the automaton that also intersect the FST,
* accumulating the FST end node and output for each path.
*/
- public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
+ public static <T> List<Path<T>> intersectPrefixPaths(LightAutomaton a, FST<T> fst)
throws IOException {
- assert a.isDeterministic();
+ assert BasicOperations.isDeterministic(a);
final List<Path<T>> queue = new ArrayList<>();
final List<Path<T>> endNodes = new ArrayList<>();
- queue.add(new Path<>(a.getInitialState(), fst
+ queue.add(new Path<>(0, fst
.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
new IntsRef()));
final FST.Arc<T> scratchArc = new FST.Arc<>();
final FST.BytesReader fstReader = fst.getBytesReader();
-
+
+ LightAutomaton.Transition t = new LightAutomaton.Transition();
+
while (queue.size() != 0) {
final Path<T> path = queue.remove(queue.size() - 1);
- if (path.state.isAccept()) {
+ if (a.isAccept(path.state)) {
endNodes.add(path);
// we can stop here if we accept this path,
// we accept all further paths too
@@ -89,18 +93,20 @@ public class FSTUtil {
}
IntsRef currentInput = path.input;
- for (Transition t : path.state.getTransitions()) {
- final int min = t.getMin();
- final int max = t.getMax();
+ int count = a.initTransition(path.state, t);
+ for (int i=0;i<count;i++) {
+ a.getNextTransition(t);
+ final int min = t.min;
+ final int max = t.max;
if (min == max) {
- final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
+ final FST.Arc<T> nextArc = fst.findTargetArc(t.min,
path.fstNode, scratchArc, fstReader);
if (nextArc != null) {
final IntsRef newInput = new IntsRef(currentInput.length + 1);
newInput.copyInts(currentInput);
- newInput.ints[currentInput.length] = t.getMin();
+ newInput.ints[currentInput.length] = t.min;
newInput.length = currentInput.length + 1;
- queue.add(new Path<>(t.getDest(), new FST.Arc<T>()
+ queue.add(new Path<>(t.dest, new FST.Arc<T>()
.copyFrom(nextArc), fst.outputs
.add(path.output, nextArc.output), newInput));
}
@@ -122,7 +128,7 @@ public class FSTUtil {
newInput.copyInts(currentInput);
newInput.ints[currentInput.length] = nextArc.label;
newInput.length = currentInput.length + 1;
- queue.add(new Path<>(t.getDest(), new FST.Arc<T>()
+ queue.add(new Path<>(t.dest, new FST.Arc<T>()
.copyFrom(nextArc), fst.outputs
.add(path.output, nextArc.output), newInput));
final int label = nextArc.label; // used in assert
Modified: lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (original)
+++ lucene/dev/branches/lucene5752/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java Wed Jun 11 21:56:21 2014
@@ -32,8 +32,10 @@ import org.apache.lucene.util.automaton.
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
+import org.apache.lucene.util.automaton.UTF32ToUTF8Light;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
@@ -177,7 +179,7 @@ public final class FuzzySuggester extend
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
- Automaton lookupAutomaton,
+ LightAutomaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
@@ -191,7 +193,7 @@ public final class FuzzySuggester extend
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...
- Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
+ LightAutomaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
w.write(levA.toDot());
@@ -202,10 +204,10 @@ public final class FuzzySuggester extend
}
@Override
- protected Automaton convertAutomaton(Automaton a) {
+ protected LightAutomaton convertAutomaton(LightAutomaton a) {
if (unicodeAware) {
- Automaton utf8automaton = new UTF32ToUTF8().convert(a);
- BasicOperations.determinize(utf8automaton);
+ LightAutomaton utf8automaton = new UTF32ToUTF8Light().convert(a);
+ utf8automaton = BasicOperations.determinize(utf8automaton);
return utf8automaton;
} else {
return a;
@@ -219,16 +221,16 @@ public final class FuzzySuggester extend
return tsta;
}
- Automaton toLevenshteinAutomata(Automaton automaton) {
+ LightAutomaton toLevenshteinAutomata(LightAutomaton automaton) {
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
- Automaton subs[] = new Automaton[ref.size()];
+ LightAutomaton subs[] = new LightAutomaton[ref.size()];
int upto = 0;
for (IntsRef path : ref) {
if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
- subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
+ subs[upto] = BasicAutomata.makeStringLight(path.ints, path.offset, path.length);
upto++;
} else {
- Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
+ LightAutomaton prefix = BasicAutomata.makeStringLight(path.ints, path.offset, nonFuzzyPrefix);
int ints[] = new int[path.length-nonFuzzyPrefix];
System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
@@ -237,9 +239,8 @@ public final class FuzzySuggester extend
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
- Automaton levAutomaton = lev.toAutomaton(maxEdits);
- Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
- combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
+ LightAutomaton levAutomaton = lev.toLightAutomaton(maxEdits);
+ LightAutomaton combined = BasicOperations.concatenateLight(prefix, levAutomaton);
subs[upto] = combined;
upto++;
}
@@ -247,19 +248,17 @@ public final class FuzzySuggester extend
if (subs.length == 0) {
// automaton is empty, there is no accepted paths through it
- return BasicAutomata.makeEmpty(); // matches nothing
+ return BasicAutomata.makeEmptyLight(); // matches nothing
} else if (subs.length == 1) {
// no synonyms or anything: just a single path through the tokenstream
return subs[0];
} else {
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
- Automaton a = BasicOperations.union(Arrays.asList(subs));
+ LightAutomaton a = BasicOperations.unionLight(Arrays.asList(subs));
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
- BasicOperations.determinize(a);
-
- return a;
+ return BasicOperations.determinize(a);
}
}
}
Modified: lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java (original)
+++ lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java Wed Jun 11 21:56:21 2014
@@ -92,4 +92,4 @@ public final class InputArrayIterator im
public boolean hasContexts() {
return hasContexts;
}
-}
\ No newline at end of file
+}
Modified: lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java?rev=1602031&r1=1602030&r2=1602031&view=diff
==============================================================================
--- lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (original)
+++ lucene/dev/branches/lucene5752/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java Wed Jun 11 21:56:21 2014
@@ -40,14 +40,16 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.Input;
import org.apache.lucene.search.suggest.InputArrayIterator;
+import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.fst.Util;
@@ -752,29 +754,30 @@ public class FuzzySuggesterTest extends
// us the "answer key" (ie maybe we have a bug in
// suggester.toLevA ...) ... but testRandom2() fixes
// this:
- Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
- assertTrue(automaton.isDeterministic());
+ LightAutomaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
+ assertTrue(BasicOperations.isDeterministic(automaton));
+
// TODO: could be faster... but its slowCompletor for a reason
BytesRef spare = new BytesRef();
for (TermFreqPayload2 e : slowCompletor) {
spare.copyChars(e.analyzedForm);
Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
for (IntsRef intsRef : finiteStrings) {
- State p = automaton.getInitialState();
+ int p = 0;
BytesRef ref = Util.toBytesRef(intsRef, spare);
boolean added = false;
for (int i = ref.offset; i < ref.length; i++) {
- State q = p.step(ref.bytes[i] & 0xff);
- if (q == null) {
+ int q = automaton.step(p, ref.bytes[i] & 0xff);
+ if (q == -1) {
break;
- } else if (q.isAccept()) {
+ } else if (automaton.isAccept(q)) {
matches.add(new LookupResult(e.surfaceForm, e.weight));
added = true;
break;
}
p = q;
}
- if (!added && p.isAccept()) {
+ if (!added && automaton.isAccept(p)) {
matches.add(new LookupResult(e.surfaceForm, e.weight));
}
}