You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/06/19 10:22:11 UTC
svn commit: r1603752 [1/3] - in /lucene/dev/trunk: ./ dev-tools/ lucene/
lucene/analysis/ lucene/analysis/common/
lucene/analysis/common/src/test/org/apache/lucene/analysis/core/
lucene/codecs/ lucene/codecs/src/java/org/apache/lucene/codecs/memory/ lu...
Author: mikemccand
Date: Thu Jun 19 08:22:08 2014
New Revision: 1603752
URL: http://svn.apache.org/r1603752
Log:
LUCENE-5752: switch to simpler Automaton implementation
Added:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/SortedIntSet.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/SortedIntSet.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java
- copied unchanged from r1603645, lucene/dev/branches/lucene5752/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java
Removed:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/State.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestSpecialOperations.java
Modified:
lucene/dev/trunk/ (props changed)
lucene/dev/trunk/dev-tools/ (props changed)
lucene/dev/trunk/lucene/ (props changed)
lucene/dev/trunk/lucene/CHANGES.txt (contents, props changed)
lucene/dev/trunk/lucene/analysis/ (props changed)
lucene/dev/trunk/lucene/analysis/common/ (props changed)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java
lucene/dev/trunk/lucene/build.xml (contents, props changed)
lucene/dev/trunk/lucene/codecs/ (props changed)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
lucene/dev/trunk/lucene/common-build.xml (props changed)
lucene/dev/trunk/lucene/core/ (props changed)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminizeLexicon.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
lucene/dev/trunk/lucene/expressions/ (props changed)
lucene/dev/trunk/lucene/facet/ (props changed)
lucene/dev/trunk/lucene/highlighter/ (props changed)
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
lucene/dev/trunk/lucene/join/ (props changed)
lucene/dev/trunk/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
lucene/dev/trunk/lucene/memory/ (props changed)
lucene/dev/trunk/lucene/misc/ (props changed)
lucene/dev/trunk/lucene/queries/ (props changed)
lucene/dev/trunk/lucene/queryparser/ (props changed)
lucene/dev/trunk/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java
lucene/dev/trunk/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java
lucene/dev/trunk/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
lucene/dev/trunk/lucene/sandbox/ (props changed)
lucene/dev/trunk/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
lucene/dev/trunk/lucene/spatial/ (props changed)
lucene/dev/trunk/lucene/suggest/ (props changed)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java
lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
lucene/dev/trunk/lucene/test-framework/ (props changed)
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
lucene/dev/trunk/lucene/tools/ (props changed)
lucene/dev/trunk/solr/ (props changed)
lucene/dev/trunk/solr/CHANGES.txt (props changed)
lucene/dev/trunk/solr/contrib/ (props changed)
lucene/dev/trunk/solr/core/ (props changed)
lucene/dev/trunk/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java
lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
lucene/dev/trunk/solr/solrj/ (props changed)
lucene/dev/trunk/solr/test-framework/ (props changed)
lucene/dev/trunk/solr/test-framework/src/java/org/apache/solr/analysis/MockTokenFilterFactory.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Jun 19 08:22:08 2014
@@ -98,6 +98,10 @@ Other
(No Changes)
+API Changes
+
+* LUCENE-5752: Simplified Automaton API to be immutable. (Mike McCandless)
+
======================= Lucene 4.9.0 =======================
Changes in Runtime Behavior
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java Thu Jun 19 08:22:08 2014
@@ -31,11 +31,9 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.State;
-import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.Automaton;
/**
* Compares MockTokenizer (which is simple with no optimizations) with equivalent
@@ -50,18 +48,18 @@ public class TestDuelingAnalyzers extend
@Override
public void setUp() throws Exception {
super.setUp();
+ Automaton single = new Automaton();
+ int initial = single.createState();
+ int accept = single.createState();
+ single.setAccept(accept, true);
+
// build an automaton matching this jvm's letter definition
- State initial = new State();
- State accept = new State();
- accept.setAccept(true);
for (int i = 0; i <= 0x10FFFF; i++) {
if (Character.isLetter(i)) {
- initial.addTransition(new Transition(i, i, accept));
+ single.addTransition(initial, accept, i);
}
}
- Automaton single = new Automaton(initial);
- single.reduce();
- Automaton repeat = BasicOperations.repeat(single);
+ Automaton repeat = Operations.repeat(single);
jvmLetter = new CharacterRunAutomaton(repeat);
}
Modified: lucene/dev/trunk/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/build.xml?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/build.xml (original)
+++ lucene/dev/trunk/lucene/build.xml Thu Jun 19 08:22:08 2014
@@ -258,6 +258,7 @@
<!-- test-framework: problems -->
<!-- too much to fix core/ for now, but enforce full javadocs for key packages -->
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/util/automaton" level="method"/>
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java Thu Jun 19 08:22:08 2014
@@ -29,8 +29,8 @@ import org.apache.lucene.codecs.Postings
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.SegmentReadState;
@@ -940,10 +940,11 @@ public final class DirectPostingsFormat
private final class State {
int changeOrd;
int state;
- Transition[] transitions;
int transitionUpto;
+ int transitionCount;
int transitionMax;
int transitionMin;
+ final Transition transition = new Transition();
}
private State[] states;
@@ -957,7 +958,8 @@ public final class DirectPostingsFormat
states[0] = new State();
states[0].changeOrd = terms.length;
states[0].state = runAutomaton.getInitialState();
- states[0].transitions = compiledAutomaton.sortedTransitions[states[0].state];
+ states[0].transitionCount = compiledAutomaton.automaton.getNumTransitions(states[0].state);
+ compiledAutomaton.automaton.initTransition(states[0].state, states[0].transition);
states[0].transitionUpto = -1;
states[0].transitionMax = -1;
@@ -978,9 +980,10 @@ public final class DirectPostingsFormat
while (label > states[i].transitionMax) {
states[i].transitionUpto++;
- assert states[i].transitionUpto < states[i].transitions.length;
- states[i].transitionMin = states[i].transitions[states[i].transitionUpto].getMin();
- states[i].transitionMax = states[i].transitions[states[i].transitionUpto].getMax();
+ assert states[i].transitionUpto < states[i].transitionCount;
+ compiledAutomaton.automaton.getNextTransition(states[i].transition);
+ states[i].transitionMin = states[i].transition.min;
+ states[i].transitionMax = states[i].transition.max;
assert states[i].transitionMin >= 0;
assert states[i].transitionMin <= 255;
assert states[i].transitionMax >= 0;
@@ -1037,7 +1040,8 @@ public final class DirectPostingsFormat
stateUpto++;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
states[stateUpto].state = nextState;
- states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState];
+ states[stateUpto].transitionCount = compiledAutomaton.automaton.getNumTransitions(nextState);
+ compiledAutomaton.automaton.initTransition(states[stateUpto].state, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
//System.out.println(" push " + states[stateUpto].transitions.length + " trans");
@@ -1191,7 +1195,7 @@ public final class DirectPostingsFormat
while (label > state.transitionMax) {
//System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length);
state.transitionUpto++;
- if (state.transitionUpto == state.transitions.length) {
+ if (state.transitionUpto == state.transitionCount) {
// We've exhausted transitions leaving this
// state; force pop+next/skip now:
//System.out.println("forcepop: stateUpto=" + stateUpto);
@@ -1210,9 +1214,10 @@ public final class DirectPostingsFormat
}
continue nextTerm;
}
- assert state.transitionUpto < state.transitions.length: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.length;
- state.transitionMin = state.transitions[state.transitionUpto].getMin();
- state.transitionMax = state.transitions[state.transitionUpto].getMax();
+ compiledAutomaton.automaton.getNextTransition(state.transition);
+ assert state.transitionUpto < state.transitionCount: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitionCount;
+ state.transitionMin = state.transition.min;
+ state.transitionMax = state.transition.max;
assert state.transitionMin >= 0;
assert state.transitionMin <= 255;
assert state.transitionMax >= 0;
@@ -1310,7 +1315,8 @@ public final class DirectPostingsFormat
stateUpto++;
states[stateUpto].state = nextState;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
- states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState];
+ states[stateUpto].transitionCount = compiledAutomaton.automaton.getNumTransitions(nextState);
+ compiledAutomaton.automaton.initTransition(nextState, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java Thu Jun 19 08:22:08 2014
@@ -26,8 +26,6 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.State;
-import org.apache.lucene.util.automaton.Transition;
// TODO: maybe also toFST? then we can translate atts into FST outputs/weights
@@ -61,15 +59,15 @@ public class TokenStreamToAutomaton {
private static class Position implements RollingBuffer.Resettable {
// Any tokens that ended at our position arrive to this state:
- State arriving;
+ int arriving = -1;
// Any tokens that start at our position leave from this state:
- State leaving;
+ int leaving = -1;
@Override
public void reset() {
- arriving = null;
- leaving = null;
+ arriving = -1;
+ leaving = -1;
}
}
@@ -99,8 +97,8 @@ public class TokenStreamToAutomaton {
* automaton where arcs are bytes (or Unicode code points
* if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
- final Automaton a = new Automaton();
- boolean deterministic = true;
+ final Automaton.Builder builder = new Automaton.Builder();
+ builder.createState();
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
@@ -132,34 +130,29 @@ public class TokenStreamToAutomaton {
pos += posInc;
posData = positions.get(pos);
- assert posData.leaving == null;
+ assert posData.leaving == -1;
- if (posData.arriving == null) {
+ if (posData.arriving == -1) {
// No token ever arrived to this position
if (pos == 0) {
// OK: this is the first token
- posData.leaving = a.getInitialState();
+ posData.leaving = 0;
} else {
// This means there's a hole (eg, StopFilter
// does this):
- posData.leaving = new State();
- addHoles(a.getInitialState(), positions, pos);
+ posData.leaving = builder.createState();
+ addHoles(builder, positions, pos);
}
} else {
- posData.leaving = new State();
- posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
+ posData.leaving = builder.createState();
+ builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
if (posInc > 1) {
// A token spanned over a hole; add holes
// "under" it:
- addHoles(a.getInitialState(), positions, pos);
+ addHoles(builder, positions, pos);
}
}
positions.freeBefore(pos);
- } else {
- // note: this isn't necessarily true. its just that we aren't surely det.
- // we could optimize this further (e.g. buffer and sort synonyms at a position)
- // but thats probably overkill. this is cheap and dirty
- deterministic = false;
}
final int endPos = pos + posLengthAtt.getPositionLength();
@@ -168,31 +161,33 @@ public class TokenStreamToAutomaton {
final BytesRef termUTF8 = changeToken(term);
int[] termUnicode = null;
final Position endPosData = positions.get(endPos);
- if (endPosData.arriving == null) {
- endPosData.arriving = new State();
+ if (endPosData.arriving == -1) {
+ endPosData.arriving = builder.createState();
}
- State state = posData.leaving;
int termLen;
if (unicodeArcs) {
final String utf16 = termUTF8.utf8ToString();
termUnicode = new int[utf16.codePointCount(0, utf16.length())];
termLen = termUnicode.length;
- for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
+ for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
termUnicode[j++] = cp = utf16.codePointAt(i);
+ }
} else {
termLen = termUTF8.length;
}
+ int state = posData.leaving;
+
for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
- final State nextState = byteIDX == termLen-1 ? endPosData.arriving : new State();
+ final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
int c;
if (unicodeArcs) {
c = termUnicode[byteIDX];
} else {
c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
}
- state.addTransition(new Transition(c, nextState));
+ builder.addTransition(state, nextState, c);
state = nextState;
}
@@ -200,28 +195,26 @@ public class TokenStreamToAutomaton {
}
in.end();
- State endState = null;
+ int endState = -1;
if (offsetAtt.endOffset() > maxOffset) {
- endState = new State();
- endState.setAccept(true);
+ endState = builder.createState();
+ builder.setAccept(endState, true);
}
pos++;
while (pos <= positions.getMaxPos()) {
posData = positions.get(pos);
- if (posData.arriving != null) {
- if (endState != null) {
- posData.arriving.addTransition(new Transition(POS_SEP, endState));
+ if (posData.arriving != -1) {
+ if (endState != -1) {
+ builder.addTransition(posData.arriving, endState, POS_SEP);
} else {
- posData.arriving.setAccept(true);
+ builder.setAccept(posData.arriving, true);
}
}
pos++;
}
- //toDot(a);
- a.setDeterministic(deterministic);
- return a;
+ return builder.finish();
}
// for debugging!
@@ -235,26 +228,26 @@ public class TokenStreamToAutomaton {
}
*/
- private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
+ private static void addHoles(Automaton.Builder builder, RollingBuffer<Position> positions, int pos) {
Position posData = positions.get(pos);
Position prevPosData = positions.get(pos-1);
- while(posData.arriving == null || prevPosData.leaving == null) {
- if (posData.arriving == null) {
- posData.arriving = new State();
- posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
+ while(posData.arriving == -1 || prevPosData.leaving == -1) {
+ if (posData.arriving == -1) {
+ posData.arriving = builder.createState();
+ builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
}
- if (prevPosData.leaving == null) {
+ if (prevPosData.leaving == -1) {
if (pos == 1) {
- prevPosData.leaving = startState;
+ prevPosData.leaving = 0;
} else {
- prevPosData.leaving = new State();
+ prevPosData.leaving = builder.createState();
}
- if (prevPosData.arriving != null) {
- prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
+ if (prevPosData.arriving != -1) {
+ builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
}
}
- prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
+ builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
pos--;
if (pos <= 0) {
break;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java Thu Jun 19 08:22:08 2014
@@ -18,43 +18,25 @@ package org.apache.lucene.codecs.blocktr
*/
import java.io.IOException;
-import java.io.PrintStream;
import java.util.Collections;
import java.util.Iterator;
import java.util.TreeMap;
-import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.DocsAndPositionsEnum;
-import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.RunAutomaton;
-import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.fst.ByteSequenceOutputs;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.Outputs;
-import org.apache.lucene.util.fst.Util;
/** A block-based terms index and dictionary that assigns
* terms to variable length blocks according to how they
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java Thu Jun 19 08:22:08 2014
@@ -347,7 +347,7 @@ final class IntersectTermsEnum extends T
if (currentFrame.suffix != 0) {
final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff;
while (label > currentFrame.curTransitionMax) {
- if (currentFrame.transitionIndex >= currentFrame.transitions.length-1) {
+ if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) {
// Stop processing this frame -- no further
// matches are possible because we've moved
// beyond what the max transition will allow
@@ -359,7 +359,8 @@ final class IntersectTermsEnum extends T
continue nextTerm;
}
currentFrame.transitionIndex++;
- currentFrame.curTransitionMax = currentFrame.transitions[currentFrame.transitionIndex].getMax();
+ compiledAutomaton.automaton.getNextTransition(currentFrame.transition);
+ currentFrame.curTransitionMax = currentFrame.transition.max;
//if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]);
}
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java Thu Jun 19 08:22:08 2014
@@ -68,9 +68,10 @@ final class IntersectTermsEnumFrame {
int numFollowFloorBlocks;
int nextFloorLabel;
- Transition[] transitions;
+ Transition transition = new Transition();
int curTransitionMax;
int transitionIndex;
+ int transitionCount;
FST.Arc<BytesRef> arc;
@@ -112,7 +113,7 @@ final class IntersectTermsEnumFrame {
nextFloorLabel = 256;
}
// if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel);
- } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[transitionIndex].getMin());
+ } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
load(null);
}
@@ -120,9 +121,11 @@ final class IntersectTermsEnumFrame {
public void setState(int state) {
this.state = state;
transitionIndex = 0;
- transitions = ite.compiledAutomaton.sortedTransitions[state];
- if (transitions.length != 0) {
- curTransitionMax = transitions[0].getMax();
+ transitionCount = ite.compiledAutomaton.automaton.getNumTransitions(state);
+ if (transitionCount != 0) {
+ ite.compiledAutomaton.automaton.initTransition(state, transition);
+ ite.compiledAutomaton.automaton.getNextTransition(transition);
+ curTransitionMax = transition.max;
} else {
curTransitionMax = -1;
}
@@ -132,7 +135,7 @@ final class IntersectTermsEnumFrame {
// if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
- if (frameIndexData != null && transitions.length != 0) {
+ if (frameIndexData != null && transitionCount != 0) {
// Floor frame
if (floorData.length < frameIndexData.length) {
this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)];
@@ -151,7 +154,8 @@ final class IntersectTermsEnumFrame {
// first block in case it has empty suffix:
if (!ite.runAutomaton.isAccept(state)) {
// Maybe skip floor blocks:
- while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[0].getMin()) {
+ assert transitionIndex == 0: "transitionIndex=" + transitionIndex;
+ while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
// if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java Thu Jun 19 08:22:08 2014
@@ -24,6 +24,7 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Transition;
/**
@@ -51,7 +52,7 @@ class AutomatonTermsEnum extends Filtere
// true if the automaton accepts a finite language
private final boolean finite;
// array of sorted transitions for each state, indexed by state number
- private final Transition[][] allTransitions;
+ private final Automaton automaton;
// for path tracking: each long records gen when we last
// visited the state; we use gens to avoid having to clear
private final long[] visited;
@@ -79,7 +80,7 @@ class AutomatonTermsEnum extends Filtere
this.runAutomaton = compiled.runAutomaton;
assert this.runAutomaton != null;
this.commonSuffixRef = compiled.commonSuffixRef;
- this.allTransitions = compiled.sortedTransitions;
+ this.automaton = compiled.automaton;
// used for path tracking, where each bit is a numbered state.
visited = new long[runAutomaton.getSize()];
@@ -124,6 +125,8 @@ class AutomatonTermsEnum extends Filtere
}
}
+ private Transition transition = new Transition();
+
/**
* Sets the enum to operate in linear fashion, as we have found
* a looping transition at position: we set an upper bound and
@@ -133,16 +136,20 @@ class AutomatonTermsEnum extends Filtere
assert linear == false;
int state = runAutomaton.getInitialState();
+ assert state == 0;
int maxInterval = 0xff;
+ //System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef);
for (int i = 0; i < position; i++) {
state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff);
assert state >= 0: "state=" + state;
}
- for (int i = 0; i < allTransitions[state].length; i++) {
- Transition t = allTransitions[state][i];
- if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) &&
- (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
- maxInterval = t.getMax();
+ final int numTransitions = automaton.getNumTransitions(state);
+ automaton.initTransition(state, transition);
+ for (int i = 0; i < numTransitions; i++) {
+ automaton.getNextTransition(transition);
+ if (transition.min <= (seekBytesRef.bytes[position] & 0xff) &&
+ (seekBytesRef.bytes[position] & 0xff) <= transition.max) {
+ maxInterval = transition.max;
break;
}
}
@@ -250,19 +257,19 @@ class AutomatonTermsEnum extends Filtere
seekBytesRef.length = position;
visited[state] = curGen;
- Transition transitions[] = allTransitions[state];
-
+ final int numTransitions = automaton.getNumTransitions(state);
+ automaton.initTransition(state, transition);
// find the minimal path (lexicographic order) that is >= c
- for (int i = 0; i < transitions.length; i++) {
- Transition transition = transitions[i];
- if (transition.getMax() >= c) {
- int nextChar = Math.max(c, transition.getMin());
+ for (int i = 0; i < numTransitions; i++) {
+ automaton.getNextTransition(transition);
+ if (transition.max >= c) {
+ int nextChar = Math.max(c, transition.min);
// append either the next sequential char, or the minimum transition
seekBytesRef.grow(seekBytesRef.length + 1);
seekBytesRef.length++;
seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar;
- state = transition.getDest().getNumber();
+ state = transition.dest;
/*
* as long as is possible, continue down the minimal path in
* lexicographic order. if a loop or accept state is encountered, stop.
@@ -274,13 +281,14 @@ class AutomatonTermsEnum extends Filtere
* so the below is ok, if it is not an accept state,
* then there MUST be at least one transition.
*/
- transition = allTransitions[state][0];
- state = transition.getDest().getNumber();
+ automaton.initTransition(state, transition);
+ automaton.getNextTransition(transition);
+ state = transition.dest;
// append the minimum transition
seekBytesRef.grow(seekBytesRef.length + 1);
seekBytesRef.length++;
- seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin();
+ seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.min;
// we found a loop, record it for faster enumeration
if (!finite && !linear && visited[state] == curGen) {
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Thu Jun 19 08:22:08 2014
@@ -24,11 +24,11 @@ import java.util.List;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
@@ -36,8 +36,6 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@@ -152,7 +150,7 @@ public class FuzzyTermsEnum extends Term
throws IOException {
final List<CompiledAutomaton> runAutomata = initAutomata(editDistance);
if (editDistance < runAutomata.size()) {
- //if (BlockTreeTermsWriter.DEBUG) System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString()));
+ //System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString()));
final CompiledAutomaton compiled = runAutomata.get(editDistance);
return new AutomatonFuzzyTermsEnum(terms.intersect(compiled, lastTerm == null ? null : compiled.floor(lastTerm, new BytesRef())),
runAutomata.subList(0, editDistance + 1).toArray(new CompiledAutomaton[editDistance + 1]));
@@ -165,20 +163,15 @@ public class FuzzyTermsEnum extends Term
private List<CompiledAutomaton> initAutomata(int maxDistance) {
final List<CompiledAutomaton> runAutomata = dfaAtt.automata();
//System.out.println("cached automata size: " + runAutomata.size());
- if (runAutomata.size() <= maxDistance &&
+ if (runAutomata.size() <= maxDistance &&
maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
LevenshteinAutomata builder =
new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength), transpositions);
+ String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
for (int i = runAutomata.size(); i <= maxDistance; i++) {
- Automaton a = builder.toAutomaton(i);
+ Automaton a = builder.toAutomaton(i, prefix);
//System.out.println("compute automaton n=" + i);
- // constant prefix
- if (realPrefixLength > 0) {
- Automaton prefix = BasicAutomata.makeString(
- UnicodeUtil.newString(termText, 0, realPrefixLength));
- a = BasicOperations.concatenate(prefix, a);
- }
runAutomata.add(new CompiledAutomaton(a, true, false));
}
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java Thu Jun 19 08:22:08 2014
@@ -1,7 +1,6 @@
package org.apache.lucene.search;
import org.apache.lucene.index.Term;
-
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonProvider;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java Thu Jun 19 08:22:08 2014
@@ -17,14 +17,14 @@ package org.apache.lucene.search;
* limitations under the License.
*/
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
-
-import java.util.ArrayList;
-import java.util.List;
/** Implements the wildcard search query. Supported wildcards are <code>*</code>, which
* matches any character sequence (including the empty one), and <code>?</code>,
@@ -72,26 +72,26 @@ public class WildcardQuery extends Autom
int length = Character.charCount(c);
switch(c) {
case WILDCARD_STRING:
- automata.add(BasicAutomata.makeAnyString());
+ automata.add(Automata.makeAnyString());
break;
case WILDCARD_CHAR:
- automata.add(BasicAutomata.makeAnyChar());
+ automata.add(Automata.makeAnyChar());
break;
case WILDCARD_ESCAPE:
// add the next codepoint instead, if it exists
if (i + length < wildcardText.length()) {
final int nextChar = wildcardText.codePointAt(i + length);
length += Character.charCount(nextChar);
- automata.add(BasicAutomata.makeChar(nextChar));
+ automata.add(Automata.makeChar(nextChar));
break;
} // else fallthru, lenient parsing with a trailing \
default:
- automata.add(BasicAutomata.makeChar(c));
+ automata.add(Automata.makeChar(c));
}
i += length;
}
- return BasicOperations.concatenate(automata);
+ return Operations.concatenate(automata);
}
/**
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java Thu Jun 19 08:22:08 2014
@@ -21,7 +21,8 @@ package org.apache.lucene.util.automaton
* Automaton representation for matching UTF-8 byte[].
*/
public class ByteRunAutomaton extends RunAutomaton {
-
+
+ /** Converts incoming automaton to byte-based (UTF32ToUTF8) first */
public ByteRunAutomaton(Automaton a) {
this(a, false);
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java Thu Jun 19 08:22:08 2014
@@ -22,6 +22,7 @@ package org.apache.lucene.util.automaton
*/
public class CharacterRunAutomaton extends RunAutomaton {
+ /** Sole constructor. */
public CharacterRunAutomaton(Automaton a) {
super(a, Character.MAX_CODE_POINT, false);
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java Thu Jun 19 08:22:08 2014
@@ -19,7 +19,6 @@ package org.apache.lucene.util.automaton
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.Terms;
@@ -52,6 +51,8 @@ public class CompiledAutomaton {
/** Catch-all for any other automata. */
NORMAL
};
+
+ /** If simplify is true this will be the "simplified" type; else, this is NORMAL */
public final AUTOMATON_TYPE type;
/**
@@ -65,21 +66,22 @@ public class CompiledAutomaton {
* only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
public final ByteRunAutomaton runAutomaton;
- // TODO: would be nice if these sortedTransitions had "int
- // to;" instead of "State to;" somehow:
+
/**
* Two dimensional array of transitions, indexed by state
* number for traversal. The state numbering is consistent with
* {@link #runAutomaton}.
* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
- public final Transition[][] sortedTransitions;
+ public final Automaton automaton;
+
/**
* Shared common suffix accepted by the automaton. Only valid
* for {@link AUTOMATON_TYPE#NORMAL}, and only when the
* automaton accepts an infinite language.
*/
public final BytesRef commonSuffixRef;
+
/**
* Indicates if the automaton accepts a finite set of strings.
* Null if this was not computed.
@@ -87,125 +89,157 @@ public class CompiledAutomaton {
*/
public final Boolean finite;
+ /** Create this, passing simplify=true and finite=null, so that we try
+ * to simplify the automaton and determine if it is finite. */
public CompiledAutomaton(Automaton automaton) {
this(automaton, null, true);
}
+ /** Create this. If finite is null, we use {@link Operations#isFinite}
+ * to determine whether it is finite. If simplify is true, we run
+ * possibly expensive operations to determine if the automaton is one
+ * the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
+ if (automaton.getNumStates() == 0) {
+ automaton = new Automaton();
+ automaton.createState();
+ }
+
if (simplify) {
+
// Test whether the automaton is a "simple" form and
// if so, don't create a runAutomaton. Note that on a
// large automaton these tests could be costly:
- if (BasicOperations.isEmpty(automaton)) {
+
+ if (Operations.isEmpty(automaton)) {
// matches nothing
type = AUTOMATON_TYPE.NONE;
term = null;
commonSuffixRef = null;
runAutomaton = null;
- sortedTransitions = null;
+ this.automaton = null;
this.finite = null;
return;
- } else if (BasicOperations.isTotal(automaton)) {
+ // NOTE: only approximate, because automaton may not be minimal:
+ } else if (Operations.isTotal(automaton)) {
// matches all possible strings
type = AUTOMATON_TYPE.ALL;
term = null;
commonSuffixRef = null;
runAutomaton = null;
- sortedTransitions = null;
+ this.automaton = null;
this.finite = null;
return;
} else {
- final String commonPrefix;
+
+ automaton = Operations.determinize(automaton);
+
+ final String commonPrefix = Operations.getCommonPrefix(automaton);
final String singleton;
- if (automaton.getSingleton() == null) {
- commonPrefix = SpecialOperations.getCommonPrefix(automaton);
- if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
- singleton = commonPrefix;
- } else {
- singleton = null;
- }
+
+ if (commonPrefix.length() > 0 && Operations.sameLanguage(automaton, Automata.makeString(commonPrefix))) {
+ singleton = commonPrefix;
} else {
- commonPrefix = null;
- singleton = automaton.getSingleton();
+ singleton = null;
}
-
+
if (singleton != null) {
- // matches a fixed string in singleton or expanded
- // representation
+ // matches a fixed string
type = AUTOMATON_TYPE.SINGLE;
term = new BytesRef(singleton);
commonSuffixRef = null;
runAutomaton = null;
- sortedTransitions = null;
- this.finite = null;
- return;
- } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
- BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
- // matches a constant prefix
- type = AUTOMATON_TYPE.PREFIX;
- term = new BytesRef(commonPrefix);
- commonSuffixRef = null;
- runAutomaton = null;
- sortedTransitions = null;
+ this.automaton = null;
this.finite = null;
return;
+ } else if (commonPrefix.length() > 0) {
+ Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString());
+ other = Operations.determinize(other);
+ assert Operations.hasDeadStates(other) == false;
+ if (Operations.sameLanguage(automaton, other)) {
+ // matches a constant prefix
+ type = AUTOMATON_TYPE.PREFIX;
+ term = new BytesRef(commonPrefix);
+ commonSuffixRef = null;
+ runAutomaton = null;
+ this.automaton = null;
+ this.finite = null;
+ return;
+ }
}
}
}
type = AUTOMATON_TYPE.NORMAL;
term = null;
+
if (finite == null) {
- this.finite = SpecialOperations.isFinite(automaton);
+ this.finite = Operations.isFinite(automaton);
} else {
this.finite = finite;
}
+
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
if (this.finite) {
commonSuffixRef = null;
} else {
- commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
+ commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8);
}
runAutomaton = new ByteRunAutomaton(utf8, true);
- sortedTransitions = utf8.getSortedTransitions();
+
+ this.automaton = runAutomaton.automaton;
}
+
+ private Transition transition = new Transition();
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {
-
+ //System.out.println("addTail state=" + state + " term=" + term.utf8ToString() + " idx=" + idx + " leadLabel=" + (char) leadLabel);
+ //System.out.println(automaton.toDot());
// Find biggest transition that's < label
// TODO: use binary search here
- Transition maxTransition = null;
- for (Transition transition : sortedTransitions[state]) {
+ int maxIndex = -1;
+ int numTransitions = automaton.initTransition(state, transition);
+ for(int i=0;i<numTransitions;i++) {
+ automaton.getNextTransition(transition);
if (transition.min < leadLabel) {
- maxTransition = transition;
+ maxIndex = i;
+ } else {
+ // Transitions are alway sorted
+ break;
}
}
- assert maxTransition != null;
+ //System.out.println(" maxIndex=" + maxIndex);
+
+ assert maxIndex != -1;
+ automaton.getTransition(state, maxIndex, transition);
// Append floorLabel
final int floorLabel;
- if (maxTransition.max > leadLabel-1) {
+ if (transition.max > leadLabel-1) {
floorLabel = leadLabel-1;
} else {
- floorLabel = maxTransition.max;
+ floorLabel = transition.max;
}
+ //System.out.println(" floorLabel=" + (char) floorLabel);
if (idx >= term.bytes.length) {
term.grow(1+idx);
}
//if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx);
term.bytes[idx] = (byte) floorLabel;
- state = maxTransition.to.getNumber();
+ state = transition.dest;
+ //System.out.println(" dest: " + state);
idx++;
// Push down to last accept state
while (true) {
- Transition[] transitions = sortedTransitions[state];
- if (transitions.length == 0) {
+ numTransitions = automaton.getNumTransitions(state);
+ if (numTransitions == 0) {
+ //System.out.println("state=" + state + " 0 trans");
assert runAutomaton.isAccept(state);
term.length = idx;
//if (DEBUG) System.out.println(" return " + term.utf8ToString());
@@ -213,14 +247,15 @@ public class CompiledAutomaton {
} else {
// We are pushing "top" -- so get last label of
// last transition:
- assert transitions.length != 0;
- Transition lastTransition = transitions[transitions.length-1];
+ //System.out.println("get state=" + state + " numTrans=" + numTransitions);
+ automaton.getTransition(state, numTransitions-1, transition);
if (idx >= term.bytes.length) {
term.grow(1+idx);
}
//if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
- term.bytes[idx] = (byte) lastTransition.max;
- state = lastTransition.to.getNumber();
+ //System.out.println(" add trans dest=" + scratch.dest + " label=" + (char) scratch.max);
+ term.bytes[idx] = (byte) transition.max;
+ state = transition.dest;
idx++;
}
}
@@ -229,6 +264,8 @@ public class CompiledAutomaton {
// TODO: should this take startTerm too? This way
// Terms.intersect could forward to this method if type !=
// NORMAL:
+ /** Return a {@link TermsEnum} intersecting the provided {@link Terms}
+ * with the terms accepted by this automaton. */
public TermsEnum getTermsEnum(Terms terms) throws IOException {
switch(type) {
case NONE:
@@ -301,33 +338,36 @@ public class CompiledAutomaton {
// Pop back to a state that has a transition
// <= our label:
while (true) {
- Transition[] transitions = sortedTransitions[state];
- if (transitions.length == 0) {
+ int numTransitions = automaton.getNumTransitions(state);
+ if (numTransitions == 0) {
assert runAutomaton.isAccept(state);
output.length = idx;
//if (DEBUG) System.out.println(" return " + output.utf8ToString());
return output;
- } else if (label-1 < transitions[0].min) {
+ } else {
+ automaton.getTransition(state, 0, transition);
- if (runAutomaton.isAccept(state)) {
- output.length = idx;
- //if (DEBUG) System.out.println(" return " + output.utf8ToString());
- return output;
- }
- // pop
- if (stack.size() == 0) {
- //if (DEBUG) System.out.println(" pop ord=" + idx + " return null");
- return null;
+ if (label-1 < transition.min) {
+
+ if (runAutomaton.isAccept(state)) {
+ output.length = idx;
+ //if (DEBUG) System.out.println(" return " + output.utf8ToString());
+ return output;
+ }
+ // pop
+ if (stack.size() == 0) {
+ //if (DEBUG) System.out.println(" pop ord=" + idx + " return null");
+ return null;
+ } else {
+ state = stack.remove(stack.size()-1);
+ idx--;
+ //if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min);
+ label = input.bytes[input.offset + idx] & 0xff;
+ }
} else {
- state = stack.remove(stack.size()-1);
- idx--;
- //if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min);
- label = input.bytes[input.offset + idx] & 0xff;
+ //if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min);
+ break;
}
-
- } else {
- //if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min);
- break;
}
}
@@ -346,26 +386,6 @@ public class CompiledAutomaton {
}
}
}
-
- public String toDot() {
- StringBuilder b = new StringBuilder("digraph CompiledAutomaton {\n");
- b.append(" rankdir = LR;\n");
- int initial = runAutomaton.getInitialState();
- for (int i = 0; i < sortedTransitions.length; i++) {
- b.append(" ").append(i);
- if (runAutomaton.isAccept(i)) b.append(" [shape=doublecircle,label=\"\"];\n");
- else b.append(" [shape=circle,label=\"\"];\n");
- if (i == initial) {
- b.append(" initial [shape=plaintext,label=\"\"];\n");
- b.append(" initial -> ").append(i).append("\n");
- }
- for (int j = 0; j < sortedTransitions[i].length; j++) {
- b.append(" ").append(i);
- sortedTransitions[i][j].appendDot(b);
- }
- }
- return b.append("}\n").toString();
- }
@Override
public int hashCode() {
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java Thu Jun 19 08:22:08 2014
@@ -21,6 +21,8 @@ import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
+import org.apache.lucene.util.UnicodeUtil;
+
/**
* Class to construct DFAs that match a word within some edit distance.
* <p>
@@ -30,7 +32,8 @@ import java.util.TreeSet;
* @lucene.experimental
*/
public class LevenshteinAutomata {
- /** @lucene.internal */
+ /** Maximum edit distance this class can generate an automaton for.
+ * @lucene.internal */
public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
/* input word */
final int word[];
@@ -112,7 +115,7 @@ public class LevenshteinAutomata {
}
return word;
}
-
+
/**
* Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
* <p>
@@ -125,8 +128,25 @@ public class LevenshteinAutomata {
* </p>
*/
public Automaton toAutomaton(int n) {
+ return toAutomaton(n, "");
+ }
+
+ /**
+ * Compute a DFA that accepts all strings within an edit distance of <code>n</code>,
+ * matching the specified exact prefix.
+ * <p>
+ * All automata have the following properties:
+ * <ul>
+ * <li>They are deterministic (DFA).
+ * <li>There are no transitions to dead states.
+ * <li>They are not minimal (some transitions could be combined).
+ * </ul>
+ * </p>
+ */
+ public Automaton toAutomaton(int n, String prefix) {
+ assert prefix != null;
if (n == 0) {
- return BasicAutomata.makeString(word, 0, word.length);
+ return Automata.makeString(prefix + UnicodeUtil.newString(word, 0, word.length));
}
if (n >= descriptions.length)
@@ -135,15 +155,36 @@ public class LevenshteinAutomata {
final int range = 2*n+1;
ParametricDescription description = descriptions[n];
// the number of states is based on the length of the word and n
- State states[] = new State[description.size()];
+ int numStates = description.size();
+
+ Automaton a = new Automaton();
+ int lastState;
+ if (prefix != null) {
+ // Insert prefix
+ lastState = a.createState();
+ for (int i = 0, cp = 0; i < prefix.length(); i += Character.charCount(cp)) {
+ int state = a.createState();
+ cp = prefix.codePointAt(i);
+ a.addTransition(lastState, state, cp, cp);
+ lastState = state;
+ }
+ } else {
+ lastState = a.createState();
+ }
+
+ int stateOffset = lastState;
+ a.setAccept(lastState, description.isAccept(0));
+
// create all states, and mark as accept states if appropriate
- for (int i = 0; i < states.length; i++) {
- states[i] = new State();
- states[i].number = i;
- states[i].setAccept(description.isAccept(i));
+ for (int i = 1; i < numStates; i++) {
+ int state = a.createState();
+ a.setAccept(state, description.isAccept(i));
}
+
+ // TODO: this creates bogus states/transitions (states are final, have self loops, and can't be reached from an init state)
+
// create transitions from state to state
- for (int k = 0; k < states.length; k++) {
+ for (int k = 0; k < numStates; k++) {
final int xpos = description.getPosition(k);
if (xpos < 0)
continue;
@@ -154,31 +195,26 @@ public class LevenshteinAutomata {
// get the characteristic vector at this position wrt ch
final int cvec = getVector(ch, xpos, end);
int dest = description.transition(k, xpos, cvec);
- if (dest >= 0)
- states[k].addTransition(new Transition(ch, states[dest]));
+ if (dest >= 0) {
+ a.addTransition(stateOffset+k, stateOffset+dest, ch);
+ }
}
// add transitions for all other chars in unicode
// by definition, their characteristic vectors are always 0,
// because they do not exist in the input string.
int dest = description.transition(k, xpos, 0); // by definition
- if (dest >= 0)
- for (int r = 0; r < numRanges; r++)
- states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest]));
+ if (dest >= 0) {
+ for (int r = 0; r < numRanges; r++) {
+ a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]);
+ }
+ }
}
- Automaton a = new Automaton(states[0]);
- a.setDeterministic(true);
- // we create some useless unconnected states, and its a net-win overall to remove these,
- // as well as to combine any adjacent transitions (it makes later algorithms more efficient).
- // so, while we could set our numberedStates here, its actually best not to, and instead to
- // force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions.
- //a.setNumberedStates(states);
- a.reduce();
- // we need not trim transitions to dead states, as they are not created.
- //a.restoreInvariant();
+ a.finishState();
+ assert a.isDeterministic();
return a;
}
-
+
/**
* Get the characteristic vector <code>X(x, V)</code>
* where V is <code>substring(pos, end)</code>
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java Thu Jun 19 08:22:08 2014
@@ -361,8 +361,6 @@ public class RegExp {
*/
public static final int NONE = 0x0000;
- private static boolean allow_mutation = false;
-
Kind kind;
RegExp exp1, exp2;
String s;
@@ -419,13 +417,13 @@ public class RegExp {
to = e.to;
b = null;
}
-
+
/**
* Constructs new <code>Automaton</code> from this <code>RegExp</code>. Same
* as <code>toAutomaton(null)</code> (empty automaton map).
*/
public Automaton toAutomaton() {
- return toAutomatonAllowMutate(null, null);
+ return toAutomaton(null, null);
}
/**
@@ -439,7 +437,7 @@ public class RegExp {
*/
public Automaton toAutomaton(AutomatonProvider automaton_provider)
throws IllegalArgumentException {
- return toAutomatonAllowMutate(null, automaton_provider);
+ return toAutomaton(null, automaton_provider);
}
/**
@@ -454,32 +452,9 @@ public class RegExp {
*/
public Automaton toAutomaton(Map<String,Automaton> automata)
throws IllegalArgumentException {
- return toAutomatonAllowMutate(automata, null);
- }
-
- /**
- * Sets or resets allow mutate flag. If this flag is set, then automata
- * construction uses mutable automata, which is slightly faster but not thread
- * safe. By default, the flag is not set.
- *
- * @param flag if true, the flag is set
- * @return previous value of the flag
- */
- public boolean setAllowMutate(boolean flag) {
- boolean b = allow_mutation;
- allow_mutation = flag;
- return b;
- }
-
- private Automaton toAutomatonAllowMutate(Map<String,Automaton> automata,
- AutomatonProvider automaton_provider) throws IllegalArgumentException {
- boolean b = false;
- if (allow_mutation) b = Automaton.setAllowMutate(true); // thread unsafe
- Automaton a = toAutomaton(automata, automaton_provider);
- if (allow_mutation) Automaton.setAllowMutate(b);
- return a;
+ return toAutomaton(automata, null);
}
-
+
private Automaton toAutomaton(Map<String,Automaton> automata,
AutomatonProvider automaton_provider) throws IllegalArgumentException {
List<Automaton> list;
@@ -489,8 +464,8 @@ public class RegExp {
list = new ArrayList<>();
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
- a = BasicOperations.union(list);
- MinimizationOperations.minimize(a);
+ a = Operations.union(list);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_CONCATENATION:
list = new ArrayList<>();
@@ -498,66 +473,72 @@ public class RegExp {
automaton_provider);
findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
automaton_provider);
- a = BasicOperations.concatenate(list);
- MinimizationOperations.minimize(a);
+ a = Operations.concatenate(list);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_INTERSECTION:
- a = exp1.toAutomaton(automata, automaton_provider).intersection(
+ a = Operations.intersection(
+ exp1.toAutomaton(automata, automaton_provider),
exp2.toAutomaton(automata, automaton_provider));
- MinimizationOperations.minimize(a);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_OPTIONAL:
- a = exp1.toAutomaton(automata, automaton_provider).optional();
- MinimizationOperations.minimize(a);
+ a = Operations.optional(exp1.toAutomaton(automata, automaton_provider));
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_REPEAT:
- a = exp1.toAutomaton(automata, automaton_provider).repeat();
- MinimizationOperations.minimize(a);
+ a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider));
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_REPEAT_MIN:
- a = exp1.toAutomaton(automata, automaton_provider).repeat(min);
- MinimizationOperations.minimize(a);
+ a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_REPEAT_MINMAX:
- a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max);
- MinimizationOperations.minimize(a);
+ a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min, max);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_COMPLEMENT:
- a = exp1.toAutomaton(automata, automaton_provider).complement();
- MinimizationOperations.minimize(a);
+ a = Operations.complement(exp1.toAutomaton(automata, automaton_provider));
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_CHAR:
- a = BasicAutomata.makeChar(c);
+ a = Automata.makeChar(c);
break;
case REGEXP_CHAR_RANGE:
- a = BasicAutomata.makeCharRange(from, to);
+ a = Automata.makeCharRange(from, to);
break;
case REGEXP_ANYCHAR:
- a = BasicAutomata.makeAnyChar();
+ a = Automata.makeAnyChar();
break;
case REGEXP_EMPTY:
- a = BasicAutomata.makeEmpty();
+ a = Automata.makeEmpty();
break;
case REGEXP_STRING:
- a = BasicAutomata.makeString(s);
+ a = Automata.makeString(s);
break;
case REGEXP_ANYSTRING:
- a = BasicAutomata.makeAnyString();
+ a = Automata.makeAnyString();
break;
case REGEXP_AUTOMATON:
Automaton aa = null;
- if (automata != null) aa = automata.get(s);
- if (aa == null && automaton_provider != null) try {
- aa = automaton_provider.getAutomaton(s);
- } catch (IOException e) {
- throw new IllegalArgumentException(e);
+ if (automata != null) {
+ aa = automata.get(s);
}
- if (aa == null) throw new IllegalArgumentException("'" + s
- + "' not found");
- a = aa.clone(); // always clone here (ignore allow_mutate)
+ if (aa == null && automaton_provider != null) {
+ try {
+ aa = automaton_provider.getAutomaton(s);
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+ if (aa == null) {
+ throw new IllegalArgumentException("'" + s + "' not found");
+ }
+ a = aa;
break;
case REGEXP_INTERVAL:
- a = BasicAutomata.makeInterval(min, max, digits);
+ a = Automata.makeInterval(min, max, digits);
break;
}
return a;
@@ -568,7 +549,9 @@ public class RegExp {
if (exp.kind == kind) {
findLeaves(exp.exp1, kind, list, automata, automaton_provider);
findLeaves(exp.exp2, kind, list, automata, automaton_provider);
- } else list.add(exp.toAutomaton(automata, automaton_provider));
+ } else {
+ list.add(exp.toAutomaton(automata, automaton_provider));
+ }
}
/**
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java?rev=1603752&r1=1603751&r2=1603752&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java Thu Jun 19 08:22:08 2014
@@ -37,6 +37,7 @@ import java.util.Arrays;
* @lucene.experimental
*/
public abstract class RunAutomaton {
+ final Automaton automaton;
final int maxInterval;
final int size;
final boolean[] accept;
@@ -65,10 +66,10 @@ public abstract class RunAutomaton {
if (j + 1 < points.length) max = (points[j + 1] - 1);
else max = maxInterval;
b.append(" ");
- Transition.appendCharString(min, b);
+ Automaton.appendCharString(min, b);
if (min != max) {
b.append("-");
- Transition.appendCharString(max, b);
+ Automaton.appendCharString(max, b);
}
b.append(" -> ").append(k).append("\n");
}
@@ -110,7 +111,7 @@ public abstract class RunAutomaton {
* Gets character class of given codepoint
*/
final int getCharClass(int c) {
- return SpecialOperations.findIndex(c, points);
+ return Operations.findIndex(c, points);
}
/**
@@ -121,23 +122,23 @@ public abstract class RunAutomaton {
*/
public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
this.maxInterval = maxInterval;
- a.determinize();
+ a = Operations.determinize(a);
+ this.automaton = a;
points = a.getStartPoints();
- final State[] states = a.getNumberedStates();
- initial = a.initial.number;
- size = states.length;
+ initial = 0;
+ size = Math.max(1,a.getNumStates());
accept = new boolean[size];
transitions = new int[size * points.length];
- for (int n = 0; n < size * points.length; n++)
- transitions[n] = -1;
- for (State s : states) {
- int n = s.number;
- accept[n] = s.accept;
+ Arrays.fill(transitions, -1);
+ for (int n=0;n<size;n++) {
+ accept[n] = a.isAccept(n);
for (int c = 0; c < points.length; c++) {
- State q = s.step(points[c]);
- if (q != null) transitions[n * points.length + c] = q.number;
+ int dest = a.step(n, points[c]);
+ assert dest == -1 || dest < size;
+ transitions[n * points.length + c] = dest;
}
}
+
/*
* Set alphabet table for optimal run performance.
*/
@@ -145,8 +146,9 @@ public abstract class RunAutomaton {
classmap = new int[maxInterval + 1];
int i = 0;
for (int j = 0; j <= maxInterval; j++) {
- if (i + 1 < points.length && j == points[i + 1])
+ if (i + 1 < points.length && j == points[i + 1]) {
i++;
+ }
classmap[j] = i;
}
} else {
@@ -162,10 +164,11 @@ public abstract class RunAutomaton {
* transition function.)
*/
public final int step(int state, int c) {
- if (classmap == null)
+ if (classmap == null) {
return transitions[state * points.length + getCharClass(c)];
- else
+ } else {
return transitions[state * points.length + classmap[c]];
+ }
}
@Override