You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by pa...@apache.org on 2021/12/20 22:52:15 UTC
[lucene] branch main updated: LUCENE-10010 Introduce NFARunAutomaton to run NFA directly (#225)
This is an automated email from the ASF dual-hosted git repository.
patrickz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new c972c6a LUCENE-10010 Introduce NFARunAutomaton to run NFA directly (#225)
c972c6a is described below
commit c972c6a7754e9ff1f838ed72ff09e7def00be5f8
Author: Patrick Zhai <zh...@users.noreply.github.com>
AuthorDate: Mon Dec 20 14:52:09 2021 -0800
LUCENE-10010 Introduce NFARunAutomaton to run NFA directly (#225)
LUCENE-10010 Introduce NFARunAutomaton to run NFA directly
Created 2 new interfaces ByteRunnable and TransisitonAccessor to represent function necessary for performing automaton run. Introduced NFARunAutomaton and incorporated into CompiledAutomaton and enables directly running AutomatonQuery based on NFA for all codecs.
---
lucene/CHANGES.txt | 6 +-
.../lucene40/blocktree/FieldReader.java | 6 +-
.../lucene40/blocktree/IntersectTermsEnum.java | 12 +-
.../blocktreeords/OrdsIntersectTermsEnum.java | 38 +-
.../blocktreeords/OrdsIntersectTermsEnumFrame.java | 8 +-
.../lucene/codecs/memory/DirectPostingsFormat.java | 56 +--
.../lucene/codecs/memory/FSTTermsReader.java | 6 +-
.../codecs/uniformsplit/IntersectBlockReader.java | 69 ++--
.../codecs/lucene90/blocktree/FieldReader.java | 6 +-
.../lucene90/blocktree/IntersectTermsEnum.java | 12 +-
.../apache/lucene/index/AutomatonTermsEnum.java | 52 +--
.../org/apache/lucene/search/AutomatonQuery.java | 6 +-
.../apache/lucene/util/automaton/Automaton.java | 16 +-
.../lucene/util/automaton/ByteRunAutomaton.java | 13 +-
.../{ByteRunAutomaton.java => ByteRunnable.java} | 42 +-
.../lucene/util/automaton/CompiledAutomaton.java | 82 +++-
.../lucene/util/automaton/NFARunAutomaton.java | 429 +++++++++++++++++++++
.../apache/lucene/util/automaton/Operations.java | 6 +-
.../apache/lucene/util/automaton/RunAutomaton.java | 7 +-
.../org/apache/lucene/util/automaton/StateSet.java | 5 +
.../lucene/util/automaton/TransitionAccessor.java | 39 ++
.../apache/lucene/search/TestAutomatonQuery.java | 12 -
.../util/automaton/MinimizationOperations.java | 1 +
.../lucene/util/automaton/TestNFARunAutomaton.java | 176 +++++++++
24 files changed, 896 insertions(+), 209 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 974197f..79a2fdd 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -41,13 +41,15 @@ API Changes
* LUCENE-10244: MultiCollector::getCollectors is now public, allowing users to access the wrapped
collectors. (Andriy Redko)
-
+
* LUCENE-10197: UnifiedHighlighter now has a Builder to construct it. The UH's setters are now
deprecated. (Animesh Pandey, David Smiley)
New Features
---------------------
+* LUCENE-10010 Introduce NFARunAutomaton to run NFA directly. (Patrick Zhai)
+
* LUCENE-10255: Lucene JARs are now proper modules, with module descriptors and dependency information.
(Chris Hegarty, Uwe Schindler, Tomoko Uchida, Dawid Weiss)
@@ -112,7 +114,7 @@ Bug Fixes
* LUCENE-10287: Fix Luke startup script to add jdk.unsupported module so
MMapDirectory is used when opening indexes. (Uwe Schindler)
-
+
* LUCENE-10236: Stop duplicating norms when scoring in CombinedFieldQuery.
(Zach Chen, Jim Ferenczi, Julie Tibshirani)
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java
index 284b090..0ede117 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java
@@ -188,7 +188,11 @@ public final class FieldReader extends Terms {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new IntersectTermsEnum(
- this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm);
+ this,
+ compiled.getTransitionAccessor(),
+ compiled.getByteRunnable(),
+ compiled.commonSuffixRef,
+ startTerm);
}
@Override
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/IntersectTermsEnum.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/IntersectTermsEnum.java
index 12a40d7..c61795b 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/IntersectTermsEnum.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/IntersectTermsEnum.java
@@ -27,9 +27,9 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.TransitionAccessor;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
@@ -53,8 +53,8 @@ final class IntersectTermsEnum extends BaseTermsEnum {
@SuppressWarnings({"rawtypes", "unchecked"})
private FST.Arc<BytesRef>[] arcs = new FST.Arc[5];
- final RunAutomaton runAutomaton;
- final Automaton automaton;
+ final ByteRunnable runAutomaton;
+ final TransitionAccessor automaton;
final BytesRef commonSuffix;
private IntersectTermsEnumFrame currentFrame;
@@ -72,8 +72,8 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// regexp foo*bar must be at least length 6 bytes
public IntersectTermsEnum(
FieldReader fr,
- Automaton automaton,
- RunAutomaton runAutomaton,
+ TransitionAccessor automaton,
+ ByteRunnable runAutomaton,
BytesRef commonSuffix,
BytesRef startTerm)
throws IOException {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnum.java
index 2a95da5..a4752ee 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnum.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnum.java
@@ -27,8 +27,9 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.TransitionAccessor;
import org.apache.lucene.util.fst.FST;
// NOTE: cannot seek!
@@ -40,8 +41,9 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
@SuppressWarnings({"rawtypes", "unchecked"})
private FST.Arc<Output>[] arcs = new FST.Arc[5];
- final RunAutomaton runAutomaton;
- final CompiledAutomaton compiledAutomaton;
+ final ByteRunnable byteRunnable;
+ protected final TransitionAccessor transitionAccessor;
+ private final BytesRef commonSuffixRef;
private OrdsIntersectTermsEnumFrame currentFrame;
@@ -62,8 +64,9 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
// brToString(compiled.commonSuffixRef));
// }
this.fr = fr;
- runAutomaton = compiled.runAutomaton;
- compiledAutomaton = compiled;
+ this.byteRunnable = compiled.getByteRunnable();
+ this.transitionAccessor = compiled.getTransitionAccessor();
+ commonSuffixRef = compiled.commonSuffixRef;
in = fr.parent.in.clone();
stack = new OrdsIntersectTermsEnumFrame[5];
for (int idx = 0; idx < stack.length; idx++) {
@@ -220,7 +223,7 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
int state = currentFrame.state;
for (int idx = 0; idx < currentFrame.suffix; idx++) {
state =
- runAutomaton.step(
+ byteRunnable.step(
state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff);
assert state != -1;
}
@@ -390,7 +393,7 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
continue nextTerm;
}
currentFrame.transitionIndex++;
- compiledAutomaton.automaton.getNextTransition(currentFrame.transition);
+ transitionAccessor.getNextTransition(currentFrame.transition);
currentFrame.curTransitionMax = currentFrame.transition.max;
// if (DEBUG) System.out.println(" next trans=" +
// currentFrame.transitions[currentFrame.transitionIndex]);
@@ -398,9 +401,9 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
}
// First test the common suffix, if set:
- if (compiledAutomaton.commonSuffixRef != null && !isSubBlock) {
+ if (commonSuffixRef != null && !isSubBlock) {
final int termLen = currentFrame.prefix + currentFrame.suffix;
- if (termLen < compiledAutomaton.commonSuffixRef.length) {
+ if (termLen < commonSuffixRef.length) {
// No match
// if (DEBUG) {
// System.out.println(" skip: common suffix length");
@@ -409,10 +412,10 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
}
final byte[] suffixBytes = currentFrame.suffixBytes;
- final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes;
+ final byte[] commonSuffixBytes = commonSuffixRef.bytes;
- final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix;
- assert compiledAutomaton.commonSuffixRef.offset == 0;
+ final int lenInPrefix = commonSuffixRef.length - currentFrame.suffix;
+ assert commonSuffixRef.offset == 0;
int suffixBytesPos;
int commonSuffixBytesPos = 0;
@@ -434,14 +437,11 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
}
suffixBytesPos = currentFrame.startBytePos;
} else {
- suffixBytesPos =
- currentFrame.startBytePos
- + currentFrame.suffix
- - compiledAutomaton.commonSuffixRef.length;
+ suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffixRef.length;
}
// Test overlapping suffix part:
- final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length;
+ final int commonSuffixBytesPosEnd = commonSuffixRef.length;
while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
// if (DEBUG) {
@@ -462,7 +462,7 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
int state = currentFrame.state;
for (int idx = 0; idx < currentFrame.suffix; idx++) {
state =
- runAutomaton.step(
+ byteRunnable.step(
state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff);
if (state == -1) {
// No match
@@ -485,7 +485,7 @@ final class OrdsIntersectTermsEnum extends BaseTermsEnum {
// currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" :
// currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" +
// currentFrame.outputPrefix);
- } else if (runAutomaton.isAccept(state)) {
+ } else if (byteRunnable.isAccept(state)) {
copyTerm();
// if (DEBUG) System.out.println(" term match to state=" + state + "; return term=" +
// brToString(term));
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java
index d2d48f9..88e5536 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java
@@ -126,10 +126,10 @@ final class OrdsIntersectTermsEnumFrame {
public void setState(int state) {
this.state = state;
transitionIndex = 0;
- transitionCount = ite.compiledAutomaton.automaton.getNumTransitions(state);
+ transitionCount = ite.transitionAccessor.getNumTransitions(state);
if (transitionCount != 0) {
- ite.compiledAutomaton.automaton.initTransition(state, transition);
- ite.compiledAutomaton.automaton.getNextTransition(transition);
+ ite.transitionAccessor.initTransition(state, transition);
+ ite.transitionAccessor.getNextTransition(transition);
curTransitionMax = transition.max;
} else {
curTransitionMax = -1;
@@ -164,7 +164,7 @@ final class OrdsIntersectTermsEnumFrame {
// If current state is accept, we must process
// first block in case it has empty suffix:
- if (!ite.runAutomaton.isAccept(state)) {
+ if (!ite.byteRunnable.isAccept(state)) {
// Maybe skip floor blocks:
assert transitionIndex == 0 : "transitionIndex=" + transitionIndex;
while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
index f66ca45..c243dd0 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
@@ -44,9 +44,10 @@ import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.TransitionAccessor;
// TODO:
// - build depth-N prefix hash?
@@ -943,8 +944,9 @@ public final class DirectPostingsFormat extends PostingsFormat {
}
private final class DirectIntersectTermsEnum extends BaseTermsEnum {
- private final RunAutomaton runAutomaton;
- private final CompiledAutomaton compiledAutomaton;
+ private final ByteRunnable byteRunnable;
+ protected final TransitionAccessor transitionAccessor;
+ private final BytesRef commonSuffixRef;
private int termOrd;
private final BytesRef scratch = new BytesRef();
@@ -962,15 +964,16 @@ public final class DirectPostingsFormat extends PostingsFormat {
private int stateUpto;
public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) {
- runAutomaton = compiled.runAutomaton;
- compiledAutomaton = compiled;
+ this.byteRunnable = compiled.getByteRunnable();
+ this.transitionAccessor = compiled.getTransitionAccessor();
+ commonSuffixRef = compiled.commonSuffixRef;
termOrd = -1;
states = new State[1];
states[0] = new State();
states[0].changeOrd = terms.length;
states[0].state = 0;
- states[0].transitionCount = compiledAutomaton.automaton.getNumTransitions(states[0].state);
- compiledAutomaton.automaton.initTransition(states[0].state, states[0].transition);
+ states[0].transitionCount = transitionAccessor.getNumTransitions(states[0].state);
+ transitionAccessor.initTransition(states[0].state, states[0].transition);
states[0].transitionUpto = -1;
states[0].transitionMax = -1;
@@ -992,7 +995,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
while (label > states[i].transitionMax) {
states[i].transitionUpto++;
assert states[i].transitionUpto < states[i].transitionCount;
- compiledAutomaton.automaton.getNextTransition(states[i].transition);
+ transitionAccessor.getNextTransition(states[i].transition);
states[i].transitionMin = states[i].transition.min;
states[i].transitionMax = states[i].transition.max;
assert states[i].transitionMin >= 0;
@@ -1045,7 +1048,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
if (skipUpto < numSkips) {
grow();
- final int nextState = runAutomaton.step(states[stateUpto].state, label);
+ final int nextState = byteRunnable.step(states[stateUpto].state, label);
// Automaton is required to accept startTerm:
assert nextState != -1;
@@ -1054,8 +1057,8 @@ public final class DirectPostingsFormat extends PostingsFormat {
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
states[stateUpto].state = nextState;
states[stateUpto].transitionCount =
- compiledAutomaton.automaton.getNumTransitions(nextState);
- compiledAutomaton.automaton.initTransition(
+ transitionAccessor.getNumTransitions(nextState);
+ transitionAccessor.initTransition(
states[stateUpto].state, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
@@ -1154,7 +1157,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
// if (DEBUG) {
// System.out.println(" visit empty string");
// }
- if (runAutomaton.isAccept(states[0].state)) {
+ if (byteRunnable.isAccept(states[0].state)) {
scratch.bytes = termBytes;
scratch.offset = 0;
scratch.length = 0;
@@ -1236,7 +1239,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
}
continue nextTerm;
}
- compiledAutomaton.automaton.getNextTransition(state.transition);
+ transitionAccessor.getNextTransition(state.transition);
assert state.transitionUpto < state.transitionCount
: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitionCount;
state.transitionMin = state.transition.min;
@@ -1304,7 +1307,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
}
}
- int nextState = runAutomaton.step(states[stateUpto].state, label);
+ int nextState = byteRunnable.step(states[stateUpto].state, label);
if (nextState == -1) {
// Skip
@@ -1340,9 +1343,8 @@ public final class DirectPostingsFormat extends PostingsFormat {
stateUpto++;
states[stateUpto].state = nextState;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
- states[stateUpto].transitionCount =
- compiledAutomaton.automaton.getNumTransitions(nextState);
- compiledAutomaton.automaton.initTransition(nextState, states[stateUpto].transition);
+ states[stateUpto].transitionCount = transitionAccessor.getNumTransitions(nextState);
+ transitionAccessor.initTransition(nextState, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
@@ -1350,7 +1352,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
// if (DEBUG) {
// System.out.println(" term ends after push");
// }
- if (runAutomaton.isAccept(nextState)) {
+ if (byteRunnable.isAccept(nextState)) {
// if (DEBUG) {
// System.out.println(" automaton accepts: return");
// }
@@ -1374,17 +1376,17 @@ public final class DirectPostingsFormat extends PostingsFormat {
// TODO: add assert that we don't inc too many times
- if (compiledAutomaton.commonSuffixRef != null) {
- // System.out.println("suffix " + compiledAutomaton.commonSuffixRef.utf8ToString());
- assert compiledAutomaton.commonSuffixRef.offset == 0;
- if (termLength < compiledAutomaton.commonSuffixRef.length) {
+ if (commonSuffixRef != null) {
+ // System.out.println("suffix " + commonSuffixRef.utf8ToString());
+ assert commonSuffixRef.offset == 0;
+ if (termLength < commonSuffixRef.length) {
termOrd++;
skipUpto = 0;
continue nextTerm;
}
- int offset = termOffset + termLength - compiledAutomaton.commonSuffixRef.length;
- for (int suffix = 0; suffix < compiledAutomaton.commonSuffixRef.length; suffix++) {
- if (termBytes[offset + suffix] != compiledAutomaton.commonSuffixRef.bytes[suffix]) {
+ int offset = termOffset + termLength - commonSuffixRef.length;
+ for (int suffix = 0; suffix < commonSuffixRef.length; suffix++) {
+ if (termBytes[offset + suffix] != commonSuffixRef.bytes[suffix]) {
termOrd++;
skipUpto = 0;
continue nextTerm;
@@ -1394,7 +1396,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
int upto = stateUpto + 1;
while (upto < termLength) {
- nextState = runAutomaton.step(nextState, termBytes[termOffset + upto] & 0xFF);
+ nextState = byteRunnable.step(nextState, termBytes[termOffset + upto] & 0xFF);
if (nextState == -1) {
termOrd++;
skipUpto = 0;
@@ -1406,7 +1408,7 @@ public final class DirectPostingsFormat extends PostingsFormat {
upto++;
}
- if (runAutomaton.isAccept(nextState)) {
+ if (byteRunnable.isAccept(nextState)) {
scratch.bytes = termBytes;
scratch.offset = termOffsets[termOrd];
scratch.length = termOffsets[1 + termOrd] - scratch.offset;
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java
index 4f3791e..8a16de8 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java
@@ -45,7 +45,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
@@ -436,7 +436,7 @@ public class FSTTermsReader extends FieldsProducer {
final Outputs<FSTTermOutputs.TermData> fstOutputs;
/* query automaton to intersect with */
- final ByteRunAutomaton fsa;
+ final ByteRunnable fsa;
private final class Frame {
/* fst stats */
@@ -464,7 +464,7 @@ public class FSTTermsReader extends FieldsProducer {
this.fst = dict;
this.fstReader = fst.getBytesReader();
this.fstOutputs = dict.outputs;
- this.fsa = compiled.runAutomaton;
+ this.fsa = compiled.getByteRunnable();
this.level = -1;
this.stack = new Frame[16];
for (int i = 0; i < stack.length; i++) {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java
index d003807..22559af 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java
@@ -27,10 +27,10 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.TransitionAccessor;
/**
* The "intersect" {@link TermsEnum} response to {@link
@@ -72,8 +72,8 @@ public class IntersectBlockReader extends BlockReader {
*/
protected final int NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD = 4;
- protected final Automaton automaton;
- protected final ByteRunAutomaton runAutomaton;
+ protected final TransitionAccessor transitionAccessor;
+ protected final ByteRunnable byteRunnable;
protected final boolean finite;
protected final BytesRef commonSuffix; // maybe null
protected final int minTermLength;
@@ -106,8 +106,8 @@ public class IntersectBlockReader extends BlockReader {
BlockDecoder blockDecoder)
throws IOException {
super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder);
- automaton = compiled.automaton;
- runAutomaton = compiled.runAutomaton;
+ this.byteRunnable = compiled.getByteRunnable();
+ this.transitionAccessor = compiled.getTransitionAccessor();
finite = compiled.finite;
commonSuffix = compiled.commonSuffixRef;
minTermLength = getMinTermLength();
@@ -136,16 +136,16 @@ public class IntersectBlockReader extends BlockReader {
int state = 0;
Transition t = null;
while (true) {
- if (runAutomaton.isAccept(state)) {
+ if (byteRunnable.isAccept(state)) {
// The common prefix reaches a final state. So common prefix and common suffix overlap.
// Min term length is the max between common prefix and common suffix lengths.
return Math.max(commonPrefixLength, commonSuffixLength);
}
- if (automaton.getNumTransitions(state) == 1) {
+ if (transitionAccessor.getNumTransitions(state) == 1) {
if (t == null) {
t = new Transition();
}
- automaton.getTransition(state, 0, t);
+ transitionAccessor.getTransition(state, 0, t);
if (t.min == t.max) {
state = t.dest;
commonPrefixLength++;
@@ -230,10 +230,10 @@ public class IntersectBlockReader extends BlockReader {
int state = states[index];
while (true) {
if (index == lineTerm.length) {
- if (runAutomaton.isAccept(state)) {
+ if (byteRunnable.isAccept(state)) {
// The automaton accepts the current term. Record the number of matched bytes and
// return the term.
- assert runAutomaton.run(lineTerm.bytes, 0, lineTerm.length);
+ assert byteRunnable.run(lineTerm.bytes, 0, lineTerm.length);
numMatchedBytes = index;
if (numConsecutivelyRejectedTerms > 0) {
numConsecutivelyRejectedTerms = 0;
@@ -243,7 +243,7 @@ public class IntersectBlockReader extends BlockReader {
}
break;
}
- state = runAutomaton.step(state, lineTerm.bytes[index] & 0xff);
+ state = byteRunnable.step(state, lineTerm.bytes[index] & 0xff);
if (state == -1) {
// The automaton rejects the current term.
break;
@@ -254,7 +254,7 @@ public class IntersectBlockReader extends BlockReader {
}
// The current term is not accepted by the automaton.
// Still record the reached automaton state to start the next term steps from there.
- assert !runAutomaton.run(lineTerm.bytes, 0, lineTerm.length);
+ assert !byteRunnable.run(lineTerm.bytes, 0, lineTerm.length);
numMatchedBytes = index;
// If the number of consecutively rejected terms reaches the threshold,
// then determine whether it is worthwhile to jump to a block away.
@@ -366,7 +366,7 @@ public class IntersectBlockReader extends BlockReader {
protected class AutomatonNextTermCalculator {
// for path tracking: each short records gen when we last
// visited the state; we use gens to avoid having to clear
- protected final short[] visited;
+ protected short[] visited;
protected short curGen;
// the reference used for seeking forwards through the term dictionary
protected final BytesRefBuilder seekBytesRef = new BytesRefBuilder();
@@ -380,19 +380,22 @@ public class IntersectBlockReader extends BlockReader {
protected final IntsRefBuilder savedStates = new IntsRefBuilder();
protected AutomatonNextTermCalculator(CompiledAutomaton compiled) {
- visited = compiled.finite ? null : new short[runAutomaton.getSize()];
+ visited = compiled.finite ? null : new short[byteRunnable.getSize()];
}
/** Records the given state has been visited. */
- protected void setVisited(int state) {
- if (!finite) {
+ private void setVisited(int state) {
+ if (finite == false) {
+ if (state >= visited.length) {
+ visited = ArrayUtil.grow(visited, state + 1);
+ }
visited[state] = curGen;
}
}
/** Indicates whether the given state has been visited. */
- protected boolean isVisited(int state) {
- return !finite && visited[state] == curGen;
+ private boolean isVisited(int state) {
+ return !finite && state < visited.length && visited[state] == curGen;
}
/** True if the current state of the automata is best iterated linearly (without seeking). */
@@ -406,7 +409,7 @@ public class IntersectBlockReader extends BlockReader {
if (term == null) {
assert seekBytesRef.length() == 0;
// return the empty term, as it's valid
- if (runAutomaton.isAccept(0)) {
+ if (byteRunnable.isAccept(0)) {
return seekBytesRef.get();
}
} else {
@@ -433,13 +436,13 @@ public class IntersectBlockReader extends BlockReader {
int maxInterval = 0xff;
// System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef);
for (int i = 0; i < position; i++) {
- state = runAutomaton.step(state, seekBytesRef.byteAt(i) & 0xff);
+ state = byteRunnable.step(state, seekBytesRef.byteAt(i) & 0xff);
assert state >= 0 : "state=" + state;
}
- final int numTransitions = automaton.getNumTransitions(state);
- automaton.initTransition(state, transition);
+ final int numTransitions = transitionAccessor.getNumTransitions(state);
+ transitionAccessor.initTransition(state, transition);
for (int i = 0; i < numTransitions; i++) {
- automaton.getNextTransition(transition);
+ transitionAccessor.getNextTransition(transition);
if (transition.min <= (seekBytesRef.byteAt(position) & 0xff)
&& (seekBytesRef.byteAt(position) & 0xff) <= transition.max) {
maxInterval = transition.max;
@@ -484,7 +487,7 @@ public class IntersectBlockReader extends BlockReader {
// walk the automaton until a character is rejected.
for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) {
setVisited(state);
- int nextState = runAutomaton.step(state, seekBytesRef.byteAt(pos) & 0xff);
+ int nextState = byteRunnable.step(state, seekBytesRef.byteAt(pos) & 0xff);
if (nextState == -1) break;
savedStates.setIntAt(pos + 1, nextState);
// we found a loop, record it for faster enumeration
@@ -502,8 +505,8 @@ public class IntersectBlockReader extends BlockReader {
/* no more solutions exist from this useful portion, backtrack */
if ((pos = backtrack(pos)) < 0) /* no more solutions at all */ return false;
final int newState =
- runAutomaton.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff);
- if (newState >= 0 && runAutomaton.isAccept(newState))
+ byteRunnable.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff);
+ if (newState >= 0 && byteRunnable.isAccept(newState))
/* String is good to go as-is */
return true;
/* else advance further */
@@ -548,12 +551,12 @@ public class IntersectBlockReader extends BlockReader {
seekBytesRef.setLength(position);
setVisited(state);
- final int numTransitions = automaton.getNumTransitions(state);
- automaton.initTransition(state, transition);
+ final int numTransitions = transitionAccessor.getNumTransitions(state);
+ transitionAccessor.initTransition(state, transition);
// find the minimal path (lexicographic order) that is >= c
for (int i = 0; i < numTransitions; i++) {
- automaton.getNextTransition(transition);
+ transitionAccessor.getNextTransition(transition);
if (transition.max >= c) {
int nextChar = Math.max(c, transition.min);
// append either the next sequential char, or the minimum transition
@@ -564,15 +567,15 @@ public class IntersectBlockReader extends BlockReader {
* as long as is possible, continue down the minimal path in
* lexicographic order. if a loop or accept state is encountered, stop.
*/
- while (!isVisited(state) && !runAutomaton.isAccept(state)) {
+ while (!isVisited(state) && !byteRunnable.isAccept(state)) {
setVisited(state);
/*
* Note: we work with a DFA with no transitions to dead states.
* so the below is ok, if it is not an accept state,
* then there MUST be at least one transition.
*/
- automaton.initTransition(state, transition);
- automaton.getNextTransition(transition);
+ transitionAccessor.initTransition(state, transition);
+ transitionAccessor.getNextTransition(transition);
state = transition.dest;
// append the minimum transition
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java
index 78556f4..7aba781 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java
@@ -184,7 +184,11 @@ public final class FieldReader extends Terms {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new IntersectTermsEnum(
- this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm);
+ this,
+ compiled.getTransitionAccessor(),
+ compiled.getByteRunnable(),
+ compiled.commonSuffixRef,
+ startTerm);
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java
index 7c35f51..5773e4a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java
@@ -27,9 +27,9 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.TransitionAccessor;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
@@ -53,8 +53,8 @@ final class IntersectTermsEnum extends BaseTermsEnum {
@SuppressWarnings({"rawtypes", "unchecked"})
private FST.Arc<BytesRef>[] arcs = new FST.Arc[5];
- final RunAutomaton runAutomaton;
- final Automaton automaton;
+ final ByteRunnable runAutomaton;
+ final TransitionAccessor automaton;
final BytesRef commonSuffix;
private IntersectTermsEnumFrame currentFrame;
@@ -72,8 +72,8 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// regexp foo*bar must be at least length 6 bytes
public IntersectTermsEnum(
FieldReader fr,
- Automaton automaton,
- RunAutomaton runAutomaton,
+ TransitionAccessor automaton,
+ ByteRunnable runAutomaton,
BytesRef commonSuffix,
BytesRef startTerm)
throws IOException {
diff --git a/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
index d6884dd..ac16477 100644
--- a/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
@@ -23,10 +23,10 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.TransitionAccessor;
/**
* A FilteredTermsEnum that enumerates terms based upon what is accepted by a DFA.
@@ -47,16 +47,16 @@ import org.apache.lucene.util.automaton.Transition;
*/
public class AutomatonTermsEnum extends FilteredTermsEnum {
// a tableized array-based form of the DFA
- private final ByteRunAutomaton runAutomaton;
+ private final ByteRunnable byteRunnable;
// common suffix of the automaton
private final BytesRef commonSuffixRef;
// true if the automaton accepts a finite language
private final boolean finite;
// array of sorted transitions for each state, indexed by state number
- private final Automaton automaton;
+ private final TransitionAccessor transitionAccessor;
// Used for visited state tracking: each short records gen when we last
// visited the state; we use gens to avoid having to clear
- private final short[] visited;
+ private short[] visited;
private short curGen;
// the reference used for seeking forwards through the term dictionary
private final BytesRefBuilder seekBytesRef = new BytesRefBuilder();
@@ -82,25 +82,27 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
this.finite = compiled.finite;
- this.runAutomaton = compiled.runAutomaton;
- assert this.runAutomaton != null;
+ this.byteRunnable = compiled.getByteRunnable();
+ this.transitionAccessor = compiled.getTransitionAccessor();
this.commonSuffixRef = compiled.commonSuffixRef;
- this.automaton = compiled.automaton;
// No need to track visited states for a finite language without loops.
- visited = finite ? null : new short[runAutomaton.getSize()];
+ visited = finite ? null : new short[byteRunnable.getSize()];
}
/** Records the given state has been visited. */
private void setVisited(int state) {
if (!finite) {
+ if (state >= visited.length) {
+ visited = ArrayUtil.grow(visited, state + 1);
+ }
visited[state] = curGen;
}
}
/** Indicates whether the given state has been visited. */
private boolean isVisited(int state) {
- return !finite && visited[state] == curGen;
+ return !finite && state < visited.length && visited[state] == curGen;
}
/**
@@ -110,7 +112,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
@Override
protected AcceptStatus accept(final BytesRef term) {
if (commonSuffixRef == null || StringHelper.endsWith(term, commonSuffixRef)) {
- if (runAutomaton.run(term.bytes, term.offset, term.length))
+ if (byteRunnable.run(term.bytes, term.offset, term.length))
return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK;
else
return (linear && term.compareTo(linearUpperBound) < 0)
@@ -129,7 +131,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
if (term == null) {
assert seekBytesRef.length() == 0;
// return the empty term, as it's valid
- if (runAutomaton.isAccept(0)) {
+ if (byteRunnable.isAccept(0)) {
return seekBytesRef.get();
}
} else {
@@ -155,13 +157,13 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
int maxInterval = 0xff;
// System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef);
for (int i = 0; i < position; i++) {
- state = runAutomaton.step(state, seekBytesRef.byteAt(i) & 0xff);
+ state = byteRunnable.step(state, seekBytesRef.byteAt(i) & 0xff);
assert state >= 0 : "state=" + state;
}
- final int numTransitions = automaton.getNumTransitions(state);
- automaton.initTransition(state, transition);
+ final int numTransitions = transitionAccessor.getNumTransitions(state);
+ transitionAccessor.initTransition(state, transition);
for (int i = 0; i < numTransitions; i++) {
- automaton.getNextTransition(transition);
+ transitionAccessor.getNextTransition(transition);
if (transition.min <= (seekBytesRef.byteAt(position) & 0xff)
&& (seekBytesRef.byteAt(position) & 0xff) <= transition.max) {
maxInterval = transition.max;
@@ -206,7 +208,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
// walk the automaton until a character is rejected.
for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) {
setVisited(state);
- int nextState = runAutomaton.step(state, seekBytesRef.byteAt(pos) & 0xff);
+ int nextState = byteRunnable.step(state, seekBytesRef.byteAt(pos) & 0xff);
if (nextState == -1) break;
savedStates.setIntAt(pos + 1, nextState);
// we found a loop, record it for faster enumeration
@@ -227,8 +229,8 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
return false;
}
final int newState =
- runAutomaton.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff);
- if (newState >= 0 && runAutomaton.isAccept(newState)) {
+ byteRunnable.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff);
+ if (newState >= 0 && byteRunnable.isAccept(newState)) {
/* String is good to go as-is */
return true;
}
@@ -274,12 +276,12 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
seekBytesRef.setLength(position);
setVisited(state);
- final int numTransitions = automaton.getNumTransitions(state);
- automaton.initTransition(state, transition);
+ final int numTransitions = transitionAccessor.getNumTransitions(state);
+ transitionAccessor.initTransition(state, transition);
// find the minimal path (lexicographic order) that is >= c
for (int i = 0; i < numTransitions; i++) {
- automaton.getNextTransition(transition);
+ transitionAccessor.getNextTransition(transition);
if (transition.max >= c) {
int nextChar = Math.max(c, transition.min);
// append either the next sequential char, or the minimum transition
@@ -290,15 +292,15 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
* as long as is possible, continue down the minimal path in
* lexicographic order. if a loop or accept state is encountered, stop.
*/
- while (!isVisited(state) && !runAutomaton.isAccept(state)) {
+ while (!isVisited(state) && !byteRunnable.isAccept(state)) {
setVisited(state);
/*
* Note: we work with a DFA with no transitions to dead states.
* so the below is ok, if it is not an accept state,
* then there MUST be at least one transition.
*/
- automaton.initTransition(state, transition);
- automaton.getNextTransition(transition);
+ transitionAccessor.initTransition(state, transition);
+ transitionAccessor.getNextTransition(transition);
state = transition.dest;
// append the minimum transition
diff --git a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
index 327960f..b3f056c 100644
--- a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
@@ -61,7 +61,6 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
* @param term Term containing field and possibly some pattern structure. The term text is
* ignored.
* @param automaton Automaton to run, terms that are accepted are considered a match.
- * @throws IllegalArgumentException if automaton is not determinized
*/
public AutomatonQuery(final Term term, Automaton automaton) {
this(term, automaton, false);
@@ -75,7 +74,6 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
* @param automaton Automaton to run, terms that are accepted are considered a match.
* @param isBinary if true, this automaton is already binary and will not go through the
* UTF32ToUTF8 conversion
- * @throws IllegalArgumentException if automaton is not determinized
*/
public AutomatonQuery(final Term term, Automaton automaton, boolean isBinary) {
super(term.field());
@@ -143,6 +141,10 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
return automaton;
}
+ public CompiledAutomaton getCompiled() {
+ return compiled;
+ }
+
/** Is this a binary (byte) oriented automaton. See the constructor. */
public boolean isAutomatonBinary() {
return automatonIsBinary;
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
index f461046..8c0ae65 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
@@ -46,7 +46,7 @@ import org.apache.lucene.util.Sorter;
*
* @lucene.experimental
*/
-public class Automaton implements Accountable {
+public class Automaton implements Accountable, TransitionAccessor {
/**
* Where we next write to the int[] states; this increments by 2 for each added state because we
@@ -340,7 +340,7 @@ public class Automaton implements Accountable {
return nextTransition / 3;
}
- /** How many transitions this state has. */
+ @Override
public int getNumTransitions(int state) {
assert state >= 0;
int count = states[2 * state + 1];
@@ -475,11 +475,7 @@ public class Automaton implements Accountable {
}
};
- /**
- * Initialize the provided Transition to iterate through all transitions leaving the specified
- * state. You must call {@link #getNextTransition} to get each transition. Returns the number of
- * transitions leaving this state.
- */
+ @Override
public int initTransition(int state, Transition t) {
assert state < nextState / 2 : "state=" + state + " nextState=" + nextState;
t.source = state;
@@ -487,7 +483,7 @@ public class Automaton implements Accountable {
return getNumTransitions(state);
}
- /** Iterate to the next transition after the provided one */
+ @Override
public void getNextTransition(Transition t) {
// Make sure there is still a transition left:
assert (t.transitionUpto + 3 - states[2 * t.source]) <= 3 * states[2 * t.source + 1];
@@ -535,9 +531,7 @@ public class Automaton implements Accountable {
return false;
}
- /**
- * Fill the provided {@link Transition} with the index'th transition leaving the specified state.
- */
+ @Override
public void getTransition(int state, int index, Transition t) {
int i = states[2 * state] + 3 * index;
t.source = state;
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
index 5f65445..d26dcd5 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
@@ -17,7 +17,7 @@
package org.apache.lucene.util.automaton;
/** Automaton representation for matching UTF-8 byte[]. */
-public class ByteRunAutomaton extends RunAutomaton {
+public class ByteRunAutomaton extends RunAutomaton implements ByteRunnable {
/**
* Converts incoming automaton to byte-based (UTF32ToUTF8) first
@@ -44,15 +44,4 @@ public class ByteRunAutomaton extends RunAutomaton {
// we checked the input is a DFA, according to mike this determinization is contained :)
return Operations.determinize(new UTF32ToUTF8().convert(a), Integer.MAX_VALUE);
}
-
- /** Returns true if the given byte array is accepted by this automaton */
- public boolean run(byte[] s, int offset, int length) {
- int p = 0;
- int l = offset + length;
- for (int i = offset; i < l; i++) {
- p = step(p, s[i] & 0xFF);
- if (p == -1) return false;
- }
- return accept.get(p);
- }
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunnable.java
similarity index 53%
copy from lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
copy to lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunnable.java
index 5f65445..e355b65 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunnable.java
@@ -16,43 +16,43 @@
*/
package org.apache.lucene.util.automaton;
-/** Automaton representation for matching UTF-8 byte[]. */
-public class ByteRunAutomaton extends RunAutomaton {
+/** A runnable automaton accepting byte array as input */
+public interface ByteRunnable {
/**
- * Converts incoming automaton to byte-based (UTF32ToUTF8) first
+ * Returns the state obtained by reading the given char from the given state. Returns -1 if not
+ * obtaining any such state.
*
- * @throws IllegalArgumentException if the automaton is not deterministic
+ * @param state the last state
+ * @param c the input codepoint
+ * @return the next state, -1 if no such transaction
*/
- public ByteRunAutomaton(Automaton a) {
- this(a, false);
- }
+ int step(int state, int c);
/**
- * expert: if isBinary is true, the input is already byte-based
+ * Returns acceptance status for given state.
*
- * @throws IllegalArgumentException if the automaton is not deterministic
+ * @param state the state
+ * @return whether the state is accepted
*/
- public ByteRunAutomaton(Automaton a, boolean isBinary) {
- super(isBinary ? a : convert(a), 256);
- }
+ boolean isAccept(int state);
- static Automaton convert(Automaton a) {
- if (!a.isDeterministic()) {
- throw new IllegalArgumentException("Automaton must be deterministic");
- }
- // we checked the input is a DFA, according to mike this determinization is contained :)
- return Operations.determinize(new UTF32ToUTF8().convert(a), Integer.MAX_VALUE);
- }
+ /**
+ * Returns number of states this automaton has, note this may not be an accurate number in case of
+ * NFA
+ *
+ * @return number of states
+ */
+ int getSize();
/** Returns true if the given byte array is accepted by this automaton */
- public boolean run(byte[] s, int offset, int length) {
+ default boolean run(byte[] s, int offset, int length) {
int p = 0;
int l = offset + length;
for (int i = offset; i < l; i++) {
p = step(p, s[i] & 0xFF);
if (p == -1) return false;
}
- return accept.get(p);
+ return isAccept(p);
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
index 1627bb6..c842271 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
@@ -19,6 +19,7 @@ package org.apache.lucene.util.automaton;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.Objects;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
@@ -34,8 +35,10 @@ import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
/**
- * Immutable class holding compiled details for a given Automaton. The Automaton must be
- * deterministic, must not have dead states but is not necessarily minimal.
+ * Immutable class holding compiled details for a given Automaton. The Automaton could either be
+ * deterministic or non-deterministic, For deterministic automaton, it must not have dead states but
+ * is not necessarily minimal. And will be executed using {@link ByteRunAutomaton} For
+ * non-deterministic automaton, it will be executed using {@link NFARunAutomaton}
*
* @lucene.experimental
*/
@@ -78,6 +81,14 @@ public class CompiledAutomaton implements Accountable {
public final Automaton automaton;
/**
+ * Matcher directly run on a NFA, it will determinize the state on need and caches it, note that
+ * this field and {@link #runAutomaton} will not be non-null at the same time
+ *
+ * <p>TODO: merge this with runAutomaton
+ */
+ final NFARunAutomaton nfaRunAutomaton;
+
+ /**
* Shared common suffix accepted by the automaton. Only valid for {@link AUTOMATON_TYPE#NORMAL},
* and only when the automaton accepts an infinite language. This will be null if the common
* prefix is length 0.
@@ -131,8 +142,6 @@ public class CompiledAutomaton implements Accountable {
* Create this. If finite is null, we use {@link Operations#isFinite} to determine whether it is
* finite. If simplify is true, we run possibly expensive operations to determine if the automaton
* is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}.
- *
- * @throws IllegalArgumentException if the automaton is not deterministic
*/
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
this(automaton, finite, simplify, false);
@@ -142,22 +151,17 @@ public class CompiledAutomaton implements Accountable {
* Create this. If finite is null, we use {@link Operations#isFinite} to determine whether it is
* finite. If simplify is true, we run possibly expensive operations to determine if the automaton
* is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}.
- *
- * @throws IllegalArgumentException if the automaton is not deterministic
*/
public CompiledAutomaton(
Automaton automaton, Boolean finite, boolean simplify, boolean isBinary) {
- // require DFA for input, as this class currently can't handle NFAs.
- if (!automaton.isDeterministic()) {
- throw new IllegalArgumentException("Automaton must be deterministic");
- }
if (automaton.getNumStates() == 0) {
automaton = new Automaton();
automaton.createState();
}
- if (simplify) {
+ // simplify requires a DFA
+ if (simplify && automaton.isDeterministic()) {
// Test whether the automaton is a "simple" form and
// if so, don't create a runAutomaton. Note that on a
@@ -172,6 +176,7 @@ public class CompiledAutomaton implements Accountable {
this.automaton = null;
this.finite = null;
sinkState = -1;
+ nfaRunAutomaton = null;
return;
}
@@ -193,6 +198,7 @@ public class CompiledAutomaton implements Accountable {
this.automaton = null;
this.finite = null;
sinkState = -1;
+ nfaRunAutomaton = null;
return;
}
@@ -214,6 +220,7 @@ public class CompiledAutomaton implements Accountable {
UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length));
}
sinkState = -1;
+ nfaRunAutomaton = null;
return;
}
}
@@ -251,16 +258,24 @@ public class CompiledAutomaton implements Accountable {
}
}
- // We already had a DFA (or threw exception), according to mike UTF32toUTF8 won't "blow up"
- binary = Operations.determinize(binary, Integer.MAX_VALUE);
- runAutomaton = new ByteRunAutomaton(binary, true);
+ if (automaton.isDeterministic() == false && binary.isDeterministic() == false) {
+ this.automaton = null;
+ this.runAutomaton = null;
+ this.sinkState = -1;
+ this.nfaRunAutomaton = new NFARunAutomaton(binary, 0xff);
+ } else {
+ // We already had a DFA (or threw exception), according to mike UTF32toUTF8 won't "blow up"
+ binary = Operations.determinize(binary, Integer.MAX_VALUE);
+ runAutomaton = new ByteRunAutomaton(binary, true);
- this.automaton = runAutomaton.automaton;
+ this.automaton = runAutomaton.automaton;
- // TODO: this is a bit fragile because if the automaton is not minimized there could be more
- // than 1 sink state but auto-prefix will fail
- // to run for those:
- sinkState = findSinkState(this.automaton);
+ // TODO: this is a bit fragile because if the automaton is not minimized there could be more
+ // than 1 sink state but auto-prefix will fail
+ // to run for those:
+ sinkState = findSinkState(this.automaton);
+ nfaRunAutomaton = null;
+ }
}
private Transition transition = new Transition();
@@ -472,6 +487,32 @@ public class CompiledAutomaton implements Accountable {
}
}
+ /**
+ * Get a {@link ByteRunnable} instance, it will be different depending on whether a NFA or DFA is
+ * passed in, and does not guarantee returning non-null object
+ */
+ public ByteRunnable getByteRunnable() {
+ // they can be both null but not both non-null
+ assert nfaRunAutomaton == null || runAutomaton == null;
+ if (nfaRunAutomaton == null) {
+ return runAutomaton;
+ }
+ return nfaRunAutomaton;
+ }
+
+ /**
+ * Get a {@link TransitionAccessor} instance, it will be different depending on whether a NFA or
+ * DFA is passed in, and does not guarantee returning non-null object
+ */
+ public TransitionAccessor getTransitionAccessor() {
+ // they can be both null but not both non-null
+ assert nfaRunAutomaton == null || automaton == null;
+ if (nfaRunAutomaton == null) {
+ return automaton;
+ }
+ return nfaRunAutomaton;
+ }
+
@Override
public int hashCode() {
final int prime = 31;
@@ -492,7 +533,8 @@ public class CompiledAutomaton implements Accountable {
if (type == AUTOMATON_TYPE.SINGLE) {
if (!term.equals(other.term)) return false;
} else if (type == AUTOMATON_TYPE.NORMAL) {
- if (!runAutomaton.equals(other.runAutomaton)) return false;
+ return Objects.equals(runAutomaton, other.runAutomaton)
+ && Objects.equals(nfaRunAutomaton, other.nfaRunAutomaton);
}
return true;
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
new file mode 100644
index 0000000..0957715
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
@@ -0,0 +1,429 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA. It will lazily determinize on-demand, memorizing the
+ * generated DFA states that has been explored
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton implements ByteRunnable, TransitionAccessor {
+
+ /** state ordinal of "no such state" */
+ public static final int MISSING = -1;
+
+ private static final int NOT_COMPUTED = -2;
+
+ private final Automaton automaton;
+ private final int[] points;
+ private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+ private DState[] dStates;
+ private final int alphabetSize;
+ final int[] classmap; // map from char number to class
+
+ private final Operations.PointTransitionSet transitionSet =
+ new Operations.PointTransitionSet(); // reusable
+ private final StateSet statesSet = new StateSet(5); // reusable
+
+ /**
+ * Constructor, assuming alphabet size is the whole Unicode code point space
+ *
+ * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+ * better efficiency
+ */
+ public NFARunAutomaton(Automaton automaton) {
+ this(automaton, Character.MAX_CODE_POINT);
+ }
+
+ /**
+ * Constructor
+ *
+ * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+ * for better efficiency
+ * @param alphabetSize alphabet size
+ */
+ public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+ this.automaton = automaton;
+ points = automaton.getStartPoints();
+ this.alphabetSize = alphabetSize;
+ dStates = new DState[10];
+ findDState(new DState(new int[] {0}));
+
+ /*
+ * Set alphabet table for optimal run performance.
+ */
+ classmap = new int[Math.min(256, alphabetSize)];
+ int i = 0;
+ for (int j = 0; j < classmap.length; j++) {
+ if (i + 1 < points.length && j == points[i + 1]) {
+ i++;
+ }
+ classmap[j] = i;
+ }
+ }
+
+ /**
+ * For a given state and an incoming character (codepoint), return the next state
+ *
+ * @param state incoming state, should either be 0 or some state that is returned previously by
+ * this function
+ * @param c codepoint
+ * @return the next state or {@link #MISSING} if the transition doesn't exist
+ */
+ @Override
+ public int step(int state, int c) {
+ assert dStates[state] != null;
+ return step(dStates[state], c);
+ }
+
+ @Override
+ public boolean isAccept(int state) {
+ assert dStates[state] != null;
+ return dStates[state].isAccept;
+ }
+
+ @Override
+ public int getSize() {
+ return dStates.length;
+ }
+
+ /**
+ * Run through a given codepoint array, return accepted or not, should only be used in test
+ *
+ * @param s String represented by an int array
+ * @return accept or not
+ */
+ boolean run(int[] s) {
+ int p = 0;
+ for (int c : s) {
+ p = step(p, c);
+ if (p == MISSING) return false;
+ }
+ return dStates[p].isAccept;
+ }
+
+ /**
+ * From an existing DFA state, step to next DFA state given character c if the transition is
+ * previously tried then this operation will just use the cached result, otherwise it will call
+ * {@link DState#step(int)} to get the next state and cache the result
+ */
+ private int step(DState dState, int c) {
+ int charClass = getCharClass(c);
+ return dState.nextState(charClass);
+ }
+
+ /**
+ * return the ordinal of given DFA state, generate a new ordinal if the given DFA state is a new
+ * one
+ */
+ private int findDState(DState dState) {
+ if (dState == null) {
+ return MISSING;
+ }
+ int ord = dStateToOrd.getOrDefault(dState, -1);
+ if (ord >= 0) {
+ return ord;
+ }
+ ord = dStateToOrd.size();
+ dStateToOrd.put(dState, ord);
+ assert ord >= dStates.length || dStates[ord] == null;
+ if (ord >= dStates.length) {
+ dStates = ArrayUtil.grow(dStates, ord + 1);
+ }
+ dStates[ord] = dState;
+ return ord;
+ }
+
+ /** Gets character class of given codepoint */
+ final int getCharClass(int c) {
+ assert c < alphabetSize;
+
+ if (c < classmap.length) {
+ return classmap[c];
+ }
+
+ // binary search
+ int a = 0;
+ int b = points.length;
+ while (b - a > 1) {
+ int d = (a + b) >>> 1;
+ if (points[d] > c) b = d;
+ else if (points[d] < c) a = d;
+ else return d;
+ }
+ return a;
+ }
+
+ @Override
+ public int initTransition(int state, Transition t) {
+ t.source = state;
+ t.transitionUpto = -1;
+ return getNumTransitions(state);
+ }
+
+ @Override
+ public void getNextTransition(Transition t) {
+ assert t.transitionUpto < points.length - 1 && t.transitionUpto >= -1;
+ while (dStates[t.source].transitions[++t.transitionUpto] == MISSING) {
+ // this shouldn't throw AIOOBE as long as this function is only called
+ // numTransitions times
+ }
+ assert dStates[t.source].transitions[t.transitionUpto] != NOT_COMPUTED;
+ t.dest = dStates[t.source].transitions[t.transitionUpto];
+
+ t.min = points[t.transitionUpto];
+ if (t.transitionUpto == points.length - 1) {
+ t.max = alphabetSize - 1;
+ } else {
+ t.max = points[t.transitionUpto + 1] - 1;
+ }
+ }
+
+ @Override
+ public int getNumTransitions(int state) {
+ dStates[state].determinize();
+ return dStates[state].outgoingTransitions;
+ }
+
+ @Override
+ public void getTransition(int state, int index, Transition t) {
+ dStates[state].determinize();
+ int outgoingTransitions = -1;
+ t.transitionUpto = -1;
+ t.source = state;
+ while (outgoingTransitions < index && t.transitionUpto < points.length - 1) {
+ if (dStates[t.source].transitions[++t.transitionUpto] != MISSING) {
+ outgoingTransitions++;
+ }
+ }
+ assert outgoingTransitions == index;
+
+ t.min = points[t.transitionUpto];
+ if (t.transitionUpto == points.length - 1) {
+ t.max = alphabetSize - 1;
+ } else {
+ t.max = points[t.transitionUpto + 1] - 1;
+ }
+ }
+
+ private class DState {
+ private final int[] nfaStates;
+ // this field is lazily init'd when first time caller wants to add a new transition
+ private int[] transitions;
+ private final int hashCode;
+ private final boolean isAccept;
+ private final Transition stepTransition = new Transition();
+ private Transition minimalTransition;
+ private int computedTransitions;
+ private int outgoingTransitions;
+
+ private DState(int[] nfaStates) {
+ assert nfaStates != null && nfaStates.length > 0;
+ this.nfaStates = nfaStates;
+ int hashCode = nfaStates.length;
+ boolean isAccept = false;
+ for (int s : nfaStates) {
+ hashCode += BitMixer.mix(s);
+ if (automaton.isAccept(s)) {
+ isAccept = true;
+ }
+ }
+ this.isAccept = isAccept;
+ this.hashCode = hashCode;
+ }
+
+ private int nextState(int charClass) {
+ initTransitions();
+ assert charClass < transitions.length;
+ if (transitions[charClass] == NOT_COMPUTED) {
+ assignTransition(charClass, findDState(step(points[charClass])));
+ // we could potentially update more than one char classes
+ if (minimalTransition != null) {
+ // to the left
+ int cls = charClass;
+ while (cls > 0 && points[--cls] >= minimalTransition.min) {
+ assert transitions[cls] == NOT_COMPUTED || transitions[cls] == transitions[charClass];
+ assignTransition(cls, transitions[charClass]);
+ }
+ // to the right
+ cls = charClass;
+ while (cls < points.length - 1 && points[++cls] <= minimalTransition.max) {
+ assert transitions[cls] == NOT_COMPUTED || transitions[cls] == transitions[charClass];
+ assignTransition(cls, transitions[charClass]);
+ }
+ minimalTransition = null;
+ }
+ }
+ return transitions[charClass];
+ }
+
+ private void assignTransition(int charClass, int dest) {
+ if (transitions[charClass] == NOT_COMPUTED) {
+ computedTransitions++;
+ transitions[charClass] = dest;
+ if (transitions[charClass] != MISSING) {
+ outgoingTransitions++;
+ }
+ }
+ }
+
+ /**
+ * given a list of NFA states and a character c, compute the output list of NFA state which is
+ * wrapped as a DFA state
+ */
+ private DState step(int c) {
+ statesSet.reset(); // TODO: fork IntHashSet from hppc instead?
+ int numTransitions;
+ int left = -1, right = alphabetSize;
+ for (int nfaState : nfaStates) {
+ numTransitions = automaton.initTransition(nfaState, stepTransition);
+ // TODO: binary search should be faster, since transitions are sorted
+ for (int i = 0; i < numTransitions; i++) {
+ automaton.getNextTransition(stepTransition);
+ if (stepTransition.min <= c && stepTransition.max >= c) {
+ statesSet.incr(stepTransition.dest);
+ left = Math.max(stepTransition.min, left);
+ right = Math.min(stepTransition.max, right);
+ }
+ if (stepTransition.max < c) {
+ left = Math.max(stepTransition.max + 1, left);
+ }
+ if (stepTransition.min > c) {
+ right = Math.min(stepTransition.min - 1, right);
+ // transitions in automaton are sorted
+ break;
+ }
+ }
+ }
+ if (statesSet.size() == 0) {
+ return null;
+ }
+ minimalTransition = new Transition();
+ minimalTransition.min = left;
+ minimalTransition.max = right;
+ return new DState(statesSet.getArray());
+ }
+
+ // determinize this state only
+ private void determinize() {
+ if (transitions != null && computedTransitions == transitions.length) {
+ // already determinized
+ return;
+ }
+ initTransitions();
+ // Mostly forked from Operations.determinize
+ transitionSet.reset();
+ for (int nfaState : nfaStates) {
+ int numTransitions = automaton.initTransition(nfaState, stepTransition);
+ for (int i = 0; i < numTransitions; i++) {
+ automaton.getNextTransition(stepTransition);
+ transitionSet.add(stepTransition);
+ }
+ }
+ if (transitionSet.count == 0) {
+ // no outgoing transitions
+ Arrays.fill(transitions, MISSING);
+ computedTransitions = transitions.length;
+ return;
+ }
+
+ transitionSet
+ .sort(); // TODO: could use a PQ (heap) instead, since transitions for each state are
+ // sorted
+ statesSet.reset();
+ int lastPoint = -1;
+ int charClass = 0;
+ for (int i = 0; i < transitionSet.count; i++) {
+ final int point = transitionSet.points[i].point;
+ if (statesSet.size() > 0) {
+ assert lastPoint != -1;
+ int ord = findDState(new DState(statesSet.getArray()));
+ while (points[charClass] < lastPoint) {
+ assignTransition(charClass++, MISSING);
+ }
+ assert points[charClass] == lastPoint;
+ while (charClass < points.length && points[charClass] < point) {
+ assert transitions[charClass] == NOT_COMPUTED || transitions[charClass] == ord;
+ assignTransition(charClass++, ord);
+ }
+ assert (charClass == points.length && point == alphabetSize)
+ || points[charClass] == point;
+ }
+
+ // process transitions that end on this point
+ // (closes an overlapping interval)
+ int[] transitions = transitionSet.points[i].ends.transitions;
+ int limit = transitionSet.points[i].ends.next;
+ for (int j = 0; j < limit; j += 3) {
+ int dest = transitions[j];
+ statesSet.decr(dest);
+ }
+ transitionSet.points[i].ends.next = 0;
+
+ // process transitions that start on this point
+ // (opens a new interval)
+ transitions = transitionSet.points[i].starts.transitions;
+ limit = transitionSet.points[i].starts.next;
+ for (int j = 0; j < limit; j += 3) {
+ int dest = transitions[j];
+ statesSet.incr(dest);
+ }
+
+ lastPoint = point;
+ transitionSet.points[i].starts.next = 0;
+ }
+ assert statesSet.size() == 0;
+ assert computedTransitions
+ >= charClass; // it's also possible that some transitions after the charClass has already
+ // been explored
+ // no more outgoing transitions, set rest of transition to MISSING
+ assert charClass == transitions.length
+ || transitions[charClass] == MISSING
+ || transitions[charClass] == NOT_COMPUTED;
+ Arrays.fill(transitions, charClass, transitions.length, MISSING);
+ computedTransitions = transitions.length;
+ }
+
+ private void initTransitions() {
+ if (transitions == null) {
+ transitions = new int[points.length];
+ Arrays.fill(transitions, NOT_COMPUTED);
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return hashCode;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ DState dState = (DState) o;
+ return hashCode == dState.hashCode && Arrays.equals(nfaStates, dState.nfaStates);
+ }
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
index 1b54b57..08bcfa6 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
@@ -526,7 +526,7 @@ public final class Operations {
}
// Simple custom ArrayList<Transition>
- private static final class TransitionList {
+ static final class TransitionList {
// dest, min, max
int[] transitions = new int[3];
int next;
@@ -544,7 +544,7 @@ public final class Operations {
// Holds all transitions that start on this int point, or
// end at this point-1
- private static final class PointTransitions implements Comparable<PointTransitions> {
+ static final class PointTransitions implements Comparable<PointTransitions> {
int point;
final TransitionList ends = new TransitionList();
final TransitionList starts = new TransitionList();
@@ -571,7 +571,7 @@ public final class Operations {
}
}
- private static final class PointTransitionSet {
+ static final class PointTransitionSet {
int count;
PointTransitions[] points = new PointTransitions[5];
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
index dea87b9..06a674c 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
@@ -130,7 +130,12 @@ public abstract class RunAutomaton implements Accountable {
return size;
}
- /** Returns acceptance status for given state. */
+ /**
+ * Returns acceptance status for given state.
+ *
+ * @param state the state
+ * @return whether the state is accepted
+ */
public final boolean isAccept(int state) {
return accept.get(state);
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java
index 6d4a7ea..985ed65 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java
@@ -66,6 +66,11 @@ final class StateSet extends IntSet {
}
}
+ void reset() {
+ inner.clear();
+ keyChanged();
+ }
+
/**
* Create a snapshot of this int set associated with a given state. The snapshot will not retain
* any frequency information about the elements of this set, only existence.
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/TransitionAccessor.java b/lucene/core/src/java/org/apache/lucene/util/automaton/TransitionAccessor.java
new file mode 100644
index 0000000..33a3d5e
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/TransitionAccessor.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.automaton;
+
+/** Interface accessing the transitions of an automaton */
+public interface TransitionAccessor {
+
+ /**
+ * Initialize the provided Transition to iterate through all transitions leaving the specified
+ * state. You must call {@link #getNextTransition} to get each transition. Returns the number of
+ * transitions leaving this state.
+ */
+ int initTransition(int state, Transition t);
+
+ /** Iterate to the next transition after the provided one */
+ void getNextTransition(Transition t);
+
+ /** How many transitions this state has. */
+ int getNumTransitions(int state);
+
+ /**
+ * Fill the provided {@link Transition} with the index'th transition leaving the specified state.
+ */
+ void getTransition(int state, int index, Transition t);
+}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
index 71ed126..706f39e 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
@@ -127,18 +127,6 @@ public class TestAutomatonQuery extends LuceneTestCase {
DEFAULT_DETERMINIZE_WORK_LIMIT));
}
- /** Test that a nondeterministic automaton fails, it should throw Exception */
- public void testNFA() throws IOException {
- // accept this or three, the union is an NFA (two transitions for 't' from
- // initial state)
- Automaton nfa = Operations.union(Automata.makeString("this"), Automata.makeString("three"));
- expectThrows(
- IllegalArgumentException.class,
- () -> {
- assertAutomatonHits(2, nfa);
- });
- }
-
public void testEquals() {
AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), Automata.makeString("foobar"));
// reference to a1
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java
index ffd6edd..d45d602 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java
@@ -53,6 +53,7 @@ public final class MinimizationOperations {
* what to specify.
*/
public static Automaton minimize(Automaton a, int determinizeWorkLimit) {
+
if (a.getNumStates() == 0 || (a.isAccept(0) == false && a.getNumTransitions(0) == 0)) {
// Fastmatch for common case
return new Automaton();
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java
new file mode 100644
index 0000000..6b4c480
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.AutomatonQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestNFARunAutomaton extends LuceneTestCase {
+
+ private static final String FIELD = "field";
+
+ @SuppressWarnings("unused")
+ public void testWithRandomRegex() {
+ for (int i = 0; i < 100; i++) {
+ RegExp regExp = new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE);
+ ;
+ Automaton nfa = regExp.toAutomaton();
+ if (nfa.isDeterministic()) {
+ i--;
+ continue;
+ }
+ Automaton dfa = Operations.determinize(nfa, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+ NFARunAutomaton candidate = new NFARunAutomaton(nfa);
+ AutomatonTestUtil.RandomAcceptedStrings randomStringGen;
+ try {
+ randomStringGen = new AutomatonTestUtil.RandomAcceptedStrings(dfa);
+ } catch (IllegalArgumentException e) {
+ i--;
+ continue; // sometimes the automaton accept nothing and throw this exception
+ }
+
+ for (int round = 0; round < 20; round++) {
+ // test order of accepted strings and random (likely rejected) strings alternatively to make
+ // sure caching system works correctly
+ if (random().nextBoolean()) {
+ testAcceptedString(regExp, randomStringGen, candidate, 10);
+ testRandomString(regExp, dfa, candidate, 10);
+ } else {
+ testRandomString(regExp, dfa, candidate, 10);
+ testAcceptedString(regExp, randomStringGen, candidate, 10);
+ }
+ }
+ }
+ }
+
+ public void testRandomAutomatonQuery() throws IOException {
+ final int docNum = 50;
+ final int automatonNum = 50;
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+
+ Set<String> vocab = new HashSet<>();
+ Set<String> perDocVocab = new HashSet<>();
+ for (int i = 0; i < docNum; i++) {
+ perDocVocab.clear();
+ int termNum = random().nextInt(20) + 30;
+ while (perDocVocab.size() < termNum) {
+ String randomString;
+ while ((randomString = TestUtil.randomUnicodeString(random())).length() == 0)
+ ;
+ perDocVocab.add(randomString);
+ vocab.add(randomString);
+ }
+ Document document = new Document();
+ document.add(
+ newTextField(
+ FIELD, perDocVocab.stream().reduce("", (s1, s2) -> s1 + " " + s2), Field.Store.NO));
+ writer.addDocument(document);
+ }
+ writer.commit();
+ IndexReader reader = DirectoryReader.open(directory);
+ IndexSearcher searcher = new IndexSearcher(reader);
+
+ Set<String> foreignVocab = new HashSet<>();
+ while (foreignVocab.size() < vocab.size()) {
+ String randomString;
+ while ((randomString = TestUtil.randomUnicodeString(random())).length() == 0)
+ ;
+ foreignVocab.add(randomString);
+ }
+
+ ArrayList<String> vocabList = new ArrayList<>(vocab);
+ ArrayList<String> foreignVocabList = new ArrayList<>(foreignVocab);
+
+ Set<String> perQueryVocab = new HashSet<>();
+
+ for (int i = 0; i < automatonNum; i++) {
+ perQueryVocab.clear();
+ int termNum = random().nextInt(40) + 30;
+ while (perQueryVocab.size() < termNum) {
+ if (random().nextBoolean()) {
+ perQueryVocab.add(vocabList.get(random().nextInt(vocabList.size())));
+ } else {
+ perQueryVocab.add(foreignVocabList.get(random().nextInt(foreignVocabList.size())));
+ }
+ }
+ Automaton a = null;
+ for (String term : perQueryVocab) {
+ if (a == null) {
+ a = Automata.makeString(term);
+ } else {
+ a = Operations.union(a, Automata.makeString(term));
+ }
+ }
+ if (a.isDeterministic()) {
+ i--;
+ continue;
+ }
+ AutomatonQuery dfaQuery =
+ new AutomatonQuery(
+ new Term(FIELD),
+ Operations.determinize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT));
+ AutomatonQuery nfaQuery = new AutomatonQuery(new Term(FIELD), a);
+ assertNotNull(nfaQuery.getCompiled().nfaRunAutomaton);
+ assertEquals(searcher.count(dfaQuery), searcher.count(nfaQuery));
+ }
+ reader.close();
+ writer.close();
+ directory.close();
+ }
+
+ private void testAcceptedString(
+ RegExp regExp,
+ AutomatonTestUtil.RandomAcceptedStrings randomStringGen,
+ NFARunAutomaton candidate,
+ int repeat) {
+ for (int n = 0; n < repeat; n++) {
+ int[] acceptedString = randomStringGen.getRandomAcceptedString(random());
+ assertTrue(
+ "regExp: " + regExp + " testString: " + Arrays.toString(acceptedString),
+ candidate.run(acceptedString));
+ }
+ }
+
+ private void testRandomString(
+ RegExp regExp, Automaton dfa, NFARunAutomaton candidate, int repeat) {
+ for (int n = 0; n < repeat; n++) {
+ int[] randomString =
+ random().ints(random().nextInt(50), 0, Character.MAX_CODE_POINT).toArray();
+ assertEquals(
+ "regExp: " + regExp + " testString: " + Arrays.toString(randomString),
+ Operations.run(dfa, new IntsRef(randomString, 0, randomString.length)),
+ candidate.run(randomString));
+ }
+ }
+}