You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@lucene.apache.org by GitBox <gi...@apache.org> on 2021/07/27 16:42:41 UTC

[GitHub] [lucene] mikemccand commented on a change in pull request #225: LUCENE-10010 Introduce NFARunAutomaton to run NFA directly

mikemccand commented on a change in pull request #225:
URL: https://github.com/apache/lucene/pull/225#discussion_r677506600



##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space

Review comment:
       Maybe say `whole Unicode code point space`?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {
+        automaton.getNextTransition(transition);
+        if (transition.min <= c && transition.max >= c) {
+          stateSet.incr(transition.dest);
+        }
+      }
+    }
+    if (stateSet.size() == 0) {
+      return null;
+    }
+    return new DState(stateSet.getArray());
+  }
+
+  /**
+   * return the ordinal of given DFA state, generate a new ordinal if the given DFA state is a new
+   * one
+   */
+  private int findDState(DState dState) {
+    if (dState == null) {
+      return MISSING;
+    }
+    int ord = dStateToOrd.getOrDefault(dState, -1);
+    if (ord >= 0) {
+      return ord;
+    }
+    ord = dStateToOrd.size();
+    dStateToOrd.put(dState, ord);
+    assert ord >= dStates.length || dStates[ord] == null;
+    if (ord >= dStates.length) {
+      dStates = ArrayUtil.grow(dStates, ord + 1);
+    }
+    dStates[ord] = dState;
+    return ord;
+  }
+
+  /** Gets character class of given codepoint */
+  final int getCharClass(int c) {
+    assert c < alphabetSize;
+    // binary search
+    int a = 0;
+    int b = points.length;
+    while (b - a > 1) {
+      int d = (a + b) >>> 1;
+      if (points[d] > c) b = d;
+      else if (points[d] < c) a = d;
+      else return d;
+    }
+    return a;
+  }
+
+  private class DState {
+    private final int[] nfaStates;
+    private int[] transitions;
+    private final int hashCode;
+    private final boolean isAccept;
+
+    private DState(int[] nfaStates) {
+      assert nfaStates != null && nfaStates.length > 0;
+      this.nfaStates = nfaStates;
+      int hashCode = nfaStates.length;
+      boolean isAccept = false;
+      for (int s : nfaStates) {
+        hashCode += BitMixer.mix(s);
+        if (automaton.isAccept(s)) {
+          isAccept = true;
+        }
+      }
+      this.isAccept = isAccept;
+      this.hashCode = hashCode;
+    }
+
+    private int nextState(int charClass) {
+      initTransitions();
+      assert charClass < transitions.length;
+      return transitions[charClass];
+    }
+
+    private void setNextState(int charClass, int nextState) {
+      initTransitions();
+      assert charClass < transitions.length;
+      transitions[charClass] = nextState;
+    }
+
+    private void initTransitions() {
+      if (transitions == null) {
+        transitions = new int[points.length];

Review comment:
       Hmm, `points` is global across the whole `Automaton` gathering all unique char classes that were ever seen in the automaton?  So this is overkill, to always allocate this many slots for outgoing transitions from this state?  Though, I guess this matches how `RunAutomaton` works, with its pre-compiled transition lookup table?

##########
File path: lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java
##########
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestNFARunAutomaton extends LuceneTestCase {
+
+  public void testRandom() {
+    for (int i = 0; i < 100; i++) {
+      RegExp regExp = null;
+      while (regExp == null) {
+        try {
+          regExp = new RegExp(AutomatonTestUtil.randomRegexp(random()));
+        } catch (IllegalArgumentException e) {
+          ignoreException(e);

Review comment:
       Hmm did `javac` or one of our `ecj` linters demand that you do this :)  (Instead of leaving a comment explaining why we are ignoring).
   
   And maybe add a comment anyways explaining how `randomRegexp` will sometimes/often result in this exception?  In fact, maybe we should fix `AutomatonTestUtil` to do this "filtering" for us, so that it returns a valid `RegExp`?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
##########
@@ -53,6 +53,11 @@ private MinimizationOperations() {}
    *     what to specify.
    */
   public static Automaton minimize(Automaton a, int determinizeWorkLimit) {
+    // nocommit: probably shouldn't set the logic here

Review comment:
       Hmm why did you need this?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {
+        automaton.getNextTransition(transition);
+        if (transition.min <= c && transition.max >= c) {
+          stateSet.incr(transition.dest);
+        }
+      }
+    }
+    if (stateSet.size() == 0) {
+      return null;
+    }
+    return new DState(stateSet.getArray());
+  }
+
+  /**
+   * return the ordinal of given DFA state, generate a new ordinal if the given DFA state is a new
+   * one
+   */
+  private int findDState(DState dState) {
+    if (dState == null) {
+      return MISSING;
+    }
+    int ord = dStateToOrd.getOrDefault(dState, -1);
+    if (ord >= 0) {
+      return ord;
+    }
+    ord = dStateToOrd.size();
+    dStateToOrd.put(dState, ord);
+    assert ord >= dStates.length || dStates[ord] == null;
+    if (ord >= dStates.length) {
+      dStates = ArrayUtil.grow(dStates, ord + 1);
+    }
+    dStates[ord] = dState;
+    return ord;
+  }
+
+  /** Gets character class of given codepoint */
+  final int getCharClass(int c) {
+    assert c < alphabetSize;
+    // binary search
+    int a = 0;
+    int b = points.length;
+    while (b - a > 1) {
+      int d = (a + b) >>> 1;
+      if (points[d] > c) b = d;
+      else if (points[d] < c) a = d;
+      else return d;
+    }
+    return a;
+  }
+
+  private class DState {
+    private final int[] nfaStates;
+    private int[] transitions;
+    private final int hashCode;
+    private final boolean isAccept;
+
+    private DState(int[] nfaStates) {
+      assert nfaStates != null && nfaStates.length > 0;
+      this.nfaStates = nfaStates;
+      int hashCode = nfaStates.length;
+      boolean isAccept = false;
+      for (int s : nfaStates) {
+        hashCode += BitMixer.mix(s);
+        if (automaton.isAccept(s)) {
+          isAccept = true;
+        }
+      }
+      this.isAccept = isAccept;
+      this.hashCode = hashCode;
+    }
+
+    private int nextState(int charClass) {
+      initTransitions();
+      assert charClass < transitions.length;
+      return transitions[charClass];

Review comment:
       Maybe inside here, if the result is `NOT_COMPUTED`, we should go and lazily compute it, instead of expecting caller to?  And then remove `setNextState` (or make it private)?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {
+        automaton.getNextTransition(transition);
+        if (transition.min <= c && transition.max >= c) {
+          stateSet.incr(transition.dest);
+        }
+      }
+    }
+    if (stateSet.size() == 0) {
+      return null;
+    }
+    return new DState(stateSet.getArray());
+  }
+
+  /**
+   * return the ordinal of given DFA state, generate a new ordinal if the given DFA state is a new
+   * one
+   */
+  private int findDState(DState dState) {
+    if (dState == null) {
+      return MISSING;
+    }
+    int ord = dStateToOrd.getOrDefault(dState, -1);
+    if (ord >= 0) {
+      return ord;
+    }
+    ord = dStateToOrd.size();
+    dStateToOrd.put(dState, ord);
+    assert ord >= dStates.length || dStates[ord] == null;
+    if (ord >= dStates.length) {
+      dStates = ArrayUtil.grow(dStates, ord + 1);
+    }
+    dStates[ord] = dState;
+    return ord;
+  }
+
+  /** Gets character class of given codepoint */
+  final int getCharClass(int c) {
+    assert c < alphabetSize;
+    // binary search
+    int a = 0;
+    int b = points.length;
+    while (b - a > 1) {
+      int d = (a + b) >>> 1;
+      if (points[d] > c) b = d;
+      else if (points[d] < c) a = d;
+      else return d;
+    }
+    return a;
+  }
+
+  private class DState {

Review comment:
       Does this represent the same thing as `FrozenIntSet`?  I.e. it is a powerset, a subset of the NFA states that logically represent one state in the DFA?  But we couldn't re-use the `FrozenIntSet`?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {
+        automaton.getNextTransition(transition);
+        if (transition.min <= c && transition.max >= c) {
+          stateSet.incr(transition.dest);
+        }
+      }
+    }
+    if (stateSet.size() == 0) {
+      return null;
+    }
+    return new DState(stateSet.getArray());
+  }
+
+  /**
+   * return the ordinal of given DFA state, generate a new ordinal if the given DFA state is a new
+   * one
+   */
+  private int findDState(DState dState) {
+    if (dState == null) {
+      return MISSING;
+    }
+    int ord = dStateToOrd.getOrDefault(dState, -1);
+    if (ord >= 0) {
+      return ord;
+    }
+    ord = dStateToOrd.size();
+    dStateToOrd.put(dState, ord);
+    assert ord >= dStates.length || dStates[ord] == null;
+    if (ord >= dStates.length) {
+      dStates = ArrayUtil.grow(dStates, ord + 1);
+    }
+    dStates[ord] = dState;
+    return ord;
+  }
+
+  /** Gets character class of given codepoint */
+  final int getCharClass(int c) {

Review comment:
       Can we somehow use the version in `RunAutomaton` directly?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {
+        automaton.getNextTransition(transition);
+        if (transition.min <= c && transition.max >= c) {
+          stateSet.incr(transition.dest);
+        }
+      }
+    }
+    if (stateSet.size() == 0) {
+      return null;
+    }
+    return new DState(stateSet.getArray());

Review comment:
       We can't just use `StateSet.freeze` here?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {

Review comment:
       Could we share this with `RunAutomaton`?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {

Review comment:
       This is perhaps quite costly in some cases?  We are having to step through all transitions for every NFA state in this `DState`, scanning to find ones that include the character we are building a transition for.  But I don't see any better way -- `Automaton` doesn't have any inverted index to map a transition character to the matching transitions for a given state.  And in practice this is likely fine: most NFA states do not have so many outgoing transitions. Though the are surely adversaries ...

##########
File path: lucene/core/src/test/org/apache/lucene/util/automaton/TestNFARunAutomaton.java
##########
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestNFARunAutomaton extends LuceneTestCase {
+
+  public void testRandom() {
+    for (int i = 0; i < 100; i++) {
+      RegExp regExp = null;
+      while (regExp == null) {
+        try {
+          regExp = new RegExp(AutomatonTestUtil.randomRegexp(random()));
+        } catch (IllegalArgumentException e) {
+          ignoreException(e);
+        }
+      }
+      Automaton dfa = regExp.toAutomaton();

Review comment:
       Hmm should we rename this existing method to `toDFA()` maybe?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {
+        automaton.getNextTransition(transition);
+        if (transition.min <= c && transition.max >= c) {
+          stateSet.incr(transition.dest);
+        }
+      }
+    }
+    if (stateSet.size() == 0) {
+      return null;
+    }
+    return new DState(stateSet.getArray());
+  }
+
+  /**
+   * return the ordinal of given DFA state, generate a new ordinal if the given DFA state is a new
+   * one
+   */
+  private int findDState(DState dState) {
+    if (dState == null) {
+      return MISSING;
+    }
+    int ord = dStateToOrd.getOrDefault(dState, -1);
+    if (ord >= 0) {
+      return ord;
+    }
+    ord = dStateToOrd.size();
+    dStateToOrd.put(dState, ord);
+    assert ord >= dStates.length || dStates[ord] == null;
+    if (ord >= dStates.length) {
+      dStates = ArrayUtil.grow(dStates, ord + 1);
+    }
+    dStates[ord] = dState;
+    return ord;
+  }
+
+  /** Gets character class of given codepoint */
+  final int getCharClass(int c) {
+    assert c < alphabetSize;
+    // binary search
+    int a = 0;
+    int b = points.length;
+    while (b - a > 1) {
+      int d = (a + b) >>> 1;
+      if (points[d] > c) b = d;
+      else if (points[d] < c) a = d;
+      else return d;
+    }
+    return a;
+  }
+
+  private class DState {
+    private final int[] nfaStates;
+    private int[] transitions;
+    private final int hashCode;
+    private final boolean isAccept;
+
+    private DState(int[] nfaStates) {
+      assert nfaStates != null && nfaStates.length > 0;
+      this.nfaStates = nfaStates;
+      int hashCode = nfaStates.length;
+      boolean isAccept = false;
+      for (int s : nfaStates) {
+        hashCode += BitMixer.mix(s);
+        if (automaton.isAccept(s)) {
+          isAccept = true;
+        }
+      }
+      this.isAccept = isAccept;
+      this.hashCode = hashCode;
+    }
+
+    private int nextState(int charClass) {
+      initTransitions();
+      assert charClass < transitions.length;
+      return transitions[charClass];
+    }
+
+    private void setNextState(int charClass, int nextState) {
+      initTransitions();
+      assert charClass < transitions.length;
+      transitions[charClass] = nextState;

Review comment:
       Maybe also `assert transitions[charClass] == NOT_COMPUTED` so we confirm we are not double-computing transitions or something?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {
+        automaton.getNextTransition(transition);
+        if (transition.min <= c && transition.max >= c) {
+          stateSet.incr(transition.dest);
+        }
+      }
+    }
+    if (stateSet.size() == 0) {
+      return null;
+    }
+    return new DState(stateSet.getArray());
+  }
+
+  /**
+   * return the ordinal of given DFA state, generate a new ordinal if the given DFA state is a new
+   * one
+   */
+  private int findDState(DState dState) {
+    if (dState == null) {

Review comment:
       Hmm why would `dState` ever be null like that?

##########
File path: lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
##########
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.hppc.BitMixer;
+
+/**
+ * A RunAutomaton that does not require DFA, it will determinize and memorize the generated DFA
+ * state along with the run
+ *
+ * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ */
+public class NFARunAutomaton {
+
+  /** state ordinal of "no such state" */
+  public static final int MISSING = -1;
+
+  private static final int NOT_COMPUTED = -2;
+
+  private final Automaton automaton;
+  private final int[] points;
+  private final Map<DState, Integer> dStateToOrd = new HashMap<>(); // could init lazily?
+  private DState[] dStates;
+  private final int alphabetSize;
+
+  /**
+   * Constructor, assuming alphabet size is the whole codepoint space
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} for
+   *     better efficiency
+   */
+  public NFARunAutomaton(Automaton automaton) {
+    this(automaton, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Constructor
+   *
+   * @param automaton incoming automaton, should be NFA, for DFA please use {@link RunAutomaton} *
+   *     for better efficiency
+   * @param alphabetSize alphabet size
+   */
+  public NFARunAutomaton(Automaton automaton, int alphabetSize) {
+    this.automaton = automaton;
+    points = automaton.getStartPoints();
+    this.alphabetSize = alphabetSize;
+    dStates = new DState[10];
+    findDState(new DState(new int[] {0}));
+  }
+
+  /**
+   * For a given state and an incoming character (codepoint), return the next state
+   *
+   * @param state incoming state, should either be 0 or some state that is returned previously by
+   *     this function
+   * @param c codepoint
+   * @return the next state or {@link #MISSING} if the transition doesn't exist
+   */
+  public int step(int state, int c) {
+    assert dStates[state] != null;
+    return step(dStates[state], c);
+  }
+
+  /**
+   * Run through a given codepoint array, return accepted or not, should only be used in test
+   *
+   * @param s String represented by an int array
+   * @return accept or not
+   */
+  boolean run(int[] s) {
+    int p = 0;
+    for (int c : s) {
+      p = step(p, c);
+      if (p == MISSING) return false;
+    }
+    return dStates[p].isAccept;
+  }
+
+  /**
+   * From an existing DFA state, step to next DFA state given character c if the transition is
+   * previously tried then this operation will just use the cached result, otherwise it will call
+   * {@link #step(int[], int)} to get the next state and cache the result
+   */
+  private int step(DState dState, int c) {
+    int charClass = getCharClass(c);
+    if (dState.nextState(charClass) == NOT_COMPUTED) {
+      // the next dfa state has not been computed yet
+      dState.setNextState(charClass, findDState(step(dState.nfaStates, c)));
+    }
+    return dState.nextState(charClass);
+  }
+
+  /**
+   * given a list of NFA states and a character c, compute the output list of NFA state which is
+   * wrapped as a DFA state
+   */
+  private DState step(int[] nfaStates, int c) {
+    Transition transition = new Transition();
+    StateSet stateSet = new StateSet(5); // fork IntHashSet from hppc instead?
+    int numTransitions;
+    for (int nfaState : nfaStates) {
+      numTransitions = automaton.initTransition(nfaState, transition);
+      for (int i = 0; i < numTransitions; i++) {
+        automaton.getNextTransition(transition);
+        if (transition.min <= c && transition.max >= c) {
+          stateSet.incr(transition.dest);
+        }
+      }
+    }
+    if (stateSet.size() == 0) {
+      return null;
+    }
+    return new DState(stateSet.getArray());
+  }
+
+  /**
+   * return the ordinal of given DFA state, generate a new ordinal if the given DFA state is a new
+   * one
+   */
+  private int findDState(DState dState) {
+    if (dState == null) {
+      return MISSING;
+    }
+    int ord = dStateToOrd.getOrDefault(dState, -1);
+    if (ord >= 0) {
+      return ord;
+    }
+    ord = dStateToOrd.size();
+    dStateToOrd.put(dState, ord);
+    assert ord >= dStates.length || dStates[ord] == null;
+    if (ord >= dStates.length) {
+      dStates = ArrayUtil.grow(dStates, ord + 1);
+    }
+    dStates[ord] = dState;
+    return ord;
+  }
+
+  /** Gets character class of given codepoint */
+  final int getCharClass(int c) {
+    assert c < alphabetSize;
+    // binary search
+    int a = 0;
+    int b = points.length;
+    while (b - a > 1) {
+      int d = (a + b) >>> 1;
+      if (points[d] > c) b = d;
+      else if (points[d] < c) a = d;
+      else return d;
+    }
+    return a;
+  }
+
+  private class DState {
+    private final int[] nfaStates;
+    private int[] transitions;

Review comment:
       Maybe add comment that this is lazy-init'd first time caller wants transition for a given character?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org