You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2023/05/15 14:22:34 UTC
[lucene] branch branch_9x updated: toposort use iterator to avoid stackoverflow (#12286)
This is an automated email from the ASF dual-hosted git repository.
uschindler pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 76e8a4201a5 toposort use iterator to avoid stackoverflow (#12286)
76e8a4201a5 is described below
commit 76e8a4201a5ada5bca3fbbe22d03e274fbc22941
Author: tang donghai <ta...@gmail.com>
AuthorDate: Mon May 15 22:20:15 2023 +0800
toposort use iterator to avoid stackoverflow (#12286)
Co-authored-by: tangdonghai <ta...@meituan.com>
# Conflicts:
# lucene/CHANGES.txt
---
lucene/CHANGES.txt | 2 +
.../apache/lucene/util/automaton/Operations.java | 69 ++++++++++-----
.../lucene/util/automaton/TestOperations.java | 98 +++++++++++++++++++---
.../suggest/analyzing/TestAnalyzingSuggester.java | 16 ----
4 files changed, 136 insertions(+), 49 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8875a3f0953..230c33b875f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -27,6 +27,8 @@ Optimizations
* GITHUB#12160: Concurrent rewrite for AbstractKnnVectorQuery. (Kaival Parikh)
+* GITHUB#12286 Toposort use iterator to avoid stackoverflow. (Tang Donghai)
+
Bug Fixes
---------------------
(No changes)
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
index 6db58e7119e..a25d992e351 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
@@ -1311,9 +1311,14 @@ public final class Operations {
}
/**
- * Returns the topological sort of all states reachable from the initial state. Behavior is
- * undefined if this automaton has cycles. CPU cost is O(numTransitions), and the implementation
- * is recursive so an automaton matching long strings may exhaust the java stack.
+ * Returns the topological sort of all states reachable from the initial state. This method
+ * assumes that the automaton does not contain cycles, and will throw an IllegalArgumentException
+ * if a cycle is detected. The CPU cost is O(numTransitions), and the implementation is
+ * non-recursive, so it will not exhaust the java stack for automaton matching long strings. If
+ * there are dead states in the automaton, they will be removed from the returned array.
+ *
+ * @param a the Automaton to be sorted
+ * @return the topologically sorted array of state ids
*/
public static int[] topoSortStates(Automaton a) {
if (a.getNumStates() == 0) {
@@ -1321,8 +1326,7 @@ public final class Operations {
}
int numStates = a.getNumStates();
int[] states = new int[numStates];
- final BitSet visited = new BitSet(numStates);
- int upto = topoSortStatesRecurse(a, visited, states, 0, 0, 0);
+ int upto = topoSortStates(a, states);
if (upto < states.length) {
// There were dead states
@@ -1341,24 +1345,49 @@ public final class Operations {
return states;
}
- // TODO: not great that this is recursive... in theory a
- // large automata could exceed java's stack so the maximum level of recursion is bounded to 1000
- private static int topoSortStatesRecurse(
- Automaton a, BitSet visited, int[] states, int upto, int state, int level) {
- if (level > MAX_RECURSION_LEVEL) {
- throw new IllegalArgumentException("input automaton is too large: " + level);
- }
+ /**
+ * Performs a topological sort on the states of the given Automaton.
+ *
+ * @param a The automaton whose states are to be topologically sorted.
+ * @param states An int array which stores the states.
+ * @return the number of states in the final sorted list.
+ * @throws IllegalArgumentException if the input automaton has a cycle.
+ */
+ private static int topoSortStates(Automaton a, int[] states) {
+ BitSet onStack = new BitSet(a.getNumStates());
+ BitSet visited = new BitSet(a.getNumStates());
+ var stack = new ArrayDeque<Integer>();
+ stack.push(0); // Assuming that the initial state is 0.
+ int upto = 0;
Transition t = new Transition();
- int count = a.initTransition(state, t);
- for (int i = 0; i < count; i++) {
- a.getNextTransition(t);
- if (!visited.get(t.dest)) {
- visited.set(t.dest);
- upto = topoSortStatesRecurse(a, visited, states, upto, t.dest, level + 1);
+
+ while (!stack.isEmpty()) {
+ int state = stack.peek(); // Just peek, don't remove the state yet
+
+ int count = a.initTransition(state, t);
+ boolean pushed = false;
+ for (int i = 0; i < count; i++) {
+ a.getNextTransition(t);
+ if (!visited.get(t.dest)) {
+ visited.set(t.dest);
+ stack.push(t.dest); // Push the next unvisited state onto the stack
+ onStack.set(state);
+ pushed = true;
+ break; // Exit the loop, we'll continue from here in the next iteration
+ } else if (onStack.get(t.dest)) {
+ // If the state is on the current recursion stack, we have detected a cycle
+ throw new IllegalArgumentException("Input automaton has a cycle.");
+ }
+ }
+
+ // If we haven't pushed any new state onto the stack, we're done with this state
+ if (!pushed) {
+ onStack.clear(state); // remove the node from the current recursion stack
+ stack.pop();
+ states[upto] = state;
+ upto++;
}
}
- states[upto] = state;
- upto++;
return upto;
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java
index 62913c16659..a639d593258 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java
@@ -17,6 +17,7 @@
package org.apache.lucene.util.automaton;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
+import static org.apache.lucene.util.automaton.Operations.topoSortStates;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.util.ArrayList;
@@ -69,6 +70,42 @@ public class TestOperations extends LuceneTestCase {
assertTrue(Operations.isEmpty(concat));
}
+ /**
+ * Test case for the topoSortStates method when the input Automaton contains a cycle. This test
+ * case constructs an Automaton with two disjoint sets of states—one without a cycle and one with
+ * a cycle. The topoSortStates method should detect the presence of a cycle and throw an
+ * IllegalArgumentException.
+ */
+ public void testCycledAutomaton() {
+ Automaton a = generateRandomAutomaton(true);
+ IllegalArgumentException exc =
+ expectThrows(IllegalArgumentException.class, () -> topoSortStates(a));
+ assertTrue(exc.getMessage().contains("Input automaton has a cycle"));
+ }
+
+ public void testTopoSortStates() {
+ Automaton a = generateRandomAutomaton(false);
+
+ int[] sorted = topoSortStates(a);
+ int[] stateMap = new int[a.getNumStates()];
+ Arrays.fill(stateMap, -1);
+ int order = 0;
+ for (int state : sorted) {
+ assertEquals(-1, stateMap[state]);
+ stateMap[state] = (order++);
+ }
+
+ Transition transition = new Transition();
+ for (int state : sorted) {
+ int count = a.initTransition(state, transition);
+ for (int i = 0; i < count; i++) {
+ a.getNextTransition(transition);
+ // ensure dest's order is higher than current state
+ assertTrue(stateMap[transition.dest] > stateMap[state]);
+ }
+ }
+ }
+
/** Test optimization to concatenate() with empty String to an NFA */
public void testEmptySingletonNFAConcatenate() {
Automaton singleton = Automata.makeString("");
@@ -144,19 +181,6 @@ public class TestOperations extends LuceneTestCase {
assertTrue(exc.getMessage().contains("input automaton is too large"));
}
- public void testTopoSortEatsStack() {
- char[] chars = new char[50000];
- TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
- String bigString1 = new String(chars);
- TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
- String bigString2 = new String(chars);
- Automaton a =
- Operations.union(Automata.makeString(bigString1), Automata.makeString(bigString2));
- IllegalArgumentException exc =
- expectThrows(IllegalArgumentException.class, () -> Operations.topoSortStates(a));
- assertTrue(exc.getMessage().contains("input automaton is too large"));
- }
-
/**
* Returns the set of all accepted strings.
*
@@ -190,4 +214,52 @@ public class TestOperations extends LuceneTestCase {
return result;
}
+
+ /**
+ * This method creates a random Automaton by generating states at multiple levels. At each level,
+ * a random number of states are created, and transitions are added between the states of the
+ * current and the previous level randomly, If the 'hasCycle' parameter is true, a transition is
+ * added from the first state of the last level back to the initial state to create a cycle in the
+ * Automaton..
+ *
+ * @param hasCycle if true, the generated Automaton will have a cycle; if false, it won't have a
+ * cycle.
+ * @return a randomly generated Automaton instance.
+ */
+ private Automaton generateRandomAutomaton(boolean hasCycle) {
+ Automaton a = new Automaton();
+ List<Integer> lastLevelStates = new ArrayList<>();
+ int initialState = a.createState();
+ int maxLevel = random().nextInt(4, 10);
+ lastLevelStates.add(initialState);
+
+ for (int level = 1; level < maxLevel; level++) {
+ int numStates = random().nextInt(3, 10);
+ List<Integer> nextLevelStates = new ArrayList<>();
+
+ for (int i = 0; i < numStates; i++) {
+ int nextState = a.createState();
+ nextLevelStates.add(nextState);
+ }
+
+ for (int lastState : lastLevelStates) {
+ for (int nextState : nextLevelStates) {
+ // if hasCycle is enabled, we will always add a transition, so we could make sure the
+ // generated Automaton has a cycle.
+ if (hasCycle || random().nextInt(7) >= 1) {
+ a.addTransition(lastState, nextState, random().nextInt(10));
+ }
+ }
+ }
+ lastLevelStates = nextLevelStates;
+ }
+
+ if (hasCycle) {
+ int lastState = lastLevelStates.get(0);
+ a.addTransition(lastState, initialState, random().nextInt(10));
+ }
+
+ a.finishState();
+ return a;
+ }
}
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java
index b5e49c5ec79..cb14d983716 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java
@@ -1325,22 +1325,6 @@ public class TestAnalyzingSuggester extends LuceneTestCase {
return asList;
}
- // TODO: we need BaseSuggesterTestCase?
- public void testTooLongSuggestion() throws Exception {
- Analyzer a = new MockAnalyzer(random());
- Directory tempDir = getDirectory();
- AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a);
- String bigString = TestUtil.randomSimpleString(random(), 30000, 30000);
- IllegalArgumentException ex =
- expectThrows(
- IllegalArgumentException.class,
- () -> {
- suggester.build(new InputArrayIterator(new Input[] {new Input(bigString, 7)}));
- });
- assertTrue(ex.getMessage().contains("input automaton is too large"));
- IOUtils.close(a, tempDir);
- }
-
private Directory getDirectory() {
return newDirectory();
}