You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/11/04 21:38:26 UTC

svn commit: r1636728 [1/2] - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/search/ lucene/core/src/java/org/apache/lucene/util/automaton/ lucene/core/src/test/org/apache/lucene/analysis/ lucene/core/s...

Author: mikemccand
Date: Tue Nov  4 20:38:25 2014
New Revision: 1636728

URL: http://svn.apache.org/r1636728
Log:
LUCENE-6046: add maxDeterminizedStates to determinize to prevent exhausting CPU/RAM when the automaton is too difficult to determinize

Added:
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/TooComplexToDeterminizeException.java
      - copied unchanged from r1636716, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/TooComplexToDeterminizeException.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
      - copied unchanged from r1636716, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/core/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminizeLexicon.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java
    lucene/dev/branches/branch_5x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java
    lucene/dev/branches/branch_5x/lucene/queryparser/   (props changed)
    lucene/dev/branches/branch_5x/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
    lucene/dev/branches/branch_5x/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/RegexpQueryNodeBuilder.java
    lucene/dev/branches/branch_5x/lucene/sandbox/   (props changed)
    lucene/dev/branches/branch_5x/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
    lucene/dev/branches/branch_5x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_5x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
    lucene/dev/branches/branch_5x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
    lucene/dev/branches/branch_5x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/core/   (props changed)
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java
    lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Tue Nov  4 20:38:25 2014
@@ -201,6 +201,11 @@ Bug Fixes
 * LUCENE-6042: CustomScoreQuery explain was incorrect in some cases,
   such as when nested inside a boolean query. (Denis Lantsman via Robert Muir)
 
+* LUCENE-6046: Add maxDeterminizedStates safety to determinize (which has
+  an exponential worst case) so that if it would create too many states, it
+  now throws an exception instead of exhausting CPU/RAM.  (Nik
+  Everett via Mike McCandless)
+
 Documentation
 
 * LUCENE-5392: Add/improve analysis package documentation to reflect

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java Tue Nov  4 20:38:25 2014
@@ -1,11 +1,5 @@
 package org.apache.lucene.search;
 
-import org.apache.lucene.index.Term;
-import org.apache.lucene.util.ToStringUtils;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.AutomatonProvider;
-import org.apache.lucene.util.automaton.RegExp;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -23,6 +17,13 @@ import org.apache.lucene.util.automaton.
  * limitations under the License.
  */
 
+import org.apache.lucene.index.Term;
+import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonProvider;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
+
 /**
  * A fast regular expression query based on the
  * {@link org.apache.lucene.util.automaton} package.
@@ -75,18 +76,37 @@ public class RegexpQuery extends Automat
    * @param flags optional RegExp features from {@link RegExp}
    */
   public RegexpQuery(Term term, int flags) {
-    this(term, flags, defaultProvider);
+    this(term, flags, defaultProvider,
+      Operations.DEFAULT_MAX_DETERMINIZED_STATES);
   }
-  
+
+  /**
+   * Constructs a query for terms matching <code>term</code>.
+   * 
+   * @param term regular expression.
+   * @param flags optional RegExp features from {@link RegExp}
+   * @param maxDeterminizedStates maximum number of states that compiling the
+   *  automaton for the regexp can result in.  Set higher to allow more complex
+   *  queries and lower to prevent memory exhaustion.
+   */
+  public RegexpQuery(Term term, int flags, int maxDeterminizedStates) {
+    this(term, flags, defaultProvider, maxDeterminizedStates);
+  }
+
   /**
    * Constructs a query for terms matching <code>term</code>.
    * 
    * @param term regular expression.
    * @param flags optional RegExp features from {@link RegExp}
    * @param provider custom AutomatonProvider for named automata
+   * @param maxDeterminizedStates maximum number of states that compiling the
+   *  automaton for the regexp can result in.  Set higher to allow more complex
+   *  queries and lower to prevent memory exhaustion.
    */
-  public RegexpQuery(Term term, int flags, AutomatonProvider provider) {
-    super(term, new RegExp(term.text(), flags).toAutomaton(provider));
+  public RegexpQuery(Term term, int flags, AutomatonProvider provider,
+      int maxDeterminizedStates) {
+    super(term, new RegExp(term.text(), flags).toAutomaton(
+      provider, maxDeterminizedStates));
   }
   
   /** Prints a user-readable version of this query. */

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java Tue Nov  4 20:38:25 2014
@@ -33,7 +33,7 @@ import java.io.IOException;
 
 /**
  * Automaton provider for <code>RegExp.</code>
- * {@link RegExp#toAutomaton(AutomatonProvider)}
+ * {@link RegExp#toAutomaton(AutomatonProvider,int)}
  * 
  * @lucene.experimental
  */

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java Tue Nov  4 20:38:25 2014
@@ -24,12 +24,12 @@ public class ByteRunAutomaton extends Ru
 
   /** Converts incoming automaton to byte-based (UTF32ToUTF8) first */
   public ByteRunAutomaton(Automaton a) {
-    this(a, false);
+    this(a, false, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
   }
   
   /** expert: if utf8 is true, the input is already byte-based */
-  public ByteRunAutomaton(Automaton a, boolean utf8) {
-    super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true);
+  public ByteRunAutomaton(Automaton a, boolean utf8, int maxDeterminizedStates) {
+    super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
   }
 
   /**

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java Tue Nov  4 20:38:25 2014
@@ -21,10 +21,22 @@ package org.apache.lucene.util.automaton
  * Automaton representation for matching char[].
  */
 public class CharacterRunAutomaton extends RunAutomaton {
-
-  /** Sole constructor. */
+  /**
+   * Construct with a default number of maxDeterminizedStates.
+   */
   public CharacterRunAutomaton(Automaton a) {
-    super(a, Character.MAX_CODE_POINT, false);
+    this(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+  }
+
+  /**
+   * Construct specifying maxDeterminizedStates.
+   * @param a Automaton to match
+   * @param maxDeterminizedStates maximum number of states that the automataon
+   *   can have once determinized.  If more states are required to determinize
+   *   it then a TooComplexToDeterminizeException is thrown.
+   */ 
+  public CharacterRunAutomaton(Automaton a, int maxDeterminizedStates) {
+    super(a, Character.MAX_CODE_POINT, false, maxDeterminizedStates);
   }
 
   /**

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java Tue Nov  4 20:38:25 2014
@@ -21,10 +21,10 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.lucene.index.SingleTermsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.PrefixTermsEnum;
-import org.apache.lucene.index.SingleTermsEnum;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 
@@ -101,7 +101,20 @@ public class CompiledAutomaton {
    *  possibly expensive operations to determine if the automaton is one
    *  the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
   public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
+    this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+  }
+
 
+  /** Create this.  If finite is null, we use {@link Operations#isFinite}
+   *  to determine whether it is finite.  If simplify is true, we run
+   *  possibly expensive operations to determine if the automaton is one
+   *  the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. If simplify
+   *  requires determinizing the autaomaton then only maxDeterminizedStates
+   *  will be created.  Any more than that will cause a
+   *  TooComplexToDeterminizeException.
+   */
+  public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify,
+      int maxDeterminizedStates) {
     if (automaton.getNumStates() == 0) {
       automaton = new Automaton();
       automaton.createState();
@@ -134,7 +147,7 @@ public class CompiledAutomaton {
         return;
       } else {
 
-        automaton = Operations.determinize(automaton);
+        automaton = Operations.determinize(automaton, maxDeterminizedStates);
 
         final String commonPrefix = Operations.getCommonPrefix(automaton);
         final String singleton;
@@ -156,7 +169,7 @@ public class CompiledAutomaton {
           return;
         } else if (commonPrefix.length() > 0) {
           Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString());
-          other = Operations.determinize(other);
+          other = Operations.determinize(other, maxDeterminizedStates);
           assert Operations.hasDeadStates(other) == false;
           if (Operations.sameLanguage(automaton, other)) {
             // matches a constant prefix
@@ -185,9 +198,9 @@ public class CompiledAutomaton {
     if (this.finite) {
       commonSuffixRef = null;
     } else {
-      commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8);
+      commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8, maxDeterminizedStates);
     }
-    runAutomaton = new ByteRunAutomaton(utf8, true);
+    runAutomaton = new ByteRunAutomaton(utf8, true, maxDeterminizedStates);
 
     this.automaton = runAutomaton.automaton;
   }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java Tue Nov  4 20:38:25 2014
@@ -29,8 +29,8 @@
 
 package org.apache.lucene.util.automaton;
 
-import java.util.BitSet;
 import java.util.ArrayList;
+import java.util.BitSet;
 import java.util.HashSet;
 import java.util.LinkedList;
 
@@ -45,21 +45,17 @@ final public class MinimizationOperation
 
   /**
    * Minimizes (and determinizes if not already deterministic) the given
-   * automaton.
-   */
-  public static Automaton minimize(Automaton a) {
-    return minimizeHopcroft(a);
-  }
-  
-  /**
-   * Minimizes the given automaton using Hopcroft's algorithm.
+   * automaton using Hopcroft's algorighm.
+   * @param maxDeterminizedStates maximum number of states determinizing the
+   *  automaton can result in.  Set higher to allow more complex queries and
+   *  lower to prevent memory exhaustion.
    */
-  public static Automaton minimizeHopcroft(Automaton a) {
+  public static Automaton minimize(Automaton a, int maxDeterminizedStates) {
     if (a.getNumStates() == 0 || (a.isAccept(0) == false && a.getNumTransitions(0) == 0)) {
       // Fastmatch for common case
       return new Automaton();
     }
-    a = Operations.determinize(a);
+    a = Operations.determinize(a, maxDeterminizedStates);
     //a.writeDot("adet");
     if (a.getNumTransitions(0) == 1) {
       Transition t = new Transition();

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java Tue Nov  4 20:38:25 2014
@@ -53,7 +53,11 @@ import org.apache.lucene.util.RamUsageEs
  * @lucene.experimental
  */
 final public class Operations {
-  
+  /**
+   * Default maximum number of states that {@link Operations#determinize} should create.
+   */
+  public static final int DEFAULT_MAX_DETERMINIZED_STATES = 10000;
+
   private Operations() {}
 
   /**
@@ -202,12 +206,12 @@ final public class Operations {
    * <p>
    * Complexity: linear in number of states and in <code>min</code>.
    */
-  static public Automaton repeat(Automaton a, int min) {
-    if (min == 0) {
+  static public Automaton repeat(Automaton a, int count) {
+    if (count == 0) {
       return repeat(a);
     }
     List<Automaton> as = new ArrayList<>();
-    while (min-- > 0) {
+    while (count-- > 0) {
       as.add(a);
     }
     as.add(repeat(a));
@@ -242,19 +246,18 @@ final public class Operations {
     }
 
     Set<Integer> prevAcceptStates = toSet(b, 0);
-
+    Automaton.Builder builder = new Automaton.Builder();
+    builder.copy(b);
     for(int i=min;i<max;i++) {
-      int numStates = b.getNumStates();
-      b.copy(a);
+      int numStates = builder.getNumStates();
+      builder.copy(a);
       for(int s : prevAcceptStates) {
-        b.addEpsilon(s, numStates);
+        builder.addEpsilon(s, numStates);
       }
       prevAcceptStates = toSet(a, numStates);
     }
 
-    b.finishState();
-
-    return b;
+    return builder.finish();
   }
 
   private static Set<Integer> toSet(Automaton a, int offset) {
@@ -274,10 +277,14 @@ final public class Operations {
    * Returns a (deterministic) automaton that accepts the complement of the
    * language of the given automaton.
    * <p>
-   * Complexity: linear in number of states (if already deterministic).
+   * Complexity: linear in number of states if already deterministic and
+   *  exponential otherwise.
+   * @param maxDeterminizedStates maximum number of states determinizing the
+   *  automaton can result in.  Set higher to allow more complex queries and
+   *  lower to prevent memory exhaustion.
    */
-  static public Automaton complement(Automaton a) {
-    a = totalize(determinize(a));
+  static public Automaton complement(Automaton a, int maxDeterminizedStates) {
+    a = totalize(determinize(a, maxDeterminizedStates));
     int numStates = a.getNumStates();
     for (int p=0;p<numStates;p++) {
       a.setAccept(p, !a.isAccept(p));
@@ -291,16 +298,17 @@ final public class Operations {
    * <code>a2</code>. As a side-effect, the automata may be determinized, if not
    * already deterministic.
    * <p>
-   * Complexity: quadratic in number of states (if already deterministic).
+   * Complexity: quadratic in number of states if a2 already deterministic and
+   *  exponential in number of a2's states otherwise.
    */
-  static public Automaton minus(Automaton a1, Automaton a2) {
+  static public Automaton minus(Automaton a1, Automaton a2, int maxDeterminizedStates) {
     if (Operations.isEmpty(a1) || a1 == a2) {
       return Automata.makeEmpty();
     }
     if (Operations.isEmpty(a2)) {
       return a1;
     }
-    return intersection(a1, complement(a2));
+    return intersection(a1, complement(a2, maxDeterminizedStates));
   }
   
   /**
@@ -490,7 +498,6 @@ final public class Operations {
     result.createState();
 
     // Copy over all automata
-    Transition t = new Transition();
     for(Automaton a : l) {
       result.copy(a);
     }
@@ -644,8 +651,15 @@ final public class Operations {
    * Determinizes the given automaton.
    * <p>
    * Worst case complexity: exponential in number of states.
+   * @param maxDeterminizedStates Maximum number of states created when
+   *   determinizing.  Higher numbers allow this operation to consume more
+   *   memory but allow more complex automatons.  Use
+   *   DEFAULT_MAX_DETERMINIZED_STATES as a decent default if you don't know
+   *   how many to allow.
+   * @throws TooComplexToDeterminizeException if determinizing a creates an
+   *   automaton with more than maxDeterminizedStates
    */
-  public static Automaton determinize(Automaton a) {
+  public static Automaton determinize(Automaton a, int maxDeterminizedStates) {
     if (a.isDeterministic()) {
       // Already determinized
       return a;
@@ -674,11 +688,6 @@ final public class Operations {
     b.setAccept(0, a.isAccept(0));
     newstate.put(initialset, 0);
 
-    int newStateUpto = 0;
-    int[] newStatesArray = new int[5];
-    newStatesArray[newStateUpto] = 0;
-    newStateUpto++;
-
     // like Set<Integer,PointTransitions>
     final PointTransitionSet points = new PointTransitionSet();
 
@@ -726,6 +735,9 @@ final public class Operations {
           Integer q = newstate.get(statesSet);
           if (q == null) {
             q = b.createState();
+            if (q >= maxDeterminizedStates) {
+              throw new TooComplexToDeterminizeException(a, maxDeterminizedStates);
+            }
             final SortedIntSet.FrozenIntSet p = statesSet.freeze(q);
             //System.out.println("  make new state=" + q + " -> " + p + " accCount=" + accCount);
             worklist.add(p);
@@ -1100,12 +1112,14 @@ final public class Operations {
    * Returns the longest BytesRef that is a suffix of all accepted strings.
    * Worst case complexity: exponential in number of states (this calls
    * determinize).
-   *
+   * @param maxDeterminizedStates maximum number of states determinizing the
+   *  automaton can result in.  Set higher to allow more complex queries and
+   *  lower to prevent memory exhaustion.
    * @return common suffix
    */
-  public static BytesRef getCommonSuffixBytesRef(Automaton a) {
+  public static BytesRef getCommonSuffixBytesRef(Automaton a, int maxDeterminizedStates) {
     // reverse the language of the automaton, then reverse its common prefix.
-    Automaton r = Operations.determinize(reverse(a));
+    Automaton r = Operations.determinize(reverse(a), maxDeterminizedStates);
     BytesRef ref = getCommonPrefixBytesRef(r);
     reverseBytes(ref);
     return ref;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java Tue Nov  4 20:38:25 2014
@@ -360,7 +360,8 @@ public class RegExp {
    * Syntax flag, enables no optional regexp syntax.
    */
   public static final int NONE = 0x0000;
-  
+
+  private final String originalString;
   Kind kind;
   RegExp exp1, exp2;
   String s;
@@ -368,11 +369,12 @@ public class RegExp {
   int min, max, digits;
   int from, to;
   
-  String b;
   int flags;
   int pos;
   
-  RegExp() {}
+  RegExp() {
+    this.originalString = null;
+  }
   
   /**
    * Constructs new <code>RegExp</code> from a string. Same as
@@ -396,13 +398,13 @@ public class RegExp {
    *              regular expression
    */
   public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
-    b = s;
+    originalString = s;
     flags = syntax_flags;
     RegExp e;
     if (s.length() == 0) e = makeString("");
     else {
       e = parseUnionExp();
-      if (pos < b.length()) throw new IllegalArgumentException(
+      if (pos < originalString.length()) throw new IllegalArgumentException(
           "end-of-string expected at position " + pos);
     }
     kind = e.kind;
@@ -415,7 +417,6 @@ public class RegExp {
     digits = e.digits;
     from = e.from;
     to = e.to;
-    b = null;
   }
 
   /**
@@ -423,21 +424,47 @@ public class RegExp {
    * as <code>toAutomaton(null)</code> (empty automaton map).
    */
   public Automaton toAutomaton() {
-    return toAutomaton(null, null);
+    return toAutomaton(null, null, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
   }
-  
+
   /**
    * Constructs new <code>Automaton</code> from this <code>RegExp</code>. The
    * constructed automaton is minimal and deterministic and has no transitions
    * to dead states.
    * 
-   * @param automaton_provider provider of automata for named identifiers
+   * @param maxDeterminizedStates maximum number of states in the resulting
+   *   automata.  If the automata would need more than this many states
+   *   TooComplextToDeterminizeException is thrown.  Higher number require more
+   *   space but can process more complex regexes.
    * @exception IllegalArgumentException if this regular expression uses a named
    *              identifier that is not available from the automaton provider
+   * @exception TooComplexToDeterminizeException if determinizing this regexp
+   *   requires more than maxDeterminizedStates states
    */
-  public Automaton toAutomaton(AutomatonProvider automaton_provider)
-      throws IllegalArgumentException {
-    return toAutomaton(null, automaton_provider);
+  public Automaton toAutomaton(int maxDeterminizedStates)
+      throws IllegalArgumentException, TooComplexToDeterminizeException {
+    return toAutomaton(null, null, maxDeterminizedStates);
+  }
+
+  /**
+   * Constructs new <code>Automaton</code> from this <code>RegExp</code>. The
+   * constructed automaton is minimal and deterministic and has no transitions
+   * to dead states.
+   * 
+   * @param automaton_provider provider of automata for named identifiers
+   * @param maxDeterminizedStates maximum number of states in the resulting
+   *   automata.  If the automata would need more than this many states
+   *   TooComplextToDeterminizeException is thrown.  Higher number require more
+   *   space but can process more complex regexes.
+   * @exception IllegalArgumentException if this regular expression uses a named
+   *   identifier that is not available from the automaton provider
+   * @exception TooComplexToDeterminizeException if determinizing this regexp
+   *   requires more than maxDeterminizedStates states
+   */
+  public Automaton toAutomaton(AutomatonProvider automaton_provider,
+      int maxDeterminizedStates) throws IllegalArgumentException,
+      TooComplexToDeterminizeException {
+    return toAutomaton(null, automaton_provider, maxDeterminizedStates);
   }
   
   /**
@@ -447,60 +474,95 @@ public class RegExp {
    * 
    * @param automata a map from automaton identifiers to automata (of type
    *          <code>Automaton</code>).
+   * @param maxDeterminizedStates maximum number of states in the resulting
+   *   automata.  If the automata would need more than this many states
+   *   TooComplexToDeterminizeException is thrown.  Higher number require more
+   *   space but can process more complex regexes.
    * @exception IllegalArgumentException if this regular expression uses a named
-   *              identifier that does not occur in the automaton map
+   *   identifier that does not occur in the automaton map
+   * @exception TooComplexToDeterminizeException if determinizing this regexp
+   *   requires more than maxDeterminizedStates states
    */
-  public Automaton toAutomaton(Map<String,Automaton> automata)
-      throws IllegalArgumentException {
-    return toAutomaton(automata, null);
+  public Automaton toAutomaton(Map<String,Automaton> automata,
+      int maxDeterminizedStates) throws IllegalArgumentException,
+      TooComplexToDeterminizeException {
+    return toAutomaton(automata, null, maxDeterminizedStates);
   }
 
   private Automaton toAutomaton(Map<String,Automaton> automata,
-      AutomatonProvider automaton_provider) throws IllegalArgumentException {
+      AutomatonProvider automaton_provider, int maxDeterminizedStates)
+      throws IllegalArgumentException, TooComplexToDeterminizeException {
+    try {
+      return toAutomatonInternal(automata, automaton_provider,
+        maxDeterminizedStates);
+    } catch (TooComplexToDeterminizeException e) {
+      throw new TooComplexToDeterminizeException(this, e);
+    }
+  }
+
+  private Automaton toAutomatonInternal(Map<String,Automaton> automata,
+      AutomatonProvider automaton_provider, int maxDeterminizedStates)
+      throws IllegalArgumentException {
     List<Automaton> list;
     Automaton a = null;
     switch (kind) {
       case REGEXP_UNION:
         list = new ArrayList<>();
-        findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
-        findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
+        findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
+          maxDeterminizedStates);
+        findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider,
+          maxDeterminizedStates);
         a = Operations.union(list);
-        a = MinimizationOperations.minimize(a);
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_CONCATENATION:
         list = new ArrayList<>();
         findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata,
-            automaton_provider);
+            automaton_provider, maxDeterminizedStates);
         findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
-            automaton_provider);
+            automaton_provider, maxDeterminizedStates);
         a = Operations.concatenate(list);
-        a = MinimizationOperations.minimize(a);
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_INTERSECTION:
         a = Operations.intersection(
-            exp1.toAutomaton(automata, automaton_provider),
-            exp2.toAutomaton(automata, automaton_provider));
-        a = MinimizationOperations.minimize(a);
+            exp1.toAutomatonInternal(
+              automata, automaton_provider, maxDeterminizedStates),
+            exp2.toAutomatonInternal(
+              automata, automaton_provider, maxDeterminizedStates));
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_OPTIONAL:
-        a = Operations.optional(exp1.toAutomaton(automata, automaton_provider));
-        a = MinimizationOperations.minimize(a);
+        a = Operations.optional(exp1.toAutomatonInternal(automata,
+          automaton_provider, maxDeterminizedStates));
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_REPEAT:
-        a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider));
-        a = MinimizationOperations.minimize(a);
+        a = Operations.repeat(exp1.toAutomatonInternal(
+          automata, automaton_provider, maxDeterminizedStates));
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_REPEAT_MIN:
-        a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min);
-        a = MinimizationOperations.minimize(a);
+        a = Operations.repeat(
+          exp1.toAutomatonInternal(automata, automaton_provider,
+            maxDeterminizedStates),
+          min);
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_REPEAT_MINMAX:
-        a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min, max);
-        a = MinimizationOperations.minimize(a);
+        a = Operations.repeat(
+          exp1.toAutomatonInternal(automata, automaton_provider,
+            maxDeterminizedStates),
+          min,
+          max);
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_COMPLEMENT:
-        a = Operations.complement(exp1.toAutomaton(automata, automaton_provider));
-        a = MinimizationOperations.minimize(a);
+        a = Operations.complement(
+          exp1.toAutomatonInternal(automata, automaton_provider,
+            maxDeterminizedStates),
+          maxDeterminizedStates);
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_CHAR:
         a = Automata.makeChar(c);
@@ -545,24 +607,37 @@ public class RegExp {
   }
   
   private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
-      Map<String,Automaton> automata, AutomatonProvider automaton_provider) {
+      Map<String,Automaton> automata, AutomatonProvider automaton_provider,
+      int maxDeterminizedStates) {
     if (exp.kind == kind) {
-      findLeaves(exp.exp1, kind, list, automata, automaton_provider);
-      findLeaves(exp.exp2, kind, list, automata, automaton_provider);
+      findLeaves(exp.exp1, kind, list, automata, automaton_provider,
+        maxDeterminizedStates);
+      findLeaves(exp.exp2, kind, list, automata, automaton_provider,
+        maxDeterminizedStates);
     } else {
-      list.add(exp.toAutomaton(automata, automaton_provider));
+      list.add(exp.toAutomatonInternal(automata, automaton_provider, 
+        maxDeterminizedStates));
     }
   }
-  
+
+  /**
+   * The string that was used to construct the regex.  Compare to toString.
+   */
+  public String getOriginalString() {
+    return originalString;
+  }
+
   /**
    * Constructs string from parsed regular expression.
    */
   @Override
   public String toString() {
-    return toStringBuilder(new StringBuilder()).toString();
+    StringBuilder b = new StringBuilder();
+    toStringBuilder(b);
+    return b.toString();
   }
   
-  StringBuilder toStringBuilder(StringBuilder b) {
+  void toStringBuilder(StringBuilder b) {
     switch (kind) {
       case REGEXP_UNION:
         b.append("(");
@@ -640,9 +715,112 @@ public class RegExp {
         b.append(s2).append(">");
         break;
     }
-    return b;
   }
-  
+
+  /**
+   * Like to string, but more verbose (shows the higherchy more clearly).
+   */
+  public String toStringTree() {
+    StringBuilder b = new StringBuilder();
+    toStringTree(b, "");
+    return b.toString();
+  }
+
+  void toStringTree(StringBuilder b, String indent) {
+    switch (kind) {
+      // binary
+      case REGEXP_UNION:
+      case REGEXP_CONCATENATION:
+      case REGEXP_INTERSECTION:
+        b.append(indent);
+        b.append(kind);
+        b.append('\n');
+        exp1.toStringTree(b, indent + "  ");
+        exp2.toStringTree(b, indent + "  ");
+        break;
+      // unary
+      case REGEXP_OPTIONAL:
+      case REGEXP_REPEAT:
+      case REGEXP_COMPLEMENT:
+        b.append(indent);
+        b.append(kind);
+        b.append('\n');
+        exp1.toStringTree(b, indent + "  ");
+        break;
+      case REGEXP_REPEAT_MIN:
+        b.append(indent);
+        b.append(kind);
+        b.append(" min=");
+        b.append(min);
+        b.append('\n');
+        exp1.toStringTree(b, indent + "  ");
+        break;
+      case REGEXP_REPEAT_MINMAX:
+        b.append(indent);
+        b.append(kind);
+        b.append(" min=");
+        b.append(min);
+        b.append(" max=");
+        b.append(max);
+        b.append('\n');
+        exp1.toStringTree(b, indent + "  ");
+        break;
+      case REGEXP_CHAR:
+        b.append(indent);
+        b.append(kind);
+        b.append(" char=");
+        b.appendCodePoint(c);
+        b.append('\n');
+        break;
+      case REGEXP_CHAR_RANGE:
+        b.append(indent);
+        b.append(kind);
+        b.append(" from=");
+        b.appendCodePoint(from);
+        b.append(" to=");
+        b.appendCodePoint(to);
+        b.append('\n');
+        break;
+      case REGEXP_ANYCHAR:
+      case REGEXP_EMPTY:
+        b.append(indent);
+        b.append(kind);
+        b.append('\n');
+        break;
+      case REGEXP_STRING:
+        b.append(indent);
+        b.append(kind);
+        b.append(" string=");
+        b.append(s);
+        b.append('\n');
+        break;
+      case REGEXP_ANYSTRING:
+        b.append(indent);
+        b.append(kind);
+        b.append('\n');
+        break;
+      case REGEXP_AUTOMATON:
+        b.append(indent);
+        b.append(kind);
+        b.append('\n');
+        break;
+      case REGEXP_INTERVAL:
+        b.append(indent);
+        b.append(kind);
+        String s1 = Integer.toString(min);
+        String s2 = Integer.toString(max);
+        b.append("<");
+        if (digits > 0) for (int i = s1.length(); i < digits; i++)
+          b.append('0');
+        b.append(s1).append("-");
+        if (digits > 0) for (int i = s2.length(); i < digits; i++)
+          b.append('0');
+        b.append(s2).append(">");
+        b.append('\n');
+        break;
+    }
+  }
+
   /**
    * Returns set of automaton identifiers that occur in this regular expression.
    */
@@ -819,12 +997,12 @@ public class RegExp {
   }
   
   private boolean peek(String s) {
-    return more() && s.indexOf(b.codePointAt(pos)) != -1;
+    return more() && s.indexOf(originalString.codePointAt(pos)) != -1;
   }
   
   private boolean match(int c) {
-    if (pos >= b.length()) return false;
-    if (b.codePointAt(pos) == c) {
+    if (pos >= originalString.length()) return false;
+    if (originalString.codePointAt(pos) == c) {
       pos += Character.charCount(c);
       return true;
     }
@@ -832,12 +1010,12 @@ public class RegExp {
   }
   
   private boolean more() {
-    return pos < b.length();
+    return pos < originalString.length();
   }
   
   private int next() throws IllegalArgumentException {
     if (!more()) throw new IllegalArgumentException("unexpected end-of-string");
-    int ch = b.codePointAt(pos);
+    int ch = originalString.codePointAt(pos);
     pos += Character.charCount(ch);
     return ch;
   }
@@ -878,13 +1056,14 @@ public class RegExp {
           next();
         if (start == pos) throw new IllegalArgumentException(
             "integer expected at position " + pos);
-        int n = Integer.parseInt(b.substring(start, pos));
+        int n = Integer.parseInt(originalString.substring(start, pos));
         int m = -1;
         if (match(',')) {
           start = pos;
           while (peek("0123456789"))
             next();
-          if (start != pos) m = Integer.parseInt(b.substring(start, pos));
+          if (start != pos) m = Integer.parseInt(
+            originalString.substring(start, pos));
         } else m = n;
         if (!match('}')) throw new IllegalArgumentException(
             "expected '}' at position " + pos);
@@ -935,7 +1114,7 @@ public class RegExp {
         next();
       if (!match('"')) throw new IllegalArgumentException(
           "expected '\"' at position " + pos);
-      return makeString(b.substring(start, pos - 1));
+      return makeString(originalString.substring(start, pos - 1));
     } else if (match('(')) {
       if (match(')')) return makeString("");
       RegExp e = parseUnionExp();
@@ -948,7 +1127,7 @@ public class RegExp {
         next();
       if (!match('>')) throw new IllegalArgumentException(
           "expected '>' at position " + pos);
-      String s = b.substring(start, pos - 1);
+      String s = originalString.substring(start, pos - 1);
       int i = s.indexOf('-');
       if (i == -1) {
         if (!check(AUTOMATON)) throw new IllegalArgumentException(

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java Tue Nov  4 20:38:25 2014
@@ -121,8 +121,21 @@ public abstract class RunAutomaton {
    * @param a an automaton
    */
   public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
+    this(a, maxInterval, tableize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+  }
+
+  /**
+   * Constructs a new <code>RunAutomaton</code> from a deterministic
+   * <code>Automaton</code>.
+   * 
+   * @param a an automaton
+   * @param maxDeterminizedStates maximum number of states that can be created
+   *   while determinizing a
+   */
+  public RunAutomaton(Automaton a, int maxInterval, boolean tableize,
+      int maxDeterminizedStates) {
     this.maxInterval = maxInterval;
-    a = Operations.determinize(a);
+    a = Operations.determinize(a, maxDeterminizedStates);
     this.automaton = a;
     points = a.getStartPoints();
     initial = 0;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java Tue Nov  4 20:38:25 2014
@@ -30,8 +30,10 @@ import org.apache.lucene.analysis.tokena
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util.automaton.Automata;
-import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
 
 public class TestGraphTokenizers extends BaseTokenStreamTestCase {
 
@@ -404,15 +406,11 @@ public class TestGraphTokenizers extends
   }
 
   public void testSingleToken() throws Exception {
-
     final TokenStream ts = new CannedTokenStream(
       new Token[] {
         token("abc", 1, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = s2a("abc");
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(s2a("abc"), ts);
   }
 
   public void testMultipleHoles() throws Exception {
@@ -421,10 +419,7 @@ public class TestGraphTokenizers extends
         token("a", 1, 1),
         token("b", 3, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts);
   }
 
   public void testSynOverMultipleHoles() throws Exception {
@@ -434,12 +429,9 @@ public class TestGraphTokenizers extends
         token("x", 0, 3),
         token("b", 3, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
     final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
     final Automaton a2 = join(s2a("x"), SEP_A, s2a("b")); 
-    final Automaton expected = Operations.union(a1, a2);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(Operations.union(a1, a2), ts);
   }
 
   // for debugging!
@@ -475,18 +467,12 @@ public class TestGraphTokenizers extends
   }
 
   public void testTwoTokens() throws Exception {
-
     final TokenStream ts = new CannedTokenStream(
       new Token[] {
         token("abc", 1, 1),
         token("def", 1, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected =  join("abc", "def");
-
-    //toDot(actual);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(join("abc", "def"), ts);
   }
 
   public void testHole() throws Exception {
@@ -496,13 +482,7 @@ public class TestGraphTokenizers extends
         token("abc", 1, 1),
         token("def", 2, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-
-    final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
-
-    //toDot(actual);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts);
   }
 
   public void testOverlappedTokensSausage() throws Exception {
@@ -513,12 +493,9 @@ public class TestGraphTokenizers extends
         token("abc", 1, 1),
         token("xyz", 0, 1)
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
     final Automaton a1 = s2a("abc");
     final Automaton a2 = s2a("xyz");
-    final Automaton expected = Operations.union(a1, a2);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(Operations.union(a1, a2), ts);
   }
 
   public void testOverlappedTokensLattice() throws Exception {
@@ -529,14 +506,9 @@ public class TestGraphTokenizers extends
         token("xyz", 0, 2),
         token("def", 1, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
     final Automaton a1 = s2a("xyz");
     final Automaton a2 = join("abc", "def");
-                                                                   
-    final Automaton expected = Operations.union(a1, a2);
-    //toDot(actual);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(Operations.union(a1, a2), ts);
   }
 
   public void testSynOverHole() throws Exception {
@@ -547,15 +519,9 @@ public class TestGraphTokenizers extends
         token("X", 0, 2),
         token("b", 2, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton a1 = Operations.union(
-                                               join(s2a("a"), SEP_A, HOLE_A),
-                                               s2a("X"));
-    final Automaton expected = Operations.concatenate(a1,
-                                                           join(SEP_A, s2a("b")));
-    //toDot(actual);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
+    final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
+    assertSameLanguage(expected, ts);
   }
 
   public void testSynOverHole2() throws Exception {
@@ -566,12 +532,9 @@ public class TestGraphTokenizers extends
         token("abc", 0, 3),
         token("def", 2, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
     final Automaton expected = Operations.union(
-                                                     join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
-                                                     s2a("abc"));
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+      join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
+    assertSameLanguage(expected, ts);
   }
 
   public void testOverlappedTokensLattice2() throws Exception {
@@ -583,13 +546,9 @@ public class TestGraphTokenizers extends
         token("def", 1, 1),
         token("ghi", 1, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
     final Automaton a1 = s2a("xyz");
     final Automaton a2 = join("abc", "def", "ghi");
-    final Automaton expected = Operations.union(a1, a2);
-    //toDot(actual);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(Operations.union(a1, a2), ts);
   }
 
   public void testToDot() throws Exception {
@@ -604,11 +563,7 @@ public class TestGraphTokenizers extends
       new Token[] {
         token("abc", 2, 1),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
-    //toDot(actual);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
   }
 
   // TODO: testEndsWithHole... but we need posInc to set in TS.end()
@@ -619,10 +574,16 @@ public class TestGraphTokenizers extends
         token("a", 1, 1),
         token("X", 0, 10),
       });
-    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
-    final Automaton expected = Operations.union(s2a("a"),
-                                                               s2a("X"));
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
-                                       Operations.determinize(Operations.removeDeadStates(actual))));
+    assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
+  }
+
+  private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
+    assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
+  }
+
+  private void assertSameLanguage(Automaton expected, Automaton actual) {
+    assertTrue(Operations.sameLanguage(
+      Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES),
+      Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES)));
   }
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java Tue Nov  4 20:38:25 2014
@@ -40,6 +40,8 @@ import org.apache.lucene.util.automaton.
 import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.RegExp;
 
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+
 public class TestMockAnalyzer extends BaseTokenStreamTestCase {
 
   /** Test a configuration that behaves a lot like WhitespaceAnalyzer */
@@ -166,7 +168,8 @@ public class TestMockAnalyzer extends Ba
       new CharacterRunAutomaton(
           Operations.complement(
               Operations.union(
-                  Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar")))));
+                  Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))),
+              DEFAULT_MAX_DETERMINIZED_STATES));
     Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
     assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
         new String[] { "foo", "bar", "bar", "foo" },

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java Tue Nov  4 20:38:25 2014
@@ -29,12 +29,12 @@ import org.apache.lucene.search.DocIdSet
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LineFileDocs;
-import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.automaton.Automata;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 
 @SuppressCodecs({ "SimpleText", "Memory", "Direct" })
@@ -182,7 +182,6 @@ public class TestTermsEnum extends Lucen
 
   // Tests Terms.intersect
   public void testIntersectRandom() throws IOException {
-
     final Directory dir = newDirectory();
     final RandomIndexWriter w = new RandomIndexWriter(random(), dir);
 
@@ -261,7 +260,7 @@ public class TestTermsEnum extends Lucen
         a = Automata.makeStringUnion(sortedAcceptTerms);
       }
       
-      final CompiledAutomaton c = new CompiledAutomaton(a, true, false);
+      final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000);
 
       final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
       final Set<BytesRef> acceptTermsSet = new HashSet<>();

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java Tue Nov  4 20:38:25 2014
@@ -38,6 +38,8 @@ import org.apache.lucene.util.LuceneTest
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.automaton.*;
 
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+
 public class TestTermsEnum2 extends LuceneTestCase {
   private Directory dir;
   private IndexReader reader;
@@ -86,7 +88,8 @@ public class TestTermsEnum2 extends Luce
 
     for (int i = 0; i < numIterations; i++) {
       String reg = AutomatonTestUtil.randomRegexp(random());
-      Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton());
+      Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(),
+        DEFAULT_MAX_DETERMINIZED_STATES);
       final List<BytesRef> matchedTerms = new ArrayList<>();
       for(BytesRef t : terms) {
         if (Operations.run(automaton, t.utf8ToString())) {
@@ -111,7 +114,8 @@ public class TestTermsEnum2 extends Luce
   public void testSeeking() throws Exception {
     for (int i = 0; i < numIterations; i++) {
       String reg = AutomatonTestUtil.randomRegexp(random());
-      Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton());
+      Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(),
+        DEFAULT_MAX_DETERMINIZED_STATES);
       TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);
       ArrayList<BytesRef> unsortedTerms = new ArrayList<>(terms);
       Collections.shuffle(unsortedTerms, random());
@@ -158,13 +162,15 @@ public class TestTermsEnum2 extends Luce
       Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
       CompiledAutomaton ca = new CompiledAutomaton(automaton, Operations.isFinite(automaton), false);
       TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
-      Automaton expected = Operations.determinize(Operations.intersection(termsAutomaton, automaton));
+      Automaton expected = Operations.determinize(Operations.intersection(termsAutomaton, automaton),
+        DEFAULT_MAX_DETERMINIZED_STATES);
       TreeSet<BytesRef> found = new TreeSet<>();
       while (te.next() != null) {
         found.add(BytesRef.deepCopyOf(te.term()));
       }
       
-      Automaton actual = Operations.determinize(Automata.makeStringUnion(found));
+      Automaton actual = Operations.determinize(Automata.makeStringUnion(found),
+        DEFAULT_MAX_DETERMINIZED_STATES);
       assertTrue(Operations.sameLanguage(expected, actual));
     }
   }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java Tue Nov  4 20:38:25 2014
@@ -33,10 +33,12 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.automaton.AutomatonTestUtil;
 import org.apache.lucene.util.automaton.Automata;
-import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
+import org.apache.lucene.util.automaton.Operations;
+
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
 
 public class TestAutomatonQuery extends LuceneTestCase {
   private Directory directory;
@@ -118,7 +120,7 @@ public class TestAutomatonQuery extends 
     assertAutomatonHits(0, Operations.intersection(Automata
         .makeChar('a'), Automata.makeChar('b')));
     assertAutomatonHits(1, Operations.minus(Automata.makeCharRange('a', 'b'), 
-        Automata.makeChar('a')));
+        Automata.makeChar('a'), DEFAULT_MAX_DETERMINIZED_STATES));
   }
 
   /**

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java Tue Nov  4 20:38:25 2014
@@ -28,11 +28,13 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.automaton.Automata;
-import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.AutomatonProvider;
+import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.RegExp;
 
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+
 /**
  * Some simple regex tests, mostly converted from contrib's TestRegexQuery.
  */
@@ -108,7 +110,8 @@ public class TestRegexpQuery extends Luc
         else return null;
       }
     };
-    RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL, myProvider);
+    RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
+      myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
     assertEquals(1, searcher.search(query, 5).totalHits);
   }
   

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java Tue Nov  4 20:38:25 2014
@@ -36,6 +36,8 @@ import org.apache.lucene.util.UnicodeUti
 import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
 import org.apache.lucene.util.fst.Util;
 
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+
 public class TestAutomaton extends LuceneTestCase {
 
   public void testBasic() throws Exception {
@@ -111,7 +113,7 @@ public class TestAutomaton extends Lucen
                             Automata.makeAnyString(),
                             Automata.makeString("n"),
                             Automata.makeAnyString()));
-    a = Operations.determinize(a);
+    a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a, "mn"));
     assertTrue(Operations.run(a, "mone"));
     assertFalse(Operations.run(a, "m"));
@@ -122,7 +124,7 @@ public class TestAutomaton extends Lucen
     Automaton a = Operations.union(Arrays.asList(
                             Automata.makeString("foobar"),
                             Automata.makeString("barbaz")));
-    a = Operations.determinize(a);
+    a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a, "foobar"));
     assertTrue(Operations.run(a, "barbaz"));
 
@@ -134,7 +136,7 @@ public class TestAutomaton extends Lucen
                             Automata.makeString("foobar"),
                             Automata.makeString(""),
                             Automata.makeString("barbaz")));
-    a = Operations.determinize(a);
+    a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a, "foobar"));
     assertTrue(Operations.run(a, "barbaz"));
     assertTrue(Operations.run(a, ""));
@@ -144,7 +146,7 @@ public class TestAutomaton extends Lucen
 
   public void testMinimizeSimple() throws Exception {
     Automaton a = Automata.makeString("foobar");
-    Automaton aMin = MinimizationOperations.minimize(a);
+    Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
 
     assertTrue(Operations.sameLanguage(a, aMin));
   }
@@ -152,14 +154,16 @@ public class TestAutomaton extends Lucen
   public void testMinimize2() throws Exception {
     Automaton a = Operations.union(Arrays.asList(Automata.makeString("foobar"),
                                                            Automata.makeString("boobar")));
-    Automaton aMin = MinimizationOperations.minimize(a);
-    assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(a)), aMin));
+    Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
+    assertTrue(Operations.sameLanguage(Operations.determinize(
+      Operations.removeDeadStates(a), DEFAULT_MAX_DETERMINIZED_STATES), aMin));
   }
 
   public void testReverse() throws Exception {
     Automaton a = Automata.makeString("foobar");
     Automaton ra = Operations.reverse(a);
-    Automaton a2 = Operations.determinize(Operations.reverse(ra));
+    Automaton a2 = Operations.determinize(Operations.reverse(ra),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     
     assertTrue(Operations.sameLanguage(a, a2));
   }
@@ -167,7 +171,7 @@ public class TestAutomaton extends Lucen
   public void testOptional() throws Exception {
     Automaton a = Automata.makeString("foobar");
     Automaton a2 = Operations.optional(a);
-    a2 = Operations.determinize(a2);
+    a2 = Operations.determinize(a2, DEFAULT_MAX_DETERMINIZED_STATES);
     
     assertTrue(Operations.run(a, "foobar"));
     assertFalse(Operations.run(a, ""));
@@ -177,7 +181,8 @@ public class TestAutomaton extends Lucen
 
   public void testRepeatAny() throws Exception {
     Automaton a = Automata.makeString("zee");
-    Automaton a2 = Operations.determinize(Operations.repeat(a));
+    Automaton a2 = Operations.determinize(Operations.repeat(a),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a2, ""));
     assertTrue(Operations.run(a2, "zee"));    
     assertTrue(Operations.run(a2, "zeezee"));
@@ -186,7 +191,8 @@ public class TestAutomaton extends Lucen
 
   public void testRepeatMin() throws Exception {
     Automaton a = Automata.makeString("zee");
-    Automaton a2 = Operations.determinize(Operations.repeat(a, 2));
+    Automaton a2 = Operations.determinize(Operations.repeat(a, 2),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     assertFalse(Operations.run(a2, ""));
     assertFalse(Operations.run(a2, "zee"));    
     assertTrue(Operations.run(a2, "zeezee"));
@@ -195,7 +201,8 @@ public class TestAutomaton extends Lucen
 
   public void testRepeatMinMax1() throws Exception {
     Automaton a = Automata.makeString("zee");
-    Automaton a2 = Operations.determinize(Operations.repeat(a, 0, 2));
+    Automaton a2 = Operations.determinize(Operations.repeat(a, 0, 2),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a2, ""));
     assertTrue(Operations.run(a2, "zee"));    
     assertTrue(Operations.run(a2, "zeezee"));
@@ -204,7 +211,8 @@ public class TestAutomaton extends Lucen
 
   public void testRepeatMinMax2() throws Exception {
     Automaton a = Automata.makeString("zee");
-    Automaton a2 = Operations.determinize(Operations.repeat(a, 2, 4));
+    Automaton a2 = Operations.determinize(Operations.repeat(a, 2, 4),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     assertFalse(Operations.run(a2, ""));
     assertFalse(Operations.run(a2, "zee"));    
     assertTrue(Operations.run(a2, "zeezee"));
@@ -215,7 +223,8 @@ public class TestAutomaton extends Lucen
 
   public void testComplement() throws Exception {
     Automaton a = Automata.makeString("zee");
-    Automaton a2 = Operations.determinize(Operations.complement(a));
+    Automaton a2 = Operations.determinize(Operations.complement(a,
+      DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a2, ""));
     assertFalse(Operations.run(a2, "zee"));    
     assertTrue(Operations.run(a2, "zeezee"));
@@ -223,7 +232,8 @@ public class TestAutomaton extends Lucen
   }
 
   public void testInterval() throws Exception {
-    Automaton a = Operations.determinize(Automata.makeInterval(17, 100, 3));
+    Automaton a = Operations.determinize(Automata.makeInterval(17, 100, 3),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     assertFalse(Operations.run(a, ""));
     assertTrue(Operations.run(a, "017"));
     assertTrue(Operations.run(a, "100"));
@@ -239,7 +249,8 @@ public class TestAutomaton extends Lucen
     a.addTransition(init, fini, 'm');
     a.addTransition(fini, fini, 'm');
     a.finishState();
-    assertEquals(0, Operations.getCommonSuffixBytesRef(a).length);
+    assertEquals(0, Operations.getCommonSuffixBytesRef(a,
+      DEFAULT_MAX_DETERMINIZED_STATES).length);
   }
 
   public void testReverseRandom1() throws Exception {
@@ -248,8 +259,9 @@ public class TestAutomaton extends Lucen
       Automaton a = AutomatonTestUtil.randomAutomaton(random());
       Automaton ra = Operations.reverse(a);
       Automaton rra = Operations.reverse(ra);
-      assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(a)),
-                                              Operations.determinize(Operations.removeDeadStates(rra))));
+      assertTrue(Operations.sameLanguage(
+        Operations.determinize(Operations.removeDeadStates(a), DEFAULT_MAX_DETERMINIZED_STATES),
+        Operations.determinize(Operations.removeDeadStates(rra), DEFAULT_MAX_DETERMINIZED_STATES)));
     }
   }
 
@@ -262,7 +274,7 @@ public class TestAutomaton extends Lucen
         a = Operations.removeDeadStates(a);
       }
       Automaton ra = Operations.reverse(a);
-      Automaton rda = Operations.determinize(ra);
+      Automaton rda = Operations.determinize(ra, DEFAULT_MAX_DETERMINIZED_STATES);
 
       if (Operations.isEmpty(a)) {
         assertTrue(Operations.isEmpty(rda));
@@ -290,7 +302,8 @@ public class TestAutomaton extends Lucen
   }
 
   public void testAnyStringEmptyString() throws Exception {
-    Automaton a = Operations.determinize(Automata.makeAnyString());
+    Automaton a = Operations.determinize(Automata.makeAnyString(),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a, ""));
   }
 
@@ -349,9 +362,9 @@ public class TestAutomaton extends Lucen
       }
 
       assertTrue(Operations.sameLanguage(
-                    Operations.determinize(Operations.removeDeadStates(a)),
-                    Operations.determinize(Operations.removeDeadStates(builder.finish()))));
-      
+        Operations.determinize(Operations.removeDeadStates(a), DEFAULT_MAX_DETERMINIZED_STATES),
+        Operations.determinize(Operations.removeDeadStates(builder.finish()),
+          DEFAULT_MAX_DETERMINIZED_STATES)));
     }
   }
 
@@ -368,7 +381,8 @@ public class TestAutomaton extends Lucen
     a.finishState();
     assertFalse(Operations.isTotal(a));
     a.setAccept(init, true);
-    assertTrue(Operations.isTotal(MinimizationOperations.minimize(a)));
+    assertTrue(Operations.isTotal(MinimizationOperations.minimize(a,
+      DEFAULT_MAX_DETERMINIZED_STATES)));
   }
 
   public void testMinimizeEmpty() throws Exception {
@@ -377,7 +391,7 @@ public class TestAutomaton extends Lucen
     int fini = a.createState();
     a.addTransition(init, fini, 'a');
     a.finishState();
-    a = MinimizationOperations.minimize(a);
+    a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     assertEquals(0, a.getNumStates());
   }
 
@@ -387,26 +401,29 @@ public class TestAutomaton extends Lucen
     Automaton a3 = Automata.makeString("beebar");
     Automaton a = Operations.union(Arrays.asList(a1, a2, a3));
     if (random().nextBoolean()) {
-      a = Operations.determinize(a);
+      a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     } else if (random().nextBoolean()) {
-      a = MinimizationOperations.minimize(a);
+      a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     }
     assertMatches(a, "foobar", "beebar", "boobar");
 
-    Automaton a4 = Operations.determinize(Operations.minus(a, a2));
+    Automaton a4 = Operations.determinize(Operations.minus(a, a2,
+      DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
     
     assertTrue(Operations.run(a4, "foobar"));
     assertFalse(Operations.run(a4, "boobar"));
     assertTrue(Operations.run(a4, "beebar"));
     assertMatches(a4, "foobar", "beebar");
 
-    a4 = Operations.determinize(Operations.minus(a4, a1));
+    a4 = Operations.determinize(Operations.minus(a4, a1,
+      DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
     assertFalse(Operations.run(a4, "foobar"));
     assertFalse(Operations.run(a4, "boobar"));
     assertTrue(Operations.run(a4, "beebar"));
     assertMatches(a4, "beebar");
 
-    a4 = Operations.determinize(Operations.minus(a4, a3));
+    a4 = Operations.determinize(Operations.minus(a4, a3,
+      DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
     assertFalse(Operations.run(a4, "foobar"));
     assertFalse(Operations.run(a4, "boobar"));
     assertFalse(Operations.run(a4, "beebar"));
@@ -415,7 +432,7 @@ public class TestAutomaton extends Lucen
 
   public void testOneInterval() throws Exception {
     Automaton a = Automata.makeInterval(999, 1032, 0);
-    a = Operations.determinize(a);
+    a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a, "0999"));
     assertTrue(Operations.run(a, "00999"));
     assertTrue(Operations.run(a, "000999"));
@@ -423,7 +440,7 @@ public class TestAutomaton extends Lucen
 
   public void testAnotherInterval() throws Exception {
     Automaton a = Automata.makeInterval(1, 2, 0);
-    a = Operations.determinize(a);
+    a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.run(a, "01"));
   }
 
@@ -445,9 +462,10 @@ public class TestAutomaton extends Lucen
       }
       String prefix = b.toString();
 
-      Automaton a = Operations.determinize(Automata.makeInterval(min, max, digits));
+      Automaton a = Operations.determinize(Automata.makeInterval(min, max, digits),
+        DEFAULT_MAX_DETERMINIZED_STATES);
       if (random().nextBoolean()) {
-        a = MinimizationOperations.minimize(a);
+        a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
       }
       String mins = Integer.toString(min);
       String maxs = Integer.toString(max);
@@ -487,7 +505,8 @@ public class TestAutomaton extends Lucen
       expected.add(Util.toUTF32(s, ints));
     }
 
-    assertEquals(expected, Operations.getFiniteStrings(Operations.determinize(a), -1)); 
+    assertEquals(expected, Operations.getFiniteStrings(Operations.determinize(a,
+      DEFAULT_MAX_DETERMINIZED_STATES), -1)); 
   }
 
   public void testConcatenatePreservesDet() throws Exception {
@@ -578,13 +597,13 @@ public class TestAutomaton extends Lucen
       if (VERBOSE) {
         System.out.println("  randomNoOp: determinize");
       }
-      return Operations.determinize(a);
+      return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
     case 1:
       if (a.getNumStates() < 100) {
         if (VERBOSE) {
           System.out.println("  randomNoOp: minimize");
         }
-        return MinimizationOperations.minimize(a);
+        return MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
       } else {
         if (VERBOSE) {
           System.out.println("  randomNoOp: skip op=minimize: too many states (" + a.getNumStates() + ")");
@@ -725,7 +744,7 @@ public class TestAutomaton extends Lucen
         if (VERBOSE) {
           System.out.println("  op=determinize");
         }
-        a = Operations.determinize(a);
+        a = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
         assertTrue(a.isDeterministic());
         break;
 
@@ -735,7 +754,7 @@ public class TestAutomaton extends Lucen
             System.out.println("  op=minimize");
           }
           // minimize
-          a = MinimizationOperations.minimize(a);
+          a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
         } else if (VERBOSE) {
           System.out.println("  skip op=minimize: too many states (" + a.getNumStates() + ")");
         }
@@ -791,7 +810,7 @@ public class TestAutomaton extends Lucen
               assertTrue(removed);
             }
             Automaton a2 = unionTerms(toRemove);
-            a = Operations.minus(a, a2);
+            a = Operations.minus(a, a2, DEFAULT_MAX_DETERMINIZED_STATES);
           }
         }
         break;
@@ -831,7 +850,7 @@ public class TestAutomaton extends Lucen
             }
           }
           Automaton a2 = randomNoOp(Operations.union(as));
-          a = Operations.minus(a, a2);
+          a = Operations.minus(a, a2, DEFAULT_MAX_DETERMINIZED_STATES);
         }
         break;
 
@@ -868,9 +887,9 @@ public class TestAutomaton extends Lucen
 
           Automaton a2 = Operations.union(as);
           if (random().nextBoolean()) {
-            a2 = Operations.determinize(a2);
+            a2 = Operations.determinize(a2, DEFAULT_MAX_DETERMINIZED_STATES);
           } else if (random().nextBoolean()) {
-            a2 = MinimizationOperations.minimize(a2);
+            a2 = MinimizationOperations.minimize(a2, DEFAULT_MAX_DETERMINIZED_STATES);
           }
           a = Operations.intersection(a, a2);
 
@@ -944,7 +963,7 @@ public class TestAutomaton extends Lucen
         if (VERBOSE) {
           System.out.println("  op=remove the empty string");
         }
-        a = Operations.minus(a, Automata.makeEmptyString());
+        a = Operations.minus(a, Automata.makeEmptyString(), DEFAULT_MAX_DETERMINIZED_STATES);
         terms.remove(new BytesRef());
         break;
 
@@ -1024,7 +1043,7 @@ public class TestAutomaton extends Lucen
       assertTrue(Operations.isFinite(a));
       assertFalse(Operations.isTotal(a));
 
-      Automaton detA = Operations.determinize(a);
+      Automaton detA = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
 
       // Make sure all terms are accepted:
       IntsRefBuilder scratch = new IntsRefBuilder();
@@ -1058,8 +1077,10 @@ public class TestAutomaton extends Lucen
       }
 
       // Use sameLanguage:
-      Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms)));
-      assertTrue(Operations.sameLanguage(a2, Operations.removeDeadStates(Operations.determinize(a))));
+      Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms),
+        DEFAULT_MAX_DETERMINIZED_STATES));
+      assertTrue(Operations.sameLanguage(a2, Operations.removeDeadStates(Operations.determinize(a,
+        DEFAULT_MAX_DETERMINIZED_STATES))));
 
       // Do same check, in UTF8 space
       Automaton utf8 = randomNoOp(new UTF32ToUTF8().convert(a));

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java Tue Nov  4 20:38:25 2014
@@ -31,14 +31,14 @@ import org.apache.lucene.util.TestUtil;
 
 public class TestCompiledAutomaton extends LuceneTestCase {
 
-  private CompiledAutomaton build(String... strings) {
+  private CompiledAutomaton build(int maxDeterminizedStates, String... strings) {
     final List<BytesRef> terms = new ArrayList<>();
     for(String s : strings) {
       terms.add(new BytesRef(s));
     }
     Collections.sort(terms);
     final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
-    return new CompiledAutomaton(a, true, false);
+    return new CompiledAutomaton(a, true, false, maxDeterminizedStates);
   }
 
   private void testFloor(CompiledAutomaton c, String input, String expected) {
@@ -53,8 +53,8 @@ public class TestCompiledAutomaton exten
     }
   }
 
-  private void testTerms(String[] terms) throws Exception {
-    final CompiledAutomaton c = build(terms);
+  private void testTerms(int maxDeterminizedStates, String[] terms) throws Exception {
+    final CompiledAutomaton c = build(maxDeterminizedStates, terms);
     final BytesRef[] termBytes = new BytesRef[terms.length];
     for(int idx=0;idx<terms.length;idx++) {
       termBytes[idx] = new BytesRef(terms[idx]);
@@ -100,7 +100,7 @@ public class TestCompiledAutomaton exten
     while(terms.size() != numTerms) {
       terms.add(randomString());
     }
-    testTerms(terms.toArray(new String[terms.size()]));
+    testTerms(numTerms * 100, terms.toArray(new String[terms.size()]));
   }
 
   private String randomString() {
@@ -109,7 +109,8 @@ public class TestCompiledAutomaton exten
   }
 
   public void testBasic() throws Exception {
-    CompiledAutomaton c = build("fob", "foo", "goo");
+    CompiledAutomaton c = build(Operations.DEFAULT_MAX_DETERMINIZED_STATES,
+      "fob", "foo", "goo");
     testFloor(c, "goo", "goo");
     testFloor(c, "ga", "foo");
     testFloor(c, "g", "foo");

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java Tue Nov  4 20:38:25 2014
@@ -19,6 +19,8 @@ package org.apache.lucene.util.automaton
 
 import org.apache.lucene.util.LuceneTestCase;
 
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+
 /**
  * Not completely thorough, but tries to test determinism correctness
  * somewhat randomly.
@@ -39,29 +41,32 @@ public class TestDeterminism extends Luc
     for (int i = 0; i < num; i++) {
       Automaton a = AutomatonTestUtil.randomAutomaton(random());
       a = AutomatonTestUtil.determinizeSimple(a);
-      Automaton b = Operations.determinize(a);
+      Automaton b = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
       // TODO: more verifications possible?
       assertTrue(Operations.sameLanguage(a, b));
     }
   }
   
   private static void assertAutomaton(Automaton a) {
-    a = Operations.determinize(Operations.removeDeadStates(a));
+    a = Operations.determinize(Operations.removeDeadStates(a), DEFAULT_MAX_DETERMINIZED_STATES);
 
     // complement(complement(a)) = a
-    Automaton equivalent = Operations.complement(Operations.complement(a));
+    Automaton equivalent = Operations.complement(Operations.complement(a,
+      DEFAULT_MAX_DETERMINIZED_STATES), DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.sameLanguage(a, equivalent));
     
     // a union a = a
-    equivalent = Operations.determinize(Operations.removeDeadStates(Operations.union(a, a)));
+    equivalent = Operations.determinize(Operations.removeDeadStates(Operations.union(a, a)),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.sameLanguage(a, equivalent));
     
     // a intersect a = a
-    equivalent = Operations.determinize(Operations.removeDeadStates(Operations.intersection(a, a)));
+    equivalent = Operations.determinize(Operations.removeDeadStates(Operations.intersection(a, a)),
+      DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.sameLanguage(a, equivalent));
     
     // a minus a = empty
-    Automaton empty = Operations.minus(a, a);
+    Automaton empty = Operations.minus(a, a, DEFAULT_MAX_DETERMINIZED_STATES);
     assertTrue(Operations.isEmpty(empty));
     
     // as long as don't accept the empty string
@@ -70,7 +75,8 @@ public class TestDeterminism extends Luc
       //System.out.println("test " + a);
       Automaton optional = Operations.optional(a);
       //System.out.println("optional " + optional);
-      equivalent = Operations.minus(optional, Automata.makeEmptyString());
+      equivalent = Operations.minus(optional, Automata.makeEmptyString(),
+        DEFAULT_MAX_DETERMINIZED_STATES);
       //System.out.println("equiv " + equivalent);
       assertTrue(Operations.sameLanguage(a, equivalent));
     }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminizeLexicon.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminizeLexicon.java?rev=1636728&r1=1636727&r2=1636728&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminizeLexicon.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminizeLexicon.java Tue Nov  4 20:38:25 2014
@@ -50,12 +50,12 @@ public class TestDeterminizeLexicon exte
   public void assertLexicon() throws Exception {
     Collections.shuffle(automata, random());
     Automaton lex = Operations.union(automata);
-    lex = Operations.determinize(lex);
+    lex = Operations.determinize(lex, 1000000);
     assertTrue(Operations.isFinite(lex));
     for (String s : terms) {
       assertTrue(Operations.run(lex, s));
     }
-    final ByteRunAutomaton lexByte = new ByteRunAutomaton(lex);
+    final ByteRunAutomaton lexByte = new ByteRunAutomaton(lex, false, 1000000);
     for (String s : terms) {
       byte bytes[] = s.getBytes(StandardCharsets.UTF_8);
       assertTrue(lexByte.run(bytes, 0, bytes.length));