You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2012/06/29 12:02:39 UTC

svn commit: r1355302 - in /lucene/dev/trunk/lucene: ./ core/src/java/org/apache/lucene/util/automaton/ core/src/test/org/apache/lucene/index/ core/src/test/org/apache/lucene/util/automaton/ test-framework/src/java/org/apache/lucene/util/automaton/

Author: dweiss
Date: Fri Jun 29 10:02:37 2012
New Revision: 1355302

URL: http://svn.apache.org/viewvc?rev=1355302&view=rev
Log:
LUCENE-3832: Port BasicAutomata.stringUnion from Brics to Lucene.

Added:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
      - copied, changed from r1355276, lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
Removed:
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Jun 29 10:02:37 2012
@@ -9,6 +9,12 @@ http://s.apache.org/luceneversions
 
 ======================= Lucene 4.0.0-BETA =======================
 
+New features
+
+* LUCENE-3832: Added BasicAutomata.makeStringUnion method to efficiently
+  create automata from a fixed collection of UTF-8 encoded BytesRef
+  (Dawid Weiss, Robert Muir)
+
 API Changes
 
 * LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java Fri Jun 29 10:02:37 2012
@@ -29,8 +29,9 @@
 
 package org.apache.lucene.util.automaton;
 
-import java.util.ArrayList;
-import java.util.Collection;
+import java.util.*;
+
+import org.apache.lucene.util.BytesRef;
 
 /**
  * Construction of basic automata.
@@ -239,4 +240,25 @@ final public class BasicAutomata {
     a.deterministic = true;
     return a;
   }
+
+  /**
+   * Returns a new (deterministic and minimal) automaton that accepts the union
+   * of the given collection of {@link BytesRef}s representing UTF-8 encoded
+   * strings.
+   * 
+   * @param utf8Strings
+   *          The input strings, UTF-8 encoded. The collection must be in sorted
+   *          order.
+   * 
+   * @return An {@link Automaton} accepting all input strings. The resulting
+   *         automaton is codepoint based (full unicode codepoints on
+   *         transitions).
+   */
+  public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
+    if (utf8Strings.isEmpty()) {
+      return makeEmpty();
+    } else {
+      return DaciukMihovAutomatonBuilder.build(utf8Strings);
+    }
+  }
 }

Copied: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java (from r1355276, lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java?p2=lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java&p1=lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java&r1=1355276&r2=1355302&rev=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java Fri Jun 29 10:02:37 2012
@@ -24,15 +24,18 @@ import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.UnicodeUtil;
 
 /**
- * Builds a minimal deterministic automaton that accepts a set of strings. The
- * algorithm requires sorted input data, but is very fast (nearly linear with
- * the input size).
+ * Builds a minimal, deterministic {@link Automaton} that accepts a set of 
+ * strings. The algorithm requires sorted input data, but is very fast 
+ * (nearly linear with the input size).
+ * 
+ * @see #build(Collection)
+ * @see BasicAutomata#makeStringUnion(Collection)
  */
-public final class DaciukMihovAutomatonBuilder {
+final class DaciukMihovAutomatonBuilder {
   /**
    * DFSA state with <code>char</code> labels on transitions.
    */
-  public final static class State {
+  private final static class State {
     
     /** An empty set of labels. */
     private final static int[] NO_LABELS = new int[0];
@@ -63,29 +66,12 @@ public final class DaciukMihovAutomatonB
      * with <code>label</code>. If no such transition exists, returns
      * <code>null</code>.
      */
-    public State getState(int label) {
+    State getState(int label) {
       final int index = Arrays.binarySearch(labels, label);
       return index >= 0 ? states[index] : null;
     }
     
     /**
-     * Returns an array of outgoing transition labels. The array is sorted in
-     * lexicographic order and indexes correspond to states returned from
-     * {@link #getStates()}.
-     */
-    public int[] getTransitionLabels() {
-      return this.labels;
-    }
-    
-    /**
-     * Returns an array of outgoing transitions from this state. The returned
-     * array must not be changed.
-     */
-    public State[] getStates() {
-      return this.states;
-    }
-    
-    /**
      * Two states are equal if:
      * <ul>
      * <li>they have an identical number of outgoing transitions, labeled with
@@ -103,21 +89,6 @@ public final class DaciukMihovAutomatonB
     }
     
     /**
-     * Return <code>true</code> if this state has any children (outgoing
-     * transitions).
-     */
-    public boolean hasChildren() {
-      return labels.length > 0;
-    }
-    
-    /**
-     * Is this state a final state in the automaton?
-     */
-    public boolean isFinal() {
-      return is_final;
-    }
-    
-    /**
      * Compute the hash code of the <i>current</i> status of this state.
      */
     @Override
@@ -142,6 +113,14 @@ public final class DaciukMihovAutomatonB
     }
     
     /**
+     * Return <code>true</code> if this state has any children (outgoing
+     * transitions).
+     */
+    boolean hasChildren() {
+      return labels.length > 0;
+    }
+
+    /**
      * Create a new outgoing transition labeled <code>label</code> and return
      * the newly created target state for this transition.
      */
@@ -149,9 +128,9 @@ public final class DaciukMihovAutomatonB
       assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: "
           + label;
       
-      labels = copyOf(labels, labels.length + 1);
-      states = copyOf(states, states.length + 1);
-      
+      labels = Arrays.copyOf(labels, labels.length + 1);
+      states = Arrays.copyOf(states, states.length + 1);
+
       labels[labels.length - 1] = label;
       return states[states.length - 1] = new State();
     }
@@ -188,42 +167,27 @@ public final class DaciukMihovAutomatonB
     }
     
     /**
-     * JDK1.5-replacement of {@link Arrays#copyOf(int[], int)}
-     */
-    private static int[] copyOf(int[] original, int newLength) {
-      int[] copy = new int[newLength];
-      System.arraycopy(original, 0, copy, 0,
-          Math.min(original.length, newLength));
-      return copy;
-    }
-    
-    /**
-     * JDK1.5-replacement of {@link Arrays#copyOf(char[], int)}
-     */
-    public static State[] copyOf(State[] original, int newLength) {
-      State[] copy = new State[newLength];
-      System.arraycopy(original, 0, copy, 0,
-          Math.min(original.length, newLength));
-      return copy;
-    }
-    
-    /**
      * Compare two lists of objects for reference-equality.
      */
     private static boolean referenceEquals(Object[] a1, Object[] a2) {
-      if (a1.length != a2.length) return false;
-      
-      for (int i = 0; i < a1.length; i++)
-        if (a1[i] != a2[i]) return false;
-      
+      if (a1.length != a2.length) { 
+        return false;
+      }
+
+      for (int i = 0; i < a1.length; i++) {
+        if (a1[i] != a2[i]) { 
+          return false;
+        }
+      }
+
       return true;
     }
   }
   
   /**
-   * "register" for state interning.
+   * A "registry" for state interning.
    */
-  private HashMap<State,State> register = new HashMap<State,State>();
+  private HashMap<State,State> stateRegistry = new HashMap<State,State>();
   
   /**
    * Root automaton state.
@@ -231,10 +195,14 @@ public final class DaciukMihovAutomatonB
   private State root = new State();
   
   /**
-   * Previous sequence added to the automaton in {@link #add(CharSequence)}.
+   * Previous sequence added to the automaton in {@link #add(CharsRef)}.
    */
   private CharsRef previous;
-  
+
+  /**
+   * A comparator used for enforcing sorted UTF8 order, used in assertions only.
+   */
+  @SuppressWarnings("deprecation")
   private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
 
   /**
@@ -243,12 +211,12 @@ public final class DaciukMihovAutomatonB
    * to this automaton (the input must be sorted).
    */
   public void add(CharsRef current) {
-    assert register != null : "Automaton already built.";
+    assert stateRegistry != null : "Automaton already built.";
     assert previous == null
-        || comparator.compare(previous, current) <= 0 : "Input must be sorted: "
+        || comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "
         + previous + " >= " + current;
     assert setPrevious(current);
-    
+
     // Descend in the automaton (find matching prefix).
     int pos = 0, max = current.length();
     State next, state = root;
@@ -270,11 +238,11 @@ public final class DaciukMihovAutomatonB
    * @return Root automaton state.
    */
   public State complete() {
-    if (this.register == null) throw new IllegalStateException();
+    if (this.stateRegistry == null) throw new IllegalStateException();
     
     if (root.hasChildren()) replaceOrRegister(root);
     
-    register = null;
+    stateRegistry = null;
     return root;
   }
   
@@ -293,15 +261,16 @@ public final class DaciukMihovAutomatonB
     int i = 0;
     int[] labels = s.labels;
     for (DaciukMihovAutomatonBuilder.State target : s.states) {
-      converted.addTransition(new Transition(labels[i++], convert(target,
-          visited)));
+      converted.addTransition(
+          new Transition(labels[i++], convert(target, visited)));
     }
     
     return converted;
   }
-  
+
   /**
-   * Build a minimal, deterministic automaton from a sorted list of strings.
+   * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
+   * strings in UTF-8. These strings must be binary-sorted.
    */
   public static Automaton build(Collection<BytesRef> input) {
     final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
@@ -313,7 +282,9 @@ public final class DaciukMihovAutomatonB
     }
     
     Automaton a = new Automaton();
-    a.initial = convert(builder.complete(), new IdentityHashMap<State,org.apache.lucene.util.automaton.State>());
+    a.initial = convert(
+        builder.complete(), 
+        new IdentityHashMap<State,org.apache.lucene.util.automaton.State>());
     a.deterministic = true;
     return a;
   }
@@ -330,21 +301,21 @@ public final class DaciukMihovAutomatonB
   
   /**
    * Replace last child of <code>state</code> with an already registered state
-   * or register the last child state.
+   * or stateRegistry the last child state.
    */
   private void replaceOrRegister(State state) {
     final State child = state.lastChild();
     
     if (child.hasChildren()) replaceOrRegister(child);
     
-    final State registered = register.get(child);
+    final State registered = stateRegistry.get(child);
     if (registered != null) {
       state.replaceLastChild(registered);
     } else {
-      register.put(child, child);
+      stateRegistry.put(child, child);
     }
   }
-  
+
   /**
    * Add a suffix of <code>current</code> starting at <code>fromIndex</code>
    * (inclusive) to state <code>state</code>.

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java Fri Jun 29 10:02:37 2012
@@ -35,7 +35,6 @@ import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 import org.apache.lucene.util.automaton.RegExp;
 
 @SuppressCodecs({ "SimpleText", "Memory" })
@@ -257,7 +256,7 @@ public class TestTermsEnum extends Lucen
           acceptTerms.add(s2);
           sortedAcceptTerms.add(new BytesRef(s2));
         }
-        a = DaciukMihovAutomatonBuilder.build(sortedAcceptTerms);
+        a = BasicAutomata.makeStringUnion(sortedAcceptTerms);
       }
       
       if (random().nextBoolean()) {

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java Fri Jun 29 10:02:37 2012
@@ -35,13 +35,7 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.AutomatonTestUtil;
-import org.apache.lucene.util.automaton.BasicOperations;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
-import org.apache.lucene.util.automaton.RegExp;
-import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.automaton.*;
 
 public class TestTermsEnum2 extends LuceneTestCase {
   private Directory dir;
@@ -72,7 +66,7 @@ public class TestTermsEnum2 extends Luce
       writer.addDocument(doc);
     }
     
-    termsAutomaton = DaciukMihovAutomatonBuilder.build(terms);
+    termsAutomaton = BasicAutomata.makeStringUnion(terms);
     
     reader = writer.getReader();
     searcher = newSearcher(reader);
@@ -97,7 +91,7 @@ public class TestTermsEnum2 extends Luce
         }
       }
 
-      Automaton alternate = DaciukMihovAutomatonBuilder.build(matchedTerms);
+      Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms);
       //System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length);
       //AutomatonTestUtil.minimizeSimple(alternate);
       //System.out.println("minmize done");
@@ -164,7 +158,7 @@ public class TestTermsEnum2 extends Luce
         found.add(BytesRef.deepCopyOf(te.term()));
       }
       
-      Automaton actual = DaciukMihovAutomatonBuilder.build(found);     
+      Automaton actual = BasicAutomata.makeStringUnion(found);     
       assertTrue(BasicOperations.sameLanguage(expected, actual));
     }
   }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java Fri Jun 29 10:02:37 2012
@@ -17,10 +17,35 @@ package org.apache.lucene.util.automaton
  * limitations under the License.
  */
 
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.UnicodeUtil;
+import java.util.*;
+
+import org.apache.lucene.util.*;
+
+import com.carrotsearch.randomizedtesting.generators.RandomInts;
+
+public class TestBasicOperations extends LuceneTestCase {
+  /** Test string union. */
+  public void testStringUnion() {
+    List<BytesRef> strings = new ArrayList<BytesRef>();
+    for (int i = RandomInts.randomIntBetween(random(), 0, 1000); --i >= 0;) {
+      strings.add(new BytesRef(_TestUtil.randomUnicodeString(random())));
+    }
+
+    Collections.sort(strings);
+    Automaton union = BasicAutomata.makeStringUnion(strings);
+    assertTrue(union.isDeterministic());
+    assertTrue(BasicOperations.sameLanguage(union, naiveUnion(strings)));
+  }
+
+  private static Automaton naiveUnion(List<BytesRef> strings) {
+    Automaton [] eachIndividual = new Automaton [strings.size()];
+    int i = 0;
+    for (BytesRef bref : strings) {
+      eachIndividual[i++] = BasicAutomata.makeString(bref.utf8ToString());
+    }
+    return BasicOperations.union(Arrays.asList(eachIndividual));
+  }
 
-public class TestBasicOperations extends LuceneTestCase { 
   /** Test optimization to concatenate() */
   public void testSingletonConcatenate() {
     Automaton singleton = BasicAutomata.makeString("prefix");