You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2012/06/29 12:02:39 UTC
svn commit: r1355302 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/util/automaton/
core/src/test/org/apache/lucene/index/
core/src/test/org/apache/lucene/util/automaton/
test-framework/src/java/org/apache/lucene/util/automaton/
Author: dweiss
Date: Fri Jun 29 10:02:37 2012
New Revision: 1355302
URL: http://svn.apache.org/viewvc?rev=1355302&view=rev
Log:
LUCENE-3832: Port BasicAutomata.stringUnion from Brics to Lucene.
Added:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
- copied, changed from r1355276, lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
Removed:
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Jun 29 10:02:37 2012
@@ -9,6 +9,12 @@ http://s.apache.org/luceneversions
======================= Lucene 4.0.0-BETA =======================
+New features
+
+* LUCENE-3832: Added BasicAutomata.makeStringUnion method to efficiently
+ create automata from a fixed collection of UTF-8 encoded BytesRef
+ (Dawid Weiss, Robert Muir)
+
API Changes
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java Fri Jun 29 10:02:37 2012
@@ -29,8 +29,9 @@
package org.apache.lucene.util.automaton;
-import java.util.ArrayList;
-import java.util.Collection;
+import java.util.*;
+
+import org.apache.lucene.util.BytesRef;
/**
* Construction of basic automata.
@@ -239,4 +240,25 @@ final public class BasicAutomata {
a.deterministic = true;
return a;
}
+
+ /**
+ * Returns a new (deterministic and minimal) automaton that accepts the union
+ * of the given collection of {@link BytesRef}s representing UTF-8 encoded
+ * strings.
+ *
+ * @param utf8Strings
+ * The input strings, UTF-8 encoded. The collection must be in sorted
+ * order.
+ *
+ * @return An {@link Automaton} accepting all input strings. The resulting
+ * automaton is codepoint based (full unicode codepoints on
+ * transitions).
+ */
+ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
+ if (utf8Strings.isEmpty()) {
+ return makeEmpty();
+ } else {
+ return DaciukMihovAutomatonBuilder.build(utf8Strings);
+ }
+ }
}
Copied: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java (from r1355276, lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java?p2=lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java&p1=lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java&r1=1355276&r2=1355302&rev=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java Fri Jun 29 10:02:37 2012
@@ -24,15 +24,18 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
/**
- * Builds a minimal deterministic automaton that accepts a set of strings. The
- * algorithm requires sorted input data, but is very fast (nearly linear with
- * the input size).
+ * Builds a minimal, deterministic {@link Automaton} that accepts a set of
+ * strings. The algorithm requires sorted input data, but is very fast
+ * (nearly linear with the input size).
+ *
+ * @see #build(Collection)
+ * @see BasicAutomata#makeStringUnion(Collection)
*/
-public final class DaciukMihovAutomatonBuilder {
+final class DaciukMihovAutomatonBuilder {
/**
* DFSA state with <code>char</code> labels on transitions.
*/
- public final static class State {
+ private final static class State {
/** An empty set of labels. */
private final static int[] NO_LABELS = new int[0];
@@ -63,29 +66,12 @@ public final class DaciukMihovAutomatonB
* with <code>label</code>. If no such transition exists, returns
* <code>null</code>.
*/
- public State getState(int label) {
+ State getState(int label) {
final int index = Arrays.binarySearch(labels, label);
return index >= 0 ? states[index] : null;
}
/**
- * Returns an array of outgoing transition labels. The array is sorted in
- * lexicographic order and indexes correspond to states returned from
- * {@link #getStates()}.
- */
- public int[] getTransitionLabels() {
- return this.labels;
- }
-
- /**
- * Returns an array of outgoing transitions from this state. The returned
- * array must not be changed.
- */
- public State[] getStates() {
- return this.states;
- }
-
- /**
* Two states are equal if:
* <ul>
* <li>they have an identical number of outgoing transitions, labeled with
@@ -103,21 +89,6 @@ public final class DaciukMihovAutomatonB
}
/**
- * Return <code>true</code> if this state has any children (outgoing
- * transitions).
- */
- public boolean hasChildren() {
- return labels.length > 0;
- }
-
- /**
- * Is this state a final state in the automaton?
- */
- public boolean isFinal() {
- return is_final;
- }
-
- /**
* Compute the hash code of the <i>current</i> status of this state.
*/
@Override
@@ -142,6 +113,14 @@ public final class DaciukMihovAutomatonB
}
/**
+ * Return <code>true</code> if this state has any children (outgoing
+ * transitions).
+ */
+ boolean hasChildren() {
+ return labels.length > 0;
+ }
+
+ /**
* Create a new outgoing transition labeled <code>label</code> and return
* the newly created target state for this transition.
*/
@@ -149,9 +128,9 @@ public final class DaciukMihovAutomatonB
assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: "
+ label;
- labels = copyOf(labels, labels.length + 1);
- states = copyOf(states, states.length + 1);
-
+ labels = Arrays.copyOf(labels, labels.length + 1);
+ states = Arrays.copyOf(states, states.length + 1);
+
labels[labels.length - 1] = label;
return states[states.length - 1] = new State();
}
@@ -188,42 +167,27 @@ public final class DaciukMihovAutomatonB
}
/**
- * JDK1.5-replacement of {@link Arrays#copyOf(int[], int)}
- */
- private static int[] copyOf(int[] original, int newLength) {
- int[] copy = new int[newLength];
- System.arraycopy(original, 0, copy, 0,
- Math.min(original.length, newLength));
- return copy;
- }
-
- /**
- * JDK1.5-replacement of {@link Arrays#copyOf(char[], int)}
- */
- public static State[] copyOf(State[] original, int newLength) {
- State[] copy = new State[newLength];
- System.arraycopy(original, 0, copy, 0,
- Math.min(original.length, newLength));
- return copy;
- }
-
- /**
* Compare two lists of objects for reference-equality.
*/
private static boolean referenceEquals(Object[] a1, Object[] a2) {
- if (a1.length != a2.length) return false;
-
- for (int i = 0; i < a1.length; i++)
- if (a1[i] != a2[i]) return false;
-
+ if (a1.length != a2.length) {
+ return false;
+ }
+
+ for (int i = 0; i < a1.length; i++) {
+ if (a1[i] != a2[i]) {
+ return false;
+ }
+ }
+
return true;
}
}
/**
- * "register" for state interning.
+ * A "registry" for state interning.
*/
- private HashMap<State,State> register = new HashMap<State,State>();
+ private HashMap<State,State> stateRegistry = new HashMap<State,State>();
/**
* Root automaton state.
@@ -231,10 +195,14 @@ public final class DaciukMihovAutomatonB
private State root = new State();
/**
- * Previous sequence added to the automaton in {@link #add(CharSequence)}.
+ * Previous sequence added to the automaton in {@link #add(CharsRef)}.
*/
private CharsRef previous;
-
+
+ /**
+ * A comparator used for enforcing sorted UTF8 order, used in assertions only.
+ */
+ @SuppressWarnings("deprecation")
private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
/**
@@ -243,12 +211,12 @@ public final class DaciukMihovAutomatonB
* to this automaton (the input must be sorted).
*/
public void add(CharsRef current) {
- assert register != null : "Automaton already built.";
+ assert stateRegistry != null : "Automaton already built.";
assert previous == null
- || comparator.compare(previous, current) <= 0 : "Input must be sorted: "
+ || comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "
+ previous + " >= " + current;
assert setPrevious(current);
-
+
// Descend in the automaton (find matching prefix).
int pos = 0, max = current.length();
State next, state = root;
@@ -270,11 +238,11 @@ public final class DaciukMihovAutomatonB
* @return Root automaton state.
*/
public State complete() {
- if (this.register == null) throw new IllegalStateException();
+ if (this.stateRegistry == null) throw new IllegalStateException();
if (root.hasChildren()) replaceOrRegister(root);
- register = null;
+ stateRegistry = null;
return root;
}
@@ -293,15 +261,16 @@ public final class DaciukMihovAutomatonB
int i = 0;
int[] labels = s.labels;
for (DaciukMihovAutomatonBuilder.State target : s.states) {
- converted.addTransition(new Transition(labels[i++], convert(target,
- visited)));
+ converted.addTransition(
+ new Transition(labels[i++], convert(target, visited)));
}
return converted;
}
-
+
/**
- * Build a minimal, deterministic automaton from a sorted list of strings.
+ * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
+ * strings in UTF-8. These strings must be binary-sorted.
*/
public static Automaton build(Collection<BytesRef> input) {
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
@@ -313,7 +282,9 @@ public final class DaciukMihovAutomatonB
}
Automaton a = new Automaton();
- a.initial = convert(builder.complete(), new IdentityHashMap<State,org.apache.lucene.util.automaton.State>());
+ a.initial = convert(
+ builder.complete(),
+ new IdentityHashMap<State,org.apache.lucene.util.automaton.State>());
a.deterministic = true;
return a;
}
@@ -330,21 +301,21 @@ public final class DaciukMihovAutomatonB
/**
* Replace last child of <code>state</code> with an already registered state
- * or register the last child state.
+ * or stateRegistry the last child state.
*/
private void replaceOrRegister(State state) {
final State child = state.lastChild();
if (child.hasChildren()) replaceOrRegister(child);
- final State registered = register.get(child);
+ final State registered = stateRegistry.get(child);
if (registered != null) {
state.replaceLastChild(registered);
} else {
- register.put(child, child);
+ stateRegistry.put(child, child);
}
}
-
+
/**
* Add a suffix of <code>current</code> starting at <code>fromIndex</code>
* (inclusive) to state <code>state</code>.
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java Fri Jun 29 10:02:37 2012
@@ -35,7 +35,6 @@ import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.RegExp;
@SuppressCodecs({ "SimpleText", "Memory" })
@@ -257,7 +256,7 @@ public class TestTermsEnum extends Lucen
acceptTerms.add(s2);
sortedAcceptTerms.add(new BytesRef(s2));
}
- a = DaciukMihovAutomatonBuilder.build(sortedAcceptTerms);
+ a = BasicAutomata.makeStringUnion(sortedAcceptTerms);
}
if (random().nextBoolean()) {
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java Fri Jun 29 10:02:37 2012
@@ -35,13 +35,7 @@ import org.apache.lucene.store.Directory
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.AutomatonTestUtil;
-import org.apache.lucene.util.automaton.BasicOperations;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
-import org.apache.lucene.util.automaton.RegExp;
-import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.automaton.*;
public class TestTermsEnum2 extends LuceneTestCase {
private Directory dir;
@@ -72,7 +66,7 @@ public class TestTermsEnum2 extends Luce
writer.addDocument(doc);
}
- termsAutomaton = DaciukMihovAutomatonBuilder.build(terms);
+ termsAutomaton = BasicAutomata.makeStringUnion(terms);
reader = writer.getReader();
searcher = newSearcher(reader);
@@ -97,7 +91,7 @@ public class TestTermsEnum2 extends Luce
}
}
- Automaton alternate = DaciukMihovAutomatonBuilder.build(matchedTerms);
+ Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms);
//System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length);
//AutomatonTestUtil.minimizeSimple(alternate);
//System.out.println("minmize done");
@@ -164,7 +158,7 @@ public class TestTermsEnum2 extends Luce
found.add(BytesRef.deepCopyOf(te.term()));
}
- Automaton actual = DaciukMihovAutomatonBuilder.build(found);
+ Automaton actual = BasicAutomata.makeStringUnion(found);
assertTrue(BasicOperations.sameLanguage(expected, actual));
}
}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java?rev=1355302&r1=1355301&r2=1355302&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java Fri Jun 29 10:02:37 2012
@@ -17,10 +17,35 @@ package org.apache.lucene.util.automaton
* limitations under the License.
*/
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.UnicodeUtil;
+import java.util.*;
+
+import org.apache.lucene.util.*;
+
+import com.carrotsearch.randomizedtesting.generators.RandomInts;
+
+public class TestBasicOperations extends LuceneTestCase {
+ /** Test string union. */
+ public void testStringUnion() {
+ List<BytesRef> strings = new ArrayList<BytesRef>();
+ for (int i = RandomInts.randomIntBetween(random(), 0, 1000); --i >= 0;) {
+ strings.add(new BytesRef(_TestUtil.randomUnicodeString(random())));
+ }
+
+ Collections.sort(strings);
+ Automaton union = BasicAutomata.makeStringUnion(strings);
+ assertTrue(union.isDeterministic());
+ assertTrue(BasicOperations.sameLanguage(union, naiveUnion(strings)));
+ }
+
+ private static Automaton naiveUnion(List<BytesRef> strings) {
+ Automaton [] eachIndividual = new Automaton [strings.size()];
+ int i = 0;
+ for (BytesRef bref : strings) {
+ eachIndividual[i++] = BasicAutomata.makeString(bref.utf8ToString());
+ }
+ return BasicOperations.union(Arrays.asList(eachIndividual));
+ }
-public class TestBasicOperations extends LuceneTestCase {
/** Test optimization to concatenate() */
public void testSingletonConcatenate() {
Automaton singleton = BasicAutomata.makeString("prefix");