You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2023/05/25 14:18:48 UTC
[lucene] branch main updated: #12276: rename DaciukMihovAutomatonBuilder to StringsToAutomaton (#12310)
This is an automated email from the ASF dual-hosted git repository.
mikemccand pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 7da7c436384 #12276: rename DaciukMihovAutomatonBuilder to StringsToAutomaton (#12310)
7da7c436384 is described below
commit 7da7c436384e23fbaab1031e8e983fb7dd37f630
Author: Michael McCandless <mi...@apache.org>
AuthorDate: Thu May 25 10:18:41 2023 -0400
#12276: rename DaciukMihovAutomatonBuilder to StringsToAutomaton (#12310)
Closes #12276
---
lucene/CHANGES.txt | 2 ++
lucene/MIGRATE.md | 4 ++++
.../lucene/analysis/core/TestFlattenGraphFilter.java | 4 ++--
.../java/org/apache/lucene/util/automaton/Automata.java | 2 +-
...kMihovAutomatonBuilder.java => StringsToAutomaton.java} | 14 ++++++++------
.../apache/lucene/analysis/TestAutomatonToTokenStream.java | 8 ++++----
.../lucene/util/automaton/TestCompiledAutomaton.java | 2 +-
...ovAutomatonBuilder.java => TestStringsToAutomaton.java} | 6 +++---
.../search/uhighlight/MemoryIndexOffsetStrategy.java | 4 ++--
.../lucene/search/uhighlight/TestUnifiedHighlighter.java | 7 +++----
10 files changed, 30 insertions(+), 23 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b9e6b2b19ee..7fe8d40114b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -51,6 +51,8 @@ API Changes
They have already been runtime-checked to only be implemented by the specific classes
so this is effectively a non-breaking change.
+* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton
+
New Features
---------------------
diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
index e731cfd84ab..2bb6c27f563 100644
--- a/lucene/MIGRATE.md
+++ b/lucene/MIGRATE.md
@@ -89,6 +89,10 @@ for the currently-positioned document (doing so will result in undefined behavio
`IOContext.READONCE` for opening internally, as that's the only valid usage pattern for checksum input.
Callers should remove the parameter when calling this method.
+
+### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton
+
+
## Migration from Lucene 9.0 to Lucene 9.1
### Test framework package migration and module (LUCENE-10301)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
index 7b35f56016c..13d3dd378e7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
@@ -41,8 +41,8 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.apache.lucene.util.automaton.Transition;
public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
@@ -780,7 +780,7 @@ public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
acceptStrings.sort(Comparator.naturalOrder());
acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
- Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
+ Automaton nonFlattenedAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
index 9a642338f09..26afdc45564 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -573,7 +573,7 @@ public final class Automata {
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
- return DaciukMihovAutomatonBuilder.build(utf8Strings);
+ return StringsToAutomaton.build(utf8Strings);
}
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
similarity index 95%
rename from lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
rename to lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index 2fe13101168..c85e2648703 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -27,13 +27,15 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
/**
- * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm
- * requires sorted input data, but is very fast (nearly linear with the input size).
+ * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings using the
+ * algorithm described in <a href="https://aclanthology.org/J00-1002.pdf">Incremental Construction
+ * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson</a>. This requires
+ * sorted input data, but is very fast (nearly linear with the input size).
*
* @see #build(Collection)
* @see Automata#makeStringUnion(Collection)
*/
-public final class DaciukMihovAutomatonBuilder {
+public final class StringsToAutomaton {
/**
* This builder rejects terms that are more than 1k chars long since it then uses recursion based
@@ -42,7 +44,7 @@ public final class DaciukMihovAutomatonBuilder {
public static final int MAX_TERM_LENGTH = 1_000;
/** The default constructor is private. Use static methods directly. */
- private DaciukMihovAutomatonBuilder() {
+ private StringsToAutomaton() {
super();
}
@@ -256,7 +258,7 @@ public final class DaciukMihovAutomatonBuilder {
visited.put(s, converted);
int i = 0;
int[] labels = s.labels;
- for (DaciukMihovAutomatonBuilder.State target : s.states) {
+ for (StringsToAutomaton.State target : s.states) {
a.addTransition(converted, convert(a, target, visited), labels[i++]);
}
@@ -268,7 +270,7 @@ public final class DaciukMihovAutomatonBuilder {
* strings in UTF-8. These strings must be binary-sorted.
*/
public static Automaton build(Collection<BytesRef> input) {
- final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
+ final StringsToAutomaton builder = new StringsToAutomaton();
CharsRefBuilder current = new CharsRefBuilder();
for (BytesRef b : input) {
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java
index 27d2c72ddb0..17157c48bdc 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java
@@ -23,7 +23,7 @@ import java.util.List;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
@@ -31,7 +31,7 @@ public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
List<BytesRef> acceptStrings = new ArrayList<>();
acceptStrings.add(new BytesRef("abc"));
- Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
+ Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
@@ -48,7 +48,7 @@ public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
acceptStrings.add(new BytesRef("123"));
acceptStrings.add(new BytesRef("abc"));
- Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
+ Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
@@ -65,7 +65,7 @@ public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
acceptStrings.add(new BytesRef("ab3"));
acceptStrings.add(new BytesRef("abc"));
- Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
+ Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
index 432c281b184..cb69e744b54 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
@@ -35,7 +35,7 @@ public class TestCompiledAutomaton extends LuceneTestCase {
terms.add(new BytesRef(s));
}
Collections.sort(terms);
- final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
+ final Automaton a = StringsToAutomaton.build(terms);
return new CompiledAutomaton(a, true, false, false);
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
similarity index 84%
rename from lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
rename to lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index 4b31eda98c6..a251ae74880 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -22,7 +22,7 @@ import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
-public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {
+public class TestStringsToAutomaton extends LuceneTestCase {
public void testLargeTerms() {
byte[] b10k = new byte[10_000];
@@ -30,12 +30,12 @@ public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {
IllegalArgumentException e =
expectThrows(
IllegalArgumentException.class,
- () -> DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b10k))));
+ () -> StringsToAutomaton.build(Collections.singleton(new BytesRef(b10k))));
assertTrue(
e.getMessage()
.startsWith("This builder doesn't allow terms that are larger than 1,000 characters"));
byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
- DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b1k))); // no exception
+ StringsToAutomaton.build(Collections.singleton(new BytesRef(b1k))); // no exception
}
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
index 7237be57807..ad29e2de8a3 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -29,7 +29,7 @@ import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
/**
* Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
@@ -66,7 +66,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
// to build an automaton on them
List<BytesRef> filteredTerms =
Arrays.stream(components.getTerms())
- .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
+ .filter(b -> b.length < StringsToAutomaton.MAX_TERM_LENGTH)
.toList();
allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
}
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
index 6cc524a9fac..821b9308335 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@@ -58,7 +58,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.junit.After;
import org.junit.Before;
@@ -1671,12 +1671,11 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
Query query =
new BooleanQuery.Builder()
.add(
- new TermQuery(
- new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
+ new TermQuery(new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH))),
BooleanClause.Occur.SHOULD)
.add(
new TermQuery(
- new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
+ new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH + 1))),
BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
.build();