You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2023/05/25 14:18:48 UTC

[lucene] branch main updated: #12276: rename DaciukMihovAutomatonBuilder to StringsToAutomaton (#12310)

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 7da7c436384 #12276: rename DaciukMihovAutomatonBuilder to StringsToAutomaton (#12310)
7da7c436384 is described below

commit 7da7c436384e23fbaab1031e8e983fb7dd37f630
Author: Michael McCandless <mi...@apache.org>
AuthorDate: Thu May 25 10:18:41 2023 -0400

    #12276: rename DaciukMihovAutomatonBuilder to StringsToAutomaton (#12310)
    
    Closes #12276
---
 lucene/CHANGES.txt                                         |  2 ++
 lucene/MIGRATE.md                                          |  4 ++++
 .../lucene/analysis/core/TestFlattenGraphFilter.java       |  4 ++--
 .../java/org/apache/lucene/util/automaton/Automata.java    |  2 +-
 ...kMihovAutomatonBuilder.java => StringsToAutomaton.java} | 14 ++++++++------
 .../apache/lucene/analysis/TestAutomatonToTokenStream.java |  8 ++++----
 .../lucene/util/automaton/TestCompiledAutomaton.java       |  2 +-
 ...ovAutomatonBuilder.java => TestStringsToAutomaton.java} |  6 +++---
 .../search/uhighlight/MemoryIndexOffsetStrategy.java       |  4 ++--
 .../lucene/search/uhighlight/TestUnifiedHighlighter.java   |  7 +++----
 10 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b9e6b2b19ee..7fe8d40114b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -51,6 +51,8 @@ API Changes
   They have already been runtime-checked to only be implemented by the specific classes
   so this is effectively a non-breaking change.
 
+* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton
+
 New Features
 ---------------------
 
diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
index e731cfd84ab..2bb6c27f563 100644
--- a/lucene/MIGRATE.md
+++ b/lucene/MIGRATE.md
@@ -89,6 +89,10 @@ for the currently-positioned document (doing so will result in undefined behavio
 `IOContext.READONCE` for opening internally, as that's the only valid usage pattern for checksum input.
 Callers should remove the parameter when calling this method.
 
+
+### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton
+
+
 ## Migration from Lucene 9.0 to Lucene 9.1
 
 ### Test framework package migration and module (LUCENE-10301)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
index 7b35f56016c..13d3dd378e7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
@@ -41,8 +41,8 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
 import org.apache.lucene.util.automaton.Transition;
 
 public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
@@ -780,7 +780,7 @@ public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
     acceptStrings.sort(Comparator.naturalOrder());
 
     acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
-    Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
+    Automaton nonFlattenedAutomaton = StringsToAutomaton.build(acceptStrings);
 
     TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
     TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
index 9a642338f09..26afdc45564 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -573,7 +573,7 @@ public final class Automata {
     if (utf8Strings.isEmpty()) {
       return makeEmpty();
     } else {
-      return DaciukMihovAutomatonBuilder.build(utf8Strings);
+      return StringsToAutomaton.build(utf8Strings);
     }
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
similarity index 95%
rename from lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
rename to lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index 2fe13101168..c85e2648703 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -27,13 +27,15 @@ import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
 
 /**
- * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm
- * requires sorted input data, but is very fast (nearly linear with the input size).
+ * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings using the
+ * algorithm described in <a href="https://aclanthology.org/J00-1002.pdf">Incremental Construction
+ * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson</a>. This requires
+ * sorted input data, but is very fast (nearly linear with the input size).
  *
  * @see #build(Collection)
  * @see Automata#makeStringUnion(Collection)
  */
-public final class DaciukMihovAutomatonBuilder {
+public final class StringsToAutomaton {
 
   /**
    * This builder rejects terms that are more than 1k chars long since it then uses recursion based
@@ -42,7 +44,7 @@ public final class DaciukMihovAutomatonBuilder {
   public static final int MAX_TERM_LENGTH = 1_000;
 
   /** The default constructor is private. Use static methods directly. */
-  private DaciukMihovAutomatonBuilder() {
+  private StringsToAutomaton() {
     super();
   }
 
@@ -256,7 +258,7 @@ public final class DaciukMihovAutomatonBuilder {
     visited.put(s, converted);
     int i = 0;
     int[] labels = s.labels;
-    for (DaciukMihovAutomatonBuilder.State target : s.states) {
+    for (StringsToAutomaton.State target : s.states) {
       a.addTransition(converted, convert(a, target, visited), labels[i++]);
     }
 
@@ -268,7 +270,7 @@ public final class DaciukMihovAutomatonBuilder {
    * strings in UTF-8. These strings must be binary-sorted.
    */
   public static Automaton build(Collection<BytesRef> input) {
-    final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
+    final StringsToAutomaton builder = new StringsToAutomaton();
 
     CharsRefBuilder current = new CharsRefBuilder();
     for (BytesRef b : input) {
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java
index 27d2c72ddb0..17157c48bdc 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java
@@ -23,7 +23,7 @@ import java.util.List;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
 
 public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
 
@@ -31,7 +31,7 @@ public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
     List<BytesRef> acceptStrings = new ArrayList<>();
     acceptStrings.add(new BytesRef("abc"));
 
-    Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
+    Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
     TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
     assertTokenStreamContents(
         ts,
@@ -48,7 +48,7 @@ public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
     acceptStrings.add(new BytesRef("123"));
     acceptStrings.add(new BytesRef("abc"));
 
-    Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
+    Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
     TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
     assertTokenStreamContents(
         ts,
@@ -65,7 +65,7 @@ public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
     acceptStrings.add(new BytesRef("ab3"));
     acceptStrings.add(new BytesRef("abc"));
 
-    Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
+    Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
     TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
     assertTokenStreamContents(
         ts,
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
index 432c281b184..cb69e744b54 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
@@ -35,7 +35,7 @@ public class TestCompiledAutomaton extends LuceneTestCase {
       terms.add(new BytesRef(s));
     }
     Collections.sort(terms);
-    final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
+    final Automaton a = StringsToAutomaton.build(terms);
     return new CompiledAutomaton(a, true, false, false);
   }
 
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
similarity index 84%
rename from lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
rename to lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index 4b31eda98c6..a251ae74880 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -22,7 +22,7 @@ import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
-public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {
+public class TestStringsToAutomaton extends LuceneTestCase {
 
   public void testLargeTerms() {
     byte[] b10k = new byte[10_000];
@@ -30,12 +30,12 @@ public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {
     IllegalArgumentException e =
         expectThrows(
             IllegalArgumentException.class,
-            () -> DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b10k))));
+            () -> StringsToAutomaton.build(Collections.singleton(new BytesRef(b10k))));
     assertTrue(
         e.getMessage()
             .startsWith("This builder doesn't allow terms that are larger than 1,000 characters"));
 
     byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
-    DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b1k))); // no exception
+    StringsToAutomaton.build(Collections.singleton(new BytesRef(b1k))); // no exception
   }
 }
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
index 7237be57807..ad29e2de8a3 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -29,7 +29,7 @@ import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.spans.SpanQuery;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
 
 /**
  * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
@@ -66,7 +66,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
       // to build an automaton on them
       List<BytesRef> filteredTerms =
           Arrays.stream(components.getTerms())
-              .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
+              .filter(b -> b.length < StringsToAutomaton.MAX_TERM_LENGTH)
               .toList();
       allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
     }
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
index 6cc524a9fac..821b9308335 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@@ -58,7 +58,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.util.LuceneTestCase;
-import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
 import org.junit.After;
 import org.junit.Before;
 
@@ -1671,12 +1671,11 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
     Query query =
         new BooleanQuery.Builder()
             .add(
-                new TermQuery(
-                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
+                new TermQuery(new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH))),
                 BooleanClause.Occur.SHOULD)
             .add(
                 new TermQuery(
-                    new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
+                    new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH + 1))),
                 BooleanClause.Occur.SHOULD)
             .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
             .build();