You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by br...@apache.org on 2020/02/28 14:17:02 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9237: Faster UniformSplit intersect TermsEnum.

This is an automated email from the ASF dual-hosted git repository.

broustant pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 7302eff  LUCENE-9237: Faster UniformSplit intersect TermsEnum.
7302eff is described below

commit 7302effd9c6d7dde203992df428f0c8d2389bfb3
Author: Bruno Roustant <br...@salesforce.com>
AuthorDate: Fri Feb 28 11:17:24 2020 +0100

    LUCENE-9237: Faster UniformSplit intersect TermsEnum.
---
 lucene/CHANGES.txt                                 |   2 +
 .../lucene/codecs/uniformsplit/BlockReader.java    |   6 +-
 .../lucene/codecs/uniformsplit/FSTDictionary.java  |  75 +--
 .../codecs/uniformsplit/IndexDictionary.java       |  25 -
 .../codecs/uniformsplit/IntersectBlockReader.java  | 539 +++++++++++----------
 .../uniformsplit/sharedterms/STBlockReader.java    |  13 +-
 .../sharedterms/STIntersectBlockReader.java        |  25 +-
 .../sharedterms/STUniformSplitTerms.java           |   1 +
 8 files changed, 315 insertions(+), 371 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c21bea3..c6bb656 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -84,6 +84,8 @@ Improvements
 
 * LUCENE-9245: Reduce AutomatonTermsEnum memory usage. (Bruno Roustant, Robert Muir)
 
+* LUCENE-9237: Faster UniformSplit intersect TermsEnum. (Bruno Roustant)
+
 Optimizations
 ---------------------
 
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockReader.java
index 8d4bfc0..f510c5f 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockReader.java
@@ -129,6 +129,7 @@ public class BlockReader extends BaseTermsEnum implements Accountable {
   // Scratch objects to avoid object reallocation.
   protected BytesRef scratchBlockBytes;
   protected final BlockTermState scratchTermState;
+  protected BlockLine scratchBlockLine;
 
   /**
    * @param dictionaryBrowserSupplier to load the {@link IndexDictionary.Browser}
@@ -300,7 +301,7 @@ public class BlockReader extends BaseTermsEnum implements Accountable {
     }
     boolean isIncrementalEncodingSeed = lineIndexInBlock == 0 || lineIndexInBlock == blockHeader.getMiddleLineIndex();
     lineIndexInBlock++;
-    return blockLine = blockLineReader.readLine(blockReadBuffer, isIncrementalEncodingSeed, blockLine);
+    return blockLine = blockLineReader.readLine(blockReadBuffer, isIncrementalEncodingSeed, scratchBlockLine);
   }
 
   /**
@@ -413,6 +414,7 @@ public class BlockReader extends BaseTermsEnum implements Accountable {
       termStatesReadBuffer = new ByteArrayDataInput();
       termStateSerializer = createDeltaBaseTermStateSerializer();
       scratchBlockBytes = new BytesRef();
+      scratchBlockLine = new BlockLine(new TermBytes(0, scratchBlockBytes), 0);
     }
   }
 
@@ -559,7 +561,7 @@ public class BlockReader extends BaseTermsEnum implements Accountable {
     termStateForced = false;
   }
 
-  private CorruptIndexException newCorruptIndexException(String msg, Long fp) {
+  protected CorruptIndexException newCorruptIndexException(String msg, Long fp) {
     return new CorruptIndexException(msg
         + (fp == null ? "" : " at FP " + fp)
         + " for field \"" + fieldMetadata.getFieldInfo().name + "\"", blockInput);
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
index a9fd218..136955a 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
@@ -25,10 +25,8 @@ import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
@@ -112,81 +110,10 @@ public class FSTDictionary implements IndexDictionary {
 
     protected final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(dictionary);
 
-    protected static final int STATE_SEEK = 0, STATE_NEXT = 1, STATE_END = 2;
-    protected int state = STATE_SEEK;
-
-    //  Note: key and pointer are one position prior to the current fstEnum position,
-    //   since we need need the fstEnum to be one ahead to calculate the prefix.
-    protected final BytesRefBuilder keyBuilder = new BytesRefBuilder();
-    protected int blockPrefixLen = 0;
-    protected long blockFilePointer = -1;
-
     @Override
     public long seekBlock(BytesRef term) throws IOException {
-      state = STATE_SEEK;
       BytesRefFSTEnum.InputOutput<Long> seekFloor = fstEnum.seekFloor(term);
-      if (seekFloor == null) {
-        blockFilePointer = -1;
-      } else {
-        blockFilePointer = seekFloor.output;
-      }
-      return blockFilePointer;
-    }
-
-    @Override
-    public BytesRef nextKey() throws IOException {
-      if (state == STATE_END) {
-        // if fstEnum is at end, then that's it.
-        return null;
-      }
-
-      if (state == STATE_SEEK && blockFilePointer == -1) { // see seekBlock
-        if (fstEnum.next() == null) { // advance.
-          state = STATE_END; // probably never happens (empty FST)?  We code defensively.
-          return null;
-        }
-      }
-      keyBuilder.copyBytes(fstEnum.current().input);
-      blockFilePointer = fstEnum.current().output;
-      assert blockFilePointer >= 0;
-
-      state = STATE_NEXT;
-
-      BytesRef key = keyBuilder.get();
-
-      // advance fstEnum
-      BytesRefFSTEnum.InputOutput<Long> inputOutput = fstEnum.next();
-
-      // calc common prefix
-      if (inputOutput == null) {
-        state = STATE_END; // for *next* call; current state is good
-        blockPrefixLen = 0;
-      } else {
-        int sortKeyLength = StringHelper.sortKeyLength(key, inputOutput.input);
-        assert sortKeyLength >= 1;
-        blockPrefixLen = sortKeyLength - 1;
-      }
-      return key;
-    }
-
-    @Override
-    public BytesRef peekKey() {
-      assert state != STATE_SEEK;
-      return (state == STATE_END) ? null : fstEnum.current().input;
-    }
-
-    @Override
-    public int getBlockPrefixLen() {
-      assert state != STATE_SEEK;
-      assert blockPrefixLen >= 0;
-      return blockPrefixLen;
-    }
-
-    @Override
-    public long getBlockFilePointer() {
-      assert state != STATE_SEEK;
-      assert blockFilePointer >= 0;
-      return blockFilePointer;
+      return seekFloor == null ? -1 : seekFloor.output;
     }
   }
 
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IndexDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IndexDictionary.java
index 60a3405..e597486 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IndexDictionary.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IndexDictionary.java
@@ -102,31 +102,6 @@ public interface IndexDictionary extends Accountable {
      * term precedes alphabetically the first block key of the dictionary.
      */
     long seekBlock(BytesRef term) throws IOException;
-
-    /**
-     * Returns the next block key and positions the browser at this key.
-     * A key is a prefix of a term in the dictionary.
-     * If seekBlock was just called then this is the current block key.
-     */
-    BytesRef nextKey() throws IOException;
-
-    /**
-     * Returns the next key without advancing.
-     * Only call this after {@link #nextKey()} returns a non-null result.
-     */
-    BytesRef peekKey() throws IOException;
-
-    /**
-     * Returns the number of characters of this block's key that is in common with all terms in this block.
-     * Only call this after {@link #nextKey()} returns a non-null result.
-     */
-    int getBlockPrefixLen() throws IOException;
-
-    /**
-     * Returns the block file pointer associated with the key returned.
-     * Only call this after {@link #nextKey()} returns a non-null result.
-     */
-    long getBlockFilePointer() throws IOException;
   }
 
   /**
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java
index 0322ebf..ab4af86 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/IntersectBlockReader.java
@@ -18,260 +18,308 @@
 package org.apache.lucene.codecs.uniformsplit;
 
 import java.io.IOException;
-import java.util.Objects;
+import java.util.Arrays;
 
 import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.ByteRunAutomaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.Transition;
 
 /**
  * The "intersect" {@link TermsEnum} response to {@link UniformSplitTerms#intersect(CompiledAutomaton, BytesRef)},
  * intersecting the terms with an automaton.
+ * <p>
+ * By design of the UniformSplit block keys, it is less efficient than
+ * {@code org.apache.lucene.codecs.blocktree.IntersectTermsEnum} for {@link org.apache.lucene.search.FuzzyQuery} (-37%).
+ * It is slightly slower for {@link org.apache.lucene.search.WildcardQuery} (-5%) and slightly faster for
+ * {@link org.apache.lucene.search.PrefixQuery} (+5%).
+ *
+ * @lucene.experimental
  */
 public class IntersectBlockReader extends BlockReader {
 
-  protected final AutomatonNextTermCalculator nextStringCalculator;
-  protected final ByteRunAutomaton runAutomaton;
-  protected final BytesRef commonSuffixRef; // maybe null
-  protected final BytesRef commonPrefixRef;
-  protected final BytesRef startTerm; // maybe null
+  /**
+   * Block iteration order. Whether to move next block, jump to a block away, or end the iteration.
+   */
+  protected enum BlockIteration {NEXT, SEEK, END}
 
-  /** Set this when our current mode is seeking to this term.  Set to null after. */
-  protected BytesRef seekTerm;
+  /**
+   * Threshold that controls when to attempt to jump to a block away.
+   * <p>
+   * This counter is 0 when entering a block. It is incremented each time a term is rejected by the automaton.
+   * When the counter is greater than or equal to this threshold, then we compute the next term accepted by
+   * the automaton, with {@link AutomatonNextTermCalculator}, and we jump to a block away if the next term
+   * accepted is greater than the immediate next term in the block.
+   * <p>
+   * A low value, for example 1, improves the performance of automatons requiring many jumps, for example
+   * {@link org.apache.lucene.search.FuzzyQuery} and most {@link org.apache.lucene.search.WildcardQuery}.
+   * A higher value improves the performance of automatons with less or no jump, for example
+   * {@link org.apache.lucene.search.PrefixQuery}.
+   * A threshold of 4 seems to be a good balance.
+   */
+  protected final int NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD = 4;
 
-  protected int blockPrefixRunAutomatonState;
-  protected int blockPrefixLen;
+  protected final Automaton automaton;
+  protected final ByteRunAutomaton runAutomaton;
+  protected final boolean finite;
+  protected final BytesRef commonSuffix; // maybe null
+  protected final int minTermLength;
+  protected final AutomatonNextTermCalculator nextStringCalculator;
 
   /**
-   * Number of bytes accepted by the last call to {@link #runAutomatonForState}.
+   * Set this when our current mode is seeking to this term.  Set to null after.
+   */
+  protected BytesRef seekTerm;
+  /**
+   * Number of bytes accepted by the automaton when validating the current term.
+   */
+  protected int numMatchedBytes;
+  /**
+   * Automaton states reached when validating the current term, from 0 to {@link #numMatchedBytes} - 1.
    */
-  protected int numBytesAccepted;
+  protected int[] states;
   /**
-   * Whether the current term is beyond the automaton common prefix.
-   * If true this means the enumeration should stop immediately.
+   * Block iteration order determined when scanning the terms in the current block.
    */
-  protected boolean beyondCommonPrefix;
+  protected BlockIteration blockIteration;
+  /**
+   * Counter of the number of consecutively rejected terms.
+   * Depending on {@link #NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD}, this may trigger a jump to a block away.
+   */
+  protected int numConsecutivelyRejectedTerms;
 
-  public IntersectBlockReader(CompiledAutomaton compiled, BytesRef startTerm,
-                              IndexDictionary.BrowserSupplier dictionaryBrowserSupplier, IndexInput blockInput, PostingsReaderBase postingsReader,
-                              FieldMetadata fieldMetadata, BlockDecoder blockDecoder) throws IOException {
+  protected IntersectBlockReader(CompiledAutomaton compiled, BytesRef startTerm,
+                                 IndexDictionary.BrowserSupplier dictionaryBrowserSupplier, IndexInput blockInput,
+                                 PostingsReaderBase postingsReader, FieldMetadata fieldMetadata,
+                                 BlockDecoder blockDecoder) throws IOException {
     super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder);
-    this.nextStringCalculator = new AutomatonNextTermCalculator(compiled);
-    Automaton automaton = Objects.requireNonNull(compiled.automaton);
-    this.runAutomaton = Objects.requireNonNull(compiled.runAutomaton);
-    this.commonSuffixRef = compiled.commonSuffixRef; // maybe null
-    this.commonPrefixRef = Operations.getCommonPrefixBytesRef(automaton); // never null
-
-    this.startTerm = startTerm;
-    assert startTerm == null || StringHelper.startsWith(startTerm, commonPrefixRef);
-    // it is thus also true that startTerm >= commonPrefixRef
+    automaton = compiled.automaton;
+    runAutomaton = compiled.runAutomaton;
+    finite = compiled.finite;
+    commonSuffix = compiled.commonSuffixRef;
+    minTermLength = getMinTermLength();
+    nextStringCalculator = new AutomatonNextTermCalculator(compiled);
+    seekTerm = startTerm;
+  }
 
-    this.seekTerm = startTerm != null ? startTerm : commonPrefixRef;
+  /**
+   * Computes the minimal length of the terms accepted by the automaton.
+   * This speeds up the term scanning for automatons accepting a finite language.
+   */
+  protected int getMinTermLength() {
+    // Automatons accepting infinite language (e.g. PrefixQuery and WildcardQuery) do not benefit much from
+    // min term length while it takes time to compute it. More precisely, by skipping this computation PrefixQuery
+    // is significantly boosted while WildcardQuery might be slightly degraded on average. This min term length
+    // mainly boosts FuzzyQuery.
+    int commonSuffixLength = commonSuffix == null ? 0 : commonSuffix.length;
+    if (!finite) {
+      return commonSuffixLength;
+    }
+    // Since we are only dealing with finite language, there is no loop to detect.
+    int commonPrefixLength = 0;
+    int state = 0;
+    Transition t = null;
+    while (true) {
+      if (runAutomaton.isAccept(state)) {
+        // The common prefix reaches a final state. So common prefix and common suffix overlap.
+        // Min term length is the max between common prefix and common suffix lengths.
+        return Math.max(commonPrefixLength, commonSuffixLength);
+      }
+      if (automaton.getNumTransitions(state) == 1) {
+        if (t == null) {
+          t = new Transition();
+        }
+        automaton.getTransition(state, 0, t);
+        if (t.min == t.max) {
+          state = t.dest;
+          commonPrefixLength++;
+          continue;
+        }
+      }
+      break;
+    }
+    // Min term length is the sum of common prefix and common suffix lengths.
+    return commonPrefixLength + commonSuffixLength;
   }
 
   @Override
   public BytesRef next() throws IOException {
-    clearTermState();
-
-    if (blockHeader == null) { // initial state
-      // note: don't call super.seekCeil here; we have our own logic
-
-      // Set the browser position to the block having the seek term.
-      // Even if -1, it's okay since we'll soon call nextKey().
-      long blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm);
-      if (isBeyondLastTerm(seekTerm, blockStartFP)) {
-        return null; // EOF
-      }
-
-      // Starting at this block find and load the next matching block.
-      //   note: Since seekBlock was just called, we actually consider the current block as "next".
-      if (nextBlockMatchingPrefix() == false) { // note: starts at seek'ed block, which may have a match
-        return null; // EOF
+    if (blockHeader == null) {
+      if (!seekFirstBlock()) {
+        return null;
       }
+      states = new int[32];
+      blockIteration = BlockIteration.NEXT;
     }
-
+    termState = null;
     do {
-
-      // look in the rest of this block.
       BytesRef term = nextTermInBlockMatching();
       if (term != null) {
         return term;
       }
-
-      // next term dict matching prefix
-    } while (nextBlockMatchingPrefix());
-
-    return null; // EOF
+    } while (nextBlock());
+    return null;
   }
 
-  /**
-   * Find the next block that appears to contain terms that could match the automata.
-   * The prefix is the primary clue.  Returns true if at one, or false for no more (EOF).
-   */
-  protected boolean nextBlockMatchingPrefix() throws IOException {
-    if (beyondCommonPrefix) {
-      return false; // EOF
+  protected boolean seekFirstBlock() throws IOException {
+    seekTerm = nextStringCalculator.nextSeekTerm(seekTerm);
+    if (seekTerm == null) {
+      return false;
     }
-
-    IndexDictionary.Browser browser = getOrCreateDictionaryBrowser();
-
-    do {
-
-      // Get next block key (becomes in effect the current blockKey)
-      BytesRef blockKey = browser.nextKey();
-      if (blockKey == null) {
-        return false; // EOF
-      }
-
-      blockPrefixLen = browser.getBlockPrefixLen();
-      blockPrefixRunAutomatonState = runAutomatonForState(blockKey.bytes, blockKey.offset, blockPrefixLen, 0);
-
-      // We may have passed commonPrefix  (a short-circuit optimization).
-      if (isBeyondCommonPrefix(blockKey)) {
-        return false; // EOF
-      }
-
-      if (blockPrefixRunAutomatonState >= 0) {
-        break; // a match
-      }
-
-      //
-      // This block doesn't match.
-      //
-
-      seekTerm = null; // we're moving on to another block, and seekTerm is before it.
-
-      // Should we simply get the next key (linear mode) or try to seek?
-      if (nextStringCalculator.isLinearState(blockKey)) {
-        continue;
-      }
-
-      // Maybe the next block's key matches?  We have to check this before calling nextStringCalculator.
-      BytesRef peekKey = browser.peekKey();
-      if (peekKey == null) {
-        return false; // EOF
-      }
-      if (runAutomatonForState(peekKey.bytes, peekKey.offset, peekKey.length, 0) >= 0) {
-        continue; // yay; it matched.  Continue to actually advance to it.  This is rare?
-      }
-
-      // Seek to a block by calculating the next term to match the automata *following* peekKey.
-      this.seekTerm = nextStringCalculator.nextSeekTerm(browser.peekKey());
-      if (seekTerm == null) {
-        return false; // EOF
-      }
-      browser.seekBlock(seekTerm);
-      //continue
-
-    } while (true); // while not a match
-
-    // A match!
-
-    //NOTE: we could determine if this automata has a prefix for this specific block (longer than the commonPrefix).
-    //  If we see it, we could set it as the seekTerm and we could also exit the block early if we get past this prefix
-    //  and runAutomatonFromPrefix would start from this prefix.  Smiley tried but benchmarks were not favorable to it.
-
-    initializeHeader(null, browser.getBlockFilePointer());
-
-    return true;
+    long blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm);
+    if (blockStartFP == -1) {
+      blockStartFP = fieldMetadata.getFirstBlockStartFP();
+    } else if (isBeyondLastTerm(seekTerm, blockStartFP)) {
+      return false;
+    }
+    initializeHeader(seekTerm, blockStartFP);
+    return blockHeader != null;
   }
 
   /**
-   * Find the next block line that matches, or null when at end of block.
+   * Finds the next block line that matches (accepted by the automaton), or null when at end of block.
+   *
+   * @return The next term in the current block that is accepted by the automaton; or null if none.
    */
   protected BytesRef nextTermInBlockMatching() throws IOException {
-    do {
-      // if seekTerm is set, then we seek into this block instead of starting with the first blindly.
-      if (seekTerm != null) {
-        assert blockLine == null;
-        boolean moveBeyondIfFound = seekTerm == startTerm; // for startTerm, we want to get the following term
-        SeekStatus seekStatus = seekInBlock(seekTerm);
-        seekTerm = null;// reset.
-        if (seekStatus == SeekStatus.END) {
-          return null;
-        } else if (seekStatus == SeekStatus.FOUND && moveBeyondIfFound) {
-          if (readLineInBlock() == null) {
-            return null;
+    if (seekTerm == null) {
+      if (readLineInBlock() == null) {
+        return null;
+      }
+    } else {
+      SeekStatus seekStatus = seekInBlock(seekTerm);
+      seekTerm = null;
+      if (seekStatus == SeekStatus.END) {
+        return null;
+      }
+      assert numMatchedBytes == 0;
+      assert numConsecutivelyRejectedTerms == 0;
+    }
+    while (true) {
+      TermBytes lineTermBytes = blockLine.getTermBytes();
+      BytesRef lineTerm = lineTermBytes.getTerm();
+      assert lineTerm.offset == 0;
+      if (states.length <= lineTerm.length) {
+        states = ArrayUtil.growExact(states, ArrayUtil.oversize(lineTerm.length + 1, Byte.BYTES));
+      }
+      // Since terms are delta encoded, we may start the automaton steps from the last state reached by the previous term.
+      int index = Math.min(lineTermBytes.getSuffixOffset(), numMatchedBytes);
+      // Skip this term early if it is shorter than the min term length, or if it does not end with the common suffix
+      // accepted by the automaton.
+      if (lineTerm.length >= minTermLength && (commonSuffix == null || endsWithCommonSuffix(lineTerm.bytes, lineTerm.length))) {
+        int state = states[index];
+        while (true) {
+          if (index == lineTerm.length) {
+            if (runAutomaton.isAccept(state)) {
+              // The automaton accepts the current term. Record the number of matched bytes and return the term.
+              assert runAutomaton.run(lineTerm.bytes, 0, lineTerm.length);
+              numMatchedBytes = index;
+              if (numConsecutivelyRejectedTerms > 0) {
+                numConsecutivelyRejectedTerms = 0;
+              }
+              assert blockIteration == BlockIteration.NEXT;
+              return lineTerm;
+            }
+            break;
           }
+          state = runAutomaton.step(state, lineTerm.bytes[index] & 0xff);
+          if (state == -1) {
+            // The automaton rejects the current term.
+            break;
+          }
+          // Record the reached automaton state.
+          states[++index] = state;
         }
-        assert blockLine != null;
-      } else {
-        if (readLineInBlock() == null) {
+      }
+      // The current term is not accepted by the automaton.
+      // Still record the reached automaton state to start the next term steps from there.
+      assert !runAutomaton.run(lineTerm.bytes, 0, lineTerm.length);
+      numMatchedBytes = index;
+      // If the number of consecutively rejected terms reaches the threshold,
+      // then determine whether it is worthwhile to jump to a block away.
+      if (++numConsecutivelyRejectedTerms >= NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD
+          && lineIndexInBlock < blockHeader.getLinesCount() - 1
+          && !nextStringCalculator.isLinearState(lineTerm)) {
+        // Compute the next term accepted by the automaton after the current term.
+        if ((seekTerm = nextStringCalculator.nextSeekTerm(lineTerm)) == null) {
+          blockIteration = BlockIteration.END;
           return null;
         }
-      }
-
-      TermBytes lineTermBytes = blockLine.getTermBytes();
-      BytesRef lineTerm = lineTermBytes.getTerm();
-
-      if (commonSuffixRef == null || StringHelper.endsWith(lineTerm, commonSuffixRef)) {
-        if (runAutomatonFromPrefix(lineTerm)) {
-          return lineTerm;
-        } else if (beyondCommonPrefix) {
+        // It is worthwhile to jump to a block away if the next term accepted is after the next term in the block.
+        // Actually the block away may be the current block, but this is a good heuristic.
+        readLineInBlock();
+        if (seekTerm.compareTo(blockLine.getTermBytes().getTerm()) > 0) {
+          // Stop scanning this block terms and set the iteration order to jump to a block away by seeking seekTerm.
+          blockIteration = BlockIteration.SEEK;
           return null;
         }
+        seekTerm = null;
+        // If it is not worthwhile to jump to a block away, do not attempt anymore for the current block.
+        numConsecutivelyRejectedTerms = Integer.MIN_VALUE;
+      } else if (readLineInBlock() == null) {
+        // No more terms in the block. The iteration order is to open the very next block.
+        assert blockIteration == BlockIteration.NEXT;
+        return null;
       }
-
-    } while (true);
-  }
-
-  protected boolean runAutomatonFromPrefix(BytesRef term) {
-    int state = runAutomatonForState(term.bytes, term.offset + blockPrefixLen, term.length - blockPrefixLen, blockPrefixRunAutomatonState);
-    if (state >= 0 && runAutomaton.isAccept(state)) {
-      return true;
     }
-    if (isBeyondCommonPrefix(term)) {
-      // If the automaton rejects early the term, before the common prefix length,
-      // and if the term rejected byte is lexicographically after the same byte in the common prefix,
-      // then it means the current term is beyond the common prefix.
-      // Exit immediately the enumeration.
-      beyondCommonPrefix = true;
-    }
-    return false;
   }
 
   /**
-   * Run the automaton and return the final state (not necessary accepted). -1 signifies no state / no match.
-   * Sets {@link #numBytesAccepted} with the offset of the first byte rejected by the automaton;
-   * or (offset + length) if no byte is rejected.
+   * Indicates whether the given term ends with the automaton common suffix.
+   * This allows to quickly skip terms that the automaton would reject eventually.
    */
-  protected int runAutomatonForState(byte[] s, int offset, int length, int initialState) {
-    //see ByteRunAutomaton.run(); similar
-    int state = initialState;
-    int index = 0;
-    while (index < length) {
-      state = runAutomaton.step(state, s[index + offset] & 0xFF);
-      if (state == -1) {
-        break;
+  protected boolean endsWithCommonSuffix(byte[] termBytes, int termLength) {
+    byte[] suffixBytes = commonSuffix.bytes;
+    int suffixLength = commonSuffix.length;
+    int offset = termLength - suffixLength;
+    assert offset >= 0; // We already checked minTermLength.
+    for (int i = 0; i < suffixLength; i++) {
+      if (termBytes[offset + i] != suffixBytes[i]) {
+        return false;
       }
-      index++;
     }
-    numBytesAccepted = index;
-    return state;
+    return true;
   }
 
   /**
-   * Determines if the provided {@link BytesRef} is beyond the automaton common prefix.
-   * This method must be called after a call to {@link #runAutomatonForState} because
-   * it uses {@link #numBytesAccepted} value.
+   * Opens the next block.
+   * Depending on the {@link #blockIteration} order, it may be the very next block, or a block away that may contain
+   * {@link #seekTerm}.
+   *
+   * @return true if the next block is opened; false if there is no blocks anymore and the iteration is over.
    */
-  protected boolean isBeyondCommonPrefix(BytesRef bytesRef) {
-    // If the automaton rejects early the bytes, before the common prefix length,
-    // and if the rejected byte is lexicographically after the same byte in the common prefix,
-    // then it means the bytes are beyond the common prefix.
-    return numBytesAccepted < commonPrefixRef.length
-        && numBytesAccepted < bytesRef.length
-        && (bytesRef.bytes[numBytesAccepted + bytesRef.offset] & 0xFF) > (commonPrefixRef.bytes[numBytesAccepted + commonPrefixRef.offset] & 0xFF);
+  protected boolean nextBlock() throws IOException {
+    long blockStartFP;
+    switch (blockIteration) {
+      case NEXT:
+        assert seekTerm == null;
+        blockStartFP = blockInput.getFilePointer();
+        break;
+      case SEEK:
+        assert seekTerm != null;
+        blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm);
+        if (isBeyondLastTerm(seekTerm, blockStartFP)) {
+          return false;
+        }
+        blockIteration = BlockIteration.NEXT;
+        break;
+      case END:
+        return false;
+      default:
+        throw new UnsupportedOperationException("Unsupported " + BlockIteration.class.getSimpleName());
+    }
+    numMatchedBytes = 0;
+    numConsecutivelyRejectedTerms = 0;
+    initializeHeader(seekTerm, blockStartFP);
+    return blockHeader != null;
   }
 
   @Override
@@ -285,64 +333,66 @@ public class IntersectBlockReader extends BlockReader {
   }
 
   @Override
-  public SeekStatus seekCeil(BytesRef text) {
+  public void seekExact(BytesRef term, TermState state) {
     throw new UnsupportedOperationException();
   }
 
   @Override
-  public void seekExact(BytesRef term, TermState state) {
+  public SeekStatus seekCeil(BytesRef text) {
     throw new UnsupportedOperationException();
   }
 
   /**
-   * This is a copy of AutomatonTermsEnum.  Since it's an inner class, the outer class can
+   * This is mostly a copy of AutomatonTermsEnum.  Since it's an inner class, the outer class can
    * call methods that ATE does not expose.  It'd be nice if ATE's logic could be more extensible.
    */
-  protected static class AutomatonNextTermCalculator {
-    // a tableized array-based form of the DFA
-    protected final ByteRunAutomaton runAutomaton;
-    // common suffix of the automaton
-    protected final BytesRef commonSuffixRef;
-    // true if the automaton accepts a finite language
-    protected final boolean finite;
-    // array of sorted transitions for each state, indexed by state number
-    protected final Automaton automaton;
-    // for path tracking: each long records gen when we last
+  protected class AutomatonNextTermCalculator {
+    // for path tracking: each short records gen when we last
     // visited the state; we use gens to avoid having to clear
-    protected final long[] visited;
-    protected long curGen;
+    protected final short[] visited;
+    protected short curGen;
     // the reference used for seeking forwards through the term dictionary
     protected final BytesRefBuilder seekBytesRef = new BytesRefBuilder();
     // true if we are enumerating an infinite portion of the DFA.
     // in this case it is faster to drive the query based on the terms dictionary.
     // when this is true, linearUpperBound indicate the end of range
     // of terms where we should simply do sequential reads instead.
-    protected boolean linear = false;
-    protected final BytesRef linearUpperBound = new BytesRef(10);
-    protected Transition transition = new Transition();
+    protected boolean linear;
+    protected final BytesRef linearUpperBound = new BytesRef();
+    protected final Transition transition = new Transition();
     protected final IntsRefBuilder savedStates = new IntsRefBuilder();
 
     protected AutomatonNextTermCalculator(CompiledAutomaton compiled) {
-      if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
-        throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
+      visited = compiled.finite ? null : new short[runAutomaton.getSize()];
+    }
+
+    /**
+     * Records the given state has been visited.
+     */
+    protected void setVisited(int state) {
+      if (!finite) {
+        visited[state] = curGen;
       }
-      this.finite = compiled.finite;
-      this.runAutomaton = compiled.runAutomaton;
-      assert this.runAutomaton != null;
-      this.commonSuffixRef = compiled.commonSuffixRef;
-      this.automaton = compiled.automaton;
-
-      // used for path tracking, where each bit is a numbered state.
-      visited = new long[runAutomaton.getSize()];
     }
 
-    /** True if the current state of the automata is best iterated linearly (without seeking). */
+    /**
+     * Indicates whether the given state has been visited.
+     */
+    protected boolean isVisited(int state) {
+      return !finite && visited[state] == curGen;
+    }
+
+    /**
+     * True if the current state of the automata is best iterated linearly (without seeking).
+     */
     protected boolean isLinearState(BytesRef term) {
       return linear && term.compareTo(linearUpperBound) < 0;
     }
 
-    /** @see org.apache.lucene.index.FilteredTermsEnum#nextSeekTerm(BytesRef) */
-    protected BytesRef nextSeekTerm(final BytesRef term) throws IOException {
+    /**
+     * @see org.apache.lucene.index.FilteredTermsEnum#nextSeekTerm(BytesRef)
+     */
+    protected BytesRef nextSeekTerm(final BytesRef term) {
       //System.out.println("ATE.nextSeekTerm term=" + term);
       if (term == null) {
         assert seekBytesRef.length() == 0;
@@ -371,12 +421,11 @@ public class IntersectBlockReader extends BlockReader {
       assert linear == false;
 
       int state = 0;
-      assert state == 0;
       int maxInterval = 0xff;
       //System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef);
       for (int i = 0; i < position; i++) {
         state = runAutomaton.step(state, seekBytesRef.byteAt(i) & 0xff);
-        assert state >= 0: "state=" + state;
+        assert state >= 0 : "state=" + state;
       }
       final int numTransitions = automaton.getNumTransitions(state);
       automaton.initTransition(state, transition);
@@ -392,8 +441,9 @@ public class IntersectBlockReader extends BlockReader {
       if (maxInterval != 0xff)
         maxInterval++;
       int length = position + 1; /* position + maxTransition */
-      if (linearUpperBound.bytes.length < length)
-        linearUpperBound.bytes = new byte[length];
+      if (linearUpperBound.bytes.length < length) {
+        linearUpperBound.bytes = new byte[ArrayUtil.oversize(length, Byte.BYTES)];
+      }
       System.arraycopy(seekBytesRef.bytes(), 0, linearUpperBound.bytes, 0, position);
       linearUpperBound.bytes[position] = (byte) maxInterval;
       linearUpperBound.length = length;
@@ -405,7 +455,7 @@ public class IntersectBlockReader extends BlockReader {
      * Increments the byte buffer to the next String in binary order after s that will not put
      * the machine into a reject state. If such a string does not exist, returns
      * false.
-     *
+     * <p>
      * The correctness of this method depends upon the automaton being deterministic,
      * and having no transitions to dead states.
      *
@@ -414,21 +464,24 @@ public class IntersectBlockReader extends BlockReader {
     protected boolean nextString() {
       int state;
       int pos = 0;
-      savedStates.grow(seekBytesRef.length()+1);
+      savedStates.grow(seekBytesRef.length() + 1);
       savedStates.setIntAt(0, 0);
 
       while (true) {
-        curGen++;
+        if (!finite && ++curGen == 0) {
+          // Clear the visited states every time curGen wraps (so very infrequently to not impact average perf).
+          Arrays.fill(visited, (short) -1);
+        }
         linear = false;
         // walk the automaton until a character is rejected.
         for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) {
-          visited[state] = curGen;
+          setVisited(state);
           int nextState = runAutomaton.step(state, seekBytesRef.byteAt(pos) & 0xff);
           if (nextState == -1)
             break;
-          savedStates.setIntAt(pos+1, nextState);
+          savedStates.setIntAt(pos + 1, nextState);
           // we found a loop, record it for faster enumeration
-          if (!finite && !linear && visited[nextState] == curGen) {
+          if (!linear && isVisited(nextState)) {
             setLinear(pos);
           }
           state = nextState;
@@ -446,7 +499,7 @@ public class IntersectBlockReader extends BlockReader {
             /* String is good to go as-is */
             return true;
           /* else advance further */
-          // TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important!
+          // paranoia? if we backtrack thru an infinite DFA, the loop detection is important!
           // for now, restart from scratch for all infinite DFAs
           if (!finite) pos = 0;
         }
@@ -456,20 +509,20 @@ public class IntersectBlockReader extends BlockReader {
     /**
      * Returns the next String in lexicographic order that will not put
      * the machine into a reject state.
-     *
+     * <p>
      * This method traverses the DFA from the given position in the String,
      * starting at the given state.
-     *
+     * <p>
      * If this cannot satisfy the machine, returns false. This method will
      * walk the minimal path, in lexicographic order, as long as possible.
-     *
+     * <p>
      * If this method returns false, then there might still be more solutions,
      * it is necessary to backtrack to find out.
      *
-     * @param state current non-reject state
+     * @param state    current non-reject state
      * @param position useful portion of the string
      * @return true if more possible solutions exist for the DFA from this
-     *         position
+     * position
      */
     protected boolean nextString(int state, int position) {
       /*
@@ -487,7 +540,7 @@ public class IntersectBlockReader extends BlockReader {
       }
 
       seekBytesRef.setLength(position);
-      visited[state] = curGen;
+      setVisited(state);
 
       final int numTransitions = automaton.getNumTransitions(state);
       automaton.initTransition(state, transition);
@@ -505,8 +558,8 @@ public class IntersectBlockReader extends BlockReader {
            * as long as is possible, continue down the minimal path in
            * lexicographic order. if a loop or accept state is encountered, stop.
            */
-          while (visited[state] != curGen && !runAutomaton.isAccept(state)) {
-            visited[state] = curGen;
+          while (!isVisited(state) && !runAutomaton.isAccept(state)) {
+            setVisited(state);
             /*
              * Note: we work with a DFA with no transitions to dead states.
              * so the below is ok, if it is not an accept state,
@@ -521,8 +574,8 @@ public class IntersectBlockReader extends BlockReader {
             seekBytesRef.append((byte) transition.min);
 
             // we found a loop, record it for faster enumeration
-            if (!finite && !linear && visited[state] == curGen) {
-              setLinear(seekBytesRef.length()-1);
+            if (!linear && isVisited(state)) {
+              setLinear(seekBytesRef.length() - 1);
             }
           }
           return true;
@@ -546,13 +599,11 @@ public class IntersectBlockReader extends BlockReader {
         // because there is no higher character in binary sort order.
         if (nextChar++ != 0xff) {
           seekBytesRef.setByteAt(position, (byte) nextChar);
-          seekBytesRef.setLength(position+1);
+          seekBytesRef.setLength(position + 1);
           return position;
         }
       }
       return -1; /* all solutions exhausted */
     }
   }
-
-}
-
+}
\ No newline at end of file
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReader.java
index 6d7c79d..8fef520 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReader.java
@@ -48,18 +48,15 @@ public class STBlockReader extends BlockReader {
 
   @Override
   public BytesRef next() throws IOException {
-    BytesRef next = super.next();
-    if (next == null) {
-      return null;
-    }
-    // Check if the term occurs for the searched field.
-    while (!termOccursInField()) {
+    BytesRef next;
+    do {
       next = super.next();
       if (next == null) {
-        // No more term for any field.
+        // No more terms.
         return null;
       }
-    }
+      // Check if the term occurs for the searched field.
+    } while (!termOccursInField());
     // The term occurs for the searched field.
     return next;
   }
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STIntersectBlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STIntersectBlockReader.java
index 099b6c3..89159f8 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STIntersectBlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STIntersectBlockReader.java
@@ -64,18 +64,15 @@ public class STIntersectBlockReader extends IntersectBlockReader {
 
   @Override
   public BytesRef next() throws IOException {
-    BytesRef next = super.next();
-    if (next == null) {
-      return null;
-    }
-    // Check if the term occurs for the searched field.
-    while (!termOccursInField()) {
+    BytesRef next;
+    do {
       next = super.next();
       if (next == null) {
-        // No more term.
+        // No more terms.
         return null;
       }
-    }
+      // Check if the term occurs for the searched field.
+    } while (!termOccursInField());
     // The term occurs for the searched field.
     return next;
   }
@@ -86,21 +83,13 @@ public class STIntersectBlockReader extends IntersectBlockReader {
   }
 
   @Override
-  protected boolean nextBlockMatchingPrefix() throws IOException {
-    // block header maybe null if we are positioned outside the field block range
-    return super.nextBlockMatchingPrefix() && blockHeader != null;
-  }
-
-  @Override
   protected STBlockLine.Serializer createBlockLineSerializer() {
     return new STBlockLine.Serializer();
   }
 
   /**
-   * Reads the {@link BlockTermState} on the current line for the specific field
-   * corresponding this this reader.
-   * Changes the current {@link BlockTermState} to null if the term does not
-   * occur for the field.
+   * Reads the {@link BlockTermState} on the current line for the specific field corresponding to this reader.
+   * Returns null if the term does not occur for the field.
    */
   @Override
   protected BlockTermState readTermState() throws IOException {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTerms.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTerms.java
index 6e11ae9..01d374c 100755
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTerms.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTerms.java
@@ -51,6 +51,7 @@ public class STUniformSplitTerms extends UniformSplitTerms {
 
   @Override
   public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
+    checkIntersectAutomatonType(compiled);
     return new STIntersectBlockReader(compiled, startTerm, dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder, fieldInfos);
   }