You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by br...@apache.org on 2020/02/28 09:45:02 UTC
[lucene-solr] branch master updated: LUCENE-9245: Reduce AutomatonTermsEnum memory usage.

This is an automated email from the ASF dual-hosted git repository.

broustant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new e0164d1  LUCENE-9245: Reduce AutomatonTermsEnum memory usage.
e0164d1 is described below

commit e0164d1ac848ca3693e7f7faca91c39def1d48da
Author: Bruno Roustant <br...@salesforce.com>
AuthorDate: Fri Feb 21 18:07:13 2020 +0100

    LUCENE-9245: Reduce AutomatonTermsEnum memory usage.
    
    Closes #1281
---
 lucene/CHANGES.txt                                 |  2 +
 .../apache/lucene/index/AutomatonTermsEnum.java    | 63 ++++++++++++++--------
 2 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3fe79e2..fbfc88c 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -175,6 +175,8 @@ Improvements
   single field to the same value. This optimization can reduce the flush time by around
   20% for the docValues update user cases. (Nhat Nguyen, Adrien Grand, Simon Willnauer)
 
+* LUCENE-9245: Reduce AutomatonTermsEnum memory usage. (Bruno Roustant, Robert Muir)
+
 Optimizations
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
index 411a810..0f00ce1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
@@ -18,7 +18,9 @@ package org.apache.lucene.index;
 
 
 import java.io.IOException;
+import java.util.Arrays;
 
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IntsRefBuilder;
@@ -54,18 +56,20 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
   private final boolean finite;
   // array of sorted transitions for each state, indexed by state number
   private final Automaton automaton;
-  // for path tracking: each long records gen when we last
+  // Used for visited state tracking: each short records gen when we last
   // visited the state; we use gens to avoid having to clear
-  private final long[] visited;
-  private long curGen;
+  private final short[] visited;
+  private short curGen;
   // the reference used for seeking forwards through the term dictionary
   private final BytesRefBuilder seekBytesRef = new BytesRefBuilder(); 
   // true if we are enumerating an infinite portion of the DFA.
   // in this case it is faster to drive the query based on the terms dictionary.
   // when this is true, linearUpperBound indicate the end of range
   // of terms where we should simply do sequential reads instead.
-  private boolean linear = false;
-  private final BytesRef linearUpperBound = new BytesRef(10);
+  private boolean linear;
+  private final BytesRef linearUpperBound = new BytesRef();
+  private final Transition transition = new Transition();
+  private final IntsRefBuilder savedStates = new IntsRefBuilder();
 
   /**
    * Construct an enumerator based upon an automaton, enumerating the specified
@@ -85,10 +89,26 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
     this.commonSuffixRef = compiled.commonSuffixRef;
     this.automaton = compiled.automaton;
 
-    // used for path tracking, where each bit is a numbered state.
-    visited = new long[runAutomaton.getSize()];
+    // No need to track visited states for a finite language without loops.
+    visited = finite ? null : new short[runAutomaton.getSize()];
   }
-  
+
+  /**
+   * Records the given state has been visited.
+   */
+  private void setVisited(int state) {
+    if (!finite) {
+      visited[state] = curGen;
+    }
+  }
+
+  /**
+   * Indicates whether the given state has been visited.
+   */
+  private boolean isVisited(int state) {
+    return !finite && visited[state] == curGen;
+  }
+
   /**
    * Returns true if the term matches the automaton. Also stashes away the term
    * to assist with smart enumeration.
@@ -128,8 +148,6 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
     }
   }
 
-  private Transition transition = new Transition();
-
   /**
    * Sets the enum to operate in linear fashion, as we have found
    * a looping transition at position: we set an upper bound and 
@@ -139,7 +157,6 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
     assert linear == false;
     
     int state = 0;
-    assert state == 0;
     int maxInterval = 0xff;
     //System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef);
     for (int i = 0; i < position; i++) {
@@ -160,8 +177,9 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
     if (maxInterval != 0xff)
       maxInterval++;
     int length = position + 1; /* position + maxTransition */
-    if (linearUpperBound.bytes.length < length)
-      linearUpperBound.bytes = new byte[length];
+    if (linearUpperBound.bytes.length < length) {
+      linearUpperBound.bytes = new byte[ArrayUtil.oversize(length, Byte.BYTES)];
+    }
     System.arraycopy(seekBytesRef.bytes(), 0, linearUpperBound.bytes, 0, position);
     linearUpperBound.bytes[position] = (byte) maxInterval;
     linearUpperBound.length = length;
@@ -169,8 +187,6 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
     linear = true;
   }
 
-  private final IntsRefBuilder savedStates = new IntsRefBuilder();
-  
   /**
    * Increments the byte buffer to the next String in binary order after s that will not put
    * the machine into a reject state. If such a string does not exist, returns
@@ -188,17 +204,20 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
     savedStates.setIntAt(0, 0);
     
     while (true) {
-      curGen++;
+      if (!finite && ++curGen == 0) {
+        // Clear the visited states every time curGen wraps (so very infrequently to not impact average perf).
+        Arrays.fill(visited, (short) -1);
+      }
       linear = false;
       // walk the automaton until a character is rejected.
       for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) {
-        visited[state] = curGen;
+        setVisited(state);
         int nextState = runAutomaton.step(state, seekBytesRef.byteAt(pos) & 0xff);
         if (nextState == -1)
           break;
         savedStates.setIntAt(pos+1, nextState);
         // we found a loop, record it for faster enumeration
-        if (!finite && !linear && visited[nextState] == curGen) {
+        if (!linear && isVisited(nextState)) {
           setLinear(pos);
         }
         state = nextState;
@@ -257,7 +276,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
     }
 
     seekBytesRef.setLength(position);
-    visited[state] = curGen;
+    setVisited(state);
 
     final int numTransitions = automaton.getNumTransitions(state);
     automaton.initTransition(state, transition);
@@ -275,8 +294,8 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
          * as long as is possible, continue down the minimal path in
          * lexicographic order. if a loop or accept state is encountered, stop.
          */
-        while (visited[state] != curGen && !runAutomaton.isAccept(state)) {
-          visited[state] = curGen;
+        while (!isVisited(state) && !runAutomaton.isAccept(state)) {
+          setVisited(state);
           /* 
            * Note: we work with a DFA with no transitions to dead states.
            * so the below is ok, if it is not an accept state,
@@ -291,7 +310,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
           seekBytesRef.append((byte) transition.min);
           
           // we found a loop, record it for faster enumeration
-          if (!finite && !linear && visited[state] == curGen) {
+          if (!linear && isVisited(state)) {
             setLinear(seekBytesRef.length()-1);
           }
         }