You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/12/10 17:48:25 UTC

svn commit: r1644473 - in /lucene/dev/trunk/lucene: CHANGES.txt core/src/java/org/apache/lucene/util/fst/FST.java core/src/test/org/apache/lucene/util/fst/TestFSTs.java

Author: mikemccand
Date: Wed Dec 10 16:48:25 2014
New Revision: 1644473

URL: http://svn.apache.org/r1644473
Log:
LUCENE-6105: don't cache root arcs for small FSTs

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1644473&r1=1644472&r2=1644473&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Dec 10 16:48:25 2014
@@ -368,6 +368,10 @@ Bug Fixes
 
 * LUCENE-6094: Allow IW.rollback to stop ConcurrentMergeScheduler even
   when it's stalling because there are too many merges. (Mike McCandless)
+
+* LUCENE-6105: Don't cache FST root arcs if the number of root arcs is
+  small, or if the cache would be > 20% of the size of the FST.
+  (Robert Muir, Mike McCandless)
   
 Documentation
 

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1644473&r1=1644472&r2=1644473&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Wed Dec 10 16:48:25 2014
@@ -22,9 +22,9 @@ import java.io.BufferedOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.util.ArrayList;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -40,8 +40,6 @@ import org.apache.lucene.util.Accountabl
 import org.apache.lucene.util.Accountables;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Constants;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.fst.Builder.UnCompiledNode;
@@ -120,7 +118,8 @@ public final class FST<T> implements Acc
    */
   final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
 
-  private int[] bytesPerArc = new int[0];
+  // Reused temporarily while building the FST:
+  private int[] reusedBytesPerArc = new int[0];
 
   // Increment version to change it
   private final static String FILE_FORMAT_NAME = "FST";
@@ -180,8 +179,6 @@ public final class FST<T> implements Acc
   private final boolean allowArrayArcs;
 
   private Arc<T> cachedRootArcs[];
-  private Arc<T> assertingCachedRootArcs[]; // only set wit assert
-
 
   /** Represents a single arc. */
   public final static class Arc<T> {
@@ -443,7 +440,7 @@ public final class FST<T> implements Acc
       size += inCounts.ramBytesUsed();
     }
     size += cachedArcsBytesUsed;
-    size += RamUsageEstimator.sizeOf(bytesPerArc);
+    size += RamUsageEstimator.sizeOf(reusedBytesPerArc);
     return size;
   }
 
@@ -487,26 +484,22 @@ public final class FST<T> implements Acc
     }
   }
   
-  // Caches first 128 labels
+  // Optionally caches first 128 labels
   @SuppressWarnings({"rawtypes","unchecked"})
   private void cacheRootArcs() throws IOException {
-    cachedRootArcs = (Arc<T>[]) new Arc[0x80];
-    readRootArcs(cachedRootArcs);
-    cachedArcsBytesUsed += ramBytesUsed(cachedRootArcs);
-    
-    assert setAssertingRootArcs(cachedRootArcs);
-    assert assertRootArcs();
-  }
-  
-  public void readRootArcs(Arc<T>[] arcs) throws IOException {
+    // We should only be called once per FST:
+    assert cachedArcsBytesUsed == 0;
+
     final Arc<T> arc = new Arc<>();
     getFirstArc(arc);
-    final BytesReader in = getBytesReader();
     if (targetHasArcs(arc)) {
+      final BytesReader in = getBytesReader();
+      Arc<T>[] arcs = (Arc<T>[]) new Arc[0x80];
       readFirstRealTargetArc(arc.target, arc, in);
+      int count = 0;
       while(true) {
         assert arc.label != END_LABEL;
-        if (arc.label < cachedRootArcs.length) {
+        if (arc.label < arcs.length) {
           arcs[arc.label] = new Arc<T>().copyFrom(arc);
         } else {
           break;
@@ -515,43 +508,19 @@ public final class FST<T> implements Acc
           break;
         }
         readNextRealArc(arc, in);
+        count++;
+      }
+
+      int cacheRAM = (int) ramBytesUsed(arcs);
+
+      // Don't cache if there are only a few arcs or if the cache would use > 20% RAM of the FST itself:
+      if (count >= FIXED_ARRAY_NUM_ARCS_SHALLOW && cacheRAM < ramBytesUsed()/5) {
+        cachedRootArcs = arcs;
+        cachedArcsBytesUsed = cacheRAM;
       }
     }
   }
   
-  @SuppressWarnings({"rawtypes","unchecked"})
-  private boolean setAssertingRootArcs(Arc<T>[] arcs) throws IOException {
-    assertingCachedRootArcs = (Arc<T>[]) new Arc[arcs.length];
-    readRootArcs(assertingCachedRootArcs);
-    cachedArcsBytesUsed *= 2;
-    return true;
-  }
-  
-  private boolean assertRootArcs() {
-    assert cachedRootArcs != null;
-    assert assertingCachedRootArcs != null;
-    for (int i = 0; i < cachedRootArcs.length; i++) {
-      final Arc<T> root = cachedRootArcs[i];
-      final Arc<T> asserting = assertingCachedRootArcs[i];
-      if (root != null) { 
-        assert root.arcIdx == asserting.arcIdx;
-        assert root.bytesPerArc == asserting.bytesPerArc;
-        assert root.flags == asserting.flags;
-        assert root.label == asserting.label;
-        assert root.nextArc == asserting.nextArc;
-        assert root.nextFinalOutput.equals(asserting.nextFinalOutput);
-        assert root.node == asserting.node;
-        assert root.numArcs == asserting.numArcs;
-        assert root.output.equals(asserting.output);
-        assert root.posArcsStart == asserting.posArcsStart;
-        assert root.target == asserting.target;
-      } else {
-        assert root == null && asserting == null;
-      } 
-    }
-    return true;
-  }
-
   public T getEmptyOutput() {
     return emptyOutput;
   }
@@ -701,8 +670,8 @@ public final class FST<T> implements Acc
     final boolean doFixedArray = shouldExpand(nodeIn);
     if (doFixedArray) {
       //System.out.println("  fixedArray");
-      if (bytesPerArc.length < nodeIn.numArcs) {
-        bytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)];
+      if (reusedBytesPerArc.length < nodeIn.numArcs) {
+        reusedBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)];
       }
     }
 
@@ -776,10 +745,10 @@ public final class FST<T> implements Acc
       // but record how many bytes each one took, and max
       // byte size:
       if (doFixedArray) {
-        bytesPerArc[arcIdx] = (int) (bytes.getPosition() - lastArcStart);
+        reusedBytesPerArc[arcIdx] = (int) (bytes.getPosition() - lastArcStart);
         lastArcStart = bytes.getPosition();
-        maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]);
-        //System.out.println("    bytes=" + bytesPerArc[arcIdx]);
+        maxBytesPerArc = Math.max(maxBytesPerArc, reusedBytesPerArc[arcIdx]);
+        //System.out.println("    bytes=" + reusedBytesPerArc[arcIdx]);
       }
     }
     
@@ -830,12 +799,12 @@ public final class FST<T> implements Acc
         bytes.skipBytes((int) (destPos - srcPos));
         for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) {
           destPos -= maxBytesPerArc;
-          srcPos -= bytesPerArc[arcIdx];
+          srcPos -= reusedBytesPerArc[arcIdx];
           //System.out.println("  repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos);
           if (srcPos != destPos) {
-            //System.out.println("  copy len=" + bytesPerArc[arcIdx]);
-            assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " bytesPerArc[arcIdx]=" + bytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs;
-            bytes.copyBytes(srcPos, destPos, bytesPerArc[arcIdx]);
+            //System.out.println("  copy len=" + reusedBytesPerArc[arcIdx]);
+            assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs;
+            bytes.copyBytes(srcPos, destPos, reusedBytesPerArc[arcIdx]);
           }
         }
       }
@@ -1185,12 +1154,48 @@ public final class FST<T> implements Acc
     return arc;
   }
 
+  // LUCENE-5152: called only from asserts, to validate that the
+  // non-cached arc lookup would produce the same result, to
+  // catch callers that illegally modify shared structures with
+  // the result (we shallow-clone the Arc itself, but e.g. a BytesRef
+  // output is still shared):
+  private boolean assertRootCachedArc(int label, Arc<T> cachedArc) throws IOException {
+    Arc<T> arc = new Arc<>();
+    getFirstArc(arc);
+    BytesReader in = getBytesReader();
+    Arc<T> result = findTargetArc(label, arc, arc, in, false);
+    if (result == null) {
+      assert cachedArc == null;
+    } else {
+      assert cachedArc != null;
+      assert cachedArc.arcIdx == result.arcIdx;
+      assert cachedArc.bytesPerArc == result.bytesPerArc;
+      assert cachedArc.flags == result.flags;
+      assert cachedArc.label == result.label;
+      assert cachedArc.nextArc == result.nextArc;
+      assert cachedArc.nextFinalOutput.equals(result.nextFinalOutput);
+      assert cachedArc.node == result.node;
+      assert cachedArc.numArcs == result.numArcs;
+      assert cachedArc.output.equals(result.output);
+      assert cachedArc.posArcsStart == result.posArcsStart;
+      assert cachedArc.target == result.target;
+    }
+
+    return true;
+  }
+
   // TODO: could we somehow [partially] tableize arc lookups
-  // look automaton?
+  // like automaton?
 
   /** Finds an arc leaving the incoming arc, replacing the arc in place.
    *  This returns null if the arc was not found, else the incoming arc. */
   public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
+    return findTargetArc(labelToMatch, follow, arc, in, true);
+  }
+
+  /** Finds an arc leaving the incoming arc, replacing the arc in place.
+   *  This returns null if the arc was not found, else the incoming arc. */
+  private Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in, boolean useRootArcCache) throws IOException {
 
     if (labelToMatch == END_LABEL) {
       if (follow.isFinal()) {
@@ -1211,12 +1216,13 @@ public final class FST<T> implements Acc
     }
 
     // Short-circuit if this arc is in the root arc cache:
-    if (follow.target == startNode && labelToMatch < cachedRootArcs.length) {
-      
+    if (useRootArcCache && cachedRootArcs != null && follow.target == startNode && labelToMatch < cachedRootArcs.length) {
+      final Arc<T> result = cachedRootArcs[labelToMatch];
+
       // LUCENE-5152: detect tricky cases where caller
       // modified previously returned cached root-arcs:
-      assert assertRootArcs();
-      final Arc<T> result = cachedRootArcs[labelToMatch];
+      assert assertRootCachedArc(labelToMatch, result);
+
       if (result == null) {
         return null;
       } else {

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1644473&r1=1644472&r2=1644473&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java Wed Dec 10 16:48:25 2014
@@ -1593,4 +1593,51 @@ public class TestFSTs extends LuceneTest
     }
   }
 
+  public void testIllegallyModifyRootArc() throws Exception {
+    assumeTrue("test relies on assertions", assertsAreEnabled);
+
+    Set<BytesRef> terms = new HashSet<>();
+    for(int i=0;i<100;i++) {
+      String prefix = Character.toString((char) ('a' + i));
+      terms.add(new BytesRef(prefix));
+      if (prefix.equals("m") == false) {
+        for(int j=0;j<20;j++) {
+          // Make a big enough FST that the root cache will be created:
+          String suffix = TestUtil.randomRealisticUnicodeString(random(), 10, 20);
+          terms.add(new BytesRef(prefix + suffix));
+        }
+      }
+    }
+
+    List<BytesRef> termsList = new ArrayList<>(terms);
+    Collections.sort(termsList);
+
+    ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+    Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+
+    IntsRefBuilder input = new IntsRefBuilder();
+    for(BytesRef term : termsList) {
+      Util.toIntsRef(term, input);
+      builder.add(input.get(), term);
+    }
+
+    FST<BytesRef> fst = builder.finish();
+    
+    Arc<BytesRef> arc = new FST.Arc<>();
+    fst.getFirstArc(arc);
+    FST.BytesReader reader = fst.getBytesReader();
+    arc = fst.findTargetArc((int) 'm', arc, arc, reader);
+    assertNotNull(arc);
+    assertEquals(new BytesRef("m"), arc.output);
+
+    // NOTE: illegal:
+    arc.output.length = 0;
+
+    fst.getFirstArc(arc);
+    try {
+      arc = fst.findTargetArc((int) 'm', arc, arc, reader);
+    } catch (AssertionError ae) {
+      // expected
+    }
+  }
 }