You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/12/10 17:48:25 UTC
svn commit: r1644473 - in /lucene/dev/trunk/lucene: CHANGES.txt
core/src/java/org/apache/lucene/util/fst/FST.java
core/src/test/org/apache/lucene/util/fst/TestFSTs.java
Author: mikemccand
Date: Wed Dec 10 16:48:25 2014
New Revision: 1644473
URL: http://svn.apache.org/r1644473
Log:
LUCENE-6105: don't cache root arcs for small FSTs
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1644473&r1=1644472&r2=1644473&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Dec 10 16:48:25 2014
@@ -368,6 +368,10 @@ Bug Fixes
* LUCENE-6094: Allow IW.rollback to stop ConcurrentMergeScheduler even
when it's stalling because there are too many merges. (Mike McCandless)
+
+* LUCENE-6105: Don't cache FST root arcs if the number of root arcs is
+ small, or if the cache would be > 20% of the size of the FST.
+ (Robert Muir, Mike McCandless)
Documentation
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1644473&r1=1644472&r2=1644473&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Wed Dec 10 16:48:25 2014
@@ -22,9 +22,9 @@ import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
-import java.util.ArrayList;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -40,8 +40,6 @@ import org.apache.lucene.util.Accountabl
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.Builder.UnCompiledNode;
@@ -120,7 +118,8 @@ public final class FST<T> implements Acc
*/
final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
- private int[] bytesPerArc = new int[0];
+ // Reused temporarily while building the FST:
+ private int[] reusedBytesPerArc = new int[0];
// Increment version to change it
private final static String FILE_FORMAT_NAME = "FST";
@@ -180,8 +179,6 @@ public final class FST<T> implements Acc
private final boolean allowArrayArcs;
private Arc<T> cachedRootArcs[];
- private Arc<T> assertingCachedRootArcs[]; // only set wit assert
-
/** Represents a single arc. */
public final static class Arc<T> {
@@ -443,7 +440,7 @@ public final class FST<T> implements Acc
size += inCounts.ramBytesUsed();
}
size += cachedArcsBytesUsed;
- size += RamUsageEstimator.sizeOf(bytesPerArc);
+ size += RamUsageEstimator.sizeOf(reusedBytesPerArc);
return size;
}
@@ -487,26 +484,22 @@ public final class FST<T> implements Acc
}
}
- // Caches first 128 labels
+ // Optionally caches first 128 labels
@SuppressWarnings({"rawtypes","unchecked"})
private void cacheRootArcs() throws IOException {
- cachedRootArcs = (Arc<T>[]) new Arc[0x80];
- readRootArcs(cachedRootArcs);
- cachedArcsBytesUsed += ramBytesUsed(cachedRootArcs);
-
- assert setAssertingRootArcs(cachedRootArcs);
- assert assertRootArcs();
- }
-
- public void readRootArcs(Arc<T>[] arcs) throws IOException {
+ // We should only be called once per FST:
+ assert cachedArcsBytesUsed == 0;
+
final Arc<T> arc = new Arc<>();
getFirstArc(arc);
- final BytesReader in = getBytesReader();
if (targetHasArcs(arc)) {
+ final BytesReader in = getBytesReader();
+ Arc<T>[] arcs = (Arc<T>[]) new Arc[0x80];
readFirstRealTargetArc(arc.target, arc, in);
+ int count = 0;
while(true) {
assert arc.label != END_LABEL;
- if (arc.label < cachedRootArcs.length) {
+ if (arc.label < arcs.length) {
arcs[arc.label] = new Arc<T>().copyFrom(arc);
} else {
break;
@@ -515,43 +508,19 @@ public final class FST<T> implements Acc
break;
}
readNextRealArc(arc, in);
+ count++;
+ }
+
+ int cacheRAM = (int) ramBytesUsed(arcs);
+
+ // Don't cache if there are only a few arcs or if the cache would use > 20% RAM of the FST itself:
+ if (count >= FIXED_ARRAY_NUM_ARCS_SHALLOW && cacheRAM < ramBytesUsed()/5) {
+ cachedRootArcs = arcs;
+ cachedArcsBytesUsed = cacheRAM;
}
}
}
- @SuppressWarnings({"rawtypes","unchecked"})
- private boolean setAssertingRootArcs(Arc<T>[] arcs) throws IOException {
- assertingCachedRootArcs = (Arc<T>[]) new Arc[arcs.length];
- readRootArcs(assertingCachedRootArcs);
- cachedArcsBytesUsed *= 2;
- return true;
- }
-
- private boolean assertRootArcs() {
- assert cachedRootArcs != null;
- assert assertingCachedRootArcs != null;
- for (int i = 0; i < cachedRootArcs.length; i++) {
- final Arc<T> root = cachedRootArcs[i];
- final Arc<T> asserting = assertingCachedRootArcs[i];
- if (root != null) {
- assert root.arcIdx == asserting.arcIdx;
- assert root.bytesPerArc == asserting.bytesPerArc;
- assert root.flags == asserting.flags;
- assert root.label == asserting.label;
- assert root.nextArc == asserting.nextArc;
- assert root.nextFinalOutput.equals(asserting.nextFinalOutput);
- assert root.node == asserting.node;
- assert root.numArcs == asserting.numArcs;
- assert root.output.equals(asserting.output);
- assert root.posArcsStart == asserting.posArcsStart;
- assert root.target == asserting.target;
- } else {
- assert root == null && asserting == null;
- }
- }
- return true;
- }
-
public T getEmptyOutput() {
return emptyOutput;
}
@@ -701,8 +670,8 @@ public final class FST<T> implements Acc
final boolean doFixedArray = shouldExpand(nodeIn);
if (doFixedArray) {
//System.out.println(" fixedArray");
- if (bytesPerArc.length < nodeIn.numArcs) {
- bytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)];
+ if (reusedBytesPerArc.length < nodeIn.numArcs) {
+ reusedBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)];
}
}
@@ -776,10 +745,10 @@ public final class FST<T> implements Acc
// but record how many bytes each one took, and max
// byte size:
if (doFixedArray) {
- bytesPerArc[arcIdx] = (int) (bytes.getPosition() - lastArcStart);
+ reusedBytesPerArc[arcIdx] = (int) (bytes.getPosition() - lastArcStart);
lastArcStart = bytes.getPosition();
- maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]);
- //System.out.println(" bytes=" + bytesPerArc[arcIdx]);
+ maxBytesPerArc = Math.max(maxBytesPerArc, reusedBytesPerArc[arcIdx]);
+ //System.out.println(" bytes=" + reusedBytesPerArc[arcIdx]);
}
}
@@ -830,12 +799,12 @@ public final class FST<T> implements Acc
bytes.skipBytes((int) (destPos - srcPos));
for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) {
destPos -= maxBytesPerArc;
- srcPos -= bytesPerArc[arcIdx];
+ srcPos -= reusedBytesPerArc[arcIdx];
//System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos);
if (srcPos != destPos) {
- //System.out.println(" copy len=" + bytesPerArc[arcIdx]);
- assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " bytesPerArc[arcIdx]=" + bytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs;
- bytes.copyBytes(srcPos, destPos, bytesPerArc[arcIdx]);
+ //System.out.println(" copy len=" + reusedBytesPerArc[arcIdx]);
+ assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs;
+ bytes.copyBytes(srcPos, destPos, reusedBytesPerArc[arcIdx]);
}
}
}
@@ -1185,12 +1154,48 @@ public final class FST<T> implements Acc
return arc;
}
+ // LUCENE-5152: called only from asserts, to validate that the
+ // non-cached arc lookup would produce the same result, to
+ // catch callers that illegally modify shared structures with
+ // the result (we shallow-clone the Arc itself, but e.g. a BytesRef
+ // output is still shared):
+ private boolean assertRootCachedArc(int label, Arc<T> cachedArc) throws IOException {
+ Arc<T> arc = new Arc<>();
+ getFirstArc(arc);
+ BytesReader in = getBytesReader();
+ Arc<T> result = findTargetArc(label, arc, arc, in, false);
+ if (result == null) {
+ assert cachedArc == null;
+ } else {
+ assert cachedArc != null;
+ assert cachedArc.arcIdx == result.arcIdx;
+ assert cachedArc.bytesPerArc == result.bytesPerArc;
+ assert cachedArc.flags == result.flags;
+ assert cachedArc.label == result.label;
+ assert cachedArc.nextArc == result.nextArc;
+ assert cachedArc.nextFinalOutput.equals(result.nextFinalOutput);
+ assert cachedArc.node == result.node;
+ assert cachedArc.numArcs == result.numArcs;
+ assert cachedArc.output.equals(result.output);
+ assert cachedArc.posArcsStart == result.posArcsStart;
+ assert cachedArc.target == result.target;
+ }
+
+ return true;
+ }
+
// TODO: could we somehow [partially] tableize arc lookups
- // look automaton?
+ // like automaton?
/** Finds an arc leaving the incoming arc, replacing the arc in place.
* This returns null if the arc was not found, else the incoming arc. */
public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
+ return findTargetArc(labelToMatch, follow, arc, in, true);
+ }
+
+ /** Finds an arc leaving the incoming arc, replacing the arc in place.
+ * This returns null if the arc was not found, else the incoming arc. */
+ private Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in, boolean useRootArcCache) throws IOException {
if (labelToMatch == END_LABEL) {
if (follow.isFinal()) {
@@ -1211,12 +1216,13 @@ public final class FST<T> implements Acc
}
// Short-circuit if this arc is in the root arc cache:
- if (follow.target == startNode && labelToMatch < cachedRootArcs.length) {
-
+ if (useRootArcCache && cachedRootArcs != null && follow.target == startNode && labelToMatch < cachedRootArcs.length) {
+ final Arc<T> result = cachedRootArcs[labelToMatch];
+
// LUCENE-5152: detect tricky cases where caller
// modified previously returned cached root-arcs:
- assert assertRootArcs();
- final Arc<T> result = cachedRootArcs[labelToMatch];
+ assert assertRootCachedArc(labelToMatch, result);
+
if (result == null) {
return null;
} else {
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1644473&r1=1644472&r2=1644473&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java Wed Dec 10 16:48:25 2014
@@ -1593,4 +1593,51 @@ public class TestFSTs extends LuceneTest
}
}
+ public void testIllegallyModifyRootArc() throws Exception {
+ assumeTrue("test relies on assertions", assertsAreEnabled);
+
+ Set<BytesRef> terms = new HashSet<>();
+ for(int i=0;i<100;i++) {
+ String prefix = Character.toString((char) ('a' + i));
+ terms.add(new BytesRef(prefix));
+ if (prefix.equals("m") == false) {
+ for(int j=0;j<20;j++) {
+ // Make a big enough FST that the root cache will be created:
+ String suffix = TestUtil.randomRealisticUnicodeString(random(), 10, 20);
+ terms.add(new BytesRef(prefix + suffix));
+ }
+ }
+ }
+
+ List<BytesRef> termsList = new ArrayList<>(terms);
+ Collections.sort(termsList);
+
+ ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+
+ IntsRefBuilder input = new IntsRefBuilder();
+ for(BytesRef term : termsList) {
+ Util.toIntsRef(term, input);
+ builder.add(input.get(), term);
+ }
+
+ FST<BytesRef> fst = builder.finish();
+
+ Arc<BytesRef> arc = new FST.Arc<>();
+ fst.getFirstArc(arc);
+ FST.BytesReader reader = fst.getBytesReader();
+ arc = fst.findTargetArc((int) 'm', arc, arc, reader);
+ assertNotNull(arc);
+ assertEquals(new BytesRef("m"), arc.output);
+
+ // NOTE: illegal:
+ arc.output.length = 0;
+
+ fst.getFirstArc(arc);
+ try {
+ arc = fst.findTargetArc((int) 'm', arc, arc, reader);
+ } catch (AssertionError ae) {
+ // expected
+ }
+ }
}