You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/01/31 22:51:58 UTC
svn commit: r1441213 - in /lucene/dev/trunk/lucene: CHANGES.txt
core/src/java/org/apache/lucene/util/fst/BytesStore.java
core/src/java/org/apache/lucene/util/fst/FST.java
core/src/test/org/apache/lucene/util/fst/Test2BFST.java
Author: mikemccand
Date: Thu Jan 31 21:51:58 2013
New Revision: 1441213
URL: http://svn.apache.org/viewvc?rev=1441213&view=rev
Log:
LUCENE-4739: fix FST.save/load to work with > 1.1 GB FSTs
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1441213&r1=1441212&r2=1441213&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Jan 31 21:51:58 2013
@@ -123,6 +123,9 @@ Bug Fixes
* LUCENE-4732: Fixed TermsEnum.seekCeil/seekExact on term vectors.
(Adrien Grand, Robert Muir)
+* LUCENE-4739: Fixed bugs that prevented FSTs more than ~1.1GB from
+ being saved and loaded (Adrien Grand, Mike McCandless)
+
======================= Lucene 4.1.0 =======================
Changes in backwards compatibility policy
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java?rev=1441213&r1=1441212&r2=1441213&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java Thu Jan 31 21:51:58 2013
@@ -46,7 +46,7 @@ class BytesStore extends DataOutput {
}
/** Pulls bytes from the provided IndexInput. */
- public BytesStore(DataInput in, int numBytes, int maxBlockSize) throws IOException {
+ public BytesStore(DataInput in, long numBytes, int maxBlockSize) throws IOException {
int blockSize = 2;
int blockBits = 1;
while(blockSize < numBytes && blockSize < maxBlockSize) {
@@ -56,9 +56,9 @@ class BytesStore extends DataOutput {
this.blockBits = blockBits;
this.blockSize = blockSize;
this.blockMask = blockSize-1;
- int left = numBytes;
+ long left = numBytes;
while(left > 0) {
- final int chunk = Math.min(blockSize, left);
+ final int chunk = (int) Math.min(blockSize, left);
byte[] block = new byte[chunk];
in.readBytes(block, 0, block.length);
blocks.add(block);
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1441213&r1=1441212&r2=1441213&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Thu Jan 31 21:51:58 2013
@@ -27,11 +27,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
-/*
-import java.io.Writer;
-import java.io.OutputStreamWriter;
-import java.io.FileOutputStream;
-*/
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ByteArrayDataOutput;
@@ -41,12 +36,15 @@ import org.apache.lucene.store.InputStre
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Constants;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.fst.Builder.UnCompiledNode;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
+//import java.io.Writer;
+//import java.io.OutputStreamWriter;
// TODO: break this into WritableFST and ReadOnlyFST.. then
// we can have subclasses of ReadOnlyFST to handle the
@@ -276,7 +274,6 @@ public final class FST<T> {
this.outputs = outputs;
this.allowArrayArcs = allowArrayArcs;
version = VERSION_CURRENT;
- // 32 KB blocks:
bytes = new BytesStore(bytesPageBits);
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
@@ -295,9 +292,22 @@ public final class FST<T> {
nodeRefToAddress = null;
}
+ public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
+
/** Load a previously saved FST. */
public FST(DataInput in, Outputs<T> outputs) throws IOException {
+ this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
+ }
+
+ /** Load a previously saved FST; maxBlockBits allows you to
+ * control the size of the byte[] pages used to hold the FST bytes. */
+ public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException {
this.outputs = outputs;
+
+ if (maxBlockBits < 1 || maxBlockBits > 30) {
+ throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
+ }
+
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
@@ -345,13 +355,13 @@ public final class FST<T> {
} else {
nodeRefToAddress = null;
}
- startNode = in.readVInt();
+ startNode = in.readVLong();
nodeCount = in.readVLong();
arcCount = in.readVLong();
arcWithOutputCount = in.readVLong();
- int numBytes = in.readVInt();
- bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE);
+ long numBytes = in.readVLong();
+ bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
NO_OUTPUT = outputs.getNoOutput();
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java?rev=1441213&r1=1441212&r2=1441213&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java Thu Jan 31 21:51:58 2013
@@ -20,10 +20,16 @@ package org.apache.lucene.util.fst;
import java.util.Arrays;
import java.util.Random;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TimeUnits;
+import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.packed.PackedInts;
import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
@@ -39,6 +45,8 @@ public class Test2BFST extends LuceneTes
IntsRef input = new IntsRef(ints, 0, ints.length);
long seed = random().nextLong();
+ Directory dir = new MMapDirectory(_TestUtil.getTempDir("2BFST"));
+
for(int doPackIter=0;doPackIter<2;doPackIter++) {
boolean doPack = doPackIter == 1;
@@ -72,42 +80,56 @@ public class Test2BFST extends LuceneTes
FST<Object> fst = b.finish();
- System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
+ for(int verify=0;verify<2;verify++) {
+ System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
- Arrays.fill(ints2, 0);
- r = new Random(seed);
+ Arrays.fill(ints2, 0);
+ r = new Random(seed);
- for(int i=0;i<count;i++) {
- if (i % 1000000 == 0) {
- System.out.println(i + "...: ");
- }
- for(int j=10;j<ints2.length;j++) {
- ints2[j] = r.nextInt(256);
+ for(int i=0;i<count;i++) {
+ if (i % 1000000 == 0) {
+ System.out.println(i + "...: ");
+ }
+ for(int j=10;j<ints2.length;j++) {
+ ints2[j] = r.nextInt(256);
+ }
+ assertEquals(NO_OUTPUT, Util.get(fst, input2));
+ nextInput(r, ints2);
+ }
+
+ System.out.println("\nTEST: enum all input/outputs");
+ IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
+
+ Arrays.fill(ints2, 0);
+ r = new Random(seed);
+ int upto = 0;
+ while(true) {
+ IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
+ if (pair == null) {
+ break;
+ }
+ for(int j=10;j<ints2.length;j++) {
+ ints2[j] = r.nextInt(256);
+ }
+ assertEquals(input2, pair.input);
+ assertEquals(NO_OUTPUT, pair.output);
+ upto++;
+ nextInput(r, ints2);
+ }
+ assertEquals(count, upto);
+
+ if (verify == 0) {
+ System.out.println("\nTEST: save/load FST and re-verify");
+ IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
+ fst.save(out);
+ out.close();
+ IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
+ fst = new FST<Object>(in, outputs);
+ in.close();
+ } else {
+ dir.deleteFile("fst");
}
- assertEquals(NO_OUTPUT, Util.get(fst, input2));
- nextInput(r, ints2);
}
-
- System.out.println("\nTEST: enum all input/outputs");
- IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
-
- Arrays.fill(ints2, 0);
- r = new Random(seed);
- int upto = 0;
- while(true) {
- IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
- if (pair == null) {
- break;
- }
- for(int j=10;j<ints2.length;j++) {
- ints2[j] = r.nextInt(256);
- }
- assertEquals(input2, pair.input);
- assertEquals(NO_OUTPUT, pair.output);
- upto++;
- nextInput(r, ints2);
- }
- assertEquals(count, upto);
}
// Build FST w/ ByteSequenceOutputs and stop when FST
@@ -138,39 +160,53 @@ public class Test2BFST extends LuceneTes
}
FST<BytesRef> fst = b.finish();
+ for(int verify=0;verify<2;verify++) {
- System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
+ System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
- r = new Random(seed);
- Arrays.fill(ints, 0);
+ r = new Random(seed);
+ Arrays.fill(ints, 0);
- for(int i=0;i<count;i++) {
- if (i % 1000000 == 0) {
- System.out.println(i + "...: ");
+ for(int i=0;i<count;i++) {
+ if (i % 1000000 == 0) {
+ System.out.println(i + "...: ");
+ }
+ r.nextBytes(outputBytes);
+ assertEquals(output, Util.get(fst, input));
+ nextInput(r, ints);
+ }
+
+ System.out.println("\nTEST: enum all input/outputs");
+ IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
+
+ Arrays.fill(ints, 0);
+ r = new Random(seed);
+ int upto = 0;
+ while(true) {
+ IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
+ if (pair == null) {
+ break;
+ }
+ assertEquals(input, pair.input);
+ r.nextBytes(outputBytes);
+ assertEquals(output, pair.output);
+ upto++;
+ nextInput(r, ints);
+ }
+ assertEquals(count, upto);
+
+ if (verify == 0) {
+ System.out.println("\nTEST: save/load FST and re-verify");
+ IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
+ fst.save(out);
+ out.close();
+ IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
+ fst = new FST<BytesRef>(in, outputs);
+ in.close();
+ } else {
+ dir.deleteFile("fst");
}
- r.nextBytes(outputBytes);
- assertEquals(output, Util.get(fst, input));
- nextInput(r, ints);
}
-
- System.out.println("\nTEST: enum all input/outputs");
- IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
-
- Arrays.fill(ints, 0);
- r = new Random(seed);
- int upto = 0;
- while(true) {
- IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
- if (pair == null) {
- break;
- }
- assertEquals(input, pair.input);
- r.nextBytes(outputBytes);
- assertEquals(output, pair.output);
- upto++;
- nextInput(r, ints);
- }
- assertEquals(count, upto);
}
// Build FST w/ PositiveIntOutputs and stop when FST
@@ -202,46 +238,62 @@ public class Test2BFST extends LuceneTes
FST<Long> fst = b.finish();
- System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
-
- Arrays.fill(ints, 0);
-
- output = 1;
- r = new Random(seed);
- for(int i=0;i<count;i++) {
- if (i % 1000000 == 0) {
- System.out.println(i + "...: ");
- }
+ for(int verify=0;verify<2;verify++) {
- // forward lookup:
- assertEquals(output, Util.get(fst, input).longValue());
- // reverse lookup:
- assertEquals(input, Util.getByOutput(fst, output));
- output += 1 + r.nextInt(10);
- nextInput(r, ints);
- }
+ System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
- System.out.println("\nTEST: enum all input/outputs");
- IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
+ Arrays.fill(ints, 0);
- Arrays.fill(ints, 0);
- r = new Random(seed);
- int upto = 0;
- output = 1;
- while(true) {
- IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
- if (pair == null) {
- break;
+ output = 1;
+ r = new Random(seed);
+ for(int i=0;i<count;i++) {
+ if (i % 1000000 == 0) {
+ System.out.println(i + "...: ");
+ }
+
+ // forward lookup:
+ assertEquals(output, Util.get(fst, input).longValue());
+ // reverse lookup:
+ assertEquals(input, Util.getByOutput(fst, output));
+ output += 1 + r.nextInt(10);
+ nextInput(r, ints);
+ }
+
+ System.out.println("\nTEST: enum all input/outputs");
+ IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
+
+ Arrays.fill(ints, 0);
+ r = new Random(seed);
+ int upto = 0;
+ output = 1;
+ while(true) {
+ IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
+ if (pair == null) {
+ break;
+ }
+ assertEquals(input, pair.input);
+ assertEquals(output, pair.output.longValue());
+ output += 1 + r.nextInt(10);
+ upto++;
+ nextInput(r, ints);
+ }
+ assertEquals(count, upto);
+
+ if (verify == 0) {
+ System.out.println("\nTEST: save/load FST and re-verify");
+ IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
+ fst.save(out);
+ out.close();
+ IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
+ fst = new FST<Long>(in, outputs);
+ in.close();
+ } else {
+ dir.deleteFile("fst");
}
- assertEquals(input, pair.input);
- assertEquals(output, pair.output.longValue());
- output += 1 + r.nextInt(10);
- upto++;
- nextInput(r, ints);
}
- assertEquals(count, upto);
}
}
+ dir.close();
}
private void nextInput(Random r, int[] ints) {