You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2019/02/19 17:52:31 UTC
[lucene-solr] branch master updated: LUCENE-8635: add option to
move FSTs off-heap,
and do so for the FST terms index in the default codec for non-primary-key
fields if MMapDirectory is being used
This is an automated email from the ASF dual-hosted git repository.
mikemccand pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new ec801b4 LUCENE-8635: add option to move FSTs off-heap, and do so for the FST terms index in the default codec for non-primary-key fields if MMapDirectory is being used
ec801b4 is described below
commit ec801b4c54194dc0d4893d227e2f2c9580c04ec6
Author: Mike McCandless <mi...@apache.org>
AuthorDate: Tue Feb 19 12:52:22 2019 -0500
LUCENE-8635: add option to move FSTs off-heap, and do so for the FST terms index in the default codec for non-primary-key fields if MMapDirectory is being used
---
lucene/CHANGES.txt | 6 ++
.../codecs/blocktree/BlockTreeTermsReader.java | 9 +-
.../lucene/codecs/blocktree/FieldReader.java | 11 ++-
.../apache/lucene/store/ByteBufferIndexInput.java | 2 +-
.../src/java/org/apache/lucene/util/fst/FST.java | 41 ++++-----
.../java/org/apache/lucene/util/fst/FSTStore.java | 30 +++++++
.../apache/lucene/util/fst/OffHeapFSTStore.java | 69 +++++++++++++++
.../org/apache/lucene/util/fst/OnHeapFSTStore.java | 97 ++++++++++++++++++++++
.../lucene/util/fst/ReverseRandomAccessReader.java | 63 ++++++++++++++
9 files changed, 293 insertions(+), 35 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7df6430..0184947 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -189,6 +189,12 @@ Changes in Runtime Behavior
is instead calculated as a function of the sloppy frequency of the matching
intervals. (Alan Woodward, Jim Ferenczi)
+* LUCENE-8635: FSTs can now remain off-heap, accessed via
+ IndexInput, and the default codec's term dictionary
+ (BlockTreeTermsReader) will now leave the FST for the terms index
+ off-heap for non-primary-key fields using MMapDirectory, reducing
+ heap usage for such fields. (Ankit Jain)
+
New Features
* LUCENE-8340: LongPoint#newDistanceFeatureQuery may be used to boost scores based on
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
index b0091fd..0076f14 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@@ -102,6 +102,8 @@ public final class BlockTreeTermsReader extends FieldsProducer {
// Open input to the main terms dict file (_X.tib)
final IndexInput termsIn;
+ // Open input to the terms index file (_X.tip)
+ final IndexInput indexIn;
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
@@ -118,7 +120,6 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Sole constructor. */
public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException {
boolean success = false;
- IndexInput indexIn = null;
this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;
@@ -197,13 +198,11 @@ public final class BlockTreeTermsReader extends FieldsProducer {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
-
- indexIn.close();
success = true;
} finally {
if (!success) {
// this.close() will close in:
- IOUtils.closeWhileHandlingException(indexIn, this);
+ IOUtils.closeWhileHandlingException(this);
}
}
}
@@ -237,7 +236,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
@Override
public void close() throws IOException {
try {
- IOUtils.close(termsIn, postingsReader);
+ IOUtils.close(indexIn, termsIn, postingsReader);
} finally {
// Clear so refs to terms index is GCable even if
// app hangs onto us:
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
index 46aee6e..41073fd 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
@@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteBufferIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@@ -34,6 +35,7 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.OffHeapFSTStore;
/**
* BlockTree's implementation of {@link Terms}.
@@ -86,9 +88,14 @@ public final class FieldReader extends Terms implements Accountable {
if (indexIn != null) {
final IndexInput clone = indexIn.clone();
- //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
- index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
+ // Initialize FST offheap if index is MMapDirectory and
+ // docCount != sumDocFreq implying field is not primary key
+ if (clone instanceof ByteBufferIndexInput && this.docCount != this.sumDocFreq) {
+ index = new FST<>(clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
+ } else {
+ index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
+ }
/*
if (false) {
diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBufferIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBufferIndexInput.java
index 0f6c733..3df89d7 100644
--- a/lucene/core/src/java/org/apache/lucene/store/ByteBufferIndexInput.java
+++ b/lucene/core/src/java/org/apache/lucene/store/ByteBufferIndexInput.java
@@ -33,7 +33,7 @@ import java.nio.ByteBuffer;
* For efficiency, this class requires that the buffers
* are a power-of-two (<code>chunkSizePower</code>).
*/
-abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput {
+public abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput {
protected final long length;
protected final long chunkSizeMask;
protected final int chunkSizePower;
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 8e84e3c..e5130bd 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -127,8 +127,7 @@ public final class FST<T> implements Accountable {
* GB then bytesArray is set instead. */
final BytesStore bytes;
- /** Used at read time when the FST fits into a single byte[]. */
- final byte[] bytesArray;
+ private final FSTStore fstStore;
private long startNode = -1;
@@ -238,7 +237,7 @@ public final class FST<T> implements Accountable {
this.inputType = inputType;
this.outputs = outputs;
version = VERSION_CURRENT;
- bytesArray = null;
+ fstStore = null;
bytes = new BytesStore(bytesPageBits);
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
@@ -251,18 +250,16 @@ public final class FST<T> implements Accountable {
/** Load a previously saved FST. */
public FST(DataInput in, Outputs<T> outputs) throws IOException {
- this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
+ this(in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
}
/** Load a previously saved FST; maxBlockBits allows you to
* control the size of the byte[] pages used to hold the FST bytes. */
- public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException {
+ public FST(DataInput in, Outputs<T> outputs, FSTStore fstStore) throws IOException {
+ bytes = null;
+ this.fstStore = fstStore;
this.outputs = outputs;
- if (maxBlockBits < 1 || maxBlockBits > 30) {
- throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
- }
-
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
@@ -302,17 +299,7 @@ public final class FST<T> implements Accountable {
startNode = in.readVLong();
long numBytes = in.readVLong();
- if (numBytes > 1 << maxBlockBits) {
- // FST is big: we need multiple pages
- bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
- bytesArray = null;
- } else {
- // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
- bytes = null;
- bytesArray = new byte[(int) numBytes];
- in.readBytes(bytesArray, 0, bytesArray.length);
- }
-
+ this.fstStore.init(in, numBytes);
cacheRootArcs();
}
@@ -344,11 +331,12 @@ public final class FST<T> implements Accountable {
@Override
public long ramBytesUsed() {
long size = BASE_RAM_BYTES_USED;
- if (bytesArray != null) {
- size += bytesArray.length;
+ if (this.fstStore != null) {
+ size += this.fstStore.ramBytesUsed();
} else {
size += bytes.ramBytesUsed();
}
+
size += cachedArcsBytesUsed;
return size;
}
@@ -465,9 +453,8 @@ public final class FST<T> implements Accountable {
out.writeVLong(numBytes);
bytes.writeTo(out);
} else {
- assert bytesArray != null;
- out.writeVLong(bytesArray.length);
- out.writeBytes(bytesArray, 0, bytesArray.length);
+ assert fstStore != null;
+ fstStore.writeTo(out);
}
}
@@ -1137,8 +1124,8 @@ public final class FST<T> implements Accountable {
/** Returns a {@link BytesReader} for this FST, positioned at
* position 0. */
public BytesReader getBytesReader() {
- if (bytesArray != null) {
- return new ReverseBytesReader(bytesArray);
+ if (this.fstStore != null) {
+ return this.fstStore.getReverseBytesReader();
} else {
return bytes.getReverseReader();
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java
new file mode 100644
index 0000000..0cd1340
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.Accountable;
+
+import java.io.IOException;
+
+/** Abstraction for reading/writing bytes necessary for FST. */
+public interface FSTStore extends Accountable {
+ void init(DataInput in, long numBytes) throws IOException;
+ FST.BytesReader getReverseBytesReader();
+ void writeTo(DataOutput out) throws IOException;
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java
new file mode 100644
index 0000000..88c8884
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.io.IOException;
+
+/** Provides off heap storage of finite state machine (FST),
+ * using underlying index input instead of byte store on heap
+ *
+ * @lucene.experimental
+ */
+public final class OffHeapFSTStore implements FSTStore {
+
+ private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OffHeapFSTStore.class);
+
+ private IndexInput in;
+ private long offset;
+ private long numBytes;
+
+ @Override
+ public void init(DataInput in, long numBytes) throws IOException {
+ if (in instanceof IndexInput) {
+ this.in = (IndexInput) in;
+ this.numBytes = numBytes;
+ this.offset = this.in.getFilePointer();
+ } else {
+ throw new IllegalArgumentException("parameter:in should be an instance of IndexInput for using OffHeapFSTStore, not a "
+ + in.getClass().getName());
+ }
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return BASE_RAM_BYTES_USED;
+ }
+
+ @Override
+ public FST.BytesReader getReverseBytesReader() {
+ try {
+ return new ReverseRandomAccessReader(in.randomAccessSlice(offset, numBytes));
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void writeTo(DataOutput out) throws IOException {
+ throw new UnsupportedOperationException("writeToOutput operation is not supported for OffHeapFSTStore");
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java
new file mode 100644
index 0000000..7dc09b9
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.io.IOException;
+
+/** Provides storage of finite state machine (FST),
+ * using byte array or byte store allocated on heap.
+ *
+ * @lucene.experimental
+ */
+public final class OnHeapFSTStore implements FSTStore {
+
+ private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class);
+
+ /** A {@link BytesStore}, used during building, or during reading when
+ * the FST is very large (more than 1 GB). If the FST is less than 1
+ * GB then bytesArray is set instead. */
+ private BytesStore bytes;
+
+ /** Used at read time when the FST fits into a single byte[]. */
+ private byte[] bytesArray;
+
+ private final int maxBlockBits;
+
+ public OnHeapFSTStore(int maxBlockBits) {
+ if (maxBlockBits < 1 || maxBlockBits > 30) {
+ throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
+ }
+
+ this.maxBlockBits = maxBlockBits;
+ }
+
+ @Override
+ public void init(DataInput in, long numBytes) throws IOException {
+ if (numBytes > 1 << this.maxBlockBits) {
+ // FST is big: we need multiple pages
+ bytes = new BytesStore(in, numBytes, 1<<this.maxBlockBits);
+ } else {
+ // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
+ bytesArray = new byte[(int) numBytes];
+ in.readBytes(bytesArray, 0, bytesArray.length);
+ }
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ long size = BASE_RAM_BYTES_USED;
+ if (bytesArray != null) {
+ size += bytesArray.length;
+ } else {
+ size += bytes.ramBytesUsed();
+ }
+
+ return size;
+ }
+
+ @Override
+ public FST.BytesReader getReverseBytesReader() {
+ if (bytesArray != null) {
+ return new ReverseBytesReader(bytesArray);
+ } else {
+ return bytes.getReverseReader();
+ }
+ }
+
+ @Override
+ public void writeTo(DataOutput out) throws IOException {
+ if (bytes != null) {
+ long numBytes = bytes.getPosition();
+ out.writeVLong(numBytes);
+ bytes.writeTo(out);
+ } else {
+ assert bytesArray != null;
+ out.writeVLong(bytesArray.length);
+ out.writeBytes(bytesArray, 0, bytesArray.length);
+ }
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReverseRandomAccessReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReverseRandomAccessReader.java
new file mode 100644
index 0000000..210ccee
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReverseRandomAccessReader.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import java.io.IOException;
+import org.apache.lucene.store.RandomAccessInput;
+
+/** Implements reverse read from a RandomAccessInput. */
+final class ReverseRandomAccessReader extends FST.BytesReader {
+ private final RandomAccessInput in;
+ private long pos;
+
+ public ReverseRandomAccessReader(RandomAccessInput in) {
+ this.in = in;
+ }
+
+ @Override
+ public byte readByte() throws IOException {
+ return in.readByte(pos--);
+ }
+
+ @Override
+ public void readBytes(byte[] b, int offset, int len) throws IOException {
+ int i = offset, end = offset + len;
+ while (i < end) {
+ b[i++] = in.readByte(pos--);
+ }
+ }
+
+ @Override
+ public void skipBytes(long count) {
+ pos -= count;
+ }
+
+ @Override
+ public long getPosition() {
+ return pos;
+ }
+
+ @Override
+ public void setPosition(long pos) {
+ this.pos = pos;
+ }
+
+ @Override
+ public boolean reversed() {
+ return true;
+ }
+}