You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2019/02/19 17:57:09 UTC

[lucene-solr] branch branch_8x updated: LUCENE-8635: add option to move FSTs off-heap, and do so for the FST terms index in the default codec for non-primary-key fields if MMapDirectory is being used

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 10d5e93  LUCENE-8635: add option to move FSTs off-heap, and do so for the FST terms index in the default codec for non-primary-key fields if MMapDirectory is being used
10d5e93 is described below

commit 10d5e935e22256670940f33b96229cdb8da9f6a8
Author: Mike McCandless <mi...@apache.org>
AuthorDate: Tue Feb 19 12:52:22 2019 -0500

    LUCENE-8635: add option to move FSTs off-heap, and do so for the FST terms index in the default codec for non-primary-key fields if MMapDirectory is being used
---
 lucene/CHANGES.txt                                 |  6 ++
 .../codecs/blocktree/BlockTreeTermsReader.java     |  9 +-
 .../lucene/codecs/blocktree/FieldReader.java       | 11 ++-
 .../apache/lucene/store/ByteBufferIndexInput.java  |  2 +-
 .../src/java/org/apache/lucene/util/fst/FST.java   | 41 ++++-----
 .../java/org/apache/lucene/util/fst/FSTStore.java  | 30 +++++++
 .../apache/lucene/util/fst/OffHeapFSTStore.java    | 69 +++++++++++++++
 .../org/apache/lucene/util/fst/OnHeapFSTStore.java | 97 ++++++++++++++++++++++
 .../lucene/util/fst/ReverseRandomAccessReader.java | 63 ++++++++++++++
 9 files changed, 293 insertions(+), 35 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index eb7bb58..081208f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -177,6 +177,12 @@ Changes in Runtime Behavior
   is instead calculated as a function of the sloppy frequency of the matching
   intervals. (Alan Woodward, Jim Ferenczi)
 
+* LUCENE-8635: FSTs can now remain off-heap, accessed via
+  IndexInput, and the default codec's term dictionary
+  (BlockTreeTermsReader) will now leave the FST for the terms index
+  off-heap for non-primary-key fields using MMapDirectory, reducing
+  heap usage for such fields. (Ankit Jain)
+
 New Features
 
 * LUCENE-8340: LongPoint#newDistanceFeatureQuery may be used to boost scores based on
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
index b0091fd..0076f14 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@@ -102,6 +102,8 @@ public final class BlockTreeTermsReader extends FieldsProducer {
 
   // Open input to the main terms dict file (_X.tib)
   final IndexInput termsIn;
+  // Open input to the terms index file (_X.tip)
+  final IndexInput indexIn;
 
   //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
 
@@ -118,7 +120,6 @@ public final class BlockTreeTermsReader extends FieldsProducer {
   /** Sole constructor. */
   public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException {
     boolean success = false;
-    IndexInput indexIn = null;
     
     this.postingsReader = postingsReader;
     this.segment = state.segmentInfo.name;
@@ -197,13 +198,11 @@ public final class BlockTreeTermsReader extends FieldsProducer {
           throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
         }
       }
-      
-      indexIn.close();
       success = true;
     } finally {
       if (!success) {
         // this.close() will close in:
-        IOUtils.closeWhileHandlingException(indexIn, this);
+        IOUtils.closeWhileHandlingException(this);
       }
     }
   }
@@ -237,7 +236,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
   @Override
   public void close() throws IOException {
     try {
-      IOUtils.close(termsIn, postingsReader);
+      IOUtils.close(indexIn, termsIn, postingsReader);
     } finally { 
       // Clear so refs to terms index is GCable even if
       // app hangs onto us:
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
index 46aee6e..41073fd 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
@@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteBufferIndexInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Accountables;
@@ -34,6 +35,7 @@ import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.OffHeapFSTStore;
 
 /**
  * BlockTree's implementation of {@link Terms}.
@@ -86,9 +88,14 @@ public final class FieldReader extends Terms implements Accountable {
 
     if (indexIn != null) {
       final IndexInput clone = indexIn.clone();
-      //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
       clone.seek(indexStartFP);
-      index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
+      // Initialize FST offheap if index is MMapDirectory and
+      // docCount != sumDocFreq implying field is not primary key
+      if (clone instanceof ByteBufferIndexInput && this.docCount != this.sumDocFreq) {
+        index = new FST<>(clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
+      } else {
+        index = new FST<>(clone, ByteSequenceOutputs.getSingleton());
+      }
         
       /*
         if (false) {
diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBufferIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBufferIndexInput.java
index 0f6c733..3df89d7 100644
--- a/lucene/core/src/java/org/apache/lucene/store/ByteBufferIndexInput.java
+++ b/lucene/core/src/java/org/apache/lucene/store/ByteBufferIndexInput.java
@@ -33,7 +33,7 @@ import java.nio.ByteBuffer;
  * For efficiency, this class requires that the buffers
  * are a power-of-two (<code>chunkSizePower</code>).
  */
-abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput {
+public abstract class ByteBufferIndexInput extends IndexInput implements RandomAccessInput {
   protected final long length;
   protected final long chunkSizeMask;
   protected final int chunkSizePower;
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 92973f6..5a5169b 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -127,8 +127,7 @@ public final class FST<T> implements Accountable {
    *  GB then bytesArray is set instead. */
   final BytesStore bytes;
 
-  /** Used at read time when the FST fits into a single byte[]. */
-  final byte[] bytesArray;
+  private final FSTStore fstStore;
 
   private long startNode = -1;
 
@@ -238,7 +237,7 @@ public final class FST<T> implements Accountable {
     this.inputType = inputType;
     this.outputs = outputs;
     version = VERSION_CURRENT;
-    bytesArray = null;
+    fstStore = null;
     bytes = new BytesStore(bytesPageBits);
     // pad: ensure no node gets address 0 which is reserved to mean
     // the stop state w/ no arcs
@@ -251,18 +250,16 @@ public final class FST<T> implements Accountable {
 
   /** Load a previously saved FST. */
   public FST(DataInput in, Outputs<T> outputs) throws IOException {
-    this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
+    this(in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
   }
 
   /** Load a previously saved FST; maxBlockBits allows you to
    *  control the size of the byte[] pages used to hold the FST bytes. */
-  public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException {
+  public FST(DataInput in, Outputs<T> outputs, FSTStore fstStore) throws IOException {
+    bytes = null;
+    this.fstStore = fstStore;
     this.outputs = outputs;
 
-    if (maxBlockBits < 1 || maxBlockBits > 30) {
-      throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
-    }
-
     // NOTE: only reads most recent format; we don't have
     // back-compat promise for FSTs (they are experimental):
     version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
@@ -302,17 +299,7 @@ public final class FST<T> implements Accountable {
     startNode = in.readVLong();
 
     long numBytes = in.readVLong();
-    if (numBytes > 1 << maxBlockBits) {
-      // FST is big: we need multiple pages
-      bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
-      bytesArray = null;
-    } else {
-      // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
-      bytes = null;
-      bytesArray = new byte[(int) numBytes];
-      in.readBytes(bytesArray, 0, bytesArray.length);
-    }
-    
+    this.fstStore.init(in, numBytes);
     cacheRootArcs();
   }
 
@@ -344,11 +331,12 @@ public final class FST<T> implements Accountable {
   @Override
   public long ramBytesUsed() {
     long size = BASE_RAM_BYTES_USED;
-    if (bytesArray != null) {
-      size += bytesArray.length;
+    if (this.fstStore != null) {
+      size += this.fstStore.ramBytesUsed();
     } else {
       size += bytes.ramBytesUsed();
     }
+
     size += cachedArcsBytesUsed;
     return size;
   }
@@ -467,9 +455,8 @@ public final class FST<T> implements Accountable {
       out.writeVLong(numBytes);
       bytes.writeTo(out);
     } else {
-      assert bytesArray != null;
-      out.writeVLong(bytesArray.length);
-      out.writeBytes(bytesArray, 0, bytesArray.length);
+      assert fstStore != null;
+      fstStore.writeTo(out);
     }
   }
   
@@ -1139,8 +1126,8 @@ public final class FST<T> implements Accountable {
   /** Returns a {@link BytesReader} for this FST, positioned at
    *  position 0. */
   public BytesReader getBytesReader() {
-    if (bytesArray != null) {
-      return new ReverseBytesReader(bytesArray);
+    if (this.fstStore != null) {
+      return this.fstStore.getReverseBytesReader();
     } else {
       return bytes.getReverseReader();
     }
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java
new file mode 100644
index 0000000..0cd1340
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTStore.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.Accountable;
+
+import java.io.IOException;
+
+/** Abstraction for reading/writing bytes necessary for FST. */
+public interface FSTStore extends Accountable {
+    void init(DataInput in, long numBytes) throws IOException;
+    FST.BytesReader getReverseBytesReader();
+    void writeTo(DataOutput out) throws IOException;
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java
new file mode 100644
index 0000000..88c8884
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/OffHeapFSTStore.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.io.IOException;
+
+/** Provides off heap storage of finite state machine (FST),
+ *  using underlying index input instead of byte store on heap
+ *
+ * @lucene.experimental
+ */
+public final class OffHeapFSTStore implements FSTStore {
+
+    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OffHeapFSTStore.class);
+
+    private IndexInput in;
+    private long offset;
+    private long numBytes;
+
+    @Override
+    public void init(DataInput in, long numBytes) throws IOException {
+        if (in instanceof IndexInput) {
+            this.in = (IndexInput) in;
+            this.numBytes = numBytes;
+            this.offset = this.in.getFilePointer();
+        } else {
+            throw new IllegalArgumentException("parameter:in should be an instance of IndexInput for using OffHeapFSTStore, not a "
+                                               + in.getClass().getName());
+        }
+    }
+
+    @Override
+    public long ramBytesUsed() {
+        return BASE_RAM_BYTES_USED;
+    }
+
+    @Override
+    public FST.BytesReader getReverseBytesReader() {
+        try {
+            return new ReverseRandomAccessReader(in.randomAccessSlice(offset, numBytes));
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    public void writeTo(DataOutput out) throws IOException {
+        throw new UnsupportedOperationException("writeToOutput operation is not supported for OffHeapFSTStore");
+    }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java
new file mode 100644
index 0000000..7dc09b9
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/OnHeapFSTStore.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.io.IOException;
+
+/** Provides storage of finite state machine (FST),
+ *  using byte array or byte store allocated on heap.
+ *
+ * @lucene.experimental
+ */
+public final class OnHeapFSTStore implements FSTStore {
+
+    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class);
+
+    /** A {@link BytesStore}, used during building, or during reading when
+     *  the FST is very large (more than 1 GB).  If the FST is less than 1
+     *  GB then bytesArray is set instead. */
+    private BytesStore bytes;
+
+    /** Used at read time when the FST fits into a single byte[]. */
+    private byte[] bytesArray;
+
+    private final int maxBlockBits;
+
+    public OnHeapFSTStore(int maxBlockBits) {
+        if (maxBlockBits < 1 || maxBlockBits > 30) {
+            throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
+        }
+
+        this.maxBlockBits = maxBlockBits;
+    }
+
+    @Override
+    public void init(DataInput in, long numBytes) throws IOException {
+        if (numBytes > 1 << this.maxBlockBits) {
+            // FST is big: we need multiple pages
+            bytes = new BytesStore(in, numBytes, 1<<this.maxBlockBits);
+        } else {
+            // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead
+            bytesArray = new byte[(int) numBytes];
+            in.readBytes(bytesArray, 0, bytesArray.length);
+        }
+    }
+
+    @Override
+    public long ramBytesUsed() {
+        long size = BASE_RAM_BYTES_USED;
+        if (bytesArray != null) {
+            size += bytesArray.length;
+        } else {
+            size += bytes.ramBytesUsed();
+        }
+
+        return size;
+    }
+
+    @Override
+    public FST.BytesReader getReverseBytesReader() {
+        if (bytesArray != null) {
+            return new ReverseBytesReader(bytesArray);
+        } else {
+            return bytes.getReverseReader();
+        }
+    }
+
+    @Override
+    public void writeTo(DataOutput out) throws IOException {
+        if (bytes != null) {
+            long numBytes = bytes.getPosition();
+            out.writeVLong(numBytes);
+            bytes.writeTo(out);
+        } else {
+            assert bytesArray != null;
+            out.writeVLong(bytesArray.length);
+            out.writeBytes(bytesArray, 0, bytesArray.length);
+        }
+    }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/ReverseRandomAccessReader.java b/lucene/core/src/java/org/apache/lucene/util/fst/ReverseRandomAccessReader.java
new file mode 100644
index 0000000..210ccee
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/ReverseRandomAccessReader.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.fst;
+
+import java.io.IOException;
+import org.apache.lucene.store.RandomAccessInput;
+
+/** Implements reverse read from a RandomAccessInput. */
+final class ReverseRandomAccessReader extends FST.BytesReader {
+    private final RandomAccessInput in;
+    private long pos;
+
+    public ReverseRandomAccessReader(RandomAccessInput in) {
+        this.in = in;
+    }
+
+    @Override
+    public byte readByte() throws IOException {
+        return in.readByte(pos--);
+    }
+
+    @Override
+    public void readBytes(byte[] b, int offset, int len) throws IOException {
+        int i = offset, end = offset + len;
+        while (i < end) {
+            b[i++] = in.readByte(pos--);
+        }
+    }
+
+    @Override
+    public void skipBytes(long count) {
+        pos -= count;
+    }
+
+    @Override
+    public long getPosition() {
+        return pos;
+    }
+
+    @Override
+    public void setPosition(long pos) {
+        this.pos = pos;
+    }
+
+    @Override
+    public boolean reversed() {
+        return true;
+    }
+}