Posted to commits@lucene.apache.org by jp...@apache.org on 2021/06/18 09:32:52 UTC

[lucene-solr] branch branch_8x updated: LUCENE-9996: Reduce RAM usage of DWPT for a single document. (#184)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new e12bd96  LUCENE-9996: Reduce RAM usage of DWPT for a single document. (#184)
e12bd96 is described below

commit e12bd964866348f95832a52c92fedb39ec500a90
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Fri Jun 18 11:21:03 2021 +0200

    LUCENE-9996: Reduce RAM usage of DWPT for a single document. (#184)
    
    With this change, doc-value terms dictionaries use a shared `ByteBlockPool`
    across all fields, and points, binary doc values and doc-value ordinals use
    slightly smaller page sizes.
---
 .../apache/lucene/index/BinaryDocValuesWriter.java |  4 +-
 .../apache/lucene/index/DefaultIndexingChain.java  | 11 ++--
 .../org/apache/lucene/index/PointValuesWriter.java | 23 ++++---
 .../apache/lucene/index/SortedDocValuesWriter.java |  5 +-
 .../lucene/index/SortedSetDocValuesWriter.java     |  5 +-
 .../java/org/apache/lucene/util/ByteBlockPool.java |  8 ---
 .../java/org/apache/lucene/util/PagedBytes.java    | 13 +++-
 .../lucene/util/packed/PackedLongValues.java       |  2 +-
 .../apache/lucene/index/TestDocumentWriter.java    | 74 +++++++++++++++++++++-
 .../org/apache/lucene/util/TestByteBlockPool.java  |  8 +--
 .../org/apache/lucene/util/TestPagedBytes.java     |  1 +
 11 files changed, 113 insertions(+), 41 deletions(-)
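
For readers skimming the diff: the central idea is that each per-field
doc-values writer used to allocate a private ByteBlockPool (at least one
32 KB buffer per field), and now every field's terms dictionary appends
into a single pool owned by the indexing chain. Below is a minimal
standalone sketch of that sharing pattern, using only public lucene-core
utility classes; the class and variable names are illustrative, not
Lucene's own:

    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefHash;
    import org.apache.lucene.util.Counter;

    public class SharedPoolSketch {
      public static void main(String[] args) {
        Counter bytesUsed = Counter.newCounter();
        // One pool for all per-field terms dictionaries, instead of one
        // pool (and one up-front 32 KB buffer) per field:
        ByteBlockPool sharedPool =
            new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(bytesUsed));
        BytesRefHash field1Terms = new BytesRefHash(sharedPool);
        BytesRefHash field2Terms = new BytesRefHash(sharedPool);
        field1Terms.add(new BytesRef("lucene"));
        field2Terms.add(new BytesRef("solr"));
        // Both hashes account their bytes against the same counter.
        System.out.println("tracked bytes: " + bytesUsed.get());
      }
    }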

diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
index fe9cfaf..881778a 100644
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@@ -41,8 +41,8 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
   /** Maximum length for a binary field. */
   private static final int MAX_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;
 
-  // 32 KB block sizes for PagedBytes storage:
-  private final static int BLOCK_BITS = 15;
+  // 4 KB block sizes for PagedBytes storage:
+  private final static int BLOCK_BITS = 12;
 
   private final PagedBytes bytes;
   private final DataOutput bytesOut;
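
Since the PagedBytes block size is 1 << BLOCK_BITS, this hunk shrinks the
first allocation made by each binary doc-values field by a factor of 8:

    1 << 15 = 32768 bytes (32 KB per block, before)
    1 << 12 =  4096 bytes ( 4 KB per block, after)
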
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index 3ea577a..edfb08f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -65,6 +65,8 @@ final class DefaultIndexingChain extends DocConsumer {
 
   // Writes postings and term vectors:
   final TermsHash termsHash;
+  // Shared pool for doc-value terms
+  final ByteBlockPool docValuesBytePool;
   // Writes stored fields
   final StoredFieldsConsumer storedFieldsConsumer;
   final TermVectorsConsumer termVectorsWriter;
@@ -107,6 +109,7 @@ final class DefaultIndexingChain extends DocConsumer {
       termVectorsWriter = new SortingTermVectorsConsumer(intBlockAllocator, byteBlockAllocator, directory, segmentInfo, indexWriterConfig.getCodec());
     }
     termsHash = new FreqProxTermsWriter(intBlockAllocator, byteBlockAllocator, bytesUsed, termVectorsWriter);
+    docValuesBytePool = new ByteBlockPool(byteBlockAllocator);
   }
 
   private void onAbortingException(Throwable th) {
@@ -590,7 +593,7 @@ final class DefaultIndexingChain extends DocConsumer {
   }
 
   /** Called from processDocument to index one field's point */
-  private void indexPoint(int docID, PerField fp, IndexableField field) {
+  private void indexPoint(int docID, PerField fp, IndexableField field) throws IOException {
     int pointDimensionCount = field.fieldType().pointDimensionCount();
     int pointIndexDimensionCount = field.fieldType().pointIndexDimensionCount();
 
@@ -605,7 +608,7 @@ final class DefaultIndexingChain extends DocConsumer {
     fp.fieldInfo.setPointDimensions(pointDimensionCount, pointIndexDimensionCount, dimensionNumBytes);
 
     if (fp.pointValuesWriter == null) {
-      fp.pointValuesWriter = new PointValuesWriter(byteBlockAllocator, bytesUsed, fp.fieldInfo);
+      fp.pointValuesWriter = new PointValuesWriter(bytesUsed, fp.fieldInfo);
     }
     fp.pointValuesWriter.addPackedValue(docID, field.binaryValue());
   }
@@ -702,7 +705,7 @@ final class DefaultIndexingChain extends DocConsumer {
 
       case SORTED:
         if (fp.docValuesWriter == null) {
-          fp.docValuesWriter = new SortedDocValuesWriter(fp.fieldInfo, bytesUsed);
+          fp.docValuesWriter = new SortedDocValuesWriter(fp.fieldInfo, bytesUsed, docValuesBytePool);
         }
         ((SortedDocValuesWriter) fp.docValuesWriter).addValue(docID, field.binaryValue());
         break;
@@ -716,7 +719,7 @@ final class DefaultIndexingChain extends DocConsumer {
 
       case SORTED_SET:
         if (fp.docValuesWriter == null) {
-          fp.docValuesWriter = new SortedSetDocValuesWriter(fp.fieldInfo, bytesUsed);
+          fp.docValuesWriter = new SortedSetDocValuesWriter(fp.fieldInfo, bytesUsed, docValuesBytePool);
         }
         ((SortedSetDocValuesWriter) fp.docValuesWriter).addValue(docID, field.binaryValue());
         break;
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
index ddbc6a6..10def05b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
@@ -21,15 +21,17 @@ import java.io.IOException;
 import org.apache.lucene.codecs.MutablePointValues;
 import org.apache.lucene.codecs.PointsReader;
 import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.PagedBytes;
 
 /** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
 class PointValuesWriter {
   private final FieldInfo fieldInfo;
-  private final ByteBlockPool bytes;
+  private final PagedBytes bytes;
+  private final DataOutput bytesOut;
   private final Counter iwBytesUsed;
   private int[] docIDs;
   private int numPoints;
@@ -37,17 +39,18 @@ class PointValuesWriter {
   private int lastDocID = -1;
   private final int packedBytesLength;
 
-  PointValuesWriter(ByteBlockPool.Allocator allocator, Counter bytesUsed, FieldInfo fieldInfo) {
+  PointValuesWriter(Counter bytesUsed, FieldInfo fieldInfo) {
     this.fieldInfo = fieldInfo;
     this.iwBytesUsed = bytesUsed;
-    this.bytes = new ByteBlockPool(allocator);
+    this.bytes = new PagedBytes(12);
+    bytesOut = bytes.getDataOutput();
     docIDs = new int[16];
     iwBytesUsed.addAndGet(16 * Integer.BYTES);
     packedBytesLength = fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes();
   }
 
   // TODO: if exactly the same value is added to exactly the same doc, should we dedup?
-  public void addPackedValue(int docID, BytesRef value) {
+  public void addPackedValue(int docID, BytesRef value) throws IOException {
     if (value == null) {
       throw new IllegalArgumentException("field=" + fieldInfo.name + ": point value must not be null");
     }
@@ -59,7 +62,9 @@ class PointValuesWriter {
       docIDs = ArrayUtil.grow(docIDs, numPoints+1);
       iwBytesUsed.addAndGet((docIDs.length - numPoints) * Integer.BYTES);
     }
-    bytes.append(value);
+    final long bytesRamBytesUsedBefore = bytes.ramBytesUsed();
+    bytesOut.writeBytes(value.bytes, value.offset, value.length);
+    iwBytesUsed.addAndGet(bytes.ramBytesUsed() - bytesRamBytesUsedBefore);
     docIDs[numPoints] = docID;
     if (docID != lastDocID) {
       numDocs++;
@@ -70,6 +75,7 @@ class PointValuesWriter {
   }
 
   public void flush(SegmentWriteState state, Sorter.DocMap sortMap, PointsWriter writer) throws IOException {
+    final PagedBytes.Reader bytesReader = bytes.freeze(false);
     PointValues points = new MutablePointValues() {
       final int[] ords = new int[numPoints];
       int[] temp;
@@ -146,14 +152,13 @@ class PointValuesWriter {
       @Override
       public void getValue(int i, BytesRef packedValue) {
         final long offset = (long) packedBytesLength * ords[i];
-        packedValue.length = packedBytesLength;
-        bytes.setRawBytesRef(packedValue, offset);
+        bytesReader.fillSlice(packedValue, offset, packedBytesLength);
       }
 
       @Override
       public byte getByteAt(int i, int k) {
         final long offset = (long) packedBytesLength * ords[i] + k;
-        return bytes.readByte(offset);
+        return bytesReader.getByte(offset);
       }
 
       @Override
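
The accounting pattern introduced above (snapshot PagedBytes.ramBytesUsed()
around each write and credit only the delta to the IndexWriter's counter)
can be exercised on its own. A small sketch, assuming lucene-core on the
classpath; the class name and the 16-byte value, standing in for a packed
point, are illustrative:

    import java.io.IOException;

    import org.apache.lucene.store.DataOutput;
    import org.apache.lucene.util.Counter;
    import org.apache.lucene.util.PagedBytes;

    public class PagedBytesAccountingSketch {
      public static void main(String[] args) throws IOException {
        Counter iwBytesUsed = Counter.newCounter();
        PagedBytes bytes = new PagedBytes(12); // 4 KB blocks, as in this patch
        DataOutput out = bytes.getDataOutput();
        byte[] packedValue = new byte[16];     // e.g. a 4-dimension int point
        long before = bytes.ramBytesUsed();
        out.writeBytes(packedValue, 0, packedValue.length);
        // Reporting only the delta means each page is accounted exactly
        // once, no matter how many values are appended.
        iwBytesUsed.addAndGet(bytes.ramBytesUsed() - before);
        System.out.println("tracked bytes: " + iwBytesUsed.get());
      }
    }
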
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 4ed81f7..9f87fd9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -47,12 +47,11 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
   private int[] finalSortedValues;
   private int[] finalOrdMap;
 
-  public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+  public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, ByteBlockPool pool) {
     this.fieldInfo = fieldInfo;
     this.iwBytesUsed = iwBytesUsed;
     hash = new BytesRefHash(
-        new ByteBlockPool(
-            new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
+            pool,
             BytesRefHash.DEFAULT_CAPACITY,
             new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
     pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index de9fa88..df12b97 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -55,12 +55,11 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
   private int[] finalOrdMap;
 
 
-  SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+  SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, ByteBlockPool pool) {
     this.fieldInfo = fieldInfo;
     this.iwBytesUsed = iwBytesUsed;
     hash = new BytesRefHash(
-        new ByteBlockPool(
-            new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
+            pool,
             BytesRefHash.DEFAULT_CAPACITY,
             new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
     pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);
diff --git a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
index 7649c2c..c9db5da 100644
--- a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
+++ b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
@@ -387,14 +387,6 @@ public final class ByteBlockPool implements Accountable {
     }
   }
 
-  /** Read a single byte at the given {@code offset}. */
-  public byte readByte(long offset) {
-    int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);
-    int pos = (int) (offset & BYTE_BLOCK_MASK);
-    byte[] buffer = buffers[bufferIndex];
-    return buffer[pos];
-  }
-
   @Override
   public long ramBytesUsed() {
     long size = BASE_RAM_BYTES;
diff --git a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
index e07046c..8538805 100644
--- a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
+++ b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
@@ -98,7 +98,18 @@ public final class PagedBytes implements Accountable {
         System.arraycopy(blocks[1+index], 0, b.bytes, blockSize-offset, length-(blockSize-offset));
       }
     }
-    
+
+    /**
+     * Get the byte at the given offset.
+     *
+     * @lucene.internal
+     */
+    public byte getByte(long o) {
+      final int index = (int) (o >> blockBits);
+      final int offset = (int) (o & blockMask);
+      return blocks[index][offset];
+    }
+
     /**
      * Reads length as 1 or 2 byte vInt prefix, starting at <i>start</i>.
      * <p>
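
The new getByte splits a global offset into a block index (high bits) and
an in-block position (low bits). A worked example with the 4 KB blocks
used elsewhere in this patch (blockBits = 12, so blockMask = 4095):

    public class GetByteSketch {
      public static void main(String[] args) {
        final int blockBits = 12;
        final int blockMask = (1 << blockBits) - 1; // 4095
        long o = 10_000;
        int index  = (int) (o >> blockBits); // 2    -> third 4 KB block
        int offset = (int) (o & blockMask);  // 1808 -> position inside it
        // getByte(10_000) therefore reads blocks[2][1808].
        System.out.println(index + " / " + offset);
      }
    }
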
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java b/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java
index 19788b7..8b050d3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java
@@ -31,7 +31,7 @@ public class PackedLongValues extends LongValues implements Accountable {
 
   private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(PackedLongValues.class);
 
-  static final int DEFAULT_PAGE_SIZE = 1024;
+  static final int DEFAULT_PAGE_SIZE = 256;
   static final int MIN_PAGE_SIZE = 64;
  // More than 1M doesn't really make sense with these appending buffers
   // since their goal is to try to have small numbers of bits per value
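
Doc-value ordinals are buffered through these builders, so the smaller
default page trims each builder's first page from 1024 entries to 256;
small per-field builders never grow past it. A minimal usage sketch (the
ordinals and class name are illustrative, not Lucene code):

    import org.apache.lucene.util.packed.PackedInts;
    import org.apache.lucene.util.packed.PackedLongValues;

    public class OrdBufferSketch {
      public static void main(String[] args) {
        // The same kind of builder the doc-values writers use for ordinals:
        PackedLongValues.Builder ords =
            PackedLongValues.packedBuilder(PackedInts.COMPACT);
        for (long ord : new long[] {0, 1, 1, 2}) {
          ords.add(ord); // pages are allocated lazily, 256 entries at a time
        }
        PackedLongValues values = ords.build();
        System.out.println(values.get(3)); // prints 2
      }
    }
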
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
index 54e6d26..5c3d362 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -18,14 +18,29 @@ package org.apache.lucene.index;
 
 
 import java.io.IOException;
-
-import org.apache.lucene.analysis.*;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
@@ -295,4 +310,57 @@ public class TestDocumentWriter extends LuceneTestCase {
     assertEquals("omitTermFreqAndPositions field bit should be set for f2", IndexOptions.DOCS, fi.fieldInfo("f2").getIndexOptions());
     reader.close();
   }
-}
+
+  /** Make sure that each new field does not increase memory usage by more than 16 kB */
+  private void doTestRAMUsage(Function<String, IndexableField> fieldSupplier) throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter w =
+            new IndexWriter(
+                dir,
+                newIndexWriterConfig()
+                .setMaxBufferedDocs(10)
+                .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH))) {
+      Document doc = new Document();
+      final int numFields = 100;
+      for (int i = 0; i < numFields; ++i) {
+        doc.add(fieldSupplier.apply("f" + i));
+      }
+      w.addDocument(doc);
+      assertTrue(w.hasChangesInRam());
+      assertTrue(w.ramBytesUsed() < numFields * 16384L);
+    }
+  }
+
+  public void testRAMUsageStored() throws IOException {
+    doTestRAMUsage(field -> new StoredField(field, new BytesRef("Lucene")));
+  }
+
+  public void testRAMUsageIndexed() throws IOException {
+    doTestRAMUsage(field -> new StringField(field, new BytesRef("Lucene"), Store.NO));
+  }
+
+  public void testRAMUsagePoint() throws IOException {
+    doTestRAMUsage(field -> new IntPoint(field, 42));
+  }
+
+  public void testRAMUsageNumericDocValue() throws IOException {
+    doTestRAMUsage(field -> new NumericDocValuesField(field, 42));
+  }
+
+  public void testRAMUsageSortedDocValue() throws IOException {
+    doTestRAMUsage(field -> new SortedDocValuesField(field, new BytesRef("Lucene")));
+  }
+
+  public void testRAMUsageBinaryDocValue() throws IOException {
+    doTestRAMUsage(field -> new BinaryDocValuesField(field, new BytesRef("Lucene")));
+  }
+
+  public void testRAMUsageSortedNumericDocValue() throws IOException {
+    doTestRAMUsage(field -> new SortedNumericDocValuesField(field, 42));
+  }
+
+  public void testRAMUsageSortedSetDocValue() throws IOException {
+    doTestRAMUsage(field -> new SortedSetDocValuesField(field, new BytesRef("Lucene")));
+  }
+
+}
\ No newline at end of file
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
index 475f716..b21ac00 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
@@ -45,18 +45,12 @@ public class TestByteBlockPool extends LuceneTestCase {
       for (BytesRef expected : list) {
         ref.grow(expected.length);
         ref.setLength(expected.length);
-        switch (random().nextInt(3)) {
+        switch (random().nextInt(2)) {
           case 0:
             // copy bytes
             pool.readBytes(position, ref.bytes(), 0, ref.length());
             break;
           case 1:
-            // copy bytes one by one
-            for (int i = 0; i < ref.length(); ++i) {
-              ref.setByteAt(i, pool.readByte(position + i));
-            }
-            break;
-          case 2:
             BytesRef scratch = new BytesRef();
             scratch.length = ref.length();
             pool.setRawBytesRef(scratch, position);
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java b/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java
index a9971c6..89e3e1d 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java
@@ -83,6 +83,7 @@ public class TestPagedBytes extends LuceneTestCase {
       final BytesRef slice = new BytesRef();
       for(int iter2=0;iter2<100;iter2++) {
         final int pos = random.nextInt(numBytes-1);
+        assertEquals(answer[pos], reader.getByte(pos));
         final int len = random.nextInt(Math.min(blockSize+1, numBytes - pos));
         reader.fillSlice(slice, pos, len);
         for(int byteUpto=0;byteUpto<len;byteUpto++) {