You are viewing a plain text version of this content; the canonical (HTML) version is available from the Apache mailing-list archive. (The original hyperlink was lost in plain-text conversion.)
Posted to commits@lucene.apache.org by jp...@apache.org on 2021/06/18 07:17:55 UTC
[lucene] branch main updated: LUCENE-9996: Reduce RAM usage of DWPT
for a single document. (#184)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 1365156 LUCENE-9996: Reduce RAM usage of DWPT for a single document. (#184)
1365156 is described below
commit 1365156fcdb68ed12d3ae08bba24269da48cfc22
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Fri Jun 18 09:17:50 2021 +0200
LUCENE-9996: Reduce RAM usage of DWPT for a single document. (#184)
With this change, doc-value terms dictionaries use a shared `ByteBlockPool`
across all fields, and points, binary doc values and doc-value ordinals use
slightly smaller page sizes.
---
lucene/CHANGES.txt | 4 +-
.../apache/lucene/index/BinaryDocValuesWriter.java | 4 +-
.../org/apache/lucene/index/IndexingChain.java | 9 ++-
.../org/apache/lucene/index/PointValuesWriter.java | 23 ++++---
.../apache/lucene/index/SortedDocValuesWriter.java | 4 +-
.../lucene/index/SortedSetDocValuesWriter.java | 4 +-
.../java/org/apache/lucene/util/ByteBlockPool.java | 8 ---
.../java/org/apache/lucene/util/PagedBytes.java | 11 +++
.../lucene/util/packed/PackedLongValues.java | 2 +-
.../apache/lucene/index/TestDocumentWriter.java | 79 +++++++++++++++++++++-
.../org/apache/lucene/util/TestByteBlockPool.java | 8 +--
.../org/apache/lucene/util/TestPagedBytes.java | 1 +
12 files changed, 121 insertions(+), 36 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bef89fc..af392f1 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -377,7 +377,9 @@ Improvements
Optimizations
---------------------
-(No changes)
+* LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
+ particular in the case of many fields and many indexing threads.
+ (Adrien Grand)
Bug Fixes
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
index 434257e..ecacdc3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@@ -38,8 +38,8 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
/** Maximum length for a binary field. */
private static final int MAX_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;
- // 32 KB block sizes for PagedBytes storage:
- private static final int BLOCK_BITS = 15;
+ // 4 kB block sizes for PagedBytes storage:
+ private static final int BLOCK_BITS = 12;
private final PagedBytes bytes;
private final DataOutput bytesOut;
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
index dc59290..c6ac695 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
@@ -64,6 +64,8 @@ final class IndexingChain implements Accountable {
// Writes postings and term vectors:
final TermsHash termsHash;
+ // Shared pool for doc-value terms
+ final ByteBlockPool docValuesBytePool;
// Writes stored fields
final StoredFieldsConsumer storedFieldsConsumer;
final TermVectorsConsumer termVectorsWriter;
@@ -127,6 +129,7 @@ final class IndexingChain implements Accountable {
termsHash =
new FreqProxTermsWriter(
intBlockAllocator, byteBlockAllocator, bytesUsed, termVectorsWriter);
+ docValuesBytePool = new ByteBlockPool(byteBlockAllocator);
}
private void onAbortingException(Throwable th) {
@@ -696,19 +699,19 @@ final class IndexingChain implements Accountable {
pf.docValuesWriter = new BinaryDocValuesWriter(fi, bytesUsed);
break;
case SORTED:
- pf.docValuesWriter = new SortedDocValuesWriter(fi, bytesUsed);
+ pf.docValuesWriter = new SortedDocValuesWriter(fi, bytesUsed, docValuesBytePool);
break;
case SORTED_NUMERIC:
pf.docValuesWriter = new SortedNumericDocValuesWriter(fi, bytesUsed);
break;
case SORTED_SET:
- pf.docValuesWriter = new SortedSetDocValuesWriter(fi, bytesUsed);
+ pf.docValuesWriter = new SortedSetDocValuesWriter(fi, bytesUsed, docValuesBytePool);
break;
default:
throw new AssertionError("unrecognized DocValues.Type: " + dvType);
}
if (fi.getPointDimensionCount() != 0) {
- pf.pointValuesWriter = new PointValuesWriter(byteBlockAllocator, bytesUsed, fi);
+ pf.pointValuesWriter = new PointValuesWriter(bytesUsed, fi);
}
if (fi.getVectorDimension() != 0) {
pf.vectorValuesWriter = new VectorValuesWriter(fi, bytesUsed);
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
index f604066..599ca77 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
@@ -20,15 +20,17 @@ import java.io.IOException;
import org.apache.lucene.codecs.MutablePointValues;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.PagedBytes;
/** Buffers up pending byte[][] value(s) per doc, then flushes when segment flushes. */
class PointValuesWriter {
private final FieldInfo fieldInfo;
- private final ByteBlockPool bytes;
+ private final PagedBytes bytes;
+ private final DataOutput bytesOut;
private final Counter iwBytesUsed;
private int[] docIDs;
private int numPoints;
@@ -36,17 +38,18 @@ class PointValuesWriter {
private int lastDocID = -1;
private final int packedBytesLength;
- PointValuesWriter(ByteBlockPool.Allocator allocator, Counter bytesUsed, FieldInfo fieldInfo) {
+ PointValuesWriter(Counter bytesUsed, FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
this.iwBytesUsed = bytesUsed;
- this.bytes = new ByteBlockPool(allocator);
+ this.bytes = new PagedBytes(12);
+ bytesOut = bytes.getDataOutput();
docIDs = new int[16];
iwBytesUsed.addAndGet(16 * Integer.BYTES);
packedBytesLength = fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes();
}
// TODO: if exactly the same value is added to exactly the same doc, should we dedup?
- public void addPackedValue(int docID, BytesRef value) {
+ public void addPackedValue(int docID, BytesRef value) throws IOException {
if (value == null) {
throw new IllegalArgumentException(
"field=" + fieldInfo.name + ": point value must not be null");
@@ -65,7 +68,9 @@ class PointValuesWriter {
docIDs = ArrayUtil.grow(docIDs, numPoints + 1);
iwBytesUsed.addAndGet((docIDs.length - numPoints) * Integer.BYTES);
}
- bytes.append(value);
+ final long bytesRamBytesUsedBefore = bytes.ramBytesUsed();
+ bytesOut.writeBytes(value.bytes, value.offset, value.length);
+ iwBytesUsed.addAndGet(bytes.ramBytesUsed() - bytesRamBytesUsedBefore);
docIDs[numPoints] = docID;
if (docID != lastDocID) {
numDocs++;
@@ -86,6 +91,7 @@ class PointValuesWriter {
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, PointsWriter writer)
throws IOException {
+ final PagedBytes.Reader bytesReader = bytes.freeze(false);
PointValues points =
new MutablePointValues() {
final int[] ords = new int[numPoints];
@@ -164,14 +170,13 @@ class PointValuesWriter {
@Override
public void getValue(int i, BytesRef packedValue) {
final long offset = (long) packedBytesLength * ords[i];
- packedValue.length = packedBytesLength;
- bytes.setRawBytesRef(packedValue, offset);
+ bytesReader.fillSlice(packedValue, offset, packedBytesLength);
}
@Override
public byte getByteAt(int i, int k) {
final long offset = (long) packedBytesLength * ords[i] + k;
- return bytes.readByte(offset);
+ return bytesReader.getByte(offset);
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 9d95569..90ab053 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -48,12 +48,12 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
private int[] finalSortedValues;
private int[] finalOrdMap;
- public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+ public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, ByteBlockPool pool) {
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
hash =
new BytesRefHash(
- new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
+ pool,
BytesRefHash.DEFAULT_CAPACITY,
new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index 47dad4a..7b4ad7c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -55,12 +55,12 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
private int[] finalSortedValues;
private int[] finalOrdMap;
- SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+ SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, ByteBlockPool pool) {
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
hash =
new BytesRefHash(
- new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
+ pool,
BytesRefHash.DEFAULT_CAPACITY,
new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);
diff --git a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
index f878aff..2ede753 100644
--- a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
+++ b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
@@ -377,14 +377,6 @@ public final class ByteBlockPool implements Accountable {
}
}
- /** Read a single byte at the given {@code offset}. */
- public byte readByte(long offset) {
- int bufferIndex = (int) (offset >> BYTE_BLOCK_SHIFT);
- int pos = (int) (offset & BYTE_BLOCK_MASK);
- byte[] buffer = buffers[bufferIndex];
- return buffer[pos];
- }
-
@Override
public long ramBytesUsed() {
long size = BASE_RAM_BYTES;
diff --git a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
index dd495a9..487ca2d 100644
--- a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
+++ b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
@@ -101,6 +101,17 @@ public final class PagedBytes implements Accountable {
}
/**
+ * Get the byte at the given offset.
+ *
+ * @lucene.internal
+ */
+ public byte getByte(long o) {
+ final int index = (int) (o >> blockBits);
+ final int offset = (int) (o & blockMask);
+ return blocks[index][offset];
+ }
+
+ /**
* Reads length as 1 or 2 byte vInt prefix, starting at <i>start</i>.
*
* <p><b>Note:</b> this method does not support slices spanning across block borders.
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java b/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java
index e9bcc20..b028e23 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java
@@ -29,7 +29,7 @@ public class PackedLongValues extends LongValues implements Accountable {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(PackedLongValues.class);
- static final int DEFAULT_PAGE_SIZE = 1024;
+ static final int DEFAULT_PAGE_SIZE = 256;
static final int MIN_PAGE_SIZE = 64;
// More than 1M doesn't really makes sense with these appending buffers
// since their goal is to try to have small numbers of bits per value
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
index 78a69e5..991e6bb 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -17,14 +17,30 @@
package org.apache.lucene.index;
import java.io.IOException;
-import org.apache.lucene.analysis.*;
+import java.util.function.Function;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
+import org.apache.lucene.document.VectorField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
@@ -32,6 +48,8 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
+import org.hamcrest.MatcherAssert;
+import org.hamcrest.Matchers;
public class TestDocumentWriter extends LuceneTestCase {
private Directory dir;
@@ -307,4 +325,63 @@ public class TestDocumentWriter extends LuceneTestCase {
fi.fieldInfo("f2").getIndexOptions());
reader.close();
}
+
+ /** Make sure that every new field doesn't increment memory usage by more than 16kB */
+ private void doTestRAMUsage(Function<String, IndexableField> fieldSupplier) throws IOException {
+ try (Directory dir = newDirectory();
+ IndexWriter w =
+ new IndexWriter(
+ dir,
+ newIndexWriterConfig()
+ .setMaxBufferedDocs(10)
+ .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH))) {
+ Document doc = new Document();
+ final int numFields = 100;
+ for (int i = 0; i < numFields; ++i) {
+ doc.add(fieldSupplier.apply("f" + i));
+ }
+ w.addDocument(doc);
+ assertTrue(w.hasChangesInRam());
+ MatcherAssert.assertThat(w.ramBytesUsed(), Matchers.lessThan(numFields * 16384L));
+ }
+ }
+
+ public void testRAMUsageStored() throws IOException {
+ doTestRAMUsage(field -> new StoredField(field, new BytesRef("Lucene")));
+ }
+
+ public void testRAMUsageIndexed() throws IOException {
+ doTestRAMUsage(field -> new StringField(field, new BytesRef("Lucene"), Store.NO));
+ }
+
+ public void testRAMUsagePoint() throws IOException {
+ doTestRAMUsage(field -> new IntPoint(field, 42));
+ }
+
+ public void testRAMUsageNumericDocValue() throws IOException {
+ doTestRAMUsage(field -> new NumericDocValuesField(field, 42));
+ }
+
+ public void testRAMUsageSortedDocValue() throws IOException {
+ doTestRAMUsage(field -> new SortedDocValuesField(field, new BytesRef("Lucene")));
+ }
+
+ public void testRAMUsageBinaryDocValue() throws IOException {
+ doTestRAMUsage(field -> new BinaryDocValuesField(field, new BytesRef("Lucene")));
+ }
+
+ public void testRAMUsageSortedNumericDocValue() throws IOException {
+ doTestRAMUsage(field -> new SortedNumericDocValuesField(field, 42));
+ }
+
+ public void testRAMUsageSortedSetDocValue() throws IOException {
+ doTestRAMUsage(field -> new SortedSetDocValuesField(field, new BytesRef("Lucene")));
+ }
+
+ public void testRAMUsageVector() throws IOException {
+ doTestRAMUsage(
+ field ->
+ new VectorField(
+ field, new float[] {1, 2, 3, 4}, VectorValues.SimilarityFunction.EUCLIDEAN));
+ }
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
index 0b4b3dc..d136fd4 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestByteBlockPool.java
@@ -45,18 +45,12 @@ public class TestByteBlockPool extends LuceneTestCase {
for (BytesRef expected : list) {
ref.grow(expected.length);
ref.setLength(expected.length);
- switch (random().nextInt(3)) {
+ switch (random().nextInt(2)) {
case 0:
// copy bytes
pool.readBytes(position, ref.bytes(), 0, ref.length());
break;
case 1:
- // copy bytes one by one
- for (int i = 0; i < ref.length(); ++i) {
- ref.setByteAt(i, pool.readByte(position + i));
- }
- break;
- case 2:
BytesRef scratch = new BytesRef();
scratch.length = ref.length();
pool.setRawBytesRef(scratch, position);
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java b/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java
index 28050b4..7bd7c3d 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java
@@ -87,6 +87,7 @@ public class TestPagedBytes extends LuceneTestCase {
final BytesRef slice = new BytesRef();
for (int iter2 = 0; iter2 < 100; iter2++) {
final int pos = random.nextInt(numBytes - 1);
+ assertEquals(answer[pos], reader.getByte(pos));
final int len = random.nextInt(Math.min(blockSize + 1, numBytes - pos));
reader.fillSlice(slice, pos, len);
for (int byteUpto = 0; byteUpto < len; byteUpto++) {