You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2021/10/20 17:05:43 UTC
[lucene] branch main updated: LUCENE-10189: Optimize flush of doc-value fields that are effectively single-valued. (#399)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 3a11983 LUCENE-10189: Optimize flush of doc-value fields that are effectively single-valued. (#399)
3a11983 is described below
commit 3a11983de2df842c413cf159ccc5bdefcb721e33
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Wed Oct 20 19:05:40 2021 +0200
LUCENE-10189: Optimize flush of doc-value fields that are effectively single-valued. (#399)
---
.../org/apache/lucene/index/DocsWithFieldSet.java | 15 ++++--
.../lucene/index/NumericDocValuesWriter.java | 4 +-
.../apache/lucene/index/SortedDocValuesWriter.java | 27 +++-------
.../lucene/index/SortedNumericDocValuesWriter.java | 40 +++++++++-----
.../lucene/index/SortedSetDocValuesWriter.java | 62 +++++++++++++---------
5 files changed, 82 insertions(+), 66 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java b/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java
index af49e77..b4f5ac5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java
@@ -32,7 +32,7 @@ final class DocsWithFieldSet extends DocIdSet {
RamUsageEstimator.shallowSizeOfInstance(DocsWithFieldSet.class);
private FixedBitSet set;
- private int cost = 0;
+ private int cardinality = 0;
private int lastDocId = -1;
void add(int docID) {
@@ -43,14 +43,14 @@ final class DocsWithFieldSet extends DocIdSet {
if (set != null) {
set = FixedBitSet.ensureCapacity(set, docID);
set.set(docID);
- } else if (docID != cost) {
+ } else if (docID != cardinality) {
// migrate to a sparse encoding using a bit set
set = new FixedBitSet(docID + 1);
- set.set(0, cost);
+ set.set(0, cardinality);
set.set(docID);
}
lastDocId = docID;
- cost++;
+ cardinality++;
}
@Override
@@ -60,6 +60,11 @@ final class DocsWithFieldSet extends DocIdSet {
@Override
public DocIdSetIterator iterator() {
- return set != null ? new BitSetIterator(set, cost) : DocIdSetIterator.all(cost);
+ return set != null ? new BitSetIterator(set, cardinality) : DocIdSetIterator.all(cardinality);
+ }
+
+ /** Return the number of documents of this set. */
+ int cardinality() {
+ return cardinality;
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
index 9b79c80..c760e7f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
@@ -30,7 +30,7 @@ import org.apache.lucene.util.packed.PackedLongValues;
/** Buffers up pending long per doc, then flushes when segment flushes. */
class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
- private PackedLongValues.Builder pending;
+ private final PackedLongValues.Builder pending;
private PackedLongValues finalValues;
private final Counter iwBytesUsed;
private long bytesUsed;
@@ -126,7 +126,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
}
// iterates over the values we have in ram
- private static class BufferedNumericDocValues extends NumericDocValues {
+ static class BufferedNumericDocValues extends NumericDocValues {
final PackedLongValues.Iterator iter;
final DocIdSetIterator docsWithField;
private long value;
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 90ab053..ce7b8dc 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -37,8 +37,8 @@ import org.apache.lucene.util.packed.PackedLongValues;
*/
class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
final BytesRefHash hash;
- private PackedLongValues.Builder pending;
- private DocsWithFieldSet docsWithField;
+ private final PackedLongValues.Builder pending;
+ private final DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed;
private long bytesUsed; // this currently only tracks differences in 'pending'
private final FieldInfo fieldInfo;
@@ -123,7 +123,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
finalOrdMap[finalSortedValues[ord]] = ord;
}
return new BufferedSortedDocValues(
- hash, valueCount, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
+ hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
}
private int[] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedDocValues oldValues)
@@ -159,12 +159,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
state.segmentInfo.maxDoc(),
sortMap,
new BufferedSortedDocValues(
- hash,
- valueCount,
- finalOrds,
- finalSortedValues,
- finalOrdMap,
- docsWithField.iterator()));
+ hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator()));
} else {
sorted = null;
}
@@ -178,12 +173,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
}
final SortedDocValues buf =
new BufferedSortedDocValues(
- hash,
- valueCount,
- finalOrds,
- finalSortedValues,
- finalOrdMap,
- docsWithField.iterator());
+ hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
if (sorted == null) {
return buf;
}
@@ -192,25 +182,22 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
});
}
- private static class BufferedSortedDocValues extends SortedDocValues {
+ static class BufferedSortedDocValues extends SortedDocValues {
final BytesRefHash hash;
final BytesRef scratch = new BytesRef();
final int[] sortedValues;
final int[] ordMap;
- final int valueCount;
private int ord;
final PackedLongValues.Iterator iter;
final DocIdSetIterator docsWithField;
public BufferedSortedDocValues(
BytesRefHash hash,
- int valueCount,
PackedLongValues docToOrd,
int[] sortedValues,
int[] ordMap,
DocIdSetIterator docsWithField) {
this.hash = hash;
- this.valueCount = valueCount;
this.sortedValues = sortedValues;
this.iter = docToOrd.iterator();
this.ordMap = ordMap;
@@ -262,7 +249,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
@Override
public int getValueCount() {
- return valueCount;
+ return hash.size();
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
index 95579fc..66a2837 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
@@ -21,6 +21,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.index.NumericDocValuesWriter.BufferedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Counter;
@@ -30,9 +31,9 @@ import org.apache.lucene.util.packed.PackedLongValues;
/** Buffers up pending long[] per doc, sorts, then flushes when segment flushes. */
class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValues> {
- private PackedLongValues.Builder pending; // stream of all values
+ private final PackedLongValues.Builder pending; // stream of all values
private PackedLongValues.Builder pendingCounts; // count of values per doc
- private DocsWithFieldSet docsWithField;
+ private final DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed;
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo;
@@ -47,11 +48,9 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
- pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new DocsWithFieldSet();
bytesUsed =
pending.ramBytesUsed()
- + pendingCounts.ramBytesUsed()
+ docsWithField.ramBytesUsed()
+ RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(bytesUsed);
@@ -78,7 +77,15 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
pending.add(currentValues[i]);
}
// record the number of values for this doc
- pendingCounts.add(currentUpto);
+ if (pendingCounts != null) {
+ pendingCounts.add(currentUpto);
+ } else if (currentUpto != 1) {
+ pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
+ for (int i = 0; i < docsWithField.cardinality(); ++i) {
+ pendingCounts.add(1);
+ }
+ pendingCounts.add(currentUpto);
+ }
currentUpto = 0;
docsWithField.add(currentDoc);
@@ -96,7 +103,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
private void updateBytesUsed() {
final long newBytesUsed =
pending.ramBytesUsed()
- + pendingCounts.ramBytesUsed()
+ + (pendingCounts == null ? 0 : pendingCounts.ramBytesUsed())
+ docsWithField.ramBytesUsed()
+ RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
@@ -109,10 +116,9 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
assert finalValuesCount == null;
finishCurrentDoc();
finalValues = pending.build();
- finalValuesCount = pendingCounts.build();
+ finalValuesCount = pendingCounts == null ? null : pendingCounts.build();
}
- return new BufferedSortedNumericDocValues(
- finalValues, finalValuesCount, docsWithField.iterator());
+ return getValues(finalValues, finalValuesCount, docsWithField);
}
static final class LongValues {
@@ -144,6 +150,15 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
}
}
+ private SortedNumericDocValues getValues(
+ PackedLongValues values, PackedLongValues valueCounts, DocsWithFieldSet docsWithField) {
+ if (valueCounts == null) {
+ return DocValues.singleton(new BufferedNumericDocValues(values, docsWithField.iterator()));
+ } else {
+ return new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator());
+ }
+ }
+
@Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer)
throws IOException {
@@ -152,7 +167,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
if (finalValues == null) {
finishCurrentDoc();
values = pending.build();
- valueCounts = pendingCounts.build();
+ valueCounts = pendingCounts == null ? null : pendingCounts.build();
} else {
values = finalValues;
valueCounts = finalValuesCount;
@@ -164,7 +179,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
new LongValues(
state.segmentInfo.maxDoc(),
sortMap,
- new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator()),
+ getValues(values, valueCounts, docsWithField),
PackedInts.FASTEST);
} else {
sorted = null;
@@ -178,8 +193,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
if (fieldInfoIn != fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
- final SortedNumericDocValues buf =
- new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator());
+ final SortedNumericDocValues buf = getValues(values, valueCounts, docsWithField);
if (sorted == null) {
return buf;
} else {
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index 8c9eb69..6ffc7a1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -23,6 +23,7 @@ import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.index.SortedDocValuesWriter.BufferedSortedDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
@@ -40,9 +41,9 @@ import org.apache.lucene.util.packed.PackedLongValues;
*/
class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
final BytesRefHash hash;
- private PackedLongValues.Builder pending; // stream of all termIDs
+ private final PackedLongValues.Builder pending; // stream of all termIDs
private PackedLongValues.Builder pendingCounts; // termIDs per doc
- private DocsWithFieldSet docsWithField;
+ private final DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed;
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo;
@@ -65,11 +66,9 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
BytesRefHash.DEFAULT_CAPACITY,
new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);
- pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new DocsWithFieldSet();
bytesUsed =
pending.ramBytesUsed()
- + pendingCounts.ramBytesUsed()
+ docsWithField.ramBytesUsed()
+ RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(bytesUsed);
@@ -116,7 +115,15 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
lastValue = termID;
}
// record the number of unique term ids for this doc
- pendingCounts.add(count);
+ if (pendingCounts != null) {
+ pendingCounts.add(count);
+ } else if (count != 1) {
+ pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
+ for (int i = 0; i < docsWithField.cardinality(); ++i) {
+ pendingCounts.add(1);
+ }
+ pendingCounts.add(count);
+ }
maxCount = Math.max(maxCount, count);
currentUpto = 0;
docsWithField.add(currentDoc);
@@ -146,7 +153,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
private void updateBytesUsed() {
final long newBytesUsed =
pending.ramBytesUsed()
- + pendingCounts.ramBytesUsed()
+ + (pendingCounts == null ? 0 : pendingCounts.ramBytesUsed())
+ docsWithField.ramBytesUsed()
+ RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
@@ -160,21 +167,32 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
finishCurrentDoc();
int valueCount = hash.size();
finalOrds = pending.build();
- finalOrdCounts = pendingCounts.build();
+ finalOrdCounts = pendingCounts == null ? null : pendingCounts.build();
finalSortedValues = hash.sort();
finalOrdMap = new int[valueCount];
}
for (int ord = 0; ord < finalOrdMap.length; ord++) {
finalOrdMap[finalSortedValues[ord]] = ord;
}
- return new BufferedSortedSetDocValues(
- finalSortedValues,
- finalOrdMap,
- hash,
- finalOrds,
- finalOrdCounts,
- maxCount,
- docsWithField.iterator());
+ return getValues(
+ finalSortedValues, finalOrdMap, hash, finalOrds, finalOrdCounts, maxCount, docsWithField);
+ }
+
+ private SortedSetDocValues getValues(
+ int[] sortedValues,
+ int[] ordMap,
+ BytesRefHash hash,
+ PackedLongValues ords,
+ PackedLongValues ordCounts,
+ int maxCount,
+ DocsWithFieldSet docsWithField) {
+ if (ordCounts == null) {
+ return DocValues.singleton(
+ new BufferedSortedDocValues(hash, ords, sortedValues, ordMap, docsWithField.iterator()));
+ } else {
+ return new BufferedSortedSetDocValues(
+ sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator());
+ }
}
@Override
@@ -190,7 +208,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
assert finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null;
finishCurrentDoc();
ords = pending.build();
- ordCounts = pendingCounts.build();
+ ordCounts = pendingCounts == null ? null : pendingCounts.build();
sortedValues = hash.sort();
ordMap = new int[valueCount];
for (int ord = 0; ord < valueCount; ord++) {
@@ -209,8 +227,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
new DocOrds(
state.segmentInfo.maxDoc(),
sortMap,
- new BufferedSortedSetDocValues(
- sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator()),
+ getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField),
PackedInts.FASTEST);
} else {
docOrds = null;
@@ -224,14 +241,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
throw new IllegalArgumentException("wrong fieldInfo");
}
final SortedSetDocValues buf =
- new BufferedSortedSetDocValues(
- sortedValues,
- ordMap,
- hash,
- ords,
- ordCounts,
- maxCount,
- docsWithField.iterator());
+ getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField);
if (docOrds == null) {
return buf;
} else {