You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2021/10/20 17:05:43 UTC

[lucene] branch main updated: LUCENE-10189: Optimize flush of doc-value fields that are effectively single-valued. (#399)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 3a11983  LUCENE-10189: Optimize flush of doc-value fields that are effectively single-valued. (#399)
3a11983 is described below

commit 3a11983de2df842c413cf159ccc5bdefcb721e33
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Wed Oct 20 19:05:40 2021 +0200

    LUCENE-10189: Optimize flush of doc-value fields that are effectively single-valued. (#399)
---
 .../org/apache/lucene/index/DocsWithFieldSet.java  | 15 ++++--
 .../lucene/index/NumericDocValuesWriter.java       |  4 +-
 .../apache/lucene/index/SortedDocValuesWriter.java | 27 +++-------
 .../lucene/index/SortedNumericDocValuesWriter.java | 40 +++++++++-----
 .../lucene/index/SortedSetDocValuesWriter.java     | 62 +++++++++++++---------
 5 files changed, 82 insertions(+), 66 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java b/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java
index af49e77..b4f5ac5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java
@@ -32,7 +32,7 @@ final class DocsWithFieldSet extends DocIdSet {
       RamUsageEstimator.shallowSizeOfInstance(DocsWithFieldSet.class);
 
   private FixedBitSet set;
-  private int cost = 0;
+  private int cardinality = 0;
   private int lastDocId = -1;
 
   void add(int docID) {
@@ -43,14 +43,14 @@ final class DocsWithFieldSet extends DocIdSet {
     if (set != null) {
       set = FixedBitSet.ensureCapacity(set, docID);
       set.set(docID);
-    } else if (docID != cost) {
+    } else if (docID != cardinality) {
       // migrate to a sparse encoding using a bit set
       set = new FixedBitSet(docID + 1);
-      set.set(0, cost);
+      set.set(0, cardinality);
       set.set(docID);
     }
     lastDocId = docID;
-    cost++;
+    cardinality++;
   }
 
   @Override
@@ -60,6 +60,11 @@ final class DocsWithFieldSet extends DocIdSet {
 
   @Override
   public DocIdSetIterator iterator() {
-    return set != null ? new BitSetIterator(set, cost) : DocIdSetIterator.all(cost);
+    return set != null ? new BitSetIterator(set, cardinality) : DocIdSetIterator.all(cardinality);
+  }
+
+  /** Return the number of documents of this set. */
+  int cardinality() {
+    return cardinality;
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
index 9b79c80..c760e7f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
@@ -30,7 +30,7 @@ import org.apache.lucene.util.packed.PackedLongValues;
 /** Buffers up pending long per doc, then flushes when segment flushes. */
 class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
 
-  private PackedLongValues.Builder pending;
+  private final PackedLongValues.Builder pending;
   private PackedLongValues finalValues;
   private final Counter iwBytesUsed;
   private long bytesUsed;
@@ -126,7 +126,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
   }
 
   // iterates over the values we have in ram
-  private static class BufferedNumericDocValues extends NumericDocValues {
+  static class BufferedNumericDocValues extends NumericDocValues {
     final PackedLongValues.Iterator iter;
     final DocIdSetIterator docsWithField;
     private long value;
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 90ab053..ce7b8dc 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -37,8 +37,8 @@ import org.apache.lucene.util.packed.PackedLongValues;
  */
 class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
   final BytesRefHash hash;
-  private PackedLongValues.Builder pending;
-  private DocsWithFieldSet docsWithField;
+  private final PackedLongValues.Builder pending;
+  private final DocsWithFieldSet docsWithField;
   private final Counter iwBytesUsed;
   private long bytesUsed; // this currently only tracks differences in 'pending'
   private final FieldInfo fieldInfo;
@@ -123,7 +123,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
       finalOrdMap[finalSortedValues[ord]] = ord;
     }
     return new BufferedSortedDocValues(
-        hash, valueCount, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
+        hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
   }
 
   private int[] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedDocValues oldValues)
@@ -159,12 +159,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
               state.segmentInfo.maxDoc(),
               sortMap,
               new BufferedSortedDocValues(
-                  hash,
-                  valueCount,
-                  finalOrds,
-                  finalSortedValues,
-                  finalOrdMap,
-                  docsWithField.iterator()));
+                  hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator()));
     } else {
       sorted = null;
     }
@@ -178,12 +173,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
             }
             final SortedDocValues buf =
                 new BufferedSortedDocValues(
-                    hash,
-                    valueCount,
-                    finalOrds,
-                    finalSortedValues,
-                    finalOrdMap,
-                    docsWithField.iterator());
+                    hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
             if (sorted == null) {
               return buf;
             }
@@ -192,25 +182,22 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
         });
   }
 
-  private static class BufferedSortedDocValues extends SortedDocValues {
+  static class BufferedSortedDocValues extends SortedDocValues {
     final BytesRefHash hash;
     final BytesRef scratch = new BytesRef();
     final int[] sortedValues;
     final int[] ordMap;
-    final int valueCount;
     private int ord;
     final PackedLongValues.Iterator iter;
     final DocIdSetIterator docsWithField;
 
     public BufferedSortedDocValues(
         BytesRefHash hash,
-        int valueCount,
         PackedLongValues docToOrd,
         int[] sortedValues,
         int[] ordMap,
         DocIdSetIterator docsWithField) {
       this.hash = hash;
-      this.valueCount = valueCount;
       this.sortedValues = sortedValues;
       this.iter = docToOrd.iterator();
       this.ordMap = ordMap;
@@ -262,7 +249,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
 
     @Override
     public int getValueCount() {
-      return valueCount;
+      return hash.size();
     }
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
index 95579fc..66a2837 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
@@ -21,6 +21,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 import java.io.IOException;
 import java.util.Arrays;
 import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.index.NumericDocValuesWriter.BufferedNumericDocValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Counter;
@@ -30,9 +31,9 @@ import org.apache.lucene.util.packed.PackedLongValues;
 
 /** Buffers up pending long[] per doc, sorts, then flushes when segment flushes. */
 class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValues> {
-  private PackedLongValues.Builder pending; // stream of all values
+  private final PackedLongValues.Builder pending; // stream of all values
   private PackedLongValues.Builder pendingCounts; // count of values per doc
-  private DocsWithFieldSet docsWithField;
+  private final DocsWithFieldSet docsWithField;
   private final Counter iwBytesUsed;
   private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
   private final FieldInfo fieldInfo;
@@ -47,11 +48,9 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
     this.fieldInfo = fieldInfo;
     this.iwBytesUsed = iwBytesUsed;
     pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
-    pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
     docsWithField = new DocsWithFieldSet();
     bytesUsed =
         pending.ramBytesUsed()
-            + pendingCounts.ramBytesUsed()
             + docsWithField.ramBytesUsed()
             + RamUsageEstimator.sizeOf(currentValues);
     iwBytesUsed.addAndGet(bytesUsed);
@@ -78,7 +77,15 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
       pending.add(currentValues[i]);
     }
     // record the number of values for this doc
-    pendingCounts.add(currentUpto);
+    if (pendingCounts != null) {
+      pendingCounts.add(currentUpto);
+    } else if (currentUpto != 1) {
+      pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
+      for (int i = 0; i < docsWithField.cardinality(); ++i) {
+        pendingCounts.add(1);
+      }
+      pendingCounts.add(currentUpto);
+    }
     currentUpto = 0;
 
     docsWithField.add(currentDoc);
@@ -96,7 +103,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
   private void updateBytesUsed() {
     final long newBytesUsed =
         pending.ramBytesUsed()
-            + pendingCounts.ramBytesUsed()
+            + (pendingCounts == null ? 0 : pendingCounts.ramBytesUsed())
             + docsWithField.ramBytesUsed()
             + RamUsageEstimator.sizeOf(currentValues);
     iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
@@ -109,10 +116,9 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
       assert finalValuesCount == null;
       finishCurrentDoc();
       finalValues = pending.build();
-      finalValuesCount = pendingCounts.build();
+      finalValuesCount = pendingCounts == null ? null : pendingCounts.build();
     }
-    return new BufferedSortedNumericDocValues(
-        finalValues, finalValuesCount, docsWithField.iterator());
+    return getValues(finalValues, finalValuesCount, docsWithField);
   }
 
   static final class LongValues {
@@ -144,6 +150,15 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
     }
   }
 
+  private SortedNumericDocValues getValues(
+      PackedLongValues values, PackedLongValues valueCounts, DocsWithFieldSet docsWithField) {
+    if (valueCounts == null) {
+      return DocValues.singleton(new BufferedNumericDocValues(values, docsWithField.iterator()));
+    } else {
+      return new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator());
+    }
+  }
+
   @Override
   public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer)
       throws IOException {
@@ -152,7 +167,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
     if (finalValues == null) {
       finishCurrentDoc();
       values = pending.build();
-      valueCounts = pendingCounts.build();
+      valueCounts = pendingCounts == null ? null : pendingCounts.build();
     } else {
       values = finalValues;
       valueCounts = finalValuesCount;
@@ -164,7 +179,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
           new LongValues(
               state.segmentInfo.maxDoc(),
               sortMap,
-              new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator()),
+              getValues(values, valueCounts, docsWithField),
               PackedInts.FASTEST);
     } else {
       sorted = null;
@@ -178,8 +193,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
             if (fieldInfoIn != fieldInfo) {
               throw new IllegalArgumentException("wrong fieldInfo");
             }
-            final SortedNumericDocValues buf =
-                new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator());
+            final SortedNumericDocValues buf = getValues(values, valueCounts, docsWithField);
             if (sorted == null) {
               return buf;
             } else {
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index 8c9eb69..6ffc7a1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -23,6 +23,7 @@ import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
 import java.io.IOException;
 import java.util.Arrays;
 import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.index.SortedDocValuesWriter.BufferedSortedDocValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.ByteBlockPool;
@@ -40,9 +41,9 @@ import org.apache.lucene.util.packed.PackedLongValues;
  */
 class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
   final BytesRefHash hash;
-  private PackedLongValues.Builder pending; // stream of all termIDs
+  private final PackedLongValues.Builder pending; // stream of all termIDs
   private PackedLongValues.Builder pendingCounts; // termIDs per doc
-  private DocsWithFieldSet docsWithField;
+  private final DocsWithFieldSet docsWithField;
   private final Counter iwBytesUsed;
   private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
   private final FieldInfo fieldInfo;
@@ -65,11 +66,9 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
             BytesRefHash.DEFAULT_CAPACITY,
             new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
     pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);
-    pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
     docsWithField = new DocsWithFieldSet();
     bytesUsed =
         pending.ramBytesUsed()
-            + pendingCounts.ramBytesUsed()
             + docsWithField.ramBytesUsed()
             + RamUsageEstimator.sizeOf(currentValues);
     iwBytesUsed.addAndGet(bytesUsed);
@@ -116,7 +115,15 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
       lastValue = termID;
     }
     // record the number of unique term ids for this doc
-    pendingCounts.add(count);
+    if (pendingCounts != null) {
+      pendingCounts.add(count);
+    } else if (count != 1) {
+      pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
+      for (int i = 0; i < docsWithField.cardinality(); ++i) {
+        pendingCounts.add(1);
+      }
+      pendingCounts.add(count);
+    }
     maxCount = Math.max(maxCount, count);
     currentUpto = 0;
     docsWithField.add(currentDoc);
@@ -146,7 +153,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
   private void updateBytesUsed() {
     final long newBytesUsed =
         pending.ramBytesUsed()
-            + pendingCounts.ramBytesUsed()
+            + (pendingCounts == null ? 0 : pendingCounts.ramBytesUsed())
             + docsWithField.ramBytesUsed()
             + RamUsageEstimator.sizeOf(currentValues);
     iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
@@ -160,21 +167,32 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
       finishCurrentDoc();
       int valueCount = hash.size();
       finalOrds = pending.build();
-      finalOrdCounts = pendingCounts.build();
+      finalOrdCounts = pendingCounts == null ? null : pendingCounts.build();
       finalSortedValues = hash.sort();
       finalOrdMap = new int[valueCount];
     }
     for (int ord = 0; ord < finalOrdMap.length; ord++) {
       finalOrdMap[finalSortedValues[ord]] = ord;
     }
-    return new BufferedSortedSetDocValues(
-        finalSortedValues,
-        finalOrdMap,
-        hash,
-        finalOrds,
-        finalOrdCounts,
-        maxCount,
-        docsWithField.iterator());
+    return getValues(
+        finalSortedValues, finalOrdMap, hash, finalOrds, finalOrdCounts, maxCount, docsWithField);
+  }
+
+  private SortedSetDocValues getValues(
+      int[] sortedValues,
+      int[] ordMap,
+      BytesRefHash hash,
+      PackedLongValues ords,
+      PackedLongValues ordCounts,
+      int maxCount,
+      DocsWithFieldSet docsWithField) {
+    if (ordCounts == null) {
+      return DocValues.singleton(
+          new BufferedSortedDocValues(hash, ords, sortedValues, ordMap, docsWithField.iterator()));
+    } else {
+      return new BufferedSortedSetDocValues(
+          sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator());
+    }
   }
 
   @Override
@@ -190,7 +208,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
       assert finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null;
       finishCurrentDoc();
       ords = pending.build();
-      ordCounts = pendingCounts.build();
+      ordCounts = pendingCounts == null ? null : pendingCounts.build();
       sortedValues = hash.sort();
       ordMap = new int[valueCount];
       for (int ord = 0; ord < valueCount; ord++) {
@@ -209,8 +227,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
           new DocOrds(
               state.segmentInfo.maxDoc(),
               sortMap,
-              new BufferedSortedSetDocValues(
-                  sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator()),
+              getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField),
               PackedInts.FASTEST);
     } else {
       docOrds = null;
@@ -224,14 +241,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
               throw new IllegalArgumentException("wrong fieldInfo");
             }
             final SortedSetDocValues buf =
-                new BufferedSortedSetDocValues(
-                    sortedValues,
-                    ordMap,
-                    hash,
-                    ords,
-                    ordCounts,
-                    maxCount,
-                    docsWithField.iterator());
+                getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField);
             if (docOrds == null) {
               return buf;
             } else {