Posted to commits@lucene.apache.org by jp...@apache.org on 2023/03/16 12:55:14 UTC
[lucene] branch branch_9x updated: Fully reuse postings enums when flushing sorted indexes. (#12206)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 7d36a4073fa Fully reuse postings enums when flushing sorted indexes. (#12206)
7d36a4073fa is described below
commit 7d36a4073fa548e296eb063f842e63a90208c012
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Thu Mar 16 13:51:33 2023 +0100
Fully reuse postings enums when flushing sorted indexes. (#12206)
Currently we only half-reuse postings enums when flushing sorted indexes:
the wrapped enum is reused, but a new wrapper instance is still created for
every term, which can be costly for fields that have many terms.
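A minimal, self-contained Java sketch of this reuse pattern (hypothetical names,
not the actual Lucene classes; the real change keeps a single SortingDocsEnum /
SortingPostingsEnum and re-fills it through a new reset() method, as the diff
below shows):

    import java.util.Arrays;

    // Sketch of the reuse pattern: one wrapper instance is kept and re-filled via
    // reset() for each term, instead of allocating a new wrapper per term.
    // ReusableSortedDocs is a hypothetical stand-in, not a Lucene class.
    final class ReusableSortedDocs {
      private int[] docs = new int[0]; // retained (and grown) across reset() calls
      private int upTo;
      private int docIt;

      // Re-fill the wrapper with the doc IDs of the next term and sort them.
      void reset(int[] termDocs, int count) {
        if (docs.length < count) {
          docs = new int[Math.max(count, docs.length * 2)];
        }
        System.arraycopy(termDocs, 0, docs, 0, count);
        Arrays.sort(docs, 0, count); // stand-in for LSBRadixSorter.sort
        upTo = count;
        docIt = -1;
      }

      // Next doc ID in sorted order, or -1 when exhausted.
      int nextDoc() {
        return ++docIt < upTo ? docs[docIt] : -1;
      }
    }

    class ReuseDemo {
      public static void main(String[] args) {
        ReusableSortedDocs postings = new ReusableSortedDocs(); // single instance
        int[][] termDocLists = {{5, 2, 9}, {7, 1}, {3, 8, 4, 6}};
        for (int[] termDocs : termDocLists) {
          postings.reset(termDocs, termDocs.length); // reuse instead of "new"
          StringBuilder sb = new StringBuilder();
          for (int doc = postings.nextDoc(); doc != -1; doc = postings.nextDoc()) {
            sb.append(doc).append(' ');
          }
          System.out.println(sb.toString().trim()); // "2 5 9", "1 7", "3 4 6 8"
        }
      }
    }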
---
.../apache/lucene/index/FreqProxTermsWriter.java | 108 +++++++++------------
.../apache/lucene/store/ByteBuffersDataInput.java | 14 ++-
2 files changed, 56 insertions(+), 66 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
index 5ba0df2bed0..c178b5aa63b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
@@ -188,7 +188,7 @@ final class FreqProxTermsWriter extends TermsHash {
wrapReuse = (SortingPostingsEnum) reuse;
inReuse = wrapReuse.getWrapped();
} else {
- wrapReuse = null;
+ wrapReuse = new SortingPostingsEnum();
inReuse = reuse;
}
@@ -201,8 +201,8 @@ final class FreqProxTermsWriter extends TermsHash {
indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
final boolean storeOffsets =
indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
- return new SortingPostingsEnum(
- docMap.size(), wrapReuse, inDocsAndPositions, docMap, storePositions, storeOffsets);
+ wrapReuse.reset(docMap, inDocsAndPositions, storePositions, storeOffsets);
+ return wrapReuse;
}
final PostingsEnum inReuse;
@@ -213,33 +213,29 @@ final class FreqProxTermsWriter extends TermsHash {
wrapReuse = (SortingDocsEnum) reuse;
inReuse = wrapReuse.getWrapped();
} else {
- wrapReuse = null;
- inReuse = null;
+ wrapReuse = new SortingDocsEnum();
+ inReuse = reuse;
}
final PostingsEnum inDocs = in.postings(inReuse, flags);
- return new SortingDocsEnum(docMap.size(), wrapReuse, inDocs, docMap);
+ wrapReuse.reset(docMap, inDocs);
+ return wrapReuse;
}
}
static class SortingDocsEnum extends PostingsEnum {
- private final PostingsEnum in;
private final LSBRadixSorter sorter;
- private int[] docs;
- private int docIt = -1;
- private final int upTo;
+ private PostingsEnum in;
+ private int[] docs = IntsRef.EMPTY_INTS;
+ private int docIt;
+ private int upTo;
- SortingDocsEnum(
- int maxDoc, SortingDocsEnum reuse, final PostingsEnum in, final Sorter.DocMap docMap)
- throws IOException {
- if (reuse != null) {
- sorter = reuse.sorter;
- docs = reuse.docs;
- } else {
- sorter = new LSBRadixSorter();
- docs = IntsRef.EMPTY_INTS;
- }
+ SortingDocsEnum() {
+ sorter = new LSBRadixSorter();
+ }
+
+ void reset(Sorter.DocMap docMap, PostingsEnum in) throws IOException {
this.in = in;
int i = 0;
for (int doc = in.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = in.nextDoc()) {
@@ -253,10 +249,12 @@ final class FreqProxTermsWriter extends TermsHash {
docs = ArrayUtil.grow(docs);
}
docs[upTo] = DocIdSetIterator.NO_MORE_DOCS;
+ final int maxDoc = docMap.size();
final int numBits = PackedInts.bitsRequired(Math.max(0, maxDoc - 1));
// Even though LSBRadixSorter cannot take advantage of partial ordering like TimSorter it is
// often still faster for nearly-sorted inputs.
sorter.sort(numBits, docs, upTo);
+ docIt = -1;
}
PostingsEnum getWrapped() {
@@ -311,7 +309,7 @@ final class FreqProxTermsWriter extends TermsHash {
}
}
- static class SortingPostingsEnum extends FilterLeafReader.FilterPostingsEnum {
+ static class SortingPostingsEnum extends PostingsEnum {
/**
* A {@link TimSorter} which sorts two parallel arrays of doc IDs and offsets in one go. Everyti
@@ -324,8 +322,8 @@ final class FreqProxTermsWriter extends TermsHash {
private int[] tmpDocs;
private long[] tmpOffsets;
- public DocOffsetSorter(int maxDoc) {
- super(maxDoc / 8);
+ public DocOffsetSorter(int numTempSlots) {
+ super(numTempSlots);
this.tmpDocs = IntsRef.EMPTY_INTS;
this.tmpOffsets = LongsRef.EMPTY_LONGS;
}
@@ -379,55 +377,38 @@ final class FreqProxTermsWriter extends TermsHash {
}
}
- private final int maxDoc;
- private final DocOffsetSorter sorter;
- private int[] docs;
- private long[] offsets;
- private final int upto;
+ private DocOffsetSorter sorter;
+ private int[] docs = IntsRef.EMPTY_INTS;
+ private long[] offsets = LongsRef.EMPTY_LONGS;
+ private int upto;
- private final ByteBuffersDataInput postingInput;
- private final boolean storePositions, storeOffsets;
+ private ByteBuffersDataInput postingInput;
+ private PostingsEnum in;
+ private boolean storePositions, storeOffsets;
- private int docIt = -1;
+ private int docIt;
private int pos;
- private int startOffset = -1;
- private int endOffset = -1;
- private final BytesRef payload;
+ private int startOffset;
+ private int endOffset;
+ private final BytesRef payload = new BytesRef();
private int currFreq;
- private final ByteBuffersDataOutput buffer;
+ private final ByteBuffersDataOutput buffer = ByteBuffersDataOutput.newResettableInstance();
- SortingPostingsEnum(
- int maxDoc,
- SortingPostingsEnum reuse,
- final PostingsEnum in,
- Sorter.DocMap docMap,
- boolean storePositions,
- boolean storeOffsets)
+ void reset(Sorter.DocMap docMap, PostingsEnum in, boolean storePositions, boolean storeOffsets)
throws IOException {
- super(in);
- this.maxDoc = maxDoc;
+ this.in = in;
this.storePositions = storePositions;
this.storeOffsets = storeOffsets;
- if (reuse != null) {
- docs = reuse.docs;
- offsets = reuse.offsets;
- payload = reuse.payload;
- buffer = reuse.buffer;
- buffer.reset();
- if (reuse.maxDoc == maxDoc) {
- sorter = reuse.sorter;
- } else {
- sorter = new DocOffsetSorter(maxDoc);
- }
- } else {
- docs = new int[32];
- offsets = new long[32];
- payload = new BytesRef(32);
- buffer = ByteBuffersDataOutput.newResettableInstance();
- sorter = new DocOffsetSorter(maxDoc);
+ if (sorter == null) {
+ final int numTempSlots = docMap.size() / 8;
+ sorter = new DocOffsetSorter(numTempSlots);
}
+ docIt = -1;
+ startOffset = -1;
+ endOffset = -1;
+ buffer.reset();
int doc;
int i = 0;
while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
@@ -548,5 +529,10 @@ final class FreqProxTermsWriter extends TermsHash {
PostingsEnum getWrapped() {
return in;
}
+
+ @Override
+ public long cost() {
+ return in.cost();
+ }
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java
index 99e679d8ec2..fdc11381548 100644
--- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java
+++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java
@@ -55,10 +55,10 @@ public final class ByteBuffersDataInput extends DataInput
public ByteBuffersDataInput(List<ByteBuffer> buffers) {
ensureAssumptions(buffers);
- this.blocks =
- buffers.stream()
- .map(buf -> buf.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN))
- .toArray(ByteBuffer[]::new);
+ this.blocks = buffers.toArray(ByteBuffer[]::new);
+ for (int i = 0; i < blocks.length; ++i) {
+ blocks[i] = blocks[i].asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
+ }
// pre-allocate these arrays and create the view buffers lazily
this.floatBuffers = new FloatBuffer[blocks.length * Float.BYTES];
this.longBuffers = new LongBuffer[blocks.length * Long.BYTES];
@@ -71,7 +71,11 @@ public final class ByteBuffersDataInput extends DataInput
this.blockMask = (1 << blockBits) - 1;
}
- this.size = Arrays.stream(blocks).mapToLong(block -> block.remaining()).sum();
+ long size = 0;
+ for (ByteBuffer block : blocks) {
+ size += block.remaining();
+ }
+ this.size = size;
// The initial "position" of this stream is shifted by the position of the first block.
this.offset = blocks[0].position();
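For context on the ByteBuffersDataInput part of the diff: the constructor now
builds the read-only block array and computes the total size with plain loops
instead of Stream pipelines, presumably to avoid the intermediate Stream and
lambda allocations on a path that is hit once per reset of the reused enum.
A standalone sketch of the two forms (illustrative only, not the Lucene code):

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.util.List;

    // Standalone illustration of the ByteBuffersDataInput change: the same result
    // computed once with a Stream pipeline and once with plain loops. Class and
    // variable names are illustrative, not taken from Lucene.
    public class LoopVsStreamDemo {
      public static void main(String[] args) {
        List<ByteBuffer> buffers =
            List.of(ByteBuffer.wrap(new byte[16]), ByteBuffer.wrap(new byte[32]));

        // Stream form (what the patch removes): allocates a Stream plus lambdas.
        long streamSize = buffers.stream().mapToLong(ByteBuffer::remaining).sum();

        // Loop form (what the patch adds): same result, no intermediate objects.
        ByteBuffer[] blocks = buffers.toArray(ByteBuffer[]::new);
        for (int i = 0; i < blocks.length; ++i) {
          blocks[i] = blocks[i].asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
        }
        long loopSize = 0;
        for (ByteBuffer block : blocks) {
          loopSize += block.remaining();
        }

        System.out.println(streamSize + " == " + loopSize); // prints "48 == 48"
      }
    }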