You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2020/09/28 02:10:16 UTC

[arrow] branch master updated: ARROW-9965: [Java] Improve performance of BaseFixedWidthVector.setSafe by optimizing capacity calculations

This is an automated email from the ASF dual-hosted git repository.

liyafan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 848c225  ARROW-9965: [Java] Improve performance of BaseFixedWidthVector.setSafe by optimizing capacity calculations
848c225 is described below

commit 848c22541e530ab25b7146a7a9a3653a403dd9af
Author: Josiah <jo...@gmail.com>
AuthorDate: Mon Sep 28 10:08:51 2020 +0800

    ARROW-9965: [Java] Improve performance of BaseFixedWidthVector.setSafe by optimizing capacity calculations
    
    It turns out that setSafe performs a very expensive integer division when trying to compute buffer capacity; specifically, it divides by the field size, which isn't hardcoded. Although it is typically a power of 2, this doesn't compile down to a bitshift.
    
    Special-casing and forcing a bitshift operation results in a ~300% increase in benchmarks that use a hot loop to set Arrow vectors. We have a similar use-case in an internal data-intensive service.
    
    Benchmark results with arrow.enable_unsafe_memory_access=true
    
    Before:
    ```
    Benchmark Mode Cnt Score Error Units
    IntBenchmarks.setIntDirectly avgt 15 9.563 ± 0.335 us/op
    IntBenchmarks.setWithValueHolder avgt 15 9.266 ± 0.064 us/op
    IntBenchmarks.setWithWriter avgt 15 18.806 ± 0.154 us/op
    ```
    
    After:
    ```
    Benchmark Mode Cnt Score Error Units
    IntBenchmarks.setIntDirectly avgt 15 3.490 ± 0.175 us/op
    IntBenchmarks.setWithValueHolder avgt 15 3.806 ± 0.015 us/op
    IntBenchmarks.setWithWriter avgt 15 5.490 ± 0.304 us/op
    ```
    
    See https://issues.apache.org/jira/browse/ARROW-9965 for further benchmarks, and an analysis of the root cause of the slowdown.
    
    Closes #8214 from josiahyan/ARROW-9965
    
    Authored-by: Josiah <jo...@gmail.com>
    Signed-off-by: liyafan82 <fa...@foxmail.com>
---
 .../apache/arrow/vector/BaseFixedWidthVector.java  | 23 +++++++++++++++++++---
 .../java/org/apache/arrow/vector/BitVector.java    | 10 +++-------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
index ee47f6d..ea31a4f 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
@@ -49,6 +49,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
   private final int typeWidth;
 
   protected int lastValueCapacity;
+  protected int actualValueCapacity;
 
   protected final Field field;
   private int allocationMonitor;
@@ -72,6 +73,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
     validityBuffer = allocator.getEmpty();
     valueBuffer = allocator.getEmpty();
     lastValueCapacity = INITIAL_VALUE_ALLOCATION;
+    refreshValueCapacity();
   }
 
 
@@ -182,14 +184,21 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
    */
   @Override
   public int getValueCapacity() {
-    return Math.min(getValueBufferValueCapacity(), getValidityBufferValueCapacity());
+    return actualValueCapacity;
   }
 
-  private int getValueBufferValueCapacity() {
+  /**
+   * Call this if you change the capacity of valueBuffer or validityBuffer.
+   */
+  protected void refreshValueCapacity() {
+    actualValueCapacity = Math.min(getValueBufferValueCapacity(), getValidityBufferValueCapacity());
+  }
+
+  protected int getValueBufferValueCapacity() {
     return capAtMaxInt(valueBuffer.capacity() / typeWidth);
   }
 
-  private int getValidityBufferValueCapacity() {
+  protected int getValidityBufferValueCapacity() {
     return capAtMaxInt(validityBuffer.capacity() * 8);
   }
 
@@ -238,6 +247,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
     valueCount = 0;
     validityBuffer = releaseBuffer(validityBuffer);
     valueBuffer = releaseBuffer(valueBuffer);
+    refreshValueCapacity();
   }
 
   /* used to step down the memory allocation */
@@ -331,6 +341,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
     validityBuffer = buffers.getValidityBuf();
     zeroVector();
 
+    refreshValueCapacity();
     lastValueCapacity = getValueCapacity();
   }
 
@@ -343,6 +354,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
   private void allocateValidityBuffer(final int validityBufferSize) {
     validityBuffer = allocator.buffer(validityBufferSize);
     validityBuffer.readerIndex(0);
+    refreshValueCapacity();
   }
 
   /**
@@ -441,6 +453,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
     validityBuffer.getReferenceManager().release();
     validityBuffer = newValidityBuffer;
 
+    refreshValueCapacity();
     lastValueCapacity = getValueCapacity();
   }
 
@@ -492,6 +505,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
     validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator);
     valueBuffer.getReferenceManager().release();
     valueBuffer = dataBuffer.getReferenceManager().retain(dataBuffer, allocator);
+    refreshValueCapacity();
 
     valueCount = fieldNode.getLength();
   }
@@ -572,6 +586,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
     target.validityBuffer = transferBuffer(validityBuffer, target.allocator);
     target.valueBuffer = transferBuffer(valueBuffer, target.allocator);
     target.valueCount = valueCount;
+    target.refreshValueCapacity();
     clear();
   }
 
@@ -602,6 +617,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
     final int sliceLength = length * typeWidth;
     final ArrowBuf slicedBuffer = valueBuffer.slice(startPoint, sliceLength);
     target.valueBuffer = transferBuffer(slicedBuffer, target.allocator);
+    target.refreshValueCapacity();
   }
 
   /**
@@ -623,6 +639,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector
         }
         target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget);
         target.validityBuffer.getReferenceManager().retain(1);
+        target.refreshValueCapacity();
       } else {
         /* Copy data
          * When the first bit starts from the middle of a byte (offset != 0),
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
index 14d357f..3bcfd98 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
@@ -119,14 +119,9 @@ public final class BitVector extends BaseFixedWidthVector {
     lastValueCapacity = valueCount;
   }
 
-  /**
-   * Get the current value capacity for the vector.
-   *
-   * @return number of elements that vector can hold.
-   */
   @Override
-  public int getValueCapacity() {
-    return capAtMaxInt(validityBuffer.capacity() * 8);
+  protected int getValueBufferValueCapacity() {
+    return capAtMaxInt(valueBuffer.capacity() * 8);
   }
 
   /**
@@ -171,6 +166,7 @@ public final class BitVector extends BaseFixedWidthVector {
             validityBuffer, target.validityBuffer);
     target.valueBuffer = splitAndTransferBuffer(startIndex, length, target,
             valueBuffer, target.valueBuffer);
+    target.refreshValueCapacity();
 
     target.setValueCount(length);
   }