You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/06/28 05:53:09 UTC

[arrow] branch master updated: ARROW-5707: [Java] Improve the performance and code structure for ArrowRecordBatch

This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 36f8411  ARROW-5707: [Java] Improve the performance and code structure for ArrowRecordBatch
36f8411 is described below

commit 36f84112281ef71bca70c26d8a636d8503b16dd6
Author: liyafan82 <fa...@foxmail.com>
AuthorDate: Thu Jun 27 22:51:48 2019 -0700

    ARROW-5707: [Java] Improve the performance and code structure for ArrowRecordBatch
    
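    Improve the performance of ArrowRecordBatch by replacing division/modulo-based 8-byte alignment with bit masking.
    Improve the code structure by moving the rounding logic into the new DataSizeRoundingUtil class and by
    documenting the private clone constructor (it was removed in an earlier revision and then restored).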
    
    Author: liyafan82 <fa...@foxmail.com>
    
    Closes #4674 from liyafan82/fly_0624_arb and squashes the following commits:
    
    07e67dc69 <liyafan82>  Make utility class final
    3d62b3864 <liyafan82>  Restore TestTls
    5c5c19f38 <liyafan82> Merge branch 'master' into fly_0624_arb
    7a22c02a1 <liyafan82>  Add utilities for rounding size
    15773740a <liyafan82>  Add benchmark
    8b5f1ec03 <liyafan82>  Add comments and restore the constructor
    90f23aa9d <liyafan82>  Fix the bug when clearing lower 3 bits
    924ecdf91 <liyafan82>  Improve the performance and code structure for ArrowRecordBatch
---
 .../ipc/message/ArrowRecordBatchBenchmarks.java    | 100 +++++++++++++++++++++
 .../apache/arrow/util/DataSizeRoundingUtil.java    |  99 ++++++++++++++++++++
 .../org/apache/arrow/vector/BaseValueVector.java   |  12 +--
 .../org/apache/arrow/vector/BitVectorHelper.java   |   3 +-
 .../arrow/vector/ipc/message/ArrowRecordBatch.java |  16 ++--
 .../arrow/util/TestDataSizeRoundingUtil.java       |  76 ++++++++++++++++
 6 files changed, 291 insertions(+), 15 deletions(-)

diff --git a/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java
new file mode 100644
index 0000000..ca724f0
--- /dev/null
+++ b/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.ipc.message;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.VarCharVector;
+import org.junit.Test;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+/**
+ * JMH benchmarks for {@link ArrowRecordBatch}: times creating a batch and computing its body length.
+ */
+@State(Scope.Benchmark)
+public class ArrowRecordBatchBenchmarks {
+
+  private static final int VECTOR_CAPACITY = 16 * 1024;
+
+  private static final int VECTOR_LENGTH = 1024;
+
+  private static final int ALLOCATOR_CAPACITY = 1024 * 1024;
+
+  private BufferAllocator allocator;
+
+  private VarCharVector vector;
+
+  private List<ArrowFieldNode> nodes;
+
+  /**
+   * Setup benchmarks: create the allocator, allocate the VarCharVector and build the list of field nodes.
+   */
+  @Setup
+  public void prepare() {
+    allocator = new RootAllocator(ALLOCATOR_CAPACITY);
+    vector = new VarCharVector("vector", allocator);
+    vector.allocateNew(VECTOR_CAPACITY, VECTOR_LENGTH);
+
+    nodes = new ArrayList<>();
+    nodes.add(new ArrowFieldNode(VECTOR_LENGTH, 0));
+    nodes.add(new ArrowFieldNode(VECTOR_LENGTH, 0));
+  }
+
+  /**
+   * Tear down benchmarks: close the vector first, then the allocator it was created from.
+   */
+  @TearDown
+  public void tearDown() {
+    vector.close();
+    allocator.close();
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public int createAndGetLength() {
+    try (ArrowRecordBatch batch = new ArrowRecordBatch(VECTOR_LENGTH, nodes, vector.getFieldBuffers())) {
+      return batch.computeBodyLength();
+    }
+  }
+
+  @Test
+  public void evaluate() throws RunnerException {
+    Options opt = new OptionsBuilder()
+            .include(ArrowRecordBatchBenchmarks.class.getSimpleName())
+            .forks(1)
+            .build();
+
+    new Runner(opt).run();
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/util/DataSizeRoundingUtil.java b/java/vector/src/main/java/org/apache/arrow/util/DataSizeRoundingUtil.java
new file mode 100644
index 0000000..2946503
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/util/DataSizeRoundingUtil.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.util;
+
+/**
+ * Utilities for rounding data sizes to multiples of 8 with bit operations instead of division.
+ */
+public final class DataSizeRoundingUtil {
+
+  /**
+   * The mask for rounding an integer to a multiple of 8.
+   * (i.e. clear the lowest 3 bits)
+   */
+  public static final int ROUND_8_MASK_INT = 0xFFFFFFF8;
+
+  /**
+   * The mask for rounding a long integer to a multiple of 8.
+   * (i.e. clear the lowest 3 bits)
+   */
+  public static final long ROUND_8_MASK_LONG = 0xFFFFFFFFFFFFFFF8L;
+
+  /**
+   * The number of bits to shift for dividing by 8.
+   */
+  public static final int DIVIDE_BY_8_SHIFT_BITS = 3;
+
+  /**
+   * Round up the number to the nearest multiple of 8; inputs above Integer.MAX_VALUE - 7 overflow.
+   * @param input the number to round.
+   * @return the smallest multiple of 8 that is not less than the input.
+   */
+  public static int roundUpTo8Multiple(int input) {
+    return (input + 7) & ROUND_8_MASK_INT;
+  }
+
+  /**
+   * Round up the number to the nearest multiple of 8; inputs above Long.MAX_VALUE - 7 overflow.
+   * @param input the number to round.
+   * @return the smallest multiple of 8 that is not less than the input.
+   */
+  public static long roundUpTo8Multiple(long input) {
+    return (input + 7L) & ROUND_8_MASK_LONG;
+  }
+
+  /**
+   * Round down the number to the nearest multiple of 8 by clearing its lowest 3 bits.
+   * @param input the number to round.
+   * @return the largest multiple of 8 that is not greater than the input.
+   */
+  public static int roundDownTo8Multiple(int input) {
+    return input & ROUND_8_MASK_INT;
+  }
+
+  /**
+   * Round down the number to the nearest multiple of 8 by clearing its lowest 3 bits.
+   * @param input the number to round.
+   * @return the largest multiple of 8 that is not greater than the input.
+   */
+  public static long roundDownTo8Multiple(long input) {
+    return input & ROUND_8_MASK_LONG;
+  }
+
+  /**
+   * A fast way to compute Math.ceil(input / 8.0) for non-negative input (the shift is unsigned).
+   * @param input the input number; values below -7 yield a large positive result.
+   * @return the computed number.
+   */
+  public static int divideBy8Ceil(int input) {
+    return (input + 7) >>> DIVIDE_BY_8_SHIFT_BITS;
+  }
+
+  /**
+   * A fast way to compute Math.ceil(input / 8.0) for non-negative input (the shift is unsigned).
+   * @param input the input number; values below -7 yield a large positive result.
+   * @return the computed number.
+   */
+  public static long divideBy8Ceil(long input) {
+    return (input + 7L) >>> DIVIDE_BY_8_SHIFT_BITS;
+  }
+
+  private DataSizeRoundingUtil() {
+    // utility class, not meant to be instantiated
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
index f2f6a1f..bc12e8e 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
@@ -23,6 +23,7 @@ import java.util.Iterator;
 import org.apache.arrow.memory.BaseAllocator;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.ReferenceManager;
+import org.apache.arrow.util.DataSizeRoundingUtil;
 import org.apache.arrow.util.Preconditions;
 import org.apache.arrow.vector.util.TransferPair;
 import org.slf4j.Logger;
@@ -114,12 +115,7 @@ public abstract class BaseValueVector implements ValueVector {
 
   /* number of bytes for the validity buffer for the given valueCount */
   protected static int getValidityBufferSizeFromCount(final int valueCount) {
-    return (valueCount + 7) >> 3;
-  }
-
-  /* round up to the next multiple of 8 */
-  private static long roundUp8(long size) {
-    return ((size + 7) / 8) * 8;
+    return DataSizeRoundingUtil.divideBy8Ceil(valueCount);
   }
 
   /* round up bytes for the validity buffer for the given valueCount */
@@ -139,7 +135,7 @@ public abstract class BaseValueVector implements ValueVector {
       // for boolean type, value-buffer and validity-buffer are of same size.
       bufferSize *= 2;
     } else {
-      bufferSize += roundUp8(valueCount * typeWidth);
+      bufferSize += DataSizeRoundingUtil.roundUpTo8Multiple(valueCount * typeWidth);
     }
     return BaseAllocator.nextPowerOfTwo(bufferSize);
   }
@@ -179,7 +175,7 @@ public abstract class BaseValueVector implements ValueVector {
       int actualCount = (int) ((bufferSize * 8.0) / (8 * typeWidth + 1));
       do {
         validityBufferSize = (int) roundUp8ForValidityBuffer(actualCount);
-        dataBufferSize = (int) roundUp8(actualCount * typeWidth);
+        dataBufferSize = DataSizeRoundingUtil.roundUpTo8Multiple(actualCount * typeWidth);
         if (validityBufferSize + dataBufferSize <= bufferSize) {
           break;
         }
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java
index 4790f87..d1e9900 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java
@@ -18,6 +18,7 @@
 package org.apache.arrow.vector;
 
 import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.util.DataSizeRoundingUtil;
 import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
 
 import io.netty.buffer.ArrowBuf;
@@ -125,7 +126,7 @@ public class BitVectorHelper {
    * @return buffer size
    */
   public static int getValidityBufferSize(int valueCount) {
-    return (valueCount + 7) >> 3;
+    return DataSizeRoundingUtil.divideBy8Ceil(valueCount);
   }
 
   /**
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java
index 1d69abd..185b44e 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java
@@ -25,6 +25,7 @@ import java.util.stream.Collectors;
 
 import org.apache.arrow.flatbuf.RecordBatch;
 import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.util.DataSizeRoundingUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -71,7 +72,7 @@ public class ArrowRecordBatch implements ArrowMessage {
     this.length = length;
     this.nodes = nodes;
     this.buffers = buffers;
-    List<ArrowBuffer> arrowBuffers = new ArrayList<>();
+    List<ArrowBuffer> arrowBuffers = new ArrayList<>(buffers.size());
     long offset = 0;
     for (ArrowBuf arrowBuf : buffers) {
       arrowBuf.getReferenceManager().retain();
@@ -79,14 +80,17 @@ public class ArrowRecordBatch implements ArrowMessage {
       arrowBuffers.add(new ArrowBuffer(offset, size));
       LOGGER.debug("Buffer in RecordBatch at {}, length: {}", offset, size);
       offset += size;
-      if (alignBuffers && offset % 8 != 0) { // align on 8 byte boundaries
-        offset += 8 - (offset % 8);
+      if (alignBuffers) { // align on 8 byte boundaries
+        offset = DataSizeRoundingUtil.roundUpTo8Multiple(offset);
       }
     }
     this.buffersLayout = Collections.unmodifiableList(arrowBuffers);
   }
 
   // clone constructor
+  // this constructor is different from the public ones in that the reference manager's
+  // <code>retain</code> method is not called, so the first <code>dummy</code> parameter is used
+  // to distinguish this from the public constructor.
   private ArrowRecordBatch(boolean dummy, int length, List<ArrowFieldNode> nodes, List<ArrowBuf> buffers) {
     this.length = length;
     this.nodes = nodes;
@@ -213,9 +217,9 @@ public class ArrowRecordBatch implements ArrowMessage {
       ByteBuffer nioBuffer =
           buffer.nioBuffer(buffer.readerIndex(), buffer.readableBytes());
       size += nioBuffer.remaining();
-      if (size % 8 != 0) {
-        size += 8 - (size % 8);
-      }
+
+      // round up size to the next multiple of 8
+      size = DataSizeRoundingUtil.roundUpTo8Multiple(size);
     }
     return size;
   }
diff --git a/java/vector/src/test/java/org/apache/arrow/util/TestDataSizeRoundingUtil.java b/java/vector/src/test/java/org/apache/arrow/util/TestDataSizeRoundingUtil.java
new file mode 100644
index 0000000..a04a957
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/util/TestDataSizeRoundingUtil.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.util;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+/**
+ * Test cases for {@link DataSizeRoundingUtil}: round up/down to a multiple of 8 and ceiling division by 8.
+ */
+public class TestDataSizeRoundingUtil {
+
+  @Test
+  public void testRoundUpTo8MultipleInt() {
+    assertEquals(0, DataSizeRoundingUtil.roundUpTo8Multiple(0));
+    assertEquals(16, DataSizeRoundingUtil.roundUpTo8Multiple(9));
+    assertEquals(24, DataSizeRoundingUtil.roundUpTo8Multiple(20));
+    assertEquals(128, DataSizeRoundingUtil.roundUpTo8Multiple(128));
+  }
+
+  @Test
+  public void testRoundUpTo8MultipleLong() {
+    assertEquals(0L, DataSizeRoundingUtil.roundUpTo8Multiple(0L));
+    assertEquals(40L, DataSizeRoundingUtil.roundUpTo8Multiple(37L));
+    assertEquals(32L, DataSizeRoundingUtil.roundUpTo8Multiple(29L));
+    assertEquals(512L, DataSizeRoundingUtil.roundUpTo8Multiple(512L));
+  }
+
+  @Test
+  public void testRoundDownTo8MultipleInt() {
+    assertEquals(0, DataSizeRoundingUtil.roundDownTo8Multiple(0));
+    assertEquals(16, DataSizeRoundingUtil.roundDownTo8Multiple(23));
+    assertEquals(24, DataSizeRoundingUtil.roundDownTo8Multiple(27));
+    assertEquals(128, DataSizeRoundingUtil.roundDownTo8Multiple(128));
+  }
+
+  @Test
+  public void testRoundDownTo8MultipleLong() {
+    assertEquals(0L, DataSizeRoundingUtil.roundDownTo8Multiple(0L));
+    assertEquals(40L, DataSizeRoundingUtil.roundDownTo8Multiple(45L));
+    assertEquals(32L, DataSizeRoundingUtil.roundDownTo8Multiple(39L));
+    assertEquals(512L, DataSizeRoundingUtil.roundDownTo8Multiple(512L));
+  }
+
+  @Test
+  public void testDivideBy8CeilInt() {
+    assertEquals(0, DataSizeRoundingUtil.divideBy8Ceil(0));
+    assertEquals(3, DataSizeRoundingUtil.divideBy8Ceil(23));
+    assertEquals(5, DataSizeRoundingUtil.divideBy8Ceil(35));
+    assertEquals(24, DataSizeRoundingUtil.divideBy8Ceil(192));
+  }
+
+  @Test
+  public void testDivideBy8CeilLong() {
+    assertEquals(0L, DataSizeRoundingUtil.divideBy8Ceil(0L));
+    assertEquals(5L, DataSizeRoundingUtil.divideBy8Ceil(37L));
+    assertEquals(10L, DataSizeRoundingUtil.divideBy8Ceil(73L));
+    assertEquals(25L, DataSizeRoundingUtil.divideBy8Ceil(200L));
+  }
+}