You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by si...@apache.org on 2018/01/25 18:33:30 UTC

[arrow] branch master updated: ARROW-2019: [JAVA] Control the memory allocated for inner vector in LIST (#1497)

This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a9d024  ARROW-2019: [JAVA] Control the memory allocated for inner vector in LIST (#1497)
1a9d024 is described below

commit 1a9d024781e8435e6ae010c55c32c9a9d7fa1e16
Author: Sidd <si...@dremio.com>
AuthorDate: Thu Jan 25 10:33:27 2018 -0800

    ARROW-2019: [JAVA] Control the memory allocated for inner vector in LIST (#1497)
    
    * ARROW-2019: [JAVA] Control the memory allocated for inner vector in LIST
    
    * address review comments
---
 .../arrow/vector/BaseVariableWidthVector.java      | 36 ++++++++++++
 .../vector/complex/BaseRepeatedValueVector.java    | 32 ++++++++++
 .../apache/arrow/vector/complex/ListVector.java    | 57 +++++++++++++++---
 .../org/apache/arrow/vector/TestListVector.java    | 68 ++++++++++++++++++++++
 .../org/apache/arrow/vector/TestValueVector.java   | 36 ++++++++++++
 .../org/apache/arrow/vector/TestVectorReAlloc.java |  4 +-
 6 files changed, 224 insertions(+), 9 deletions(-)

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index fff329a..d1190ce 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -170,6 +170,42 @@ public abstract class BaseVariableWidthVector extends BaseValueVector
   }
 
   /**
+   * Sets the desired value capacity for the vector. This function doesn't
+   * allocate any memory for the vector.
+   * @param valueCount desired number of elements in the vector
+   * @param density average number of bytes per variable width element
+   */
+  public void setInitialCapacity(int valueCount, double density) {
+    final long size = (long) (valueCount * density);
+    if (size < 1) {
+      throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0");
+    }
+    if (size > MAX_ALLOCATION_SIZE) {
+      throw new OversizedAllocationException("Requested amount of memory is more than max allowed");
+    }
+    valueAllocationSizeInBytes = (int) size;
+    validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount);
+    /* to track the end offset of last data element in vector, we need
+     * an additional slot in offset buffer.
+     */
+    offsetAllocationSizeInBytes = (valueCount + 1) * OFFSET_WIDTH;
+  }
+
+  /**
+   * Get the density of this vector: the average number of data bytes per element
+   * @return density
+   */
+  public double getDensity() {
+    if (valueCount == 0) {
+      return 0.0D;
+    }
+    final int startOffset = offsetBuffer.getInt(0);
+    final int endOffset = offsetBuffer.getInt(valueCount * OFFSET_WIDTH);
+    final double totalListSize = endOffset - startOffset;
+    return totalListSize/valueCount;
+  }
+
+  /**
    * Get the current value capacity for the vector
    * @return number of elements that vector can hold.
    */
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
index d0a664a..50ee3a7 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
@@ -143,6 +143,38 @@ public abstract class BaseRepeatedValueVector extends BaseValueVector implements
     }
   }
 
+  /**
+   * Specialized version of setInitialCapacity() for ListVector. This is
+   * used by some callers when they want to explicitly control and be
+   * conservative about memory allocated for inner data vector. This is
+   * very useful when we are working with memory constraints for a query
+   * and have a fixed amount of memory reserved for the record batch. In
+   * such cases, we are likely to face OOM or related problems when
+   * we reserve memory for a record batch with value count x and
+   * call setInitialCapacity(x) so that each vector allocates only
+   * what is necessary rather than the default amount, yet the default
+   * multiplier pushes the inner vector's memory requirement beyond that.
+   *
+   * @param numRecords value count
+   * @param density density of ListVector. Density is the average size of
+   *                list per position in the List vector. For example, a
+   *                density value of 10 implies each position in the list
+   *                vector has a list of 10 values.
+   *                A density value of 0.1 implies out of 10 positions in
+   *                the list vector, 1 position has a list of size 1 and
+   *                remaining positions are null (no lists) or empty lists.
+   *                This helps in tightly controlling the memory we provision
+   *                for inner data vector.
+   */
+  public void setInitialCapacity(int numRecords, double density) {
+    offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
+    final int innerValueCapacity = (int)(numRecords * density);
+    if (innerValueCapacity < 1) {
+      throw new IllegalArgumentException("With the provided density and value count, potential value capacity for the data vector is 0");
+    }
+    vector.setInitialCapacity(innerValueCapacity);
+  }
+
   @Override
   public int getValueCapacity() {
     final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0);
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 8aeeb7e..b472dae 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -31,12 +31,7 @@ import io.netty.buffer.ArrowBuf;
 import org.apache.arrow.memory.BaseAllocator;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.OutOfMemoryException;
-import org.apache.arrow.vector.AddOrGetResult;
-import org.apache.arrow.vector.BufferBacked;
-import org.apache.arrow.vector.FieldVector;
-import org.apache.arrow.vector.ValueVector;
-import org.apache.arrow.vector.ZeroVector;
-import org.apache.arrow.vector.BitVectorHelper;
+import org.apache.arrow.vector.*;
 import org.apache.arrow.vector.complex.impl.ComplexCopier;
 import org.apache.arrow.vector.complex.impl.UnionListReader;
 import org.apache.arrow.vector.complex.impl.UnionListWriter;
@@ -103,6 +98,54 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector,
   }
 
   @Override
+  public void setInitialCapacity(int numRecords) {
+    validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords);
+    super.setInitialCapacity(numRecords);
+  }
+
+  /**
+   * Specialized version of setInitialCapacity() for ListVector. This is
+   * used by some callers when they want to explicitly control and be
+   * conservative about memory allocated for inner data vector. This is
+   * very useful when we are working with memory constraints for a query
+   * and have a fixed amount of memory reserved for the record batch. In
+   * such cases, we are likely to face OOM or related problems when
+   * we reserve memory for a record batch with value count x and
+   * call setInitialCapacity(x) so that each vector allocates only
+   * what is necessary rather than the default amount, yet the default
+   * multiplier pushes the inner vector's memory requirement beyond that.
+   *
+   * @param numRecords value count
+   * @param density density of ListVector. Density is the average size of
+   *                list per position in the List vector. For example, a
+   *                density value of 10 implies each position in the list
+   *                vector has a list of 10 values.
+   *                A density value of 0.1 implies out of 10 positions in
+   *                the list vector, 1 position has a list of size 1 and
+   *                remaining positions are null (no lists) or empty lists.
+   *                This helps in tightly controlling the memory we provision
+   *                for inner data vector.
+   */
+  public void setInitialCapacity(int numRecords, double density) {
+    validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords);
+    super.setInitialCapacity(numRecords, density);
+  }
+
+  /**
+   * Get the density of this ListVector
+   * @return density
+   */
+  public double getDensity() {
+    if (valueCount == 0) {
+      return 0.0D;
+    }
+    final int startOffset = offsetBuffer.getInt(0);
+    final int endOffset = offsetBuffer.getInt(valueCount * OFFSET_WIDTH);
+    final double totalListSize = endOffset - startOffset;
+    return totalListSize/valueCount;
+  }
+
+  @Override
   public List<FieldVector> getChildrenFromFields() {
     return singletonList(getDataVector());
   }
@@ -623,7 +666,7 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector,
    */
   @Override
   public int getValueCapacity() {
-    return Math.min(getValidityBufferValueCapacity(), super.getValueCapacity());
+    return getValidityAndOffsetValueCapacity();
   }
 
   private int getValidityAndOffsetValueCapacity() {
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index e2023f4..d49a677 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -112,6 +112,9 @@ public class TestListVector {
       result = outVector.getObject(2);
       resultSet = (ArrayList<Long>) result;
       assertEquals(0, resultSet.size());
+
+      /* (3+0+0)/3 */
+      assertEquals(1.0D, inVector.getDensity(), 0);
     }
   }
 
@@ -209,6 +212,9 @@ public class TestListVector {
       listVector.setLastSet(3);
       listVector.setValueCount(10);
 
+      /* (3+2+3)/10 */
+      assertEquals(0.8D, listVector.getDensity(), 0);
+
       index = 0;
       offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH);
       assertEquals(Integer.toString(0), Integer.toString(offset));
@@ -709,6 +715,8 @@ public class TestListVector {
       listWriter.bigInt().writeBigInt(300);
       listWriter.endList();
 
+      listVector.setValueCount(2);
+
       /* check listVector contents */
       Object result = listVector.getObject(0);
       ArrayList<Long> resultSet = (ArrayList<Long>) result;
@@ -739,6 +747,9 @@ public class TestListVector {
       assertEquals(2, buffers.size());
       assertEquals(bitAddress, buffers.get(0).memoryAddress());
       assertEquals(offsetAddress, buffers.get(1).memoryAddress());
+
+      /* (3+2)/2 */
+      assertEquals(2.5, listVector.getDensity(), 0);
     }
   }
 
@@ -753,4 +764,61 @@ public class TestListVector {
       assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME));
     }
   }
+
+  @Test
+  public void testSetInitialCapacity() {
+    try (final ListVector vector = ListVector.empty("", allocator)) {
+      vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
+
+      /**
+       * use the default multiplier of 5,
+       * 512 * 5 => 2560 * 4 => 10240 bytes => 16KB => 4096 value capacity.
+       */
+      vector.setInitialCapacity(512);
+      vector.allocateNew();
+      assertEquals(512, vector.getValueCapacity());
+      assertEquals(4096, vector.getDataVector().getValueCapacity());
+
+      /* use density as 4 */
+      vector.setInitialCapacity(512, 4);
+      vector.allocateNew();
+      assertEquals(512, vector.getValueCapacity());
+      assertEquals(512*4, vector.getDataVector().getValueCapacity());
+
+      /**
+       * inner value capacity we pass to data vector is 512 * 0.1 => 51
+       * For an int vector this is 204 bytes of memory for data buffer
+       * and 7 bytes for validity buffer.
+       * and with power of 2 allocation, we allocate 256 bytes and 8 bytes
+       * for the data buffer and validity buffer of the inner vector. Thus
+       * value capacity of inner vector is 64
+       */
+      vector.setInitialCapacity(512, 0.1);
+      vector.allocateNew();
+      assertEquals(512, vector.getValueCapacity());
+      assertEquals(64, vector.getDataVector().getValueCapacity());
+
+      /**
+       * inner value capacity we pass to data vector is 512 * 0.01 => 5
+       * For an int vector this is 20 bytes of memory for data buffer
+       * and 1 byte for validity buffer.
+       * and with power of 2 allocation, we allocate 32 bytes and 1 byte
+       * for the data buffer and validity buffer of the inner vector. Thus
+       * value capacity of inner vector is 8
+       */
+      vector.setInitialCapacity(512, 0.01);
+      vector.allocateNew();
+      assertEquals(512, vector.getValueCapacity());
+      assertEquals(8, vector.getDataVector().getValueCapacity());
+
+      boolean error = false;
+      try {
+        vector.setInitialCapacity(5, 0.1);
+      } catch (IllegalArgumentException e) {
+        error = true;
+      } finally {
+        assertTrue(error);
+      }
+    }
+  }
 }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index 601b206..992bb62 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -1908,4 +1908,40 @@ public class TestValueVector {
     vector.offsetBuffer.setInt((index + 1) * vector.OFFSET_WIDTH, currentOffset + bytes.length);
     vector.valueBuffer.setBytes(currentOffset, bytes, 0, bytes.length);
   }
+
+  @Test /* VarCharVector */
+  public void testSetInitialCapacity() {
+    try (final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) {
+
+      /* use the default 8 data bytes on average per element */
+      vector.setInitialCapacity(4096);
+      vector.allocateNew();
+      assertEquals(4096, vector.getValueCapacity());
+      assertEquals(4096 * 8, vector.getDataBuffer().capacity());
+
+      vector.setInitialCapacity(4096, 1);
+      vector.allocateNew();
+      assertEquals(4096, vector.getValueCapacity());
+      assertEquals(4096, vector.getDataBuffer().capacity());
+
+      vector.setInitialCapacity(4096, 0.1);
+      vector.allocateNew();
+      assertEquals(4096, vector.getValueCapacity());
+      assertEquals(512, vector.getDataBuffer().capacity());
+
+      vector.setInitialCapacity(4096, 0.01);
+      vector.allocateNew();
+      assertEquals(4096, vector.getValueCapacity());
+      assertEquals(64, vector.getDataBuffer().capacity());
+
+      boolean error = false;
+      try {
+        vector.setInitialCapacity(5, 0.1);
+      } catch (IllegalArgumentException e) {
+        error = true;
+      } finally {
+        assertTrue(error);
+      }
+    }
+  }
 }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
index f8edf89..ca039c5 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
@@ -104,7 +104,7 @@ public class TestVectorReAlloc {
       vector.setInitialCapacity(512);
       vector.allocateNew();
 
-      assertEquals(1023, vector.getValueCapacity());
+      assertEquals(512, vector.getValueCapacity());
 
       try {
         vector.getInnerValueCountAt(2014);
@@ -114,7 +114,7 @@ public class TestVectorReAlloc {
       }
 
       vector.reAlloc();
-      assertEquals(2047, vector.getValueCapacity()); // note: size - 1
+      assertEquals(1024, vector.getValueCapacity());
       assertEquals(0, vector.getOffsetBuffer().getInt(2014 * ListVector.OFFSET_WIDTH));
     }
   }

-- 
To stop receiving notification emails like this one, please contact
siddteotia@apache.org.