You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by si...@apache.org on 2018/01/25 18:33:30 UTC
[arrow] branch master updated: ARROW-2019: [JAVA] Control the
memory allocated for inner vector in LIST (#1497)
This is an automated email from the ASF dual-hosted git repository.
siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1a9d024 ARROW-2019: [JAVA] Control the memory allocated for inner vector in LIST (#1497)
1a9d024 is described below
commit 1a9d024781e8435e6ae010c55c32c9a9d7fa1e16
Author: Sidd <si...@dremio.com>
AuthorDate: Thu Jan 25 10:33:27 2018 -0800
ARROW-2019: [JAVA] Control the memory allocated for inner vector in LIST (#1497)
* ARROW-2019: [JAVA] Control the memory allocated for inner vector in LIST
* address review comments
---
.../arrow/vector/BaseVariableWidthVector.java | 36 ++++++++++++
.../vector/complex/BaseRepeatedValueVector.java | 32 ++++++++++
.../apache/arrow/vector/complex/ListVector.java | 57 +++++++++++++++---
.../org/apache/arrow/vector/TestListVector.java | 68 ++++++++++++++++++++++
.../org/apache/arrow/vector/TestValueVector.java | 36 ++++++++++++
.../org/apache/arrow/vector/TestVectorReAlloc.java | 4 +-
6 files changed, 224 insertions(+), 9 deletions(-)
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index fff329a..d1190ce 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -170,6 +170,42 @@ public abstract class BaseVariableWidthVector extends BaseValueVector
}
/**
+ * Sets the desired value capacity for the vector. This function doesn't
+ * allocate any memory for the vector.
+ * @param valueCount desired number of elements in the vector
+ * @param density average number of bytes per variable width element
+ */
+ public void setInitialCapacity(int valueCount, double density) {
+ final long size = (long) (valueCount * density);
+ if (size < 1) {
+ throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0");
+ }
+ if (size > MAX_ALLOCATION_SIZE) {
+ throw new OversizedAllocationException("Requested amount of memory is more than max allowed");
+ }
+ valueAllocationSizeInBytes = (int) size;
+ validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount);
+ /* to track the end offset of last data element in vector, we need
+ * an additional slot in offset buffer.
+ */
+ offsetAllocationSizeInBytes = (valueCount + 1) * OFFSET_WIDTH;
+ }
+
+ /**
+ * Get the density of this ListVector
+ * @return density
+ */
+ public double getDensity() {
+ if (valueCount == 0) {
+ return 0.0D;
+ }
+ final int startOffset = offsetBuffer.getInt(0);
+ final int endOffset = offsetBuffer.getInt(valueCount * OFFSET_WIDTH);
+ final double totalListSize = endOffset - startOffset;
+ return totalListSize/valueCount;
+ }
+
+ /**
* Get the current value capacity for the vector
* @return number of elements that vector can hold.
*/
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
index d0a664a..50ee3a7 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
@@ -143,6 +143,38 @@ public abstract class BaseRepeatedValueVector extends BaseValueVector implements
}
}
+ /**
+ * Specialized version of setInitialCapacity() for ListVector. This is
+ * used by some callers when they want to explicitly control and be
+ * conservative about memory allocated for inner data vector. This is
+ * very useful when we are working with memory constraints for a query
+ * and have a fixed amount of memory reserved for the record batch. In
+ * such cases, we are likely to face OOM or related problems when
+ * we reserve memory for a record batch with value count x and
+ * do setInitialCapacity(x) such that each vector allocates only
+ * what is necessary and not the default amount but the multiplier
+ * forces the memory requirement to go beyond what was needed.
+ *
+ * @param numRecords value count
+ * @param density density of ListVector. Density is the average size of
+ * list per position in the List vector. For example, a
+ * density value of 10 implies each position in the list
+ * vector has a list of 10 values.
+ * A density value of 0.1 implies out of 10 positions in
+ * the list vector, 1 position has a list of size 1 and
+ * remaining positions are null (no lists) or empty lists.
+ * This helps in tightly controlling the memory we provision
+ * for inner data vector.
+ */
+ public void setInitialCapacity(int numRecords, double density) {
+ offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
+ final int innerValueCapacity = (int)(numRecords * density);
+ if (innerValueCapacity < 1) {
+ throw new IllegalArgumentException("With the provided density and value count, potential value capacity for the data vector is 0");
+ }
+ vector.setInitialCapacity(innerValueCapacity);
+ }
+
@Override
public int getValueCapacity() {
final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0);
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 8aeeb7e..b472dae 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -31,12 +31,7 @@ import io.netty.buffer.ArrowBuf;
import org.apache.arrow.memory.BaseAllocator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.OutOfMemoryException;
-import org.apache.arrow.vector.AddOrGetResult;
-import org.apache.arrow.vector.BufferBacked;
-import org.apache.arrow.vector.FieldVector;
-import org.apache.arrow.vector.ValueVector;
-import org.apache.arrow.vector.ZeroVector;
-import org.apache.arrow.vector.BitVectorHelper;
+import org.apache.arrow.vector.*;
import org.apache.arrow.vector.complex.impl.ComplexCopier;
import org.apache.arrow.vector.complex.impl.UnionListReader;
import org.apache.arrow.vector.complex.impl.UnionListWriter;
@@ -103,6 +98,54 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector,
}
@Override
+ public void setInitialCapacity(int numRecords) {
+ validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords);
+ super.setInitialCapacity(numRecords);
+ }
+
+ /**
+ * Specialized version of setInitialCapacity() for ListVector. This is
+ * used by some callers when they want to explicitly control and be
+ * conservative about memory allocated for inner data vector. This is
+ * very useful when we are working with memory constraints for a query
+ * and have a fixed amount of memory reserved for the record batch. In
+ * such cases, we are likely to face OOM or related problems when
+ * we reserve memory for a record batch with value count x and
+ * do setInitialCapacity(x) such that each vector allocates only
+ * what is necessary and not the default amount but the multiplier
+ * forces the memory requirement to go beyond what was needed.
+ *
+ * @param numRecords value count
+ * @param density density of ListVector. Density is the average size of
+ * list per position in the List vector. For example, a
+ * density value of 10 implies each position in the list
+ * vector has a list of 10 values.
+ * A density value of 0.1 implies out of 10 positions in
+ * the list vector, 1 position has a list of size 1 and
+ * remaining positions are null (no lists) or empty lists.
+ * This helps in tightly controlling the memory we provision
+ * for inner data vector.
+ */
+ public void setInitialCapacity(int numRecords, double density) {
+ validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords);
+ super.setInitialCapacity(numRecords, density);
+ }
+
+ /**
+ * Get the density of this ListVector
+ * @return density
+ */
+ public double getDensity() {
+ if (valueCount == 0) {
+ return 0.0D;
+ }
+ final int startOffset = offsetBuffer.getInt(0);
+ final int endOffset = offsetBuffer.getInt(valueCount * OFFSET_WIDTH);
+ final double totalListSize = endOffset - startOffset;
+ return totalListSize/valueCount;
+ }
+
+ @Override
public List<FieldVector> getChildrenFromFields() {
return singletonList(getDataVector());
}
@@ -623,7 +666,7 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector,
*/
@Override
public int getValueCapacity() {
- return Math.min(getValidityBufferValueCapacity(), super.getValueCapacity());
+ return getValidityAndOffsetValueCapacity();
}
private int getValidityAndOffsetValueCapacity() {
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index e2023f4..d49a677 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -112,6 +112,9 @@ public class TestListVector {
result = outVector.getObject(2);
resultSet = (ArrayList<Long>) result;
assertEquals(0, resultSet.size());
+
+ /* 3+0+0/3 */
+ assertEquals(1.0D, inVector.getDensity(), 0);
}
}
@@ -209,6 +212,9 @@ public class TestListVector {
listVector.setLastSet(3);
listVector.setValueCount(10);
+ /* (3+2+3)/10 */
+ assertEquals(0.8D, listVector.getDensity(), 0);
+
index = 0;
offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH);
assertEquals(Integer.toString(0), Integer.toString(offset));
@@ -709,6 +715,8 @@ public class TestListVector {
listWriter.bigInt().writeBigInt(300);
listWriter.endList();
+ listVector.setValueCount(2);
+
/* check listVector contents */
Object result = listVector.getObject(0);
ArrayList<Long> resultSet = (ArrayList<Long>) result;
@@ -739,6 +747,9 @@ public class TestListVector {
assertEquals(2, buffers.size());
assertEquals(bitAddress, buffers.get(0).memoryAddress());
assertEquals(offsetAddress, buffers.get(1).memoryAddress());
+
+ /* (3+2)/2 */
+ assertEquals(2.5, listVector.getDensity(), 0);
}
}
@@ -753,4 +764,61 @@ public class TestListVector {
assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME));
}
}
+
+ @Test
+ public void testSetInitialCapacity() {
+ try (final ListVector vector = ListVector.empty("", allocator)) {
+ vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType()));
+
+ /**
+ * use the default multiplier of 5,
+ * 512 * 5 => 2560 * 4 => 10240 bytes => 16KB => 4096 value capacity.
+ */
+ vector.setInitialCapacity(512);
+ vector.allocateNew();
+ assertEquals(512, vector.getValueCapacity());
+ assertEquals(4096, vector.getDataVector().getValueCapacity());
+
+ /* use density as 4 */
+ vector.setInitialCapacity(512, 4);
+ vector.allocateNew();
+ assertEquals(512, vector.getValueCapacity());
+ assertEquals(512*4, vector.getDataVector().getValueCapacity());
+
+ /**
+ * inner value capacity we pass to data vector is 512 * 0.1 => 51
+ * For an int vector this is 204 bytes of memory for data buffer
+ * and 7 bytes for validity buffer.
+ * and with power of 2 allocation, we allocate 256 bytes and 8 bytes
+ * for the data buffer and validity buffer of the inner vector. Thus
+ * value capacity of inner vector is 64
+ */
+ vector.setInitialCapacity(512, 0.1);
+ vector.allocateNew();
+ assertEquals(512, vector.getValueCapacity());
+ assertEquals(64, vector.getDataVector().getValueCapacity());
+
+ /**
+ * inner value capacity we pass to data vector is 512 * 0.01 => 5
+ * For an int vector this is 20 bytes of memory for data buffer
+ * and 1 byte for validity buffer.
+ * and with power of 2 allocation, we allocate 32 bytes and 1 bytes
+ * for the data buffer and validity buffer of the inner vector. Thus
+ * value capacity of inner vector is 8
+ */
+ vector.setInitialCapacity(512, 0.01);
+ vector.allocateNew();
+ assertEquals(512, vector.getValueCapacity());
+ assertEquals(8, vector.getDataVector().getValueCapacity());
+
+ boolean error = false;
+ try {
+ vector.setInitialCapacity(5, 0.1);
+ } catch (IllegalArgumentException e) {
+ error = true;
+ } finally {
+ assertTrue(error);
+ }
+ }
+ }
}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index 601b206..992bb62 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -1908,4 +1908,40 @@ public class TestValueVector {
vector.offsetBuffer.setInt((index + 1) * vector.OFFSET_WIDTH, currentOffset + bytes.length);
vector.valueBuffer.setBytes(currentOffset, bytes, 0, bytes.length);
}
+
+ @Test /* VarCharVector */
+ public void testSetInitialCapacity() {
+ try (final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) {
+
+ /* use the default 8 data bytes on average per element */
+ vector.setInitialCapacity(4096);
+ vector.allocateNew();
+ assertEquals(4096, vector.getValueCapacity());
+ assertEquals(4096 * 8, vector.getDataBuffer().capacity());
+
+ vector.setInitialCapacity(4096, 1);
+ vector.allocateNew();
+ assertEquals(4096, vector.getValueCapacity());
+ assertEquals(4096, vector.getDataBuffer().capacity());
+
+ vector.setInitialCapacity(4096, 0.1);
+ vector.allocateNew();
+ assertEquals(4096, vector.getValueCapacity());
+ assertEquals(512, vector.getDataBuffer().capacity());
+
+ vector.setInitialCapacity(4096, 0.01);
+ vector.allocateNew();
+ assertEquals(4096, vector.getValueCapacity());
+ assertEquals(64, vector.getDataBuffer().capacity());
+
+ boolean error = false;
+ try {
+ vector.setInitialCapacity(5, 0.1);
+ } catch (IllegalArgumentException e) {
+ error = true;
+ } finally {
+ assertTrue(error);
+ }
+ }
+ }
}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
index f8edf89..ca039c5 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java
@@ -104,7 +104,7 @@ public class TestVectorReAlloc {
vector.setInitialCapacity(512);
vector.allocateNew();
- assertEquals(1023, vector.getValueCapacity());
+ assertEquals(512, vector.getValueCapacity());
try {
vector.getInnerValueCountAt(2014);
@@ -114,7 +114,7 @@ public class TestVectorReAlloc {
}
vector.reAlloc();
- assertEquals(2047, vector.getValueCapacity()); // note: size - 1
+ assertEquals(1024, vector.getValueCapacity());
assertEquals(0, vector.getOffsetBuffer().getInt(2014 * ListVector.OFFSET_WIDTH));
}
}
--
To stop receiving notification emails like this one, please contact
siddteotia@apache.org.