You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by hv...@apache.org on 2018/05/03 14:52:46 UTC
spark git commit: [SPARK-24133][SQL] Backport: Check for integer
overflows when resizing WritableColumnVectors
Repository: spark
Updated Branches:
refs/heads/branch-2.3 10e2f1fc0 -> bfe50b684
[SPARK-24133][SQL] Backport: Check for integer overflows when resizing WritableColumnVectors
`ColumnVector`s store string data in one big byte array. Since the array size is capped at just under Integer.MAX_VALUE, a single `ColumnVector` cannot store more than 2GB of string data.
But since Parquet files commonly contain large blobs stored as strings, and `ColumnVector`s by default carry 4096 values, it's entirely possible to go past that limit. In such cases a negative capacity is requested from `WritableColumnVector.reserve()`. The call succeeds (the requested capacity is smaller than the already allocated capacity), and consequently `java.lang.ArrayIndexOutOfBoundsException` is thrown when the reader actually attempts to put the data into the array.
This change introduces a simple check for integer overflow to `WritableColumnVector.reserve()`, which should help catch the error earlier and provide a more informative exception. Additionally, the error message in `WritableColumnVector.throwUnsupportedException()` was corrected.
New unit tests were added.
Author: Ala Luszczak <ala@databricks.com>
Closes #21206 from ala/overflow-reserve.
(cherry picked from commit 8bd27025b7cf0b44726b6f4020d294ef14dbbb7e)
Signed-off-by: Ala Luszczak <ala@databricks.com>
Author: Ala Luszczak <al...@databricks.com>
Closes #21227 from ala/cherry-pick-overflow-reserve.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bfe50b68
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bfe50b68
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bfe50b68
Branch: refs/heads/branch-2.3
Commit: bfe50b6843938100e7bad59071b027689a22ab83
Parents: 10e2f1f
Author: Ala Luszczak <al...@databricks.com>
Authored: Thu May 3 16:52:40 2018 +0200
Committer: Herman van Hovell <hv...@databricks.com>
Committed: Thu May 3 16:52:40 2018 +0200
----------------------------------------------------------------------
.../sql/execution/vectorized/WritableColumnVector.java | 13 ++++++++-----
.../sql/execution/vectorized/ColumnarBatchSuite.scala | 7 +++++++
2 files changed, 15 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/bfe50b68/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
index 48eebcf..36a92b6 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
@@ -81,7 +81,9 @@ public abstract class WritableColumnVector extends ColumnVector {
}
public void reserve(int requiredCapacity) {
- if (requiredCapacity > capacity) {
+ if (requiredCapacity < 0) {
+ throwUnsupportedException(requiredCapacity, null);
+ } else if (requiredCapacity > capacity) {
int newCapacity = (int) Math.min(MAX_CAPACITY, requiredCapacity * 2L);
if (requiredCapacity <= newCapacity) {
try {
@@ -96,10 +98,11 @@ public abstract class WritableColumnVector extends ColumnVector {
}
private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
- String message = "Cannot reserve additional contiguous bytes in the vectorized reader " +
- "(requested = " + requiredCapacity + " bytes). As a workaround, you can disable the " +
- "vectorized reader by setting " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() +
- " to false.";
+ String message = "Cannot reserve additional contiguous bytes in the vectorized reader (" +
+ (requiredCapacity >= 0 ? "requested " + requiredCapacity + " bytes" : "integer overflow") +
+ "). As a workaround, you can disable the vectorized reader. For parquet file format, " +
+ "refer to " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() + "; for orc file format," +
+ " refer to " + SQLConf.ORC_VECTORIZED_READER_ENABLED().key() + ".";
throw new RuntimeException(message, cause);
}
http://git-wip-us.apache.org/repos/asf/spark/blob/bfe50b68/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
index 772f687..f57f07b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
@@ -1333,4 +1333,11 @@ class ColumnarBatchSuite extends SparkFunSuite {
column.close()
}
+
+ testVector("WritableColumnVector.reserve(): requested capacity is negative", 1024, ByteType) {
+ column =>
+ val ex = intercept[RuntimeException] { column.reserve(-1) }
+ assert(ex.getMessage.contains(
+ "Cannot reserve additional contiguous bytes in the vectorized reader (integer overflow)"))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org