Posted to commits@spark.apache.org by hv...@apache.org on 2018/05/03 14:52:46 UTC

spark git commit: [SPARK-24133][SQL] Backport [] Check for integer overflows when resizing WritableColumnVectors

Repository: spark
Updated Branches:
  refs/heads/branch-2.3 10e2f1fc0 -> bfe50b684


[SPARK-24133][SQL] Backport [] Check for integer overflows when resizing WritableColumnVectors

`ColumnVector`s store string data in one big byte array. Since the array size is capped at just under Integer.MAX_VALUE, a single `ColumnVector` cannot store more than 2GB of string data.
But since Parquet files commonly contain large blobs stored as strings, and `ColumnVector`s by default carry 4096 values, it is entirely possible to exceed that limit. In such cases a negative capacity is requested from `WritableColumnVector.reserve()`. The call succeeds (the requested capacity is smaller than the already allocated capacity), and consequently `java.lang.ArrayIndexOutOfBoundsException` is thrown when the reader actually attempts to put the data into the array.
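
To make the wrap-around concrete, here is a minimal, self-contained Java sketch (illustrative only, not Spark code; the variable names and sizes are made up) of how an int capacity request turns negative once buffered string data approaches 2GB:

    // Illustrates the int overflow that yields a negative capacity request.
    public class OverflowSketch {
      public static void main(String[] args) {
        int used = 2_000_000_000;        // bytes already buffered in the vector
        int incoming = 500_000_000;      // size of the next large string blob
        int required = used + incoming;  // exceeds Integer.MAX_VALUE, wraps negative
        System.out.println(required);    // prints -1794967296
        // A guard like `required > capacity` is false for a negative value, so a
        // naive reserve() skips growing the array, and the subsequent copy into
        // the array fails with java.lang.ArrayIndexOutOfBoundsException.
      }
    }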

This change introduces a simple check for integer overflow to `WritableColumnVector.reserve()`, which should help catch the error earlier and provide a more informative exception. Additionally, the error message in `WritableColumnVector.throwUnsupportedException()` was corrected.
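
As a side note on the workaround mentioned in the new error message, here is a minimal sketch of disabling the vectorized readers for a session (the two config keys match the SQLConf entries referenced in the patch below; the master and app name are illustrative):

    import org.apache.spark.sql.SparkSession;

    public class DisableVectorizedReaders {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")                      // illustrative
            .appName("disable-vectorized-readers")   // illustrative
            .getOrCreate();
        // Both are runtime-settable SQL configs; they affect reads issued
        // after this point in the session.
        spark.conf().set("spark.sql.parquet.enableVectorizedReader", "false");
        spark.conf().set("spark.sql.orc.enableVectorizedReader", "false");
        spark.stop();
      }
    }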

New unit tests were added.
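
For reference, the new fail-fast behavior can also be exercised from plain Java against the allocating subclass (a sketch assuming branch-2.3's public OnHeapColumnVector constructor; it mirrors the Scala test in the diff below):

    import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
    import org.apache.spark.sql.types.DataTypes;

    public class ReserveOverflowCheck {
      public static void main(String[] args) {
        OnHeapColumnVector col = new OnHeapColumnVector(1024, DataTypes.ByteType);
        try {
          col.reserve(-1);  // a negative (overflowed) request now fails fast
        } catch (RuntimeException e) {
          System.out.println(e.getMessage());  // mentions "integer overflow"
        } finally {
          col.close();
        }
      }
    }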

Author: Ala Luszczak <ala@databricks.com>

Closes #21206 from ala/overflow-reserve.

(cherry picked from commit 8bd27025b7cf0b44726b6f4020d294ef14dbbb7e)
Signed-off-by: Ala Luszczak <ala@databricks.com>

Author: Ala Luszczak <al...@databricks.com>

Closes #21227 from ala/cherry-pick-overflow-reserve.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bfe50b68
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bfe50b68
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bfe50b68

Branch: refs/heads/branch-2.3
Commit: bfe50b6843938100e7bad59071b027689a22ab83
Parents: 10e2f1f
Author: Ala Luszczak <al...@databricks.com>
Authored: Thu May 3 16:52:40 2018 +0200
Committer: Herman van Hovell <hv...@databricks.com>
Committed: Thu May 3 16:52:40 2018 +0200

----------------------------------------------------------------------
 .../sql/execution/vectorized/WritableColumnVector.java | 13 ++++++++-----
 .../sql/execution/vectorized/ColumnarBatchSuite.scala  |  7 +++++++
 2 files changed, 15 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/bfe50b68/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
index 48eebcf..36a92b6 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
@@ -81,7 +81,9 @@ public abstract class WritableColumnVector extends ColumnVector {
   }
 
   public void reserve(int requiredCapacity) {
-    if (requiredCapacity > capacity) {
+    if (requiredCapacity < 0) {
+      throwUnsupportedException(requiredCapacity, null);
+    } else if (requiredCapacity > capacity) {
       int newCapacity = (int) Math.min(MAX_CAPACITY, requiredCapacity * 2L);
       if (requiredCapacity <= newCapacity) {
         try {
@@ -96,10 +98,11 @@ public abstract class WritableColumnVector extends ColumnVector {
   }
 
   private void throwUnsupportedException(int requiredCapacity, Throwable cause) {
-    String message = "Cannot reserve additional contiguous bytes in the vectorized reader " +
-        "(requested = " + requiredCapacity + " bytes). As a workaround, you can disable the " +
-        "vectorized reader by setting " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() +
-        " to false.";
+    String message = "Cannot reserve additional contiguous bytes in the vectorized reader (" +
+        (requiredCapacity >= 0 ? "requested " + requiredCapacity + " bytes" : "integer overflow") +
+        "). As a workaround, you can disable the vectorized reader. For parquet file format, " +
+        "refer to " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() + "; for orc file format," +
+        " refer to " + SQLConf.ORC_VECTORIZED_READER_ENABLED().key() + ".";
     throw new RuntimeException(message, cause);
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/bfe50b68/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
index 772f687..f57f07b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
@@ -1333,4 +1333,11 @@ class ColumnarBatchSuite extends SparkFunSuite {
 
       column.close()
   }
+
+  testVector("WritableColumnVector.reserve(): requested capacity is negative", 1024, ByteType) {
+    column =>
+      val ex = intercept[RuntimeException] { column.reserve(-1) }
+      assert(ex.getMessage.contains(
+          "Cannot reserve additional contiguous bytes in the vectorized reader (integer overflow)"))
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org