You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by hv...@apache.org on 2018/05/02 08:41:39 UTC

spark git commit: [SPARK-23976][CORE] Detect length overflow in UTF8String.concat()/ByteArray.concat()

Repository: spark
Updated Branches:
  refs/heads/master e15850be6 -> 9215ee7a1


[SPARK-23976][CORE] Detect length overflow in UTF8String.concat()/ByteArray.concat()

## What changes were proposed in this pull request?

This PR detects length overflow if total elements in inputs are not acceptable.

For example, when the three inputs has `0x7FFF_FF00`, `0x7FFF_FF00`, and `0xE00`, we should detect length overflow since we cannot allocate such a large structure on `byte[]`.
On the other hand, the current algorithm can allocate the result structure with `0x1000`-byte length due to integer sum overflow.

## How was this patch tested?

Existing UTs.
If we would create UTs, we need large heap (6-8GB). It may make test environment unstable.
If it is necessary to create UTs, I will create them.

Author: Kazuaki Ishizaki <is...@jp.ibm.com>

Closes #21064 from kiszk/SPARK-23976.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9215ee7a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9215ee7a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9215ee7a

Branch: refs/heads/master
Commit: 9215ee7a16b57c56ae927d65e024cf7afe542cbb
Parents: e15850b
Author: Kazuaki Ishizaki <is...@jp.ibm.com>
Authored: Wed May 2 10:41:34 2018 +0200
Committer: Herman van Hovell <hv...@databricks.com>
Committed: Wed May 2 10:41:34 2018 +0200

----------------------------------------------------------------------
 .../java/org/apache/spark/unsafe/types/ByteArray.java   | 12 +++++++-----
 .../java/org/apache/spark/unsafe/types/UTF8String.java  |  8 ++++----
 2 files changed, 11 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/9215ee7a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java
index c03caf0..ecd7c19 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java
@@ -17,10 +17,12 @@
 
 package org.apache.spark.unsafe.types;
 
-import org.apache.spark.unsafe.Platform;
-
 import java.util.Arrays;
 
+import com.google.common.primitives.Ints;
+
+import org.apache.spark.unsafe.Platform;
+
 public final class ByteArray {
 
   public static final byte[] EMPTY_BYTE = new byte[0];
@@ -77,17 +79,17 @@ public final class ByteArray {
 
   public static byte[] concat(byte[]... inputs) {
     // Compute the total length of the result
-    int totalLength = 0;
+    long totalLength = 0;
     for (int i = 0; i < inputs.length; i++) {
       if (inputs[i] != null) {
-        totalLength += inputs[i].length;
+        totalLength += (long)inputs[i].length;
       } else {
         return null;
       }
     }
 
     // Allocate a new byte array, and copy the inputs one by one into it
-    final byte[] result = new byte[totalLength];
+    final byte[] result = new byte[Ints.checkedCast(totalLength)];
     int offset = 0;
     for (int i = 0; i < inputs.length; i++) {
       int len = inputs[i].length;

http://git-wip-us.apache.org/repos/asf/spark/blob/9215ee7a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index e9b3d9b..e91fc43 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -29,8 +29,8 @@ import com.esotericsoftware.kryo.Kryo;
 import com.esotericsoftware.kryo.KryoSerializable;
 import com.esotericsoftware.kryo.io.Input;
 import com.esotericsoftware.kryo.io.Output;
-
 import com.google.common.primitives.Ints;
+
 import org.apache.spark.unsafe.Platform;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.hash.Murmur3_x86_32;
@@ -877,17 +877,17 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
    */
   public static UTF8String concat(UTF8String... inputs) {
     // Compute the total length of the result.
-    int totalLength = 0;
+    long totalLength = 0;
     for (int i = 0; i < inputs.length; i++) {
       if (inputs[i] != null) {
-        totalLength += inputs[i].numBytes;
+        totalLength += (long)inputs[i].numBytes;
       } else {
         return null;
       }
     }
 
     // Allocate a new byte array, and copy the inputs one by one into it.
-    final byte[] result = new byte[totalLength];
+    final byte[] result = new byte[Ints.checkedCast(totalLength)];
     int offset = 0;
     for (int i = 0; i < inputs.length; i++) {
       int len = inputs[i].numBytes;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org