You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ue...@apache.org on 2017/07/07 05:05:30 UTC
spark git commit: [SPARK-21327][SQL][PYSPARK] ArrayConstructor should
handle an array of typecode 'l' as long rather than int in Python 2.
Repository: spark
Updated Branches:
refs/heads/master d451b7f43 -> 53c2eb59b
[SPARK-21327][SQL][PYSPARK] ArrayConstructor should handle an array of typecode 'l' as long rather than int in Python 2.
## What changes were proposed in this pull request?
Currently, `ArrayConstructor` handles an array of typecode `'l'` as `int` when converting a Python 2 object into a Java object, so any value larger than `Integer.MAX_VALUE` or smaller than `Integer.MIN_VALUE` overflows.
```python
import array
data = [Row(longarray=array.array('l', [-9223372036854775808, 0, 9223372036854775807]))]
df = spark.createDataFrame(data)
df.show(truncate=False)
```
```
+----------+
|longarray |
+----------+
|[0, 0, -1]|
+----------+
```
This should be:
```
+----------------------------------------------+
|longarray |
+----------------------------------------------+
|[-9223372036854775808, 0, 9223372036854775807]|
+----------------------------------------------+
```
## How was this patch tested?
Added a new test and ran the existing tests.
Author: Takuya UESHIN <ue...@databricks.com>
Closes #18553 from ueshin/issues/SPARK-21327.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/53c2eb59
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/53c2eb59
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/53c2eb59
Branch: refs/heads/master
Commit: 53c2eb59b2cc557081f6a252748dc38511601b0d
Parents: d451b7f
Author: Takuya UESHIN <ue...@databricks.com>
Authored: Fri Jul 7 14:05:22 2017 +0900
Committer: Takuya UESHIN <ue...@databricks.com>
Committed: Fri Jul 7 14:05:22 2017 +0900
----------------------------------------------------------------------
.../scala/org/apache/spark/api/python/SerDeUtil.scala | 10 ++++++++++
python/pyspark/sql/tests.py | 6 ++++++
2 files changed, 16 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/53c2eb59/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
index 6e4eab4..42f67e8 100644
--- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
@@ -73,6 +73,16 @@ private[spark] object SerDeUtil extends Logging {
// This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly
val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1)
construct(typecode, machineCodes(typecode), data)
+ } else if (args.length == 2 && args(0) == "l") {
+ // On Python 2, an array of typecode 'l' should be handled as long rather than int.
+ val values = args(1).asInstanceOf[JArrayList[_]]
+ val result = new Array[Long](values.size)
+ var i = 0
+ while (i < values.size) {
+ result(i) = values.get(i).asInstanceOf[Number].longValue()
+ i += 1
+ }
+ result
} else {
super.construct(args)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/53c2eb59/python/pyspark/sql/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index c0e3b8d..9db2f40 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -2342,6 +2342,12 @@ class SQLTests(ReusedPySparkTestCase):
self.assertEquals(types[2], np.bool)
self.assertEquals(types[3], np.float32)
+ def test_create_dataframe_from_array_of_long(self):
+ import array
+ data = [Row(longarray=array.array('l', [-9223372036854775808, 0, 9223372036854775807]))]
+ df = self.spark.createDataFrame(data)
+ self.assertEqual(df.first(), Row(longarray=[-9223372036854775808, 0, 9223372036854775807]))
+
class HiveSparkSubmitTests(SparkSubmitTests):
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org