You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/01/15 08:27:01 UTC
spark git commit: [SPARK-23023][SQL] Cast field data to strings in
showString
Repository: spark
Updated Branches:
refs/heads/master 9a96bfc8b -> b59808385
[SPARK-23023][SQL] Cast field data to strings in showString
## What changes were proposed in this pull request?
The current `Dataset.showString` prints rows through `RowEncoder` deserializers, like:
```
scala> Seq(Seq(Seq(1, 2), Seq(3), Seq(4, 5, 6))).toDF("a").show(false)
+------------------------------------------------------------+
|a |
+------------------------------------------------------------+
|[WrappedArray(1, 2), WrappedArray(3), WrappedArray(4, 5, 6)]|
+------------------------------------------------------------+
```
This result is incorrect; the correct output is:
```
scala> Seq(Seq(Seq(1, 2), Seq(3), Seq(4, 5, 6))).toDF("a").show(false)
+------------------------+
|a |
+------------------------+
|[[1, 2], [3], [4, 5, 6]]|
+------------------------+
```
So, this PR fixes the code in `showString` to cast field data to strings before printing.
## How was this patch tested?
Added tests in `DataFrameSuite`.
Author: Takeshi Yamamuro <ya...@apache.org>
Closes #20214 from maropu/SPARK-23023.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5980838
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5980838
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5980838
Branch: refs/heads/master
Commit: b59808385cfe24ce768e5b3098b9034e64b99a5a
Parents: 9a96bfc
Author: Takeshi Yamamuro <ya...@apache.org>
Authored: Mon Jan 15 16:26:52 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Mon Jan 15 16:26:52 2018 +0800
----------------------------------------------------------------------
python/pyspark/sql/functions.py | 32 ++++++++++----------
.../scala/org/apache/spark/sql/Dataset.scala | 21 +++++++------
.../org/apache/spark/sql/DataFrameSuite.scala | 28 +++++++++++++++++
.../org/apache/spark/sql/DatasetSuite.scala | 12 ++++----
4 files changed, 61 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/b5980838/python/pyspark/sql/functions.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index e1ad659..f7b3f29 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1849,14 +1849,14 @@ def explode_outer(col):
+---+----------+----+-----+
>>> df.select("id", "a_map", explode_outer("an_array")).show()
- +---+-------------+----+
- | id| a_map| col|
- +---+-------------+----+
- | 1|Map(x -> 1.0)| foo|
- | 1|Map(x -> 1.0)| bar|
- | 2| Map()|null|
- | 3| null|null|
- +---+-------------+----+
+ +---+----------+----+
+ | id| a_map| col|
+ +---+----------+----+
+ | 1|[x -> 1.0]| foo|
+ | 1|[x -> 1.0]| bar|
+ | 2| []|null|
+ | 3| null|null|
+ +---+----------+----+
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.explode_outer(_to_java_column(col))
@@ -1881,14 +1881,14 @@ def posexplode_outer(col):
| 3| null|null|null| null|
+---+----------+----+----+-----+
>>> df.select("id", "a_map", posexplode_outer("an_array")).show()
- +---+-------------+----+----+
- | id| a_map| pos| col|
- +---+-------------+----+----+
- | 1|Map(x -> 1.0)| 0| foo|
- | 1|Map(x -> 1.0)| 1| bar|
- | 2| Map()|null|null|
- | 3| null|null|null|
- +---+-------------+----+----+
+ +---+----------+----+----+
+ | id| a_map| pos| col|
+ +---+----------+----+----+
+ | 1|[x -> 1.0]| 0| foo|
+ | 1|[x -> 1.0]| 1| bar|
+ | 2| []|null|null|
+ | 3| null|null|null|
+ +---+----------+----+----+
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.posexplode_outer(_to_java_column(col))
http://git-wip-us.apache.org/repos/asf/spark/blob/b5980838/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 77e5712..34f0ab5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -237,13 +237,20 @@ class Dataset[T] private[sql](
private[sql] def showString(
_numRows: Int, truncate: Int = 20, vertical: Boolean = false): String = {
val numRows = _numRows.max(0).min(Int.MaxValue - 1)
- val takeResult = toDF().take(numRows + 1)
+ val newDf = toDF()
+ val castCols = newDf.logicalPlan.output.map { col =>
+ // Since binary types in top-level schema fields have a specific format to print,
+ // so we do not cast them to strings here.
+ if (col.dataType == BinaryType) {
+ Column(col)
+ } else {
+ Column(col).cast(StringType)
+ }
+ }
+ val takeResult = newDf.select(castCols: _*).take(numRows + 1)
val hasMoreData = takeResult.length > numRows
val data = takeResult.take(numRows)
- lazy val timeZone =
- DateTimeUtils.getTimeZone(sparkSession.sessionState.conf.sessionLocalTimeZone)
-
// For array values, replace Seq and Array with square brackets
// For cells that are beyond `truncate` characters, replace it with the
// first `truncate-3` and "..."
@@ -252,12 +259,6 @@ class Dataset[T] private[sql](
val str = cell match {
case null => "null"
case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]")
- case array: Array[_] => array.mkString("[", ", ", "]")
- case seq: Seq[_] => seq.mkString("[", ", ", "]")
- case d: Date =>
- DateTimeUtils.dateToString(DateTimeUtils.fromJavaDate(d))
- case ts: Timestamp =>
- DateTimeUtils.timestampToString(DateTimeUtils.fromJavaTimestamp(ts), timeZone)
case _ => cell.toString
}
if (truncate > 0 && str.length > truncate) {
http://git-wip-us.apache.org/repos/asf/spark/blob/b5980838/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 5e4c1a6..3370708 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -1255,6 +1255,34 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
assert(testData.select($"*").showString(1, vertical = true) === expectedAnswer)
}
+ test("SPARK-23023 Cast rows to strings in showString") {
+ val df1 = Seq(Seq(1, 2, 3, 4)).toDF("a")
+ assert(df1.showString(10) ===
+ s"""+------------+
+ || a|
+ |+------------+
+ ||[1, 2, 3, 4]|
+ |+------------+
+ |""".stripMargin)
+ val df2 = Seq(Map(1 -> "a", 2 -> "b")).toDF("a")
+ assert(df2.showString(10) ===
+ s"""+----------------+
+ || a|
+ |+----------------+
+ ||[1 -> a, 2 -> b]|
+ |+----------------+
+ |""".stripMargin)
+ val df3 = Seq(((1, "a"), 0), ((2, "b"), 0)).toDF("a", "b")
+ assert(df3.showString(10) ===
+ s"""+------+---+
+ || a| b|
+ |+------+---+
+ ||[1, a]| 0|
+ ||[2, b]| 0|
+ |+------+---+
+ |""".stripMargin)
+ }
+
test("SPARK-7327 show with empty dataFrame") {
val expectedAnswer = """+---+-----+
||key|value|
http://git-wip-us.apache.org/repos/asf/spark/blob/b5980838/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 54893c1..49c59cf 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -958,12 +958,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
).toDS()
val expected =
- """+-------+
- || f|
- |+-------+
- ||[foo,1]|
- ||[bar,2]|
- |+-------+
+ """+--------+
+ || f|
+ |+--------+
+ ||[foo, 1]|
+ ||[bar, 2]|
+ |+--------+
|""".stripMargin
checkShowString(ds, expected)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org