You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2015/05/19 06:53:47 UTC
spark git commit: [SPARK-7687] [SQL] DataFrame.describe() should cast
all aggregates to String
Repository: spark
Updated Branches:
refs/heads/master c2437de18 -> c9fa870a6
[SPARK-7687] [SQL] DataFrame.describe() should cast all aggregates to String
In `DataFrame.describe()`, the `count` aggregate produces an integer, the `avg` and `stddev` aggregates produce doubles, and `min` and `max` aggregates can produce varying types depending on what type of column they're applied to. As a result, we should cast all aggregate results to String so that `describe()`'s output types match its declared output schema.
Author: Josh Rosen <jo...@databricks.com>
Closes #6218 from JoshRosen/SPARK-7687 and squashes the following commits:
146b615 [Josh Rosen] Fix R test.
2974bd5 [Josh Rosen] Cast to string type instead
f206580 [Josh Rosen] Cast to double to fix SPARK-7687
307ecbf [Josh Rosen] Add failing regression test for SPARK-7687
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c9fa870a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c9fa870a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c9fa870a
Branch: refs/heads/master
Commit: c9fa870a6de3f7d0903fa7a75ea5ffb6a2fcd174
Parents: c2437de
Author: Josh Rosen <jo...@databricks.com>
Authored: Mon May 18 21:53:44 2015 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Mon May 18 21:53:44 2015 -0700
----------------------------------------------------------------------
R/pkg/inst/tests/test_sparkSQL.R | 10 +++++-----
.../scala/org/apache/spark/sql/DataFrame.scala | 6 +++---
.../org/apache/spark/sql/DataFrameSuite.scala | 17 +++++++++++------
3 files changed, 19 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/c9fa870a/R/pkg/inst/tests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 3e5658e..1768c57 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -757,12 +757,12 @@ test_that("parquetFile works with multiple input paths", {
test_that("describe() on a DataFrame", {
df <- jsonFile(sqlCtx, jsonPath)
stats <- describe(df, "age")
- expect_true(collect(stats)[1, "summary"] == "count")
- expect_true(collect(stats)[2, "age"] == 24.5)
- expect_true(collect(stats)[3, "age"] == 5.5)
+ expect_equal(collect(stats)[1, "summary"], "count")
+ expect_equal(collect(stats)[2, "age"], "24.5")
+ expect_equal(collect(stats)[3, "age"], "5.5")
stats <- describe(df)
- expect_true(collect(stats)[4, "name"] == "Andy")
- expect_true(collect(stats)[5, "age"] == 30.0)
+ expect_equal(collect(stats)[4, "name"], "Andy")
+ expect_equal(collect(stats)[5, "age"], "30")
})
unlink(parquetPath)
http://git-wip-us.apache.org/repos/asf/spark/blob/c9fa870a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 27e9af4..adad858 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1063,7 +1063,7 @@ class DataFrame private[sql](
val ret: Seq[Row] = if (outputCols.nonEmpty) {
val aggExprs = statistics.flatMap { case (_, colToAgg) =>
- outputCols.map(c => Column(colToAgg(Column(c).expr)).as(c))
+ outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c))
}
val row = agg(aggExprs.head, aggExprs.tail: _*).head().toSeq
@@ -1077,9 +1077,9 @@ class DataFrame private[sql](
statistics.map { case (name, _) => Row(name) }
}
- // The first column is string type, and the rest are double type.
+ // All columns are string type
val schema = StructType(
- StructField("summary", StringType) :: outputCols.map(StructField(_, DoubleType))).toAttributes
+ StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes
LocalRelation(schema, ret)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/c9fa870a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index f05d059..0dcba80 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -370,14 +370,14 @@ class DataFrameSuite extends QueryTest {
("Amy", 24, 180)).toDF("name", "age", "height")
val describeResult = Seq(
- Row("count", 4, 4),
- Row("mean", 33.0, 178.0),
- Row("stddev", 16.583123951777, 10.0),
- Row("min", 16, 164),
- Row("max", 60, 192))
+ Row("count", "4", "4"),
+ Row("mean", "33.0", "178.0"),
+ Row("stddev", "16.583123951777", "10.0"),
+ Row("min", "16", "164"),
+ Row("max", "60", "192"))
val emptyDescribeResult = Seq(
- Row("count", 0, 0),
+ Row("count", "0", "0"),
Row("mean", null, null),
Row("stddev", null, null),
Row("min", null, null),
@@ -388,6 +388,11 @@ class DataFrameSuite extends QueryTest {
val describeTwoCols = describeTestData.describe("age", "height")
assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "age", "height"))
checkAnswer(describeTwoCols, describeResult)
+ // All aggregate value should have been cast to string
+ describeTwoCols.collect().foreach { row =>
+ assert(row.get(1).isInstanceOf[String], "expected string but found " + row.get(1).getClass)
+ assert(row.get(2).isInstanceOf[String], "expected string but found " + row.get(2).getClass)
+ }
val describeAllCols = describeTestData.describe()
assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "age", "height"))
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org