You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/03/21 06:17:54 UTC
spark git commit: [SPARK-23666][SQL] Do not display exprIds of Alias
in user-facing info.
Repository: spark
Updated Branches:
refs/heads/master 477d6bd72 -> 983e8d9d6
[SPARK-23666][SQL] Do not display exprIds of Alias in user-facing info.
## What changes were proposed in this pull request?
To drop `exprId`s for `Alias` in user-facing info, this PR adds an entry for `Alias` in `NonSQLExpression.sql`.
## How was this patch tested?
Added tests in `UDFSuite`.
Author: Takeshi Yamamuro <ya...@apache.org>
Closes #20827 from maropu/SPARK-23666.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/983e8d9d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/983e8d9d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/983e8d9d
Branch: refs/heads/master
Commit: 983e8d9d64b6b1304c43ea6e5dffdc1078138ef9
Parents: 477d6bd
Author: Takeshi Yamamuro <ya...@apache.org>
Authored: Tue Mar 20 23:17:49 2018 -0700
Committer: Wenchen Fan <we...@databricks.com>
Committed: Tue Mar 20 23:17:49 2018 -0700
----------------------------------------------------------------------
docs/sql-programming-guide.md | 1 +
.../sql/catalyst/expressions/Expression.scala | 1 +
.../scala/org/apache/spark/sql/UDFSuite.scala | 132 +++++++++++--------
3 files changed, 78 insertions(+), 56 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/983e8d9d/docs/sql-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 0e092e0..5b47fd7 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1806,6 +1806,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see
- Since Spark 2.4, Spark maximizes the usage of a vectorized ORC reader for ORC files by default. To do that, `spark.sql.orc.impl` and `spark.sql.orc.filterPushdown` change their default values to `native` and `true` respectively.
- In PySpark, when Arrow optimization is enabled, previously `toPandas` just failed when Arrow optimization is unabled to be used whereas `createDataFrame` from Pandas DataFrame allowed the fallback to non-optimization. Now, both `toPandas` and `createDataFrame` from Pandas DataFrame allow the fallback by default, which can be switched off by `spark.sql.execution.arrow.fallback.enabled`.
- Since Spark 2.4, writing an empty dataframe to a directory launches at least one write task, even if physically the dataframe has no partition. This introduces a small behavior change that for self-describing file formats like Parquet and Orc, Spark creates a metadata-only file in the target directory when writing a 0-partition dataframe, so that schema inference can still work if users read that directory later. The new behavior is more reasonable and more consistent regarding writing empty dataframe.
+ - Since Spark 2.4, expression IDs in UDF arguments do not appear in column names. For example, a column name in Spark 2.4 is not `UDF:f(col0 AS colA#28)` but ``UDF:f(col0 AS `colA`)``.
## Upgrading From Spark SQL 2.2 to 2.3
http://git-wip-us.apache.org/repos/asf/spark/blob/983e8d9d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index d7f9e38..38caf67 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -288,6 +288,7 @@ trait NonSQLExpression extends Expression {
final override def sql: String = {
transform {
case a: Attribute => new PrettyAttribute(a)
+ case a: Alias => PrettyAttribute(a.sql, a.dataType)
}.toString
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/983e8d9d/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index af6a10b..21afdc7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -144,73 +144,81 @@ class UDFSuite extends QueryTest with SharedSQLContext {
}
test("UDF in a WHERE") {
- spark.udf.register("oneArgFilter", (n: Int) => { n > 80 })
+ withTempView("integerData") {
+ spark.udf.register("oneArgFilter", (n: Int) => { n > 80 })
- val df = sparkContext.parallelize(
- (1 to 100).map(i => TestData(i, i.toString))).toDF()
- df.createOrReplaceTempView("integerData")
+ val df = sparkContext.parallelize(
+ (1 to 100).map(i => TestData(i, i.toString))).toDF()
+ df.createOrReplaceTempView("integerData")
- val result =
- sql("SELECT * FROM integerData WHERE oneArgFilter(key)")
- assert(result.count() === 20)
+ val result =
+ sql("SELECT * FROM integerData WHERE oneArgFilter(key)")
+ assert(result.count() === 20)
+ }
}
test("UDF in a HAVING") {
- spark.udf.register("havingFilter", (n: Long) => { n > 5 })
-
- val df = Seq(("red", 1), ("red", 2), ("blue", 10),
- ("green", 100), ("green", 200)).toDF("g", "v")
- df.createOrReplaceTempView("groupData")
-
- val result =
- sql(
- """
- | SELECT g, SUM(v) as s
- | FROM groupData
- | GROUP BY g
- | HAVING havingFilter(s)
- """.stripMargin)
-
- assert(result.count() === 2)
+ withTempView("groupData") {
+ spark.udf.register("havingFilter", (n: Long) => { n > 5 })
+
+ val df = Seq(("red", 1), ("red", 2), ("blue", 10),
+ ("green", 100), ("green", 200)).toDF("g", "v")
+ df.createOrReplaceTempView("groupData")
+
+ val result =
+ sql(
+ """
+ | SELECT g, SUM(v) as s
+ | FROM groupData
+ | GROUP BY g
+ | HAVING havingFilter(s)
+ """.stripMargin)
+
+ assert(result.count() === 2)
+ }
}
test("UDF in a GROUP BY") {
- spark.udf.register("groupFunction", (n: Int) => { n > 10 })
-
- val df = Seq(("red", 1), ("red", 2), ("blue", 10),
- ("green", 100), ("green", 200)).toDF("g", "v")
- df.createOrReplaceTempView("groupData")
-
- val result =
- sql(
- """
- | SELECT SUM(v)
- | FROM groupData
- | GROUP BY groupFunction(v)
- """.stripMargin)
- assert(result.count() === 2)
+ withTempView("groupData") {
+ spark.udf.register("groupFunction", (n: Int) => { n > 10 })
+
+ val df = Seq(("red", 1), ("red", 2), ("blue", 10),
+ ("green", 100), ("green", 200)).toDF("g", "v")
+ df.createOrReplaceTempView("groupData")
+
+ val result =
+ sql(
+ """
+ | SELECT SUM(v)
+ | FROM groupData
+ | GROUP BY groupFunction(v)
+ """.stripMargin)
+ assert(result.count() === 2)
+ }
}
test("UDFs everywhere") {
- spark.udf.register("groupFunction", (n: Int) => { n > 10 })
- spark.udf.register("havingFilter", (n: Long) => { n > 2000 })
- spark.udf.register("whereFilter", (n: Int) => { n < 150 })
- spark.udf.register("timesHundred", (n: Long) => { n * 100 })
-
- val df = Seq(("red", 1), ("red", 2), ("blue", 10),
- ("green", 100), ("green", 200)).toDF("g", "v")
- df.createOrReplaceTempView("groupData")
-
- val result =
- sql(
- """
- | SELECT timesHundred(SUM(v)) as v100
- | FROM groupData
- | WHERE whereFilter(v)
- | GROUP BY groupFunction(v)
- | HAVING havingFilter(v100)
- """.stripMargin)
- assert(result.count() === 1)
+ withTempView("groupData") {
+ spark.udf.register("groupFunction", (n: Int) => { n > 10 })
+ spark.udf.register("havingFilter", (n: Long) => { n > 2000 })
+ spark.udf.register("whereFilter", (n: Int) => { n < 150 })
+ spark.udf.register("timesHundred", (n: Long) => { n * 100 })
+
+ val df = Seq(("red", 1), ("red", 2), ("blue", 10),
+ ("green", 100), ("green", 200)).toDF("g", "v")
+ df.createOrReplaceTempView("groupData")
+
+ val result =
+ sql(
+ """
+ | SELECT timesHundred(SUM(v)) as v100
+ | FROM groupData
+ | WHERE whereFilter(v)
+ | GROUP BY groupFunction(v)
+ | HAVING havingFilter(v100)
+ """.stripMargin)
+ assert(result.count() === 1)
+ }
}
test("struct UDF") {
@@ -304,4 +312,16 @@ class UDFSuite extends QueryTest with SharedSQLContext {
assert(explainStr(spark.range(1).select(udf1(udf2(functions.lit(1)))))
.contains(s"UDF:$udf1Name(UDF:$udf2Name(1))"))
}
+
+ test("SPARK-23666 Do not display exprId in argument names") {
+ withTempView("x") {
+ Seq(((1, 2), 3)).toDF("a", "b").createOrReplaceTempView("x")
+ spark.udf.register("f", (a: Int) => a)
+ val outputStream = new java.io.ByteArrayOutputStream()
+ Console.withOut(outputStream) {
+ spark.sql("SELECT f(a._1) FROM x").show
+ }
+ assert(outputStream.toString.contains("UDF:f(a._1 AS `_1`)"))
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org