You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2019/04/29 13:17:53 UTC
[spark] branch master updated: [SPARK-27581][SQL] DataFrame
countDistinct("*") shouldn't fail with AnalysisException
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 76785cd [SPARK-27581][SQL] DataFrame countDistinct("*") shouldn't fail with AnalysisException
76785cd is described below
commit 76785cd6f0d26825e9a79f831239633e953cef74
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Mon Apr 29 21:17:32 2019 +0800
[SPARK-27581][SQL] DataFrame countDistinct("*") shouldn't fail with AnalysisException
## What changes were proposed in this pull request?
Currently `countDistinct("*")` doesn't work. An analysis exception is thrown:
```scala
val df = sql("select id % 100 from range(100000)")
df.select(countDistinct("*")).first()
org.apache.spark.sql.AnalysisException: Invalid usage of '*' in expression 'count';
```
Users need to use `expr`.
```scala
df.select(expr("count(distinct(*))")).first()
```
This limits some API usage like `df.select(count("*"), countDistinct("*"))`.
The PR takes the simplest fix that lets analyzer expand star and resolve `count` function.
## How was this patch tested?
Added unit test.
Closes #24482 from viirya/SPARK-27581.
Authored-by: Liang-Chi Hsieh <vi...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 7 ++++---
.../scala/org/apache/spark/sql/DataFrameAggregateSuite.scala | 10 ++++++++++
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index c1997b6..f92bf79 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -358,9 +358,10 @@ object functions {
* @since 1.3.0
*/
@scala.annotation.varargs
- def countDistinct(expr: Column, exprs: Column*): Column = {
- withAggregateFunction(Count.apply((expr +: exprs).map(_.expr)), isDistinct = true)
- }
+ def countDistinct(expr: Column, exprs: Column*): Column =
+ // For usage like countDistinct("*"), we should let analyzer expand star and
+ // resolve function.
+ Column(UnresolvedFunction("count", (expr +: exprs).map(_.expr), isDistinct = true))
/**
* Aggregate function: returns the number of distinct items in a group.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 73259a0..97aaa1b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -772,4 +772,14 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {
Row(Seq(0.0f, 0.0f), Row(0.0d, Double.NaN), Seq(Row(0.0d, Double.NaN)), 2)
)
}
+
+ test("SPARK-27581: DataFrame countDistinct(\"*\") shouldn't fail with AnalysisException") {
+ val df = sql("select id % 100 from range(100000)")
+ val distinctCount1 = df.select(expr("count(distinct(*))"))
+ val distinctCount2 = df.select(countDistinct("*"))
+ checkAnswer(distinctCount1, distinctCount2)
+
+ val countAndDistinct = df.select(count("*"), countDistinct("*"))
+ checkAnswer(countAndDistinct, Row(100000, 100))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org