You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/02/02 00:30:04 UTC
[spark] branch master updated: [SPARK-34306][SQL][PYTHON][R] Use
Snake naming rule across the function APIs
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 30468a9 [SPARK-34306][SQL][PYTHON][R] Use Snake naming rule across the function APIs
30468a9 is described below
commit 30468a901577e82c855fbc4cb78e1b869facb44c
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Tue Feb 2 09:29:40 2021 +0900
[SPARK-34306][SQL][PYTHON][R] Use Snake naming rule across the function APIs
### What changes were proposed in this pull request?
This PR completes snake_case rule at functions APIs across the languages, see also SPARK-10621.
In more details, this PR:
- Adds `count_distinct` in Scala Python, and R, and document that `count_distinct` is encouraged. This was not deprecated because `countDistinct` is pretty commonly used. We could deprecate in the future releases.
- (Scala-specific) adds `typedlit` but doesn't deprecate `typedLit` which is arguably commonly used. Likewise, we could deprecate in the future releases.
- Deprecates and renames:
- `sumDistinct` -> `sum_distinct`
- `bitwiseNOT` -> `bitwise_not`
- `shiftLeft` -> `shiftleft` (matched with SQL name in `FunctionRegistry`)
- `shiftRight` -> `shiftright` (matched with SQL name in `FunctionRegistry`)
- `shiftRightUnsigned` -> `shiftrightunsigned` (matched with SQL name in `FunctionRegistry`)
- (Scala-specific) `callUDF` -> `call_udf`
### Why are the changes needed?
To keep the consistent naming in APIs.
### Does this PR introduce _any_ user-facing change?
Yes, it deprecates some APIs and add new renamed APIs as described above.
### How was this patch tested?
Unittests were added.
Closes #31408 from HyukjinKwon/SPARK-34306.
Authored-by: HyukjinKwon <gu...@apache.org>
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
R/pkg/NAMESPACE | 6 +
R/pkg/R/functions.R | 125 +++++++++++++++++----
R/pkg/R/generics.R | 24 ++++
R/pkg/tests/fulltests/test_sparkSQL.R | 17 ++-
R/pkg/vignettes/sparkr-vignettes.Rmd | 2 +-
docs/sql-getting-started.md | 2 +-
.../spark/examples/ml/JavaTokenizerExample.java | 6 +-
python/docs/source/reference/pyspark.sql.rst | 9 +-
python/pyspark/sql/functions.py | 91 +++++++++++++--
python/pyspark/sql/functions.pyi | 6 +
python/pyspark/sql/tests/test_column.py | 2 +
python/pyspark/sql/tests/test_functions.py | 21 +++-
python/pyspark/sql/tests/test_group.py | 1 +
.../optimizer/RewriteDistinctAggregates.scala | 4 +-
.../main/scala/org/apache/spark/sql/Dataset.scala | 2 +-
.../scala/org/apache/spark/sql/functions.scala | 113 ++++++++++++++++---
.../org/apache/spark/sql/JavaDataFrameSuite.java | 2 +-
.../apache/spark/sql/DataFrameAggregateSuite.scala | 32 +++---
.../apache/spark/sql/DataFrameFunctionsSuite.scala | 6 +-
.../org/apache/spark/sql/DataFrameSuite.scala | 6 +-
.../org/apache/spark/sql/DateFunctionsSuite.scala | 2 +-
.../org/apache/spark/sql/MathFunctionsSuite.scala | 24 ++--
.../apache/spark/sql/execution/PlannerSuite.scala | 4 +-
.../spark/sql/execution/SameResultSuite.scala | 4 +-
.../apache/spark/sql/hive/JavaDataFrameSuite.java | 2 +-
.../spark/sql/hive/HiveSparkSubmitSuite.scala | 2 +-
.../hive/execution/ObjectHashAggregateSuite.scala | 2 +-
27 files changed, 409 insertions(+), 108 deletions(-)
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 6ef2df5..4b0e868 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -243,6 +243,7 @@ exportMethods("%<=>%",
"base64",
"between",
"bin",
+ "bitwise_not",
"bitwiseNOT",
"bround",
"cast",
@@ -259,6 +260,7 @@ exportMethods("%<=>%",
"cos",
"cosh",
"count",
+ "count_distinct",
"countDistinct",
"crc32",
"create_array",
@@ -391,8 +393,11 @@ exportMethods("%<=>%",
"sha1",
"sha2",
"shiftLeft",
+ "shiftleft",
"shiftRight",
+ "shiftright",
"shiftRightUnsigned",
+ "shiftrightunsigned",
"shuffle",
"sd",
"sign",
@@ -415,6 +420,7 @@ exportMethods("%<=>%",
"substr",
"substring_index",
"sum",
+ "sum_distinct",
"sumDistinct",
"tan",
"tanh",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 43b25a1..741878b 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -484,7 +484,7 @@ setMethod("acosh",
#' \dontrun{
#' head(select(df, approx_count_distinct(df$gear)))
#' head(select(df, approx_count_distinct(df$gear, 0.02)))
-#' head(select(df, countDistinct(df$gear, df$cyl)))
+#' head(select(df, count_distinct(df$gear, df$cyl)))
#' head(select(df, n_distinct(df$gear)))
#' head(distinct(select(df, "gear")))}
#' @note approx_count_distinct(Column) since 3.0.0
@@ -636,20 +636,33 @@ setMethod("bin",
})
#' @details
-#' \code{bitwiseNOT}: Computes bitwise NOT.
+#' \code{bitwise_not}: Computes bitwise NOT.
#'
#' @rdname column_nonaggregate_functions
-#' @aliases bitwiseNOT bitwiseNOT,Column-method
+#' @aliases bitwise_not bitwise_not,Column-method
#' @examples
#'
#' \dontrun{
-#' head(select(df, bitwiseNOT(cast(df$vs, "int"))))}
+#' head(select(df, bitwise_not(cast(df$vs, "int"))))}
+#' @note bitwise_not since 3.2.0
+setMethod("bitwise_not",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "bitwise_not", x@jc)
+ column(jc)
+ })
+
+#' @details
+#' \code{bitwiseNOT}: Computes bitwise NOT.
+#'
+#' @rdname column_nonaggregate_functions
+#' @aliases bitwiseNOT bitwiseNOT,Column-method
#' @note bitwiseNOT since 1.5.0
setMethod("bitwiseNOT",
signature(x = "Column"),
function(x) {
- jc <- callJStatic("org.apache.spark.sql.functions", "bitwiseNOT", x@jc)
- column(jc)
+ .Deprecated("bitwise_not")
+ bitwise_not(x)
})
#' @details
@@ -1937,21 +1950,34 @@ setMethod("sum",
})
#' @details
-#' \code{sumDistinct}: Returns the sum of distinct values in the expression.
+#' \code{sum_distinct}: Returns the sum of distinct values in the expression.
#'
#' @rdname column_aggregate_functions
-#' @aliases sumDistinct sumDistinct,Column-method
+#' @aliases sum_distinct sum_distinct,Column-method
#' @examples
#'
#' \dontrun{
-#' head(select(df, sumDistinct(df$gear)))
+#' head(select(df, sum_distinct(df$gear)))
#' head(distinct(select(df, "gear")))}
+#' @note sum_distinct since 3.2.0
+setMethod("sum_distinct",
+ signature(x = "Column"),
+ function(x) {
+ jc <- callJStatic("org.apache.spark.sql.functions", "sum_distinct", x@jc)
+ column(jc)
+ })
+
+#' @details
+#' \code{sumDistinct}: Returns the sum of distinct values in the expression.
+#'
+#' @rdname column_aggregate_functions
+#' @aliases sumDistinct sumDistinct,Column-method
#' @note sumDistinct since 1.4.0
setMethod("sumDistinct",
signature(x = "Column"),
function(x) {
- jc <- callJStatic("org.apache.spark.sql.functions", "sumDistinct", x@jc)
- column(jc)
+ .Deprecated("sum_distinct")
+ sum_distinct(x)
})
#' @details
@@ -2469,24 +2495,38 @@ setMethod("approxCountDistinct",
})
#' @details
-#' \code{countDistinct}: Returns the number of distinct items in a group.
+#' \code{count_distinct}: Returns the number of distinct items in a group.
#'
#' @rdname column_aggregate_functions
-#' @aliases countDistinct countDistinct,Column-method
-#' @note countDistinct since 1.4.0
-setMethod("countDistinct",
+#' @aliases count_distinct count_distinct,Column-method
+#' @note count_distinct since 3.2.0
+setMethod("count_distinct",
signature(x = "Column"),
function(x, ...) {
jcols <- lapply(list(...), function(x) {
stopifnot(class(x) == "Column")
x@jc
})
- jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc,
+ jc <- callJStatic("org.apache.spark.sql.functions", "count_distinct", x@jc,
jcols)
column(jc)
})
#' @details
+#' \code{countDistinct}: Returns the number of distinct items in a group.
+#'
+#' An alias of \code{count_distinct}, and it is encouraged to use \code{count_distinct} directly.
+#'
+#' @rdname column_aggregate_functions
+#' @aliases countDistinct countDistinct,Column-method
+#' @note countDistinct since 1.4.0
+setMethod("countDistinct",
+ signature(x = "Column"),
+ function(x, ...) {
+ count_distinct(x, ...)
+ })
+
+#' @details
#' \code{concat}: Concatenates multiple input columns together into a single column.
#' The function works with strings, binary and compatible array columns.
#'
@@ -2550,7 +2590,7 @@ setMethod("least",
#' @note n_distinct since 1.4.0
setMethod("n_distinct", signature(x = "Column"),
function(x, ...) {
- countDistinct(x, ...)
+ count_distinct(x, ...)
})
#' @rdname count
@@ -2894,6 +2934,21 @@ setMethod("sha2", signature(y = "Column", x = "numeric"),
})
#' @details
+#' \code{shiftleft}: Shifts the given value numBits left. If the given value is a long value,
+#' this function will return a long value else it will return an integer value.
+#'
+#' @rdname column_math_functions
+#' @aliases shiftleft shiftleft,Column,numeric-method
+#' @note shiftleft since 3.2.0
+setMethod("shiftleft", signature(y = "Column", x = "numeric"),
+ function(y, x) {
+ jc <- callJStatic("org.apache.spark.sql.functions",
+ "shiftleft",
+ y@jc, as.integer(x))
+ column(jc)
+ })
+
+#' @details
#' \code{shiftLeft}: Shifts the given value numBits left. If the given value is a long value,
#' this function will return a long value else it will return an integer value.
#'
@@ -2902,8 +2957,21 @@ setMethod("sha2", signature(y = "Column", x = "numeric"),
#' @note shiftLeft since 1.5.0
setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
function(y, x) {
+ .Deprecated("shiftleft")
+ shiftleft(y, x)
+ })
+
+#' @details
+#' \code{shiftright}: (Signed) shifts the given value numBits right. If the given value is a long
+#' value, it will return a long value else it will return an integer value.
+#'
+#' @rdname column_math_functions
+#' @aliases shiftright shiftright,Column,numeric-method
+#' @note shiftright since 3.2.0
+setMethod("shiftright", signature(y = "Column", x = "numeric"),
+ function(y, x) {
jc <- callJStatic("org.apache.spark.sql.functions",
- "shiftLeft",
+ "shiftright",
y@jc, as.integer(x))
column(jc)
})
@@ -2917,8 +2985,21 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
#' @note shiftRight since 1.5.0
setMethod("shiftRight", signature(y = "Column", x = "numeric"),
function(y, x) {
+ .Deprecated("shiftright")
+ shiftright(y, x)
+ })
+
+#' @details
+#' \code{shiftrightunsigned}: (Unsigned) shifts the given value numBits right. If the given value is
+#' a long value, it will return a long value else it will return an integer value.
+#'
+#' @rdname column_math_functions
+#' @aliases shiftrightunsigned shiftrightunsigned,Column,numeric-method
+#' @note shiftrightunsigned since 3.2.0
+setMethod("shiftrightunsigned", signature(y = "Column", x = "numeric"),
+ function(y, x) {
jc <- callJStatic("org.apache.spark.sql.functions",
- "shiftRight",
+ "shiftrightunsigned",
y@jc, as.integer(x))
column(jc)
})
@@ -2932,10 +3013,8 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"),
#' @note shiftRightUnsigned since 1.5.0
setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"),
function(y, x) {
- jc <- callJStatic("org.apache.spark.sql.functions",
- "shiftRightUnsigned",
- y@jc, as.integer(x))
- column(jc)
+ .Deprecated("shiftrightunsigned")
+ shiftrightunsigned(y, x)
})
#' @details
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 4c73cc2..a97fd4c 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -886,6 +886,10 @@ setGeneric("bin", function(x) { standardGeneric("bin") })
#' @rdname column_nonaggregate_functions
#' @name NULL
+setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") })
+
+#' @rdname column_nonaggregate_functions
+#' @name NULL
setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })
#' @rdname column_math_functions
@@ -925,6 +929,10 @@ setGeneric("conv", function(x, fromBase, toBase) { standardGeneric("conv") })
#' @rdname column_aggregate_functions
#' @name NULL
+setGeneric("count_distinct", function(x, ...) { standardGeneric("count_distinct") })
+
+#' @rdname column_aggregate_functions
+#' @name NULL
setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") })
#' @rdname column_misc_functions
@@ -1326,12 +1334,24 @@ setGeneric("shiftLeft", function(y, x) { standardGeneric("shiftLeft") })
#' @rdname column_math_functions
#' @name NULL
+setGeneric("shiftleft", function(y, x) { standardGeneric("shiftleft") })
+
+#' @rdname column_math_functions
+#' @name NULL
setGeneric("shiftRight", function(y, x) { standardGeneric("shiftRight") })
#' @rdname column_math_functions
#' @name NULL
+setGeneric("shiftright", function(y, x) { standardGeneric("shiftright") })
+
+#' @rdname column_math_functions
+#' @name NULL
setGeneric("shiftRightUnsigned", function(y, x) { standardGeneric("shiftRightUnsigned") })
+#' @rdname column_math_functions
+#' @name NULL
+setGeneric("shiftrightunsigned", function(y, x) { standardGeneric("shiftrightunsigned") })
+
#' @rdname column_collection_functions
#' @name NULL
setGeneric("shuffle", function(x) { standardGeneric("shuffle") })
@@ -1390,6 +1410,10 @@ setGeneric("substring_index", function(x, delim, count) { standardGeneric("subst
#' @rdname column_aggregate_functions
#' @name NULL
+setGeneric("sum_distinct", function(x) { standardGeneric("sum_distinct") })
+
+#' @rdname column_aggregate_functions
+#' @name NULL
setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") })
#' @rdname column_datetime_functions
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 4eafe8f..0ee23b2 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1397,7 +1397,8 @@ test_that("column operators", {
test_that("column functions", {
c <- column("a")
c1 <- abs(c) + acos(c) + approx_count_distinct(c) + ascii(c) + asin(c) + atan(c)
- c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
+ c2 <- avg(c) + base64(c) + bin(c) + suppressWarnings(bitwiseNOT(c)) +
+ bitwise_not(c) + cbrt(c) + ceil(c) + cos(c)
c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
@@ -1405,7 +1406,8 @@ test_that("column functions", {
c7 <- mean(c) + min(c) + month(c) + negate(c) + posexplode(c) + quarter(c)
c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
- c10 <- sumDistinct(c) + tan(c) + tanh(c) + degrees(c) + radians(c)
+ c10 <- suppressWarnings(sumDistinct(c)) + sum_distinct(c) + tan(c) + tanh(c) +
+ degrees(c) + radians(c)
c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
c12 <- variance(c) + xxhash64(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c")
c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
@@ -1457,6 +1459,8 @@ test_that("column functions", {
expect_equal(collect(df3)[[2, 1]], FALSE)
expect_equal(collect(df3)[[3, 1]], TRUE)
+ df4 <- select(df, count_distinct(df$age, df$name))
+ expect_equal(collect(df4)[[1, 1]], 2)
df4 <- select(df, countDistinct(df$age, df$name))
expect_equal(collect(df4)[[1, 1]], 2)
@@ -1887,9 +1891,12 @@ test_that("column binary mathfunctions", {
expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2))
expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2))
## nolint end
- expect_equal(collect(select(df, shiftLeft(df$b, 1)))[4, 1], 16)
- expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
- expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
+ expect_equal(collect(select(df, shiftleft(df$b, 1)))[4, 1], 16)
+ expect_equal(collect(select(df, shiftright(df$b, 1)))[4, 1], 4)
+ expect_equal(collect(select(df, shiftrightunsigned(df$b, 1)))[4, 1], 4)
+ expect_equal(collect(select(df, suppressWarnings(shiftLeft(df$b, 1))))[4, 1], 16)
+ expect_equal(collect(select(df, suppressWarnings(shiftRight(df$b, 1))))[4, 1], 4)
+ expect_equal(collect(select(df, suppressWarnings(shiftRightUnsigned(df$b, 1))))[4, 1], 4)
expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
expect_equal(collect(select(df, rand(1)))[1, 1], 0.636, tolerance = 0.01)
expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 3177b54..0ed0028 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -331,7 +331,7 @@ A common flow of grouping and aggregation is
2. Feed the `GroupedData` object to `agg` or `summarize` functions, with some provided aggregation functions to compute a number within each group.
-A number of widely used functions are supported to aggregate data after grouping, including `avg`, `countDistinct`, `count`, `first`, `kurtosis`, `last`, `max`, `mean`, `min`, `sd`, `skewness`, `stddev_pop`, `stddev_samp`, `sumDistinct`, `sum`, `var_pop`, `var_samp`, `var`. See the [API doc for aggregate functions](https://spark.apache.org/docs/latest/api/R/column_aggregate_functions.html) linked there.
+A number of widely used functions are supported to aggregate data after grouping, including `avg`, `count_distinct`, `count`, `first`, `kurtosis`, `last`, `max`, `mean`, `min`, `sd`, `skewness`, `stddev_pop`, `stddev_samp`, `sum_distinct`, `sum`, `var_pop`, `var_samp`, `var`. See the [API doc for aggregate functions](https://spark.apache.org/docs/latest/api/R/column_aggregate_functions.html) linked there.
For example we can compute a histogram of the number of cylinders in the `mtcars` dataset as shown below.
diff --git a/docs/sql-getting-started.md b/docs/sql-getting-started.md
index 5a6f182..5486e73 100644
--- a/docs/sql-getting-started.md
+++ b/docs/sql-getting-started.md
@@ -352,7 +352,7 @@ Scalar functions are functions that return a single value per row, as opposed to
## Aggregate Functions
-Aggregate functions are functions that return a single value on a group of rows. The [Built-in Aggregation Functions](sql-ref-functions-builtin.html#aggregate-functions) provide common aggregations such as `count()`, `countDistinct()`, `avg()`, `max()`, `min()`, etc.
+Aggregate functions are functions that return a single value on a group of rows. The [Built-in Aggregation Functions](sql-ref-functions-builtin.html#aggregate-functions) provide common aggregations such as `count()`, `count_distinct()`, `avg()`, `max()`, `min()`, etc.
Users are not limited to the predefined aggregate functions and can create their own. For more details
about user defined aggregate functions, please refer to the documentation of
[User Defined Aggregate Functions](sql-ref-functions-udf-aggregate.html).
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
index 3b5d8e6..091f7d1 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -36,7 +36,7 @@ import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// col("...") is preferable to df.col("...")
-import static org.apache.spark.sql.functions.callUDF;
+import static org.apache.spark.sql.functions.call_udf;
import static org.apache.spark.sql.functions.col;
// $example off$
@@ -73,12 +73,12 @@ public class JavaTokenizerExample {
Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
tokenized.select("sentence", "words")
- .withColumn("tokens", callUDF("countTokens", col("words")))
+ .withColumn("tokens", call_udf("countTokens", col("words")))
.show(false);
Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
regexTokenized.select("sentence", "words")
- .withColumn("tokens", callUDF("countTokens", col("words")))
+ .withColumn("tokens", call_udf("countTokens", col("words")))
.show(false);
// $example off$
diff --git a/python/docs/source/reference/pyspark.sql.rst b/python/docs/source/reference/pyspark.sql.rst
index 0dc2f6e..78926c9 100644
--- a/python/docs/source/reference/pyspark.sql.rst
+++ b/python/docs/source/reference/pyspark.sql.rst
@@ -340,6 +340,7 @@ Functions
avg
base64
bin
+ bitwise_not
bitwiseNOT
broadcast
bround
@@ -358,6 +359,7 @@ Functions
cos
cosh
count
+ count_distinct
countDistinct
covar_pop
covar_samp
@@ -482,9 +484,9 @@ Functions
sequence
sha1
sha2
- shiftLeft
- shiftRight
- shiftRightUnsigned
+ shiftleft
+ shiftright
+ shiftrightunsigned
shuffle
signum
sin
@@ -504,6 +506,7 @@ Functions
substring
substring_index
sum
+ sum_distinct
sumDistinct
tan
tanh
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 45dbedf..6b5f7ee 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -206,8 +206,20 @@ def mean(col):
def sumDistinct(col):
"""
Aggregate function: returns the sum of distinct values in the expression.
+
+ .. deprecated:: 3.2.0
+ Use :func:`sum_distinct` instead.
"""
- return _invoke_function_over_column("sumDistinct", col)
+ warnings.warn("Deprecated in 3.2, use sum_distinct instead.", FutureWarning)
+ return sum_distinct(col)
+
+
+@since(3.2)
+def sum_distinct(col):
+ """
+ Aggregate function: returns the sum of distinct values in the expression.
+ """
+ return _invoke_function_over_column("sum_distinct", col)
def acos(col):
@@ -494,8 +506,20 @@ def toRadians(col):
def bitwiseNOT(col):
"""
Computes bitwise not.
+
+ .. deprecated:: 3.2.0
+ Use :func:`bitwise_not` instead.
+ """
+ warnings.warn("Deprecated in 3.2, use bitwise_not instead.", FutureWarning)
+ return bitwise_not(col)
+
+
+@since(3.2)
+def bitwise_not(col):
+ """
+ Computes bitwise not.
"""
- return _invoke_function_over_column("bitwiseNOT", col)
+ return _invoke_function_over_column("bitwise_not", col)
@since(2.4)
@@ -810,7 +834,7 @@ def approx_count_distinct(col, rsd=None):
col : :class:`Column` or str
rsd : float, optional
maximum relative standard deviation allowed (default = 0.05).
- For rsd < 0.01, it is more efficient to use :func:`countDistinct`
+ For rsd < 0.01, it is more efficient to use :func:`count_distinct`
Examples
--------
@@ -928,18 +952,29 @@ def covar_samp(col1, col2):
def countDistinct(col, *cols):
"""Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.
+ An alias of :func:`count_distinct`, and it is encouraged to use :func:`count_distinct`
+ directly.
+
.. versionadded:: 1.3.0
+ """
+ return count_distinct(col, *cols)
+
+
+def count_distinct(col, *cols):
+ """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.
+
+ .. versionadded:: 3.2.0
Examples
--------
- >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
+ >>> df.agg(count_distinct(df.age, df.name).alias('c')).collect()
[Row(c=2)]
- >>> df.agg(countDistinct("age", "name").alias('c')).collect()
+ >>> df.agg(count_distinct("age", "name").alias('c')).collect()
[Row(c=2)]
"""
sc = SparkContext._active_spark_context
- jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
+ jc = sc._jvm.functions.count_distinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
return Column(jc)
@@ -1255,13 +1290,25 @@ def shiftLeft(col, numBits):
.. versionadded:: 1.5.0
+ .. deprecated:: 3.2.0
+ Use :func:`shiftleft` instead.
+ """
+ warnings.warn("Deprecated in 3.2, use shiftleft instead.", FutureWarning)
+ return shiftleft(col, numBits)
+
+
+def shiftleft(col, numBits):
+ """Shift the given value numBits left.
+
+ .. versionadded:: 3.2.0
+
Examples
--------
- >>> spark.createDataFrame([(21,)], ['a']).select(shiftLeft('a', 1).alias('r')).collect()
+ >>> spark.createDataFrame([(21,)], ['a']).select(shiftleft('a', 1).alias('r')).collect()
[Row(r=42)]
"""
sc = SparkContext._active_spark_context
- return Column(sc._jvm.functions.shiftLeft(_to_java_column(col), numBits))
+ return Column(sc._jvm.functions.shiftleft(_to_java_column(col), numBits))
def shiftRight(col, numBits):
@@ -1269,9 +1316,21 @@ def shiftRight(col, numBits):
.. versionadded:: 1.5.0
+ .. deprecated:: 3.2.0
+ Use :func:`shiftright` instead.
+ """
+ warnings.warn("Deprecated in 3.2, use shiftright instead.", FutureWarning)
+ return shiftright(col, numBits)
+
+
+def shiftright(col, numBits):
+ """(Signed) shift the given value numBits right.
+
+ .. versionadded:: 3.2.0
+
Examples
--------
- >>> spark.createDataFrame([(42,)], ['a']).select(shiftRight('a', 1).alias('r')).collect()
+ >>> spark.createDataFrame([(42,)], ['a']).select(shiftright('a', 1).alias('r')).collect()
[Row(r=21)]
"""
sc = SparkContext._active_spark_context
@@ -1284,10 +1343,22 @@ def shiftRightUnsigned(col, numBits):
.. versionadded:: 1.5.0
+ .. deprecated:: 3.2.0
+ Use :func:`shiftrightunsigned` instead.
+ """
+ warnings.warn("Deprecated in 3.2, use shiftrightunsigned instead.", FutureWarning)
+ return shiftrightunsigned(col, numBits)
+
+
+def shiftrightunsigned(col, numBits):
+ """Unsigned shift the given value numBits right.
+
+ .. versionadded:: 3.2.0
+
Examples
--------
>>> df = spark.createDataFrame([(-42,)], ['a'])
- >>> df.select(shiftRightUnsigned('a', 1).alias('r')).collect()
+ >>> df.select(shiftrightunsigned('a', 1).alias('r')).collect()
[Row(r=9223372036854775787)]
"""
sc = SparkContext._active_spark_context
diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi
index 273128b..e04edc7 100644
--- a/python/pyspark/sql/functions.pyi
+++ b/python/pyspark/sql/functions.pyi
@@ -45,6 +45,7 @@ def corr(col1: ColumnOrName, col2: ColumnOrName) -> Column: ...
def covar_pop(col1: ColumnOrName, col2: ColumnOrName) -> Column: ...
def covar_samp(col1: ColumnOrName, col2: ColumnOrName) -> Column: ...
def countDistinct(col: ColumnOrName, *cols: ColumnOrName) -> Column: ...
+def count_distinct(col: ColumnOrName, *cols: ColumnOrName) -> Column: ...
def first(col: ColumnOrName, ignorenulls: bool = ...) -> Column: ...
def grouping(col: ColumnOrName) -> Column: ...
def grouping_id(*cols: ColumnOrName) -> Column: ...
@@ -64,8 +65,11 @@ def randn(seed: Optional[int] = ...) -> Column: ...
def round(col: ColumnOrName, scale: int = ...) -> Column: ...
def bround(col: ColumnOrName, scale: int = ...) -> Column: ...
def shiftLeft(col: ColumnOrName, numBits: int) -> Column: ...
+def shiftleft(col: ColumnOrName, numBits: int) -> Column: ...
def shiftRight(col: ColumnOrName, numBits: int) -> Column: ...
+def shiftright(col: ColumnOrName, numBits: int) -> Column: ...
def shiftRightUnsigned(col: ColumnOrName, numBits: int) -> Column: ...
+def shiftrightunsigned(col: ColumnOrName, numBits: int) -> Column: ...
def spark_partition_id() -> Column: ...
def expr(str: str) -> Column: ...
def struct(*cols: ColumnOrName) -> Column: ...
@@ -278,6 +282,7 @@ def atan2(col1: ColumnOrName, col2: float) -> Column: ...
def avg(col: ColumnOrName) -> Column: ...
def base64(col: ColumnOrName) -> Column: ...
def bitwiseNOT(col: ColumnOrName) -> Column: ...
+def bitwise_not(col: ColumnOrName) -> Column: ...
def cbrt(col: ColumnOrName) -> Column: ...
def ceil(col: ColumnOrName) -> Column: ...
def col(col: str) -> Column: ...
@@ -333,6 +338,7 @@ def stddev_pop(col: ColumnOrName) -> Column: ...
def stddev_samp(col: ColumnOrName) -> Column: ...
def sum(col: ColumnOrName) -> Column: ...
def sumDistinct(col: ColumnOrName) -> Column: ...
+def sum_distinct(col: ColumnOrName) -> Column: ...
def tan(col: ColumnOrName) -> Column: ...
def tanh(col: ColumnOrName) -> Column: ...
def toDegrees(col: ColumnOrName) -> Column: ...
diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py
index 2ae0a9b..c2530b2 100644
--- a/python/pyspark/sql/tests/test_column.py
+++ b/python/pyspark/sql/tests/test_column.py
@@ -139,6 +139,8 @@ class ColumnTests(ReusedSQLTestCase):
self.assertEqual(170 ^ 75, result['(a ^ b)'])
result = df.select(functions.bitwiseNOT(df.b)).collect()[0].asDict()
self.assertEqual(~75, result['~b'])
+ result = df.select(functions.bitwise_not(df.b)).collect()[0].asDict()
+ self.assertEqual(~75, result['~b'])
def test_with_field(self):
from pyspark.sql.functions import lit, col
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 053164a..112043e 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -21,7 +21,9 @@ import re
from py4j.protocol import Py4JJavaError
from pyspark.sql import Row, Window
-from pyspark.sql.functions import udf, input_file_name, col, percentile_approx, lit
+from pyspark.sql.functions import udf, input_file_name, col, percentile_approx, \
+ lit, assert_true, sum_distinct, sumDistinct, shiftleft, shiftLeft, shiftRight, \
+ shiftright, shiftrightunsigned, shiftRightUnsigned
from pyspark.testing.sqlutils import ReusedSQLTestCase
@@ -640,6 +642,23 @@ class FunctionsTests(ReusedSQLTestCase):
str(cm.exception)
)
+ def test_sum_distinct(self):
+ self.spark.range(10).select(
+ assert_true(sum_distinct(col("id")) == sumDistinct(col("id")))).collect()
+
+ def test_shiftleft(self):
+ self.spark.range(10).select(
+ assert_true(shiftLeft(col("id"), 2) == shiftleft(col("id"), 2))).collect()
+
+ def test_shiftright(self):
+ self.spark.range(10).select(
+ assert_true(shiftRight(col("id"), 2) == shiftright(col("id"), 2))).collect()
+
+ def test_shiftrightunsigned(self):
+ self.spark.range(10).select(
+ assert_true(
+ shiftRightUnsigned(col("id"), 2) == shiftrightunsigned(col("id"), 2))).collect()
+
if __name__ == "__main__":
import unittest
diff --git a/python/pyspark/sql/tests/test_group.py b/python/pyspark/sql/tests/test_group.py
index 324c964..e603334 100644
--- a/python/pyspark/sql/tests/test_group.py
+++ b/python/pyspark/sql/tests/test_group.py
@@ -31,6 +31,7 @@ class GroupTests(ReusedSQLTestCase):
self.assertEqual((0, u'99'),
tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
self.assertTrue(95 < g.agg(functions.approx_count_distinct(df.key)).first()[0])
+ # test deprecated countDistinct
self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
index 188435d..e65e869 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
@@ -39,8 +39,8 @@ import org.apache.spark.sql.types.IntegerType
*
* val agg = data.groupBy($"key")
* .agg(
- * countDistinct($"cat1").as("cat1_cnt"),
- * countDistinct($"cat2").as("cat2_cnt"),
+ * count_distinct($"cat1").as("cat1_cnt"),
+ * count_distinct($"cat2").as("cat2_cnt"),
* sum($"value").as("total"))
* }}}
*
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index d399197..e78b811 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -2319,7 +2319,7 @@ class Dataset[T] private[sql](
*
* val allWords = ds.select('title, explode(split('words, " ")).as("word"))
*
- * val bookCountPerWord = allWords.groupBy("word").agg(countDistinct("title"))
+ * val bookCountPerWord = allWords.groupBy("word").agg(count_distinct("title"))
* }}}
*
* Using `flatMap()` this can similarly be exploded as:
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 40d2a51..de95067 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -116,6 +116,16 @@ object functions {
/**
* Creates a [[Column]] of literal value.
*
+ * An alias of `typedlit`, and it is encouraged to use `typedlit` directly.
+ *
+ * @group normal_funcs
+ * @since 2.2.0
+ */
+ def typedLit[T : TypeTag](literal: T): Column = typedlit(literal)
+
+ /**
+ * Creates a [[Column]] of literal value.
+ *
* The passed in object is returned directly if it is already a [[Column]].
* If the object is a Scala Symbol, it is converted into a [[Column]] also.
* Otherwise, a new [[Column]] is created to represent the literal value.
@@ -123,9 +133,9 @@ object functions {
* can handle parameterized scala types e.g.: List, Seq and Map.
*
* @group normal_funcs
- * @since 2.2.0
+ * @since 3.2.0
*/
- def typedLit[T : TypeTag](literal: T): Column = literal match {
+ def typedlit[T : TypeTag](literal: T): Column = literal match {
case c: Column => c
case s: Symbol => new ColumnName(s.name)
case _ => Column(Literal.create(literal))
@@ -388,24 +398,37 @@ object functions {
/**
* Aggregate function: returns the number of distinct items in a group.
*
+ * An alias of `count_distinct`, and it is encouraged to use `count_distinct` directly.
+ *
* @group agg_funcs
* @since 1.3.0
*/
@scala.annotation.varargs
- def countDistinct(expr: Column, exprs: Column*): Column =
- // For usage like countDistinct("*"), we should let analyzer expand star and
- // resolve function.
- Column(UnresolvedFunction("count", (expr +: exprs).map(_.expr), isDistinct = true))
+ def countDistinct(expr: Column, exprs: Column*): Column = count_distinct(expr, exprs: _*)
/**
* Aggregate function: returns the number of distinct items in a group.
*
+ * An alias of `count_distinct`, and it is encouraged to use `count_distinct` directly.
+ *
* @group agg_funcs
* @since 1.3.0
*/
@scala.annotation.varargs
def countDistinct(columnName: String, columnNames: String*): Column =
- countDistinct(Column(columnName), columnNames.map(Column.apply) : _*)
+ count_distinct(Column(columnName), columnNames.map(Column.apply) : _*)
+
+ /**
+ * Aggregate function: returns the number of distinct items in a group.
+ *
+ * @group agg_funcs
+ * @since 3.2.0
+ */
+ @scala.annotation.varargs
+ def count_distinct(expr: Column, exprs: Column*): Column =
+ // For usage like countDistinct("*"), we should let analyzer expand star and
+ // resolve function.
+ Column(UnresolvedFunction("count", (expr +: exprs).map(_.expr), isDistinct = true))
/**
* Aggregate function: returns the population covariance for two columns.
@@ -796,6 +819,7 @@ object functions {
* @group agg_funcs
* @since 1.3.0
*/
+ @deprecated("Use sum_distinct", "3.2.0")
def sumDistinct(e: Column): Column = withAggregateFunction(Sum(e.expr), isDistinct = true)
/**
@@ -804,7 +828,16 @@ object functions {
* @group agg_funcs
* @since 1.3.0
*/
- def sumDistinct(columnName: String): Column = sumDistinct(Column(columnName))
+ @deprecated("Use sum_distinct", "3.2.0")
+ def sumDistinct(columnName: String): Column = sum_distinct(Column(columnName))
+
+ /**
+ * Aggregate function: returns the sum of distinct values in the expression.
+ *
+ * @group agg_funcs
+ * @since 3.2.0
+ */
+ def sum_distinct(e: Column): Column = withAggregateFunction(Sum(e.expr), isDistinct = true)
/**
* Aggregate function: alias for `var_samp`.
@@ -1411,7 +1444,16 @@ object functions {
* @group normal_funcs
* @since 1.4.0
*/
- def bitwiseNOT(e: Column): Column = withExpr { BitwiseNot(e.expr) }
+ @deprecated("Use bitwise_not", "3.2.0")
+ def bitwiseNOT(e: Column): Column = bitwise_not(e)
+
+ /**
+ * Computes bitwise NOT (~) of a number.
+ *
+ * @group normal_funcs
+ * @since 3.2.0
+ */
+ def bitwise_not(e: Column): Column = withExpr { BitwiseNot(e.expr) }
/**
* Parses the expression string into the column that it represents, similar to
@@ -2142,7 +2184,17 @@ object functions {
* @group math_funcs
* @since 1.5.0
*/
- def shiftLeft(e: Column, numBits: Int): Column = withExpr { ShiftLeft(e.expr, lit(numBits).expr) }
+ @deprecated("Use shiftleft", "3.2.0")
+ def shiftLeft(e: Column, numBits: Int): Column = shiftleft(e, numBits)
+
+ /**
+ * Shift the given value numBits left. If the given value is a long value, this function
+ * will return a long value else it will return an integer value.
+ *
+ * @group math_funcs
+ * @since 3.2.0
+ */
+ def shiftleft(e: Column, numBits: Int): Column = withExpr { ShiftLeft(e.expr, lit(numBits).expr) }
/**
* (Signed) shift the given value numBits right. If the given value is a long value, it will
@@ -2151,7 +2203,17 @@ object functions {
* @group math_funcs
* @since 1.5.0
*/
- def shiftRight(e: Column, numBits: Int): Column = withExpr {
+ @deprecated("Use shiftright", "3.2.0")
+ def shiftRight(e: Column, numBits: Int): Column = shiftright(e, numBits)
+
+ /**
+ * (Signed) shift the given value numBits right. If the given value is a long value, it will
+ * return a long value else it will return an integer value.
+ *
+ * @group math_funcs
+ * @since 3.2.0
+ */
+ def shiftright(e: Column, numBits: Int): Column = withExpr {
ShiftRight(e.expr, lit(numBits).expr)
}
@@ -2162,7 +2224,17 @@ object functions {
* @group math_funcs
* @since 1.5.0
*/
- def shiftRightUnsigned(e: Column, numBits: Int): Column = withExpr {
+ @deprecated("Use shiftrightunsigned", "3.2.0")
+ def shiftRightUnsigned(e: Column, numBits: Int): Column = shiftrightunsigned(e, numBits)
+
+ /**
+ * Unsigned shift the given value numBits right. If the given value is a long value,
+ * it will return a long value else it will return an integer value.
+ *
+ * @group math_funcs
+ * @since 3.2.0
+ */
+ def shiftrightunsigned(e: Column, numBits: Int): Column = withExpr {
ShiftRightUnsigned(e.expr, lit(numBits).expr)
}
@@ -5075,6 +5147,17 @@ object functions {
/**
* Call an user-defined function.
+ *
+ * @group udf_funcs
+ * @since 1.5.0
+ */
+ @scala.annotation.varargs
+ @deprecated("Use call_udf")
+ def callUDF(udfName: String, cols: Column*): Column =
+ call_udf(udfName, cols: _*)
+
+ /**
+ * Call an user-defined function.
* Example:
* {{{
* import org.apache.spark.sql._
@@ -5082,14 +5165,14 @@ object functions {
* val df = Seq(("id1", 1), ("id2", 4), ("id3", 5)).toDF("id", "value")
* val spark = df.sparkSession
* spark.udf.register("simpleUDF", (v: Int) => v * v)
- * df.select($"id", callUDF("simpleUDF", $"value"))
+ * df.select($"id", call_udf("simpleUDF", $"value"))
* }}}
*
* @group udf_funcs
- * @since 1.5.0
+ * @since 3.2.0
*/
@scala.annotation.varargs
- def callUDF(udfName: String, cols: Column*): Column = withExpr {
+ def call_udf(udfName: String, cols: Column*): Column = withExpr {
UnresolvedFunction(udfName, cols.map(_.expr), isDistinct = false)
}
}
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
index f4bffd9..da7c622 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
@@ -105,7 +105,7 @@ public class JavaDataFrameSuite {
// Varargs in column expressions
df.groupBy().agg(countDistinct("key", "value"));
- df.groupBy().agg(countDistinct(col("key"), col("value")));
+ df.groupBy().agg(count_distinct(col("key"), col("value")));
df.select(coalesce(col("key")));
// Varargs with mathfunctions
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index c6d134b..07e6a40 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -297,7 +297,7 @@ class DataFrameAggregateSuite extends QueryTest
Row(2.0, 2.0))
checkAnswer(
- testData2.agg(avg($"a"), sumDistinct($"a")), // non-partial
+ testData2.agg(avg($"a"), sumDistinct($"a")), // non-partial and test deprecated version
Row(2.0, 6.0) :: Nil)
checkAnswer(
@@ -305,7 +305,7 @@ class DataFrameAggregateSuite extends QueryTest
Row(new java.math.BigDecimal(2)))
checkAnswer(
- decimalData.agg(avg($"a"), sumDistinct($"a")), // non-partial
+ decimalData.agg(avg($"a"), sum_distinct($"a")), // non-partial
Row(new java.math.BigDecimal(2), new java.math.BigDecimal(6)) :: Nil)
checkAnswer(
@@ -314,7 +314,7 @@ class DataFrameAggregateSuite extends QueryTest
// non-partial
checkAnswer(
decimalData.agg(
- avg($"a" cast DecimalType(10, 2)), sumDistinct($"a" cast DecimalType(10, 2))),
+ avg($"a" cast DecimalType(10, 2)), sum_distinct($"a" cast DecimalType(10, 2))),
Row(new java.math.BigDecimal(2), new java.math.BigDecimal(6)) :: Nil)
}
@@ -324,11 +324,11 @@ class DataFrameAggregateSuite extends QueryTest
Row(2.0))
checkAnswer(
- testData3.agg(avg($"b"), countDistinct($"b")),
+ testData3.agg(avg($"b"), count_distinct($"b")),
Row(2.0, 1))
checkAnswer(
- testData3.agg(avg($"b"), sumDistinct($"b")), // non-partial
+ testData3.agg(avg($"b"), sum_distinct($"b")), // non-partial
Row(2.0, 2.0))
}
@@ -339,7 +339,7 @@ class DataFrameAggregateSuite extends QueryTest
Row(null))
checkAnswer(
- emptyTableData.agg(avg($"a"), sumDistinct($"b")), // non-partial
+ emptyTableData.agg(avg($"a"), sum_distinct($"b")), // non-partial
Row(null, null))
}
@@ -347,7 +347,7 @@ class DataFrameAggregateSuite extends QueryTest
assert(testData2.count() === testData2.rdd.map(_ => 1).count())
checkAnswer(
- testData2.agg(count($"a"), sumDistinct($"a")), // non-partial
+ testData2.agg(count($"a"), sum_distinct($"a")), // non-partial
Row(6, 6.0))
}
@@ -364,12 +364,12 @@ class DataFrameAggregateSuite extends QueryTest
checkAnswer(
testData3.agg(
- count($"a"), count($"b"), count(lit(1)), countDistinct($"a"), countDistinct($"b")),
+ count($"a"), count($"b"), count(lit(1)), count_distinct($"a"), count_distinct($"b")),
Row(2, 1, 2, 2, 1)
)
checkAnswer(
- testData3.agg(count($"b"), countDistinct($"b"), sumDistinct($"b")), // non-partial
+ testData3.agg(count($"b"), count_distinct($"b"), sum_distinct($"b")), // non-partial
Row(1, 1, 2)
)
}
@@ -384,17 +384,17 @@ class DataFrameAggregateSuite extends QueryTest
.toDF("key1", "key2", "key3")
checkAnswer(
- df1.agg(countDistinct($"key1", $"key2")),
+ df1.agg(count_distinct($"key1", $"key2")),
Row(3)
)
checkAnswer(
- df1.agg(countDistinct($"key1", $"key2", $"key3")),
+ df1.agg(count_distinct($"key1", $"key2", $"key3")),
Row(3)
)
checkAnswer(
- df1.groupBy($"key1").agg(countDistinct($"key2", $"key3")),
+ df1.groupBy($"key1").agg(count_distinct($"key2", $"key3")),
Seq(Row("a", 2), Row("x", 1))
)
}
@@ -402,7 +402,7 @@ class DataFrameAggregateSuite extends QueryTest
test("zero count") {
val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b")
checkAnswer(
- emptyTableData.agg(count($"a"), sumDistinct($"a")), // non-partial
+ emptyTableData.agg(count($"a"), sum_distinct($"a")), // non-partial
Row(0, null))
}
@@ -433,7 +433,7 @@ class DataFrameAggregateSuite extends QueryTest
test("zero sum distinct") {
val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b")
checkAnswer(
- emptyTableData.agg(sumDistinct($"a")),
+ emptyTableData.agg(sum_distinct($"a")),
Row(null))
}
@@ -622,7 +622,7 @@ class DataFrameAggregateSuite extends QueryTest
val df = Seq((1, 3, "a"), (1, 2, "b"), (3, 4, "c"), (3, 4, "c"), (3, 5, "d"))
.toDF("x", "y", "z")
checkAnswer(
- df.groupBy($"x").agg(countDistinct($"y"), sort_array(collect_list($"z"))),
+ df.groupBy($"x").agg(count_distinct($"y"), sort_array(collect_list($"z"))),
Seq(Row(1, 2, Seq("a", "b")), Row(3, 2, Seq("c", "c", "d"))))
}
@@ -837,7 +837,7 @@ class DataFrameAggregateSuite extends QueryTest
)
}
- test("SPARK-27581: DataFrame countDistinct(\"*\") shouldn't fail with AnalysisException") {
+ test("SPARK-27581: DataFrame count_distinct(\"*\") shouldn't fail with AnalysisException") {
val df = sql("select id % 100 from range(100000)")
val distinctCount1 = df.select(expr("count(distinct(*))"))
val distinctCount2 = df.select(countDistinct("*"))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index aa1678e..99f17e7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -171,10 +171,10 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
)
}
- test("bitwiseNOT") {
+ test("bitwise_not") {
checkAnswer(
- testData2.select(bitwiseNOT($"a")),
- testData2.collect().toSeq.map(r => Row(~r.getInt(0))))
+ testData2.select(bitwiseNOT($"a"), bitwise_not($"a")),
+ testData2.collect().toSeq.map(r => Row(~r.getInt(0), ~r.getInt(0))))
}
test("bin") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 3cafeed3..10b99b8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -133,7 +133,7 @@ class DataFrameSuite extends QueryTest
df2
.select('_1 as 'letter, 'number)
.groupBy('letter)
- .agg(countDistinct('number)),
+ .agg(count_distinct('number)),
Row("a", 3) :: Row("b", 2) :: Row("c", 1) :: Nil
)
}
@@ -513,7 +513,7 @@ class DataFrameSuite extends QueryTest
Row(5, false)))
checkAnswer(
- testData2.select(sumDistinct($"a")),
+ testData2.select(sum_distinct($"a")),
Row(6))
}
@@ -607,7 +607,7 @@ class DataFrameSuite extends QueryTest
val df = Seq(("id1", 1), ("id2", 4), ("id3", 5)).toDF("id", "value")
df.sparkSession.udf.register("simpleUDF", (v: Int) => v * v)
checkAnswer(
- df.select($"id", callUDF("simpleUDF", $"value")),
+ df.select($"id", callUDF("simpleUDF", $"value")), // test deprecated one
Row("id1", 1) :: Row("id2", 16) :: Row("id3", 25) :: Nil)
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index b545d60..34b1654 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -47,7 +47,7 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
test("function current_timestamp and now") {
val df1 = Seq((1, 2), (3, 1)).toDF("a", "b")
- checkAnswer(df1.select(countDistinct(current_timestamp())), Row(1))
+ checkAnswer(df1.select(count_distinct(current_timestamp())), Row(1))
// Execution in one query should return the same value
checkAnswer(sql("""SELECT CURRENT_TIMESTAMP() = CURRENT_TIMESTAMP()"""), Row(true))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala
index 87526b1..3509804 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala
@@ -366,14 +366,14 @@ class MathFunctionsSuite extends QueryTest with SharedSparkSession {
checkAnswer(
df.select(
- shiftLeft('a, 1), shiftLeft('b, 1), shiftLeft('c, 1), shiftLeft('d, 1),
- shiftLeft('f, 1)),
+ shiftleft('a, 1), shiftleft('b, 1), shiftleft('c, 1), shiftleft('d, 1),
+ shiftLeft('f, 1)), // test deprecated one.
Row(42.toLong, 42, 42.toShort, 42.toByte, null))
checkAnswer(
df.selectExpr(
- "shiftLeft(a, 1)", "shiftLeft(b, 1)", "shiftLeft(b, 1)", "shiftLeft(d, 1)",
- "shiftLeft(f, 1)"),
+ "shiftleft(a, 1)", "shiftleft(b, 1)", "shiftleft(b, 1)", "shiftleft(d, 1)",
+ "shiftleft(f, 1)"),
Row(42.toLong, 42, 42.toShort, 42.toByte, null))
}
@@ -383,14 +383,14 @@ class MathFunctionsSuite extends QueryTest with SharedSparkSession {
checkAnswer(
df.select(
- shiftRight('a, 1), shiftRight('b, 1), shiftRight('c, 1), shiftRight('d, 1),
- shiftRight('f, 1)),
+ shiftright('a, 1), shiftright('b, 1), shiftright('c, 1), shiftright('d, 1),
+ shiftRight('f, 1)), // test deprecated one.
Row(21.toLong, 21, 21.toShort, 21.toByte, null))
checkAnswer(
df.selectExpr(
- "shiftRight(a, 1)", "shiftRight(b, 1)", "shiftRight(c, 1)", "shiftRight(d, 1)",
- "shiftRight(f, 1)"),
+ "shiftright(a, 1)", "shiftright(b, 1)", "shiftright(c, 1)", "shiftright(d, 1)",
+ "shiftright(f, 1)"),
Row(21.toLong, 21, 21.toShort, 21.toByte, null))
}
@@ -400,14 +400,14 @@ class MathFunctionsSuite extends QueryTest with SharedSparkSession {
checkAnswer(
df.select(
- shiftRightUnsigned('a, 1), shiftRightUnsigned('b, 1), shiftRightUnsigned('c, 1),
- shiftRightUnsigned('d, 1), shiftRightUnsigned('f, 1)),
+ shiftrightunsigned('a, 1), shiftrightunsigned('b, 1), shiftrightunsigned('c, 1),
+ shiftrightunsigned('d, 1), shiftRightUnsigned('f, 1)), // test deprecated one.
Row(9223372036854775787L, 21, 21.toShort, 21.toByte, null))
checkAnswer(
df.selectExpr(
- "shiftRightUnsigned(a, 1)", "shiftRightUnsigned(b, 1)", "shiftRightUnsigned(c, 1)",
- "shiftRightUnsigned(d, 1)", "shiftRightUnsigned(f, 1)"),
+ "shiftrightunsigned(a, 1)", "shiftrightunsigned(b, 1)", "shiftrightunsigned(c, 1)",
+ "shiftrightunsigned(d, 1)", "shiftrightunsigned(f, 1)"),
Row(9223372036854775787L, 21, 21.toShort, 21.toByte, null))
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 8924d2e..e851722 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -61,13 +61,13 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper {
}
test("count distinct is partially aggregated") {
- val query = testData.groupBy('value).agg(countDistinct('key)).queryExecution.analyzed
+ val query = testData.groupBy('value).agg(count_distinct('key)).queryExecution.analyzed
testPartialAggregationPlan(query)
}
test("mixed aggregates are partially aggregated") {
val query =
- testData.groupBy('value).agg(count('value), countDistinct('key)).queryExecution.analyzed
+ testData.groupBy('value).agg(count('value), count_distinct('key)).queryExecution.analyzed
testPartialAggregationPlan(query)
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SameResultSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SameResultSuite.scala
index 18d3667..d2406aa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SameResultSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SameResultSuite.scala
@@ -120,8 +120,8 @@ class SameResultSuite extends QueryTest with SharedSparkSession {
val df2 = spark.range(10).agg(sum($"id"))
assert(df1.queryExecution.executedPlan.sameResult(df2.queryExecution.executedPlan))
- val df3 = spark.range(10).agg(sumDistinct($"id"))
- val df4 = spark.range(10).agg(sumDistinct($"id"))
+ val df3 = spark.range(10).agg(sum_distinct($"id"))
+ val df4 = spark.range(10).agg(sum_distinct($"id"))
assert(df3.queryExecution.executedPlan.sameResult(df4.queryExecution.executedPlan))
}
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java
index 2b53238..268a31d 100644
--- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java
+++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java
@@ -85,7 +85,7 @@ public class JavaDataFrameSuite {
udaf.distinct(col("value")),
udaf.apply(col("value")),
registeredUDAF.apply(col("value")),
- callUDF("mydoublesum", col("value")));
+ callUDF("mydoublesum", col("value"))); // test deprecated one
List<Row> expectedResult = new ArrayList<>();
expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0));
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
index 77d54ed..4e64b4d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
@@ -711,7 +711,7 @@ object SPARK_9757 extends QueryTest {
val df =
hiveContext
.range(10)
- .select(callUDF("struct", ($"id" + 0.2) cast DecimalType(10, 3)) as "dec_struct")
+ .select(call_udf("struct", ($"id" + 0.2) cast DecimalType(10, 3)) as "dec_struct")
df.write.option("path", dir.getCanonicalPath).mode("overwrite").saveAsTable("t")
checkAnswer(hiveContext.table("t"), df)
}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala
index 72aeb4f..2bca25e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala
@@ -219,7 +219,7 @@ class ObjectHashAggregateSuite
val withPartialSafe = max($"c2")
// A Spark SQL native distinct aggregate function
- val withDistinct = countDistinct($"c3")
+ val withDistinct = count_distinct($"c3")
val allAggs = Seq(
"typed" -> typed,
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org