Posted to commits@spark.apache.org by ma...@apache.org on 2022/06/09 12:24:48 UTC

[spark] branch master updated: [SPARK-39427][SQL] Disable ANSI intervals in the percentile functions

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new ee24847ad10 [SPARK-39427][SQL] Disable ANSI intervals in the percentile functions
ee24847ad10 is described below

commit ee24847ad100139628a9bffe45f711bdebaa0170
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Thu Jun 9 15:24:11 2022 +0300

    [SPARK-39427][SQL] Disable ANSI intervals in the percentile functions
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to drop support for ANSI intervals in the percentile functions by removing the YearMonthIntervalType and DayTimeIntervalType types from the list of input types. Once ANSI intervals are properly supported, they can be enabled again.
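    
    The resulting `inputTypes` in percentiles.scala, assembled from the hunk below (these are Catalyst-internal types, so the snippet lives inside the Catalyst source, not user code):
    ```scala
    // The first slot now accepts only numeric types; YearMonthIntervalType
    // and DayTimeIntervalType are gone from the accepted input types.
    override def inputTypes: Seq[AbstractDataType] = {
      val percentageExpType = percentageExpression.dataType match {
        case _: ArrayType => ArrayType(DoubleType, false)
        case _ => DoubleType
      }
      Seq(NumericType, percentageExpType, IntegralType)
    }
    ```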
    
    ### Why are the changes needed?
    To avoid confusing users with the results of the percentile functions when the inputs are ANSI intervals. At the moment, the functions return the DOUBLE (or ARRAY OF DOUBLE) type regardless of the input type. For ANSI interval inputs, the functions should return an ANSI interval as well.
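    
    For a numeric column that is the expected behavior; a minimal sketch via the DataFrame API (assuming a spark-shell session with `spark.implicits._` in scope):
    ```scala
    import org.apache.spark.sql.functions.expr
    import org.apache.spark.sql.types.DoubleType

    // The result type of percentile is DoubleType for any accepted input;
    // before this change an interval column took the same code path and
    // produced a plain double as well (e.g. 5.0 for a months column).
    val nums = Seq(0, 10).toDF("col")
    val resultType = nums.select(expr("percentile(col, 0.5)")).schema.head.dataType
    assert(resultType == DoubleType)
    ```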
    
    ### Does this PR introduce _any_ user-facing change?
    No, since the functions haven't been released yet.
    
    ### How was this patch tested?
    By running the affected test suites:
    ```
    $ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite"
    $ build/sbt "sql/testOnly *ExpressionsSchemaSuite"
    $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite"
    $ build/sbt "test:testOnly *PercentileSuite"
    $ build/sbt "test:testOnly *PercentileQuerySuite"
    ```
    and by checking manually that ANSI intervals are not supported as input types:
    ```sql
    spark-sql> SELECT percentile(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
    Error in query: cannot resolve 'percentile(tab.col, CAST(0.5BD AS DOUBLE), 1L)' due to data type mismatch: argument 1 requires numeric type, however, 'tab.col' is of interval month type.; line 1 pos 7;
    ```
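    
    The same check can be reproduced through the DataFrame API, mirroring the updated PercentileQuerySuite test (a sketch; assumes a spark-shell session with `spark.implicits._` in scope):
    ```scala
    import java.time.{Duration, Period}
    import org.apache.spark.sql.AnalysisException
    import org.apache.spark.sql.functions.expr

    // Interval columns built from java.time values, as in the test suite
    val df = Seq(
      (Period.ofMonths(0), Duration.ofSeconds(0L)),
      (Period.ofMonths(10), Duration.ofSeconds(10L))).toDF("col1", "col2")
    try {
      df.select(expr("percentile(col1, 0.5)")).collect()
    } catch {
      case e: AnalysisException =>
        // Expected after this change, matching the spark-sql session above
        assert(e.getMessage.contains("data type mismatch"))
    }
    ```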
    
    Closes #36817 from MaxGekk/percentile-disable-ansi-interval.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Max Gekk <ma...@gmail.com>
---
 .../sql/catalyst/expressions/aggregate/percentiles.scala | 15 ++++-----------
 .../catalyst/expressions/aggregate/PercentileSuite.scala |  8 ++++----
 .../org/apache/spark/sql/PercentileQuerySuite.scala      | 16 ++++++++--------
 3 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
index 3acb2f2cc97..0efe09bca2e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
@@ -67,8 +67,7 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
       case _: ArrayType => ArrayType(DoubleType, false)
       case _ => DoubleType
     }
-    Seq(TypeCollection(NumericType, YearMonthIntervalType, DayTimeIntervalType),
-      percentageExpType, IntegralType)
+    Seq(NumericType, percentageExpType, IntegralType)
   }
 
   // Check the inputTypes are valid, and the percentageExpression satisfies:
@@ -288,7 +287,7 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
   usage =
     """
       _FUNC_(col, percentage [, frequency]) - Returns the exact percentile value of numeric
-       or ansi interval column `col` at the given percentage. The value of percentage must be
+       column `col` at the given percentage. The value of percentage must be
        between 0.0 and 1.0. The value of frequency should be positive integral
 
       _FUNC_(col, array(percentage1 [, percentage2]...) [, frequency]) - Returns the exact
@@ -303,10 +302,6 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
        3.0
       > SELECT _FUNC_(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS tab(col);
        [2.5,7.5]
-      > SELECT _FUNC_(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
-       5.0
-      > SELECT _FUNC_(col, array(0.2, 0.5)) FROM VALUES (INTERVAL '0' SECOND), (INTERVAL '10' SECOND) AS tab(col);
-       [2000000.0,5000000.0]
   """,
   group = "agg_funcs",
   since = "2.1.0")
@@ -360,13 +355,11 @@ case class Percentile(
 }
 
 @ExpressionDescription(
-  usage = "_FUNC_(col) - Returns the median of numeric or ansi interval column `col`.",
+  usage = "_FUNC_(col) - Returns the median of numeric column `col`.",
   examples = """
     Examples:
       > SELECT _FUNC_(col) FROM VALUES (0), (10) AS tab(col);
        5.0
-      > SELECT _FUNC_(col) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
-       5.0
   """,
   group = "agg_funcs",
   since = "3.4.0")
@@ -385,7 +378,7 @@ case class Median(child: Expression)
 
 /**
  * Return a percentile value based on a continuous distribution of
- * numeric or ansi interval column at the given percentage (specified in ORDER BY clause).
+ * numeric column at the given percentage (specified in ORDER BY clause).
  * The value of percentage must be between 0.0 and 1.0.
  */
 case class PercentileCont(left: Expression, right: Expression, reverse: Boolean = false)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
index e08c6279fd6..a3ae7a44a30 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
@@ -170,8 +170,8 @@ class PercentileSuite extends SparkFunSuite {
       val child = AttributeReference("a", dataType)()
       val percentile = new Percentile(child, percentage)
       assertEqual(percentile.checkInputDataTypes(),
-        TypeCheckFailure(s"argument 1 requires (numeric or interval year to month or " +
-          s"interval day to second) type, however, 'a' is of ${dataType.simpleString} type."))
+        TypeCheckFailure(s"argument 1 requires numeric type," +
+          s" however, 'a' is of ${dataType.simpleString} type."))
     }
 
     val invalidFrequencyDataTypes = Seq(FloatType, DoubleType, BooleanType,
@@ -184,8 +184,8 @@ class PercentileSuite extends SparkFunSuite {
       val frq = AttributeReference("frq", frequencyType)()
       val percentile = new Percentile(child, percentage, frq)
       assertEqual(percentile.checkInputDataTypes(),
-        TypeCheckFailure(s"argument 1 requires (numeric or interval year to month or " +
-          s"interval day to second) type, however, 'a' is of ${dataType.simpleString} type."))
+        TypeCheckFailure(s"argument 1 requires numeric type," +
+          s" however, 'a' is of ${dataType.simpleString} type."))
     }
 
     for(dataType <- validDataTypes;
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
index f39f0c18024..823c1375de0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
@@ -29,21 +29,21 @@ class PercentileQuerySuite extends QueryTest with SharedSparkSession {
 
   private val table = "percentile_test"
 
-  test("SPARK-37138: Support Ansi Interval type in Percentile") {
+  test("SPARK-37138, SPARK-39427: Disable Ansi Interval type in Percentile") {
     withTempView(table) {
       Seq((Period.ofMonths(100), Duration.ofSeconds(100L)),
         (Period.ofMonths(200), Duration.ofSeconds(200L)),
         (Period.ofMonths(300), Duration.ofSeconds(300L)))
         .toDF("col1", "col2").createOrReplaceTempView(table)
-      checkAnswer(
+      val e = intercept[AnalysisException] {
         spark.sql(
           s"""SELECT
-             |  CAST(percentile(col1, 0.5) AS STRING),
-             |  SUM(null),
-             |  CAST(percentile(col2, 0.5) AS STRING)
-             |FROM $table
-           """.stripMargin),
-        Row("200.0", null, "2.0E8"))
+            |  CAST(percentile(col1, 0.5) AS STRING),
+            |  SUM(null),
+            |  CAST(percentile(col2, 0.5) AS STRING)
+            |FROM $table""".stripMargin).collect()
+      }
+      assert(e.getMessage.contains("data type mismatch"))
     }
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org