You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2022/06/09 12:39:52 UTC

[spark] branch branch-3.3 updated: [SPARK-39427][SQL] Disable ANSI intervals in the percentile functions

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new d622f3133c1 [SPARK-39427][SQL] Disable ANSI intervals in the percentile functions
d622f3133c1 is described below

commit d622f3133c1acd3f00aba0108838fdba826cef9b
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Thu Jun 9 15:24:11 2022 +0300

    [SPARK-39427][SQL] Disable ANSI intervals in the percentile functions
    
    In the PR, I propose not to support ANSI intervals in the percentile functions, and to remove the YearMonthIntervalType and DayTimeIntervalType types from the list of input types. I propose to properly support ANSI intervals and re-enable them after that.
    
    To avoid confusing users with the results of the percentile functions when inputs are ANSI intervals. At the moment, the functions return the DOUBLE (or ARRAY OF DOUBLE) type regardless of the input types. In the case of ANSI intervals, the functions should return ANSI intervals too.
    
    No, since the functions haven't been released yet.
    
    By running affected test suites:
    ```
    $ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite"
    $ build/sbt "sql/testOnly *ExpressionsSchemaSuite"
    $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite"
    $ build/sbt "test:testOnly *PercentileSuite"
    $ build/sbt "test:testOnly *PercentileQuerySuite"
    ```
    and checked manually that ANSI intervals are not supported as input types:
    ```sql
    spark-sql> SELECT percentile(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
    Error in query: cannot resolve 'percentile(tab.col, CAST(0.5BD AS DOUBLE), 1L)' due to data type mismatch: argument 1 requires numeric type, however, 'tab.col' is of interval month type.; line 1 pos 7;
    ```
    
    Closes #36817 from MaxGekk/percentile-disable-ansi-interval.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Max Gekk <ma...@gmail.com>
    (cherry picked from commit ee24847ad100139628a9bffe45f711bdebaa0170)
    Signed-off-by: Max Gekk <ma...@gmail.com>
---
 .../sql/catalyst/expressions/aggregate/percentiles.scala | 11 +++--------
 .../catalyst/expressions/aggregate/PercentileSuite.scala |  8 ++++----
 .../org/apache/spark/sql/PercentileQuerySuite.scala      | 16 ++++++++--------
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
index e861fb370ca..bd62c0aef75 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
@@ -67,8 +67,7 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
       case _: ArrayType => ArrayType(DoubleType, false)
       case _ => DoubleType
     }
-    Seq(TypeCollection(NumericType, YearMonthIntervalType, DayTimeIntervalType),
-      percentageExpType, IntegralType)
+    Seq(NumericType, percentageExpType, IntegralType)
   }
 
   // Check the inputTypes are valid, and the percentageExpression satisfies:
@@ -288,7 +287,7 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
   usage =
     """
       _FUNC_(col, percentage [, frequency]) - Returns the exact percentile value of numeric
-       or ansi interval column `col` at the given percentage. The value of percentage must be
+       column `col` at the given percentage. The value of percentage must be
        between 0.0 and 1.0. The value of frequency should be positive integral
       _FUNC_(col, array(percentage1 [, percentage2]...) [, frequency]) - Returns the exact
       percentile value array of numeric column `col` at the given percentage(s). Each value
@@ -301,10 +300,6 @@ abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRe
        3.0
       > SELECT _FUNC_(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS tab(col);
        [2.5,7.5]
-      > SELECT _FUNC_(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
-       5.0
-      > SELECT _FUNC_(col, array(0.2, 0.5)) FROM VALUES (INTERVAL '0' SECOND), (INTERVAL '10' SECOND) AS tab(col);
-       [2000000.0,5000000.0]
   """,
   group = "agg_funcs",
   since = "2.1.0")
@@ -359,7 +354,7 @@ case class Percentile(
 
 /**
  * Return a percentile value based on a continuous distribution of
- * numeric or ansi interval column at the given percentage (specified in ORDER BY clause).
+ * numeric column at the given percentage (specified in ORDER BY clause).
  * The value of percentage must be between 0.0 and 1.0.
  */
 case class PercentileCont(left: Expression, right: Expression, reverse: Boolean = false)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
index 7b85be05e3b..6cc01f7bab9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
@@ -170,8 +170,8 @@ class PercentileSuite extends SparkFunSuite {
       val child = AttributeReference("a", dataType)()
       val percentile = new Percentile(child, percentage)
       assertEqual(percentile.checkInputDataTypes(),
-        TypeCheckFailure(s"argument 1 requires (numeric or interval year to month or " +
-          s"interval day to second) type, however, 'a' is of ${dataType.simpleString} type."))
+        TypeCheckFailure(s"argument 1 requires numeric type," +
+          s" however, 'a' is of ${dataType.simpleString} type."))
     }
 
     val invalidFrequencyDataTypes = Seq(FloatType, DoubleType, BooleanType,
@@ -184,8 +184,8 @@ class PercentileSuite extends SparkFunSuite {
       val frq = AttributeReference("frq", frequencyType)()
       val percentile = new Percentile(child, percentage, frq)
       assertEqual(percentile.checkInputDataTypes(),
-        TypeCheckFailure(s"argument 1 requires (numeric or interval year to month or " +
-          s"interval day to second) type, however, 'a' is of ${dataType.simpleString} type."))
+        TypeCheckFailure(s"argument 1 requires numeric type," +
+          s" however, 'a' is of ${dataType.simpleString} type."))
     }
 
     for(dataType <- validDataTypes;
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
index f39f0c18024..823c1375de0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/PercentileQuerySuite.scala
@@ -29,21 +29,21 @@ class PercentileQuerySuite extends QueryTest with SharedSparkSession {
 
   private val table = "percentile_test"
 
-  test("SPARK-37138: Support Ansi Interval type in Percentile") {
+  test("SPARK-37138, SPARK-39427: Disable Ansi Interval type in Percentile") {
     withTempView(table) {
       Seq((Period.ofMonths(100), Duration.ofSeconds(100L)),
         (Period.ofMonths(200), Duration.ofSeconds(200L)),
         (Period.ofMonths(300), Duration.ofSeconds(300L)))
         .toDF("col1", "col2").createOrReplaceTempView(table)
-      checkAnswer(
+      val e = intercept[AnalysisException] {
         spark.sql(
           s"""SELECT
-             |  CAST(percentile(col1, 0.5) AS STRING),
-             |  SUM(null),
-             |  CAST(percentile(col2, 0.5) AS STRING)
-             |FROM $table
-           """.stripMargin),
-        Row("200.0", null, "2.0E8"))
+            |  CAST(percentile(col1, 0.5) AS STRING),
+            |  SUM(null),
+            |  CAST(percentile(col2, 0.5) AS STRING)
+            |FROM $table""".stripMargin).collect()
+      }
+      assert(e.getMessage.contains("data type mismatch"))
     }
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org