Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2020/06/09 16:13:10 UTC

[GitHub] [spark] maropu commented on a change in pull request #28764: [SPARK-21117][SQL] Built-in SQL Function Support - WIDTH_BUCKET

maropu commented on a change in pull request #28764:
URL: https://github.com/apache/spark/pull/28764#discussion_r437169927



##########
File path: sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out
##########
@@ -4423,6 +4423,176 @@ struct<>
 
 
 
+-- !query
+SELECT width_bucket(5.0, 3.0, 4.0, 0)
+-- !query schema
+struct<widthbucket(CAST(5.0 AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(4.0 AS DOUBLE), CAST(0 AS BIGINT)):bigint>
+-- !query output
+NULL
+
+
+-- !query
+SELECT width_bucket(5.0, 3.0, 4.0, -5)
+-- !query schema
+struct<widthbucket(CAST(5.0 AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(4.0 AS DOUBLE), CAST(-5 AS BIGINT)):bigint>
+-- !query output
+NULL
+
+
+-- !query
+SELECT width_bucket(3.5, 3.0, 3.0, 888)
+-- !query schema
+struct<widthbucket(CAST(3.5 AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(888 AS BIGINT)):bigint>
+-- !query output
+NULL
+
+
+-- !query
+SELECT width_bucket(double(5.0), double(3.0), double(4.0), 0)
+-- !query schema
+struct<widthbucket(CAST(5.0 AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(4.0 AS DOUBLE), CAST(0 AS BIGINT)):bigint>
+-- !query output
+NULL
+
+
+-- !query
+SELECT width_bucket(double(5.0), double(3.0), double(4.0), -5)
+-- !query schema
+struct<widthbucket(CAST(5.0 AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(4.0 AS DOUBLE), CAST(-5 AS BIGINT)):bigint>
+-- !query output
+NULL
+
+
+-- !query
+SELECT width_bucket(double(3.5), double(3.0), double(3.0), 888)
+-- !query schema
+struct<widthbucket(CAST(3.5 AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(888 AS BIGINT)):bigint>
+-- !query output
+NULL
+
+
+-- !query
+SELECT width_bucket('NaN', 3.0, 4.0, 888)
+-- !query schema
+struct<widthbucket(CAST(NaN AS DOUBLE), CAST(3.0 AS DOUBLE), CAST(4.0 AS DOUBLE), CAST(888 AS BIGINT)):bigint>
+-- !query output
+NULL
+
+
+-- !query
+SELECT width_bucket(double(0), 'NaN', double(4.0), 888)
+-- !query schema
+struct<widthbucket(CAST(0 AS DOUBLE), CAST(NaN AS DOUBLE), CAST(4.0 AS DOUBLE), CAST(888 AS BIGINT)):bigint>
+-- !query output
+NULL
+
+
+-- !query
+CREATE TABLE width_bucket_test (operand_num decimal(30,15), operand_f8 double) USING parquet
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+INSERT INTO width_bucket_test VALUES
+    (-5.2, -5.2),
+    (-0.0000000001, -0.0000000001),
+    (0.000000000001, 0.000000000001),
+    (1, 1),
+    (1.99999999999999, 1.99999999999999),
+    (2, 2),
+    (2.00000000000001, 2.00000000000001),
+    (3, 3),
+    (4, 4),
+    (4.5, 4.5),
+    (5, 5),
+    (5.5, 5.5),
+    (6, 6),
+    (7, 7),
+    (8, 8),
+    (9, 9),
+    (9.99999999999999, 9.99999999999999),
+    (10, 10),
+    (10.0000000000001, 10.0000000000001)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+SELECT
+    operand_num,
+    width_bucket(operand_num, 0, 10, 5) AS wb_1,
+    width_bucket(operand_f8, 0, 10, 5) AS wb_1f,
+    width_bucket(operand_num, 10, 0, 5) AS wb_2,
+    width_bucket(operand_f8, 10, 0, 5) AS wb_2f,
+    width_bucket(operand_num, 2, 8, 4) AS wb_3,
+    width_bucket(operand_f8, 2, 8, 4) AS wb_3f,
+    width_bucket(operand_num, 5.0, 5.5, 20) AS wb_4,
+    width_bucket(operand_f8, 5.0, 5.5, 20) AS wb_4f,
+    width_bucket(operand_num, -25, 25, 10) AS wb_5,
+    width_bucket(operand_f8, -25, 25, 10) AS wb_5f
+    FROM width_bucket_test
+-- !query schema
+struct<operand_num:decimal(30,15),wb_1:bigint,wb_1f:bigint,wb_2:bigint,wb_2f:bigint,wb_3:bigint,wb_3f:bigint,wb_4:bigint,wb_4f:bigint,wb_5:bigint,wb_5f:bigint>
+-- !query output
+-0.000000000100000	0	0	6	6	0	0	0	0	5	5
+-5.200000000000000	0	0	6	6	0	0	0	0	4	4
+0.000000000001000	1	1	5	5	0	0	0	0	6	6
+1.000000000000000	1	1	5	5	0	0	0	0	6	6
+1.999999999999990	1	1	5	5	0	0	0	0	6	6
+10.000000000000000	6	6	1	1	5	5	21	21	8	8
+10.000000000000100	6	6	0	0	5	5	21	21	8	8
+2.000000000000000	2	2	5	5	1	1	0	0	6	6
+2.000000000000010	2	2	4	4	1	1	0	0	6	6
+3.000000000000000	2	2	4	4	1	1	0	0	6	6
+4.000000000000000	3	3	4	4	2	2	0	0	6	6
+4.500000000000000	3	3	3	3	2	2	0	0	6	6
+5.000000000000000	3	3	3	3	3	3	1	1	7	7
+5.500000000000000	3	3	3	3	3	3	21	21	7	7
+6.000000000000000	4	4	3	3	3	3	21	21	7	7
+7.000000000000000	4	4	2	2	4	4	21	21	7	7
+8.000000000000000	5	5	2	2	5	5	21	21	7	7
+9.000000000000000	5	5	1	1	5	5	21	21	7	7
+9.999999999999990	5	5	1	1	5	5	21	21	7	7
+

Review comment:
       For the expected output, please see: https://github.com/postgres/postgres/blob/master/src/test/regress/expected/numeric.out#L778-L850
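   As a quick cross-check against that expected output, the in-range rows in the result table follow the usual WIDTH_BUCKET arithmetic. A minimal Scala sketch for ascending bounds only (the helper name `bucket` is hypothetical, not the PR's code):

    // Values below `lo` land in bucket 0; values at or above `hi` land in the
    // overflow bucket n + 1. That is why width_bucket(5.5, 5.0, 5.5, 20) = 21
    // (wb_4) and width_bucket(10, 0, 10, 5) = 6 (wb_1) in the output above.
    def bucket(v: Double, lo: Double, hi: Double, n: Long): Long = {
      require(lo < hi && n > 0)
      if (v < lo) 0L
      else if (v >= hi) n + 1
      else math.floor(((v - lo) / (hi - lo)) * n).toLong + 1
    }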

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala
##########
@@ -1325,3 +1325,94 @@ case class BRound(child: Expression, scale: Expression)
     with Serializable with ImplicitCastInputTypes {
   def this(child: Expression) = this(child, Literal(0))
 }
+
+object WidthBucket {
+
+  def computeBucketNumber(value: Double, min: Double, max: Double, numBucket: Long): jl.Long = {
+    // Checks error cases below:
+    //  - `numBucket` must be greater than zero and be less than Long.MaxValue
+    //  - `value`, `min`, and `max` cannot be NaN
+    //  - `min` bound cannot equal `max`
+    //  - `min` and `max` must be finite
+    if (numBucket <= 0 || numBucket == Long.MaxValue || jl.Double.isNaN(value) || min == max ||
+        jl.Double.isNaN(min) || jl.Double.isInfinite(min) ||
+        jl.Double.isNaN(max) || jl.Double.isInfinite(max)) {
+      return null

Review comment:
       I added the error checks to follow the PostgreSQL behaviour: https://github.com/postgres/postgres/blob/master/src/test/regress/expected/numeric.out#L778-L796
   https://github.com/postgres/postgres/blob/master/src/test/regress/expected/numeric.out#L837-L842
   The current logic returns null instead of throwing a runtime exception.
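   A standalone sketch of that guard, for readers without the full diff (assuming `jl` in the PR aliases `java.lang`; the wrapper name `computeOrNull` is hypothetical and the bucket arithmetic itself is elided here):

    import java.lang.{Double => JDouble, Long => JLong}

    // Mirrors the guard above: return SQL NULL (a null boxed Long) rather than
    // throwing when the bucket count is non-positive or Long.MaxValue, any
    // input is NaN, the bounds are equal, or a bound is infinite.
    def computeOrNull(value: Double, min: Double, max: Double, numBucket: Long): JLong = {
      val invalid = numBucket <= 0 || numBucket == Long.MaxValue ||
        JDouble.isNaN(value) || min == max ||
        JDouble.isNaN(min) || JDouble.isInfinite(min) ||
        JDouble.isNaN(max) || JDouble.isInfinite(max)
      if (invalid) null else JLong.valueOf(0L) // placeholder; the real bucket math goes here
    }

   With these rules, computeOrNull(5.0, 3.0, 4.0, 0) and computeOrNull(3.5, 3.0, 3.0, 888) both yield null, matching the NULL rows in the numeric.sql.out hunk above.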
   

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala
##########
@@ -1325,3 +1325,94 @@ case class BRound(child: Expression, scale: Expression)
     with Serializable with ImplicitCastInputTypes {
   def this(child: Expression) = this(child, Literal(0))
 }
+
+object WidthBucket {
+
+  def computeBucketNumber(value: Double, min: Double, max: Double, numBucket: Long): jl.Long = {
+    // Checks error cases below:
+    //  - `numBucket` must be greater than zero and be less than Long.MaxValue
+    //  - `value`, `min`, and `max` cannot be NaN
+    //  - `min` bound cannot equal `max`
+    //  - `min` and `max` must be finite
+    if (numBucket <= 0 || numBucket == Long.MaxValue || jl.Double.isNaN(value) || min == max ||
+        jl.Double.isNaN(min) || jl.Double.isInfinite(min) ||
+        jl.Double.isNaN(max) || jl.Double.isInfinite(max)) {
+      return null
+    }
+
+    val lower = Math.min(min, max)
+    val upper = Math.max(min, max)
+
+    if (min < max) {

Review comment:
       This logic follows the PostgreSQL one: https://github.com/postgres/postgres/blob/master/src/backend/utils/adt/float.c#L3916-L3943
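   For reference, a minimal Scala transcription of the linked width_bucket_float8 branches (a sketch of the PostgreSQL C logic, not the PR's exact Scala, which the hunk only partially shows; the guard from the previous comment is assumed to have run already):

    // Ascending bounds: buckets run from b1 up to b2; out-of-range values map
    // to bucket 0 (below) or count + 1 (above). Descending bounds mirror this,
    // counting buckets from b1 down to b2.
    def widthBucket(v: Double, b1: Double, b2: Double, count: Long): Long = {
      if (b1 < b2) {
        if (v < b1) 0L
        else if (v >= b2) count + 1
        else math.floor(((v - b1) / (b2 - b1)) * count).toLong + 1
      } else {
        if (v > b1) 0L
        else if (v <= b2) count + 1
        else math.floor(((b1 - v) / (b1 - b2)) * count).toLong + 1
      }
    }

   For example, widthBucket(1.0, 10, 0, 5) = 5, which matches the wb_2 column for operand 1 in the test output above.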




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


