You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2018/09/25 12:13:13 UTC

spark git commit: [SPARK-23907][SQL] Revert regr_* functions entirely

Repository: spark
Updated Branches:
  refs/heads/master 7d8f5b62c -> 9cbd001e2


[SPARK-23907][SQL] Revert regr_* functions entirely

## What changes were proposed in this pull request?
This patch entirely reverts all the regr_* functions added in SPARK-23907. These were added by mgaido91 (and proposed by gatorsmile) to improve compatibility with other database systems, without any actual use cases. However, they are very rarely used, and in Spark there are much better ways to compute these functions, thanks to Spark's flexibility in exposing real programming APIs.

I'm going through all the APIs added in Spark 2.4 and I think we should revert these. If there are strong enough demands and more use cases, we can add them back in the future pretty easily.

## How was this patch tested?
The corresponding test cases were reverted as well.

Closes #22541 from rxin/SPARK-23907.

Authored-by: Reynold Xin <rx...@databricks.com>
Signed-off-by: hyukjinkwon <gu...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9cbd001e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9cbd001e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9cbd001e

Branch: refs/heads/master
Commit: 9cbd001e2476cd06aa0bcfcc77a21a9077d5797a
Parents: 7d8f5b6
Author: Reynold Xin <rx...@databricks.com>
Authored: Tue Sep 25 20:13:07 2018 +0800
Committer: hyukjinkwon <gu...@apache.org>
Committed: Tue Sep 25 20:13:07 2018 +0800

----------------------------------------------------------------------
 .../catalyst/analysis/FunctionRegistry.scala    |   9 -
 .../expressions/aggregate/regression.scala      | 190 -------------------
 .../sql-tests/inputs/udaf-regrfunctions.sql     |  56 ------
 .../results/udaf-regrfunctions.sql.out          |  93 ---------
 4 files changed, 348 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/9cbd001e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 8b69a47..7dafebf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -300,15 +300,6 @@ object FunctionRegistry {
     expression[CollectList]("collect_list"),
     expression[CollectSet]("collect_set"),
     expression[CountMinSketchAgg]("count_min_sketch"),
-    expression[RegrCount]("regr_count"),
-    expression[RegrSXX]("regr_sxx"),
-    expression[RegrSYY]("regr_syy"),
-    expression[RegrAvgX]("regr_avgx"),
-    expression[RegrAvgY]("regr_avgy"),
-    expression[RegrSXY]("regr_sxy"),
-    expression[RegrSlope]("regr_slope"),
-    expression[RegrR2]("regr_r2"),
-    expression[RegrIntercept]("regr_intercept"),
 
     // string functions
     expression[Ascii]("ascii"),

http://git-wip-us.apache.org/repos/asf/spark/blob/9cbd001e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/regression.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/regression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/regression.scala
deleted file mode 100644
index d8f4505..0000000
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/regression.scala
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.expressions.aggregate
-
-import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{AbstractDataType, DoubleType}
-
-/**
- * Base trait for all regression functions.
- */
-trait RegrLike extends AggregateFunction with ImplicitCastInputTypes {
-  def y: Expression
-  def x: Expression
-
-  override def children: Seq[Expression] = Seq(y, x)
-  override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType, DoubleType)
-
-  protected def updateIfNotNull(exprs: Seq[Expression]): Seq[Expression] = {
-    assert(aggBufferAttributes.length == exprs.length)
-    val nullableChildren = children.filter(_.nullable)
-    if (nullableChildren.isEmpty) {
-      exprs
-    } else {
-      exprs.zip(aggBufferAttributes).map { case (e, a) =>
-        If(nullableChildren.map(IsNull).reduce(Or), a, e)
-      }
-    }
-  }
-}
-
-
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns the number of non-null pairs.",
-  since = "2.4.0")
-case class RegrCount(y: Expression, x: Expression)
-  extends CountLike with RegrLike {
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(Seq(count + 1L))
-
-  override def prettyName: String = "regr_count"
-}
-
-
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns SUM(x*x)-SUM(x)*SUM(x)/N. Any pair with a NULL is ignored.",
-  since = "2.4.0")
-case class RegrSXX(y: Expression, x: Expression)
-  extends CentralMomentAgg(x) with RegrLike {
-
-  override protected def momentOrder = 2
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef)
-
-  override val evaluateExpression: Expression = {
-    If(n === Literal(0.0), Literal.create(null, DoubleType), m2)
-  }
-
-  override def prettyName: String = "regr_sxx"
-}
-
-
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns SUM(y*y)-SUM(y)*SUM(y)/N. Any pair with a NULL is ignored.",
-  since = "2.4.0")
-case class RegrSYY(y: Expression, x: Expression)
-  extends CentralMomentAgg(y) with RegrLike {
-
-  override protected def momentOrder = 2
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef)
-
-  override val evaluateExpression: Expression = {
-    If(n === Literal(0.0), Literal.create(null, DoubleType), m2)
-  }
-
-  override def prettyName: String = "regr_syy"
-}
-
-
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns the average of x. Any pair with a NULL is ignored.",
-  since = "2.4.0")
-case class RegrAvgX(y: Expression, x: Expression)
-  extends AverageLike(x) with RegrLike {
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef)
-
-  override def prettyName: String = "regr_avgx"
-}
-
-
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns the average of y. Any pair with a NULL is ignored.",
-  since = "2.4.0")
-case class RegrAvgY(y: Expression, x: Expression)
-  extends AverageLike(y) with RegrLike {
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef)
-
-  override def prettyName: String = "regr_avgy"
-}
-
-// scalastyle:off line.size.limit
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns the covariance of y and x multiplied for the number of items in the dataset. Any pair with a NULL is ignored.",
-  since = "2.4.0")
-// scalastyle:on line.size.limit
-case class RegrSXY(y: Expression, x: Expression)
-  extends Covariance(y, x) with RegrLike {
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef)
-
-  override val evaluateExpression: Expression = {
-    If(n === Literal(0.0), Literal.create(null, DoubleType), ck)
-  }
-
-  override def prettyName: String = "regr_sxy"
-}
-
-
-// scalastyle:off line.size.limit
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns the slope of the linear regression line. Any pair with a NULL is ignored.",
-  since = "2.4.0")
-// scalastyle:on line.size.limit
-case class RegrSlope(y: Expression, x: Expression)
-  extends PearsonCorrelation(y, x) with RegrLike {
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef)
-
-  override val evaluateExpression: Expression = {
-    If(n < Literal(2.0) || yMk === Literal(0.0), Literal.create(null, DoubleType), ck / yMk)
-  }
-
-  override def prettyName: String = "regr_slope"
-}
-
-
-// scalastyle:off line.size.limit
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns the coefficient of determination (also called R-squared or goodness of fit) for the regression line. Any pair with a NULL is ignored.",
-  since = "2.4.0")
-// scalastyle:on line.size.limit
-case class RegrR2(y: Expression, x: Expression)
-  extends PearsonCorrelation(y, x) with RegrLike {
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef)
-
-  override val evaluateExpression: Expression = {
-    If(n < Literal(2.0) || yMk === Literal(0.0), Literal.create(null, DoubleType),
-      If(xMk === Literal(0.0), Literal(1.0), ck * ck / yMk / xMk))
-  }
-
-  override def prettyName: String = "regr_r2"
-}
-
-
-// scalastyle:off line.size.limit
-@ExpressionDescription(
-  usage = "_FUNC_(y, x) - Returns the y-intercept of the linear regression line. Any pair with a NULL is ignored.",
-  since = "2.4.0")
-// scalastyle:on line.size.limit
-case class RegrIntercept(y: Expression, x: Expression)
-  extends PearsonCorrelation(y, x) with RegrLike {
-
-  override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef)
-
-  override val evaluateExpression: Expression = {
-    If(n === Literal(0.0) || yMk === Literal(0.0), Literal.create(null, DoubleType),
-      xAvg - (ck / yMk) * yAvg)
-  }
-
-  override def prettyName: String = "regr_intercept"
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/9cbd001e/sql/core/src/test/resources/sql-tests/inputs/udaf-regrfunctions.sql
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/inputs/udaf-regrfunctions.sql b/sql/core/src/test/resources/sql-tests/inputs/udaf-regrfunctions.sql
deleted file mode 100644
index 92c7e26..0000000
--- a/sql/core/src/test/resources/sql-tests/inputs/udaf-regrfunctions.sql
+++ /dev/null
@@ -1,56 +0,0 @@
---
---   Licensed to the Apache Software Foundation (ASF) under one or more
---   contributor license agreements.  See the NOTICE file distributed with
---   this work for additional information regarding copyright ownership.
---   The ASF licenses this file to You under the Apache License, Version 2.0
---   (the "License"); you may not use this file except in compliance with
---   the License.  You may obtain a copy of the License at
---
---      http://www.apache.org/licenses/LICENSE-2.0
---
---   Unless required by applicable law or agreed to in writing, software
---   distributed under the License is distributed on an "AS IS" BASIS,
---   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
---   See the License for the specific language governing permissions and
---   limitations under the License.
---
-
-CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES
- (101, 1, 1, 1),
- (201, 2, 1, 1),
- (301, 3, 1, 1),
- (401, 4, 1, 11),
- (501, 5, 1, null),
- (601, 6, null, 1),
- (701, 6, null, null),
- (102, 1, 2, 2),
- (202, 2, 1, 2),
- (302, 3, 2, 1),
- (402, 4, 2, 12),
- (502, 5, 2, null),
- (602, 6, null, 2),
- (702, 6, null, null),
- (103, 1, 3, 3),
- (203, 2, 1, 3),
- (303, 3, 3, 1),
- (403, 4, 3, 13),
- (503, 5, 3, null),
- (603, 6, null, 3),
- (703, 6, null, null),
- (104, 1, 4, 4),
- (204, 2, 1, 4),
- (304, 3, 4, 1),
- (404, 4, 4, 14),
- (504, 5, 4, null),
- (604, 6, null, 4),
- (704, 6, null, null),
- (800, 7, 1, 1)
-as t1(id, px, y, x);
-
-select px, var_pop(x), var_pop(y), corr(y,x), covar_samp(y,x), covar_pop(y,x), regr_count(y,x),
- regr_slope(y,x), regr_intercept(y,x), regr_r2(y,x), regr_sxx(y,x), regr_syy(y,x), regr_sxy(y,x),
- regr_avgx(y,x), regr_avgy(y,x), regr_count(y,x)
-from t1 group by px order by px;
-
-
-select id, regr_count(y,x) over (partition by px) from t1 order by id;

http://git-wip-us.apache.org/repos/asf/spark/blob/9cbd001e/sql/core/src/test/resources/sql-tests/results/udaf-regrfunctions.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/udaf-regrfunctions.sql.out b/sql/core/src/test/resources/sql-tests/results/udaf-regrfunctions.sql.out
deleted file mode 100644
index d7d009a..0000000
--- a/sql/core/src/test/resources/sql-tests/results/udaf-regrfunctions.sql.out
+++ /dev/null
@@ -1,93 +0,0 @@
--- Automatically generated by SQLQueryTestSuite
--- Number of queries: 3
-
-
--- !query 0
-CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES
- (101, 1, 1, 1),
- (201, 2, 1, 1),
- (301, 3, 1, 1),
- (401, 4, 1, 11),
- (501, 5, 1, null),
- (601, 6, null, 1),
- (701, 6, null, null),
- (102, 1, 2, 2),
- (202, 2, 1, 2),
- (302, 3, 2, 1),
- (402, 4, 2, 12),
- (502, 5, 2, null),
- (602, 6, null, 2),
- (702, 6, null, null),
- (103, 1, 3, 3),
- (203, 2, 1, 3),
- (303, 3, 3, 1),
- (403, 4, 3, 13),
- (503, 5, 3, null),
- (603, 6, null, 3),
- (703, 6, null, null),
- (104, 1, 4, 4),
- (204, 2, 1, 4),
- (304, 3, 4, 1),
- (404, 4, 4, 14),
- (504, 5, 4, null),
- (604, 6, null, 4),
- (704, 6, null, null),
- (800, 7, 1, 1)
-as t1(id, px, y, x)
--- !query 0 schema
-struct<>
--- !query 0 output
-
-
-
--- !query 1
-select px, var_pop(x), var_pop(y), corr(y,x), covar_samp(y,x), covar_pop(y,x), regr_count(y,x),
- regr_slope(y,x), regr_intercept(y,x), regr_r2(y,x), regr_sxx(y,x), regr_syy(y,x), regr_sxy(y,x),
- regr_avgx(y,x), regr_avgy(y,x), regr_count(y,x)
-from t1 group by px order by px
--- !query 1 schema
-struct<px:int,var_pop(CAST(x AS DOUBLE)):double,var_pop(CAST(y AS DOUBLE)):double,corr(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,covar_samp(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,covar_pop(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_count(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):bigint,regr_slope(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_intercept(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_r2(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_sxx(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_syy(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_sxy(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_avgx(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_avgy(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,regr_count(CAST(y AS DOUBLE), CAST(x AS DOUBLE)):bigint>
--- !query 1 output
-1	1.25	1.25	1.0	1.6666666666666667	1.25	4	1.0	0.0	1.0	5.0	5.0	5.0	2.5	2.5	4
-2	1.25	0.0	NULL	0.0	0.0	4	0.0	1.0	1.0	5.0	0.0	0.0	2.5	1.0	4
-3	0.0	1.25	NULL	0.0	0.0	4	NULL	NULL	NULL	0.0	5.0	0.0	1.0	2.5	4
-4	1.25	1.25	1.0	1.6666666666666667	1.25	4	1.0	-10.0	1.0	5.0	5.0	5.0	12.5	2.5	4
-5	NULL	1.25	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0
-6	1.25	NULL	NULL	NULL	NULL	0	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	0
-7	0.0	0.0	NaN	NaN	0.0	1	NULL	NULL	NULL	0.0	0.0	0.0	1.0	1.0	1
-
-
--- !query 2
-select id, regr_count(y,x) over (partition by px) from t1 order by id
--- !query 2 schema
-struct<id:int,regr_count(CAST(y AS DOUBLE), CAST(x AS DOUBLE)) OVER (PARTITION BY px ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING):bigint>
--- !query 2 output
-101	4
-102	4
-103	4
-104	4
-201	4
-202	4
-203	4
-204	4
-301	4
-302	4
-303	4
-304	4
-401	4
-402	4
-403	4
-404	4
-501	0
-502	0
-503	0
-504	0
-601	0
-602	0
-603	0
-604	0
-701	0
-702	0
-703	0
-704	0
-800	1


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org