You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/12/02 02:46:25 UTC
spark git commit: [SPARK-26216][SQL] Do not use case class as public
API (UserDefinedFunction)
Repository: spark
Updated Branches:
refs/heads/master 3e46e3ccd -> 39617cb2c
[SPARK-26216][SQL] Do not use case class as public API (UserDefinedFunction)
## What changes were proposed in this pull request?
It's a bad idea to use case class as public API, as it has a very wide surface. For example, the `copy` method, its fields, the companion object, etc.
For a particular case, `UserDefinedFunction`. It has a private constructor, and I believe we only want users to access a few methods:`apply`, `nullable`, `asNonNullable`, etc.
However, all its fields, and `copy` method, and the companion object are public unexpectedly. As a result, we made many tricks to work around the binary compatibility issues.
This PR proposes to only make interfaces public, and hide implementations behind with a private class. Now `UserDefinedFunction` is a pure trait, and the concrete implementation is `SparkUserDefinedFunction`, which is private.
Changing class to interface is not binary compatible(but source compatible), so 3.0 is a good chance to do it.
This is the first PR to go with this direction. If it's accepted, I'll create a umbrella JIRA and fix all the public case classes.
## How was this patch tested?
existing tests.
Closes #23178 from cloud-fan/udf.
Authored-by: Wenchen Fan <we...@databricks.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39617cb2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39617cb2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39617cb2
Branch: refs/heads/master
Commit: 39617cb2c0c433494e9f17fcd4e49c6300a9c4b0
Parents: 3e46e3ccd
Author: Wenchen Fan <we...@databricks.com>
Authored: Sun Dec 2 10:46:17 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Sun Dec 2 10:46:17 2018 +0800
----------------------------------------------------------------------
docs/sql-migration-guide-upgrade.md | 2 +
project/MimaExcludes.scala | 6 +-
.../sql/expressions/UserDefinedFunction.scala | 119 +++++++++----------
.../scala/org/apache/spark/sql/functions.scala | 2 +-
4 files changed, 63 insertions(+), 66 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/39617cb2/docs/sql-migration-guide-upgrade.md
----------------------------------------------------------------------
diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md
index e48125a..787f4bc 100644
--- a/docs/sql-migration-guide-upgrade.md
+++ b/docs/sql-migration-guide-upgrade.md
@@ -31,6 +31,8 @@ displayTitle: Spark SQL Upgrading Guide
- In Spark version 2.4 and earlier, the `SET` command works without any warnings even if the specified key is for `SparkConf` entries and it has no effect because the command does not update `SparkConf`, but the behavior might confuse users. Since 3.0, the command fails if a `SparkConf` key is used. You can disable such a check by setting `spark.sql.legacy.execution.setCommandRejectsSparkConfs` to `false`.
+ - Spark applications which are built with Spark version 2.4 and prior, and call methods of `UserDefinedFunction`, need to be re-compiled with Spark 3.0, as they are not binary compatible with Spark 3.0.
+
## Upgrading From Spark SQL 2.3 to 2.4
- In Spark version 2.3 and earlier, the second parameter to array_contains function is implicitly promoted to the element type of first array type parameter. This type promotion can be lossy and may cause `array_contains` function to return wrong result. This problem has been addressed in 2.4 by employing a safer type promotion mechanism. This can cause some change in behavior and are illustrated in the table below.
http://git-wip-us.apache.org/repos/asf/spark/blob/39617cb2/project/MimaExcludes.scala
----------------------------------------------------------------------
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index fcef424..1c83cf5 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -227,7 +227,11 @@ object MimaExcludes {
case ReversedMissingMethodProblem(meth) =>
!meth.owner.fullName.startsWith("org.apache.spark.sql.sources.v2")
case _ => true
- }
+ },
+
+ // [SPARK-26216][SQL] Do not use case class as public API (UserDefinedFunction)
+ ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.expressions.UserDefinedFunction$"),
+ ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.sql.expressions.UserDefinedFunction")
)
// Exclude rules for 2.4.x
http://git-wip-us.apache.org/repos/asf/spark/blob/39617cb2/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
index 58a942a..f88e0e0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
@@ -38,25 +38,14 @@ import org.apache.spark.sql.types.DataType
* @since 1.3.0
*/
@Stable
-case class UserDefinedFunction protected[sql] (
- f: AnyRef,
- dataType: DataType,
- inputTypes: Option[Seq[DataType]]) {
-
- private var _nameOption: Option[String] = None
- private var _nullable: Boolean = true
- private var _deterministic: Boolean = true
-
- // This is a `var` instead of in the constructor for backward compatibility of this case class.
- // TODO: revisit this case class in Spark 3.0, and narrow down the public surface.
- private[sql] var nullableTypes: Option[Seq[Boolean]] = None
+sealed trait UserDefinedFunction {
/**
* Returns true when the UDF can return a nullable value.
*
* @since 2.3.0
*/
- def nullable: Boolean = _nullable
+ def nullable: Boolean
/**
* Returns true iff the UDF is deterministic, i.e. the UDF produces the same output given the same
@@ -64,7 +53,7 @@ case class UserDefinedFunction protected[sql] (
*
* @since 2.3.0
*/
- def deterministic: Boolean = _deterministic
+ def deterministic: Boolean
/**
* Returns an expression that invokes the UDF, using the given arguments.
@@ -72,80 +61,83 @@ case class UserDefinedFunction protected[sql] (
* @since 1.3.0
*/
@scala.annotation.varargs
- def apply(exprs: Column*): Column = {
+ def apply(exprs: Column*): Column
+
+ /**
+ * Updates UserDefinedFunction with a given name.
+ *
+ * @since 2.3.0
+ */
+ def withName(name: String): UserDefinedFunction
+
+ /**
+ * Updates UserDefinedFunction to non-nullable.
+ *
+ * @since 2.3.0
+ */
+ def asNonNullable(): UserDefinedFunction
+
+ /**
+ * Updates UserDefinedFunction to nondeterministic.
+ *
+ * @since 2.3.0
+ */
+ def asNondeterministic(): UserDefinedFunction
+}
+
+private[sql] case class SparkUserDefinedFunction(
+ f: AnyRef,
+ dataType: DataType,
+ inputTypes: Option[Seq[DataType]],
+ nullableTypes: Option[Seq[Boolean]],
+ name: Option[String] = None,
+ nullable: Boolean = true,
+ deterministic: Boolean = true) extends UserDefinedFunction {
+
+ @scala.annotation.varargs
+ override def apply(exprs: Column*): Column = {
// TODO: make sure this class is only instantiated through `SparkUserDefinedFunction.create()`
// and `nullableTypes` is always set.
- if (nullableTypes.isEmpty) {
- nullableTypes = Some(ScalaReflection.getParameterTypeNullability(f))
- }
if (inputTypes.isDefined) {
assert(inputTypes.get.length == nullableTypes.get.length)
}
+ val inputsNullSafe = nullableTypes.getOrElse {
+ ScalaReflection.getParameterTypeNullability(f)
+ }
+
Column(ScalaUDF(
f,
dataType,
exprs.map(_.expr),
- nullableTypes.get,
+ inputsNullSafe,
inputTypes.getOrElse(Nil),
- udfName = _nameOption,
- nullable = _nullable,
- udfDeterministic = _deterministic))
- }
-
- private def copyAll(): UserDefinedFunction = {
- val udf = copy()
- udf._nameOption = _nameOption
- udf._nullable = _nullable
- udf._deterministic = _deterministic
- udf.nullableTypes = nullableTypes
- udf
+ udfName = name,
+ nullable = nullable,
+ udfDeterministic = deterministic))
}
- /**
- * Updates UserDefinedFunction with a given name.
- *
- * @since 2.3.0
- */
- def withName(name: String): UserDefinedFunction = {
- val udf = copyAll()
- udf._nameOption = Option(name)
- udf
+ override def withName(name: String): UserDefinedFunction = {
+ copy(name = Option(name))
}
- /**
- * Updates UserDefinedFunction to non-nullable.
- *
- * @since 2.3.0
- */
- def asNonNullable(): UserDefinedFunction = {
+ override def asNonNullable(): UserDefinedFunction = {
if (!nullable) {
this
} else {
- val udf = copyAll()
- udf._nullable = false
- udf
+ copy(nullable = false)
}
}
- /**
- * Updates UserDefinedFunction to nondeterministic.
- *
- * @since 2.3.0
- */
- def asNondeterministic(): UserDefinedFunction = {
- if (!_deterministic) {
+ override def asNondeterministic(): UserDefinedFunction = {
+ if (!deterministic) {
this
} else {
- val udf = copyAll()
- udf._deterministic = false
- udf
+ copy(deterministic = false)
}
}
}
-// We have to use a name different than `UserDefinedFunction` here, to avoid breaking the binary
-// compatibility of the auto-generate UserDefinedFunction object.
private[sql] object SparkUserDefinedFunction {
def create(
@@ -157,8 +149,7 @@ private[sql] object SparkUserDefinedFunction {
} else {
Some(inputSchemas.map(_.get.dataType))
}
- val udf = new UserDefinedFunction(f, dataType, inputTypes)
- udf.nullableTypes = Some(inputSchemas.map(_.map(_.nullable).getOrElse(true)))
- udf
+ val nullableTypes = Some(inputSchemas.map(_.map(_.nullable).getOrElse(true)))
+ SparkUserDefinedFunction(f, dataType, inputTypes, nullableTypes)
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/39617cb2/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index efa8f85..33186f7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4259,7 +4259,7 @@ object functions {
def udf(f: AnyRef, dataType: DataType): UserDefinedFunction = {
// TODO: should call SparkUserDefinedFunction.create() instead but inputSchemas is currently
// unavailable. We may need to create type-safe overloaded versions of udf() methods.
- new UserDefinedFunction(f, dataType, inputTypes = None)
+ SparkUserDefinedFunction(f, dataType, inputTypes = None, nullableTypes = None)
}
/**
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org