Posted to commits@spark.apache.org by li...@apache.org on 2018/07/25 02:35:39 UTC

spark git commit: [SPARK-24891][SQL] Fix HandleNullInputsForUDF rule

Repository: spark
Updated Branches:
  refs/heads/master 15fff7903 -> c26b09216


[SPARK-24891][SQL] Fix HandleNullInputsForUDF rule

## What changes were proposed in this pull request?

The HandleNullInputsForUDF rule would add a new `If` node every time it was applied, so the same plan analyzed once and analyzed twice (or more) would differ, raising issues such as plan mismatches in the cache manager. The fix is to mark the arguments as already null-checked by wrapping them in a `KnowNotNull` node when the UDF is placed under an `If` node; this is safe because the UDF will not be invoked when any of those arguments is null.
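
To illustrate schematically (plan shapes only, not actual analyzer output): for a UDF over a nullable primitive input `a`, each analysis pass used to add another `If` wrapper, whereas with this fix the first pass leaves nothing for later passes to do:

```
udf(a)                                     -- original plan
If(IsNull(a), null, udf(a))                -- after one pass
If(IsNull(a), null,
   If(IsNull(a), null, udf(a)))            -- after two passes (before this fix)
If(IsNull(a), null, udf(KnowNotNull(a)))   -- after one pass (with this fix);
                                           -- stable under further passes
```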

## How was this patch tested?

Added new tests in AnalysisSuite (sql/catalyst) and UDFSuite (sql/core).

Author: maryannxue <ma...@apache.org>

Closes #21851 from maryannxue/spark-24891.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c26b0921
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c26b0921
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c26b0921

Branch: refs/heads/master
Commit: c26b0921693814f0726507f16b836d82e2e8cfe0
Parents: 15fff79
Author: maryannxue <ma...@apache.org>
Authored: Tue Jul 24 19:35:34 2018 -0700
Committer: Xiao Li <ga...@gmail.com>
Committed: Tue Jul 24 19:35:34 2018 -0700

----------------------------------------------------------------------
 .../spark/sql/catalyst/analysis/Analyzer.scala  | 22 ++++++++----
 .../expressions/constraintExpressions.scala     | 35 ++++++++++++++++++++
 .../sql/catalyst/analysis/AnalysisSuite.scala   | 16 +++++++--
 .../scala/org/apache/spark/sql/UDFSuite.scala   | 31 ++++++++++++++++-
 4 files changed, 94 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/c26b0921/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 866396c..4f474f4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.encoders.OuterScopes
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.SubExprUtils._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
-import org.apache.spark.sql.catalyst.expressions.objects.{LambdaVariable, MapObjects, NewInstance, UnresolvedMapObjects}
+import org.apache.spark.sql.catalyst.expressions.objects._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
@@ -2145,14 +2145,24 @@ class Analyzer(
           val parameterTypes = ScalaReflection.getParameterTypes(func)
           assert(parameterTypes.length == inputs.length)
 
+          // TODO: skip null handling for not-nullable primitive inputs after we can completely
+          // trust the `nullable` information.
+          // (cls, expr) => cls.isPrimitive && expr.nullable
+          val needsNullCheck = (cls: Class[_], expr: Expression) =>
+            cls.isPrimitive && !expr.isInstanceOf[KnowNotNull]
           val inputsNullCheck = parameterTypes.zip(inputs)
-            // TODO: skip null handling for not-nullable primitive inputs after we can completely
-            // trust the `nullable` information.
-            // .filter { case (cls, expr) => cls.isPrimitive && expr.nullable }
-            .filter { case (cls, _) => cls.isPrimitive }
+            .filter { case (cls, expr) => needsNullCheck(cls, expr) }
             .map { case (_, expr) => IsNull(expr) }
             .reduceLeftOption[Expression]((e1, e2) => Or(e1, e2))
-          inputsNullCheck.map(If(_, Literal.create(null, udf.dataType), udf)).getOrElse(udf)
+          // Once we add an `If` check above the udf, it is safe to mark those checked inputs
+          // as not nullable (i.e., wrap them with `KnownNotNull`), because the null-returning
+          // branch of `If` will be called if any of these checked inputs is null. Thus we can
+          // prevent this rule from being applied repeatedly.
+          val newInputs = parameterTypes.zip(inputs).map{ case (cls, expr) =>
+            if (needsNullCheck(cls, expr)) KnowNotNull(expr) else expr }
+          inputsNullCheck
+            .map(If(_, Literal.create(null, udf.dataType), udf.copy(children = newInputs)))
+            .getOrElse(udf)
       }
     }
   }
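
As a standalone illustration of why wrapping the checked inputs makes the rule idempotent, here is a minimal self-contained Scala model (toy types, not Spark's actual `Expression` hierarchy):

```scala
// Toy expression tree with just enough structure to show the fix.
sealed trait Expr
case class Attr(name: String) extends Expr        // a nullable input column
case class KnowNotNull(child: Expr) extends Expr  // marks an already-checked input
case class Udf(children: Seq[Expr]) extends Expr  // a UDF; assume all params primitive
case class IsNull(child: Expr) extends Expr
case class Or(left: Expr, right: Expr) extends Expr
case class If(cond: Expr, ifTrue: Expr, ifFalse: Expr) extends Expr
case object NullLit extends Expr

// Simplified rule: null-check every input not already marked KnowNotNull.
def handleNullInputs(e: Expr): Expr = e match {
  case udf @ Udf(children) =>
    val unchecked: Seq[Expr] = children.filterNot(_.isInstanceOf[KnowNotNull])
    unchecked.map(IsNull(_): Expr).reduceLeftOption(Or(_, _)) match {
      case Some(cond) =>
        // Wrap the checked inputs so a later pass sees nothing left to check.
        If(cond, NullLit, Udf(children.map {
          case k: KnowNotNull => k
          case c              => KnowNotNull(c)
        }))
      case None => udf // all inputs already checked: the rule is a no-op
    }
  case If(c, t, f) => If(c, handleNullInputs(t), handleNullInputs(f))
  case other       => other
}

val once  = handleNullInputs(Udf(Seq(Attr("a"))))
val twice = handleNullInputs(once)
assert(once == twice) // analyzed once == analyzed twice, as the new test verifies
```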

http://git-wip-us.apache.org/repos/asf/spark/blob/c26b0921/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala
new file mode 100644
index 0000000..53936aa
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/constraintExpressions.scala
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, FalseLiteral}
+import org.apache.spark.sql.types.DataType
+
+case class KnowNotNull(child: Expression) extends UnaryExpression {
+  override def nullable: Boolean = false
+  override def dataType: DataType = child.dataType
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    child.genCode(ctx).copy(isNull = FalseLiteral)
+  }
+
+  override def eval(input: InternalRow): Any = {
+    child.eval(input)
+  }
+}
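
Note that `KnowNotNull` asserts non-nullability rather than enforcing it: `eval` simply delegates to the child, and codegen pins `isNull` to `FalseLiteral`, so it is only safe above inputs that are already guarded, here by the `If` node's `IsNull` condition. A small sketch of that contract (not part of the patch):

```scala
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, KnowNotNull}
import org.apache.spark.sql.types.IntegerType

val a = AttributeReference("a", IntegerType)() // nullable = true by default
assert(a.nullable)
assert(!KnowNotNull(a).nullable) // only the static nullability metadata changes
// eval() performs no runtime check: if `a` evaluates to null, KnowNotNull(a)
// also evaluates to null, so correctness depends on the surrounding If guard.
```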

http://git-wip-us.apache.org/repos/asf/spark/blob/c26b0921/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 9e0db8d..31f703d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -316,7 +316,8 @@ class AnalysisSuite extends AnalysisTest with Matchers {
 
     // only primitive parameter needs special null handling
     val udf2 = ScalaUDF((s: String, d: Double) => "x", StringType, string :: double :: Nil)
-    val expected2 = If(IsNull(double), nullResult, udf2)
+    val expected2 =
+      If(IsNull(double), nullResult, udf2.copy(children = string :: KnowNotNull(double) :: Nil))
     checkUDF(udf2, expected2)
 
     // special null handling should apply to all primitive parameters
@@ -324,7 +325,7 @@ class AnalysisSuite extends AnalysisTest with Matchers {
     val expected3 = If(
       IsNull(short) || IsNull(double),
       nullResult,
-      udf3)
+      udf3.copy(children = KnowNotNull(short) :: KnowNotNull(double) :: Nil))
     checkUDF(udf3, expected3)
 
     // we can skip special null handling for primitive parameters that are not nullable
@@ -336,10 +337,19 @@ class AnalysisSuite extends AnalysisTest with Matchers {
     val expected4 = If(
       IsNull(short),
       nullResult,
-      udf4)
+      udf4.copy(children = KnowNotNull(short) :: double.withNullability(false) :: Nil))
     // checkUDF(udf4, expected4)
   }
 
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule") {
+    val a = testRelation.output(0)
+    val func = (x: Int, y: Int) => x + y
+    val udf1 = ScalaUDF(func, IntegerType, a :: a :: Nil)
+    val udf2 = ScalaUDF(func, IntegerType, a :: udf1 :: Nil)
+    val plan = Project(Alias(udf2, "")() :: Nil, testRelation)
+    comparePlans(plan.analyze, plan.analyze.analyze)
+  }
+
   test("SPARK-11863 mixture of aliases and real columns in order by clause - tpcds 19,55,71") {
     val a = testRelation2.output(0)
     val c = testRelation2.output(2)

http://git-wip-us.apache.org/repos/asf/spark/blob/c26b0921/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index 21afdc7..d807457 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql
 import org.apache.spark.sql.api.java._
 import org.apache.spark.sql.catalyst.plans.logical.Project
 import org.apache.spark.sql.execution.command.ExplainCommand
-import org.apache.spark.sql.functions.udf
+import org.apache.spark.sql.functions.{lit, udf}
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.test.SQLTestData._
 import org.apache.spark.sql.types.{DataTypes, DoubleType}
@@ -324,4 +324,33 @@ class UDFSuite extends QueryTest with SharedSQLContext {
       assert(outputStream.toString.contains("UDF:f(a._1 AS `_1`)"))
     }
   }
+
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule") {
+    val udf1 = udf({(x: Int, y: Int) => x + y})
+    val df = spark.range(0, 3).toDF("a")
+      .withColumn("b", udf1($"a", udf1($"a", lit(10))))
+      .withColumn("c", udf1($"a", lit(null)))
+    val plan = spark.sessionState.executePlan(df.logicalPlan).analyzed
+
+    comparePlans(df.logicalPlan, plan)
+    checkAnswer(
+      df,
+      Seq(
+        Row(0, 10, null),
+        Row(1, 12, null),
+        Row(2, 14, null)))
+  }
+
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule - with table") {
+    withTable("x") {
+      Seq((1, "2"), (2, "4")).toDF("a", "b").write.format("json").saveAsTable("x")
+      sql("insert into table x values(3, null)")
+      sql("insert into table x values(null, '4')")
+      spark.udf.register("f", (a: Int, b: String) => a + b)
+      val df = spark.sql("SELECT f(a, b) FROM x")
+      val plan = spark.sessionState.executePlan(df.logicalPlan).analyzed
+      comparePlans(df.logicalPlan, plan)
+      checkAnswer(df, Seq(Row("12"), Row("24"), Row("3null"), Row(null)))
+    }
+  }
 }
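
The expected rows above highlight the asymmetry between primitive and reference parameters: `a` is a primitive `Int`, so a null `a` short-circuits the whole UDF to null via the injected `If` guard, while `b` is a `String`, so a null `b` flows into the function body, where `3 + null` concatenates to "3null". A minimal sketch of the same behavior (assumes a SparkSession with `spark.implicits._` in scope; not part of the patch):

```scala
import org.apache.spark.sql.functions.udf

val f = udf((a: Int, b: String) => a + b)
val df = Seq((Option(3), Option.empty[String]),
             (Option.empty[Int], Option("4"))).toDF("a", "b")
df.select(f($"a", $"b")).collect()
// Row("3null") -- b: String is not primitive; the null reaches the function body
// Row(null)    -- a: Int is primitive; the injected If guard returns null
```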

