You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/12/23 22:49:11 UTC
[spark] branch branch-3.0 updated: [SPARK-33277][PYSPARK][SQL] Use
ContextAwareIterator to stop consuming after the task ends
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 83adba7 [SPARK-33277][PYSPARK][SQL] Use ContextAwareIterator to stop consuming after the task ends
83adba7 is described below
commit 83adba71febcd850564e1342ebf4562ffc059c13
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Wed Dec 23 14:48:01 2020 -0800
[SPARK-33277][PYSPARK][SQL] Use ContextAwareIterator to stop consuming after the task ends
### What changes were proposed in this pull request?
This is a retry of #30177.
This is not a complete fix, but it would take long time to complete (#30242).
As discussed offline, at least using `ContextAwareIterator` should be helpful enough for many cases.
As the Python evaluation consumes the parent iterator in a separate thread, it could consume more data from the parent even after the task ends and the parent is closed. Thus, we should use `ContextAwareIterator` to stop consuming after the task ends.
### Why are the changes needed?
Python/Pandas UDF right after off-heap vectorized reader could cause executor crash.
E.g.,:
```py
spark.range(0, 100000, 1, 1).write.parquet(path)
spark.conf.set("spark.sql.columnVector.offheap.enabled", True)
def f(x):
return 0
fUdf = udf(f, LongType())
spark.read.parquet(path).select(fUdf('id')).head()
```
This is because, the Python evaluation consumes the parent iterator in a separate thread and it consumes more data from the parent even after the task ends and the parent is closed. If an off-heap column vector exists in the parent iterator, it could cause segmentation fault which crashes the executor.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Added tests, and manually.
Closes #30899 from ueshin/issues/SPARK-33277/context_aware_iterator.
Authored-by: Takuya UESHIN <ue...@databricks.com>
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
(cherry picked from commit 5c9b421c3711ba373b4d5cbbd83a8ece91291ed0)
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
.../org/apache/spark/ContextAwareIterator.scala | 40 ++++++++++++++++++++++
.../sql/execution/python/EvalPythonExec.scala | 5 +--
.../sql/execution/python/MapInPandasExec.scala | 9 ++---
3 files changed, 48 insertions(+), 6 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala b/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala
new file mode 100644
index 0000000..c4d0dd8
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import org.apache.spark.annotation.DeveloperApi
+
+/**
+ * :: DeveloperApi ::
+ * A TaskContext aware iterator.
+ *
+ * As the Python evaluation consumes the parent iterator in a separate thread,
+ * it could consume more data from the parent even after the task ends and the parent is closed.
+ * If an off-heap access exists in the parent iterator, it could cause segmentation fault
+ * which crashes the executor.
+ * Thus, we should use [[ContextAwareIterator]] to stop consuming after the task ends.
+ */
+@DeveloperApi
+class ContextAwareIterator[+T](val context: TaskContext, val delegate: Iterator[T])
+ extends Iterator[T] {
+
+ override def hasNext: Boolean =
+ !context.isCompleted() && !context.isInterrupted() && delegate.hasNext
+
+ override def next(): T = delegate.next()
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
index a0f23e9..57ccdb1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala
@@ -21,7 +21,7 @@ import java.io.File
import scala.collection.mutable.ArrayBuffer
-import org.apache.spark.{SparkEnv, TaskContext}
+import org.apache.spark.{ContextAwareIterator, SparkEnv, TaskContext}
import org.apache.spark.api.python.ChainedPythonFunctions
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
@@ -88,6 +88,7 @@ abstract class EvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute],
inputRDD.mapPartitions { iter =>
val context = TaskContext.get()
+ val contextAwareIterator = new ContextAwareIterator(context, iter)
// The queue used to buffer input rows so we can drain it to
// combine input with output from Python.
@@ -119,7 +120,7 @@ abstract class EvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute],
})
// Add rows to queue to join later with the result.
- val projectedRowIter = iter.map { inputRow =>
+ val projectedRowIter = contextAwareIterator.map { inputRow =>
queue.add(inputRow.asInstanceOf[UnsafeRow])
projection(inputRow)
}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
index 2bb8081..71f51f1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.python
import scala.collection.JavaConverters._
-import org.apache.spark.TaskContext
+import org.apache.spark.{ContextAwareIterator, TaskContext}
import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
@@ -61,16 +61,17 @@ case class MapInPandasExec(
val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)
val outputTypes = child.schema
+ val context = TaskContext.get()
+ val contextAwareIterator = new ContextAwareIterator(context, inputIter)
+
// Here we wrap it via another row so that Python sides understand it
// as a DataFrame.
- val wrappedIter = inputIter.map(InternalRow(_))
+ val wrappedIter = contextAwareIterator.map(InternalRow(_))
// DO NOT use iter.grouped(). See BatchIterator.
val batchIter =
if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter)
- val context = TaskContext.get()
-
val columnarBatchIter = new ArrowPythonRunner(
chainedFunc,
PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org