Posted to commits@spark.apache.org by jo...@apache.org on 2015/01/30 02:28:54 UTC

spark git commit: [SPARK-5395] [PySpark] fix python process leak while coalesce()

Repository: spark
Updated Branches:
  refs/heads/master ce9c43ba8 -> 5c746eedd


[SPARK-5395] [PySpark] fix python process leak while coalesce()

Currently, the Python worker process is released into the pool only after the task has finished, which causes many processes to be forked when coalesce() is called.

This PR changes it to release the process as soon as all of its output has been read (i.e., the partition is finished), so a single process can be reused across multiple partitions within one task.
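
As an illustration, here is a minimal, self-contained sketch of the release-early pattern, assuming a trivial idle pool; the Worker class and helper names are hypothetical stand-ins, not Spark's actual API (the real change is in PythonRDD.scala, shown in the diff below):

    import java.util.concurrent.ConcurrentLinkedQueue

    object ReleaseEarlySketch {
      final class Worker {
        def close(): Unit = println("worker closed")
      }

      private val idlePool = new ConcurrentLinkedQueue[Worker]()

      def createWorker(): Worker = Option(idlePool.poll()).getOrElse(new Worker)
      def releaseWorker(w: Worker): Unit = idlePool.add(w)

      // One partition's lifecycle; `reuseWorker` mirrors SPARK_REUSE_WORKER.
      def computePartition(reuseWorker: Boolean): Unit = {
        val worker = createWorker()
        // Set by the code draining the worker's output; read by the
        // completion callback, which may run on another thread.
        @volatile var released = false

        val onTaskCompletion: () => Unit = () => {
          // Fallback: destroy the worker only if it was never handed back.
          if (!reuseWorker || !released) worker.close()
        }

        // ... drain the partition's output from the worker here ...
        val sawEndOfStream = true // stands in for readInt() == END_OF_STREAM
        if (sawEndOfStream && reuseWorker) {
          // Key change: release as soon as this partition is fully read,
          // so the next partition in the same task reuses this worker
          // instead of forking a fresh Python process.
          releaseWorker(worker)
          released = true
        }

        onTaskCompletion()
      }

      def main(args: Array[String]): Unit = {
        computePartition(reuseWorker = true)  // worker returned to pool
        computePartition(reuseWorker = false) // worker closed at task end
      }
    }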

Author: Davies Liu <da...@databricks.com>

Closes #4238 from davies/py_leak and squashes the following commits:

ec80a43 [Davies Liu] add @volatile
6da437a [Davies Liu] address comments
24ed322 [Davies Liu] fix python process leak while coalesce()


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c746eed
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c746eed
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c746eed

Branch: refs/heads/master
Commit: 5c746eedda8cff2fc1692cf6dce376f4b0ca6fac
Parents: ce9c43b
Author: Davies Liu <da...@databricks.com>
Authored: Thu Jan 29 17:28:37 2015 -0800
Committer: Josh Rosen <jo...@databricks.com>
Committed: Thu Jan 29 17:28:37 2015 -0800

----------------------------------------------------------------------
 .../scala/org/apache/spark/api/python/PythonRDD.scala  | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/5c746eed/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 4ac666c..119e045 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -67,17 +67,16 @@ private[spark] class PythonRDD(
       envVars += ("SPARK_REUSE_WORKER" -> "1")
     }
     val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap)
+    // Whether the worker has been released into the idle pool
+    @volatile var released = false
 
     // Start a thread to feed the process input from our parent's iterator
     val writerThread = new WriterThread(env, worker, split, context)
 
-    var complete_cleanly = false
     context.addTaskCompletionListener { context =>
       writerThread.shutdownOnTaskCompletion()
       writerThread.join()
-      if (reuse_worker && complete_cleanly) {
-        env.releasePythonWorker(pythonExec, envVars.toMap, worker)
-      } else {
+      if (!reuse_worker || !released) {
         try {
           worker.close()
         } catch {
@@ -145,8 +144,12 @@ private[spark] class PythonRDD(
                 stream.readFully(update)
                 accumulator += Collections.singletonList(update)
               }
+              // Check whether the worker is ready to be re-used.
               if (stream.readInt() == SpecialLengths.END_OF_STREAM) {
-                complete_cleanly = true
+                if (reuse_worker) {
+                  env.releasePythonWorker(pythonExec, envVars.toMap, worker)
+                  released = true
+                }
               }
               null
           }
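
The @volatile added in ec80a43 matters here because `released` is written by the thread draining the worker's output but read in the task-completion listener, which may run on a different thread; presumably without it, a stale read could let the listener close a worker that had already been returned to the pool.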

