You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2013/11/18 03:44:05 UTC
git commit: Merge pull request #173 from kayousterhout/scheduler_hang

Updated Branches:
  refs/heads/branch-0.8 24e238bfc -> 9d563716d


Merge pull request #173 from kayousterhout/scheduler_hang

Fix bug where scheduler could hang after task failure.

When a task fails, we need to call reviveOffers() so that the
task can be rescheduled on a different machine. In the current code,
the state in ClusterTaskSetManager indicating which tasks are
pending may be updated after revive offers is called (there's a
race condition here), so when revive offers is called, the task set
manager does not yet realize that there are failed tasks that need
to be relaunched.

This isn't currently unit tested but will be once my pull request for
merging the cluster and local schedulers goes in -- at which point
many more of the unit tests will exercise the code paths through
the cluster scheduler (currently the failure test suite uses the local
scheduler, which is why we didn't see this bug before).

(cherry picked from commit 96e0fb46309698b685c811a65bd8e1a691389994)
Signed-off-by: Reynold Xin <rx...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spark/commit/9d563716
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spark/tree/9d563716
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spark/diff/9d563716

Branch: refs/heads/branch-0.8
Commit: 9d563716da91c129535b6843244125b69ee4cbcb
Parents: 24e238b
Author: Matei Zaharia <ma...@eecs.berkeley.edu>
Authored: Thu Nov 14 22:29:28 2013 -0800
Committer: Reynold Xin <rx...@apache.org>
Committed: Sun Nov 17 18:43:59 2013 -0800

----------------------------------------------------------------------
 .../spark/scheduler/cluster/ClusterScheduler.scala     | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spark/blob/9d563716/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala
index 8503395..27145cc 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala
@@ -256,7 +256,6 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
 
   def statusUpdate(tid: Long, state: TaskState, serializedData: ByteBuffer) {
     var failedExecutor: Option[String] = None
-    var taskFailed = false
     synchronized {
       try {
         if (state == TaskState.LOST && taskIdToExecutorId.contains(tid)) {
@@ -276,9 +275,6 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
               }
               taskIdToExecutorId.remove(tid)
             }
-            if (state == TaskState.FAILED) {
-              taskFailed = true
-            }
             activeTaskSets.get(taskSetId).foreach { taskSet =>
               if (state == TaskState.FINISHED) {
                 taskSet.removeRunningTask(tid)
@@ -300,10 +296,6 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
       dagScheduler.executorLost(failedExecutor.get)
       backend.reviveOffers()
     }
-    if (taskFailed) {
-      // Also revive offers if a task had failed for some reason other than host lost
-      backend.reviveOffers()
-    }
   }
 
   def handleTaskGettingResult(taskSetManager: ClusterTaskSetManager, tid: Long) {
@@ -323,8 +315,9 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
     taskState: TaskState,
     reason: Option[TaskEndReason]) = synchronized {
     taskSetManager.handleFailedTask(tid, taskState, reason)
-    if (taskState == TaskState.FINISHED) {
-      // The task finished successfully but the result was lost, so we should revive offers.
+    if (taskState != TaskState.KILLED) {
+      // Need to revive offers again now that the task set manager state has been updated to
+      // reflect failed tasks that need to be re-run.
       backend.reviveOffers()
     }
   }