You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by br...@apache.org on 2013/07/30 21:48:51 UTC

[3/4] git commit: Kill tasks that never properly launch.

Kill tasks that never properly launch.

After trying to launch a task tracker, we'll wait up to 5 minutes before
giving up and killing the task.

Review: https://reviews.apache.org/r/11124


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/aac37e84
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/aac37e84
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/aac37e84

Branch: refs/heads/master
Commit: aac37e84013cbc15e14d211960f1414c0373c1cf
Parents: 5ef452c
Author: Brenden Matthews <br...@airbnb.com>
Authored: Mon May 13 16:16:12 2013 -0700
Committer: Brenden Matthews <br...@airbnb.com>
Committed: Tue Jul 30 12:48:05 2013 -0700

----------------------------------------------------------------------
 .../apache/hadoop/mapred/MesosScheduler.java    | 41 ++++++++++++++++++--
 1 file changed, 37 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/aac37e84/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java
----------------------------------------------------------------------
diff --git a/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java b/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java
index ca6106b..e4fbb80 100644
--- a/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java
+++ b/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java
@@ -10,6 +10,8 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.Timer;
+import java.util.TimerTask;
 
 import org.apache.commons.httpclient.HttpHost;
 import org.apache.commons.logging.Log;
@@ -71,6 +73,10 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
   private static final int MAP_SLOTS_DEFAULT = 2;
   private static final int REDUCE_SLOTS_DEFAULT = 2;
 
+  // The amount of time to wait for task trackers to launch before
+  // giving up.
+  private static final long LAUNCH_TIMEOUT_MS = 300000; // 5 minutes
+
   // Count of the launched trackers for TaskID generation.
   private long launchedTrackers = 0;
 
@@ -143,6 +149,7 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
                     + mesosTracker.host);
 
                 driver.killTask(mesosTracker.taskId);
+                mesosTracker.timer.cancel(); // Cancel via the MesosTracker; 'tracker' is only the map key.
                 mesosTrackers.remove(tracker);
               }
             }
@@ -276,6 +283,11 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
     LOG.info("Re-registered with master " + masterInfo);
   }
 
+  public synchronized void killTracker(MesosTracker tracker) { // Invoked by the tracker's launch-timeout task.
+    driver.killTask(tracker.taskId); // Ask the driver to kill the task with this tracker's ID.
+    mesosTrackers.remove(tracker.host); // Stop tracking it; mesosTrackers is keyed by host.
+  }
+
   // For some reason, pendingMaps() and pendingReduces() doesn't return the
   // values we expect. We observed negative values, which may be related to
   // https://issues.apache.org/jira/browse/MAPREDUCE-1238. Below is the
@@ -346,6 +358,7 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
         HttpHost host = new HttpHost(status.getHost(), status.getHttpPort());
         if (mesosTrackers.containsKey(host)) {
           mesosTrackers.get(host).active = true;
+          mesosTrackers.get(host).timer.cancel();
           idleMapSlots += status.getAvailableMapSlots();
           idleReduceSlots += status.getAvailableReduceSlots();
         }
@@ -507,7 +520,7 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
 
         // Add this tracker to Mesos tasks.
         mesosTrackers.put(httpAddress, new MesosTracker(httpAddress, taskId,
-            mapSlots, reduceSlots));
+              mapSlots, reduceSlots, this));
 
         // Create the environment depending on whether the executor is going to be
         // run locally.
@@ -697,6 +710,7 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
         for (HttpHost tracker : trackers) {
           if (mesosTrackers.get(tracker).taskId.equals(taskStatus.getTaskId())) {
             LOG.info("Removing terminated TaskTracker: " + tracker);
+            mesosTrackers.get(tracker).timer.cancel(); // 'tracker' is the HttpHost key, so look up the MesosTracker.
             mesosTrackers.remove(tracker);
           }
         }
@@ -746,21 +760,40 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
    * Used to track the our launched TaskTrackers.
    */
   private class MesosTracker {
-    public HttpHost host;
+    public volatile HttpHost host;
     public TaskID taskId;
     public long mapSlots;
     public long reduceSlots;
-    public boolean active = false; // Set once tracked by the JobTracker.
+    public volatile boolean active = false; // Set once tracked by the JobTracker.
+    public Timer timer; // Runs the one-shot launch-timeout task scheduled in the constructor.
+    public volatile MesosScheduler scheduler; // Locked on and called back by the timeout task.
 
     // Tracks Hadoop job tasks running on the tracker.
     public Set<JobID> hadoopJobs = new HashSet<JobID>();
 
     public MesosTracker(HttpHost host, TaskID taskId, long mapSlots,
-        long reduceSlots) {
+        long reduceSlots, MesosScheduler scheduler) {
       this.host = host;
       this.taskId = taskId;
       this.mapSlots = mapSlots;
       this.reduceSlots = reduceSlots;
+      this.scheduler = scheduler;
+
+      this.timer = new Timer(); // Each tracker gets its own Timer (and so its own timer thread).
+      timer.schedule(new TimerTask() {
+        @Override
+        public void run() {
+          synchronized (MesosTracker.this.scheduler) { // Same monitor the synchronized killTracker() uses.
+            // If the tracker became active while we were waiting to acquire
+            // the lock, there is nothing to kill; return.
+            if (MesosTracker.this.active) return;
+
+            LOG.warn("Tracker " + MesosTracker.this.host + " failed to launch within " +
+              LAUNCH_TIMEOUT_MS / 1000 + " seconds, killing it");
+            MesosTracker.this.scheduler.killTracker(MesosTracker.this);
+          }
+        }
+      }, LAUNCH_TIMEOUT_MS); // Delay in milliseconds; the task runs once.
     }
   }
 }