You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by br...@apache.org on 2013/07/30 21:48:51 UTC
[3/4] git commit: Kill tasks that never properly launch.
Kill tasks that never properly launch.
After trying to launch a task tracker, we'll wait up to 5 minutes before
giving up and killing the task.
Review: https://reviews.apache.org/r/11124
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/aac37e84
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/aac37e84
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/aac37e84
Branch: refs/heads/master
Commit: aac37e84013cbc15e14d211960f1414c0373c1cf
Parents: 5ef452c
Author: Brenden Matthews <br...@airbnb.com>
Authored: Mon May 13 16:16:12 2013 -0700
Committer: Brenden Matthews <br...@airbnb.com>
Committed: Tue Jul 30 12:48:05 2013 -0700
----------------------------------------------------------------------
.../apache/hadoop/mapred/MesosScheduler.java | 41 ++++++++++++++++++--
1 file changed, 37 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/aac37e84/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java
----------------------------------------------------------------------
diff --git a/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java b/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java
index ca6106b..e4fbb80 100644
--- a/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java
+++ b/hadoop/mesos/src/java/org/apache/hadoop/mapred/MesosScheduler.java
@@ -10,6 +10,8 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.Timer;
+import java.util.TimerTask;
import org.apache.commons.httpclient.HttpHost;
import org.apache.commons.logging.Log;
@@ -71,6 +73,10 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
private static final int MAP_SLOTS_DEFAULT = 2;
private static final int REDUCE_SLOTS_DEFAULT = 2;
+ // The amount of time to wait for task trackers to launch before
+ // giving up.
+ private static final long LAUNCH_TIMEOUT_MS = 300000; // 5 minutes
+
// Count of the launched trackers for TaskID generation.
private long launchedTrackers = 0;
@@ -143,6 +149,7 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
+ mesosTracker.host);
driver.killTask(mesosTracker.taskId);
+ // The timer belongs to the MesosTracker value, not to the map key that is
+ // passed to remove() below, so cancel it through mesosTracker.
+ mesosTracker.timer.cancel();
mesosTrackers.remove(tracker);
}
}
@@ -276,6 +283,11 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
LOG.info("Re-registered with master " + masterInfo);
}
+ /**
+  * Kills the Mesos task backing the given tracker and stops tracking it.
+  *
+  * @param tracker the tracker to kill; its host is the key removed from
+  *                mesosTrackers.
+  */
+ public synchronized void killTracker(MesosTracker tracker) {
+   driver.killTask(tracker.taskId);
+   // Release the tracker's launch-timeout timer so its thread does not
+   // linger. Timer.cancel() is idempotent and may be called from within
+   // the timer's own task.
+   tracker.timer.cancel();
+   mesosTrackers.remove(tracker.host);
+ }
+
// For some reason, pendingMaps() and pendingReduces() don't return the
// values we expect. We observed negative values, which may be related to
// https://issues.apache.org/jira/browse/MAPREDUCE-1238. Below is the
@@ -346,6 +358,7 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
HttpHost host = new HttpHost(status.getHost(), status.getHttpPort());
if (mesosTrackers.containsKey(host)) {
mesosTrackers.get(host).active = true;
+ mesosTrackers.get(host).timer.cancel();
idleMapSlots += status.getAvailableMapSlots();
idleReduceSlots += status.getAvailableReduceSlots();
}
@@ -507,7 +520,7 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
// Add this tracker to Mesos tasks.
mesosTrackers.put(httpAddress, new MesosTracker(httpAddress, taskId,
- mapSlots, reduceSlots));
+ mapSlots, reduceSlots, this));
// Create the environment depending on whether the executor is going to be
// run locally.
@@ -697,6 +710,7 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
for (HttpHost tracker : trackers) {
  if (mesosTrackers.get(tracker).taskId.equals(taskStatus.getTaskId())) {
    LOG.info("Removing terminated TaskTracker: " + tracker);
+   // 'tracker' is the HttpHost key; the launch-timeout timer lives on the
+   // MesosTracker value, so look it up before cancelling.
+   mesosTrackers.get(tracker).timer.cancel();
    mesosTrackers.remove(tracker);
  }
}
@@ -746,21 +760,40 @@ public class MesosScheduler extends TaskScheduler implements Scheduler {
* Used to track our launched TaskTrackers.
*/
private class MesosTracker {
-   public HttpHost host;
+   public volatile HttpHost host;
    public TaskID taskId;
    public long mapSlots;
    public long reduceSlots;
-   public boolean active = false; // Set once tracked by the JobTracker.
+   public volatile boolean active = false; // Set once tracked by the JobTracker.
+   // One-shot timer that kills this tracker if it has not become active
+   // within LAUNCH_TIMEOUT_MS of construction.
+   public Timer timer;
+   // Scheduler whose monitor guards the timeout check and which performs the kill.
+   public volatile MesosScheduler scheduler;
    // Tracks Hadoop job tasks running on the tracker.
    public Set<JobID> hadoopJobs = new HashSet<JobID>();
    // Records the tracker's identity and slot counts, then arms the launch
    // timeout. The timeout task runs on the timer's own thread.
    public MesosTracker(HttpHost host, TaskID taskId, long mapSlots,
-       long reduceSlots) {
+       long reduceSlots, MesosScheduler scheduler) {
      this.host = host;
      this.taskId = taskId;
      this.mapSlots = mapSlots;
      this.reduceSlots = reduceSlots;
+     this.scheduler = scheduler;
+
+     this.timer = new Timer();
+     timer.schedule(new TimerTask() {
+       @Override
+       public void run() {
+         try {
+           synchronized (MesosTracker.this.scheduler) {
+             // If the tracker activated while we were waiting to acquire
+             // the lock, there is nothing to kill.
+             if (MesosTracker.this.active) return;
+
+             LOG.warn("Tracker " + MesosTracker.this.host + " failed to launch within " +
+                 LAUNCH_TIMEOUT_MS / 1000 + " seconds, killing it");
+             MesosTracker.this.scheduler.killTracker(MesosTracker.this);
+           }
+         } finally {
+           // The timer is one-shot: cancel it so its thread terminates now
+           // instead of lingering until the Timer is garbage collected.
+           MesosTracker.this.timer.cancel();
+         }
+       }
+     }, LAUNCH_TIMEOUT_MS);
    }
}
}