You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2006/11/01 19:53:30 UTC
svn commit: r470034 - in /lucene/hadoop/trunk: CHANGES.txt
src/java/org/apache/hadoop/ipc/Client.java
src/java/org/apache/hadoop/ipc/Server.java
src/java/org/apache/hadoop/mapred/JobTracker.java
src/java/org/apache/hadoop/mapred/TaskTracker.java
Author: cutting
Date: Wed Nov 1 10:53:30 2006
New Revision: 470034
URL: http://svn.apache.org/viewvc?view=rev&rev=470034
Log:
HADOOP-633. Keep jobtracker from dying when job initialization throws exceptions. Contributed by Owen.
Modified:
lucene/hadoop/trunk/CHANGES.txt
lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java
Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Nov 1 10:53:30 2006
@@ -119,6 +119,11 @@
33. HADOOP-664. Cause entire build to fail if libhdfs tests fail.
(Nigel Daley via cutting)
+34. HADOOP-633. Keep jobtracker from dying when job initialization
+ throws exceptions. Also improve exception handling in a few other
+ places and add more informative thread names.
+ (omalley via cutting)
+
Release 0.7.2 - 2006-10-18
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java Wed Nov 1 10:53:30 2006
@@ -122,7 +122,7 @@
throw new UnknownHostException("unknown host: " + address.getHostName());
}
this.address = address;
- this.setName("Client connection to " + address.toString());
+ this.setName("IPC Client connection to " + address.toString());
this.setDaemon(true);
}
@@ -421,8 +421,9 @@
Thread t = new ConnectionCuller();
t.setDaemon(true);
- t.setName(valueClass.getName()
- +" ConnectionCuller maxidletime="+maxIdleTime+"ms");
+ t.setName(valueClass.getName() + " Connection Culler");
+ LOG.info(valueClass.getName() +
+ "Connection culler maxidletime= " + maxIdleTime + "ms");
t.start();
}
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java Wed Nov 1 10:53:30 2006
@@ -158,7 +158,7 @@
// Register accepts on the server socket with the selector.
acceptChannel.register(selector, SelectionKey.OP_ACCEPT);
- this.setName("Server listener on port " + port);
+ this.setName("IPC Server listener on " + port);
this.setDaemon(true);
}
/** cleanup connections from connectionList. Choose a random range
@@ -476,7 +476,7 @@
private class Handler extends Thread {
public Handler(int instanceNumber) {
this.setDaemon(true);
- this.setName("Server handler "+ instanceNumber + " on " + port);
+ this.setName("IPC Server handler "+ instanceNumber + " on " + port);
}
public void run() {
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java Wed Nov 1 10:53:30 2006
@@ -40,7 +40,6 @@
* @author Mike Cafarella
*******************************************************/
public class JobTracker implements MRConstants, InterTrackerProtocol, JobSubmissionProtocol {
- static long JOBINIT_SLEEP_INTERVAL = 2000;
static long RETIRE_JOB_INTERVAL;
static long RETIRE_JOB_CHECK_INTERVAL;
static float TASK_ALLOC_EPSILON;
@@ -269,11 +268,9 @@
*/
public void run() {
while (shouldRun) {
- try {
- Thread.sleep(RETIRE_JOB_CHECK_INTERVAL);
- } catch (InterruptedException ie) {
- }
-
+ try {
+ Thread.sleep(RETIRE_JOB_CHECK_INTERVAL);
+
synchronized (jobs) {
synchronized (jobsByArrival) {
synchronized (jobInitQueue) {
@@ -293,11 +290,14 @@
}
}
}
+ } catch (InterruptedException t) {
+ shouldRun = false;
+ } catch (Throwable t) {
+ LOG.error("Error in retiring job:\n" +
+ StringUtils.stringifyException(t));
+ }
}
}
- public void stopRetirer() {
- shouldRun = false;
- }
}
/////////////////////////////////////////////////////////////////
@@ -308,31 +308,27 @@
public JobInitThread() {
}
public void run() {
- while (shouldRun) {
- JobInProgress job = null;
- synchronized (jobInitQueue) {
- if (jobInitQueue.size() > 0) {
- job = (JobInProgress) jobInitQueue.elementAt(0);
- jobInitQueue.remove(job);
- } else {
- try {
- jobInitQueue.wait(JOBINIT_SLEEP_INTERVAL);
- } catch (InterruptedException iex) {
- }
- }
- }
- try {
- if (job != null) {
- job.initTasks();
- }
- } catch (Exception e) {
- LOG.warn("job init failed", e);
- job.kill();
+ JobInProgress job;
+ while (shouldRun) {
+ job = null;
+ try {
+ synchronized (jobInitQueue) {
+ while (jobInitQueue.isEmpty()) {
+ jobInitQueue.wait();
}
+ job = jobInitQueue.remove(0);
+ }
+ job.initTasks();
+ } catch (InterruptedException t) {
+ shouldRun = false;
+ } catch (Throwable t) {
+ LOG.error("Job initialization failed:\n" +
+ StringUtils.stringifyException(t));
+ if (job != null) {
+ job.kill();
+ }
}
- }
- public void stopIniter() {
- shouldRun = false;
+ }
}
}
@@ -430,7 +426,7 @@
int totalMaps = 0;
int totalReduces = 0;
private TreeMap taskTrackers = new TreeMap();
- Vector jobInitQueue = new Vector();
+ List<JobInProgress> jobInitQueue = new ArrayList();
ExpireTrackers expireTrackers = new ExpireTrackers();
Thread expireTrackersThread = null;
RetireJobs retireJobs = new RetireJobs();
@@ -438,7 +434,8 @@
JobInitThread initJobs = new JobInitThread();
Thread initJobsThread = null;
ExpireLaunchingTasks expireLaunchingTasks = new ExpireLaunchingTasks();
- Thread expireLaunchingTaskThread = new Thread(expireLaunchingTasks);
+ Thread expireLaunchingTaskThread = new Thread(expireLaunchingTasks,
+ "expireLaunchingTasks");
/**
* It might seem like a bug to maintain a TreeSet of status objects,
@@ -524,11 +521,12 @@
this.startTime = System.currentTimeMillis();
myMetrics = new JobTrackerMetrics();
- this.expireTrackersThread = new Thread(this.expireTrackers);
+ this.expireTrackersThread = new Thread(this.expireTrackers,
+ "expireTrackers");
this.expireTrackersThread.start();
- this.retireJobsThread = new Thread(this.retireJobs);
+ this.retireJobsThread = new Thread(this.retireJobs, "retireJobs");
this.retireJobsThread.start();
- this.initJobsThread = new Thread(this.initJobs);
+ this.initJobsThread = new Thread(this.initJobs, "initJobs");
this.initJobsThread.start();
expireLaunchingTaskThread.start();
}
@@ -582,9 +580,8 @@
}
if (this.retireJobs != null) {
LOG.info("Stopping retirer");
- this.retireJobs.stopRetirer();
+ this.retireJobsThread.interrupt();
try {
- this.retireJobsThread.interrupt();
this.retireJobsThread.join();
} catch (InterruptedException ex) {
ex.printStackTrace();
@@ -592,9 +589,8 @@
}
if (this.initJobs != null) {
LOG.info("Stopping initer");
- this.initJobs.stopIniter();
+ this.initJobsThread.interrupt();
try {
- this.initJobsThread.interrupt();
this.initJobsThread.join();
} catch (InterruptedException ex) {
ex.printStackTrace();
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java Wed Nov 1 10:53:30 2006
@@ -147,7 +147,7 @@
}
}
}
- });
+ }, "taskCleanup");
{
taskCleanupThread.setDaemon(true);
taskCleanupThread.start();
@@ -356,7 +356,7 @@
// in parallel, as RPC servers can take a long
// time to shutdown. (They need to wait a full
// RPC timeout, which might be 10-30 seconds.)
- new Thread() {
+ new Thread("RPC shutdown") {
public void run() {
if (taskReportServer != null) {
taskReportServer.stop();