You are viewing a plain text version of this content. The canonical link to the original was not preserved in this plain-text copy.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2006/11/01 19:53:30 UTC

svn commit: r470034 - in /lucene/hadoop/trunk: CHANGES.txt src/java/org/apache/hadoop/ipc/Client.java src/java/org/apache/hadoop/ipc/Server.java src/java/org/apache/hadoop/mapred/JobTracker.java src/java/org/apache/hadoop/mapred/TaskTracker.java

Author: cutting
Date: Wed Nov  1 10:53:30 2006
New Revision: 470034

URL: http://svn.apache.org/viewvc?view=rev&rev=470034
Log:
HADOOP-633.  Keep jobtracker from dying when job initialization throws exceptions.  Contributed by Owen.

Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Nov  1 10:53:30 2006
@@ -119,6 +119,11 @@
 33. HADOOP-664.  Cause entire build to fail if libhdfs tests fail.
     (Nigel Daley via cutting)
 
+34. HADOOP-633.  Keep jobtracker from dying when job initialization
+    throws exceptions.  Also improve exception handling in a few other
+    places and add more informative thread names.
+    (omalley via cutting)
+
 
 Release 0.7.2 - 2006-10-18
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Client.java Wed Nov  1 10:53:30 2006
@@ -122,7 +122,7 @@
          throw new UnknownHostException("unknown host: " + address.getHostName());
       }
       this.address = address;
-      this.setName("Client connection to " + address.toString());
+      this.setName("IPC Client connection to " + address.toString());
       this.setDaemon(true);
     }
 
@@ -421,8 +421,9 @@
 
     Thread t = new ConnectionCuller();
     t.setDaemon(true);
-    t.setName(valueClass.getName()
-              +" ConnectionCuller maxidletime="+maxIdleTime+"ms");
+    t.setName(valueClass.getName() + " Connection Culler");
+    LOG.info(valueClass.getName() + 
+             "Connection culler maxidletime= " + maxIdleTime + "ms");
     t.start();
   }
  

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/ipc/Server.java Wed Nov  1 10:53:30 2006
@@ -158,7 +158,7 @@
 
       // Register accepts on the server socket with the selector.
       acceptChannel.register(selector, SelectionKey.OP_ACCEPT);
-      this.setName("Server listener on port " + port);
+      this.setName("IPC Server listener on " + port);
       this.setDaemon(true);
     }
     /** cleanup connections from connectionList. Choose a random range
@@ -476,7 +476,7 @@
   private class Handler extends Thread {
     public Handler(int instanceNumber) {
       this.setDaemon(true);
-      this.setName("Server handler "+ instanceNumber + " on " + port);
+      this.setName("IPC Server handler "+ instanceNumber + " on " + port);
     }
 
     public void run() {

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java Wed Nov  1 10:53:30 2006
@@ -40,7 +40,6 @@
  * @author Mike Cafarella
  *******************************************************/
 public class JobTracker implements MRConstants, InterTrackerProtocol, JobSubmissionProtocol {
-    static long JOBINIT_SLEEP_INTERVAL = 2000;
     static long RETIRE_JOB_INTERVAL;
     static long RETIRE_JOB_CHECK_INTERVAL;
     static float TASK_ALLOC_EPSILON;
@@ -269,11 +268,9 @@
          */
         public void run() {
             while (shouldRun) {
-                try {
-                    Thread.sleep(RETIRE_JOB_CHECK_INTERVAL);
-                } catch (InterruptedException ie) {
-                }
-                
+              try {
+                Thread.sleep(RETIRE_JOB_CHECK_INTERVAL);
+                 
                 synchronized (jobs) {
                     synchronized (jobsByArrival) {
                         synchronized (jobInitQueue) {
@@ -293,11 +290,14 @@
                         }
                     }
                 }
+              } catch (InterruptedException t) {
+                shouldRun = false;
+              } catch (Throwable t) {
+                LOG.error("Error in retiring job:\n" +
+                          StringUtils.stringifyException(t));
+              }
             }
         }
-        public void stopRetirer() {
-            shouldRun = false;
-        }
     }
 
     /////////////////////////////////////////////////////////////////
@@ -308,31 +308,27 @@
         public JobInitThread() {
         }
         public void run() {
-            while (shouldRun) {
-                JobInProgress job = null;
-                synchronized (jobInitQueue) {
-                    if (jobInitQueue.size() > 0) {
-                        job = (JobInProgress) jobInitQueue.elementAt(0);
-                        jobInitQueue.remove(job);
-                    } else {
-                        try {
-                            jobInitQueue.wait(JOBINIT_SLEEP_INTERVAL);
-                        } catch (InterruptedException iex) {
-                        }
-                    }
-                }
-                try {
-                    if (job != null) {
-                        job.initTasks();
-                    }
-                } catch (Exception e) {
-                    LOG.warn("job init failed", e);
-                    job.kill();
+          JobInProgress job;
+          while (shouldRun) {
+            job = null;
+            try {
+              synchronized (jobInitQueue) {
+                while (jobInitQueue.isEmpty()) {
+                  jobInitQueue.wait();
                 }
+                job = jobInitQueue.remove(0);
+              }
+              job.initTasks();
+            } catch (InterruptedException t) {
+              shouldRun = false;
+            } catch (Throwable t) {
+              LOG.error("Job initialization failed:\n" +
+                        StringUtils.stringifyException(t));
+              if (job != null) {
+                job.kill();
+              }
             }
-        }
-        public void stopIniter() {
-            shouldRun = false;
+          }
         }
     }
 
@@ -430,7 +426,7 @@
     int totalMaps = 0;
     int totalReduces = 0;
     private TreeMap taskTrackers = new TreeMap();
-    Vector jobInitQueue = new Vector();
+    List<JobInProgress> jobInitQueue = new ArrayList();
     ExpireTrackers expireTrackers = new ExpireTrackers();
     Thread expireTrackersThread = null;
     RetireJobs retireJobs = new RetireJobs();
@@ -438,7 +434,8 @@
     JobInitThread initJobs = new JobInitThread();
     Thread initJobsThread = null;
     ExpireLaunchingTasks expireLaunchingTasks = new ExpireLaunchingTasks();
-    Thread expireLaunchingTaskThread = new Thread(expireLaunchingTasks);
+    Thread expireLaunchingTaskThread = new Thread(expireLaunchingTasks,
+                                                  "expireLaunchingTasks");
     
     /**
      * It might seem like a bug to maintain a TreeSet of status objects,
@@ -524,11 +521,12 @@
         this.startTime = System.currentTimeMillis();
 
         myMetrics = new JobTrackerMetrics();
-        this.expireTrackersThread = new Thread(this.expireTrackers);
+        this.expireTrackersThread = new Thread(this.expireTrackers,
+                                               "expireTrackers");
         this.expireTrackersThread.start();
-        this.retireJobsThread = new Thread(this.retireJobs);
+        this.retireJobsThread = new Thread(this.retireJobs, "retireJobs");
         this.retireJobsThread.start();
-        this.initJobsThread = new Thread(this.initJobs);
+        this.initJobsThread = new Thread(this.initJobs, "initJobs");
         this.initJobsThread.start();
         expireLaunchingTaskThread.start();
     }
@@ -582,9 +580,8 @@
         }
         if (this.retireJobs != null) {
             LOG.info("Stopping retirer");
-            this.retireJobs.stopRetirer();
+            this.retireJobsThread.interrupt();
             try {
-                this.retireJobsThread.interrupt();
                 this.retireJobsThread.join();
             } catch (InterruptedException ex) {
                 ex.printStackTrace();
@@ -592,9 +589,8 @@
         }
         if (this.initJobs != null) {
             LOG.info("Stopping initer");
-            this.initJobs.stopIniter();
+            this.initJobsThread.interrupt();
             try {
-                this.initJobsThread.interrupt();
                 this.initJobsThread.join();
             } catch (InterruptedException ex) {
                 ex.printStackTrace();

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java?view=diff&rev=470034&r1=470033&r2=470034
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java Wed Nov  1 10:53:30 2006
@@ -147,7 +147,7 @@
             }
           }
         }
-      });
+      }, "taskCleanup");
     {
       taskCleanupThread.setDaemon(true);
       taskCleanupThread.start();
@@ -356,7 +356,7 @@
         // in parallel, as RPC servers can take a long
         // time to shutdown.  (They need to wait a full
         // RPC timeout, which might be 10-30 seconds.)
-        new Thread() {
+        new Thread("RPC shutdown") {
             public void run() {
                 if (taskReportServer != null) {
                     taskReportServer.stop();