You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by dh...@apache.org on 2008/10/21 08:19:34 UTC

svn commit: r706534 - in /hadoop/core/branches/branch-0.19: CHANGES.txt src/mapred/org/apache/hadoop/mapred/JobClient.java src/mapred/org/apache/hadoop/mapred/JobTracker.java

Author: dhruba
Date: Mon Oct 20 23:19:34 2008
New Revision: 706534

URL: http://svn.apache.org/viewvc?rev=706534&view=rev
Log:
HADOOP-4296. Fix job client failures by not retiring a job as soon as it
is finished. (dhruba)


Modified:
    hadoop/core/branches/branch-0.19/CHANGES.txt
    hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java
    hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java

Modified: hadoop/core/branches/branch-0.19/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/CHANGES.txt?rev=706534&r1=706533&r2=706534&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.19/CHANGES.txt Mon Oct 20 23:19:34 2008
@@ -937,6 +937,9 @@
     list of jobs to be keyed by the priority, submit time, and job tracker id.
     (Amar Kamat via omalley)
 
+    HADOOP-4296. Fix job client failures by not retiring a job as soon as it
+    is finished. (dhruba)
+
 Release 0.18.2 - Unreleased
 
   BUG FIXES

Modified: hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java?rev=706534&r1=706533&r2=706534&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java (original)
+++ hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java Mon Oct 20 23:19:34 2008
@@ -1141,6 +1141,9 @@
             break;
           }
           running = jc.getJob(jobId);
+          if (running == null) {
+            throw new IOException("Unable to fetch job status from server.");
+          }
           String report = 
             (" map " + StringUtils.formatPercent(running.mapProgress(), 0)+
              " reduce " + 

Modified: hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java?rev=706534&r1=706533&r2=706534&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java (original)
+++ hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java Mon Oct 20 23:19:34 2008
@@ -113,6 +113,13 @@
    */
   final int MAX_COMPLETE_USER_JOBS_IN_MEMORY;
 
+   /**
+    * The minimum time (in ms) that a job's information has to remain
+    * in the JobTracker's memory before it is retired.
+    */
+  static final int MIN_TIME_BEFORE_RETIRE = 60000;
+
+
   private int nextJobId = 1;
 
   public static final Log LOG = LogFactory.getLog(JobTracker.class);
@@ -343,12 +350,14 @@
         try {
           Thread.sleep(RETIRE_JOB_CHECK_INTERVAL);
           List<JobInProgress> retiredJobs = new ArrayList<JobInProgress>();
-          long retireBefore = System.currentTimeMillis() - 
-            RETIRE_JOB_INTERVAL;
+          long now = System.currentTimeMillis();
+          long retireBefore = now - RETIRE_JOB_INTERVAL;
+
           synchronized (jobs) {
             for(JobInProgress job: jobs.values()) {
               if (job.getStatus().getRunState() != JobStatus.RUNNING &&
                   job.getStatus().getRunState() != JobStatus.PREP &&
+                  (job.getFinishTime() + MIN_TIME_BEFORE_RETIRE < now) &&
                   (job.getFinishTime()  < retireBefore)) {
                 retiredJobs.add(job);
               }
@@ -1524,7 +1533,8 @@
     } catch (IOException ioe) {
       LOG.info("Failed to finalize the log file recovery for job " + id, ioe);
     }
-    
+
+    long now = System.currentTimeMillis();
     
     // Purge oldest jobs and keep at-most MAX_COMPLETE_USER_JOBS_IN_MEMORY jobs of a given user
     // in memory; information about the purged jobs is available via
@@ -1553,6 +1563,11 @@
               if (rjob == job) {
                 break;
               }
+
+              // do not retire jobs that finished in the very recent past.
+              if (rjob.getFinishTime() + MIN_TIME_BEFORE_RETIRE > now) {
+                break;
+              }
                 
               // Cleanup all datastructures
               int rjobRunState =