You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by yh...@apache.org on 2009/03/02 15:38:52 UTC

svn commit: r749319 - in /hadoop/core/branches/branch-0.20: ./ CHANGES.txt src/mapred/org/apache/hadoop/mapred/JobTracker.java src/test/org/apache/hadoop/mapred/TestRecoveryManager.java

Author: yhemanth
Date: Mon Mar  2 14:38:51 2009
New Revision: 749319

URL: http://svn.apache.org/viewvc?rev=749319&view=rev
Log:
Merge -r 749317:749318 from trunk to branch 0.20 to fix HADOOP-4638.

Added:
    hadoop/core/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java
      - copied unchanged from r749318, hadoop/core/trunk/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java
Modified:
    hadoop/core/branches/branch-0.20/   (props changed)
    hadoop/core/branches/branch-0.20/CHANGES.txt   (contents, props changed)
    hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java

Propchange: hadoop/core/branches/branch-0.20/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Mon Mar  2 14:38:51 2009
@@ -1,2 +1,2 @@
 /hadoop/core/branches/branch-0.19:713112
-/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746902-746903,746944,746968,746970,747279,747802,748084,748090,748783,749262
+/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746902-746903,746944,746968,746970,747279,747802,748084,748090,748783,749262,749318

Modified: hadoop/core/branches/branch-0.20/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/CHANGES.txt?rev=749319&r1=749318&r2=749319&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.20/CHANGES.txt Mon Mar  2 14:38:51 2009
@@ -682,6 +682,9 @@
     HADOOP-5146. Fixes a race condition that causes LocalDirAllocator to miss
     files.  (Devaraj Das via yhemanth)
 
+    HADOOP-4638. Fixes job recovery to not crash the job tracker for problems
+    with a single job file. (Amar Kamat via yhemanth)
+
 Release 0.19.1 - Unreleased
 
   IMPROVEMENTS

Propchange: hadoop/core/branches/branch-0.20/CHANGES.txt
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Mon Mar  2 14:38:51 2009
@@ -1,3 +1,3 @@
 /hadoop/core/branches/branch-0.18/CHANGES.txt:727226
 /hadoop/core/branches/branch-0.19/CHANGES.txt:713112
-/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746944,746968,746970,747279,747802,748084,748090,748783,749262
+/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746944,746968,746970,747279,747802,748084,748090,748783,749262,749318

Modified: hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java?rev=749319&r1=749318&r2=749319&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java (original)
+++ hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java Mon Mar  2 14:38:51 2009
@@ -805,7 +805,14 @@
         if (Values.PREP.name().equals(jobStatus)) {
           hasUpdates = true;
           LOG.info("Calling init from RM for job " + jip.getJobID().toString());
-          jip.initTasks();
+          try {
+            jip.initTasks();
+          } catch (IOException ioe) {
+            LOG.error("Job initialization failed : \n" 
+                      + StringUtils.stringifyException(ioe));
+            jip.fail(); // fail the job
+            throw ioe;
+          }
         }
       }
       
@@ -1080,22 +1087,35 @@
     public void recover() throws IOException {
       // I. Init the jobs and cache the recovered job history filenames
       Map<JobID, Path> jobHistoryFilenameMap = new HashMap<JobID, Path>();
-      for (JobID id : jobsToRecover) {
+      Iterator<JobID> idIter = jobsToRecover.iterator();
+      while (idIter.hasNext()) {
+        JobID id = idIter.next();
+        LOG.info("Trying to recover job " + id);
         // 1. Create the job object
         JobInProgress job = new JobInProgress(id, JobTracker.this, conf);
         
-        // 2. Get the log file and the file path
-        String logFileName = 
-          JobHistory.JobInfo.getJobHistoryFileName(job.getJobConf(), id);
-        Path jobHistoryFilePath = 
-          JobHistory.JobInfo.getJobHistoryLogLocation(logFileName);
-        
-        // 3. Recover the history file. This involved
-        //     - deleting file.recover if file exists
-        //     - renaming file.recover to file if file doesnt exist
-        // This makes sure that the (master) file exists
-        JobHistory.JobInfo.recoverJobHistoryFile(job.getJobConf(), 
-                                                 jobHistoryFilePath);
+        String logFileName;
+        Path jobHistoryFilePath;
+        try {
+          // 2. Get the log file and the file path
+          logFileName = 
+            JobHistory.JobInfo.getJobHistoryFileName(job.getJobConf(), id);
+          jobHistoryFilePath = 
+            JobHistory.JobInfo.getJobHistoryLogLocation(logFileName);
+
+          // 3. Recover the history file. This involves
+          //     - deleting file.recover if file exists
+          //     - renaming file.recover to file if file doesn't exist
+          // This makes sure that the (master) file exists
+          JobHistory.JobInfo.recoverJobHistoryFile(job.getJobConf(), 
+                                                   jobHistoryFilePath);
+        } catch (IOException ioe) {
+          LOG.warn("Failed to recover job " + id + " history filename." 
+                   + " Ignoring.", ioe);
+          // TODO : remove job details from the system directory
+          idIter.remove();
+          continue;
+        }
 
         // 4. Cache the history file name as it costs one dfs access
         jobHistoryFilenameMap.put(job.getJobID(), jobHistoryFilePath);
@@ -1124,13 +1144,14 @@
           JobHistory.parseHistoryFromFS(jobHistoryFilePath.toString(), 
                                         listener, fs);
         } catch (IOException e) {
-          LOG.info("JobTracker failed to recover job " + pJob + "."
-                   + " Ignoring it.", e);
-          continue;
+          LOG.info("JobTracker failed to recover job " + pJob.getJobID() + "."
+                     + " Ignoring it.", e);
         }
 
         // 3. Close the listener
         listener.close();
+        
+        LOG.info("Restart count for job " + id + " is " + pJob.numRestarts());
 
         // 4. Update the recovery metric
         totalEventsRecovered += listener.getNumEventsRecovered();
@@ -1138,9 +1159,14 @@
         // 5. Cleanup history
         // Delete the master log file as an indication that the new file
         // should be used in future
-        synchronized (pJob) {
-          JobHistory.JobInfo.checkpointRecovery(logFileName, 
-              pJob.getJobConf());
+        try {
+          synchronized (pJob) {
+            JobHistory.JobInfo.checkpointRecovery(logFileName, 
+                                                  pJob.getJobConf());
+          }
+        } catch (IOException ioe) {
+          LOG.warn("Failed to delete log file (" + logFileName + ") for job " 
+                   + id + ". Ignoring it.", ioe);
         }
 
         // 6. Inform the jobtracker as to how much of the data is recovered.
@@ -2733,8 +2759,7 @@
    * adding a job. This is the core job submission logic
    * @param jobId The id for the job submitted which needs to be added
    */
-  private synchronized JobStatus addJob(JobID jobId, JobInProgress job) 
-  throws IOException {
+  private synchronized JobStatus addJob(JobID jobId, JobInProgress job) {
     totalSubmissions++;
 
     synchronized (jobs) {
@@ -3152,6 +3177,11 @@
       
       JobInProgress job = getJob(taskId.getJobID());
       if (job == null) {
+        // if job is not there in the cleanup list ... add it
+        synchronized (trackerToJobsToCleanup) {
+          Set<JobID> jobs = trackerToJobsToCleanup.get(trackerName);
+          jobs.add(taskId.getJobID());
+        }
         continue;
       }