You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by yh...@apache.org on 2009/03/02 15:38:52 UTC
svn commit: r749319 - in /hadoop/core/branches/branch-0.20: ./ CHANGES.txt
src/mapred/org/apache/hadoop/mapred/JobTracker.java
src/test/org/apache/hadoop/mapred/TestRecoveryManager.java
Author: yhemanth
Date: Mon Mar 2 14:38:51 2009
New Revision: 749319
URL: http://svn.apache.org/viewvc?rev=749319&view=rev
Log:
Merge -r 749317:749318 from trunk to branch 0.20 to fix HADOOP-4638.
Added:
hadoop/core/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java
- copied unchanged from r749318, hadoop/core/trunk/src/test/org/apache/hadoop/mapred/TestRecoveryManager.java
Modified:
hadoop/core/branches/branch-0.20/ (props changed)
hadoop/core/branches/branch-0.20/CHANGES.txt (contents, props changed)
hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java
Propchange: hadoop/core/branches/branch-0.20/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Mon Mar 2 14:38:51 2009
@@ -1,2 +1,2 @@
/hadoop/core/branches/branch-0.19:713112
-/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746902-746903,746944,746968,746970,747279,747802,748084,748090,748783,749262
+/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746902-746903,746944,746968,746970,747279,747802,748084,748090,748783,749262,749318
Modified: hadoop/core/branches/branch-0.20/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/CHANGES.txt?rev=749319&r1=749318&r2=749319&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.20/CHANGES.txt Mon Mar 2 14:38:51 2009
@@ -682,6 +682,9 @@
HADOOP-5146. Fixes a race condition that causes LocalDirAllocator to miss
files. (Devaraj Das via yhemanth)
+ HADOOP-4638. Fixes job recovery to not crash the job tracker for problems
+ with a single job file. (Amar Kamat via yhemanth)
+
Release 0.19.1 - Unreleased
IMPROVEMENTS
Propchange: hadoop/core/branches/branch-0.20/CHANGES.txt
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Mon Mar 2 14:38:51 2009
@@ -1,3 +1,3 @@
/hadoop/core/branches/branch-0.18/CHANGES.txt:727226
/hadoop/core/branches/branch-0.19/CHANGES.txt:713112
-/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746944,746968,746970,747279,747802,748084,748090,748783,749262
+/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746944,746968,746970,747279,747802,748084,748090,748783,749262,749318
Modified: hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java?rev=749319&r1=749318&r2=749319&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java (original)
+++ hadoop/core/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/JobTracker.java Mon Mar 2 14:38:51 2009
@@ -805,7 +805,14 @@
if (Values.PREP.name().equals(jobStatus)) {
hasUpdates = true;
LOG.info("Calling init from RM for job " + jip.getJobID().toString());
- jip.initTasks();
+ try {
+ jip.initTasks();
+ } catch (IOException ioe) {
+ LOG.error("Job initialization failed : \n"
+ + StringUtils.stringifyException(ioe));
+ jip.fail(); // fail the job
+ throw ioe;
+ }
}
}
@@ -1080,22 +1087,35 @@
public void recover() throws IOException {
// I. Init the jobs and cache the recovered job history filenames
Map<JobID, Path> jobHistoryFilenameMap = new HashMap<JobID, Path>();
- for (JobID id : jobsToRecover) {
+ Iterator<JobID> idIter = jobsToRecover.iterator();
+ while (idIter.hasNext()) {
+ JobID id = idIter.next();
+ LOG.info("Trying to recover job " + id);
// 1. Create the job object
JobInProgress job = new JobInProgress(id, JobTracker.this, conf);
- // 2. Get the log file and the file path
- String logFileName =
- JobHistory.JobInfo.getJobHistoryFileName(job.getJobConf(), id);
- Path jobHistoryFilePath =
- JobHistory.JobInfo.getJobHistoryLogLocation(logFileName);
-
- // 3. Recover the history file. This involved
- // - deleting file.recover if file exists
- // - renaming file.recover to file if file doesnt exist
- // This makes sure that the (master) file exists
- JobHistory.JobInfo.recoverJobHistoryFile(job.getJobConf(),
- jobHistoryFilePath);
+ String logFileName;
+ Path jobHistoryFilePath;
+ try {
+ // 2. Get the log file and the file path
+ logFileName =
+ JobHistory.JobInfo.getJobHistoryFileName(job.getJobConf(), id);
+ jobHistoryFilePath =
+ JobHistory.JobInfo.getJobHistoryLogLocation(logFileName);
+
+ // 3. Recover the history file. This involves
+ // - deleting file.recover if file exists
+ // - renaming file.recover to file if file doesn't exist
+ // This makes sure that the (master) file exists
+ JobHistory.JobInfo.recoverJobHistoryFile(job.getJobConf(),
+ jobHistoryFilePath);
+ } catch (IOException ioe) {
+ LOG.warn("Failed to recover job " + id + " history filename."
+ + " Ignoring.", ioe);
+ // TODO : remove job details from the system directory
+ idIter.remove();
+ continue;
+ }
// 4. Cache the history file name as it costs one dfs access
jobHistoryFilenameMap.put(job.getJobID(), jobHistoryFilePath);
@@ -1124,13 +1144,14 @@
JobHistory.parseHistoryFromFS(jobHistoryFilePath.toString(),
listener, fs);
} catch (IOException e) {
- LOG.info("JobTracker failed to recover job " + pJob + "."
- + " Ignoring it.", e);
- continue;
+ LOG.info("JobTracker failed to recover job " + pJob.getJobID() + "."
+ + " Ignoring it.", e);
}
// 3. Close the listener
listener.close();
+
+ LOG.info("Restart count for job " + id + " is " + pJob.numRestarts());
// 4. Update the recovery metric
totalEventsRecovered += listener.getNumEventsRecovered();
@@ -1138,9 +1159,14 @@
// 5. Cleanup history
// Delete the master log file as an indication that the new file
// should be used in future
- synchronized (pJob) {
- JobHistory.JobInfo.checkpointRecovery(logFileName,
- pJob.getJobConf());
+ try {
+ synchronized (pJob) {
+ JobHistory.JobInfo.checkpointRecovery(logFileName,
+ pJob.getJobConf());
+ }
+ } catch (IOException ioe) {
+ LOG.warn("Failed to delete log file (" + logFileName + ") for job "
+ + id + ". Ignoring it.", ioe);
}
// 6. Inform the jobtracker as to how much of the data is recovered.
@@ -2733,8 +2759,7 @@
* adding a job. This is the core job submission logic
* @param jobId The id for the job submitted which needs to be added
*/
- private synchronized JobStatus addJob(JobID jobId, JobInProgress job)
- throws IOException {
+ private synchronized JobStatus addJob(JobID jobId, JobInProgress job) {
totalSubmissions++;
synchronized (jobs) {
@@ -3152,6 +3177,11 @@
JobInProgress job = getJob(taskId.getJobID());
if (job == null) {
+ // if job is not there in the cleanup list ... add it
+ synchronized (trackerToJobsToCleanup) {
+ Set<JobID> jobs = trackerToJobsToCleanup.get(trackerName);
+ jobs.add(taskId.getJobID());
+ }
continue;
}