You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by dh...@apache.org on 2008/10/21 08:19:34 UTC
svn commit: r706534 - in /hadoop/core/branches/branch-0.19: CHANGES.txt
src/mapred/org/apache/hadoop/mapred/JobClient.java
src/mapred/org/apache/hadoop/mapred/JobTracker.java
Author: dhruba
Date: Mon Oct 20 23:19:34 2008
New Revision: 706534
URL: http://svn.apache.org/viewvc?rev=706534&view=rev
Log:
HADOOP-4296. Fix job client failures by not retiring a job as soon as it
is finished. (dhruba)
Modified:
hadoop/core/branches/branch-0.19/CHANGES.txt
hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java
hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java
Modified: hadoop/core/branches/branch-0.19/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/CHANGES.txt?rev=706534&r1=706533&r2=706534&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.19/CHANGES.txt Mon Oct 20 23:19:34 2008
@@ -937,6 +937,9 @@
list of jobs to be keyed by the priority, submit time, and job tracker id.
(Amar Kamat via omalley)
+ HADOOP-4296. Fix job client failures by not retiring a job as soon as it
+ is finished. (dhruba)
+
Release 0.18.2 - Unreleased
BUG FIXES
Modified: hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java?rev=706534&r1=706533&r2=706534&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java (original)
+++ hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobClient.java Mon Oct 20 23:19:34 2008
@@ -1141,6 +1141,9 @@
break;
}
running = jc.getJob(jobId);
+ if (running == null) {
+ throw new IOException("Unable to fetch job status from server.");
+ }
String report =
(" map " + StringUtils.formatPercent(running.mapProgress(), 0)+
" reduce " +
Modified: hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java?rev=706534&r1=706533&r2=706534&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java (original)
+++ hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/JobTracker.java Mon Oct 20 23:19:34 2008
@@ -113,6 +113,13 @@
*/
final int MAX_COMPLETE_USER_JOBS_IN_MEMORY;
+ /**
+ * The minimum time (in ms) that a job's information has to remain
+ * in the JobTracker's memory before it is retired.
+ */
+ static final int MIN_TIME_BEFORE_RETIRE = 60000;
+
+
private int nextJobId = 1;
public static final Log LOG = LogFactory.getLog(JobTracker.class);
@@ -343,12 +350,14 @@
try {
Thread.sleep(RETIRE_JOB_CHECK_INTERVAL);
List<JobInProgress> retiredJobs = new ArrayList<JobInProgress>();
- long retireBefore = System.currentTimeMillis() -
- RETIRE_JOB_INTERVAL;
+ long now = System.currentTimeMillis();
+ long retireBefore = now - RETIRE_JOB_INTERVAL;
+
synchronized (jobs) {
for(JobInProgress job: jobs.values()) {
if (job.getStatus().getRunState() != JobStatus.RUNNING &&
job.getStatus().getRunState() != JobStatus.PREP &&
+ (job.getFinishTime() + MIN_TIME_BEFORE_RETIRE < now) &&
(job.getFinishTime() < retireBefore)) {
retiredJobs.add(job);
}
@@ -1524,7 +1533,8 @@
} catch (IOException ioe) {
LOG.info("Failed to finalize the log file recovery for job " + id, ioe);
}
-
+
+ long now = System.currentTimeMillis();
// Purge oldest jobs and keep at-most MAX_COMPLETE_USER_JOBS_IN_MEMORY jobs of a given user
// in memory; information about the purged jobs is available via
@@ -1553,6 +1563,11 @@
if (rjob == job) {
break;
}
+
+ // do not retire jobs that finished in the very recent past.
+ if (rjob.getFinishTime() + MIN_TIME_BEFORE_RETIRE > now) {
+ break;
+ }
// Cleanup all datastructures
int rjobRunState =