You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sh...@apache.org on 2009/07/20 10:54:31 UTC
svn commit: r795722 - in /hadoop/common/branches/branch-0.20: ./
src/mapred/org/apache/hadoop/mapred/ src/test/org/apache/hadoop/mapred/
Author: sharad
Date: Mon Jul 20 08:54:30 2009
New Revision: 795722
URL: http://svn.apache.org/viewvc?rev=795722&view=rev
Log:
MAPREDUCE-430. Fix bug related to Task getting stuck due to OutOfMemoryErrors. Contributed by Amar Kamat.
Modified:
hadoop/common/branches/branch-0.20/CHANGES.txt
hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/Child.java
hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/IsolationRunner.java
hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/LocalJobRunner.java
hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/ReduceTask.java
hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskRunner.java
hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskTracker.java
hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java
hadoop/common/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestTaskFail.java
Modified: hadoop/common/branches/branch-0.20/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/CHANGES.txt?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.20/CHANGES.txt Mon Jul 20 08:54:30 2009
@@ -184,6 +184,9 @@
MAPREDUCE-18. Puts some checks to detect cases where jetty serves up
incorrect output during shuffle. (Ravi Gummadi via ddas)
+ MAPREDUCE-430. Fix bug related to Task getting stuck due to
+ OutOfMemoryErrors. (Amar Kamat via sharad)
+
Release 0.20.0 - 2009-04-15
INCOMPATIBLE CHANGES
Modified: hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/Child.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/Child.java?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/Child.java (original)
+++ hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/Child.java Mon Jul 20 08:54:30 2009
@@ -180,9 +180,13 @@
break;
}
}
- } catch (FSError e) {
- LOG.fatal("FSError from child", e);
- umbilical.fsError(taskid, e.getMessage());
+ } catch (Error e) {
+ String error = "Error";
+ if (e instanceof FSError) {
+ error = "FSError";
+ }
+ LOG.fatal(error + " from child", e);
+ umbilical.taskError(taskid, e.getMessage());
} catch (Throwable throwable) {
LOG.warn("Error running child", throwable);
try {
Modified: hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/IsolationRunner.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/IsolationRunner.java?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/IsolationRunner.java (original)
+++ hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/IsolationRunner.java Mon Jul 20 08:54:30 2009
@@ -51,12 +51,9 @@
LOG.info("Task " + taskid + " reporting done.");
}
- public void fsError(TaskAttemptID taskId, String message) throws IOException {
- LOG.info("Task " + taskId + " reporting file system error: " + message);
- }
-
- public void shuffleError(TaskAttemptID taskId, String message) throws IOException {
- LOG.info("Task " + taskId + " reporting shuffle error: " + message);
+ public void taskError(TaskAttemptID taskId, String message)
+ throws IOException {
+ LOG.info("Task " + taskId + " reporting task error: " + message);
}
public JvmTask getTask(JVMId jvmId) throws IOException {
Modified: hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/LocalJobRunner.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/LocalJobRunner.java?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/LocalJobRunner.java (original)
+++ hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/LocalJobRunner.java Mon Jul 20 08:54:30 2009
@@ -333,13 +333,9 @@
}
}
- public synchronized void fsError(TaskAttemptID taskId, String message)
+ public void taskError(TaskAttemptID taskId, String message)
throws IOException {
- LOG.fatal("FSError: "+ message + "from task: " + taskId);
- }
-
- public void shuffleError(TaskAttemptID taskId, String message) throws IOException {
- LOG.fatal("shuffleError: "+ message + "from task: " + taskId);
+ LOG.fatal("Error: "+ message + "from task: " + taskId);
}
public MapTaskCompletionEventsUpdate getMapCompletionEvents(JobID jobId,
Modified: hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/ReduceTask.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/ReduceTask.java?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/ReduceTask.java (original)
+++ hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/ReduceTask.java Mon Jul 20 08:54:30 2009
@@ -376,8 +376,8 @@
if(reduceCopier.mergeThrowable instanceof FSError) {
LOG.error("Task: " + getTaskID() + " - FSError: " +
StringUtils.stringifyException(reduceCopier.mergeThrowable));
- umbilical.fsError(getTaskID(),
- reduceCopier.mergeThrowable.getMessage());
+ umbilical.taskError(getTaskID(),
+ "(FSError) " + reduceCopier.mergeThrowable.getMessage());
}
throw new IOException("Task: " + getTaskID() +
" - The reduce copier failed", reduceCopier.mergeThrowable);
@@ -1214,7 +1214,8 @@
LOG.error("Task: " + reduceTask.getTaskID() + " - FSError: " +
StringUtils.stringifyException(e));
try {
- umbilical.fsError(reduceTask.getTaskID(), e.getMessage());
+ umbilical.taskError(reduceTask.getTaskID(), "(FSError) "
+ + e.getMessage());
} catch (IOException io) {
LOG.error("Could not notify TT of FSError: " +
StringUtils.stringifyException(io));
@@ -2091,9 +2092,9 @@
LOG.fatal("Shuffle failed with too many fetch failures " +
"and insufficient progress!" +
"Killing task " + getTaskID() + ".");
- umbilical.shuffleError(getTaskID(),
- "Exceeded MAX_FAILED_UNIQUE_FETCHES;"
- + " bailing-out.");
+ umbilical.taskError(getTaskID(), "(Shuffle Error) "
+ + "Exceeded MAX_FAILED_UNIQUE_FETCHES;"
+ + " bailing-out.");
}
}
Modified: hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskRunner.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskRunner.java?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskRunner.java (original)
+++ hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskRunner.java Mon Jul 20 08:54:30 2009
@@ -419,12 +419,16 @@
exitCode + ".");
}
}
- } catch (FSError e) {
- LOG.fatal("FSError", e);
+ } catch (Error e) {
+ String error = "Error";
+ if (e instanceof FSError) {
+ error = "FSError";
+ }
+ LOG.fatal(error, e);
try {
- tracker.fsError(t.getTaskID(), e.getMessage());
+ tracker.taskError(t.getTaskID(), e.getMessage());
} catch (IOException ie) {
- LOG.fatal(t.getTaskID()+" reporting FSError", ie);
+ LOG.fatal(t.getTaskID()+" reporting " + error, ie);
}
} catch (Throwable throwable) {
LOG.warn(t.getTaskID()+" Child Error", throwable);
Modified: hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskTracker.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskTracker.java?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskTracker.java (original)
+++ hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskTracker.java Mon Jul 20 08:54:30 2009
@@ -2570,24 +2570,13 @@
/**
- * A reduce-task failed to shuffle the map-outputs. Kill the task.
- */
- public synchronized void shuffleError(TaskAttemptID taskId, String message)
- throws IOException {
- LOG.fatal("Task: " + taskId + " - Killed due to Shuffle Failure: " + message);
- TaskInProgress tip = runningTasks.get(taskId);
- tip.reportDiagnosticInfo("Shuffle Error: " + message);
- purgeTask(tip, true);
- }
-
- /**
* A child task had a local filesystem error. Kill the task.
*/
- public synchronized void fsError(TaskAttemptID taskId, String message)
+ public synchronized void taskError(TaskAttemptID taskId, String message)
throws IOException {
- LOG.fatal("Task: " + taskId + " - Killed due to FSError: " + message);
+ LOG.fatal("Task: " + taskId + " - Killed due to : " + message);
TaskInProgress tip = runningTasks.get(taskId);
- tip.reportDiagnosticInfo("FSError: " + message);
+ tip.reportDiagnosticInfo(message);
purgeTask(tip, true);
}
Modified: hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java (original)
+++ hadoop/common/branches/branch-0.20/src/mapred/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java Mon Jul 20 08:54:30 2009
@@ -53,9 +53,10 @@
* Version 13 changed the getTask method signature for HADOOP-249
* Version 14 changed the getTask method signature for HADOOP-4232
* Version 15 Adds FAILED_UNCLEAN and KILLED_UNCLEAN states for HADOOP-4759
+ * Version 16 Removed fsError and shuffleError and introduced taskError.
* */
- public static final long versionID = 15L;
+ public static final long versionID = 16L;
/**
* Called when a child task process starts, to get its task.
@@ -122,11 +123,8 @@
*/
boolean canCommit(TaskAttemptID taskid) throws IOException;
- /** Report that a reduce-task couldn't shuffle map-outputs.*/
- void shuffleError(TaskAttemptID taskId, String message) throws IOException;
-
- /** Report that the task encounted a local filesystem error.*/
- void fsError(TaskAttemptID taskId, String message) throws IOException;
+ /** Report that the task encountered an error.*/
+ void taskError(TaskAttemptID taskId, String message) throws IOException;
/** Called by a reduce task to get the map output locations for finished maps.
* Returns an update centered around the map-task-completion-events.
Modified: hadoop/common/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestTaskFail.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestTaskFail.java?rev=795722&r1=795721&r2=795722&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestTaskFail.java (original)
+++ hadoop/common/branches/branch-0.20/src/test/org/apache/hadoop/mapred/TestTaskFail.java Mon Jul 20 08:54:30 2009
@@ -49,7 +49,9 @@
throw new IOException();
} else if (taskid.endsWith("_1")) {
System.exit(-1);
- }
+ } else if (taskid.endsWith("_2")) {
+ throw new OutOfMemoryError();
+ }
}
}
@@ -106,46 +108,57 @@
return new JobClient(conf).submitJob(conf);
}
+ private void validateAttempt(TaskInProgress tip, TaskAttemptID attemptId,
+ TaskStatus ts, boolean isCleanup)
+ throws IOException {
+ assertEquals(tip.isCleanupAttempt(attemptId), isCleanup);
+ assertTrue(ts != null);
+ assertEquals(TaskStatus.State.FAILED, ts.getRunState());
+ // validate tasklogs for task attempt
+ String log = TestMiniMRMapRedDebugScript.readTaskLog(
+ TaskLog.LogName.STDERR, attemptId, false);
+ assertTrue(log.contains(taskLog));
+ if (!isCleanup) {
+ // validate task logs: tasklog should contain both task logs
+ // and cleanup logs
+ assertTrue(log.contains(cleanupLog));
+ } else {
+ // validate tasklogs for cleanup attempt
+ log = TestMiniMRMapRedDebugScript.readTaskLog(
+ TaskLog.LogName.STDERR, attemptId, true);
+ assertTrue(log.contains(cleanupLog));
+ }
+ }
+
private void validateJob(RunningJob job, MiniMRCluster mr)
throws IOException {
assertEquals(JobStatus.SUCCEEDED, job.getJobState());
JobID jobId = job.getID();
// construct the task id of first map task
+ // this should not be cleanup attempt since the first attempt
+ // fails with an exception
TaskAttemptID attemptId =
new TaskAttemptID(new TaskID(jobId, true, 0), 0);
TaskInProgress tip = mr.getJobTrackerRunner().getJobTracker().
getTip(attemptId.getTaskID());
- // this should not be cleanup attempt since the first attempt
- // fails with an exception
- assertTrue(!tip.isCleanupAttempt(attemptId));
TaskStatus ts =
mr.getJobTrackerRunner().getJobTracker().getTaskStatus(attemptId);
- assertTrue(ts != null);
- assertEquals(TaskStatus.State.FAILED, ts.getRunState());
- // validate task logs: tasklog should contain both task logs
- // and cleanup logs
- String log = TestMiniMRMapRedDebugScript.readTaskLog(
- TaskLog.LogName.STDERR, attemptId, false);
- assertTrue(log.contains(taskLog));
- assertTrue(log.contains(cleanupLog));
+ validateAttempt(tip, attemptId, ts, false);
attemptId = new TaskAttemptID(new TaskID(jobId, true, 0), 1);
// this should be cleanup attempt since the second attempt fails
// with System.exit
- assertTrue(tip.isCleanupAttempt(attemptId));
+
ts = mr.getJobTrackerRunner().getJobTracker().getTaskStatus(attemptId);
- assertTrue(ts != null);
- assertEquals(TaskStatus.State.FAILED, ts.getRunState());
- // validate tasklogs for task attempt
- log = TestMiniMRMapRedDebugScript.readTaskLog(
- TaskLog.LogName.STDERR, attemptId, false);
- assertTrue(log.contains(taskLog));
- // validate tasklogs for cleanup attempt
- log = TestMiniMRMapRedDebugScript.readTaskLog(
- TaskLog.LogName.STDERR, attemptId, true);
- assertTrue(log.contains(cleanupLog));
+ validateAttempt(tip, attemptId, ts, true);
+
+ attemptId = new TaskAttemptID(new TaskID(jobId, true, 0), 2);
+ // this should be cleanup attempt since the third attempt fails
+ // with OutOfMemory
+ ts = mr.getJobTrackerRunner().getJobTracker().getTaskStatus(attemptId);
+ validateAttempt(tip, attemptId, ts, true);
}
public void testWithDFS() throws IOException {