You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by om...@apache.org on 2007/09/26 20:13:15 UTC
svn commit: r579744 - in /lucene/hadoop/trunk: CHANGES.txt
src/java/org/apache/hadoop/mapred/JobInProgress.java
src/java/org/apache/hadoop/mapred/JobTracker.java
Author: omalley
Date: Wed Sep 26 11:13:14 2007
New Revision: 579744
URL: http://svn.apache.org/viewvc?rev=579744&view=rev
Log:
HADOOP-1930. Assign the blame for shuffle failures to the right task tracker.
Modified:
lucene/hadoop/trunk/CHANGES.txt
lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?rev=579744&r1=579743&r2=579744&view=diff
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Sep 26 11:13:14 2007
@@ -191,6 +191,9 @@
HADOOP-1940. TestDFSUpgradeFromImage must shut down its MiniDFSCluster.
(Chris Douglas via nigel)
+ HADOOP-1930. Place the blame for failed fetches on the right host. (Arun C.
+ Murthy via omalley)
+
IMPROVEMENTS
HADOOP-1908. Restructure data node code so that block sending and
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java?rev=579744&r1=579743&r2=579744&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java Wed Sep 26 11:13:14 2007
@@ -407,7 +407,7 @@
TaskStatus.Phase.MAP :
TaskStatus.Phase.REDUCE),
TaskStatus.State.FAILED,
- ttStatus.getHost(), status.getTaskTracker(), null);
+ status.getTaskTracker(), null);
LOG.info("Failed to copy the output of " + status.getTaskId() +
" with: " + StringUtils.stringifyException(ioe));
return;
@@ -1046,8 +1046,7 @@
*/
public void failedTask(TaskInProgress tip, String taskid, String reason,
TaskStatus.Phase phase, TaskStatus.State state,
- String hostname, String trackerName,
- JobTrackerMetrics metrics) {
+ String trackerName, JobTrackerMetrics metrics) {
TaskStatus status = TaskStatus.createTaskStatus(tip.isMapTask(),
taskid,
0.0f,
@@ -1148,7 +1147,7 @@
synchronized void fetchFailureNotification(TaskInProgress tip,
String mapTaskId,
- String hostname, String trackerName,
+ String trackerName,
JobTrackerMetrics metrics) {
Integer fetchFailures = mapTaskIdToFetchFailuresMap.get(mapTaskId);
fetchFailures = (fetchFailures == null) ? 1 : (fetchFailures+1);
@@ -1163,7 +1162,7 @@
failedTask(tip, mapTaskId, "Too many fetch-failures",
(tip.isMapTask() ? TaskStatus.Phase.MAP :
TaskStatus.Phase.REDUCE),
- TaskStatus.State.FAILED, hostname, trackerName, metrics);
+ TaskStatus.State.FAILED, trackerName, metrics);
mapTaskIdToFetchFailuresMap.remove(mapTaskId);
}
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java?rev=579744&r1=579743&r2=579744&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java Wed Sep 26 11:13:14 2007
@@ -201,8 +201,7 @@
tip.isMapTask()? TaskStatus.Phase.MAP:
TaskStatus.Phase.STARTING,
TaskStatus.State.FAILED,
- trackerStatus.getHost(), trackerName,
- myMetrics);
+ trackerName, myMetrics);
}
itr.remove();
} else {
@@ -294,8 +293,7 @@
if (now - newProfile.getLastSeen() > TASKTRACKER_EXPIRY_INTERVAL) {
// Remove completely
updateTaskTrackerStatus(trackerName, null);
- lostTaskTracker(leastRecent.getTrackerName(),
- leastRecent.getHost());
+ lostTaskTracker(leastRecent.getTrackerName());
} else {
// Update time by inserting latest profile
trackerExpiryQueue.add(newProfile);
@@ -1242,7 +1240,7 @@
// If it's first contact, then clear out
// any state hanging around
if (seenBefore) {
- lostTaskTracker(trackerName, trackerStatus.getHost());
+ lostTaskTracker(trackerName);
}
} else {
// If not first contact, there should be some record of the tracker
@@ -1771,11 +1769,16 @@
if (failedFetchMaps != null) {
for (String mapTaskId : failedFetchMaps) {
TaskInProgress failedFetchMap = taskidToTIPMap.get(mapTaskId);
+
if (failedFetchMap != null) {
+ // Look up the tracker the map was assigned to, so the map is failed against that tracker if need be
+ String failedFetchTrackerName = getAssignedTracker(mapTaskId);
+ if (failedFetchTrackerName == null) {
+ failedFetchTrackerName = "Lost task tracker";
+ }
failedFetchMap.getJob().fetchFailureNotification(failedFetchMap,
mapTaskId,
- status.getHost(),
- trackerName,
+ failedFetchTrackerName,
myMetrics);
}
}
@@ -1788,7 +1791,7 @@
* already been updated. Just process the contained tasks and any
* jobs that might be affected.
*/
- void lostTaskTracker(String trackerName, String hostname) {
+ void lostTaskTracker(String trackerName) {
LOG.info("Lost tracker '" + trackerName + "'");
Set<String> lostTasks = trackerToTaskMap.get(trackerName);
trackerToTaskMap.remove(trackerName);
@@ -1805,12 +1808,11 @@
JobInProgress job = tip.getJob();
// if the job is done, we don't want to change anything
if (job.getStatus().getRunState() == JobStatus.RUNNING) {
- job.failedTask(tip, taskId, "Lost task tracker",
+ job.failedTask(tip, taskId, ("Lost task tracker: " + trackerName),
(tip.isMapTask() ?
TaskStatus.Phase.MAP :
TaskStatus.Phase.REDUCE),
- TaskStatus.State.KILLED,
- hostname, trackerName, myMetrics);
+ TaskStatus.State.KILLED, trackerName, myMetrics);
jobsWithFailures.add(job);
}
} else if (!tip.isMapTask() && tip.isComplete()) {