You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by om...@apache.org on 2007/09/26 20:13:15 UTC

svn commit: r579744 - in /lucene/hadoop/trunk: CHANGES.txt src/java/org/apache/hadoop/mapred/JobInProgress.java src/java/org/apache/hadoop/mapred/JobTracker.java

Author: omalley
Date: Wed Sep 26 11:13:14 2007
New Revision: 579744

URL: http://svn.apache.org/viewvc?rev=579744&view=rev
Log:
HADOOP-1930.  Assign the blame for shuffle failures to the right task tracker.

Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?rev=579744&r1=579743&r2=579744&view=diff
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Sep 26 11:13:14 2007
@@ -191,6 +191,9 @@
     HADOOP-1940.  TestDFSUpgradeFromImage must shut down its MiniDFSCluster.
     (Chris Douglas via nigel)
 
+    HADOOP-1930.  Place the blame for failed fetches on the right host. (Arun C.
+    Murthy via omalley)
+
   IMPROVEMENTS
 
     HADOOP-1908. Restructure data node code so that block sending and 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java?rev=579744&r1=579743&r2=579744&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobInProgress.java Wed Sep 26 11:13:14 2007
@@ -407,7 +407,7 @@
                          TaskStatus.Phase.MAP : 
                          TaskStatus.Phase.REDUCE), 
                      TaskStatus.State.FAILED, 
-                     ttStatus.getHost(), status.getTaskTracker(), null);
+                     status.getTaskTracker(), null);
           LOG.info("Failed to copy the output of " + status.getTaskId() + 
                    " with: " + StringUtils.stringifyException(ioe));
           return;
@@ -1046,8 +1046,7 @@
    */
   public void failedTask(TaskInProgress tip, String taskid, String reason, 
                          TaskStatus.Phase phase, TaskStatus.State state, 
-                         String hostname, String trackerName,
-                         JobTrackerMetrics metrics) {
+                         String trackerName, JobTrackerMetrics metrics) {
     TaskStatus status = TaskStatus.createTaskStatus(tip.isMapTask(), 
                                                     taskid,
                                                     0.0f,
@@ -1148,7 +1147,7 @@
   
   synchronized void fetchFailureNotification(TaskInProgress tip, 
                                              String mapTaskId, 
-                                             String hostname, String trackerName, 
+                                             String trackerName, 
                                              JobTrackerMetrics metrics) {
     Integer fetchFailures = mapTaskIdToFetchFailuresMap.get(mapTaskId);
     fetchFailures = (fetchFailures == null) ? 1 : (fetchFailures+1);
@@ -1163,7 +1162,7 @@
       failedTask(tip, mapTaskId, "Too many fetch-failures",                            
                  (tip.isMapTask() ? TaskStatus.Phase.MAP : 
                                     TaskStatus.Phase.REDUCE), 
-                 TaskStatus.State.FAILED, hostname, trackerName, metrics);
+                 TaskStatus.State.FAILED, trackerName, metrics);
       
       mapTaskIdToFetchFailuresMap.remove(mapTaskId);
     }

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java?rev=579744&r1=579743&r2=579744&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobTracker.java Wed Sep 26 11:13:14 2007
@@ -201,8 +201,7 @@
                                      tip.isMapTask()? TaskStatus.Phase.MAP:
                                      TaskStatus.Phase.STARTING,
                                      TaskStatus.State.FAILED,
-                                     trackerStatus.getHost(), trackerName,
-                                     myMetrics);
+                                     trackerName, myMetrics);
                   }
                   itr.remove();
                 } else {
@@ -294,8 +293,7 @@
                     if (now - newProfile.getLastSeen() > TASKTRACKER_EXPIRY_INTERVAL) {
                       // Remove completely
                       updateTaskTrackerStatus(trackerName, null);
-                      lostTaskTracker(leastRecent.getTrackerName(),
-                                      leastRecent.getHost());
+                      lostTaskTracker(leastRecent.getTrackerName());
                     } else {
                       // Update time by inserting latest profile
                       trackerExpiryQueue.add(newProfile);
@@ -1242,7 +1240,7 @@
           // If it's first contact, then clear out 
           // any state hanging around
           if (seenBefore) {
-            lostTaskTracker(trackerName, trackerStatus.getHost());
+            lostTaskTracker(trackerName);
           }
         } else {
           // If not first contact, there should be some record of the tracker
@@ -1771,11 +1769,16 @@
       if (failedFetchMaps != null) {
         for (String mapTaskId : failedFetchMaps) {
           TaskInProgress failedFetchMap = taskidToTIPMap.get(mapTaskId);
+          
           if (failedFetchMap != null) {
+            // Gather information about the map which has to be failed, if need be
+            String failedFetchTrackerName = getAssignedTracker(mapTaskId);
+            if (failedFetchTrackerName == null) {
+              failedFetchTrackerName = "Lost task tracker";
+            }
             failedFetchMap.getJob().fetchFailureNotification(failedFetchMap, 
                                                              mapTaskId, 
-                                                             status.getHost(), 
-                                                             trackerName, 
+                                                             failedFetchTrackerName, 
                                                              myMetrics);
           }
         }
@@ -1788,7 +1791,7 @@
    * already been updated.  Just process the contained tasks and any
    * jobs that might be affected.
    */
-  void lostTaskTracker(String trackerName, String hostname) {
+  void lostTaskTracker(String trackerName) {
     LOG.info("Lost tracker '" + trackerName + "'");
     Set<String> lostTasks = trackerToTaskMap.get(trackerName);
     trackerToTaskMap.remove(trackerName);
@@ -1805,12 +1808,11 @@
           JobInProgress job = tip.getJob();
           // if the job is done, we don't want to change anything
           if (job.getStatus().getRunState() == JobStatus.RUNNING) {
-            job.failedTask(tip, taskId, "Lost task tracker", 
+            job.failedTask(tip, taskId, ("Lost task tracker: " + trackerName), 
                            (tip.isMapTask() ? 
                                TaskStatus.Phase.MAP : 
                                TaskStatus.Phase.REDUCE), 
-                           TaskStatus.State.KILLED,
-                           hostname, trackerName, myMetrics);
+                           TaskStatus.State.KILLED, trackerName, myMetrics);
             jobsWithFailures.add(job);
           }
         } else if (!tip.isMapTask() && tip.isComplete()) {