You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2007/05/02 22:03:57 UTC

svn commit: r534606 - in /lucene/hadoop/trunk: CHANGES.txt conf/hadoop-default.xml src/java/org/apache/hadoop/mapred/JobConf.java src/java/org/apache/hadoop/mapred/TaskInProgress.java

Author: cutting
Date: Wed May  2 13:03:56 2007
New Revision: 534606

URL: http://svn.apache.org/viewvc?view=rev&rev=534606
Log:
HADOOP-1304.  Make configurable the maximum number of task attempts before a job fails.  Contributed by Devaraj.

Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/conf/hadoop-default.xml
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=534606&r1=534605&r2=534606
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed May  2 13:03:56 2007
@@ -306,6 +306,9 @@
     the new format.  Please backup your data first before upgrading 
     (using 'hadoop distcp' for example).  (tomwhite)
 
+91. HADOOP-1304.  Make configurable the maximum number of task
+    attempts before a job fails.  (Devaraj Das via cutting)
+
 
 Release 0.12.3 - 2007-04-06
 

Modified: lucene/hadoop/trunk/conf/hadoop-default.xml
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/conf/hadoop-default.xml?view=diff&rev=534606&r1=534605&r2=534606
==============================================================================
--- lucene/hadoop/trunk/conf/hadoop-default.xml (original)
+++ lucene/hadoop/trunk/conf/hadoop-default.xml Wed May  2 13:03:56 2007
@@ -544,6 +544,24 @@
 </property>
 
 <property>
+  <name>mapred.map.max.attempts</name>
+  <value>4</value>
+  <description>Expert: The maximum number of attempts per map task.
+  In other words, the framework will try to execute a map task this many
+  times before giving up on it.
+  </description>
+</property>
+
+<property>
+  <name>mapred.reduce.max.attempts</name>
+  <value>4</value>
+  <description>Expert: The maximum number of attempts per reduce task.
+  In other words, the framework will try to execute a reduce task this many
+  times before giving up on it.
+  </description>
+</property>
+
+<property>
   <name>mapred.reduce.parallel.copies</name>
   <value>5</value>
   <description>The default number of parallel transfers run by reduce

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java?view=diff&rev=534606&r1=534605&r2=534606
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java Wed May  2 13:03:56 2007
@@ -510,7 +510,41 @@
 
   public int getNumReduceTasks() { return getInt("mapred.reduce.tasks", 1); }
   public void setNumReduceTasks(int n) { setInt("mapred.reduce.tasks", n); }
+  
+  /** Get the configured maximum number of attempts that will be made to run a
+   *  map task, as specified by the <code>mapred.map.max.attempts</code>
+   *  property. If this property is not already set, the default is 4 attempts.
+   * @return the max number of attempts
+   */
+  public int getMaxMapAttempts() {
+    return getInt("mapred.map.max.attempts", 4);
+  }
+  /** Expert: Set the maximum number of attempts that will be made to run a
+   *  map task.
+   * @param n the number of attempts
+   *
+   */
+  public void setMaxMapAttempts(int n) {
+    setInt("mapred.map.max.attempts", n);
+  }
 
+  /** Get the configured maximum number of attempts that will be made to run a
+   *  reduce task, as specified by the <code>mapred.reduce.max.attempts</code>
+   *  property. If this property is not already set, the default is 4 attempts.
+   * @return the max number of attempts
+   */
+  public int getMaxReduceAttempts() {
+    return getInt("mapred.reduce.max.attempts", 4);
+  }
+  /** Expert: Set the maximum number of attempts that will be made to run a
+   *  reduce task.
+   * @param n the number of attempts
+   *
+   */
+  public void setMaxReduceAttempts(int n) {
+    setInt("mapred.reduce.max.attempts", n);
+  }
+  
   /**
    * Get the user-specified job name. This is only used to identify the 
    * job to the user.

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java?view=diff&rev=534606&r1=534605&r2=534606
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java Wed May  2 13:03:56 2007
@@ -49,7 +49,7 @@
 ////////////////////////////////////////////////////////
 class TaskInProgress {
   static final int MAX_TASK_EXECS = 1;
-  static final int MAX_TASK_FAILURES = 4;    
+  int maxTaskAttempts = 4;    
   static final double SPECULATIVE_GAP = 0.2;
   static final long SPECULATIVE_LAG = 60 * 1000;
   private static NumberFormat idFormat = NumberFormat.getInstance();
@@ -125,6 +125,7 @@
     this.job = job;
     this.conf = conf;
     this.partition = partition;
+    setMaxTaskAttempts();
     init(uniqueString);
   }
         
@@ -141,8 +142,19 @@
     this.jobtracker = jobtracker;
     this.job = job;
     this.conf = conf;
+    setMaxTaskAttempts();
     init(uniqueString);
   }
+  /**
+   * Set the max number of attempts before we declare a TIP as "failed"
+   */
+  private void setMaxTaskAttempts() {
+    if (isMapTask()) {
+      this.maxTaskAttempts = conf.getMaxMapAttempts();
+    } else {
+      this.maxTaskAttempts = conf.getMaxReduceAttempts();
+    }
+  }
 
   /**
    * Make a unique name for this TIP.
@@ -430,7 +442,7 @@
       numKilledTasks++;
     }
 
-    if (numTaskFailures >= MAX_TASK_FAILURES) {
+    if (numTaskFailures >= maxTaskAttempts) {
       LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times.");
       kill();
     }
@@ -620,11 +632,11 @@
 
     // Create the 'taskid'; do not count the 'killed' tasks against the job!
     String taskid = null;
-    if (nextTaskId < (MAX_TASK_EXECS + MAX_TASK_FAILURES + numKilledTasks)) {
+    if (nextTaskId < (MAX_TASK_EXECS + maxTaskAttempts + numKilledTasks)) {
       taskid = new String("task_" + taskIdPrefix + "_" + nextTaskId);
       ++nextTaskId;
     } else {
-      LOG.warn("Exceeded limit of " + (MAX_TASK_EXECS + MAX_TASK_FAILURES) +
+      LOG.warn("Exceeded limit of " + (MAX_TASK_EXECS + maxTaskAttempts) +
               " (plus " + numKilledTasks + " killed)"  + 
               " attempts for the tip '" + getTIPId() + "'");
       return null;