You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by bo...@apache.org on 2013/01/04 21:38:37 UTC
svn commit: r1429115 [2/2] - in
/hadoop/common/branches/branch-2/hadoop-mapreduce-project: ./
hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/jobhistory/
hadoop-mapreduce-client/hadoop-mapreduce-client-app/...
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/OutputCommitter.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/OutputCommitter.java?rev=1429115&r1=1429114&r2=1429115&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/OutputCommitter.java (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/OutputCommitter.java Fri Jan 4 20:38:36 2013
@@ -52,6 +52,14 @@ import org.apache.hadoop.classification.
* Discard the task commit.
* </li>
* </ol>
+ * The methods in this class can be called from several different processes and
+ * from several different contexts. It is important to know which process and
+ * which context each is called from. Each method should be marked accordingly
+ * in its documentation. It is also important to note that not all methods are
+ * guaranteed to be called once and only once. If a method is not guaranteed to
+ * have this property the output committer needs to handle this appropriately.
+ * Also note it will only be in rare situations where they may be called
+ * multiple times for the same task.
*
* @see FileOutputCommitter
* @see JobContext
@@ -62,7 +70,9 @@ import org.apache.hadoop.classification.
public abstract class OutputCommitter
extends org.apache.hadoop.mapreduce.OutputCommitter {
/**
- * For the framework to setup the job output during initialization
+ * For the framework to setup the job output during initialization. This is
+ * called from the application master process for the entire job. This will be
+ * called multiple times, once per job attempt.
*
* @param jobContext Context of the job whose output is being written.
* @throws IOException if temporary output could not be created
@@ -70,7 +80,9 @@ public abstract class OutputCommitter
public abstract void setupJob(JobContext jobContext) throws IOException;
/**
- * For cleaning up the job's output after job completion
+ * For cleaning up the job's output after job completion. This is called
+ * from the application master process for the entire job. This may be called
+ * multiple times.
*
* @param jobContext Context of the job whose output is being written.
* @throws IOException
@@ -82,7 +94,10 @@ public abstract class OutputCommitter
/**
* For committing job's output after successful job completion. Note that this
- * is invoked for jobs with final runstate as SUCCESSFUL.
+ * is invoked for jobs with final runstate as SUCCESSFUL. This is called
+ * from the application master process for the entire job. This is guaranteed
+ * to only be called once. If it throws an exception the entire job will
+ * fail.
*
* @param jobContext Context of the job whose output is being written.
* @throws IOException
@@ -94,7 +109,8 @@ public abstract class OutputCommitter
/**
* For aborting an unsuccessful job's output. Note that this is invoked for
* jobs with final runstate as {@link JobStatus#FAILED} or
- * {@link JobStatus#KILLED}
+ * {@link JobStatus#KILLED}. This is called from the application
+ * master process for the entire job. This may be called multiple times.
*
* @param jobContext Context of the job whose output is being written.
* @param status final runstate of the job
@@ -106,7 +122,10 @@ public abstract class OutputCommitter
}
/**
- * Sets up output for the task.
+ * Sets up output for the task. This is called from each individual task's
+ * process that will output to HDFS, and it is called just for that task. This
+ * may be called multiple times for the same task, but for different task
+ * attempts.
*
* @param taskContext Context of the task whose output is being written.
* @throws IOException
@@ -115,7 +134,9 @@ public abstract class OutputCommitter
throws IOException;
/**
- * Check whether task needs a commit
+ * Check whether task needs a commit. This is called from each individual
+ * task's process that will output to HDFS, and it is called just for that
+ * task.
*
* @param taskContext
* @return true/false
@@ -125,9 +146,16 @@ public abstract class OutputCommitter
throws IOException;
/**
- * To promote the task's temporary output to final output location
- *
- * The task's output is moved to the job's output directory.
+ * To promote the task's temporary output to final output location.
+ * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
+ * task is the task that the AM determines finished first, this method
+ * is called to commit an individual task's output. This is to mark
+ * that tasks output as complete, as {@link #commitJob(JobContext)} will
+ * also be called later on if the entire job finished successfully. This
+ * is called from a task's process. This may be called multiple times for the
+ * same task, but different task attempts. It should be very rare for this to
+ * be called multiple times and requires odd networking failures to make this
+ * happen. In the future the Hadoop framework may eliminate this race.
*
* @param taskContext Context of the task whose output is being written.
* @throws IOException if commit is not
@@ -136,7 +164,9 @@ public abstract class OutputCommitter
throws IOException;
/**
- * Discard the task output
+ * Discard the task output. This is called from a task's process to clean
+ * up a single task's output that can not yet been committed. This may be
+ * called multiple times for the same task, but for different task attempts.
*
* @param taskContext
* @throws IOException
@@ -160,7 +190,8 @@ public abstract class OutputCommitter
* The retry-count for the job will be passed via the
* {@link MRConstants#APPLICATION_ATTEMPT_ID} key in
* {@link TaskAttemptContext#getConfiguration()} for the
- * <code>OutputCommitter</code>.
+ * <code>OutputCommitter</code>. This is called from the application master
+ * process, but it is called individually for each task.
*
* If an exception is thrown the task will be attempted again.
*
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/OutputCommitter.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/OutputCommitter.java?rev=1429115&r1=1429114&r2=1429115&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/OutputCommitter.java (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/OutputCommitter.java Fri Jan 4 20:38:36 2013
@@ -54,7 +54,11 @@ import org.apache.hadoop.classification.
* The methods in this class can be called from several different processes and
* from several different contexts. It is important to know which process and
* which context each is called from. Each method should be marked accordingly
- * in its documentation.
+ * in its documentation. It is also important to note that not all methods are
+ * guaranteed to be called once and only once. If a method is not guaranteed to
+ * have this property the output committer needs to handle this appropriately.
+ * Also note it will only be in rare situations where they may be called
+ * multiple times for the same task.
*
* @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
* @see JobContext
@@ -65,7 +69,8 @@ import org.apache.hadoop.classification.
public abstract class OutputCommitter {
/**
* For the framework to setup the job output during initialization. This is
- * called from the application master process for the entire job.
+ * called from the application master process for the entire job. This will be
+ * called multiple times, once per job attempt.
*
* @param jobContext Context of the job whose output is being written.
* @throws IOException if temporary output could not be created
@@ -74,7 +79,8 @@ public abstract class OutputCommitter {
/**
* For cleaning up the job's output after job completion. This is called
- * from the application master process for the entire job.
+ * from the application master process for the entire job. This may be called
+ * multiple times.
*
* @param jobContext Context of the job whose output is being written.
* @throws IOException
@@ -87,7 +93,9 @@ public abstract class OutputCommitter {
/**
* For committing job's output after successful job completion. Note that this
* is invoked for jobs with final runstate as SUCCESSFUL. This is called
- * from the application master process for the entire job.
+ * from the application master process for the entire job. This is guaranteed
+ * to only be called once. If it throws an exception the entire job will
+ * fail.
*
* @param jobContext Context of the job whose output is being written.
* @throws IOException
@@ -101,7 +109,7 @@ public abstract class OutputCommitter {
* For aborting an unsuccessful job's output. Note that this is invoked for
* jobs with final runstate as {@link JobStatus.State#FAILED} or
* {@link JobStatus.State#KILLED}. This is called from the application
- * master process for the entire job.
+ * master process for the entire job. This may be called multiple times.
*
* @param jobContext Context of the job whose output is being written.
* @param state final runstate of the job
@@ -114,7 +122,9 @@ public abstract class OutputCommitter {
/**
* Sets up output for the task. This is called from each individual task's
- * process that will output to HDFS, and it is called just for that task.
+ * process that will output to HDFS, and it is called just for that task. This
+ * may be called multiple times for the same task, but for different task
+ * attempts.
*
* @param taskContext Context of the task whose output is being written.
* @throws IOException
@@ -141,7 +151,10 @@ public abstract class OutputCommitter {
* is called to commit an individual task's output. This is to mark
* that tasks output as complete, as {@link #commitJob(JobContext)} will
* also be called later on if the entire job finished successfully. This
- * is called from a task's process.
+ * is called from a task's process. This may be called multiple times for the
+ * same task, but different task attempts. It should be very rare for this to
+ * be called multiple times and requires odd networking failures to make this
+ * happen. In the future the Hadoop framework may eliminate this race.
*
* @param taskContext Context of the task whose output is being written.
* @throws IOException if commit is not successful.
@@ -151,7 +164,8 @@ public abstract class OutputCommitter {
/**
* Discard the task output. This is called from a task's process to clean
- * up a single task's output that can not yet been committed.
+ * up a single task's output that can not yet been committed. This may be
+ * called multiple times for the same task, but for different task attempts.
*
* @param taskContext
* @throws IOException
@@ -184,6 +198,9 @@ public abstract class OutputCommitter {
*
* If an exception is thrown the task will be attempted again.
*
+ * This may be called multiple times for the same task. But from different
+ * application attempts.
+ *
* @param taskContext Context of the task whose output is being recovered
* @throws IOException
*/
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryParser.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryParser.java?rev=1429115&r1=1429114&r2=1429115&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryParser.java (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/jobhistory/JobHistoryParser.java Fri Jan 4 20:38:36 2013
@@ -54,7 +54,7 @@ import org.apache.hadoop.yarn.api.record
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
-public class JobHistoryParser {
+public class JobHistoryParser implements HistoryEventHandler {
private static final Log LOG = LogFactory.getLog(JobHistoryParser.class);
@@ -94,6 +94,34 @@ public class JobHistoryParser {
this.in = in;
}
+ public synchronized void parse(HistoryEventHandler handler)
+ throws IOException {
+ parse(new EventReader(in), handler);
+ }
+
+ /**
+ * Only used for unit tests.
+ */
+ @Private
+ public synchronized void parse(EventReader reader, HistoryEventHandler handler)
+ throws IOException {
+ int eventCtr = 0;
+ HistoryEvent event;
+ try {
+ while ((event = reader.getNextEvent()) != null) {
+ handler.handleEvent(event);
+ ++eventCtr;
+ }
+ } catch (IOException ioe) {
+ LOG.info("Caught exception parsing history file after " + eventCtr +
+ " events", ioe);
+ parseException = ioe;
+ } finally {
+ in.close();
+ }
+ }
+
+
/**
* Parse the entire history file and populate the JobInfo object
* The first invocation will populate the object, subsequent calls
@@ -122,21 +150,7 @@ public class JobHistoryParser {
}
info = new JobInfo();
-
- int eventCtr = 0;
- HistoryEvent event;
- try {
- while ((event = reader.getNextEvent()) != null) {
- handleEvent(event);
- ++eventCtr;
- }
- } catch (IOException ioe) {
- LOG.info("Caught exception parsing history file after " + eventCtr +
- " events", ioe);
- parseException = ioe;
- } finally {
- in.close();
- }
+ parse(reader, this);
return info;
}
@@ -150,7 +164,8 @@ public class JobHistoryParser {
return parseException;
}
- private void handleEvent(HistoryEvent event) {
+ @Override
+ public void handleEvent(HistoryEvent event) {
EventType type = event.getEventType();
switch (type) {
Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java?rev=1429115&r1=1429114&r2=1429115&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java (original)
+++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java Fri Jan 4 20:38:36 2013
@@ -667,6 +667,9 @@ public class HistoryFileManager extends
}
});
}
+ } else if (old != null && !old.isMovePending()) {
+ //This is a duplicate so just delete it
+ fileInfo.delete();
}
}
}