You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@giraph.apache.org by ni...@apache.org on 2013/03/21 19:21:29 UTC
git commit: GIRAPH-576: BspServiceMaster.failureCleanup() shouldn't
pass null in observers' applicationFailed() method
Updated Branches:
refs/heads/trunk 5bb956cad -> 974c30b74
GIRAPH-576: BspServiceMaster.failureCleanup() shouldn't pass null in observers' applicationFailed() method
Project: http://git-wip-us.apache.org/repos/asf/giraph/repo
Commit: http://git-wip-us.apache.org/repos/asf/giraph/commit/974c30b7
Tree: http://git-wip-us.apache.org/repos/asf/giraph/tree/974c30b7
Diff: http://git-wip-us.apache.org/repos/asf/giraph/diff/974c30b7
Branch: refs/heads/trunk
Commit: 974c30b7485b653ba37e11d43b474b458134c6ce
Parents: 5bb956c
Author: Nitay Joffe <ni...@fb.com>
Authored: Thu Mar 21 14:21:08 2013 -0400
Committer: Nitay Joffe <ni...@fb.com>
Committed: Thu Mar 21 14:21:08 2013 -0400
----------------------------------------------------------------------
CHANGELOG | 3 +
.../org/apache/giraph/master/BspServiceMaster.java | 56 +++++++++++----
2 files changed, 46 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/giraph/blob/974c30b7/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index d1983d0..765dc37 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,9 @@
Giraph Change Log
Release 0.2.0 - unreleased
+ GIRAPH-576: BspServiceMaster.failureCleanup() shouldn't pass null in
+ observers' applicationFailed() method (jgarms via nitay)
+
GIRAPH-547: Allow in-place modification of edges (apresta)
GIRAPH-537: Fix log messages produced by aggregators (majakabiljo)
http://git-wip-us.apache.org/repos/asf/giraph/blob/974c30b7/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java
----------------------------------------------------------------------
diff --git a/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java b/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java
index 87497b8..9188a23 100644
--- a/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java
+++ b/giraph-core/src/main/java/org/apache/giraph/master/BspServiceMaster.java
@@ -226,6 +226,22 @@ public class BspServiceMaster<I extends WritableComparable,
public void setJobState(ApplicationState state,
long applicationAttempt,
long desiredSuperstep) {
+ setJobState(state, applicationAttempt, desiredSuperstep, true);
+ }
+
+ /**
+ * Set the job state.
+ *
+ * @param state State of the application.
+ * @param applicationAttempt Attempt to start on
+ * @param desiredSuperstep Superstep to restart from (if applicable)
+ * @param killJobOnFailure if true, and the desired state is FAILED,
+ * then kill this job.
+ */
+ private void setJobState(ApplicationState state,
+ long applicationAttempt,
+ long desiredSuperstep,
+ boolean killJobOnFailure) {
JSONObject jobState = new JSONObject();
try {
jobState.put(JSONOBJ_STATE_KEY, state.toString());
@@ -258,10 +274,21 @@ public class BspServiceMaster<I extends WritableComparable,
"setJobState: Unknown InterruptedException for " +
masterJobStatePath, e);
}
-
- if (state == ApplicationState.FAILED) {
- failJob();
+ if (state == ApplicationState.FAILED && killJobOnFailure) {
+ failJob(new IllegalStateException("FAILED"));
}
+
+ }
+
+ /**
+ * Set the job state to FAILED. This will kill the job, and log exceptions to
+ * any observers.
+ *
+ * @param reason The reason the job failed
+ */
+ private void setJobStateFailed(String reason) {
+ setJobState(ApplicationState.FAILED, -1, -1, false);
+ failJob(new IllegalStateException(reason));
}
/**
@@ -308,9 +335,12 @@ public class BspServiceMaster<I extends WritableComparable,
/**
* When there is no salvaging this job, fail it.
+ *
+ * @param e Exception to log to observers
*/
- private void failJob() {
+ private void failJob(Exception e) {
LOG.fatal("failJob: Killing job " + getJobId());
+ LOG.fatal("failJob: exception " + e.toString());
try {
@SuppressWarnings("deprecation")
org.apache.hadoop.mapred.JobClient jobClient =
@@ -320,10 +350,11 @@ public class BspServiceMaster<I extends WritableComparable,
@SuppressWarnings("deprecation")
JobID jobId = JobID.forName(getJobId());
RunningJob job = jobClient.getJob(jobId);
- failureCleanup(null);
job.killJob();
- } catch (IOException e) {
- throw new RuntimeException(e);
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ } finally {
+ failureCleanup(e);
}
}
@@ -572,7 +603,7 @@ public class BspServiceMaster<I extends WritableComparable,
// where it left off.
List<WorkerInfo> healthyWorkerInfoList = checkWorkers();
if (healthyWorkerInfoList == null) {
- setJobState(ApplicationState.FAILED, -1, -1);
+ setJobStateFailed("Not enough healthy workers to create input splits");
return -1;
}
@@ -590,7 +621,7 @@ public class BspServiceMaster<I extends WritableComparable,
"check input of " + inputFormat.getClass().getName() + "!");
getContext().setStatus("Failing job due to 0 input splits, " +
"check input of " + inputFormat.getClass().getName() + "!");
- failJob();
+ setJobStateFailed("0 input splits");
}
if (minSplitCountHint > splitList.size()) {
LOG.warn(logPrefix + ": Number of inputSplits=" +
@@ -1189,7 +1220,7 @@ public class BspServiceMaster<I extends WritableComparable,
} catch (IOException e) {
LOG.fatal("getLastGoodCheckpoint: No last good checkpoints can be " +
"found, killing the job.", e);
- failJob();
+ failJob(e);
}
}
@@ -1401,9 +1432,8 @@ public class BspServiceMaster<I extends WritableComparable,
chosenWorkerInfoList = checkWorkers();
if (chosenWorkerInfoList == null) {
- LOG.fatal("coordinateSuperstep: Not enough healthy workers for " +
- "superstep " + getSuperstep());
- setJobState(ApplicationState.FAILED, -1, -1);
+ setJobStateFailed("coordinateSuperstep: Not enough healthy workers for " +
+ "superstep " + getSuperstep());
} else {
for (WorkerInfo workerInfo : chosenWorkerInfoList) {
String workerInfoHealthyPath =