You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by nn...@apache.org on 2014/04/17 22:49:05 UTC

git commit: Changed executor state recovery to allow run recovery in absence of executor info.

Repository: mesos
Updated Branches:
  refs/heads/master 7bf1e8a6b -> a57b2eb52


Changed executor state recovery to allow run recovery in absence of executor info.

This patch lets executor recovery recover runs in the absence of
executor info.  This is needed because the new task-info patch will
introduce an intermediate state where the executor info hasn't been
checkpointed. In this interim, the slave may fail over and should be
in a position to clean up orphan containers (for now, the containerizer
API doesn't provide a way to reconcile the executor info, and it is
therefore not possible to recover the containers in this case).

Review: https://reviews.apache.org/r/20221


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a57b2eb5
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a57b2eb5
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a57b2eb5

Branch: refs/heads/master
Commit: a57b2eb523b0f603d60461cbad5599ebd5e776aa
Parents: 7bf1e8a
Author: Niklas Nielsen <ni...@qni.dk>
Authored: Thu Apr 17 11:29:32 2014 -0700
Committer: Niklas Q. Nielsen <ni...@mesosphere.io>
Committed: Thu Apr 17 11:29:32 2014 -0700

----------------------------------------------------------------------
 src/slave/slave.cpp |  5 ++--
 src/slave/state.cpp | 70 ++++++++++++++++++++++++------------------------
 2 files changed, 38 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a57b2eb5/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 19c5f0d..d6ec87c 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -3070,10 +3070,11 @@ void Framework::recoverExecutor(const ExecutorState& state)
 
   CHECK_NOTNULL(slave);
 
-  if (state.runs.empty() || state.latest.isNone()) {
+  if (state.runs.empty() || state.latest.isNone() || state.info.isNone()) {
     LOG(WARNING) << "Skipping recovery of executor '" << state.id
                  << "' of framework " << id
-                 << " because its latest run cannot be recovered";
+                 << " because its latest run or executor info"
+                 << " cannot be recovered";
 
     // GC the top level executor work directory.
     slave->garbageCollect(paths::getExecutorPath(

http://git-wip-us.apache.org/repos/asf/mesos/blob/a57b2eb5/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index a2af33c..2889245 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -274,41 +274,6 @@ Try<ExecutorState> ExecutorState::recover(
   state.id = executorId;
   string message;
 
-  // Read the executor info.
-  const string& path =
-    paths::getExecutorInfoPath(rootDir, slaveId, frameworkId, executorId);
-  if (!os::exists(path)) {
-    // This could happen if the slave died after creating the executor
-    // directory but before it checkpointed the executor info.
-    LOG(WARNING) << "Failed to find executor info file '" << path << "'";
-    return state;
-  }
-
-  const Result<ExecutorInfo>& executorInfo =
-    ::protobuf::read<ExecutorInfo>(path);
-
-  if (executorInfo.isError()) {
-    message = "Failed to read executor info from '" + path + "': " +
-              executorInfo.error();
-
-    if (strict) {
-      return Error(message);
-    } else {
-      LOG(WARNING) << message;
-      state.errors++;
-      return state;
-    }
-  }
-
-  if (executorInfo.isNone()) {
-    // This could happen if the slave died after opening the file for
-    // writing but before it checkpointed anything.
-    LOG(WARNING) << "Found empty executor info file '" << path << "'";
-    return state;
-  }
-
-  state.info = executorInfo.get();
-
   // Find the runs.
   Try<list<string> > runs = os::glob(strings::format(
       paths::EXECUTOR_RUN_PATH,
@@ -368,6 +333,41 @@ Try<ExecutorState> ExecutorState::recover(
     return state;
   }
 
+  // Read the executor info.
+  const string& path =
+    paths::getExecutorInfoPath(rootDir, slaveId, frameworkId, executorId);
+  if (!os::exists(path)) {
+    // This could happen if the slave died after creating the executor
+    // directory but before it checkpointed the executor info.
+    LOG(WARNING) << "Failed to find executor info file '" << path << "'";
+    return state;
+  }
+
+  const Result<ExecutorInfo>& executorInfo =
+    ::protobuf::read<ExecutorInfo>(path);
+
+  if (executorInfo.isError()) {
+    message = "Failed to read executor info from '" + path + "': " +
+              executorInfo.error();
+
+    if (strict) {
+      return Error(message);
+    } else {
+      LOG(WARNING) << message;
+      state.errors++;
+      return state;
+    }
+  }
+
+  if (executorInfo.isNone()) {
+    // This could happen if the slave died after opening the file for
+    // writing but before it checkpointed anything.
+    LOG(WARNING) << "Found empty executor info file '" << path << "'";
+    return state;
+  }
+
+  state.info = executorInfo.get();
+
   return state;
 }