You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/08/13 21:49:57 UTC

[13/23] git commit: Changed slave command line flag 'safe' to 'strict'.

Changed slave command line flag 'safe' to 'strict'.

Review: https://reviews.apache.org/r/12084


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a6a2fe98
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a6a2fe98
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a6a2fe98

Branch: refs/heads/vinod/0.13.0
Commit: a6a2fe98f20ec2cf0ec6bd60dc7e300b0dbf9321
Parents: bf703d2
Author: Vinod Kone <vi...@twitter.com>
Authored: Tue Jun 25 12:13:43 2013 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Mon Jul 1 12:47:20 2013 -0700

----------------------------------------------------------------------
 src/slave/flags.hpp                | 17 +++++++------
 src/slave/slave.cpp                |  8 +++---
 src/slave/slave.hpp                |  4 +--
 src/slave/state.cpp                | 44 ++++++++++++++++-----------------
 src/slave/state.hpp                | 16 ++++++------
 src/tests/slave_recovery_tests.cpp |  2 +-
 6 files changed, 46 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index 9612983..94c393e 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -135,13 +135,14 @@ public:
         "      and the slave registers with the master as a new slave.",
         "reconnect");
 
-    add(&Flags::safe,
-        "safe",
-        "Whether to ignore (safe=false) any ambiguous errors during recovery.\n"
-        "An ambiguous error is one where it is impossible to differentiate\n"
-        "between a safe error that can be ignored and an unsafe error that "
-        "cannot be.",
-        true);
+    add(&Flags::strict,
+        "strict",
+        "If strict=true, any and all recovery errors are considered fatal.\n"
+        "If strict=false, any expected errors (e.g., slave cannot recover\n"
+        "information about an executor, because the slave died right before\n"
+        "the executor registered.) during recovery are ignored and as much\n"
+        "state as possible is recovered.\n",
+        false);
 
 #ifdef __linux__
     add(&Flags::cgroups_hierarchy,
@@ -182,7 +183,7 @@ public:
   Duration resource_monitoring_interval;
   bool checkpoint;
   std::string recover;
-  bool safe;
+  bool strict;
 #ifdef __linux__
   std::string cgroups_hierarchy;
   std::string cgroups_root;

http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index aada544..c56dafc 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -371,7 +371,7 @@ void Slave::initialize()
   }
 
   // Start recovery.
-  recover(flags.recover == "reconnect", flags.safe)
+  recover(flags.recover == "reconnect", flags.strict)
     .onAny(defer(self(), &Slave::_initialize, params::_1));
 }
 
@@ -2512,7 +2512,7 @@ void Slave::_checkDiskUsage(const Future<Try<double> >& usage)
 }
 
 
-Future<Nothing> Slave::recover(bool reconnect, bool safe)
+Future<Nothing> Slave::recover(bool reconnect, bool strict)
 {
   const string& metaDir = paths::getMetaRootDir(flags.work_dir);
 
@@ -2526,7 +2526,7 @@ Future<Nothing> Slave::recover(bool reconnect, bool safe)
   }
 
   // First, recover the slave state.
-  Result<SlaveState> state = state::recover(metaDir, safe);
+  Result<SlaveState> state = state::recover(metaDir, strict);
   if (state.isError()) {
     EXIT(1) << "Failed to recover slave state: " << state.error();
   }
@@ -2859,7 +2859,7 @@ Executor* Framework::recoverExecutor(const ExecutorState& state)
 
   // Recover the libprocess PID if possible.
   if (run.libprocessPid.isSome()) {
-    // When recovering in unsafe mode, the assumption is that the
+    // When recovering in non-strict mode, the assumption is that the
     // slave can die after checkpointing the forked pid but before the
     // libprocess pid. So, it is not possible for the libprocess pid
     // to exist but not the forked pid. If so, it is a really bad

http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 3a147f0..a9fd9c9 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -238,8 +238,8 @@ protected:
   // Reads the checkpointed data from a previous run and recovers state.
   // If 'reconnect' is true, the slave attempts to reconnect to any old
   // live executors. Otherwise, the slave attempts to shutdown/kill them.
-  // If 'safe' is true, any recovery errors are considered fatal.
-  Future<Nothing> recover(bool reconnect, bool safe);
+  // If 'strict' is true, any recovery errors are considered fatal.
+  Future<Nothing> recover(bool reconnect, bool strict);
   Future<Nothing> _recover(const state::SlaveState& state, bool reconnect);
 
   // Helper to recover a framework from the specified state.

http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index 27a2731..e910ab7 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -28,7 +28,7 @@ using std::string;
 using std::max;
 
 
-Result<SlaveState> recover(const string& rootDir, bool safe)
+Result<SlaveState> recover(const string& rootDir, bool strict)
 {
   LOG(INFO) << "Recovering state from " << rootDir;
 
@@ -51,7 +51,7 @@ Result<SlaveState> recover(const string& rootDir, bool safe)
   SlaveID slaveId;
   slaveId.set_value(os::basename(directory.get()).get());
 
-  Try<SlaveState> state = SlaveState::recover(rootDir, slaveId, safe);
+  Try<SlaveState> state = SlaveState::recover(rootDir, slaveId, strict);
   if (state.isError()) {
     return Error(state.error());
   }
@@ -63,7 +63,7 @@ Result<SlaveState> recover(const string& rootDir, bool safe)
 Try<SlaveState> SlaveState::recover(
     const string& rootDir,
     const SlaveID& slaveId,
-    bool safe)
+    bool strict)
 {
   SlaveState state;
   state.id = slaveId;
@@ -75,7 +75,7 @@ Try<SlaveState> SlaveState::recover(
   if (!slaveInfo.isSome()) {
     const string& message = "Failed to read slave info from '" + path + "': " +
                             (slaveInfo.isError() ? slaveInfo.error() : " none");
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -100,7 +100,7 @@ Try<SlaveState> SlaveState::recover(
     frameworkId.set_value(os::basename(path).get());
 
     const Try<FrameworkState>& framework =
-      FrameworkState::recover(rootDir, slaveId, frameworkId, safe);
+      FrameworkState::recover(rootDir, slaveId, frameworkId, strict);
 
     if (framework.isError()) {
       return Error("Failed to recover framework " + frameworkId.value() +
@@ -118,7 +118,7 @@ Try<FrameworkState> FrameworkState::recover(
     const string& rootDir,
     const SlaveID& slaveId,
     const FrameworkID& frameworkId,
-    bool safe)
+    bool strict)
 {
   FrameworkState state;
   state.id = frameworkId;
@@ -133,7 +133,7 @@ Try<FrameworkState> FrameworkState::recover(
     message = "Failed to read framework info from '" + path + "': " +
               (frameworkInfo.isError() ? frameworkInfo.error() : " none");
 
-      if (safe) {
+      if (strict) {
         return Error(message);
       } else {
         LOG(WARNING) << message;
@@ -151,7 +151,7 @@ Try<FrameworkState> FrameworkState::recover(
     message =
       "Failed to read framework pid from '" + path + "': " + pid.error();
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -177,7 +177,7 @@ Try<FrameworkState> FrameworkState::recover(
     executorId.set_value(os::basename(path).get());
 
     const Try<ExecutorState>& executor =
-      ExecutorState::recover(rootDir, slaveId, frameworkId, executorId, safe);
+      ExecutorState::recover(rootDir, slaveId, frameworkId, executorId, strict);
 
     if (executor.isError()) {
       return Error("Failed to recover executor " + executorId.value() +
@@ -196,7 +196,7 @@ Try<ExecutorState> ExecutorState::recover(
     const SlaveID& slaveId,
     const FrameworkID& frameworkId,
     const ExecutorID& executorId,
-    bool safe)
+    bool strict)
 {
   ExecutorState state;
   state.id = executorId;
@@ -214,7 +214,7 @@ Try<ExecutorState> ExecutorState::recover(
       "Failed to read executor info from '" + path +
       "': " + (executorInfo.isError() ? executorInfo.error() : " none");
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -254,7 +254,7 @@ Try<ExecutorState> ExecutorState::recover(
       const UUID& uuid = UUID::fromString(os::basename(path).get());
 
       const Try<RunState>& run = RunState::recover(
-          rootDir, slaveId, frameworkId, executorId, uuid, safe);
+          rootDir, slaveId, frameworkId, executorId, uuid, strict);
 
       if (run.isError()) {
        return Error("Failed to recover run " + uuid.toString() +
@@ -271,7 +271,7 @@ Try<ExecutorState> ExecutorState::recover(
     message =
       "Failed to find the latest run of executor '" + executorId.value() + "'";
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -289,7 +289,7 @@ Try<RunState> RunState::recover(
     const FrameworkID& frameworkId,
     const ExecutorID& executorId,
     const UUID& uuid,
-    bool safe)
+    bool strict)
 {
   RunState state;
   state.id = uuid;
@@ -316,7 +316,7 @@ Try<RunState> RunState::recover(
     taskId.set_value(os::basename(path).get());
 
     const Try<TaskState>& task = TaskState::recover(
-        rootDir, slaveId, frameworkId, executorId, uuid, taskId, safe);
+        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);
 
     if (task.isError()) {
       return Error(
@@ -336,7 +336,7 @@ Try<RunState> RunState::recover(
     message = "Failed to read executor's forked pid from '" + path +
               "': " + pid.error();
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -362,7 +362,7 @@ Try<RunState> RunState::recover(
     message = "Failed to read executor's libprocess pid from '" + path +
               "': " + pid.error();
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -383,7 +383,7 @@ Try<TaskState> TaskState::recover(
     const ExecutorID& executorId,
     const UUID& uuid,
     const TaskID& taskId,
-    bool safe)
+    bool strict)
 {
   TaskState state;
   state.id = taskId;
@@ -399,7 +399,7 @@ Try<TaskState> TaskState::recover(
     message = "Failed to read task info from '" + path +
               "': " + (task.isError() ? task.error() : " none");
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -420,7 +420,7 @@ Try<TaskState> TaskState::recover(
     message = "Failed to open status updates file '" + path +
               "': " + fd.error();
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -449,7 +449,7 @@ Try<TaskState> TaskState::recover(
     message = "Failed to read status updates file  '" + path +
               "': " + record.error();
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;
@@ -471,7 +471,7 @@ Try<TaskState> TaskState::recover(
     message = "Failed to close status updates file '" + path +
               "': " + close.error();
 
-    if (safe) {
+    if (strict) {
       return Error(message);
     } else {
       LOG(WARNING) << message;

http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/state.hpp
----------------------------------------------------------------------
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 1e3e894..08e3617 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -52,10 +52,10 @@ struct RunState;
 struct TaskState;
 
 // Each of the structs below (recursively) recover the checkpointed
-// state. If the 'safe' flag is set, any errors encountered while
+// state. If the 'strict' flag is set, any errors encountered while
 // recovering a state are considered fatal and hence the recovery is
 // short-circuited and returns an error. There might be orphaned
-// executors that need to be manually cleaned up. If 'safe' flag is
+// executors that need to be manually cleaned up. If 'strict' flag is
 // not set, any errors encountered are considered  non-fatal and the
 // recovery continues by recovering as much of the state as possible.
 
@@ -64,7 +64,7 @@ struct SlaveState
   static Try<SlaveState> recover(
       const std::string& rootDir,
       const SlaveID& slaveId,
-      bool safe);
+      bool strict);
 
   SlaveID id;
   Option<SlaveInfo> info;
@@ -78,7 +78,7 @@ struct FrameworkState
       const std::string& rootDir,
       const SlaveID& slaveId,
       const FrameworkID& frameworkId,
-      bool safe);
+      bool strict);
 
   FrameworkID id;
   Option<FrameworkInfo> info;
@@ -94,7 +94,7 @@ struct ExecutorState
       const SlaveID& slaveId,
       const FrameworkID& frameworkId,
       const ExecutorID& executorId,
-      bool safe);
+      bool strict);
 
   ExecutorID id;
   Option<ExecutorInfo> info;
@@ -111,7 +111,7 @@ struct RunState
       const FrameworkID& frameworkId,
       const ExecutorID& executorId,
       const UUID& uuid,
-      bool safe);
+      bool strict);
 
   Option<UUID> id;
   hashmap<TaskID, TaskState> tasks;
@@ -129,7 +129,7 @@ struct TaskState
       const ExecutorID& executorId,
       const UUID& uuid,
       const TaskID& taskId,
-      bool safe);
+      bool strict);
 
   TaskID id;
   Option<Task> info;
@@ -139,7 +139,7 @@ struct TaskState
 
 
 // This function performs recovery from the state stored at 'rootDir'.
-Result<SlaveState> recover(const std::string& rootDir, bool safe);
+Result<SlaveState> recover(const std::string& rootDir, bool strict);
 
 
 // Thin wrappers to checkpoint data to disk and perform the

http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 2480b5c..daae66e 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -126,7 +126,7 @@ public:
     // Setup recovery slave flags.
     flags.checkpoint = true;
     flags.recover = "reconnect";
-    flags.safe = false;
+    flags.strict = false;
 
     return flags;
   }