You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/08/13 21:49:57 UTC
[13/23] git commit: Changed slave command line flag 'safe' to
'strict'.
Changed slave command line flag 'safe' to 'strict'.
Review: https://reviews.apache.org/r/12084
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a6a2fe98
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a6a2fe98
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a6a2fe98
Branch: refs/heads/vinod/0.13.0
Commit: a6a2fe98f20ec2cf0ec6bd60dc7e300b0dbf9321
Parents: bf703d2
Author: Vinod Kone <vi...@twitter.com>
Authored: Tue Jun 25 12:13:43 2013 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Mon Jul 1 12:47:20 2013 -0700
----------------------------------------------------------------------
src/slave/flags.hpp | 17 +++++++------
src/slave/slave.cpp | 8 +++---
src/slave/slave.hpp | 4 +--
src/slave/state.cpp | 44 ++++++++++++++++-----------------
src/slave/state.hpp | 16 ++++++------
src/tests/slave_recovery_tests.cpp | 2 +-
6 files changed, 46 insertions(+), 45 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index 9612983..94c393e 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -135,13 +135,14 @@ public:
" and the slave registers with the master as a new slave.",
"reconnect");
- add(&Flags::safe,
- "safe",
- "Whether to ignore (safe=false) any ambiguous errors during recovery.\n"
- "An ambiguous error is one where it is impossible to differentiate\n"
- "between a safe error that can be ignored and an unsafe error that "
- "cannot be.",
- true);
+ add(&Flags::strict,
+ "strict",
+ "If strict=true, any and all recovery errors are considered fatal.\n"
+ "If strict=false, any expected errors (e.g., slave cannot recover\n"
+ "information about an executor, because the slave died right before\n"
+ "the executor registered.) during recovery are ignored and as much\n"
+ "state as possible is recovered.\n",
+ false);
#ifdef __linux__
add(&Flags::cgroups_hierarchy,
@@ -182,7 +183,7 @@ public:
Duration resource_monitoring_interval;
bool checkpoint;
std::string recover;
- bool safe;
+ bool strict;
#ifdef __linux__
std::string cgroups_hierarchy;
std::string cgroups_root;
http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index aada544..c56dafc 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -371,7 +371,7 @@ void Slave::initialize()
}
// Start recovery.
- recover(flags.recover == "reconnect", flags.safe)
+ recover(flags.recover == "reconnect", flags.strict)
.onAny(defer(self(), &Slave::_initialize, params::_1));
}
@@ -2512,7 +2512,7 @@ void Slave::_checkDiskUsage(const Future<Try<double> >& usage)
}
-Future<Nothing> Slave::recover(bool reconnect, bool safe)
+Future<Nothing> Slave::recover(bool reconnect, bool strict)
{
const string& metaDir = paths::getMetaRootDir(flags.work_dir);
@@ -2526,7 +2526,7 @@ Future<Nothing> Slave::recover(bool reconnect, bool safe)
}
// First, recover the slave state.
- Result<SlaveState> state = state::recover(metaDir, safe);
+ Result<SlaveState> state = state::recover(metaDir, strict);
if (state.isError()) {
EXIT(1) << "Failed to recover slave state: " << state.error();
}
@@ -2859,7 +2859,7 @@ Executor* Framework::recoverExecutor(const ExecutorState& state)
// Recover the libprocess PID if possible.
if (run.libprocessPid.isSome()) {
- // When recovering in unsafe mode, the assumption is that the
+ // When recovering in non-strict mode, the assumption is that the
// slave can die after checkpointing the forked pid but before the
// libprocess pid. So, it is not possible for the libprocess pid
// to exist but not the forked pid. If so, it is a really bad
http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 3a147f0..a9fd9c9 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -238,8 +238,8 @@ protected:
// Reads the checkpointed data from a previous run and recovers state.
// If 'reconnect' is true, the slave attempts to reconnect to any old
// live executors. Otherwise, the slave attempts to shutdown/kill them.
- // If 'safe' is true, any recovery errors are considered fatal.
- Future<Nothing> recover(bool reconnect, bool safe);
+ // If 'strict' is true, any recovery errors are considered fatal.
+ Future<Nothing> recover(bool reconnect, bool strict);
Future<Nothing> _recover(const state::SlaveState& state, bool reconnect);
// Helper to recover a framework from the specified state.
http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index 27a2731..e910ab7 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -28,7 +28,7 @@ using std::string;
using std::max;
-Result<SlaveState> recover(const string& rootDir, bool safe)
+Result<SlaveState> recover(const string& rootDir, bool strict)
{
LOG(INFO) << "Recovering state from " << rootDir;
@@ -51,7 +51,7 @@ Result<SlaveState> recover(const string& rootDir, bool safe)
SlaveID slaveId;
slaveId.set_value(os::basename(directory.get()).get());
- Try<SlaveState> state = SlaveState::recover(rootDir, slaveId, safe);
+ Try<SlaveState> state = SlaveState::recover(rootDir, slaveId, strict);
if (state.isError()) {
return Error(state.error());
}
@@ -63,7 +63,7 @@ Result<SlaveState> recover(const string& rootDir, bool safe)
Try<SlaveState> SlaveState::recover(
const string& rootDir,
const SlaveID& slaveId,
- bool safe)
+ bool strict)
{
SlaveState state;
state.id = slaveId;
@@ -75,7 +75,7 @@ Try<SlaveState> SlaveState::recover(
if (!slaveInfo.isSome()) {
const string& message = "Failed to read slave info from '" + path + "': " +
(slaveInfo.isError() ? slaveInfo.error() : " none");
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -100,7 +100,7 @@ Try<SlaveState> SlaveState::recover(
frameworkId.set_value(os::basename(path).get());
const Try<FrameworkState>& framework =
- FrameworkState::recover(rootDir, slaveId, frameworkId, safe);
+ FrameworkState::recover(rootDir, slaveId, frameworkId, strict);
if (framework.isError()) {
return Error("Failed to recover framework " + frameworkId.value() +
@@ -118,7 +118,7 @@ Try<FrameworkState> FrameworkState::recover(
const string& rootDir,
const SlaveID& slaveId,
const FrameworkID& frameworkId,
- bool safe)
+ bool strict)
{
FrameworkState state;
state.id = frameworkId;
@@ -133,7 +133,7 @@ Try<FrameworkState> FrameworkState::recover(
message = "Failed to read framework info from '" + path + "': " +
(frameworkInfo.isError() ? frameworkInfo.error() : " none");
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -151,7 +151,7 @@ Try<FrameworkState> FrameworkState::recover(
message =
"Failed to read framework pid from '" + path + "': " + pid.error();
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -177,7 +177,7 @@ Try<FrameworkState> FrameworkState::recover(
executorId.set_value(os::basename(path).get());
const Try<ExecutorState>& executor =
- ExecutorState::recover(rootDir, slaveId, frameworkId, executorId, safe);
+ ExecutorState::recover(rootDir, slaveId, frameworkId, executorId, strict);
if (executor.isError()) {
return Error("Failed to recover executor " + executorId.value() +
@@ -196,7 +196,7 @@ Try<ExecutorState> ExecutorState::recover(
const SlaveID& slaveId,
const FrameworkID& frameworkId,
const ExecutorID& executorId,
- bool safe)
+ bool strict)
{
ExecutorState state;
state.id = executorId;
@@ -214,7 +214,7 @@ Try<ExecutorState> ExecutorState::recover(
"Failed to read executor info from '" + path +
"': " + (executorInfo.isError() ? executorInfo.error() : " none");
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -254,7 +254,7 @@ Try<ExecutorState> ExecutorState::recover(
const UUID& uuid = UUID::fromString(os::basename(path).get());
const Try<RunState>& run = RunState::recover(
- rootDir, slaveId, frameworkId, executorId, uuid, safe);
+ rootDir, slaveId, frameworkId, executorId, uuid, strict);
if (run.isError()) {
return Error("Failed to recover run " + uuid.toString() +
@@ -271,7 +271,7 @@ Try<ExecutorState> ExecutorState::recover(
message =
"Failed to find the latest run of executor '" + executorId.value() + "'";
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -289,7 +289,7 @@ Try<RunState> RunState::recover(
const FrameworkID& frameworkId,
const ExecutorID& executorId,
const UUID& uuid,
- bool safe)
+ bool strict)
{
RunState state;
state.id = uuid;
@@ -316,7 +316,7 @@ Try<RunState> RunState::recover(
taskId.set_value(os::basename(path).get());
const Try<TaskState>& task = TaskState::recover(
- rootDir, slaveId, frameworkId, executorId, uuid, taskId, safe);
+ rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);
if (task.isError()) {
return Error(
@@ -336,7 +336,7 @@ Try<RunState> RunState::recover(
message = "Failed to read executor's forked pid from '" + path +
"': " + pid.error();
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -362,7 +362,7 @@ Try<RunState> RunState::recover(
message = "Failed to read executor's libprocess pid from '" + path +
"': " + pid.error();
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -383,7 +383,7 @@ Try<TaskState> TaskState::recover(
const ExecutorID& executorId,
const UUID& uuid,
const TaskID& taskId,
- bool safe)
+ bool strict)
{
TaskState state;
state.id = taskId;
@@ -399,7 +399,7 @@ Try<TaskState> TaskState::recover(
message = "Failed to read task info from '" + path +
"': " + (task.isError() ? task.error() : " none");
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -420,7 +420,7 @@ Try<TaskState> TaskState::recover(
message = "Failed to open status updates file '" + path +
"': " + fd.error();
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -449,7 +449,7 @@ Try<TaskState> TaskState::recover(
message = "Failed to read status updates file '" + path +
"': " + record.error();
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
@@ -471,7 +471,7 @@ Try<TaskState> TaskState::recover(
message = "Failed to close status updates file '" + path +
"': " + close.error();
- if (safe) {
+ if (strict) {
return Error(message);
} else {
LOG(WARNING) << message;
http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/slave/state.hpp
----------------------------------------------------------------------
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 1e3e894..08e3617 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -52,10 +52,10 @@ struct RunState;
struct TaskState;
// Each of the structs below (recursively) recover the checkpointed
-// state. If the 'safe' flag is set, any errors encountered while
+// state. If the 'strict' flag is set, any errors encountered while
// recovering a state are considered fatal and hence the recovery is
// short-circuited and returns an error. There might be orphaned
-// executors that need to be manually cleaned up. If 'safe' flag is
+// executors that need to be manually cleaned up. If 'strict' flag is
// not set, any errors encountered are considered non-fatal and the
// recovery continues by recovering as much of the state as possible.
@@ -64,7 +64,7 @@ struct SlaveState
static Try<SlaveState> recover(
const std::string& rootDir,
const SlaveID& slaveId,
- bool safe);
+ bool strict);
SlaveID id;
Option<SlaveInfo> info;
@@ -78,7 +78,7 @@ struct FrameworkState
const std::string& rootDir,
const SlaveID& slaveId,
const FrameworkID& frameworkId,
- bool safe);
+ bool strict);
FrameworkID id;
Option<FrameworkInfo> info;
@@ -94,7 +94,7 @@ struct ExecutorState
const SlaveID& slaveId,
const FrameworkID& frameworkId,
const ExecutorID& executorId,
- bool safe);
+ bool strict);
ExecutorID id;
Option<ExecutorInfo> info;
@@ -111,7 +111,7 @@ struct RunState
const FrameworkID& frameworkId,
const ExecutorID& executorId,
const UUID& uuid,
- bool safe);
+ bool strict);
Option<UUID> id;
hashmap<TaskID, TaskState> tasks;
@@ -129,7 +129,7 @@ struct TaskState
const ExecutorID& executorId,
const UUID& uuid,
const TaskID& taskId,
- bool safe);
+ bool strict);
TaskID id;
Option<Task> info;
@@ -139,7 +139,7 @@ struct TaskState
// This function performs recovery from the state stored at 'rootDir'.
-Result<SlaveState> recover(const std::string& rootDir, bool safe);
+Result<SlaveState> recover(const std::string& rootDir, bool strict);
// Thin wrappers to checkpoint data to disk and perform the
http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a2fe98/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 2480b5c..daae66e 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -126,7 +126,7 @@ public:
// Setup recovery slave flags.
flags.checkpoint = true;
flags.recover = "reconnect";
- flags.safe = false;
+ flags.strict = false;
return flags;
}