You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2013/08/26 18:12:28 UTC

git commit: Exposed recovery errors that are encountered in non-strict recovery mode.

Updated Branches:
  refs/heads/master c014cfb2d -> 2b86e3906


Exposed recovery errors that are encountered in non-strict
recovery mode.

Review: https://reviews.apache.org/r/13789


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2b86e390
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2b86e390
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2b86e390

Branch: refs/heads/master
Commit: 2b86e3906854bd3afa352e4b623eecfc2145466b
Parents: c014cfb
Author: Vinod Kone <vi...@twitter.com>
Authored: Fri Aug 23 17:30:58 2013 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Mon Aug 26 12:11:59 2013 -0400

----------------------------------------------------------------------
 src/slave/http.cpp  |  2 +-
 src/slave/slave.cpp |  8 +++++++-
 src/slave/slave.hpp |  3 +++
 src/slave/state.cpp | 34 ++++++++++++++++++++++++----------
 src/slave/state.hpp | 22 ++++++++++++++++++++--
 5 files changed, 55 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/http.cpp
----------------------------------------------------------------------
diff --git a/src/slave/http.cpp b/src/slave/http.cpp
index 073d092..62fbb37 100644
--- a/src/slave/http.cpp
+++ b/src/slave/http.cpp
@@ -289,7 +289,7 @@ Future<Response> Slave::Http::stats(const Request& request)
   object.values["valid_status_updates"] = slave.stats.validStatusUpdates;
   object.values["invalid_status_updates"] = slave.stats.invalidStatusUpdates;
   object.values["registered"] = slave.master ? "1" : "0";
-
+  object.values["recovery_errors"] = slave.recoveryErrors;
 
   return OK(object, request.query.get("jsonp"));
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index dd599a7..3e2c600 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -90,7 +90,8 @@ Slave::Slave(const slave::Flags& _flags,
     files(_files),
     monitor(_isolator),
     statusUpdateManager(new StatusUpdateManager()),
-    metaDir(paths::getMetaRootDir(flags.work_dir)) {}
+    metaDir(paths::getMetaRootDir(flags.work_dir)),
+    recoveryErrors(0) {}
 
 
 Slave::~Slave()
@@ -2666,6 +2667,11 @@ Future<Nothing> Slave::recover(bool reconnect, bool strict)
 
   info = state.get().info.get(); // Recover the slave info.
 
+  recoveryErrors = state.get().errors;
+  if (recoveryErrors > 0) {
+    LOG(WARNING) << "Errors encountered during recovery: " << recoveryErrors;
+  }
+
   // First, recover the frameworks and executors.
   foreachvalue (const FrameworkState& frameworkState, state.get().frameworks) {
     recoverFramework(frameworkState);

http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index aca7f03..ce2b0da 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -334,6 +334,9 @@ private:
 
   // Root meta directory containing checkpointed data.
   const std::string metaDir;
+
+  // Indicates the number of errors ignored in "--no-strict" recovery mode.
+  unsigned int recoveryErrors;
 };
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index b66aaa3..51bc8f4 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -86,6 +86,7 @@ Try<SlaveState> SlaveState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -115,6 +116,7 @@ Try<SlaveState> SlaveState::recover(
     }
 
     state.frameworks[frameworkId] = framework.get();
+    state.errors += framework.get().errors;
   }
 
   return state;
@@ -148,13 +150,14 @@ Try<FrameworkState> FrameworkState::recover(
     message = "Failed to read framework info from '" + path + "': " +
               (frameworkInfo.isError() ? frameworkInfo.error() : " none");
 
-      if (strict) {
-        return Error(message);
-      } else {
-        LOG(WARNING) << message;
-        return state;
-      }
+    if (strict) {
+      return Error(message);
+    } else {
+      LOG(WARNING) << message;
+      state.errors++;
+      return state;
     }
+  }
 
   state.info = frameworkInfo.get();
 
@@ -177,6 +180,7 @@ Try<FrameworkState> FrameworkState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -207,6 +211,7 @@ Try<FrameworkState> FrameworkState::recover(
     }
 
     state.executors[executorId] = executor.get();
+    state.errors += executor.get().errors;
   }
 
   return state;
@@ -246,6 +251,7 @@ Try<ExecutorState> ExecutorState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -285,12 +291,13 @@ Try<ExecutorState> ExecutorState::recover(
           rootDir, slaveId, frameworkId, executorId, uuid, strict);
 
       if (run.isError()) {
-       return Error("Failed to recover run " + uuid.toString() +
-                    " of executor '" + executorId.value() +
-                    "': " + run.error());
+        return Error("Failed to recover run " + uuid.toString() +
+                     " of executor '" + executorId.value() +
+                     "': " + run.error());
       }
 
       state.runs[uuid] = run.get();
+      state.errors += run.get().errors;
     }
   }
 
@@ -303,6 +310,7 @@ Try<ExecutorState> ExecutorState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -321,7 +329,6 @@ Try<RunState> RunState::recover(
 {
   RunState state;
   state.id = uuid;
-  state.completed = false;
   string message;
 
   // Find the tasks.
@@ -353,6 +360,7 @@ Try<RunState> RunState::recover(
     }
 
     state.tasks[taskId] = task.get();
+    state.errors += task.get().errors;
   }
 
   // Read the forked pid.
@@ -375,6 +383,7 @@ Try<RunState> RunState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -409,6 +418,7 @@ Try<RunState> RunState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -458,6 +468,7 @@ Try<TaskState> TaskState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -485,6 +496,7 @@ Try<TaskState> TaskState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -524,6 +536,7 @@ Try<TaskState> TaskState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }
@@ -539,6 +552,7 @@ Try<TaskState> TaskState::recover(
       return Error(message);
     } else {
       LOG(WARNING) << message;
+      state.errors++;
       return state;
     }
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/2b86e390/src/slave/state.hpp
----------------------------------------------------------------------
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 7e88705..f06cba8 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -56,11 +56,16 @@ struct TaskState;
 // recovering a state are considered fatal and hence the recovery is
 // short-circuited and returns an error. There might be orphaned
 // executors that need to be manually cleaned up. If 'strict' flag is
-// not set, any errors encountered are considered  non-fatal and the
-// recovery continues by recovering as much of the state as possible.
+// not set, any errors encountered are considered non-fatal and the
+// recovery continues by recovering as much of the state as possible,
+// while increasing the 'errors' count. Note that 'errors' on a struct
+// includes the 'errors' encountered recursively. In other words,
+// 'SlaveState.errors' is the sum total of all recovery errors.
 
 struct SlaveState
 {
+  SlaveState () : errors(0) {}
+
   static Try<SlaveState> recover(
       const std::string& rootDir,
       const SlaveID& slaveId,
@@ -69,11 +74,14 @@ struct SlaveState
   SlaveID id;
   Option<SlaveInfo> info;
   hashmap<FrameworkID, FrameworkState> frameworks;
+  unsigned int errors;
 };
 
 
 struct FrameworkState
 {
+  FrameworkState () : errors(0) {}
+
   static Try<FrameworkState> recover(
       const std::string& rootDir,
       const SlaveID& slaveId,
@@ -84,11 +92,14 @@ struct FrameworkState
   Option<FrameworkInfo> info;
   Option<process::UPID> pid;
   hashmap<ExecutorID, ExecutorState> executors;
+  unsigned int errors;
 };
 
 
 struct ExecutorState
 {
+  ExecutorState () : errors(0) {}
+
   static Try<ExecutorState> recover(
       const std::string& rootDir,
       const SlaveID& slaveId,
@@ -100,11 +111,14 @@ struct ExecutorState
   Option<ExecutorInfo> info;
   Option<UUID> latest;
   hashmap<UUID, RunState> runs;
+  unsigned int errors;
 };
 
 
 struct RunState
 {
+  RunState () : completed(false), errors(0) {}
+
   static Try<RunState> recover(
       const std::string& rootDir,
       const SlaveID& slaveId,
@@ -118,11 +132,14 @@ struct RunState
   Option<pid_t> forkedPid;
   Option<process::UPID> libprocessPid;
   bool completed; // Executor terminated and all its updates acknowledged.
+  unsigned int errors;
 };
 
 
 struct TaskState
 {
+  TaskState () : errors(0) {}
+
   static Try<TaskState> recover(
       const std::string& rootDir,
       const SlaveID& slaveId,
@@ -136,6 +153,7 @@ struct TaskState
   Option<Task> info;
   std::vector<StatusUpdate> updates;
   hashset<UUID> acks;
+  unsigned int errors;
 };