You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2019/01/11 21:33:28 UTC

[mesos] 01/11: Made agent not read the forked pid and libprocess pid after reboot.

This is an automated email from the ASF dual-hosted git repository.

gilbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit eda091215a132a7ea758a95cd4f66945a462387e
Author: Qian Zhang <zh...@gmail.com>
AuthorDate: Fri Jan 11 12:21:18 2019 -0800

    Made agent not read the forked pid and libprocess pid after reboot.
    
    After agent host is rebooted, the forked pid and libprocess pid in
    agent's meta directory are obsolete, so we should not read them during
    agent recovery, otherwise containerizer may wait for an irrelevant
    process if the forked pid is reused by another process after reboot.
    
    Review: https://reviews.apache.org/r/69705/
---
 src/slave/state.cpp | 57 +++++++++++++++++++++++++++++++++++++++++------------
 src/slave/state.hpp | 12 +++++++----
 2 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index e7cf849..ae16d6f 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -123,7 +123,9 @@ Try<State> recover(const string& rootDir, bool strict)
   SlaveID slaveId;
   slaveId.set_value(Path(directory.get()).basename());
 
-  Try<SlaveState> slave = SlaveState::recover(rootDir, slaveId, strict);
+  Try<SlaveState> slave =
+    SlaveState::recover(rootDir, slaveId, strict, state.rebooted);
+
   if (slave.isError()) {
     return Error(slave.error());
   }
@@ -137,7 +139,8 @@ Try<State> recover(const string& rootDir, bool strict)
 Try<SlaveState> SlaveState::recover(
     const string& rootDir,
     const SlaveID& slaveId,
-    bool strict)
+    bool strict,
+    bool rebooted)
 {
   SlaveState state;
   state.id = slaveId;
@@ -188,7 +191,7 @@ Try<SlaveState> SlaveState::recover(
     frameworkId.set_value(Path(path).basename());
 
     Try<FrameworkState> framework =
-      FrameworkState::recover(rootDir, slaveId, frameworkId, strict);
+      FrameworkState::recover(rootDir, slaveId, frameworkId, strict, rebooted);
 
     if (framework.isError()) {
       return Error("Failed to recover framework " + frameworkId.value() +
@@ -207,7 +210,8 @@ Try<FrameworkState> FrameworkState::recover(
     const string& rootDir,
     const SlaveID& slaveId,
     const FrameworkID& frameworkId,
-    bool strict)
+    bool strict,
+    bool rebooted)
 {
   FrameworkState state;
   state.id = frameworkId;
@@ -295,8 +299,8 @@ Try<FrameworkState> FrameworkState::recover(
     ExecutorID executorId;
     executorId.set_value(Path(path).basename());
 
-    Try<ExecutorState> executor =
-      ExecutorState::recover(rootDir, slaveId, frameworkId, executorId, strict);
+    Try<ExecutorState> executor = ExecutorState::recover(
+        rootDir, slaveId, frameworkId, executorId, strict, rebooted);
 
     if (executor.isError()) {
       return Error("Failed to recover executor '" + executorId.value() +
@@ -316,7 +320,8 @@ Try<ExecutorState> ExecutorState::recover(
     const SlaveID& slaveId,
     const FrameworkID& frameworkId,
     const ExecutorID& executorId,
-    bool strict)
+    bool strict,
+    bool rebooted)
 {
   ExecutorState state;
   state.id = executorId;
@@ -356,7 +361,13 @@ Try<ExecutorState> ExecutorState::recover(
       containerId.set_value(Path(path).basename());
 
       Try<RunState> run = RunState::recover(
-          rootDir, slaveId, frameworkId, executorId, containerId, strict);
+          rootDir,
+          slaveId,
+          frameworkId,
+          executorId,
+          containerId,
+          strict,
+          rebooted);
 
       if (run.isError()) {
         return Error(
@@ -423,7 +434,8 @@ Try<RunState> RunState::recover(
     const FrameworkID& frameworkId,
     const ExecutorID& executorId,
     const ContainerID& containerId,
-    bool strict)
+    bool strict,
+    bool rebooted)
 {
   RunState state;
   state.id = containerId;
@@ -469,18 +481,37 @@ Try<RunState> RunState::recover(
     state.errors += task->errors;
   }
 
-  // Read the forked pid.
   path = paths::getForkedPidPath(
       rootDir, slaveId, frameworkId, executorId, containerId);
+
+  // If agent host is rebooted, we do not read the forked pid and libprocess pid
+  // since those two pids are obsolete after reboot. And we remove the forked
+  // pid file to make sure we will not read it in the case the agent process is
+  // restarted after we checkpoint the new boot ID in `Slave::__recover` (i.e.,
+  // agent recovery is done after the reboot).
+  if (rebooted) {
+    if (os::exists(path)) {
+      Try<Nothing> rm = os::rm(path);
+      if (rm.isError()) {
+        return Error(
+            "Failed to remove executor forked pid file '" + path + "': " +
+            rm.error());
+      }
+    }
+
+    return state;
+  }
+
   if (!os::exists(path)) {
-    // This could happen if the slave died before the isolator
-    // checkpointed the forked pid.
+    // This could happen if the slave died before the containerizer checkpointed
+    // the forked pid or agent process is restarted after agent host is rebooted
+    // since we remove this file in the above code.
     LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
     return state;
   }
 
+  // Read the forked pid.
   Result<string> pid = state::read<string>(path);
-
   if (pid.isError()) {
     message = "Failed to read executor forked pid from '" + path +
               "': " + pid.error();
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 4f3d4ce..e2180ae 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -279,7 +279,8 @@ struct RunState
       const FrameworkID& frameworkId,
       const ExecutorID& executorId,
       const ContainerID& containerId,
-      bool strict);
+      bool strict,
+      bool rebooted);
 
   Option<ContainerID> id;
   hashmap<TaskID, TaskState> tasks;
@@ -306,7 +307,8 @@ struct ExecutorState
       const SlaveID& slaveId,
       const FrameworkID& frameworkId,
       const ExecutorID& executorId,
-      bool strict);
+      bool strict,
+      bool rebooted);
 
   ExecutorID id;
   Option<ExecutorInfo> info;
@@ -324,7 +326,8 @@ struct FrameworkState
       const std::string& rootDir,
       const SlaveID& slaveId,
       const FrameworkID& frameworkId,
-      bool strict);
+      bool strict,
+      bool rebooted);
 
   FrameworkID id;
   Option<FrameworkInfo> info;
@@ -359,7 +362,8 @@ struct SlaveState
   static Try<SlaveState> recover(
       const std::string& rootDir,
       const SlaveID& slaveId,
-      bool strict);
+      bool strict,
+      bool rebooted);
 
   SlaveID id;
   Option<SlaveInfo> info;