You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2019/01/11 21:34:22 UTC
[mesos] branch 1.5.x updated (052a350 -> 4c19426)
This is an automated email from the ASF dual-hosted git repository.
gilbert pushed a change to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.
from 052a350 Added MESOS-9510 and MESOS-9516 to the 1.5.2 CHANGELOG.
new efcc347 Made agent not read the forked pid and libprocess pid after reboot.
new 1ef2ef5 Updated `SlaveRecoveryTest.Reboot` to expect none pids.
new ec8bfbf Added MESOS-9502 to 1.5.2 CHANGELOG.
new 4c19426 Added MESOS-9501 to 1.5.2 CHANGELOG.
The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
CHANGELOG | 2 ++
src/slave/state.cpp | 57 +++++++++++++++++++++++++++++---------
src/slave/state.hpp | 12 +++++---
src/tests/slave_recovery_tests.cpp | 7 ++---
4 files changed, 56 insertions(+), 22 deletions(-)
[mesos] 01/04: Made agent not read the forked pid and libprocess
pid after reboot.
Posted by gi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
gilbert pushed a commit to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit efcc347bb86dfd0be469d8c0215af43435737c71
Author: Qian Zhang <zh...@gmail.com>
AuthorDate: Fri Jan 11 12:21:18 2019 -0800
Made agent not read the forked pid and libprocess pid after reboot.
After agent host is rebooted, the forked pid and libprocess pid in
agent's meta directory are obsolete, so we should not read them during
agent recovery, otherwise containerizer may wait for an irrelevant
process if the forked pid is reused by another process after reboot.
Review: https://reviews.apache.org/r/69705/
(cherry picked from commit eda091215a132a7ea758a95cd4f66945a462387e)
---
src/slave/state.cpp | 57 +++++++++++++++++++++++++++++++++++++++++------------
src/slave/state.hpp | 12 +++++++----
2 files changed, 52 insertions(+), 17 deletions(-)
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index e7cf849..ae16d6f 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -123,7 +123,9 @@ Try<State> recover(const string& rootDir, bool strict)
SlaveID slaveId;
slaveId.set_value(Path(directory.get()).basename());
- Try<SlaveState> slave = SlaveState::recover(rootDir, slaveId, strict);
+ Try<SlaveState> slave =
+ SlaveState::recover(rootDir, slaveId, strict, state.rebooted);
+
if (slave.isError()) {
return Error(slave.error());
}
@@ -137,7 +139,8 @@ Try<State> recover(const string& rootDir, bool strict)
Try<SlaveState> SlaveState::recover(
const string& rootDir,
const SlaveID& slaveId,
- bool strict)
+ bool strict,
+ bool rebooted)
{
SlaveState state;
state.id = slaveId;
@@ -188,7 +191,7 @@ Try<SlaveState> SlaveState::recover(
frameworkId.set_value(Path(path).basename());
Try<FrameworkState> framework =
- FrameworkState::recover(rootDir, slaveId, frameworkId, strict);
+ FrameworkState::recover(rootDir, slaveId, frameworkId, strict, rebooted);
if (framework.isError()) {
return Error("Failed to recover framework " + frameworkId.value() +
@@ -207,7 +210,8 @@ Try<FrameworkState> FrameworkState::recover(
const string& rootDir,
const SlaveID& slaveId,
const FrameworkID& frameworkId,
- bool strict)
+ bool strict,
+ bool rebooted)
{
FrameworkState state;
state.id = frameworkId;
@@ -295,8 +299,8 @@ Try<FrameworkState> FrameworkState::recover(
ExecutorID executorId;
executorId.set_value(Path(path).basename());
- Try<ExecutorState> executor =
- ExecutorState::recover(rootDir, slaveId, frameworkId, executorId, strict);
+ Try<ExecutorState> executor = ExecutorState::recover(
+ rootDir, slaveId, frameworkId, executorId, strict, rebooted);
if (executor.isError()) {
return Error("Failed to recover executor '" + executorId.value() +
@@ -316,7 +320,8 @@ Try<ExecutorState> ExecutorState::recover(
const SlaveID& slaveId,
const FrameworkID& frameworkId,
const ExecutorID& executorId,
- bool strict)
+ bool strict,
+ bool rebooted)
{
ExecutorState state;
state.id = executorId;
@@ -356,7 +361,13 @@ Try<ExecutorState> ExecutorState::recover(
containerId.set_value(Path(path).basename());
Try<RunState> run = RunState::recover(
- rootDir, slaveId, frameworkId, executorId, containerId, strict);
+ rootDir,
+ slaveId,
+ frameworkId,
+ executorId,
+ containerId,
+ strict,
+ rebooted);
if (run.isError()) {
return Error(
@@ -423,7 +434,8 @@ Try<RunState> RunState::recover(
const FrameworkID& frameworkId,
const ExecutorID& executorId,
const ContainerID& containerId,
- bool strict)
+ bool strict,
+ bool rebooted)
{
RunState state;
state.id = containerId;
@@ -469,18 +481,37 @@ Try<RunState> RunState::recover(
state.errors += task->errors;
}
- // Read the forked pid.
path = paths::getForkedPidPath(
rootDir, slaveId, frameworkId, executorId, containerId);
+
+ // If agent host is rebooted, we do not read the forked pid and libprocess pid
+ // since those two pids are obsolete after reboot. And we remove the forked
+ // pid file to make sure we will not read it in the case the agent process is
+ // restarted after we checkpoint the new boot ID in `Slave::__recover` (i.e.,
+ // agent recovery is done after the reboot).
+ if (rebooted) {
+ if (os::exists(path)) {
+ Try<Nothing> rm = os::rm(path);
+ if (rm.isError()) {
+ return Error(
+ "Failed to remove executor forked pid file '" + path + "': " +
+ rm.error());
+ }
+ }
+
+ return state;
+ }
+
if (!os::exists(path)) {
- // This could happen if the slave died before the isolator
- // checkpointed the forked pid.
+ // This could happen if the slave died before the containerizer checkpointed
+ // the forked pid or agent process is restarted after agent host is rebooted
+ // since we remove this file in the above code.
LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
return state;
}
+ // Read the forked pid.
Result<string> pid = state::read<string>(path);
-
if (pid.isError()) {
message = "Failed to read executor forked pid from '" + path +
"': " + pid.error();
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 003211e..37cd2f6 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -271,7 +271,8 @@ struct RunState
const FrameworkID& frameworkId,
const ExecutorID& executorId,
const ContainerID& containerId,
- bool strict);
+ bool strict,
+ bool rebooted);
Option<ContainerID> id;
hashmap<TaskID, TaskState> tasks;
@@ -298,7 +299,8 @@ struct ExecutorState
const SlaveID& slaveId,
const FrameworkID& frameworkId,
const ExecutorID& executorId,
- bool strict);
+ bool strict,
+ bool rebooted);
ExecutorID id;
Option<ExecutorInfo> info;
@@ -316,7 +318,8 @@ struct FrameworkState
const std::string& rootDir,
const SlaveID& slaveId,
const FrameworkID& frameworkId,
- bool strict);
+ bool strict,
+ bool rebooted);
FrameworkID id;
Option<FrameworkInfo> info;
@@ -351,7 +354,8 @@ struct SlaveState
static Try<SlaveState> recover(
const std::string& rootDir,
const SlaveID& slaveId,
- bool strict);
+ bool strict,
+ bool rebooted);
SlaveID id;
Option<SlaveInfo> info;
[mesos] 03/04: Added MESOS-9502 to 1.5.2 CHANGELOG.
Posted by gi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
gilbert pushed a commit to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit ec8bfbfeee7a8a62db74cbd699d5f97f5ab4e20e
Author: Gilbert Song <so...@gmail.com>
AuthorDate: Fri Jan 11 13:09:22 2019 -0800
Added MESOS-9502 to 1.5.2 CHANGELOG.
(cherry picked from commit fbb5d01275390783b2712e681c5ccebfc2a77d78)
---
CHANGELOG | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGELOG b/CHANGELOG
index 88a27a7..dd1ec1a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -62,6 +62,7 @@ Release Notes - Mesos - Version 1.5.2
* [MESOS-9308] - URI disk profile adaptor could deadlock.
* [MESOS-9334] - Container stuck at ISOLATING state due to libevent poll never returns.
* [MESOS-9480] - Master may skip processing authorization results for `LAUNCH_GROUP`.
+ * [MESOS-9502] - IOswitchboard cleanup could get stuck due to FD leak from a race.
** Improvement:
* [MESOS-9510] - Disallowed nan, inf and so on in `Value::Scalar`.
[mesos] 02/04: Updated `SlaveRecoveryTest.Reboot` to expect none
pids.
Posted by gi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
gilbert pushed a commit to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 1ef2ef592a381d756d5d00a9afe4c5e9a458bdb0
Author: Qian Zhang <zh...@gmail.com>
AuthorDate: Fri Jan 11 12:21:22 2019 -0800
Updated `SlaveRecoveryTest.Reboot` to expect none pids.
After agent host is rebooted, the recovered executor's forked pid and
libprocess pid will be `NONE()` since we do not read them from agent's
meta directory which are actually obsolete after reboot.
Review: https://reviews.apache.org/r/69716/
(cherry picked from commit 43772dfa7111ebdf759b066728736921062cc4fc)
---
src/tests/slave_recovery_tests.cpp | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 60efba5..d012d39 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -2747,11 +2747,8 @@ TYPED_TEST(SlaveRecoveryTest, Reboot)
EXPECT_TRUE(executorState.runs.contains(containerId2.get()));
- EXPECT_SOME_EQ(
- executorPid,
- executorState
- .runs[containerId2.get()]
- .libprocessPid);
+ EXPECT_NONE(executorState.runs[containerId2.get()].forkedPid);
+ EXPECT_NONE(executorState.runs[containerId2.get()].libprocessPid);
EXPECT_TRUE(executorState
.runs[containerId2.get()]
[mesos] 04/04: Added MESOS-9501 to 1.5.2 CHANGELOG.
Posted by gi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
gilbert pushed a commit to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 4c19426cd801e9dff957fc401a34d036e3e632c2
Author: Gilbert Song <so...@gmail.com>
AuthorDate: Fri Jan 11 13:11:31 2019 -0800
Added MESOS-9501 to 1.5.2 CHANGELOG.
(cherry picked from commit 98526e33ef2ec197c763f8ca1a1a7188b8d93f19)
---
CHANGELOG | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGELOG b/CHANGELOG
index dd1ec1a..51609b5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -62,6 +62,7 @@ Release Notes - Mesos - Version 1.5.2
* [MESOS-9308] - URI disk profile adaptor could deadlock.
* [MESOS-9334] - Container stuck at ISOLATING state due to libevent poll never returns.
* [MESOS-9480] - Master may skip processing authorization results for `LAUNCH_GROUP`.
+ * [MESOS-9501] - Mesos executor fails to terminate and gets stuck after agent host reboot.
* [MESOS-9502] - IOswitchboard cleanup could get stuck due to FD leak from a race.
** Improvement: