You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/12/21 20:06:52 UTC

[1/5] mesos git commit: Simplified `State::recover` interface in the agent.

Repository: mesos
Updated Branches:
  refs/heads/master 477275744 -> 744e7aa9b


Simplified `State::recover` interface in the agent.

`State::recover` previously returned `Result<State>`, which implied
three possibilities: Error, None (if the `work_dir` doesn't exist), or a
`State` value. However, the agent code treated the latter two cases the
same way: if the `work_dir` doesn't exist, it is simpler to represent
this case by returning a `State` value whose fields are `None()`.

Review: https://reviews.apache.org/r/53233/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e5fe8119
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e5fe8119
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e5fe8119

Branch: refs/heads/master
Commit: e5fe81195b3708f8cabe683c0aa53ae2433506eb
Parents: 4772757
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Dec 21 15:06:07 2016 -0500
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Dec 21 15:06:07 2016 -0500

----------------------------------------------------------------------
 src/slave/slave.cpp | 10 +++-------
 src/slave/slave.hpp |  2 +-
 src/slave/state.cpp |  9 ++++-----
 src/slave/state.hpp | 11 +++++------
 4 files changed, 13 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e5fe8119/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index a7a3a39..c72959b 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -5181,18 +5181,14 @@ void Slave::_checkDiskUsage(const Future<double>& usage)
 }
 
 
-Future<Nothing> Slave::recover(const Result<state::State>& state)
+Future<Nothing> Slave::recover(const Try<state::State>& state)
 {
   if (state.isError()) {
     return Failure(state.error());
   }
 
-  Option<ResourcesState> resourcesState;
-  Option<SlaveState> slaveState;
-  if (state.isSome()) {
-    resourcesState = state.get().resources;
-    slaveState = state.get().slave;
-  }
+  Option<ResourcesState> resourcesState = state->resources;
+  Option<SlaveState> slaveState = state->slave;
 
   // Recover checkpointed resources.
   // NOTE: 'resourcesState' is None if the slave rootDir does not

http://git-wip-us.apache.org/repos/asf/mesos/blob/e5fe8119/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index 03860b5..5b4a271 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -390,7 +390,7 @@ public:
   void checkDiskUsage();
 
   // Recovers the slave, status update manager and isolator.
-  process::Future<Nothing> recover(const Result<state::State>& state);
+  process::Future<Nothing> recover(const Try<state::State>& state);
 
   // This is called after 'recover()'. If 'flags.reconnect' is
   // 'reconnect', the slave attempts to reconnect to any old live

http://git-wip-us.apache.org/repos/asf/mesos/blob/e5fe8119/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index a94bb8d..8656694 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -52,20 +52,19 @@ using std::string;
 using std::max;
 
 
-Result<State> recover(const string& rootDir, bool strict)
+Try<State> recover(const string& rootDir, bool strict)
 {
   LOG(INFO) << "Recovering state from '" << rootDir << "'";
 
+  State state;
+
   // We consider the absence of 'rootDir' to mean that this is either
   // the first time this slave was started or this slave was started after
   // an upgrade (--recover=cleanup).
   if (!os::exists(rootDir)) {
-    return None();
+    return state;
   }
 
-  // Now, start to recover state from 'rootDir'.
-  State state;
-
   // Recover resources regardless whether the host has rebooted.
   Try<ResourcesState> resources = ResourcesState::recover(rootDir, strict);
   if (resources.isError()) {

http://git-wip-us.apache.org/repos/asf/mesos/blob/e5fe8119/src/slave/state.hpp
----------------------------------------------------------------------
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index a1c8496..8c57fd6 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -64,14 +64,13 @@ struct TaskState;
 // If the 'strict' flag is set, any errors encountered while
 // recovering a state are considered fatal and hence the recovery is
 // short-circuited and returns an error. There might be orphaned
-// executors that need to be manually cleaned up. If 'strict' flag is
-// not set, any errors encountered are considered non-fatal and the
+// executors that need to be manually cleaned up. If the 'strict' flag
+// is not set, any errors encountered are considered non-fatal and the
 // recovery continues by recovering as much of the state as possible,
 // while increasing the 'errors' count. Note that 'errors' on a struct
 // includes the 'errors' encountered recursively. In other words,
-// 'State.errors' is the sum total of all recovery errors. If the
-// machine has rebooted since the last slave run, None is returned.
-Result<State> recover(const std::string& rootDir, bool strict);
+// 'State.errors' is the sum total of all recovery errors.
+Try<State> recover(const std::string& rootDir, bool strict);
 
 
 namespace internal {
@@ -312,7 +311,7 @@ struct State
 
   // TODO(jieyu): Consider using a vector of Option<Error> here so
   // that we can print all the errors. This also applies to all the
-  // State structs below.
+  // State structs above.
   unsigned int errors;
 };
 


[4/5] mesos git commit: Cleaned up Master::markUnreachableAfterFailover.

Posted by vi...@apache.org.
Cleaned up Master::markUnreachableAfterFailover.

Review: https://reviews.apache.org/r/53236/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f076bb8c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f076bb8c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f076bb8c

Branch: refs/heads/master
Commit: f076bb8c795cb270c68d79478b252302c8a4de31
Parents: fcf38bf
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Dec 21 15:06:25 2016 -0500
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Dec 21 15:06:25 2016 -0500

----------------------------------------------------------------------
 src/master/master.cpp | 22 +++++++++++-----------
 src/master/master.hpp |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f076bb8c/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 46ca84a..bbd16d4 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1861,7 +1861,7 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
     const string failure = "Agent removal rate limit acquisition failed";
 
     acquire
-      .then(defer(self(), &Self::markUnreachableAfterFailover, slave))
+      .then(defer(self(), &Self::markUnreachableAfterFailover, slave.info()))
       .onFailed(lambda::bind(fail, failure, lambda::_1))
       .onDiscarded(lambda::bind(fail, failure, "discarded"));
 
@@ -1870,13 +1870,13 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
 }
 
 
-Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
+Nothing Master::markUnreachableAfterFailover(const SlaveInfo& slave)
 {
   // The slave might have reregistered while we were waiting to
   // acquire the rate limit.
-  if (!slaves.recovered.contains(slave.info().id())) {
+  if (!slaves.recovered.contains(slave.id())) {
     LOG(INFO) << "Canceling transition of agent "
-              << slave.info().id() << " (" << slave.info().hostname() << ")"
+              << slave.id() << " (" << slave.hostname() << ")"
               << " to unreachable because it re-registered";
 
     ++metrics->slave_unreachable_canceled;
@@ -1884,17 +1884,17 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
   }
 
   // The slave might be in the process of reregistering.
-  if (slaves.reregistering.contains(slave.info().id())) {
+  if (slaves.reregistering.contains(slave.id())) {
     LOG(INFO) << "Canceling transition of agent "
-              << slave.info().id() << " (" << slave.info().hostname() << ")"
+              << slave.id() << " (" << slave.hostname() << ")"
               << " to unreachable because it is re-registering";
 
     ++metrics->slave_unreachable_canceled;
     return Nothing();
   }
 
-  LOG(WARNING) << "Agent " << slave.info().id()
-               << " (" << slave.info().hostname() << ") did not re-register"
+  LOG(WARNING) << "Agent " << slave.id()
+               << " (" << slave.hostname() << ") did not re-register"
                << " within " << flags.agent_reregister_timeout
                << " after master failover; marking it unreachable";
 
@@ -1902,13 +1902,13 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
 
   TimeInfo unreachableTime = protobuf::getCurrentTime();
 
-  slaves.markingUnreachable.insert(slave.info().id());
+  slaves.markingUnreachable.insert(slave.id());
 
   registrar->apply(Owned<Operation>(
-          new MarkSlaveUnreachable(slave.info(), unreachableTime)))
+          new MarkSlaveUnreachable(slave, unreachableTime)))
     .onAny(defer(self(),
                  &Self::_markUnreachableAfterFailover,
-                 slave.info(),
+                 slave,
                  unreachableTime,
                  lambda::_1));
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/f076bb8c/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 89b3c39..a8bb43b 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -710,7 +710,7 @@ protected:
 
   // Mark a slave as unreachable in the registry. Called when the slave
   // does not re-register in time after a master failover.
-  Nothing markUnreachableAfterFailover(const Registry::Slave& slave);
+  Nothing markUnreachableAfterFailover(const SlaveInfo& slave);
 
   void _markUnreachableAfterFailover(
       const SlaveInfo& slaveInfo,


[5/5] mesos git commit: Improved slave recovery tests.

Posted by vi...@apache.org.
Improved slave recovery tests.

Check metrics, simplify/remove redundant code.

Review: https://reviews.apache.org/r/53237/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/744e7aa9
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/744e7aa9
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/744e7aa9

Branch: refs/heads/master
Commit: 744e7aa9b66cad0f4c0541380f03d0823d7f5c0f
Parents: f076bb8
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Dec 21 15:06:34 2016 -0500
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Dec 21 15:06:34 2016 -0500

----------------------------------------------------------------------
 .../docker_containerizer_tests.cpp              |  2 -
 src/tests/slave_recovery_tests.cpp              | 98 +++++++++++---------
 2 files changed, 54 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/744e7aa9/src/tests/containerizer/docker_containerizer_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/docker_containerizer_tests.cpp b/src/tests/containerizer/docker_containerizer_tests.cpp
index 6035171..4e3b67b 100644
--- a/src/tests/containerizer/docker_containerizer_tests.cpp
+++ b/src/tests/containerizer/docker_containerizer_tests.cpp
@@ -2691,7 +2691,6 @@ TEST_F(DockerContainerizerTest, ROOT_DOCKER_SlaveRecoveryTaskContainer)
 
   // Ensure the executor re-registers.
   AWAIT_READY(reregisterExecutorMessage);
-  UPID executorPid = reregisterExecutorMessage.get().from;
 
   ReregisterExecutorMessage reregister;
   reregister.ParseFromString(reregisterExecutorMessage.get().body);
@@ -2880,7 +2879,6 @@ TEST_F(DockerContainerizerTest,
 
   // Ensure the executor re-registers.
   AWAIT_READY(reregisterExecutorMessage);
-  UPID executorPid = reregisterExecutorMessage.get().from;
 
   ReregisterExecutorMessage reregister;
   reregister.ParseFromString(reregisterExecutorMessage.get().body);

http://git-wip-us.apache.org/repos/asf/mesos/blob/744e7aa9/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 5b86c06..2e14f8a 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -242,6 +242,7 @@ TYPED_TEST(SlaveRecoveryTest, RecoverSlaveState)
 
   // Capture the executor pids.
   AWAIT_READY(registerExecutorMessage);
+
   RegisterExecutorMessage registerExecutor;
   registerExecutor.ParseFromString(registerExecutorMessage.get().body);
   ExecutorID executorId = registerExecutor.executor_id();
@@ -400,9 +401,7 @@ TYPED_TEST(SlaveRecoveryTest, RecoverStatusUpdateManager)
 
   driver.launchTasks(offers.get()[0].id(), {task});
 
-  // Capture the executor pid.
   AWAIT_READY(registerExecutor);
-  UPID executorPid = registerExecutor.get().from;
 
   // Wait for the status update drop.
   AWAIT_READY(update);
@@ -774,8 +773,8 @@ TYPED_TEST(SlaveRecoveryTest, ReconnectExecutor)
 
   slave.get()->terminate();
 
-  Future<Message> reregisterExecutorMessage =
-    FUTURE_MESSAGE(Eq(ReregisterExecutorMessage().GetTypeName()), _, _);
+  Future<ReregisterExecutorMessage> reregister =
+    FUTURE_PROTOBUF(ReregisterExecutorMessage(), _, _);
 
   Future<TaskStatus> status;
   EXPECT_CALL(sched, statusUpdate(_, _))
@@ -791,15 +790,11 @@ TYPED_TEST(SlaveRecoveryTest, ReconnectExecutor)
   ASSERT_SOME(slave);
 
   // Ensure the executor re-registers.
-  AWAIT_READY(reregisterExecutorMessage);
-  UPID executorPid = reregisterExecutorMessage.get().from;
-
-  ReregisterExecutorMessage reregister;
-  reregister.ParseFromString(reregisterExecutorMessage.get().body);
+  AWAIT_READY(reregister);
 
   // Executor should inform about the unacknowledged update.
-  EXPECT_EQ(1, reregister.updates_size());
-  const StatusUpdate& update = reregister.updates(0);
+  ASSERT_EQ(1, reregister->updates_size());
+  const StatusUpdate& update = reregister->updates(0);
   EXPECT_EQ(task.task_id(), update.status().task_id());
   EXPECT_EQ(TASK_RUNNING, update.status().state());
 
@@ -982,7 +977,6 @@ TYPED_TEST(SlaveRecoveryTest, RecoverUnregisteredExecutor)
 
   // Stop the slave before the executor is registered.
   AWAIT_READY(registerExecutor);
-  UPID executorPid = registerExecutor.get().from;
 
   slave.get()->terminate();
 
@@ -1230,12 +1224,12 @@ TYPED_TEST(SlaveRecoveryTest, RecoverTerminatedHTTPExecutor)
   slave::state::FrameworkState frameworkState =
     state.get().slave.get().frameworks.get(frameworkId.get()).get();
 
-  EXPECT_EQ(1u, frameworkState.executors.size());
+  ASSERT_EQ(1u, frameworkState.executors.size());
 
   slave::state::ExecutorState executorState =
     frameworkState.executors.begin()->second;
 
-  EXPECT_EQ(1u, executorState.runs.size());
+  ASSERT_EQ(1u, executorState.runs.size());
 
   slave::state::RunState runState = executorState.runs.begin()->second;
 
@@ -2304,34 +2298,35 @@ TYPED_TEST(SlaveRecoveryTest, Reboot)
   TaskInfo task = createTask(offers1.get()[0], "sleep 1000");
 
   // Capture the slave and framework ids.
-  SlaveID slaveId = offers1.get()[0].slave_id();
+  SlaveID slaveId1 = offers1.get()[0].slave_id();
   FrameworkID frameworkId = offers1.get()[0].framework_id();
 
   Future<Message> registerExecutorMessage =
     FUTURE_MESSAGE(Eq(RegisterExecutorMessage().GetTypeName()), _, _);
 
-  Future<Nothing> status;
+  Future<Nothing> runningStatus;
   EXPECT_CALL(sched, statusUpdate(_, _))
-    .WillOnce(FutureSatisfy(&status))
+    .WillOnce(FutureSatisfy(&runningStatus))
     .WillRepeatedly(Return()); // Ignore subsequent updates.
 
   driver.launchTasks(offers1.get()[0].id(), {task});
 
   // Capture the executor ID and PID.
   AWAIT_READY(registerExecutorMessage);
+
   RegisterExecutorMessage registerExecutor;
   registerExecutor.ParseFromString(registerExecutorMessage.get().body);
   ExecutorID executorId = registerExecutor.executor_id();
   UPID executorPid = registerExecutorMessage.get().from;
 
   // Wait for TASK_RUNNING update.
-  AWAIT_READY(status);
+  AWAIT_READY(runningStatus);
 
   // Capture the container ID.
   Future<hashset<ContainerID>> containers = containerizer->containers();
 
   AWAIT_READY(containers);
-  EXPECT_EQ(1u, containers.get().size());
+  ASSERT_EQ(1u, containers.get().size());
 
   ContainerID containerId = *containers.get().begin();
 
@@ -2341,7 +2336,7 @@ TYPED_TEST(SlaveRecoveryTest, Reboot)
   // reboot.
   string pidPath = paths::getForkedPidPath(
         paths::getMetaRootDir(flags.work_dir),
-        slaveId,
+        slaveId1,
         frameworkId,
         executorId,
         containerId);
@@ -2364,8 +2359,8 @@ TYPED_TEST(SlaveRecoveryTest, Reboot)
       paths::getBootIdPath(paths::getMetaRootDir(flags.work_dir)),
       "rebooted! ;)"));
 
-  Future<RegisterSlaveMessage> registerSlave =
-    FUTURE_PROTOBUF(RegisterSlaveMessage(), _, _);
+  Future<SlaveRegisteredMessage> slaveRegistered =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
 
   // Restart the slave (use same flags) with a new containerizer.
   _containerizer = TypeParam::create(flags, true, &fetcher);
@@ -2380,13 +2375,25 @@ TYPED_TEST(SlaveRecoveryTest, Reboot)
   slave = this->StartSlave(detector.get(), containerizer.get(), flags);
   ASSERT_SOME(slave);
 
-  AWAIT_READY(registerSlave);
+  AWAIT_READY(slaveRegistered);
+
+  SlaveID slaveId2 = slaveRegistered.get().slave_id();
+
+  EXPECT_NE(slaveId1, slaveId2);
 
   // Make sure all slave resources are reoffered.
   AWAIT_READY(offers2);
   EXPECT_EQ(Resources(offers1.get()[0].resources()),
             Resources(offers2.get()[0].resources()));
 
+  // The old agent ID is not removed (MESOS-5396).
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(0, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(0, stats.values["master/slave_unreachable_scheduled"]);
+  EXPECT_EQ(0, stats.values["master/slave_unreachable_completed"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_registered"]);
+
   driver.stop();
   driver.join();
 }
@@ -2442,8 +2449,8 @@ TYPED_TEST(SlaveRecoveryTest, GCExecutor)
   SlaveID slaveId = offers1.get()[0].slave_id();
   FrameworkID frameworkId = offers1.get()[0].framework_id();
 
-  Future<Message> registerExecutorMessage =
-    FUTURE_MESSAGE(Eq(RegisterExecutorMessage().GetTypeName()), _, _);
+  Future<RegisterExecutorMessage> registerExecutor =
+    FUTURE_PROTOBUF(RegisterExecutorMessage(), _, _);
 
   Future<Nothing> status;
   EXPECT_CALL(sched, statusUpdate(_, _))
@@ -2452,12 +2459,9 @@ TYPED_TEST(SlaveRecoveryTest, GCExecutor)
 
   driver.launchTasks(offers1.get()[0].id(), {task});
 
-  // Capture the executor id and pid.
-  AWAIT_READY(registerExecutorMessage);
-  RegisterExecutorMessage registerExecutor;
-  registerExecutor.ParseFromString(registerExecutorMessage.get().body);
-  ExecutorID executorId = registerExecutor.executor_id();
-  UPID executorPid = registerExecutorMessage.get().from;
+  // Capture the executor id.
+  AWAIT_READY(registerExecutor);
+  ExecutorID executorId = registerExecutor->executor_id();
 
   // Wait for TASK_RUNNING update.
   AWAIT_READY(status);
@@ -2819,11 +2823,7 @@ TYPED_TEST(SlaveRecoveryTest, RegisterDisconnectedSlave)
 
   driver.launchTasks(offers.get()[0].id(), {task});
 
-  // Capture the executor pid.
   AWAIT_READY(registerExecutorMessage);
-  RegisterExecutorMessage registerExecutor;
-  registerExecutor.ParseFromString(registerExecutorMessage.get().body);
-  UPID executorPid = registerExecutorMessage.get().from;
 
   // Wait for TASK_RUNNING update.
   AWAIT_READY(status);
@@ -2831,7 +2831,9 @@ TYPED_TEST(SlaveRecoveryTest, RegisterDisconnectedSlave)
   EXPECT_CALL(sched, slaveLost(_, _))
     .Times(AtMost(1));
 
-  slave.get()->terminate();
+  UPID slavePid = slave.get()->pid;
+
+  slave->reset();
 
   Future<TaskStatus> status2;
   EXPECT_CALL(sched, statusUpdate(_, _))
@@ -2840,7 +2842,7 @@ TYPED_TEST(SlaveRecoveryTest, RegisterDisconnectedSlave)
   // Spoof the registration attempt of a slave that failed recovery.
   // We do this because simply restarting the slave will result in a slave
   // with a different pid than the previous one.
-  post(slave.get()->pid, master.get()->pid, registerSlaveMessage.get());
+  post(slavePid, master.get()->pid, registerSlaveMessage.get());
 
   // Scheduler should get a TASK_LOST message.
   AWAIT_READY(status2);
@@ -2849,6 +2851,14 @@ TYPED_TEST(SlaveRecoveryTest, RegisterDisconnectedSlave)
   EXPECT_EQ(TaskStatus::SOURCE_MASTER, status2.get().source());
   EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, status2.get().reason());
 
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(1, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(0, stats.values["master/tasks_unreachable"]);
+  EXPECT_EQ(0, stats.values["master/slave_unreachable_scheduled"]);
+  EXPECT_EQ(0, stats.values["master/slave_unreachable_completed"]);
+  EXPECT_EQ(1, stats.values["master/slave_removals"]);
+  EXPECT_EQ(1, stats.values["master/slave_removals/reason_registered"]);
+
   driver.stop();
   driver.join();
 
@@ -3202,12 +3212,12 @@ TYPED_TEST(SlaveRecoveryTest, ReconcileTasksMissingFromSlave)
   slave::state::FrameworkState frameworkState =
     state.get().slave.get().frameworks.get(frameworkId.get()).get();
 
-  EXPECT_EQ(1u, frameworkState.executors.size());
+  ASSERT_EQ(1u, frameworkState.executors.size());
 
   slave::state::ExecutorState executorState =
     frameworkState.executors.begin()->second;
 
-  EXPECT_EQ(1u, executorState.runs.size());
+  ASSERT_EQ(1u, executorState.runs.size());
 
   slave::state::RunState runState = executorState.runs.begin()->second;
 
@@ -3796,7 +3806,7 @@ TYPED_TEST(SlaveRecoveryTest, MultipleSlaves)
   ASSERT_SOME(slave1);
 
   AWAIT_READY(offers1);
-  EXPECT_EQ(1u, offers1.get().size());
+  ASSERT_FALSE(offers1->empty());
 
   // Launch a long running task in the first slave.
   TaskInfo task1 = createTask(offers1.get()[0], "sleep 1000");
@@ -3837,7 +3847,7 @@ TYPED_TEST(SlaveRecoveryTest, MultipleSlaves)
   ASSERT_SOME(slave2);
 
   AWAIT_READY(offers2);
-  EXPECT_EQ(1u, offers2.get().size());
+  ASSERT_FALSE(offers2->empty());
 
   // Launch a long running task in each slave.
   TaskInfo task2 = createTask(offers2.get()[0], "sleep 1000");
@@ -4102,7 +4112,7 @@ TEST_F(MesosContainerizerSlaveRecoveryTest, ResourceStatistics)
 
   Future<hashset<ContainerID>> containers = containerizer->containers();
   AWAIT_READY(containers);
-  EXPECT_EQ(1u, containers.get().size());
+  ASSERT_EQ(1u, containers.get().size());
 
   ContainerID containerId = *(containers.get().begin());
 
@@ -4194,7 +4204,7 @@ TEST_F(MesosContainerizerSlaveRecoveryTest, CGROUPS_ROOT_PidNamespaceForward)
 
   Future<hashset<ContainerID>> containers = containerizer->containers();
   AWAIT_READY(containers);
-  EXPECT_EQ(1u, containers.get().size());
+  ASSERT_EQ(1u, containers.get().size());
 
   ContainerID containerId = *(containers.get().begin());
 
@@ -4300,7 +4310,7 @@ TEST_F(MesosContainerizerSlaveRecoveryTest, CGROUPS_ROOT_PidNamespaceBackward)
 
   Future<hashset<ContainerID>> containers = containerizer->containers();
   AWAIT_READY(containers);
-  EXPECT_EQ(1u, containers.get().size());
+  ASSERT_EQ(1u, containers.get().size());
 
   ContainerID containerId = *(containers.get().begin());
 


[2/5] mesos git commit: Logged warning message if reading previous boot ID fails.

Posted by vi...@apache.org.
Logged warning message if reading previous boot ID fails.

Not much we can do here, but as a general rule, we should never silently
ignore I/O errors.

Review: https://reviews.apache.org/r/53234/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c9807d60
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c9807d60
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c9807d60

Branch: refs/heads/master
Commit: c9807d60ad58a44513c812a6b6c08fbd352b441f
Parents: e5fe811
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Dec 21 15:06:13 2016 -0500
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Dec 21 15:06:13 2016 -0500

----------------------------------------------------------------------
 src/slave/state.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c9807d60/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index 8656694..7926117 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -75,11 +75,14 @@ Try<State> recover(const string& rootDir, bool strict)
   // resources checkpoint file.
   state.resources = resources.get();
 
-  // Did the machine reboot? No need to recover slave state if the
-  // machine has rebooted.
-  if (os::exists(paths::getBootIdPath(rootDir))) {
-    Try<string> read = os::read(paths::getBootIdPath(rootDir));
-    if (read.isSome()) {
+  // If the machine has rebooted, skip recovering slave state.
+  const string& bootIdPath = paths::getBootIdPath(rootDir);
+  if (os::exists(bootIdPath)) {
+    Try<string> read = os::read(bootIdPath);
+    if (read.isError()) {
+      LOG(WARNING) << "Failed to read '"
+                   << bootIdPath << "': " << read.error();
+    } else {
       Try<string> id = os::bootId();
       CHECK_SOME(id);
 


[3/5] mesos git commit: Cleaned up header includes and `using`.

Posted by vi...@apache.org.
Cleaned up header includes and `using`.

Review: https://reviews.apache.org/r/53235/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/fcf38bf4
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/fcf38bf4
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/fcf38bf4

Branch: refs/heads/master
Commit: fcf38bf4fde10711abc40f1d5133a6d9f0540ed2
Parents: c9807d6
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Dec 21 15:06:19 2016 -0500
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Dec 21 15:06:19 2016 -0500

----------------------------------------------------------------------
 src/slave/state.cpp | 5 ++++-
 src/slave/state.hpp | 3 +--
 2 files changed, 5 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/fcf38bf4/src/slave/state.cpp
----------------------------------------------------------------------
diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index 7926117..6894875 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -28,14 +28,17 @@
 #include <stout/numify.hpp>
 #include <stout/path.hpp>
 #include <stout/protobuf.hpp>
+#include <stout/strings.hpp>
 #include <stout/try.hpp>
 
 #include <stout/os/bootid.hpp>
 #include <stout/os/close.hpp>
 #include <stout/os/exists.hpp>
 #include <stout/os/ftruncate.hpp>
+#include <stout/os/ls.hpp>
 #include <stout/os/read.hpp>
 #include <stout/os/realpath.hpp>
+#include <stout/os/stat.hpp>
 
 #include "messages/messages.hpp"
 
@@ -48,8 +51,8 @@ namespace slave {
 namespace state {
 
 using std::list;
-using std::string;
 using std::max;
+using std::string;
 
 
 Try<State> recover(const string& rootDir, bool strict)

http://git-wip-us.apache.org/repos/asf/mesos/blob/fcf38bf4/src/slave/state.hpp
----------------------------------------------------------------------
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 8c57fd6..a497ce1 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -28,12 +28,11 @@
 
 #include <process/pid.hpp>
 
-#include <stout/foreach.hpp>
 #include <stout/hashmap.hpp>
 #include <stout/hashset.hpp>
 #include <stout/path.hpp>
 #include <stout/protobuf.hpp>
-#include <stout/strings.hpp>
+#include <stout/try.hpp>
 #include <stout/utils.hpp>
 #include <stout/uuid.hpp>