You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by me...@apache.org on 2015/06/29 12:33:33 UTC

[1/3] mesos git commit: Added Configurable Slave Ping Timeouts

Repository: mesos
Updated Branches:
  refs/heads/master b8007aaa9 -> 6cd28dd91


Added Configurable Slave Ping Timeouts

Added new --slave_ping_timeout and --max_slave_ping_timeouts flags
to mesos-master. They default to DEFAULT_SLAVE_PING_TIMEOUT (15secs)
and DEFAULT_MAX_SLAVE_PING_TIMEOUTS (5), the values that were
previously hard-coded.

These can be increased if slaves are expected/allowed to be down for
longer than the default total of 75 seconds (15secs x 5).

The slave receives the master's total ping timeout (slave_ping_timeout
multiplied by max_slave_ping_timeouts) in the SlaveRegisteredMessage
and SlaveReregisteredMessage.

Beware that this affects recovery from network timeouts as well as
actual slave node/process failover.

Also fixed the log message in recoveredSlavesTimeout() to correctly
reference flags.slave_reregister_timeout instead of the unrelated
ping timeouts.

Review: https://reviews.apache.org/r/29507


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/6cd28dd9
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/6cd28dd9
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/6cd28dd9

Branch: refs/heads/master
Commit: 6cd28dd9154a1a28ff0756078cb776c221d025d1
Parents: c8e091d
Author: Adam B <ad...@mesosphere.io>
Authored: Mon Jun 29 02:59:59 2015 -0700
Committer: Adam B <ad...@mesosphere.io>
Committed: Mon Jun 29 03:33:11 2015 -0700

----------------------------------------------------------------------
 docs/configuration.md              | 46 ++++++++++++++++------
 docs/upgrades.md                   |  4 ++
 src/master/constants.cpp           |  4 +-
 src/master/constants.hpp           |  4 +-
 src/master/flags.cpp               | 31 +++++++++++++++
 src/master/flags.hpp               |  2 +
 src/master/master.cpp              | 58 +++++++++++++++++++++++-----
 src/messages/messages.proto        | 10 +++++
 src/slave/constants.cpp            |  5 ++-
 src/slave/constants.hpp            |  2 +-
 src/slave/slave.cpp                | 67 ++++++++++++++++++++++++++-------
 src/slave/slave.hpp                | 11 +++++-
 src/tests/partition_tests.cpp      | 63 +++++++++++++++++--------------
 src/tests/slave_recovery_tests.cpp | 11 +++---
 src/tests/slave_tests.cpp          | 33 ++++++++--------
 15 files changed, 259 insertions(+), 92 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index aaf65bf..9a8505d 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -365,6 +365,17 @@ file:///path/to/file (where file contains one of the above)</code></pre>
   </tr>
   <tr>
     <td>
+      --max_slave_ping_timeouts=VALUE
+    </td>
+    <td>
+      The number of times a slave can fail to respond to a
+      ping from the master. Slaves that incur more than
+      `max_slave_ping_timeouts` timeouts will be removed.
+      (default: 5)
+    </td>
+  </tr>
+  <tr>
+    <td>
       --modules=VALUE
     </td>
     <td>
@@ -478,18 +489,6 @@ file:///path/to/file (where file contains one of the above)</code></pre>
   </tr>
   <tr>
     <td>
-      --slave_removal_rate_limit=VALUE
-    </td>
-    <td>
-      The maximum rate (e.g., 1/10mins, 2/3hrs, etc) at which slaves will
-      be removed from the master when they fail health checks. By default
-      slaves will be removed as soon as they fail the health checks.
-      <p/>
-      The value is of the form 'Number of slaves'/'Duration'
-    </td>
-  </tr>
-  <tr>
-    <td>
       --registry=VALUE
     </td>
     <td>
@@ -551,6 +550,29 @@ file:///path/to/file (where file contains one of the above)</code></pre>
   </tr>
   <tr>
     <td>
+      --slave_ping_timeout=VALUE
+    </td>
+    <td>
+      The timeout within which each slave is expected to respond to a
+      ping from the master. Slaves that do not respond within
+      `max_slave_ping_timeouts` ping retries will be removed.
+      (default: 15secs)
+    </td>
+  </tr>
+  <tr>
+    <td>
+      --slave_removal_rate_limit=VALUE
+    </td>
+    <td>
+      The maximum rate (e.g., 1/10mins, 2/3hrs, etc) at which slaves will
+      be removed from the master when they fail health checks. By default
+      slaves will be removed as soon as they fail the health checks.
+      <p/>
+      The value is of the form 'Number of slaves'/'Duration'
+    </td>
+  </tr>
+  <tr>
+    <td>
       --slave_reregister_timeout=VALUE
     </td>
     <td>

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/docs/upgrades.md
----------------------------------------------------------------------
diff --git a/docs/upgrades.md b/docs/upgrades.md
index 73e503c..35e893c 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -10,6 +10,9 @@ This document serves as a guide for users who wish to upgrade an existing mesos
 
 **NOTE** In order to enable decorator modules to remove metadata (environment variables or labels), we changed the meaning of the return value for decorator hooks in Mesos 0.23.0. Please refer to the modules documentation for more details.
 
+**NOTE** Slave ping timeouts are now configurable on the master via `--slave_ping_timeout` and `--max_slave_ping_timeouts`. Slaves should be upgraded to 0.23.x before changing these flags.
+
+
 ## Upgrading from 0.21.x to 0.22.x
 
 **NOTE** Slave checkpoint flag has been removed as it will be enabled for all
@@ -41,6 +44,7 @@ In order to upgrade a running cluster:
 * Restart the schedulers.
 * Upgrade the executors by linking the latest native library / jar / egg.
 
+
 ## Upgrading from 0.20.x to 0.21.x
 
 **NOTE** Disabling slave checkpointing has been deprecated; the slave --checkpoint flag has been deprecated and will be removed in a future release.

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/master/constants.cpp
----------------------------------------------------------------------
diff --git a/src/master/constants.cpp b/src/master/constants.cpp
index 997b792..fbcae60 100644
--- a/src/master/constants.cpp
+++ b/src/master/constants.cpp
@@ -31,8 +31,8 @@ namespace master {
 const int MAX_OFFERS_PER_FRAMEWORK = 50;
 const double MIN_CPUS = 0.01;
 const Bytes MIN_MEM = Megabytes(32);
-const Duration SLAVE_PING_TIMEOUT = Seconds(15);
-const uint32_t MAX_SLAVE_PING_TIMEOUTS = 5;
+const Duration DEFAULT_SLAVE_PING_TIMEOUT = Seconds(15);
+const size_t DEFAULT_MAX_SLAVE_PING_TIMEOUTS = 5;
 const Duration MIN_SLAVE_REREGISTER_TIMEOUT = Minutes(10);
 const double RECOVERY_SLAVE_REMOVAL_PERCENT_LIMIT = 1.0; // 100%.
 const size_t MAX_REMOVED_SLAVES = 100000;

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/master/constants.hpp
----------------------------------------------------------------------
diff --git a/src/master/constants.hpp b/src/master/constants.hpp
index 072d59c..7cec18b 100644
--- a/src/master/constants.hpp
+++ b/src/master/constants.hpp
@@ -60,10 +60,10 @@ extern const Bytes MIN_MEM;
 // configurable, then we'll need to rely on upper/lower bounds
 // to ensure that the slave is not unnecessarily triggering
 // re-registrations.
-extern const Duration SLAVE_PING_TIMEOUT;
+extern const Duration DEFAULT_SLAVE_PING_TIMEOUT;
 
 // Maximum number of ping timeouts until slave is considered failed.
-extern const uint32_t MAX_SLAVE_PING_TIMEOUTS;
+extern const size_t DEFAULT_MAX_SLAVE_PING_TIMEOUTS;
 
 // The minimum timeout that can be used by a newly elected leader to
 // allow re-registration of slaves. Any slaves that do not re-register

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/master/flags.cpp
----------------------------------------------------------------------
diff --git a/src/master/flags.cpp b/src/master/flags.cpp
index 4377715..60ac64d 100644
--- a/src/master/flags.cpp
+++ b/src/master/flags.cpp
@@ -376,4 +376,35 @@ mesos::internal::master::Flags::Flags()
       "hooks",
       "A comma separated list of hook modules to be\n"
       "installed inside master.");
+
+  add(&Flags::slave_ping_timeout,
+      "slave_ping_timeout",
+      "The timeout within which each slave is expected to respond to a\n"
+      "ping from the master. Slaves that do not respond within\n"
+      "max_slave_ping_timeouts ping retries will be asked to shutdown.\n"
+      "NOTE: The total ping timeout (slave_ping_timeout multiplied by\n"
+      "max_slave_ping_timeouts) should be greater than the ZooKeeper\n"
+      "session timeout to prevent useless re-registration attempts.\n",
+      DEFAULT_SLAVE_PING_TIMEOUT,
+      [](const Duration& value) -> Option<Error> {
+        if (value < Seconds(1) || value > Minutes(15)) {
+          return Error("Expected --slave_ping_timeout to be between " +
+                       stringify(Seconds(1)) + " and " +
+                       stringify(Minutes(15)));
+        }
+        return None();
+      });
+
+  add(&Flags::max_slave_ping_timeouts,
+      "max_slave_ping_timeouts",
+      "The number of times a slave can fail to respond to a\n"
+      "ping from the master. Slaves that do not respond within\n"
+      "max_slave_ping_timeouts ping retries will be asked to shutdown.\n",
+      DEFAULT_MAX_SLAVE_PING_TIMEOUTS,
+      [](size_t value) -> Option<Error> {
+        if (value < 1) {
+          return Error("Expected --max_slave_ping_timeouts to be at least 1");
+        }
+        return None();
+      });
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/master/flags.hpp
----------------------------------------------------------------------
diff --git a/src/master/flags.hpp b/src/master/flags.hpp
index 55ed3a9..f2cd19a 100644
--- a/src/master/flags.hpp
+++ b/src/master/flags.hpp
@@ -73,6 +73,8 @@ public:
   std::string authenticators;
   std::string allocator;
   Option<std::string> hooks;
+  Duration slave_ping_timeout;
+  size_t max_slave_ping_timeouts;
 
 #ifdef WITH_NETWORK_ISOLATOR
   Option<size_t> max_executors_per_slave;

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 0782b54..cbc6618 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -124,7 +124,9 @@ public:
                 const SlaveID& _slaveId,
                 const PID<Master>& _master,
                 const Option<shared_ptr<RateLimiter>>& _limiter,
-                const shared_ptr<Metrics> _metrics)
+                const shared_ptr<Metrics> _metrics,
+                const Duration& _slavePingTimeout,
+                const size_t _maxSlavePingTimeouts)
     : ProcessBase(process::ID::generate("slave-observer")),
       slave(_slave),
       slaveInfo(_slaveInfo),
@@ -132,6 +134,8 @@ public:
       master(_master),
       limiter(_limiter),
       metrics(_metrics),
+      slavePingTimeout(_slavePingTimeout),
+      maxSlavePingTimeouts(_maxSlavePingTimeouts),
       timeouts(0),
       pinged(false),
       connected(true)
@@ -170,7 +174,7 @@ protected:
     send(slave, "PING", data.data(), data.size());
 
     pinged = true;
-    delay(SLAVE_PING_TIMEOUT, self(), &SlaveObserver::timeout);
+    delay(slavePingTimeout, self(), &SlaveObserver::timeout);
   }
 
   void pong(const UPID& from, const string& body)
@@ -190,9 +194,9 @@ protected:
   {
     if (pinged) {
       timeouts++; // No pong has been received before the timeout.
-      if (timeouts >= MAX_SLAVE_PING_TIMEOUTS) {
+      if (timeouts >= maxSlavePingTimeouts) {
         // No pong has been received for the last
-        // 'MAX_SLAVE_PING_TIMEOUTS' pings.
+        // 'maxSlavePingTimeouts' pings.
         shutdown();
       }
     }
@@ -261,6 +265,8 @@ private:
   const Option<shared_ptr<RateLimiter>> limiter;
   shared_ptr<Metrics> metrics;
   Option<Future<Nothing>> shuttingDown;
+  const Duration slavePingTimeout;
+  const size_t maxSlavePingTimeouts;
   uint32_t timeouts;
   bool pinged;
   bool connected;
@@ -1313,7 +1319,7 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
 
   if (removalPercentage > limit) {
     EXIT(1) << "Post-recovery slave removal limit exceeded! After "
-            << SLAVE_PING_TIMEOUT * MAX_SLAVE_PING_TIMEOUTS
+            << flags.slave_reregister_timeout
             << " there were " << slaves.recovered.size()
             << " (" << removalPercentage * 100 << "%) slaves recovered from the"
             << " registry that did not re-register: \n"
@@ -3143,8 +3149,15 @@ void Master::registerSlave(
 
       LOG(INFO) << "Slave " << *slave << " already registered,"
                 << " resending acknowledgement";
+
+      Duration pingTimeout =
+        flags.slave_ping_timeout * flags.max_slave_ping_timeouts;
+      MasterSlaveConnection connection;
+      connection.set_total_ping_timeout_seconds(pingTimeout.secs());
+
       SlaveRegisteredMessage message;
-      message.mutable_slave_id()->MergeFrom(slave->id);
+      message.mutable_slave_id()->CopyFrom(slave->id);
+      message.mutable_connection()->CopyFrom(connection);
       send(from, message);
       return;
     }
@@ -3217,8 +3230,14 @@ void Master::_registerSlave(
 
     addSlave(slave);
 
+    Duration pingTimeout =
+      flags.slave_ping_timeout * flags.max_slave_ping_timeouts;
+    MasterSlaveConnection connection;
+    connection.set_total_ping_timeout_seconds(pingTimeout.secs());
+
     SlaveRegisteredMessage message;
-    message.mutable_slave_id()->MergeFrom(slave->id);
+    message.mutable_slave_id()->CopyFrom(slave->id);
+    message.mutable_connection()->CopyFrom(connection);
     send(slave->pid, message);
 
     LOG(INFO) << "Registered slave " << *slave
@@ -3413,8 +3432,14 @@ void Master::_reregisterSlave(
 
     addSlave(slave, completedFrameworks);
 
+    Duration pingTimeout =
+      flags.slave_ping_timeout * flags.max_slave_ping_timeouts;
+    MasterSlaveConnection connection;
+    connection.set_total_ping_timeout_seconds(pingTimeout.secs());
+
     SlaveReregisteredMessage message;
-    message.mutable_slave_id()->MergeFrom(slave->id);
+    message.mutable_slave_id()->CopyFrom(slave->id);
+    message.mutable_connection()->CopyFrom(connection);
     send(slave->pid, message);
 
     LOG(INFO) << "Re-registered slave " << *slave
@@ -4266,8 +4291,14 @@ void Master::reconcile(
   // To resolve both cases correctly, we must reconcile through the
   // slave. For slaves that do not support reconciliation, we keep
   // the old semantics and cover only case (1) via TASK_LOST.
+  Duration pingTimeout =
+    flags.slave_ping_timeout * flags.max_slave_ping_timeouts;
+  MasterSlaveConnection connection;
+  connection.set_total_ping_timeout_seconds(pingTimeout.secs());
+
   SlaveReregisteredMessage reregistered;
-  reregistered.mutable_slave_id()->MergeFrom(slave->id);
+  reregistered.mutable_slave_id()->CopyFrom(slave->id);
+  reregistered.mutable_connection()->CopyFrom(connection);
 
   // NOTE: copies are needed because removeTask modified slave->tasks.
   foreachkey (const FrameworkID& frameworkId, utils::copy(slave->tasks)) {
@@ -4699,7 +4730,14 @@ void Master::addSlave(
 
   // Set up an observer for the slave.
   slave->observer = new SlaveObserver(
-      slave->pid, slave->info, slave->id, self(), slaves.limiter, metrics);
+      slave->pid,
+      slave->info,
+      slave->id,
+      self(),
+      slaves.limiter,
+      metrics,
+      flags.slave_ping_timeout,
+      flags.max_slave_ping_timeouts);
 
   spawn(slave->observer);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/messages/messages.proto
----------------------------------------------------------------------
diff --git a/src/messages/messages.proto b/src/messages/messages.proto
index 1c8d79e..a1e71d8 100644
--- a/src/messages/messages.proto
+++ b/src/messages/messages.proto
@@ -276,6 +276,7 @@ message ReregisterSlaveMessage {
 
 message SlaveRegisteredMessage {
   required SlaveID slave_id = 1;
+  optional MasterSlaveConnection connection = 2;
 }
 
 
@@ -283,6 +284,7 @@ message SlaveReregisteredMessage {
   required SlaveID slave_id = 1;
 
   repeated ReconcileTasksMessage reconciliations = 2;
+  optional MasterSlaveConnection connection = 3;
 }
 
 
@@ -291,6 +293,14 @@ message UnregisterSlaveMessage {
 }
 
 
+message MasterSlaveConnection {
+  // Product of max_slave_ping_timeouts * slave_ping_timeout.
+  // If no pings are received within the total timeout,
+  // the master will remove the slave.
+  optional double total_ping_timeout_seconds = 1;
+}
+
+
 // This message is periodically sent by the master to the slave.
 // If the slave is connected to the master, "connected" is true.
 message PingSlaveMessage {

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/slave/constants.cpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.cpp b/src/slave/constants.cpp
index d8d2f98..cf3ee7b 100644
--- a/src/slave/constants.cpp
+++ b/src/slave/constants.cpp
@@ -55,9 +55,10 @@ const Duration DOCKER_INSPECT_DELAY = Seconds(1);
 const Duration DOCKER_VERSION_WAIT_TIMEOUT = Seconds(5);
 const std::string DEFAULT_AUTHENTICATEE = "crammd5";
 
-Duration MASTER_PING_TIMEOUT()
+Duration DEFAULT_MASTER_PING_TIMEOUT()
 {
-  return master::SLAVE_PING_TIMEOUT * master::MAX_SLAVE_PING_TIMEOUTS;
+  return master::DEFAULT_SLAVE_PING_TIMEOUT *
+    master::DEFAULT_MAX_SLAVE_PING_TIMEOUTS;
 }
 
 } // namespace slave {

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index 84927e5..ccfe89c 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -113,7 +113,7 @@ const Bytes DEFAULT_FETCHER_CACHE_SIZE = Gigabytes(2);
 
 // If no pings received within this timeout, then the slave will
 // trigger a re-detection of the master to cause a re-registration.
-Duration MASTER_PING_TIMEOUT();
+Duration DEFAULT_MASTER_PING_TIMEOUT();
 
 } // namespace slave {
 } // namespace internal {

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index b859111..9b72fad 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -132,6 +132,7 @@ Slave::Slave(const slave::Flags& _flags,
     gc(_gc),
     monitor(defer(self(), &Self::usage)),
     statusUpdateManager(_statusUpdateManager),
+    masterPingTimeout(DEFAULT_MASTER_PING_TIMEOUT()),
     metaDir(paths::getMetaRootDir(flags.work_dir)),
     recoveryErrors(0),
     credential(None()),
@@ -399,12 +400,14 @@ void Slave::initialize()
   // Install protobuf handlers.
   install<SlaveRegisteredMessage>(
       &Slave::registered,
-      &SlaveRegisteredMessage::slave_id);
+      &SlaveRegisteredMessage::slave_id,
+      &SlaveRegisteredMessage::connection);
 
   install<SlaveReregisteredMessage>(
       &Slave::reregistered,
       &SlaveReregisteredMessage::slave_id,
-      &SlaveReregisteredMessage::reconciliations);
+      &SlaveReregisteredMessage::reconciliations,
+      &SlaveReregisteredMessage::connection);
 
   install<RunTaskMessage>(
       &Slave::runTask,
@@ -830,7 +833,10 @@ void Slave::authenticationTimeout(Future<bool> future)
 }
 
 
-void Slave::registered(const UPID& from, const SlaveID& slaveId)
+void Slave::registered(
+    const UPID& from,
+    const SlaveID& slaveId,
+    const MasterSlaveConnection& connection)
 {
   if (master != from) {
     LOG(WARNING) << "Ignoring registration message from " << from
@@ -841,7 +847,13 @@ void Slave::registered(const UPID& from, const SlaveID& slaveId)
 
   CHECK_SOME(master);
 
-  switch (state) {
+  if (connection.has_total_ping_timeout_seconds()) {
+    masterPingTimeout = Seconds(connection.total_ping_timeout_seconds());
+  } else {
+    masterPingTimeout = DEFAULT_MASTER_PING_TIMEOUT();
+  }
+
+  switch(state) {
     case DISCONNECTED: {
       LOG(INFO) << "Registered with master " << master.get()
                 << "; given slave ID " << slaveId;
@@ -874,8 +886,11 @@ void Slave::registered(const UPID& from, const SlaveID& slaveId)
       // in case we never receive an initial ping.
       Clock::cancel(pingTimer);
 
-      pingTimer =
-        delay(MASTER_PING_TIMEOUT(), self(), &Slave::pingTimeout, detection);
+      pingTimer = delay(
+          masterPingTimeout,
+          self(),
+          &Slave::pingTimeout,
+          detection);
 
       break;
     }
@@ -886,6 +901,7 @@ void Slave::registered(const UPID& from, const SlaveID& slaveId)
                << "(expected: " << info.id() << "). Committing suicide";
       }
       LOG(WARNING) << "Already registered with master " << master.get();
+
       break;
     case TERMINATING:
       LOG(WARNING) << "Ignoring registration because slave is terminating";
@@ -914,7 +930,8 @@ void Slave::registered(const UPID& from, const SlaveID& slaveId)
 void Slave::reregistered(
     const UPID& from,
     const SlaveID& slaveId,
-    const vector<ReconcileTasksMessage>& reconciliations)
+    const vector<ReconcileTasksMessage>& reconciliations,
+    const MasterSlaveConnection& connection)
 {
   if (master != from) {
     LOG(WARNING) << "Ignoring re-registration message from " << from
@@ -930,11 +947,29 @@ void Slave::reregistered(
             << "(expected: " << info.id() << "). Committing suicide";
   }
 
-  switch (state) {
+  if (connection.has_total_ping_timeout_seconds()) {
+    masterPingTimeout = Seconds(connection.total_ping_timeout_seconds());
+  } else {
+    masterPingTimeout = DEFAULT_MASTER_PING_TIMEOUT();
+  }
+
+  switch(state) {
     case DISCONNECTED:
       LOG(INFO) << "Re-registered with master " << master.get();
       state = RUNNING;
       statusUpdateManager->resume(); // Resume status updates.
+
+      // If we don't get a ping from the master, trigger a
+      // re-registration. This needs to be done once re-registered,
+      // in case we never receive an initial ping.
+      Clock::cancel(pingTimer);
+
+      pingTimer = delay(
+          masterPingTimeout,
+          self(),
+          &Slave::pingTimeout,
+          detection);
+
       break;
     case RUNNING:
       LOG(WARNING) << "Already re-registered with master " << master.get();
@@ -2968,8 +3003,11 @@ void Slave::pingOld(const UPID& from, const string& body)
   // when this occurs.
   Clock::cancel(pingTimer);
 
-  pingTimer =
-    delay(MASTER_PING_TIMEOUT(), self(), &Slave::pingTimeout, detection);
+  pingTimer = delay(
+      masterPingTimeout,
+      self(),
+      &Slave::pingTimeout,
+      detection);
 
   send(from, "PONG");
 }
@@ -2997,8 +3035,11 @@ void Slave::ping(const UPID& from, bool connected)
   // when this occurs.
   Clock::cancel(pingTimer);
 
-  pingTimer =
-    delay(MASTER_PING_TIMEOUT(), self(), &Slave::pingTimeout, detection);
+  pingTimer = delay(
+      masterPingTimeout,
+      self(),
+      &Slave::pingTimeout,
+      detection);
 
   send(from, PongSlaveMessage());
 }
@@ -3011,7 +3052,7 @@ void Slave::pingTimeout(Future<Option<MasterInfo>> future)
   // bother trying to re-detect.
   if (pingTimer.timeout().expired()) {
     LOG(INFO) << "No pings from master received within "
-              << MASTER_PING_TIMEOUT();
+              << masterPingTimeout;
 
     future.discard();
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/slave/slave.hpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index f1cf3b8..dec4ca8 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -99,12 +99,16 @@ public:
 
   void shutdown(const process::UPID& from, const std::string& message);
 
-  void registered(const process::UPID& from, const SlaveID& slaveId);
+  void registered(
+      const process::UPID& from,
+      const SlaveID& slaveId,
+      const MasterSlaveConnection& connection);
 
   void reregistered(
       const process::UPID& from,
       const SlaveID& slaveId,
-      const std::vector<ReconcileTasksMessage>& reconciliations);
+      const std::vector<ReconcileTasksMessage>& reconciliations,
+      const MasterSlaveConnection& connection);
 
   void doReliableRegistration(Duration maxBackoff);
 
@@ -492,6 +496,9 @@ private:
   // Master detection future.
   process::Future<Option<MasterInfo>> detection;
 
+  // Master's ping timeout value, updated on reregistration.
+  Duration masterPingTimeout;
+
   // Timer for triggering re-detection when no ping is received from
   // the master.
   process::Timer pingTimer;

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index f7ee3ab..c6ec14d 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -69,7 +69,8 @@ class PartitionTest : public MesosTest {};
 // message for a partitioned slave.
 TEST_F(PartitionTest, PartitionedSlave)
 {
-  Try<PID<Master> > master = StartMaster();
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<PID<Master>> master = StartMaster(masterFlags);
   ASSERT_SOME(master);
 
   // Set these expectations up before we spawn the slave so that we
@@ -79,7 +80,7 @@ TEST_F(PartitionTest, PartitionedSlave)
   // Drop all the PONGs to simulate slave partition.
   DROP_MESSAGES(Eq("PONG"), _, _);
 
-  Try<PID<Slave> > slave = StartSlave();
+  Try<PID<Slave>> slave = StartSlave();
   ASSERT_SOME(slave);
 
   MockScheduler sched;
@@ -109,18 +110,18 @@ TEST_F(PartitionTest, PartitionedSlave)
     .WillOnce(FutureSatisfy(&slaveLost));
 
   // Now advance through the PINGs.
-  uint32_t pings = 0;
+  size_t pings = 0;
   while (true) {
     AWAIT_READY(ping);
     pings++;
-    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
+    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
     }
     ping = FUTURE_MESSAGE(Eq("PING"), _, _);
-    Clock::advance(master::SLAVE_PING_TIMEOUT);
+    Clock::advance(masterFlags.slave_ping_timeout);
   }
 
-  Clock::advance(master::SLAVE_PING_TIMEOUT);
+  Clock::advance(masterFlags.slave_ping_timeout);
 
   AWAIT_READY(slaveLost);
 
@@ -150,7 +151,8 @@ TEST_F(PartitionTest, PartitionedSlave)
 // slave shut down.
 TEST_F(PartitionTest, PartitionedSlaveReregistration)
 {
-  Try<PID<Master> > master = StartMaster();
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<PID<Master>> master = StartMaster(masterFlags);
   ASSERT_SOME(master);
 
   // Allow the master to PING the slave, but drop all PONG messages
@@ -164,7 +166,7 @@ TEST_F(PartitionTest, PartitionedSlaveReregistration)
 
   StandaloneMasterDetector detector(master.get());
 
-  Try<PID<Slave> > slave = StartSlave(&exec, &detector);
+  Try<PID<Slave>> slave = StartSlave(&exec, &detector);
   ASSERT_SOME(slave);
 
   MockScheduler sched;
@@ -173,7 +175,7 @@ TEST_F(PartitionTest, PartitionedSlaveReregistration)
 
   EXPECT_CALL(sched, registered(&driver, _, _));
 
-  Future<vector<Offer> > offers;
+  Future<vector<Offer>> offers;
   EXPECT_CALL(sched, resourceOffers(&driver, _))
     .WillOnce(FutureArg<1>(&offers))
     .WillRepeatedly(Return());
@@ -238,19 +240,19 @@ TEST_F(PartitionTest, PartitionedSlaveReregistration)
 
   // Now, induce a partition of the slave by having the master
   // timeout the slave.
-  uint32_t pings = 0;
+  size_t pings = 0;
   while (true) {
     AWAIT_READY(ping);
     pings++;
-    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
+    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
     }
     ping = FUTURE_MESSAGE(Eq("PING"), _, _);
-    Clock::advance(master::SLAVE_PING_TIMEOUT);
+    Clock::advance(masterFlags.slave_ping_timeout);
     Clock::settle();
   }
 
-  Clock::advance(master::SLAVE_PING_TIMEOUT);
+  Clock::advance(masterFlags.slave_ping_timeout);
   Clock::settle();
 
   // The master will have notified the framework of the lost task.
@@ -301,7 +303,8 @@ TEST_F(PartitionTest, PartitionedSlaveReregistration)
 // tasks were LOST, so we have to have the slave shut down.
 TEST_F(PartitionTest, PartitionedSlaveStatusUpdates)
 {
-  Try<PID<Master> > master = StartMaster();
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<PID<Master>> master = StartMaster(masterFlags);
   ASSERT_SOME(master);
 
   // Allow the master to PING the slave, but drop all PONG messages
@@ -316,7 +319,7 @@ TEST_F(PartitionTest, PartitionedSlaveStatusUpdates)
 
   MockExecutor exec(DEFAULT_EXECUTOR_ID);
 
-  Try<PID<Slave> > slave = StartSlave(&exec);
+  Try<PID<Slave>> slave = StartSlave(&exec);
   ASSERT_SOME(slave);
 
   AWAIT_READY(slaveRegisteredMessage);
@@ -354,19 +357,19 @@ TEST_F(PartitionTest, PartitionedSlaveStatusUpdates)
 
   // Now, induce a partition of the slave by having the master
   // timeout the slave.
-  uint32_t pings = 0;
+  size_t pings = 0;
   while (true) {
     AWAIT_READY(ping);
     pings++;
-    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
+    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
     }
     ping = FUTURE_MESSAGE(Eq("PING"), _, _);
-    Clock::advance(master::SLAVE_PING_TIMEOUT);
+    Clock::advance(masterFlags.slave_ping_timeout);
     Clock::settle();
   }
 
-  Clock::advance(master::SLAVE_PING_TIMEOUT);
+  Clock::advance(masterFlags.slave_ping_timeout);
   Clock::settle();
 
   // Wait for the master to attempt to shut down the slave.
@@ -417,7 +420,8 @@ TEST_F(PartitionTest, PartitionedSlaveStatusUpdates)
 // so we have to have the slave shut down.
 TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
 {
-  Try<PID<Master> > master = StartMaster();
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<PID<Master>> master = StartMaster(masterFlags);
   ASSERT_SOME(master);
 
   // Allow the master to PING the slave, but drop all PONG messages
@@ -430,7 +434,7 @@ TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
   MockExecutor exec(DEFAULT_EXECUTOR_ID);
   TestContainerizer containerizer(&exec);
 
-  Try<PID<Slave> > slave = StartSlave(&containerizer);
+  Try<PID<Slave>> slave = StartSlave(&containerizer);
   ASSERT_SOME(slave);
 
   MockScheduler sched;
@@ -441,7 +445,7 @@ TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
   EXPECT_CALL(sched, registered(&driver, _, _))
     .WillOnce(FutureArg<1>(&frameworkId));\
 
-  Future<vector<Offer> > offers;
+  Future<vector<Offer>> offers;
   EXPECT_CALL(sched, resourceOffers(&driver, _))
     .WillOnce(FutureArg<1>(&offers))
     .WillRepeatedly(Return());
@@ -499,19 +503,19 @@ TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
 
   // Now, induce a partition of the slave by having the master
   // timeout the slave.
-  uint32_t pings = 0;
+  size_t pings = 0;
   while (true) {
     AWAIT_READY(ping);
     pings++;
-    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
+    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
     }
     ping = FUTURE_MESSAGE(Eq("PING"), _, _);
-    Clock::advance(master::SLAVE_PING_TIMEOUT);
+    Clock::advance(masterFlags.slave_ping_timeout);
     Clock::settle();
   }
 
-  Clock::advance(master::SLAVE_PING_TIMEOUT);
+  Clock::advance(masterFlags.slave_ping_timeout);
   Clock::settle();
 
   // The master will have notified the framework of the lost task.
@@ -548,7 +552,8 @@ TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
 TEST_F(PartitionTest, OneWayPartitionMasterToSlave)
 {
   // Start a master.
-  Try<PID<Master> > master = StartMaster();
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<PID<Master>> master = StartMaster(masterFlags);
   ASSERT_SOME(master);
 
   Future<Message> slaveRegisteredMessage =
@@ -583,10 +588,12 @@ TEST_F(PartitionTest, OneWayPartitionMasterToSlave)
   Clock::settle();
 
   // Let the slave observer send the next ping.
-  Clock::advance(slave::MASTER_PING_TIMEOUT());
+  Clock::advance(masterFlags.slave_ping_timeout);
 
   // Slave should re-register.
   AWAIT_READY(slaveReregisteredMessage);
+
+  Shutdown();
 }
 
 } // namespace tests {

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index c036e9c..2f882cf 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -2519,7 +2519,8 @@ TYPED_TEST(SlaveRecoveryTest, SchedulerFailover)
 // register itself with the master and get a new slave id.
 TYPED_TEST(SlaveRecoveryTest, PartitionedSlave)
 {
-  Try<PID<Master> > master = this->StartMaster();
+  master::Flags masterFlags = this->CreateMasterFlags();
+  Try<PID<Master>> master = this->StartMaster(masterFlags);
   ASSERT_SOME(master);
 
   // Set these expectations up before we spawn the slave so that we
@@ -2593,19 +2594,19 @@ TYPED_TEST(SlaveRecoveryTest, PartitionedSlave)
 
   // Now, induce a partition of the slave by having the master
   // timeout the slave.
-  uint32_t pings = 0;
+  size_t pings = 0;
   while (true) {
     AWAIT_READY(ping);
     pings++;
-    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
+    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
     }
     ping = FUTURE_MESSAGE(Eq("PING"), _, _);
-    Clock::advance(master::SLAVE_PING_TIMEOUT);
+    Clock::advance(masterFlags.slave_ping_timeout);
     Clock::settle();
   }
 
-  Clock::advance(master::SLAVE_PING_TIMEOUT);
+  Clock::advance(masterFlags.slave_ping_timeout);
   Clock::settle();
 
   // The master will notify the framework that the slave was lost.

http://git-wip-us.apache.org/repos/asf/mesos/blob/6cd28dd9/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 036a0c2..90bbc81 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -1545,7 +1545,7 @@ TEST_F(SlaveTest, PingTimeoutNoPings)
   Future<SlaveReregisteredMessage> slaveReregisteredMessage =
     FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
 
-  Clock::advance(slave::MASTER_PING_TIMEOUT());
+  Clock::advance(slave::DEFAULT_MASTER_PING_TIMEOUT());
 
   AWAIT_READY(detected);
   AWAIT_READY(slaveReregisteredMessage);
@@ -1557,7 +1557,8 @@ TEST_F(SlaveTest, PingTimeoutNoPings)
 TEST_F(SlaveTest, PingTimeoutSomePings)
 {
   // Start a master.
-  Try<PID<Master>> master = StartMaster();
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<PID<Master>> master = StartMaster(masterFlags);
   ASSERT_SOME(master);
 
   Future<SlaveRegisteredMessage> slaveRegisteredMessage =
@@ -1574,7 +1575,7 @@ TEST_F(SlaveTest, PingTimeoutSomePings)
   // Ensure a ping reaches the slave.
   Future<Message> ping = FUTURE_MESSAGE(Eq("PING"), _, _);
 
-  Clock::advance(master::SLAVE_PING_TIMEOUT);
+  Clock::advance(masterFlags.slave_ping_timeout);
 
   AWAIT_READY(ping);
 
@@ -1588,7 +1589,7 @@ TEST_F(SlaveTest, PingTimeoutSomePings)
   Future<SlaveReregisteredMessage> slaveReregisteredMessage =
     FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
 
-  Clock::advance(slave::MASTER_PING_TIMEOUT());
+  Clock::advance(slave::DEFAULT_MASTER_PING_TIMEOUT());
 
   AWAIT_READY(detected);
   AWAIT_READY(slaveReregisteredMessage);
@@ -1602,7 +1603,8 @@ TEST_F(SlaveTest, RateLimitSlaveShutdown)
 {
   // Start a master.
   shared_ptr<MockRateLimiter> slaveRemovalLimiter(new MockRateLimiter());
-  Try<PID<Master>> master = StartMaster(slaveRemovalLimiter);
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<PID<Master>> master = StartMaster(slaveRemovalLimiter, masterFlags);
   ASSERT_SOME(master);
 
   // Set these expectations up before we spawn the slave so that we
@@ -1632,16 +1634,16 @@ TEST_F(SlaveTest, RateLimitSlaveShutdown)
 
   // Induce a health check failure of the slave.
   Clock::pause();
-  uint32_t pings = 0;
+  size_t pings = 0;
   while (true) {
     AWAIT_READY(ping);
     pings++;
-    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
-      Clock::advance(master::SLAVE_PING_TIMEOUT);
+    if (pings == masterFlags.max_slave_ping_timeouts) {
+      Clock::advance(masterFlags.slave_ping_timeout);
       break;
     }
     ping = FUTURE_MESSAGE(Eq("PING"), _, _);
-    Clock::advance(master::SLAVE_PING_TIMEOUT);
+    Clock::advance(masterFlags.slave_ping_timeout);
   }
 
   // The master should attempt to acquire a permit.
@@ -1665,7 +1667,8 @@ TEST_F(SlaveTest, CancelSlaveShutdown)
 {
   // Start a master.
   shared_ptr<MockRateLimiter> slaveRemovalLimiter(new MockRateLimiter());
-  Try<PID<Master>> master = StartMaster(slaveRemovalLimiter);
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<PID<Master>> master = StartMaster(slaveRemovalLimiter, masterFlags);
   ASSERT_SOME(master);
 
   // Set these expectations up before we spawn the slave so that we
@@ -1696,16 +1699,16 @@ TEST_F(SlaveTest, CancelSlaveShutdown)
 
   // Induce a health check failure of the slave.
   Clock::pause();
-  uint32_t pings = 0;
+  size_t pings = 0;
   while (true) {
     AWAIT_READY(ping);
     pings++;
-    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
-      Clock::advance(master::SLAVE_PING_TIMEOUT);
+    if (pings == masterFlags.max_slave_ping_timeouts) {
+      Clock::advance(masterFlags.slave_ping_timeout);
       break;
     }
     ping = FUTURE_MESSAGE(Eq("PING"), _, _);
-    Clock::advance(master::SLAVE_PING_TIMEOUT);
+    Clock::advance(masterFlags.slave_ping_timeout);
   }
 
   // The master should attempt to acquire a permit.
@@ -1718,7 +1721,7 @@ TEST_F(SlaveTest, CancelSlaveShutdown)
   filter(NULL);
 
   // Advance clock enough to do a ping pong.
-  Clock::advance(master::SLAVE_PING_TIMEOUT);
+  Clock::advance(masterFlags.slave_ping_timeout);
   Clock::settle();
 
   // The master should have tried to cancel the removal.


[2/3] mesos git commit: Set the ownership of persistent volumes to match the sandbox directory.

Posted by me...@apache.org.
Set the ownership of persistent volumes to match the sandbox directory.

Review: https://reviews.apache.org/r/35721


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c8e091d1
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c8e091d1
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c8e091d1

Branch: refs/heads/master
Commit: c8e091d1d694e812cd9061217fa8018986109aed
Parents: 3d2dec4
Author: haosdent huang <ha...@gmail.com>
Authored: Mon Jun 29 02:39:59 2015 -0700
Committer: Adam B <ad...@mesosphere.io>
Committed: Mon Jun 29 03:33:11 2015 -0700

----------------------------------------------------------------------
 src/slave/containerizer/mesos/containerizer.cpp | 21 ++++++++++++++++++++
 1 file changed, 21 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c8e091d1/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index 313e9b7..47d1461 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -1331,6 +1331,27 @@ Try<Nothing> MesosContainerizerProcess::updateVolumes(
           "Failed to symlink persistent volume from '" +
           original + "' to '" + link + "'");
     }
+
+    // Set the ownership of the persistent volume to match the sandbox
+    // directory. Currently, persistent volumes in Mesos are
+    // exclusive: if a persistent volume is used by one
+    // task/executor, it cannot be used concurrently by another
+    // task/executor. But if we allow multiple executors to use the
+    // same persistent volume at the same time in the future, the
+    // ownership of the persistent volume may conflict here.
+    // TODO(haosdent): We need to update this after we have a proposed
+    // plan for adding user/group to persistent volumes.
+    struct stat s;
+    if (::stat(container->directory.c_str(), &s) < 0) {
+      return Error("Failed to get permissions on '" + container->directory +
+                   "': " + strerror(errno));
+    }
+
+    Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, original, true);
+    if (chown.isError()) {
+      return Error("Failed to chown persistent volume '" + original +
+                   "': " + chown.error());
+    }
   }
 
   return Nothing();


[3/3] mesos git commit: Fix failing test: SlaveTest.ROOT_RunTaskWithCommandInfoWithUser.

Posted by me...@apache.org.
Fix failing test: SlaveTest.ROOT_RunTaskWithCommandInfoWithUser.

Review: https://reviews.apache.org/r/35728


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/3d2dec44
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/3d2dec44
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/3d2dec44

Branch: refs/heads/master
Commit: 3d2dec44d2476e216a3b735868f7bcee9917f5e2
Parents: b8007aa
Author: haosdent huang <ha...@gmail.com>
Authored: Mon Jun 29 01:59:03 2015 -0700
Committer: Adam B <ad...@mesosphere.io>
Committed: Mon Jun 29 03:33:11 2015 -0700

----------------------------------------------------------------------
 src/tests/slave_tests.cpp | 77 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 65 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/3d2dec44/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index e9002e8..036a0c2 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -668,10 +668,10 @@ TEST_F(SlaveTest, ROOT_RunTaskWithCommandInfoWithoutUser)
 
 // This test runs a command _with_ the command user field set. The
 // command will verify the assumption that the command is run as the
-// specified user. We use (and assume the precense) of the
+// specified user. We use (and assume the presence) of the
 // unprivileged 'nobody' user which should be available on both Linux
 // and Mac OS X.
-TEST_F(SlaveTest, DISABLED_ROOT_RunTaskWithCommandInfoWithUser)
+TEST_F(SlaveTest, ROOT_RunTaskWithCommandInfoWithUser)
 {
   // TODO(nnielsen): Introduce STOUT abstraction for user verification
   // instead of flat getpwnam call.
@@ -705,6 +705,11 @@ TEST_F(SlaveTest, DISABLED_ROOT_RunTaskWithCommandInfoWithUser)
   EXPECT_CALL(sched, registered(&driver, _, _))
     .Times(1);
 
+  Future<TaskStatus> statusRunning;
+  Future<TaskStatus> statusFinished;
+  const string helper =
+      path::join(tests::flags.build_dir, "src", "active-user-test-helper");
+
   Future<vector<Offer>> offers;
   EXPECT_CALL(sched, resourceOffers(&driver, _))
     .WillOnce(FutureArg<1>(&offers))
@@ -715,15 +720,66 @@ TEST_F(SlaveTest, DISABLED_ROOT_RunTaskWithCommandInfoWithUser)
   AWAIT_READY(offers);
   EXPECT_NE(0u, offers.get().size());
 
+  // HACK: Launch a prepare task as root to prepare the binaries.
+  // This task creates the lt-mesos-executor binary in the build dir.
+  // Because the real task is run as a test user (nobody), it does not
+  // have permission to create files in the build directory.
+  TaskInfo prepareTask;
+  prepareTask.set_name("prepare task");
+  prepareTask.mutable_task_id()->set_value("1");
+  prepareTask.mutable_slave_id()->CopyFrom(offers.get()[0].slave_id());
+  prepareTask.mutable_resources()->CopyFrom(
+      offers.get()[0].resources());
+
+  Result<string> user = os::user();
+  CHECK_SOME(user) << "Failed to get current user name"
+                   << (user.isError() ? ": " + user.error() : "");
+  // Current user should be root.
+  EXPECT_EQ("root", user.get());
+
+  // This prepare command executor will run as the current user
+  // running the tests (root). After this command executor finishes,
+  // we know that the lt-mesos-executor binary file exists.
+  CommandInfo prepareCommand;
+  prepareCommand.set_shell(false);
+  prepareCommand.set_value(helper);
+  prepareCommand.add_arguments(helper);
+  prepareCommand.add_arguments(user.get());
+  prepareTask.mutable_command()->CopyFrom(prepareCommand);
+
+  vector<TaskInfo> prepareTasks;
+  prepareTasks.push_back(prepareTask);
+
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&statusRunning))
+    .WillOnce(FutureArg<1>(&statusFinished));
+
+  driver.launchTasks(offers.get()[0].id(), prepareTasks);
+
+  // Scheduler should first receive TASK_RUNNING followed by the
+  // TASK_FINISHED from the executor.
+  AWAIT_READY(statusRunning);
+  EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
+  EXPECT_EQ(TaskStatus::SOURCE_EXECUTOR, statusRunning.get().source());
+
+  AWAIT_READY(statusFinished);
+  EXPECT_EQ(TASK_FINISHED, statusFinished.get().state());
+  EXPECT_EQ(TaskStatus::SOURCE_EXECUTOR, statusFinished.get().source());
+
+  // Now launch a task as a different user.
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+
   // Launch a task with the command executor.
   TaskInfo task;
   task.set_name("");
-  task.mutable_task_id()->set_value("1");
-  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
-  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
-
-  const string helper =
-      path::join(tests::flags.build_dir, "src", "active-user-test-helper");
+  task.mutable_task_id()->set_value("2");
+  task.mutable_slave_id()->CopyFrom(offers.get()[0].slave_id());
+  task.mutable_resources()->CopyFrom(offers.get()[0].resources());
 
   CommandInfo command;
   command.set_user(testUser);
@@ -732,13 +788,10 @@ TEST_F(SlaveTest, DISABLED_ROOT_RunTaskWithCommandInfoWithUser)
   command.add_arguments(helper);
   command.add_arguments(testUser);
 
-  task.mutable_command()->MergeFrom(command);
-
+  task.mutable_command()->CopyFrom(command);
   vector<TaskInfo> tasks;
   tasks.push_back(task);
 
-  Future<TaskStatus> statusRunning;
-  Future<TaskStatus> statusFinished;
   EXPECT_CALL(sched, statusUpdate(&driver, _))
     .WillOnce(FutureArg<1>(&statusRunning))
     .WillOnce(FutureArg<1>(&statusFinished));