You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2017/01/24 01:07:41 UTC

[01/25] mesos git commit: Improved management of unreachable and completed tasks in master.

Repository: mesos
Updated Branches:
  refs/heads/master 18a70b0e7 -> f1d0cdf1d


Improved management of unreachable and completed tasks in master.

Before partition-awareness, when an agent failed health checks, the
master removed the agent from the registry, marked all of its tasks
TASK_LOST, and moved them to the `completedTasks` list in the master's
memory. Although "lost" tasks might still be running, partitioned agents
would only be allowed to re-register if the master failed over, in which
case the `completedTasks` map would be emptied.

When partition-awareness was introduced, we initially followed the same
scheme, with the only difference that partition-aware tasks are marked
TASK_UNREACHABLE, not TASK_LOST.

This scheme has a few shortcomings. First, partition-aware tasks might
resume running when the partitioned agent re-registers. Second, we
re-added non-partition aware tasks when the agent re-registered but then
marked them completed when the framework is shutdown, resulting in two
entries in `completedTasks`.

This commit introduces a separate bounded map, `unreachableTasks`. These
tasks are reported separately via the HTTP endpoints, because they have
different semantics (unlike completed tasks, unreachable tasks can
resume running). The size of this map is limited by a new master flag,
`--max_unreachable_tasks_per_framework`. This commit also changes the
master to omit re-adding non-partition-aware tasks on re-registering
agents (unless the master has failed over): those tasks will shortly be
shutdown anyway.

Finally, this commit fixes a minor bug in the previous code: the
previous coding neglected to shutdown non-partition-aware frameworks
running on pre-1.0 Mesos agents that re-register with the master after
a network partition.

Review: https://reviews.apache.org/r/54183/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/44fc29fd
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/44fc29fd
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/44fc29fd

Branch: refs/heads/master
Commit: 44fc29fd62bf6b21b7e21b31542f9ef9fefc948b
Parents: 18a70b0
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:03:00 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:03:00 2017 -0800

----------------------------------------------------------------------
 docs/configuration.md                |  11 +-
 include/mesos/master/master.proto    |  14 +-
 include/mesos/v1/master/master.proto |  14 +-
 src/master/constants.hpp             |   4 +
 src/master/flags.cpp                 |   5 +
 src/master/flags.hpp                 |   1 +
 src/master/http.cpp                  |  42 ++
 src/master/master.cpp                |  98 +++-
 src/master/master.hpp                |  37 +-
 src/tests/partition_tests.cpp        | 730 +++++++++++++++++++++++++++++-
 10 files changed, 925 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index 2113d06..9ba6c1e 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -708,8 +708,7 @@ Maximum number of completed frameworks to store in memory. (default: 50)
 </tr>
 <tr>
   <td>
-    --max_completed_tasks_per_framework
-=VALUE
+    --max_completed_tasks_per_framework=VALUE
   </td>
   <td>
 Maximum number of completed tasks per framework to store in memory. (default: 1000)
@@ -717,6 +716,14 @@ Maximum number of completed tasks per framework to store in memory. (default: 10
 </tr>
 <tr>
   <td>
+    --max_unreachable_tasks_per_framework=VALUE
+  </td>
+  <td>
+Maximum number of unreachable tasks per framework to store in memory. (default: 1000)
+  </td>
+</tr>
+<tr>
+  <td>
     --offer_timeout=VALUE
   </td>
   <td>

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/include/mesos/master/master.proto
----------------------------------------------------------------------
diff --git a/include/mesos/master/master.proto b/include/mesos/master/master.proto
index 03203c8..7eaae22 100644
--- a/include/mesos/master/master.proto
+++ b/include/mesos/master/master.proto
@@ -386,10 +386,20 @@ message Response {
     // to be launched.
     repeated Task pending_tasks = 1;
 
-    // Tasks that have been forwarded to the agent for launch. This includes
-    // tasks that are running and reached terminal state.
+    // Tasks that have been forwarded to the agent for launch. This
+    // includes tasks that are staging or running; it also includes
+    // tasks that have reached a terminal state but the terminal status
+    // update has not yet been acknowledged by the scheduler.
     repeated Task tasks = 2;
 
+    // Tasks that were running on agents that have become partitioned
+    // from the master. If/when the agent is no longer partitioned,
+    // tasks running on that agent will no longer be unreachable (they
+    // will either be running or completed). Note that the master only
+    // stores a limited number of unreachable tasks; information about
+    // unreachable tasks is also not preserved across master failover.
+    repeated Task unreachable_tasks = 5;
+
     // Tasks that have reached terminal state and have all their updates
     // acknowledged by the scheduler.
     repeated Task completed_tasks = 3;

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/include/mesos/v1/master/master.proto
----------------------------------------------------------------------
diff --git a/include/mesos/v1/master/master.proto b/include/mesos/v1/master/master.proto
index f8edf39..5de3a93 100644
--- a/include/mesos/v1/master/master.proto
+++ b/include/mesos/v1/master/master.proto
@@ -386,10 +386,20 @@ message Response {
     // to be launched.
     repeated Task pending_tasks = 1;
 
-    // Tasks that have been forwarded to the agent for launch. This includes
-    // tasks that are running and reached terminal state.
+    // Tasks that have been forwarded to the agent for launch. This
+    // includes tasks that are staging or running; it also includes
+    // tasks that have reached a terminal state but the terminal status
+    // update has not yet been acknowledged by the scheduler.
     repeated Task tasks = 2;
 
+    // Tasks that were running on agents that have become partitioned
+    // from the master. If/when the agent is no longer partitioned,
+    // tasks running on that agent will no longer be unreachable (they
+    // will either be running or completed). Note that the master only
+    // stores a limited number of unreachable tasks; information about
+    // unreachable tasks is also not preserved across master failover.
+    repeated Task unreachable_tasks = 5;
+
     // Tasks that have reached terminal state and have all their updates
     // acknowledged by the scheduler.
     repeated Task completed_tasks = 3;

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/src/master/constants.hpp
----------------------------------------------------------------------
diff --git a/src/master/constants.hpp b/src/master/constants.hpp
index 900b694..7edf9f6 100644
--- a/src/master/constants.hpp
+++ b/src/master/constants.hpp
@@ -87,6 +87,10 @@ constexpr size_t DEFAULT_MAX_COMPLETED_FRAMEWORKS = 50;
 // to store in the cache.
 constexpr size_t DEFAULT_MAX_COMPLETED_TASKS_PER_FRAMEWORK = 1000;
 
+// Default maximum number of unreachable tasks per framework
+// to store in the cache.
+constexpr size_t DEFAULT_MAX_UNREACHABLE_TASKS_PER_FRAMEWORK = 1000;
+
 // Time interval to check for updated watchers list.
 constexpr Duration WHITELIST_WATCH_INTERVAL = Seconds(5);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/src/master/flags.cpp
----------------------------------------------------------------------
diff --git a/src/master/flags.cpp b/src/master/flags.cpp
index 124bfbc..d25cfdd 100644
--- a/src/master/flags.cpp
+++ b/src/master/flags.cpp
@@ -543,6 +543,11 @@ mesos::internal::master::Flags::Flags()
       "Maximum number of completed tasks per framework to store in memory.",
       DEFAULT_MAX_COMPLETED_TASKS_PER_FRAMEWORK);
 
+  add(&Flags::max_unreachable_tasks_per_framework,
+      "max_unreachable_tasks_per_framework",
+      "Maximum number of unreachable tasks per framework to store in memory.",
+      DEFAULT_MAX_UNREACHABLE_TASKS_PER_FRAMEWORK);
+
   add(&Flags::master_contender,
       "master_contender",
       "The symbol name of the master contender to use.\n"

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/src/master/flags.hpp
----------------------------------------------------------------------
diff --git a/src/master/flags.hpp b/src/master/flags.hpp
index 6a17b76..41a0edf 100644
--- a/src/master/flags.hpp
+++ b/src/master/flags.hpp
@@ -87,6 +87,7 @@ public:
   Option<std::string> http_framework_authenticators;
   size_t max_completed_frameworks;
   size_t max_completed_tasks_per_framework;
+  size_t max_unreachable_tasks_per_framework;
   Option<std::string> master_contender;
   Option<std::string> master_detector;
   Duration registry_gc_interval;

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index a44621f..51734e5 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -289,6 +289,17 @@ struct FullFrameworkWriter {
       }
     });
 
+    writer->field("unreachable_tasks", [this](JSON::ArrayWriter* writer) {
+      foreachvalue (const Owned<Task>& task, framework_->unreachableTasks) {
+        // Skip unauthorized tasks.
+        if (!approveViewTask(taskApprover_, *task.get(), framework_->info)) {
+          continue;
+        }
+
+        writer->element(*task.get());
+      }
+    });
+
     writer->field("completed_tasks", [this](JSON::ArrayWriter* writer) {
       foreach (const Owned<Task>& task, framework_->completedTasks) {
         // Skip unauthorized tasks.
@@ -2853,6 +2864,11 @@ public:
         slavesToFrameworks[task->slave_id()].insert(frameworkId);
       }
 
+      foreachvalue (const Owned<Task>& task, framework->unreachableTasks) {
+        frameworksToSlaves[frameworkId].insert(task->slave_id());
+        slavesToFrameworks[task->slave_id()].insert(frameworkId);
+      }
+
       foreach (const Owned<Task>& task, framework->completedTasks) {
         frameworksToSlaves[frameworkId].insert(task->slave_id());
         slavesToFrameworks[task->slave_id()].insert(frameworkId);
@@ -2969,6 +2985,11 @@ public:
         slaveTaskSummaries[task->slave_id()].count(*task);
       }
 
+      foreachvalue (const Owned<Task>& task, framework->unreachableTasks) {
+        frameworkTaskSummaries[frameworkId].count(*task.get());
+        slaveTaskSummaries[task->slave_id()].count(*task.get());
+      }
+
       foreach (const Owned<Task>& task, framework->completedTasks) {
         frameworkTaskSummaries[frameworkId].count(*task.get());
         slaveTaskSummaries[task->slave_id()].count(*task.get());
@@ -2989,6 +3010,7 @@ public:
     return iterator != slaveTaskSummaries.end() ?
       iterator->second : TaskStateSummary::EMPTY;
   }
+
 private:
   hashmap<FrameworkID, TaskStateSummary> frameworkTaskSummaries;
   hashmap<SlaveID, TaskStateSummary> slaveTaskSummaries;
@@ -3694,6 +3716,16 @@ Future<Response> Master::Http::tasks(
 
           tasks.push_back(task);
         }
+
+        foreachvalue (const Owned<Task>& task, framework->unreachableTasks) {
+          // Skip unauthorized tasks.
+          if (!approveViewTask(tasksApprover, *task.get(), framework->info)) {
+            continue;
+          }
+
+          tasks.push_back(task.get());
+        }
+
         foreach (const Owned<Task>& task, framework->completedTasks) {
           // Skip unauthorized tasks.
           if (!approveViewTask(tasksApprover, *task.get(), framework->info)) {
@@ -3832,6 +3864,16 @@ mesos::master::Response::GetTasks Master::Http::_getTasks(
       getTasks.add_tasks()->CopyFrom(*task);
     }
 
+    // Unreachable tasks.
+    foreachvalue (const Owned<Task>& task, framework->unreachableTasks) {
+      // Skip unauthorized tasks.
+      if (!approveViewTask(tasksApprover, *task.get(), framework->info)) {
+        continue;
+      }
+
+      getTasks.add_unreachable_tasks()->CopyFrom(*task);
+    }
+
     // Completed tasks.
     foreach (const Owned<Task>& task, framework->completedTasks) {
       // Skip unauthorized tasks.

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 7315932..731bb5b 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -5512,6 +5512,78 @@ void Master::_reregisterSlave(
   machineId.set_hostname(slaveInfo.hostname());
   machineId.set_ip(stringify(pid.address.ip));
 
+  // For easy lookup, first determine the set of FrameworkIDs on the
+  // re-registering agent that are partition-aware. We examine both
+  // the frameworks and the tasks running on the agent. The former is
+  // necessary because the master might have failed over and not know
+  // about a framework running on the agent; the latter is necessary
+  // because pre-1.0 Mesos agents don't supply a list of running
+  // frameworks on re-registration.
+  hashset<FrameworkID> partitionAwareFrameworks;
+
+  // TODO(neilc): Remove this loop when we remove compatibility with
+  // pre-1.0 Mesos agents.
+  foreach (const Task& task, tasks) {
+    Framework* framework = getFramework(task.framework_id());
+
+    if (framework == nullptr) {
+      continue;
+    }
+
+    if (protobuf::frameworkHasCapability(
+            framework->info, FrameworkInfo::Capability::PARTITION_AWARE)) {
+      partitionAwareFrameworks.insert(framework->id());
+    }
+  }
+
+  foreach (const FrameworkInfo& framework, frameworks) {
+    if (protobuf::frameworkHasCapability(
+            framework, FrameworkInfo::Capability::PARTITION_AWARE)) {
+      partitionAwareFrameworks.insert(framework.id());
+    }
+  }
+
+  // Check if this master was the one that removed the reregistering
+  // agent from the cluster originally. This is false if the master
+  // has failed over since the agent was removed, for example.
+  //
+  // TODO(neilc): Since `removed` is a cache, we might mistakenly
+  // think the master has failed over and neglect to shutdown
+  // non-partition-aware frameworks on reregistering agents.
+  bool slaveWasRemoved = slaves.removed.get(slaveInfo.id()).isSome();
+
+  // Decide how to handle the tasks running on the agent:
+  //
+  // (a) If the master has not failed over since the agent was marked
+  // unreachable, only partition-aware tasks are re-added to the
+  // master (those tasks were previously marked "unreachable", so they
+  // should be removed from that collection). Any non-partition-aware
+  // frameworks running on the agent are shutdown. We already marked
+  // such tasks "completed" when the agent was marked unreachable, so
+  // no further cleanup for non-partition-aware tasks is required.
+  //
+  // (b) If the master has failed over, all tasks are re-added to the
+  // master. The master shouldn't have any record of the tasks running
+  // on the agent, so no further cleanup is required.
+  vector<Task> tasks_;
+  foreach (const Task& task, tasks) {
+    const FrameworkID& frameworkId = task.framework_id();
+    Framework* framework = getFramework(frameworkId);
+
+    // Always re-add partition-aware tasks.
+    if (partitionAwareFrameworks.contains(frameworkId)) {
+      tasks_.push_back(task);
+
+      if (framework != nullptr) {
+        framework->unreachableTasks.erase(task.task_id());
+      }
+    } else if (!slaveWasRemoved) {
+      // Only re-add non-partition-aware tasks if the master has
+      // failed over since the agent was marked unreachable.
+      tasks_.push_back(task);
+    }
+  }
+
   Slave* slave = new Slave(
       this,
       slaveInfo,
@@ -5521,20 +5593,12 @@ void Master::_reregisterSlave(
       Clock::now(),
       checkpointedResources,
       executorInfos,
-      tasks);
+      tasks_);
 
   slave->reregisteredTime = Clock::now();
 
   ++metrics->slave_reregistrations;
 
-  // Check if this master was the one that removed the reregistering
-  // agent from the cluster originally. This is false if the master
-  // has failed over since the agent was removed, for example. Since
-  // `removed` is a cache, we might mistakenly think the master has
-  // failed over and neglect to remove non-partition-aware frameworks
-  // on reregistering agents, but that should be rare in practice.
-  bool slaveWasRemoved = slaves.removed.get(slave->id).isSome();
-
   slaves.removed.erase(slave->id);
   slaves.unreachable.erase(slave->id);
 
@@ -5562,8 +5626,7 @@ void Master::_reregisterSlave(
   // master has not failed over.
   if (slaveWasRemoved) {
     foreach (const FrameworkInfo& framework, frameworks) {
-      if (!protobuf::frameworkHasCapability(
-              framework, FrameworkInfo::Capability::PARTITION_AWARE)) {
+      if (!partitionAwareFrameworks.contains(framework.id())) {
         LOG(INFO) << "Shutting down framework " << framework.id()
                   << " at reregistered agent " << *slave
                   << " because the framework is not partition-aware";
@@ -5572,10 +5635,9 @@ void Master::_reregisterSlave(
         message.mutable_framework_id()->MergeFrom(framework.id());
         send(slave->pid, message);
 
-        // Remove the framework's tasks from the master's in-memory state.
-        foreachvalue (Task* task, utils::copy(slave->tasks[framework.id()])) {
-          removeTask(task);
-        }
+        // The framework's tasks should not be stored in the master's
+        // in-memory state, because they were not re-added above.
+        CHECK(!slave->tasks.contains(framework.id()));
       }
     }
   }
@@ -5882,6 +5944,12 @@ void Master::statusUpdate(StatusUpdate update, const UPID& pid)
   // Lookup the task and see if we need to update anything locally.
   Task* task = slave->getTask(update.framework_id(), update.status().task_id());
   if (task == nullptr) {
+    // TODO(neilc): We might see status updates for non-partition
+    // aware tasks running on a partitioned agent that has
+    // re-registered with the master. The master marks such tasks
+    // completed when the agent partitions; it will shutdown the
+    // framework when the agent-reregisters, but we may see a number
+    // of status updates before the framework is shutdown.
     LOG(WARNING) << "Could not lookup task for status update " << update
                  << " from agent " << *slave;
     metrics->invalid_status_updates++;

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 8e8a903..fe0590f 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -2256,7 +2256,8 @@ struct Framework
       active(true),
       registeredTime(time),
       reregisteredTime(time),
-      completedTasks(masterFlags.max_completed_tasks_per_framework) {}
+      completedTasks(masterFlags.max_completed_tasks_per_framework),
+      unreachableTasks(masterFlags.max_unreachable_tasks_per_framework) {}
 
   Framework(Master* const _master,
             const Flags& masterFlags,
@@ -2271,7 +2272,8 @@ struct Framework
       active(true),
       registeredTime(time),
       reregisteredTime(time),
-      completedTasks(masterFlags.max_completed_tasks_per_framework) {}
+      completedTasks(masterFlags.max_completed_tasks_per_framework),
+      unreachableTasks(masterFlags.max_unreachable_tasks_per_framework) {}
 
   Framework(Master* const _master,
             const Flags& masterFlags,
@@ -2281,7 +2283,8 @@ struct Framework
       capabilities(_info.capabilities()),
       state(RECOVERED),
       active(false),
-      completedTasks(masterFlags.max_completed_tasks_per_framework) {}
+      completedTasks(masterFlags.max_completed_tasks_per_framework),
+      unreachableTasks(masterFlags.max_unreachable_tasks_per_framework) {}
 
   ~Framework()
   {
@@ -2361,6 +2364,15 @@ struct Framework
     completedTasks.push_back(process::Owned<Task>(new Task(task)));
   }
 
+  void addUnreachableTask(const Task& task)
+  {
+    CHECK(protobuf::frameworkHasCapability(
+              info, FrameworkInfo::Capability::PARTITION_AWARE));
+
+    // TODO(adam-mesos): Check if unreachable task already exists.
+    unreachableTasks.set(task.task_id(), process::Owned<Task>(new Task(task)));
+  }
+
   void removeTask(Task* task)
   {
     CHECK(tasks.contains(task->task_id()))
@@ -2375,7 +2387,11 @@ struct Framework
       }
     }
 
-    addCompletedTask(*task);
+    if (task->state() == TASK_UNREACHABLE) {
+      addUnreachableTask(*task);
+    } else {
+      addCompletedTask(*task);
+    }
 
     tasks.erase(task->task_id());
   }
@@ -2630,9 +2646,20 @@ struct Framework
 
   // Tasks launched by this framework that have reached a terminal
   // state and have had all their updates acknowledged. We only keep a
-  // fixed-size cache to avoid consuming too much memory.
+  // fixed-size cache to avoid consuming too much memory. We use
+  // boost::circular_buffer rather than BoundedHashMap because there
+  // can be multiple completed tasks with the same task ID.
+  //
+  // NOTE: When an agent is marked unreachable, non-partition-aware
+  // tasks are marked TASK_LOST and stored here; partition-aware tasks
+  // are marked TASK_UNREACHABLE and stored in `unreachableTasks`.
   boost::circular_buffer<process::Owned<Task>> completedTasks;
 
+  // Partition-aware tasks running on agents that have been marked
+  // unreachable. We only keep a fixed-size cache to avoid consuming
+  // too much memory.
+  BoundedHashMap<TaskID, process::Owned<Task>> unreachableTasks;
+
   hashset<Offer*> offers; // Active offers for framework.
 
   hashset<InverseOffer*> inverseOffers; // Active inverse offers for framework.

http://git-wip-us.apache.org/repos/asf/mesos/blob/44fc29fd/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index e7f6418..a6919c5 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -72,6 +72,7 @@ using testing::AtMost;
 using testing::DoAll;
 using testing::Eq;
 using testing::Return;
+using testing::SaveArg;
 
 using ::testing::WithParamInterface;
 
@@ -275,6 +276,81 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
 
+  // Check the master's "/state" endpoint. The "tasks" and
+  // "completed_tasks" fields should be empty; there should be a
+  // single unreachable task.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    JSON::Array completedTasks =
+      framework.values["completed_tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(completedTasks.values.empty());
+
+    JSON::Array runningTasks = framework.values["tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(runningTasks.values.empty());
+
+    JSON::Array unreachableTasks =
+      framework.values["unreachable_tasks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, unreachableTasks.values.size());
+
+    JSON::Object unreachableTask =
+      unreachableTasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(), unreachableTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_UNREACHABLE",
+        unreachableTask.values["state"].as<JSON::String>().value);
+  }
+
+  // Check the master's "/tasks" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "tasks",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    JSON::Array tasks = parse->values["tasks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, tasks.values.size());
+
+    JSON::Object jsonTask = tasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(), jsonTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_UNREACHABLE",
+        jsonTask.values["state"].as<JSON::String>().value);
+  }
+
   // We now complete the partition on the slave side as well. We
   // simulate a master loss event, which would normally happen during
   // a network partition. The slave should then reregister with the
@@ -307,6 +383,77 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
 
   Clock::resume();
 
+  // Check the master's "/state" endpoint. The "unreachable_tasks" and
+  // "completed_tasks" fields should be empty; there should be a
+  // single running task.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    JSON::Array completedTasks =
+      framework.values["completed_tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(completedTasks.values.empty());
+
+    JSON::Array unreachableTasks =
+      framework.values["unreachable_tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(unreachableTasks.values.empty());
+
+    JSON::Array tasks = framework.values["tasks"].as<JSON::Array>();
+
+    JSON::Object activeTask = tasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(), activeTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_RUNNING", activeTask.values["state"].as<JSON::String>().value);
+  }
+
+  // Check the master's "/tasks" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "tasks",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    JSON::Array tasks = parse->values["tasks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, tasks.values.size());
+
+    JSON::Object jsonTask = tasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(), jsonTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_RUNNING",
+        jsonTask.values["state"].as<JSON::String>().value);
+  }
+
   driver.stop();
   driver.join();
 }
@@ -432,6 +579,53 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlaveNotPartitionAware)
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
   EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
 
+  // Check the master's "/state" endpoint. The "tasks" and
+  // "unreachable_tasks" fields should be empty; there should be a
+  // single completed task (we report LOST tasks as "completed" for
+  // backward compatibility).
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    JSON::Array runningTasks = framework.values["tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(runningTasks.values.empty());
+
+    JSON::Array unreachableTasks =
+      framework.values["unreachable_tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(unreachableTasks.values.empty());
+
+    JSON::Array completedTasks =
+      framework.values["completed_tasks"].as<JSON::Array>();
+
+    JSON::Object completedTask =
+      completedTasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(), completedTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_LOST",
+        completedTask.values["state"].as<JSON::String>().value);
+  }
+
   // We now complete the partition on the slave side as well. We
   // simulate a master loss event, which would normally happen during
   // a network partition. The slave should then reregister with the
@@ -466,6 +660,53 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlaveNotPartitionAware)
 
   Clock::resume();
 
+  // Check the master's "/state" endpoint. The "tasks" and
+  // "unreachable_tasks" fields should be empty; there should be a
+  // single completed task (we report LOST tasks as "completed" for
+  // backward compatibility).
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    JSON::Array runningTasks = framework.values["tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(runningTasks.values.empty());
+
+    JSON::Array unreachableTasks =
+      framework.values["unreachable_tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(unreachableTasks.values.empty());
+
+    JSON::Array completedTasks =
+      framework.values["completed_tasks"].as<JSON::Array>();
+
+    JSON::Object completedTask =
+      completedTasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(), completedTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_LOST",
+        completedTask.values["state"].as<JSON::String>().value);
+  }
+
   driver.stop();
   driver.join();
 }
@@ -897,17 +1138,23 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, PartitionedSlaveOrphanedTask)
       jsonFramework.values["id"].as<JSON::String>().value);
 
   EXPECT_TRUE(jsonFramework.values["tasks"].as<JSON::Array>().values.empty());
+  EXPECT_TRUE(
+      jsonFramework.values["completed_tasks"].as<JSON::Array>().values.empty());
 
-  JSON::Array completedTasks =
-    jsonFramework.values["completed_tasks"].as<JSON::Array>();
+  JSON::Array unreachableTasks =
+    jsonFramework.values["unreachable_tasks"].as<JSON::Array>();
 
-  ASSERT_EQ(1u, completedTasks.values.size());
+  ASSERT_EQ(1u, unreachableTasks.values.size());
 
-  JSON::Object completedTask = completedTasks.values.front().as<JSON::Object>();
+  JSON::Object unreachableTask =
+    unreachableTasks.values.front().as<JSON::Object>();
 
   EXPECT_EQ(
+      task.task_id(),
+      unreachableTask.values["id"].as<JSON::String>().value);
+  EXPECT_EQ(
       "TASK_UNREACHABLE",
-      completedTask.values["state"].as<JSON::String>().value);
+      unreachableTask.values["state"].as<JSON::String>().value);
 }
 
 
@@ -1453,6 +1700,479 @@ TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
 }
 
 
+// This test verifies that when a non-partition-aware task finishes
+// while an agent is partitioned, that task is displayed correctly at
+// the master after the partition heals.
+TEST_F(PartitionTest, TaskCompletedOnPartitionedAgent)
+{
+  Clock::pause();
+
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Allow the master to PING the slave, but drop all PONG messages
+  // from the slave. Note that we don't match on the master / slave
+  // PIDs because it's actually the `SlaveObserver` process that sends
+  // the pings.
+  Future<Message> ping = FUTURE_MESSAGE(
+      Eq(PingSlaveMessage().GetTypeName()), _, _);
+
+  DROP_PROTOBUFS(PongSlaveMessage(), _, _);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+  TestContainerizer containerizer(&exec);
+  StandaloneMasterDetector detector(master.get()->pid);
+
+  slave::Flags agentFlags = CreateSlaveFlags();
+  Try<Owned<cluster::Slave>> slave =
+    StartSlave(&detector, &containerizer, agentFlags);
+  ASSERT_SOME(slave);
+
+  // Start a non-partition-aware scheduler.
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  Clock::advance(agentFlags.registration_backoff_factor);
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->empty());
+
+  Offer offer = offers->at(0);
+  SlaveID slaveId = offer.slave_id();
+
+  ExecutorDriver* execDriver;
+  EXPECT_CALL(exec, registered(_, _, _, _))
+    .WillOnce(SaveArg<0>(&execDriver));
+
+  Future<TaskInfo> execTask;
+  EXPECT_CALL(exec, launchTask(_, _))
+    .WillOnce(FutureArg<1>(&execTask));
+
+  Future<Nothing> runningAtScheduler;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureSatisfy(&runningAtScheduler));
+
+  Future<Nothing> statusUpdateAck = FUTURE_DISPATCH(
+      slave.get()->pid, &Slave::_statusUpdateAcknowledgement);
+
+  TaskInfo task;
+  task.set_name("test-task");
+  task.mutable_task_id()->set_value("1");
+  task.mutable_slave_id()->MergeFrom(slaveId);
+  task.mutable_resources()->MergeFrom(offer.resources());
+  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
+
+  driver.launchTasks(offer.id(), {task});
+
+  AWAIT_READY(execTask);
+
+  {
+    TaskStatus runningStatus;
+    runningStatus.mutable_task_id()->MergeFrom(execTask->task_id());
+    runningStatus.set_state(TASK_RUNNING);
+
+    execDriver->sendStatusUpdate(runningStatus);
+  }
+
+  AWAIT_READY(runningAtScheduler);
+  AWAIT_READY(statusUpdateAck);
+
+  // Now, induce a partition of the slave by having the master
+  // timeout the slave.
+  Future<TaskStatus> lostStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&lostStatus));
+
+  Future<Nothing> slaveLost;
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .WillOnce(FutureSatisfy(&slaveLost));
+
+  size_t pings = 0;
+  while (true) {
+    AWAIT_READY(ping);
+    pings++;
+    if (pings == masterFlags.max_agent_ping_timeouts) {
+      break;
+    }
+    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
+    Clock::advance(masterFlags.agent_ping_timeout);
+  }
+
+  Clock::advance(masterFlags.agent_ping_timeout);
+  Clock::settle();
+
+  // The scheduler should see TASK_LOST because it is not
+  // PARTITION_AWARE.
+  AWAIT_READY(lostStatus);
+  EXPECT_EQ(TASK_LOST, lostStatus->state());
+  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, lostStatus->reason());
+  EXPECT_EQ(task.task_id(), lostStatus->task_id());
+  EXPECT_EQ(slaveId, lostStatus->slave_id());
+  EXPECT_TRUE(lostStatus->has_unreachable_time());
+
+  AWAIT_READY(slaveLost);
+
+  // Have the executor inform the slave that the task has finished.
+  {
+    TaskStatus finishedStatus;
+    finishedStatus.mutable_task_id()->MergeFrom(execTask->task_id());
+    finishedStatus.set_state(TASK_FINISHED);
+
+    execDriver->sendStatusUpdate(finishedStatus);
+  }
+
+  // Cause the slave to reregister with the master. Because the
+  // framework is not partition-aware, this results in shutting down
+  // the executor on the slave. The enqueued TASK_FINISHED update
+  // should also be propagated to the scheduler.
+  detector.appoint(None());
+
+  Future<SlaveReregisteredMessage> slaveReregistered = FUTURE_PROTOBUF(
+      SlaveReregisteredMessage(), master.get()->pid, slave.get()->pid);
+
+  Future<Nothing> execShutdown;
+  EXPECT_CALL(exec, shutdown(_))
+    .WillOnce(FutureSatisfy(&execShutdown));
+
+  Future<TaskStatus> finishedStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&finishedStatus));
+
+  detector.appoint(master.get()->pid);
+
+  Clock::advance(agentFlags.registration_backoff_factor);
+  AWAIT_READY(slaveReregistered);
+  AWAIT_READY(execShutdown);
+
+  AWAIT_READY(finishedStatus);
+  EXPECT_EQ(TASK_FINISHED, finishedStatus->state());
+  EXPECT_EQ(task.task_id(), finishedStatus->task_id());
+  EXPECT_EQ(slaveId, finishedStatus->slave_id());
+
+  // Perform explicit reconciliation. The task should not be running
+  // (TASK_LOST) because the framework is not PARTITION_AWARE.
+  TaskStatus status;
+  status.mutable_task_id()->CopyFrom(task.task_id());
+  status.mutable_slave_id()->CopyFrom(slaveId);
+  status.set_state(TASK_STAGING); // Dummy value.
+
+  Future<TaskStatus> reconcileUpdate;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&reconcileUpdate));
+
+  driver.reconcileTasks({status});
+
+  AWAIT_READY(reconcileUpdate);
+  EXPECT_EQ(TASK_LOST, reconcileUpdate->state());
+  EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate->reason());
+  EXPECT_FALSE(reconcileUpdate->has_unreachable_time());
+
+  Clock::resume();
+
+  // Check the master's "/state" endpoint. The "tasks" and
+  // "unreachable_tasks" fields should be empty; there should be a
+  // single completed task.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    JSON::Array runningTasks = framework.values["tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(runningTasks.values.empty());
+
+    JSON::Array unreachableTasks =
+      framework.values["unreachable_tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(unreachableTasks.values.empty());
+
+    JSON::Array completedTasks =
+      framework.values["completed_tasks"].as<JSON::Array>();
+
+    JSON::Object completedTask =
+      completedTasks.values.front().as<JSON::Object>();
+
+    // TODO(neilc): It might be better to report TASK_FINISHED here. We
+    // report TASK_LOST currently because the master doesn't update
+    // the state of tasks it has already marked as completed.
+    EXPECT_EQ(
+        task.task_id(), completedTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_LOST",
+        completedTask.values["state"].as<JSON::String>().value);
+  }
+
+  driver.stop();
+  driver.join();
+}
+
+
+// This test verifies that when a partition-aware task finishes while
+// an agent is partitioned, that task is displayed correctly at the
+// master after the partition heals.
+TEST_F(PartitionTest, PartitionAwareTaskCompletedOnPartitionedAgent)
+{
+  Clock::pause();
+
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Allow the master to PING the slave, but drop all PONG messages
+  // from the slave. Note that we don't match on the master / slave
+  // PIDs because it's actually the `SlaveObserver` process that sends
+  // the pings.
+  Future<Message> ping = FUTURE_MESSAGE(
+      Eq(PingSlaveMessage().GetTypeName()), _, _);
+
+  DROP_PROTOBUFS(PongSlaveMessage(), _, _);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+  TestContainerizer containerizer(&exec);
+  StandaloneMasterDetector detector(master.get()->pid);
+
+  slave::Flags agentFlags = CreateSlaveFlags();
+  Try<Owned<cluster::Slave>> slave =
+    StartSlave(&detector, &containerizer, agentFlags);
+  ASSERT_SOME(slave);
+
+  // Start a partition-aware scheduler.
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::PARTITION_AWARE);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  Clock::advance(agentFlags.registration_backoff_factor);
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->empty());
+
+  Offer offer = offers->at(0);
+  SlaveID slaveId = offer.slave_id();
+
+  ExecutorDriver* execDriver;
+  EXPECT_CALL(exec, registered(_, _, _, _))
+    .WillOnce(SaveArg<0>(&execDriver));
+
+  Future<TaskInfo> execTask;
+  EXPECT_CALL(exec, launchTask(_, _))
+    .WillOnce(FutureArg<1>(&execTask));
+
+  Future<Nothing> runningAtScheduler;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureSatisfy(&runningAtScheduler));
+
+  Future<Nothing> statusUpdateAck = FUTURE_DISPATCH(
+      slave.get()->pid, &Slave::_statusUpdateAcknowledgement);
+
+  TaskInfo task;
+  task.set_name("test-task");
+  task.mutable_task_id()->set_value("1");
+  task.mutable_slave_id()->MergeFrom(slaveId);
+  task.mutable_resources()->MergeFrom(offer.resources());
+  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
+
+  driver.launchTasks(offer.id(), {task});
+
+  AWAIT_READY(execTask);
+
+  {
+    TaskStatus runningStatus;
+    runningStatus.mutable_task_id()->MergeFrom(execTask->task_id());
+    runningStatus.set_state(TASK_RUNNING);
+
+    execDriver->sendStatusUpdate(runningStatus);
+  }
+
+  AWAIT_READY(runningAtScheduler);
+  AWAIT_READY(statusUpdateAck);
+
+  // Now, induce a partition of the slave by having the master
+  // timeout the slave.
+  Future<TaskStatus> unreachableStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&unreachableStatus));
+
+  Future<Nothing> slaveLost;
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .WillOnce(FutureSatisfy(&slaveLost));
+
+  size_t pings = 0;
+  while (true) {
+    AWAIT_READY(ping);
+    pings++;
+    if (pings == masterFlags.max_agent_ping_timeouts) {
+      break;
+    }
+    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
+    Clock::advance(masterFlags.agent_ping_timeout);
+  }
+
+  Clock::advance(masterFlags.agent_ping_timeout);
+  Clock::settle();
+
+  // The scheduler should see TASK_UNREACHABLE because it is
+  // PARTITION_AWARE.
+  AWAIT_READY(unreachableStatus);
+  EXPECT_EQ(TASK_UNREACHABLE, unreachableStatus->state());
+  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, unreachableStatus->reason());
+  EXPECT_EQ(task.task_id(), unreachableStatus->task_id());
+  EXPECT_EQ(slaveId, unreachableStatus->slave_id());
+  EXPECT_TRUE(unreachableStatus->has_unreachable_time());
+
+  AWAIT_READY(slaveLost);
+
+  // Have the executor inform the slave that the task has finished.
+  {
+    TaskStatus finishedStatus;
+    finishedStatus.mutable_task_id()->MergeFrom(execTask->task_id());
+    finishedStatus.set_state(TASK_FINISHED);
+
+    execDriver->sendStatusUpdate(finishedStatus);
+  }
+
+  // Cause the slave to reregister with the master. Because the
+  // framework is partition-aware, this should not result in shutting
+  // down the executor on the slave. The enqueued TASK_FINISHED update
+  // should also be propagated to the scheduler.
+  //
+  // The master sends the current framework PID to the slave via
+  // `UpdateFrameworkMessage`; the slave then resends any pending
+  // status updates. This would make the test non-deterministic; since
+  // the framework PID is unchanged, drop the `UpdateFrameworkMessage`.
+  detector.appoint(None());
+
+  Future<SlaveReregisteredMessage> slaveReregistered = FUTURE_PROTOBUF(
+      SlaveReregisteredMessage(), master.get()->pid, slave.get()->pid);
+
+  Future<UpdateFrameworkMessage> frameworkUpdate = DROP_PROTOBUF(
+      UpdateFrameworkMessage(), master.get()->pid, slave.get()->pid);
+
+  Future<TaskStatus> finishedStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&finishedStatus));
+
+  detector.appoint(master.get()->pid);
+
+  Clock::advance(agentFlags.registration_backoff_factor);
+  AWAIT_READY(slaveReregistered);
+  AWAIT_READY(frameworkUpdate);
+
+  AWAIT_READY(finishedStatus);
+  EXPECT_EQ(TASK_FINISHED, finishedStatus->state());
+  EXPECT_EQ(task.task_id(), finishedStatus->task_id());
+  EXPECT_EQ(slaveId, finishedStatus->slave_id());
+
+  // Perform explicit reconciliation. The task should not be running.
+  TaskStatus status;
+  status.mutable_task_id()->CopyFrom(task.task_id());
+  status.mutable_slave_id()->CopyFrom(slaveId);
+  status.set_state(TASK_STAGING); // Dummy value.
+
+  Future<TaskStatus> reconcileUpdate;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&reconcileUpdate));
+
+  driver.reconcileTasks({status});
+
+  AWAIT_READY(reconcileUpdate);
+  EXPECT_EQ(TASK_UNKNOWN, reconcileUpdate->state());
+  EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate->reason());
+  EXPECT_FALSE(reconcileUpdate->has_unreachable_time());
+
+  // Check the master's "/state" endpoint. The "tasks" and
+  // "unreachable_tasks" fields should be empty; there should be a
+  // single completed task.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    JSON::Array runningTasks = framework.values["tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(runningTasks.values.empty());
+
+    JSON::Array unreachableTasks =
+      framework.values["unreachable_tasks"].as<JSON::Array>();
+
+    EXPECT_TRUE(unreachableTasks.values.empty());
+
+    JSON::Array completedTasks =
+      framework.values["completed_tasks"].as<JSON::Array>();
+
+    JSON::Object completedTask =
+      completedTasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(), completedTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_FINISHED",
+        completedTask.values["state"].as<JSON::String>().value);
+  }
+
+  EXPECT_CALL(exec, shutdown(_))
+    .Times(AtMost(1));
+
+  Clock::resume();
+
+  driver.stop();
+  driver.join();
+}
+
+
 // This test checks that the master correctly garbage collects
 // information about unreachable agents from the registry using the
 // count-based GC criterion.


[18/25] mesos git commit: Added new master metric, "unreachable_slaves".

Posted by vi...@apache.org.
Added new master metric, "unreachable_slaves".

This reports the number of unreachable agents the master is
tracking. Because the master GCs the list of unreachable agents, this
might be smaller than the number of agents that have been marked
unreachable and never re-registered.

Review: https://reviews.apache.org/r/55804/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a16c557a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a16c557a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a16c557a

Branch: refs/heads/master
Commit: a16c557a276acb082fd11ca7ad550f4935a29d74
Parents: 4332ffa
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:05:07 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:27 2017 -0800

----------------------------------------------------------------------
 docs/monitoring.md            |  9 +++++++++
 src/master/http.cpp           |  1 +
 src/master/master.cpp         |  6 ++++++
 src/master/master.hpp         |  1 +
 src/master/metrics.cpp        |  3 +++
 src/master/metrics.hpp        |  1 +
 src/tests/partition_tests.cpp | 10 +++++++---
 7 files changed, 28 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a16c557a/docs/monitoring.md
----------------------------------------------------------------------
diff --git a/docs/monitoring.md b/docs/monitoring.md
index f32ee40..59ee364 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -403,6 +403,15 @@ unhealthy or that they are not able to connect to the elected master.
   <td>Number of inactive agents</td>
   <td>Gauge</td>
 </tr>
+<tr>
+  <td>
+  <code>master/slaves_inactive</code>
+  </td>
+  <td>Number of unreachable agents. Unreachable agents are periodically
+      garbage collected from the registry, which will cause this value to
+      decrease.</td>
+  <td>Gauge</td>
+</tr>
 </table>
 
 #### Frameworks

http://git-wip-us.apache.org/repos/asf/mesos/blob/a16c557a/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index 10382f4..add88e1 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -2642,6 +2642,7 @@ Future<Response> Master::Http::state(
         writer->field("hostname", master->info().hostname());
         writer->field("activated_slaves", master->_slaves_active());
         writer->field("deactivated_slaves", master->_slaves_inactive());
+        writer->field("unreachable_slaves", master->_slaves_unreachable());
 
         // TODO(haosdent): Deprecated this in favor of `leader_info` below.
         if (master->leader.isSome()) {

http://git-wip-us.apache.org/repos/asf/mesos/blob/a16c557a/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 5e6eeb9..994b43b 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8462,6 +8462,12 @@ double Master::_slaves_inactive()
 }
 
 
+double Master::_slaves_unreachable()
+{
+  return slaves.unreachable.size();
+}
+
+
 double Master::_frameworks_connected()
 {
   double count = 0.0;

http://git-wip-us.apache.org/repos/asf/mesos/blob/a16c557a/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 17d7f03..9c610d1 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -1800,6 +1800,7 @@ private:
   double _slaves_disconnected();
   double _slaves_active();
   double _slaves_inactive();
+  double _slaves_unreachable();
 
   double _frameworks_connected();
   double _frameworks_disconnected();

http://git-wip-us.apache.org/repos/asf/mesos/blob/a16c557a/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 1f049f3..4c7072c 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -56,6 +56,9 @@ Metrics::Metrics(const Master& master)
     slaves_inactive(
         "master/slaves_inactive",
         defer(master, &Master::_slaves_inactive)),
+    slaves_unreachable(
+        "master/slaves_unreachable",
+        defer(master, &Master::_slaves_unreachable)),
     frameworks_connected(
         "master/frameworks_connected",
         defer(master, &Master::_frameworks_connected)),

http://git-wip-us.apache.org/repos/asf/mesos/blob/a16c557a/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index 056d290..62c3620 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -48,6 +48,7 @@ struct Metrics
   process::metrics::Gauge slaves_disconnected;
   process::metrics::Gauge slaves_active;
   process::metrics::Gauge slaves_inactive;
+  process::metrics::Gauge slaves_unreachable;
 
   process::metrics::Gauge frameworks_connected;
   process::metrics::Gauge frameworks_disconnected;

http://git-wip-us.apache.org/repos/asf/mesos/blob/a16c557a/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index 9f7e4d9..3bcb463 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -275,9 +275,9 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
   EXPECT_EQ(1, stats.values["master/slave_removals"]);
   EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
 
-  // Check the master's "/state" endpoint. The "tasks" and
-  // "completed_tasks" fields should be empty; there should be a
-  // single unreachable task.
+  // Check the master's "/state" endpoint. There should be a single
+  // unreachable agent. The "tasks" and "completed_tasks" fields
+  // should be empty; there should be a single unreachable task.
   {
     Future<Response> response = process::http::get(
         master.get()->pid,
@@ -291,6 +291,10 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
     Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
     ASSERT_SOME(parse);
 
+    EXPECT_EQ(0, parse->values["activated_slaves"].as<JSON::Number>());
+    EXPECT_EQ(0, parse->values["deactivated_slaves"].as<JSON::Number>());
+    EXPECT_EQ(1, parse->values["unreachable_slaves"].as<JSON::Number>());
+
     EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
 
     JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();


[09/25] mesos git commit: Added TASK_UNREACHABLE to master's state-summary endpoint.

Posted by vi...@apache.org.
Added TASK_UNREACHABLE to master's state-summary endpoint.

Review: https://reviews.apache.org/r/54312/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/b5711fdd
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/b5711fdd
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/b5711fdd

Branch: refs/heads/master
Commit: b5711fdd12998a1b5bba206066b09a93ffc1a087
Parents: 482796b
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:00 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:22 2017 -0800

----------------------------------------------------------------------
 src/master/http.cpp           | 10 +++-
 src/tests/partition_tests.cpp | 93 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/b5711fdd/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index 51734e5..3dc83dd 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -3108,7 +3108,11 @@ Future<Response> Master::Http::stateSummary(
               const TaskStateSummary& summary =
                 taskStateSummaries.slave(slave->id);
 
-              // TODO(neilc): Update for new PARTITION_AWARE task statuses.
+              // Certain per-agent status totals will always be zero
+              // (e.g., TASK_ERROR, TASK_UNREACHABLE). We report them
+              // here anyway, for completeness.
+              //
+              // TODO(neilc): Update for TASK_GONE and TASK_GONE_BY_OPERATOR.
               writer->field("TASK_STAGING", summary.staging);
               writer->field("TASK_STARTING", summary.starting);
               writer->field("TASK_RUNNING", summary.running);
@@ -3118,6 +3122,7 @@ Future<Response> Master::Http::stateSummary(
               writer->field("TASK_FAILED", summary.failed);
               writer->field("TASK_LOST", summary.lost);
               writer->field("TASK_ERROR", summary.error);
+              writer->field("TASK_UNREACHABLE", summary.unreachable);
 
               // Add the ids of all the frameworks running on this slave.
               const hashset<FrameworkID>& frameworks =
@@ -3162,7 +3167,7 @@ Future<Response> Master::Http::stateSummary(
               const TaskStateSummary& summary =
                 taskStateSummaries.framework(frameworkId);
 
-              // TODO(neilc): Update for new PARTITION_AWARE task statuses.
+              // TODO(neilc): Update for TASK_GONE and TASK_GONE_BY_OPERATOR.
               writer->field("TASK_STAGING", summary.staging);
               writer->field("TASK_STARTING", summary.starting);
               writer->field("TASK_RUNNING", summary.running);
@@ -3172,6 +3177,7 @@ Future<Response> Master::Http::stateSummary(
               writer->field("TASK_FAILED", summary.failed);
               writer->field("TASK_LOST", summary.lost);
               writer->field("TASK_ERROR", summary.error);
+              writer->field("TASK_UNREACHABLE", summary.unreachable);
 
               // Add the ids of all the slaves running this framework.
               const hashset<SlaveID>& slaves =

http://git-wip-us.apache.org/repos/asf/mesos/blob/b5711fdd/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index 47f6dc9..6961ff6 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -351,6 +351,30 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
         jsonTask.values["state"].as<JSON::String>().value);
   }
 
+  // Check the master's "/state-summary" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state-summary",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(0, framework.values["TASK_LOST"].as<JSON::Number>());
+    EXPECT_EQ(0, framework.values["TASK_RUNNING"].as<JSON::Number>());
+    EXPECT_EQ(1, framework.values["TASK_UNREACHABLE"].as<JSON::Number>());
+  }
+
   // We now complete the partition on the slave side as well. We
   // simulate a master loss event, which would normally happen during
   // a network partition. The slave should then reregister with the
@@ -454,6 +478,31 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
         jsonTask.values["state"].as<JSON::String>().value);
   }
 
+  // Check the master's "/state-summary" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state-summary",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(0, framework.values["TASK_LOST"].as<JSON::Number>());
+    EXPECT_EQ(0, framework.values["TASK_UNREACHABLE"].as<JSON::Number>());
+    EXPECT_EQ(1, framework.values["TASK_RUNNING"].as<JSON::Number>());
+  }
+
   driver.stop();
   driver.join();
 }
@@ -707,6 +756,31 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlaveNotPartitionAware)
         completedTask.values["state"].as<JSON::String>().value);
   }
 
+  // Check the master's "/state-summary" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state-summary",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, frameworks.values.size());
+
+    JSON::Object framework = frameworks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(0, framework.values["TASK_RUNNING"].as<JSON::Number>());
+    EXPECT_EQ(0, framework.values["TASK_UNREACHABLE"].as<JSON::Number>());
+    EXPECT_EQ(1, framework.values["TASK_LOST"].as<JSON::Number>());
+  }
+
   driver.stop();
   driver.join();
 }
@@ -1209,6 +1283,25 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, PartitionedSlaveOrphanedTask)
         completedTask.values["state"].as<JSON::String>().value);
   }
 
+  // Check the master's "/state-summary" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state-summary",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
+
+    EXPECT_TRUE( frameworks.values.empty());
+  }
+
   // Also check the master's "/tasks" endpoint.
   {
     Future<Response> response = process::http::get(


[05/25] mesos git commit: Renamed `taskTerminated` for Slave/Framework to `recoverResources`.

Posted by vi...@apache.org.
Renamed `taskTerminated` for Slave/Framework to `recoverResources`.

The old name was misleading: these functions are invoked when a task
becomes unreachable, which does not count as "task termination".

Review: https://reviews.apache.org/r/55474/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/484db11a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/484db11a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/484db11a

Branch: refs/heads/master
Commit: 484db11a4e75696fa8e047508975f8061e3c0bb4
Parents: 33e2b21
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:03:35 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:03:35 2017 -0800

----------------------------------------------------------------------
 src/master/master.cpp |  6 +++---
 src/master/master.hpp | 12 ++++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/484db11a/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index decd3b0..be2db4d 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8038,11 +8038,11 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
     Slave* slave = slaves.registered.get(task->slave_id());
     CHECK_NOTNULL(slave);
 
-    slave->taskTerminated(task);
+    slave->recoverResources(task);
 
     Framework* framework = getFramework(task->framework_id());
     if (framework != nullptr) {
-      framework->taskTerminated(task);
+      framework->recoverResources(task);
     }
 
     switch (status.state()) {
@@ -8755,7 +8755,7 @@ void Slave::addTask(Task* task)
 }
 
 
-void Slave::taskTerminated(Task* task)
+void Slave::recoverResources(Task* task)
 {
   const TaskID& taskId = task->task_id();
   const FrameworkID& frameworkId = task->framework_id();

http://git-wip-us.apache.org/repos/asf/mesos/blob/484db11a/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 747c1eb..bbf42d7 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -131,11 +131,13 @@ struct Slave
 
   void addTask(Task* task);
 
-  // Notification of task termination, for resource accounting.
+  // Update slave to recover the resources that were previously
+  // being used by `task`.
+  //
   // TODO(bmahler): This is a hack for performance. We need to
   // maintain resource counters because computing task resources
   // functionally for all tasks is expensive, for now.
-  void taskTerminated(Task* task);
+  void recoverResources(Task* task);
 
   void removeTask(Task* task);
 
@@ -2186,11 +2188,13 @@ struct Framework
     }
   }
 
-  // Notification of task termination, for resource accounting.
+  // Update framework to recover the resources that were previously
+  // being used by `task`.
+  //
   // TODO(bmahler): This is a hack for performance. We need to
   // maintain resource counters because computing task resources
   // functionally for all tasks is expensive, for now.
-  void taskTerminated(Task* task)
+  void recoverResources(Task* task)
   {
     CHECK(protobuf::isTerminalState(task->state()));
     CHECK(tasks.contains(task->task_id()))


[25/25] mesos git commit: Displayed unreachable tasks in the webui.

Posted by vi...@apache.org.
Displayed unreachable tasks in the webui.

This commit adds:

* A count of the total number of unreachable tasks
* A list of unreachable tasks to the "home" page
* A list of unreachable tasks to the "framework" page

In all three cases, we only report unreachable tasks that the master
knows about. The master only keeps a limited-size cache of unreachable
tasks for each framework and does not attempt to preserve this
information across master failover, which means there may be unreachable
tasks that are not displayed by the webui.

Review: https://reviews.apache.org/r/55856/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/3a42e4d7
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/3a42e4d7
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/3a42e4d7

Branch: refs/heads/master
Commit: 3a42e4d7fe49a8a389d4514e0aae025cff81f52b
Parents: 95aa7ff
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:05:37 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:27 2017 -0800

----------------------------------------------------------------------
 src/webui/master/static/framework.html    | 22 ++++++++++++++++
 src/webui/master/static/home.html         | 35 ++++++++++++++++++++++++++
 src/webui/master/static/js/controllers.js |  3 +++
 3 files changed, 60 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/3a42e4d7/src/webui/master/static/framework.html
----------------------------------------------------------------------
diff --git a/src/webui/master/static/framework.html b/src/webui/master/static/framework.html
index 6297cf9..37e0b31 100644
--- a/src/webui/master/static/framework.html
+++ b/src/webui/master/static/framework.html
@@ -95,6 +95,28 @@
       </tbody>
     </table>
 
+    <table m-table table-content="framework.unreachable_tasks" title="Unreachable Tasks"
+      class="table table-striped table-bordered table-condensed">
+      <thead>
+        <tr>
+          <th data-key="id">ID</th>
+          <th data-key="name">Name</th>
+          <th data-key="start_time">Started</th>
+          <th data-key="agent_id">Agent ID</th>
+        </tr>
+      </thead>
+      <tbody>
+        <tr ng-repeat="task in $data">
+          <td>{{task.id}}</td>
+          <td>{{task.name}}</td>
+          <td>
+            <m-timestamp value="{{task.start_time}}"></m-timestamp>
+          </td>
+          <td>{{task.slave_id}}</td>
+        </tr>
+      </tbody>
+    </table>
+
     <table m-table table-content="framework.completed_tasks" title="Completed Tasks"
       class="table table-striped table-bordered table-condensed">
       <thead>

http://git-wip-us.apache.org/repos/asf/mesos/blob/3a42e4d7/src/webui/master/static/home.html
----------------------------------------------------------------------
diff --git a/src/webui/master/static/home.html b/src/webui/master/static/home.html
index 295da52..07bc848 100644
--- a/src/webui/master/static/home.html
+++ b/src/webui/master/static/home.html
@@ -81,6 +81,10 @@
             <td class="text-right">{{running_tasks | number}}</td>
           </tr>
           <tr>
+            <td>Unreachable</td>
+            <td class="text-right">{{unreachable_tasks.length | number}}</td>
+          </tr>
+          <tr>
             <td>Killing</td>
             <td class="text-right">{{killing_tasks | number}}</td>
           </tr>
@@ -207,6 +211,37 @@
       </tbody>
     </table>
 
+    <table m-table table-content="unreachable_tasks" title="Unreachable Tasks"
+      class="table table-striped table-bordered table-condensed">
+      <thead>
+        <tr>
+          <th data-key="framework_id">Framework ID</th>
+          <th data-key="id">Task ID</th>
+          <th data-key="name">Task Name</th>
+          <th data-key="start_time" data-sort>Started</th>
+          <th data-key="agent_id">Agent ID</th>
+        </tr>
+      </thead>
+      <tbody>
+        <tr data-ng-if="unreachable_tasks.length === 0">
+          <td colspan="8">No unreachable tasks.</td>
+        </tr>
+        <tr ng-repeat="task in $data">
+          <td>
+            <a href="#/frameworks/{{task.framework_id}}/">
+              {{task.framework_id}}
+            </a>
+          </td>
+          <td>{{task.id}}</td>
+          <td>{{task.name}}</td>
+          <td>
+            <m-timestamp value="{{task.start_time}}"></m-timestamp>
+          </td>
+          <td>{{task.slave_id}}</td>
+        </tr>
+      </tbody>
+    </table>
+
     <table m-table table-content="completed_tasks" title="Completed Tasks"
       class="table table-striped table-bordered table-condensed">
       <thead>

http://git-wip-us.apache.org/repos/asf/mesos/blob/3a42e4d7/src/webui/master/static/js/controllers.js
----------------------------------------------------------------------
diff --git a/src/webui/master/static/js/controllers.js b/src/webui/master/static/js/controllers.js
index 5e90813..78bceae 100644
--- a/src/webui/master/static/js/controllers.js
+++ b/src/webui/master/static/js/controllers.js
@@ -113,6 +113,7 @@
     $scope.offers = {};
     $scope.completed_frameworks = {};
     $scope.active_tasks = [];
+    $scope.unreachable_tasks = [];
     $scope.completed_tasks = [];
     $scope.orphan_tasks = [];
 
@@ -222,9 +223,11 @@
       // TODO(brenden): Remove this once
       // https://issues.apache.org/jira/browse/MESOS-527 is fixed.
       _.each(framework.tasks, setTaskMetadata);
+      _.each(framework.unreachable_tasks, setTaskMetadata);
       _.each(framework.completed_tasks, setTaskMetadata);
 
       $scope.active_tasks = $scope.active_tasks.concat(framework.tasks);
+      $scope.unreachable_tasks = $scope.unreachable_tasks.concat(framework.unreachable_tasks);
       $scope.completed_tasks =
         $scope.completed_tasks.concat(framework.completed_tasks);
     });


[24/25] mesos git commit: Enabled partition-awareness in mesos-execute by default.

Posted by vi...@apache.org.
Enabled partition-awareness in mesos-execute by default.

mesos-execute will now register as a partition-aware framework by
default. This behavior can be disabled via the "--no_partition-aware"
flag.

Review: https://reviews.apache.org/r/55862/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f1d0cdf1
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f1d0cdf1
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f1d0cdf1

Branch: refs/heads/master
Commit: f1d0cdf1db2a28fa44f7aded0c3760636c0a51de
Parents: 3a42e4d
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:05:41 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:27 2017 -0800

----------------------------------------------------------------------
 src/cli/execute.cpp | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f1d0cdf1/src/cli/execute.cpp
----------------------------------------------------------------------
diff --git a/src/cli/execute.cpp b/src/cli/execute.cpp
index 16da356..7f8300e 100644
--- a/src/cli/execute.cpp
+++ b/src/cli/execute.cpp
@@ -241,7 +241,8 @@ public:
     add(&Flags::framework_capabilities,
         "framework_capabilities",
         "Comma-separated list of optional framework capabilities to enable.\n"
-        "The TASK_KILLING_STATE capability is always enabled.");
+        "TASK_KILLING_STATE is always enabled. PARTITION_AWARE is enabled\n"
+        "unless --no-partition-aware is specified.");
 
     add(&Flags::containerizer,
         "containerizer",
@@ -338,6 +339,11 @@ public:
         "The content type to use for scheduler protocol messages. 'json'\n"
         "and 'protobuf' are valid choices.",
         "protobuf");
+
+    add(&Flags::partition_aware,
+        "partition_aware",
+        "Enable partition-awareness for the framework.",
+        true);
   }
 
   string master;
@@ -366,6 +372,7 @@ public:
   Option<string> principal;
   Option<string> secret;
   string content_type;
+  bool partition_aware;
 };
 
 
@@ -1008,10 +1015,16 @@ int main(int argc, char** argv)
     return EXIT_FAILURE;
   }
 
-  // We set the TASK_KILLING_STATE capability by default.
+  // Always enable the TASK_KILLING_STATE capability.
   vector<FrameworkInfo::Capability::Type> frameworkCapabilities =
     { FrameworkInfo::Capability::TASK_KILLING_STATE };
 
+  // Enable PARTITION_AWARE unless disabled by the user.
+  if (flags.partition_aware) {
+    frameworkCapabilities.push_back(
+        FrameworkInfo::Capability::PARTITION_AWARE);
+  }
+
   if (flags.framework_capabilities.isSome()) {
     foreach (const string& capability, flags.framework_capabilities.get()) {
       FrameworkInfo::Capability::Type type;


[15/25] mesos git commit: Fixed comments, usage text in mesos-execute.

Posted by vi...@apache.org.
Fixed comments, usage text in mesos-execute.

Review: https://reviews.apache.org/r/55801/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f361b083
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f361b083
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f361b083

Branch: refs/heads/master
Commit: f361b083c25219530b31ab2198f72f3900544fef
Parents: b12c762
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:50 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:26 2017 -0800

----------------------------------------------------------------------
 src/cli/execute.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f361b083/src/cli/execute.cpp
----------------------------------------------------------------------
diff --git a/src/cli/execute.cpp b/src/cli/execute.cpp
index 34c5d6e..aee3d80 100644
--- a/src/cli/execute.cpp
+++ b/src/cli/execute.cpp
@@ -240,8 +240,8 @@ public:
 
     add(&Flags::framework_capabilities,
         "framework_capabilities",
-        "Comma separated list of optional framework capabilities to enable.\n"
-        "(the only valid value is currently 'GPU_RESOURCES')");
+        "Comma-separated list of optional framework capabilities to enable.\n"
+        "The TASK_KILLING_STATE capability is always enabled.");
 
     add(&Flags::containerizer,
         "containerizer",
@@ -937,8 +937,8 @@ int main(int argc, char** argv)
     environment = flags.environment.get();
   }
 
-  // Copy the package to HDFS if requested save it's location as a URI
-  // for passing to the command (in CommandInfo).
+  // Copy the package to HDFS, if requested. Save its location
+  // as a URI for passing to the command (in CommandInfo).
   Option<string> uri = None();
 
   if (flags.package.isSome()) {
@@ -1018,14 +1018,14 @@ int main(int argc, char** argv)
 
       if (!FrameworkInfo::Capability::Type_Parse(capability, &type)) {
         cerr << "Flags '--framework_capabilities'"
-                " specifes an unknown capability"
+                " specifies an unknown capability"
                 " '" << capability << "'" << endl;
         return EXIT_FAILURE;
       }
 
       if (type != FrameworkInfo::Capability::GPU_RESOURCES) {
         cerr << "Flags '--framework_capabilities'"
-                " specifes an unsupported capability"
+                " specifies an unsupported capability"
                 " '" << capability << "'" << endl;
         return EXIT_FAILURE;
       }


[11/25] mesos git commit: Replaced `Master::Framework::active` with a new `state` enum value.

Posted by vi...@apache.org.
Replaced `Master::Framework::active` with a new `state` enum value.

That is, the master previously tracked two separate things about a
framework: its "state" (CONNECTED, DISCONNECTED, or RECOVERED), and
whether the framework is considered active. It is simpler to represent
the latter value as just another state: a framework can now be ACTIVE,
INACTIVE, DISCONNECTED, or RECOVERED. A framework is "connected" if it
is either ACTIVE or INACTIVE. This rules out a few combinations that
never made sense, such as "state = DISCONNECTED and active = TRUE".

Review: https://reviews.apache.org/r/54408/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e65fa035
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e65fa035
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e65fa035

Branch: refs/heads/master
Commit: e65fa0359fcd7be871a5a38dfbe908a8e98e07ab
Parents: 8234dc5
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:14 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:26 2017 -0800

----------------------------------------------------------------------
 src/master/http.cpp          |  4 +--
 src/master/master.cpp        | 61 ++++++++++++++++++++++-----------------
 src/master/master.hpp        | 38 ++++++++++--------------
 src/master/quota_handler.cpp |  2 +-
 4 files changed, 53 insertions(+), 52 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e65fa035/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index 3dc83dd..95cb6c6 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -406,7 +406,7 @@ static void json(JSON::ObjectWriter* writer, const Summary<Framework>& summary)
   writer->field("capabilities", framework.info.capabilities());
   writer->field("hostname", framework.info.hostname());
   writer->field("webui_url", framework.info.webui_url());
-  writer->field("active", framework.active);
+  writer->field("active", framework.active());
   writer->field("connected", framework.connected());
   writer->field("recovered", framework.recovered());
 }
@@ -1412,7 +1412,7 @@ mesos::master::Response::GetFrameworks::Framework model(
 
   _framework.mutable_framework_info()->CopyFrom(framework.info);
 
-  _framework.set_active(framework.active);
+  _framework.set_active(framework.active());
   _framework.set_connected(framework.connected());
   _framework.set_recovered(framework.recovered());
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/e65fa035/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index deb1367..30e401c 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1378,7 +1378,9 @@ void Master::_exited(Framework* framework)
   LOG(INFO) << "Framework " << *framework << " disconnected";
 
   // Disconnect the framework.
-  disconnect(framework);
+  if (framework->connected()) {
+    disconnect(framework);
+  }
 
   // We can assume framework's failover_timeout is valid
   // because it has been validated in framework subscription.
@@ -2904,16 +2906,15 @@ void Master::_subscribe(
         removeInverseOffer(inverseOffer, true); // Rescind.
       }
 
-      // Relink to the framework and mark it connected. Relinking
-      // might be necessary if the framework link previously broke.
+      // Relink to the framework. This might be necessary if the
+      // framework link previously broke.
       link(framework->pid.get());
-      framework->state = Framework::State::CONNECTED;
 
       // Reactivate the framework.
       // NOTE: We do this after recovering resources (above) so that
       // the allocator has the correct view of the framework's share.
-      if (!framework->active) {
-        framework->active = true;
+      if (!framework->active()) {
+        framework->state = Framework::State::ACTIVE;
         allocator->activateFramework(framework->id());
       }
 
@@ -2981,13 +2982,27 @@ void Master::deactivateFramework(
     return;
   }
 
-  deactivate(framework, true);
+  if (!framework->connected()) {
+    LOG(INFO)
+      << "Ignoring deactivate framework message for framework" << *framework
+      << " because it is disconnected";
+    return;
+  }
+
+  if (framework->active()) {
+    deactivate(framework, true);
+  }
 }
 
 
 void Master::disconnect(Framework* framework)
 {
   CHECK_NOTNULL(framework);
+  CHECK(framework->connected());
+
+  if (framework->active()) {
+    deactivate(framework, true);
+  }
 
   LOG(INFO) << "Disconnecting framework " << *framework;
 
@@ -3004,19 +3019,17 @@ void Master::disconnect(Framework* framework)
     // been closed due to scheduler disconnection.
     framework->http.get().close();
   }
-
-  deactivate(framework, true);
 }
 
 
 void Master::deactivate(Framework* framework, bool rescind)
 {
   CHECK_NOTNULL(framework);
+  CHECK(framework->active());
 
   LOG(INFO) << "Deactivating framework " << *framework;
 
-  // Stop sending offers here for now.
-  framework->active = false;
+  framework->state = Framework::State::INACTIVE;
 
   // Tell the allocator to stop allocating resources to this framework.
   allocator->deactivateFramework(framework->id());
@@ -6630,7 +6643,7 @@ void Master::offer(const FrameworkID& frameworkId,
                    const hashmap<SlaveID, Resources>& resources)
 {
   if (!frameworks.registered.contains(frameworkId) ||
-      !frameworks.registered[frameworkId]->active) {
+      !frameworks.registered[frameworkId]->active()) {
     LOG(WARNING) << "Master returning resources offered to framework "
                  << frameworkId << " because the framework"
                  << " has terminated or is inactive";
@@ -6778,7 +6791,7 @@ void Master::inverseOffer(
     const hashmap<SlaveID, UnavailableResources>& resources)
 {
   if (!frameworks.registered.contains(frameworkId) ||
-      !frameworks.registered[frameworkId]->active) {
+      !frameworks.registered[frameworkId]->active()) {
     LOG(INFO) << "Master ignoring inverse offers to framework " << frameworkId
               << " because the framework has terminated or is inactive";
     return;
@@ -7205,7 +7218,7 @@ void Master::addFramework(Framework* framework)
       framework->id(),
       framework->info,
       framework->usedResources,
-      framework->active);
+      framework->active());
 
   // Export framework metrics if a principal is specified in `FrameworkInfo`.
 
@@ -7268,7 +7281,6 @@ void Master::activateRecoveredFramework(
 
   CHECK_NOTNULL(framework);
   CHECK(framework->recovered());
-  CHECK(!framework->active);
   CHECK(framework->offers.empty());
   CHECK(framework->inverseOffers.empty());
   CHECK(framework->pid.isNone());
@@ -7289,8 +7301,6 @@ void Master::activateRecoveredFramework(
   framework->reregisteredTime = Clock::now();
 
   // Update the framework's connection state.
-  framework->state = Framework::State::CONNECTED;
-
   if (pid.isSome()) {
     framework->updateConnection(pid.get());
     link(pid.get());
@@ -7301,7 +7311,7 @@ void Master::activateRecoveredFramework(
   }
 
   // Activate the framework.
-  framework->active = true;
+  framework->state = Framework::State::ACTIVE;
   allocator->activateFramework(framework->id());
 
   // Export framework metrics if a principal is specified in `FrameworkInfo`.
@@ -7451,14 +7461,13 @@ void Master::_failoverFramework(Framework* framework)
     removeInverseOffer(inverseOffer);
   }
 
-  // Reconnect and reactivate the framework.
-  framework->state = Framework::State::CONNECTED;
+  CHECK(!framework->recovered());
 
-  // Reactivate the framework.
+  // Reactivate the framework, if needed.
   // NOTE: We do this after recovering resources (above) so that
   // the allocator has the correct view of the framework's share.
-  if (!framework->active) {
-    framework->active = true;
+  if (!framework->active()) {
+    framework->state = Framework::State::ACTIVE;
     allocator->activateFramework(framework->id());
   }
 
@@ -7489,7 +7498,7 @@ void Master::removeFramework(Framework* framework)
 
   LOG(INFO) << "Removing framework " << *framework;
 
-  if (framework->active) {
+  if (framework->active()) {
     // Deactivate framework, but don't bother rescinding offers
     // because the framework is being removed.
     deactivate(framework, false);
@@ -8481,7 +8490,7 @@ double Master::_frameworks_active()
 {
   double count = 0.0;
   foreachvalue (Framework* framework, frameworks.registered) {
-    if (framework->active) {
+    if (framework->active()) {
       count++;
     }
   }
@@ -8493,7 +8502,7 @@ double Master::_frameworks_inactive()
 {
   double count = 0.0;
   foreachvalue (Framework* framework, frameworks.registered) {
-    if (!framework->active) {
+    if (!framework->active()) {
       count++;
     }
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/e65fa035/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 3b0d898..17d7f03 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -2116,19 +2116,22 @@ struct Framework
 {
   enum State
   {
-    // Framework is currently connected. Note that frameworks must
-    // also be `active` to be eligible to receive offers.
-    CONNECTED,
+    // Framework has never connected to this master. This implies the
+    // master failed over and the framework has not yet re-registered,
+    // but some framework state has been recovered from re-registering
+    // agents that are running tasks for the framework.
+    RECOVERED,
 
     // Framework was previously connected to this master. A framework
     // becomes disconnected when there is a socket error.
     DISCONNECTED,
 
-    // Framework has never connected to this master. This implies the
-    // master failed over and the framework has not yet re-registered,
-    // but some framework state has been recovered from re-registering
-    // agents that are running tasks for the framework.
-    RECOVERED
+    // The framework is connected but not active.
+    INACTIVE,
+
+    // Framework is connected and eligible to receive offers. No
+    // offers will be made to frameworks that are not active.
+    ACTIVE
   };
 
   Framework(Master* const _master,
@@ -2140,8 +2143,7 @@ struct Framework
       info(_info),
       capabilities(_info.capabilities()),
       pid(_pid),
-      state(CONNECTED),
-      active(true),
+      state(ACTIVE),
       registeredTime(time),
       reregisteredTime(time),
       completedTasks(masterFlags.max_completed_tasks_per_framework),
@@ -2156,8 +2158,7 @@ struct Framework
       info(_info),
       capabilities(_info.capabilities()),
       http(_http),
-      state(CONNECTED),
-      active(true),
+      state(ACTIVE),
       registeredTime(time),
       reregisteredTime(time),
       completedTasks(masterFlags.max_completed_tasks_per_framework),
@@ -2170,7 +2171,6 @@ struct Framework
       info(_info),
       capabilities(_info.capabilities()),
       state(RECOVERED),
-      active(false),
       completedTasks(masterFlags.max_completed_tasks_per_framework),
       unreachableTasks(masterFlags.max_unreachable_tasks_per_framework) {}
 
@@ -2497,8 +2497,8 @@ struct Framework
     process::spawn(heartbeater.get().get());
   }
 
-  bool connected() const { return state == CONNECTED; }
-
+  bool active() const    { return state == ACTIVE; }
+  bool connected() const { return state == ACTIVE || state == INACTIVE; }
   bool recovered() const { return state == RECOVERED; }
 
   Master* const master;
@@ -2516,14 +2516,6 @@ struct Framework
 
   State state;
 
-  // Framework becomes deactivated when it is disconnected or
-  // the master receives a DeactivateFrameworkMessage.
-  // No offers will be made to a deactivated framework.
-  //
-  // TODO(neilc): Consider replacing this with an additional
-  // `state` enumeration value (MESOS-6719).
-  bool active;
-
   process::Time registeredTime;
   process::Time reregisteredTime;
   process::Time unregisteredTime;

http://git-wip-us.apache.org/repos/asf/mesos/blob/e65fa035/src/master/quota_handler.cpp
----------------------------------------------------------------------
diff --git a/src/master/quota_handler.cpp b/src/master/quota_handler.cpp
index 6e6e737..f4a27ea 100644
--- a/src/master/quota_handler.cpp
+++ b/src/master/quota_handler.cpp
@@ -134,7 +134,7 @@ void Master::QuotaHandler::rescindOffers(const QuotaInfo& request) const
   if (master->activeRoles.contains(role)) {
     Role* roleState = master->activeRoles[role];
     foreachvalue (const Framework* framework, roleState->frameworks) {
-      if (framework->connected() && framework->active) {
+      if (framework->active()) {
         ++frameworksInRole;
       }
     }


[20/25] mesos git commit: Updated the list of terminal states in the webui.

Posted by vi...@apache.org.
Updated the list of terminal states in the webui.

Review: https://reviews.apache.org/r/55807/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e8ff54fd
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e8ff54fd
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e8ff54fd

Branch: refs/heads/master
Commit: e8ff54fd1375b4af4063fefcf43658c8dface851
Parents: c66f57a
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:05:23 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:27 2017 -0800

----------------------------------------------------------------------
 src/webui/master/static/js/controllers.js | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e8ff54fd/src/webui/master/static/js/controllers.js
----------------------------------------------------------------------
diff --git a/src/webui/master/static/js/controllers.js b/src/webui/master/static/js/controllers.js
index d9fa7a1..5e90813 100644
--- a/src/webui/master/static/js/controllers.js
+++ b/src/webui/master/static/js/controllers.js
@@ -165,7 +165,10 @@
           'TASK_FAILED',
           'TASK_FINISHED',
           'TASK_KILLED',
-          'TASK_LOST'
+          'TASK_LOST',
+          'TASK_DROPPED',
+          'TASK_GONE',
+          'TASK_GONE_BY_OPERATOR'
       ];
       return terminalStates.indexOf(taskState) > -1;
     };


[04/25] mesos git commit: Marked a member function `const`.

Posted by vi...@apache.org.
Marked a member function `const`.

Review: https://reviews.apache.org/r/55473/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/33e2b21f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/33e2b21f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/33e2b21f

Branch: refs/heads/master
Commit: 33e2b21f6d5faf8b27c575cef382df32f22c6b33
Parents: bf7fb1e
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:03:26 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:03:26 2017 -0800

----------------------------------------------------------------------
 src/master/master.cpp | 6 +++---
 src/master/master.hpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/33e2b21f/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 3e98c0b..decd3b0 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8722,10 +8722,10 @@ Slave::~Slave()
 }
 
 
-Task* Slave::getTask(const FrameworkID& frameworkId, const TaskID& taskId)
+Task* Slave::getTask(const FrameworkID& frameworkId, const TaskID& taskId) const
 {
-  if (tasks.contains(frameworkId) && tasks[frameworkId].contains(taskId)) {
-    return tasks[frameworkId][taskId];
+  if (tasks.contains(frameworkId) && tasks.at(frameworkId).contains(taskId)) {
+    return tasks.at(frameworkId).at(taskId);
   }
   return nullptr;
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/33e2b21f/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 478c2ab..747c1eb 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -127,7 +127,7 @@ struct Slave
 
   Task* getTask(
       const FrameworkID& frameworkId,
-      const TaskID& taskId);
+      const TaskID& taskId) const;
 
   void addTask(Task* task);
 


[23/25] mesos git commit: Updated webui to display number of unreachable agents.

Posted by vi...@apache.org.
Updated webui to display number of unreachable agents.

Review: https://reviews.apache.org/r/55805/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c66f57a9
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c66f57a9
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c66f57a9

Branch: refs/heads/master
Commit: c66f57a901ea480294830c3f56f00af10443605d
Parents: a16c557
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:05:17 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:27 2017 -0800

----------------------------------------------------------------------
 src/webui/master/static/home.html         | 4 ++++
 src/webui/master/static/js/controllers.js | 1 +
 2 files changed, 5 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c66f57a9/src/webui/master/static/home.html
----------------------------------------------------------------------
diff --git a/src/webui/master/static/home.html b/src/webui/master/static/home.html
index 23b09b6..295da52 100644
--- a/src/webui/master/static/home.html
+++ b/src/webui/master/static/home.html
@@ -58,6 +58,10 @@
             <td>Deactivated</td>
             <td class="text-right">{{deactivated_agents | number}}</td>
           </tr>
+          <tr>
+            <td>Unreachable</td>
+            <td class="text-right">{{unreachable_agents | number}}</td>
+          </tr>
         </tbody>
       </table>
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c66f57a9/src/webui/master/static/js/controllers.js
----------------------------------------------------------------------
diff --git a/src/webui/master/static/js/controllers.js b/src/webui/master/static/js/controllers.js
index b6364fa..d9fa7a1 100644
--- a/src/webui/master/static/js/controllers.js
+++ b/src/webui/master/static/js/controllers.js
@@ -133,6 +133,7 @@
 
     $scope.activated_agents = $scope.state.activated_slaves;
     $scope.deactivated_agents = $scope.state.deactivated_slaves;
+    $scope.unreachable_agents = $scope.state.unreachable_slaves;
 
     _.each($scope.state.slaves, function(agent) {
       $scope.agents[agent.id] = agent;


[06/25] mesos git commit: Added `Master::isRemovable(const TaskState&)`.

Posted by vi...@apache.org.
Added `Master::isRemovable(const TaskState&)`.

This determines whether a task in the given state can safely be
discarded from the master's in-memory state. When a task becomes
removable, we move the task from the master's main task data structures
to a fixed-size cache (either the "unreachable" or "completed" task
list, as appropriate).

Review: https://reviews.apache.org/r/55475/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d4acb10d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d4acb10d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d4acb10d

Branch: refs/heads/master
Commit: d4acb10d74638b24617a5007425161fad4a6d47d
Parents: 484db11
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:03:42 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:03:42 2017 -0800

----------------------------------------------------------------------
 src/master/master.cpp | 28 +++++++++++++---------------
 src/master/master.hpp | 22 +++++++++++++++++++---
 2 files changed, 32 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/d4acb10d/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index be2db4d..062b210 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -7969,12 +7969,11 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
   // task transitioned to a new state.
   bool sendSubscribersUpdate = false;
 
-  // Set 'terminated' to true if this is the first time the task
-  // transitioned to terminal state. Also set the latest state.
-  bool terminated;
+  // Set 'removable' to true if this is the first time the task
+  // transitioned to a removable state. Also set the latest state.
+  bool removable;
   if (latestState.isSome()) {
-    terminated = !protobuf::isTerminalState(task->state()) &&
-                 protobuf::isTerminalState(latestState.get());
+    removable = !isRemovable(task->state()) && isRemovable(latestState.get());
 
     // If the task has already transitioned to a terminal state,
     // do not update its state.
@@ -7986,8 +7985,7 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
       task->set_state(latestState.get());
     }
   } else {
-    terminated = !protobuf::isTerminalState(task->state()) &&
-                 protobuf::isTerminalState(status.state());
+    removable = !isRemovable(task->state()) && isRemovable(status.state());
 
     // If the task has already transitioned to a terminal state, do not update
     // its state. Note that we are being defensive here because this should not
@@ -8026,8 +8024,8 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
             << " (latest state: " << task->state()
             << ", status update state: " << status.state() << ")";
 
-  // Once the task becomes terminal, we recover the resources.
-  if (terminated) {
+  // Once the task becomes removable, recover the resources.
+  if (removable) {
     allocator->recoverResources(
         task->framework_id(),
         task->slave_id(),
@@ -8102,14 +8100,14 @@ void Master::removeTask(Task* task)
   Slave* slave = slaves.registered.get(task->slave_id());
   CHECK_NOTNULL(slave);
 
-  if (!protobuf::isTerminalState(task->state())) {
+  if (!isRemovable(task->state())) {
     LOG(WARNING) << "Removing task " << task->task_id()
                  << " with resources " << task->resources()
                  << " of framework " << task->framework_id()
                  << " on agent " << *slave
-                 << " in non-terminal state " << task->state();
+                 << " in non-removable state " << task->state();
 
-    // If the task is not terminal, then the resources have
+    // If the task is not removable, then the resources have
     // not yet been recovered.
     allocator->recoverResources(
         task->framework_id(),
@@ -8741,7 +8739,7 @@ void Slave::addTask(Task* task)
 
   tasks[frameworkId][taskId] = task;
 
-  if (!protobuf::isTerminalState(task->state())) {
+  if (!Master::isRemovable(task->state())) {
     usedResources[frameworkId] += task->resources();
   }
 
@@ -8760,7 +8758,7 @@ void Slave::recoverResources(Task* task)
   const TaskID& taskId = task->task_id();
   const FrameworkID& frameworkId = task->framework_id();
 
-  CHECK(protobuf::isTerminalState(task->state()));
+  CHECK(Master::isRemovable(task->state()));
   CHECK(tasks.at(frameworkId).contains(taskId))
     << "Unknown task " << taskId << " of framework " << frameworkId;
 
@@ -8779,7 +8777,7 @@ void Slave::removeTask(Task* task)
   CHECK(tasks.at(frameworkId).contains(taskId))
     << "Unknown task " << taskId << " of framework " << frameworkId;
 
-  if (!protobuf::isTerminalState(task->state())) {
+  if (!Master::isRemovable(task->state())) {
     usedResources[frameworkId] -= task->resources();
     if (usedResources[frameworkId].empty()) {
       usedResources.erase(frameworkId);

http://git-wip-us.apache.org/repos/asf/mesos/blob/d4acb10d/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index bbf42d7..0b82025 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -907,6 +907,22 @@ private:
   bool isWhitelistedRole(const std::string& name);
 
   /**
+   * Indicates whether a task in the given state can safely be removed
+   * from the master's in-memory state. When a task becomes removable,
+   * it is erased from the master's primary task data structures; a
+   * limited number of such tasks are kept as a cache (see
+   * `framework.unreachableTasks` and `framework.completedTasks`).
+   */
+  static bool isRemovable(const TaskState& state)
+  {
+    if (state == TASK_UNREACHABLE) {
+      return true;
+    }
+
+    return protobuf::isTerminalState(state);
+  }
+
+  /**
    * Inner class used to namespace the handling of quota requests.
    *
    * It operates inside the Master actor. It is responsible for validating
@@ -2182,7 +2198,7 @@ struct Framework
 
     tasks[task->task_id()] = task;
 
-    if (!protobuf::isTerminalState(task->state())) {
+    if (!Master::isRemovable(task->state())) {
       totalUsedResources += task->resources();
       usedResources[task->slave_id()] += task->resources();
     }
@@ -2196,7 +2212,7 @@ struct Framework
   // functionally for all tasks is expensive, for now.
   void recoverResources(Task* task)
   {
-    CHECK(protobuf::isTerminalState(task->state()));
+    CHECK(Master::isRemovable(task->state()));
     CHECK(tasks.contains(task->task_id()))
       << "Unknown task " << task->task_id()
       << " of framework " << task->framework_id();
@@ -2253,7 +2269,7 @@ struct Framework
       << "Unknown task " << task->task_id()
       << " of framework " << task->framework_id();
 
-    if (!protobuf::isTerminalState(task->state())) {
+    if (!Master::isRemovable(task->state())) {
       totalUsedResources -= task->resources();
       usedResources[task->slave_id()] -= task->resources();
       if (usedResources[task->slave_id()].empty()) {


[16/25] mesos git commit: Cleaned up test case code slightly.

Posted by vi...@apache.org.
Cleaned up test case code slightly.

Review: https://reviews.apache.org/r/55800/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/b12c762b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/b12c762b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/b12c762b

Branch: refs/heads/master
Commit: b12c762bc17d5b82e05f97f1bff822763fde393b
Parents: 4c77574
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:43 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:26 2017 -0800

----------------------------------------------------------------------
 src/tests/api_tests.cpp       |  4 ++--
 src/tests/partition_tests.cpp |  7 -------
 src/tests/registrar_tests.cpp | 24 ++++++++++++------------
 3 files changed, 14 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/b12c762b/src/tests/api_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/api_tests.cpp b/src/tests/api_tests.cpp
index 8c0ed23..400ac6f 100644
--- a/src/tests/api_tests.cpp
+++ b/src/tests/api_tests.cpp
@@ -948,7 +948,7 @@ TEST_P(MasterAPITest, ReserveResources)
   ASSERT_SOME(slave);
 
   FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
-  frameworkInfo .set_role("role");
+  frameworkInfo.set_role("role");
 
   Resources unreserved = Resources::parse("cpus:1;mem:512").get();
   Resources dynamicallyReserved = unreserved.flatten(
@@ -1038,7 +1038,7 @@ TEST_P(MasterAPITest, UnreserveResources)
   ASSERT_SOME(slave);
 
   FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
-  frameworkInfo .set_role("role");
+  frameworkInfo.set_role("role");
 
   Resources unreserved = Resources::parse("cpus:1;mem:512").get();
   Resources dynamicallyReserved = unreserved.flatten(

http://git-wip-us.apache.org/repos/asf/mesos/blob/b12c762b/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index 6961ff6..9f7e4d9 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -184,7 +184,6 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
   DROP_PROTOBUFS(PongSlaveMessage(), _, _);
 
   StandaloneMasterDetector detector(master.get()->pid);
-
   slave::Flags agentFlags = CreateSlaveFlags();
   Try<Owned<cluster::Slave>> slave = StartSlave(&detector, agentFlags);
   ASSERT_SOME(slave);
@@ -529,7 +528,6 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlaveNotPartitionAware)
   DROP_PROTOBUFS(PongSlaveMessage(), _, _);
 
   StandaloneMasterDetector detector(master.get()->pid);
-
   slave::Flags agentFlags = CreateSlaveFlags();
   Try<Owned<cluster::Slave>> slave = StartSlave(&detector, agentFlags);
   ASSERT_SOME(slave);
@@ -816,7 +814,6 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(
   slaveFlags.resources = "cpus:2;mem:1024";
 
   StandaloneMasterDetector detector(master.get()->pid);
-
   Try<Owned<cluster::Slave>> slave = StartSlave(&detector, slaveFlags);
   ASSERT_SOME(slave);
 
@@ -1067,7 +1064,6 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, PartitionedSlaveOrphanedTask)
   DROP_PROTOBUFS(PongSlaveMessage(), _, _);
 
   StandaloneMasterDetector detector(master.get()->pid);
-
   slave::Flags agentFlags = CreateSlaveFlags();
   Try<Owned<cluster::Slave>> slave = StartSlave(&detector, agentFlags);
   ASSERT_SOME(slave);
@@ -1503,7 +1499,6 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, SpuriousSlaveReregistration)
   ASSERT_SOME(master);
 
   StandaloneMasterDetector detector(master.get()->pid);
-
   slave::Flags agentFlags = CreateSlaveFlags();
   Try<Owned<cluster::Slave>> slave = StartSlave(&detector, agentFlags);
   ASSERT_SOME(slave);
@@ -2999,7 +2994,6 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, RegistryGcRace)
     FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
 
   Owned<MasterDetector> detector1 = master.get()->createDetector();
-
   slave::Flags agentFlags1 = CreateSlaveFlags();
   Try<Owned<cluster::Slave>> slave1 = StartSlave(detector1.get(), agentFlags1);
   ASSERT_SOME(slave1);
@@ -3059,7 +3053,6 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, RegistryGcRace)
     FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
 
   StandaloneMasterDetector detector2(master.get()->pid);
-
   slave::Flags agentFlags2 = CreateSlaveFlags();
   Try<Owned<cluster::Slave>> slave2 = StartSlave(&detector2, agentFlags2);
   ASSERT_SOME(slave2);

http://git-wip-us.apache.org/repos/asf/mesos/blob/b12c762b/src/tests/registrar_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/registrar_tests.cpp b/src/tests/registrar_tests.cpp
index 73cc25c..fb693ea 100644
--- a/src/tests/registrar_tests.cpp
+++ b/src/tests/registrar_tests.cpp
@@ -269,11 +269,11 @@ TEST_F(RegistrarTest, MarkReachable)
   Registrar registrar(flags, state);
   AWAIT_READY(registrar.recover(master));
 
-  SlaveInfo info1;
-  info1.set_hostname("localhost");
-
   SlaveID id1;
   id1.set_value("1");
+
+  SlaveInfo info1;
+  info1.set_hostname("localhost");
   info1.mutable_id()->CopyFrom(id1);
 
   SlaveID id2;
@@ -301,11 +301,11 @@ TEST_F(RegistrarTest, MarkUnreachable)
   Registrar registrar(flags, state);
   AWAIT_READY(registrar.recover(master));
 
-  SlaveInfo info1;
-  info1.set_hostname("localhost");
-
   SlaveID id1;
   id1.set_value("1");
+
+  SlaveInfo info1;
+  info1.set_hostname("localhost");
   info1.mutable_id()->CopyFrom(id1);
 
   SlaveID id2;
@@ -344,11 +344,11 @@ TEST_F(RegistrarTest, PruneUnreachable)
   Registrar registrar(flags, state);
   AWAIT_READY(registrar.recover(master));
 
-  SlaveInfo info1;
-  info1.set_hostname("localhost");
-
   SlaveID id1;
   id1.set_value("1");
+
+  SlaveInfo info1;
+  info1.set_hostname("localhost");
   info1.mutable_id()->CopyFrom(id1);
 
   SlaveID id2;
@@ -390,11 +390,11 @@ TEST_F(RegistrarTest, Remove)
   Registrar registrar(flags, state);
   AWAIT_READY(registrar.recover(master));
 
-  SlaveInfo info1;
-  info1.set_hostname("localhost");
-
   SlaveID id1;
   id1.set_value("1");
+
+  SlaveInfo info1;
+  info1.set_hostname("localhost");
   info1.mutable_id()->CopyFrom(id1);
 
   SlaveID id2;


[17/25] mesos git commit: Refactored Master::removeFramework to use Master::deactivate.

Posted by vi...@apache.org.
Refactored Master::removeFramework to use Master::deactivate.

Review: https://reviews.apache.org/r/54407/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/8234dc5f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/8234dc5f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/8234dc5f

Branch: refs/heads/master
Commit: 8234dc5f428d73783c291de973689ba3d9a1a5bd
Parents: b5711fd
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:06 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:26 2017 -0800

----------------------------------------------------------------------
 src/master/master.cpp | 56 +++++++++++++++++-----------------------------
 src/master/master.hpp |  2 +-
 2 files changed, 22 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/8234dc5f/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index c28ca25..deb1367 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -2981,7 +2981,7 @@ void Master::deactivateFramework(
     return;
   }
 
-  deactivate(framework);
+  deactivate(framework, true);
 }
 
 
@@ -3005,11 +3005,11 @@ void Master::disconnect(Framework* framework)
     framework->http.get().close();
   }
 
-  deactivate(framework);
+  deactivate(framework, true);
 }
 
 
-void Master::deactivate(Framework* framework)
+void Master::deactivate(Framework* framework, bool rescind)
 {
   CHECK_NOTNULL(framework);
 
@@ -3024,9 +3024,12 @@ void Master::deactivate(Framework* framework)
   // Remove the framework's offers.
   foreach (Offer* offer, utils::copy(framework->offers)) {
     allocator->recoverResources(
-        offer->framework_id(), offer->slave_id(), offer->resources(), None());
+        offer->framework_id(),
+        offer->slave_id(),
+        offer->resources(),
+        None());
 
-    removeOffer(offer, true); // Rescind.
+    removeOffer(offer, rescind);
   }
 
   // Remove the framework's inverse offers.
@@ -3039,7 +3042,7 @@ void Master::deactivate(Framework* framework)
             inverseOffer->unavailability()},
         None());
 
-    removeInverseOffer(inverseOffer, true); // Rescind.
+    removeInverseOffer(inverseOffer, rescind);
   }
 }
 
@@ -3076,7 +3079,10 @@ void Master::deactivate(Slave* slave)
   // Remove and rescind offers.
   foreach (Offer* offer, utils::copy(slave->offers)) {
     allocator->recoverResources(
-        offer->framework_id(), slave->id, offer->resources(), None());
+        offer->framework_id(),
+        slave->id,
+        offer->resources(),
+        None());
 
     removeOffer(offer, true); // Rescind!
   }
@@ -7484,12 +7490,16 @@ void Master::removeFramework(Framework* framework)
   LOG(INFO) << "Removing framework " << *framework;
 
   if (framework->active) {
-    // Tell the allocator to stop allocating resources to this framework.
-    // TODO(vinod): Consider setting  framework->active to false here
-    // or just calling 'deactivate(Framework*)'.
-    allocator->deactivateFramework(framework->id());
+    // Deactivate framework, but don't bother rescinding offers
+    // because the framework is being removed.
+    deactivate(framework, false);
   }
 
+  // The framework's offers should have been removed when the
+  // framework was deactivated.
+  CHECK(framework->offers.empty());
+  CHECK(framework->inverseOffers.empty());
+
   foreachvalue (Slave* slave, slaves.registered) {
     // Remove the pending tasks from the slave.
     slave->pendingTasks.erase(framework->id());
@@ -7580,30 +7590,6 @@ void Master::removeFramework(Framework* framework)
     framework->unreachableTasks.erase(taskId);
   }
 
-  // Remove the framework's offers (if they weren't removed before).
-  foreach (Offer* offer, utils::copy(framework->offers)) {
-    allocator->recoverResources(
-        offer->framework_id(),
-        offer->slave_id(),
-        offer->resources(),
-        None());
-
-    removeOffer(offer);
-  }
-
-  // Also remove the inverse offers.
-  foreach (InverseOffer* inverseOffer, utils::copy(framework->inverseOffers)) {
-    allocator->updateInverseOffer(
-        inverseOffer->slave_id(),
-        inverseOffer->framework_id(),
-        UnavailableResources{
-            inverseOffer->resources(),
-            inverseOffer->unavailability()},
-        None());
-
-    removeInverseOffer(inverseOffer);
-  }
-
   // Remove the framework's executors for correct resource accounting.
   foreachkey (const SlaveID& slaveId, utils::copy(framework->executors)) {
     Slave* slave = slaves.registered.get(slaveId);

http://git-wip-us.apache.org/repos/asf/mesos/blob/8234dc5f/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 0b82025..3b0d898 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -586,7 +586,7 @@ protected:
   void removeFramework(Slave* slave, Framework* framework);
 
   void disconnect(Framework* framework);
-  void deactivate(Framework* framework);
+  void deactivate(Framework* framework, bool rescind);
 
   void disconnect(Slave* slave);
   void deactivate(Slave* slave);


[21/25] mesos git commit: Changed "master/tasks_unreachable" metric from counter to gauge.

Posted by vi...@apache.org.
Changed "master/tasks_unreachable" metric from counter to gauge.

The number of unreachable tasks should be a gauge, because it can
increase and decrease over time. Note that we report the number of
unreachable tasks that the master knows about; this is a subset of the
"true" number of unreachable tasks, because the master only keeps a
limited-size cache of unreachable tasks for each framework.

Review: https://reviews.apache.org/r/55808/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/95aa7ff8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/95aa7ff8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/95aa7ff8

Branch: refs/heads/master
Commit: 95aa7ff83a6e511b0b5f255dc1887ce71550dbde
Parents: e8ff54f
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:05:28 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:27 2017 -0800

----------------------------------------------------------------------
 docs/monitoring.md            | 21 +++++++++++-
 src/master/master.cpp         | 16 ++++++++--
 src/master/master.hpp         |  1 +
 src/master/metrics.cpp        |  5 +--
 src/master/metrics.hpp        |  2 +-
 src/tests/partition_tests.cpp | 65 +++++++++++++++++++++++++++++---------
 6 files changed, 88 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/95aa7ff8/docs/monitoring.md
----------------------------------------------------------------------
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 59ee364..c42afce 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -14,12 +14,17 @@ you should monitor to detect abnormal situations in your cluster.
 ## Overview
 
 Mesos master and agent nodes report a set of statistics and metrics that enable
-you to  monitor resource usage and detect abnormal situations early. The
+cluster operators to monitor resource usage and detect abnormal situations early. The
 information reported by Mesos includes details about available resources, used
 resources, registered frameworks, active agents, and task state. You can use
 this information to create automated alerts and to plot different metrics over
 time inside a monitoring dashboard.
 
+Metric information is not persisted to disk at either master or agent
+nodes, which means that metrics will be reset when masters and agents
+are restarted. Similarly, if the current leading master fails and a new
+leading master is elected, metrics at the new master will be reset.
+
 
 ## Metric Types
 
@@ -501,6 +506,13 @@ The task states listed here match those of the task state machine.
 </tr>
 <tr>
   <td>
+  <code>master/tasks_killing</code>
+  </td>
+  <td>Number of tasks currently being killed</td>
+  <td>Gauge</td>
+</tr>
+<tr>
+  <td>
   <code>master/tasks_lost</code>
   </td>
   <td>Number of lost tasks</td>
@@ -527,6 +539,13 @@ The task states listed here match those of the task state machine.
   <td>Number of starting tasks</td>
   <td>Gauge</td>
 </tr>
+<tr>
+  <td>
+  <code>master/tasks_unreachable</code>
+  </td>
+  <td>Number of unreachable tasks</td>
+  <td>Gauge</td>
+</tr>
 </table>
 
 #### Messages

http://git-wip-us.apache.org/repos/asf/mesos/blob/95aa7ff8/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 994b43b..d2aee2b 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8119,9 +8119,6 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
       case TASK_ERROR:
         ++metrics->tasks_error;
         break;
-      case TASK_UNREACHABLE:
-        ++metrics->tasks_unreachable;
-        break;
       case TASK_DROPPED:
         ++metrics->tasks_dropped;
         break;
@@ -8135,6 +8132,7 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
       case TASK_STAGING:
       case TASK_RUNNING:
       case TASK_KILLING:
+      case TASK_UNREACHABLE:
         break;
       case TASK_UNKNOWN:
         // Should not happen.
@@ -8578,6 +8576,18 @@ double Master::_tasks_running()
 }
 
 
+double Master::_tasks_unreachable()
+{
+  double count = 0.0;
+
+  foreachvalue (Framework* framework, frameworks.registered) {
+    count += framework->unreachableTasks.size();
+  }
+
+  return count;
+}
+
+
 double Master::_tasks_killing()
 {
   double count = 0.0;

http://git-wip-us.apache.org/repos/asf/mesos/blob/95aa7ff8/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 9c610d1..e4dd677 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -1830,6 +1830,7 @@ private:
   double _tasks_staging();
   double _tasks_starting();
   double _tasks_running();
+  double _tasks_unreachable();
   double _tasks_killing();
 
   double _resources_total(const std::string& name);

http://git-wip-us.apache.org/repos/asf/mesos/blob/95aa7ff8/src/master/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.cpp b/src/master/metrics.cpp
index 4c7072c..646041d 100644
--- a/src/master/metrics.cpp
+++ b/src/master/metrics.cpp
@@ -83,6 +83,9 @@ Metrics::Metrics(const Master& master)
     tasks_running(
         "master/tasks_running",
         defer(master, &Master::_tasks_running)),
+    tasks_unreachable(
+        "master/tasks_unreachable",
+        defer(master, &Master::_tasks_unreachable)),
     tasks_killing(
         "master/tasks_killing",
         defer(master, &Master::_tasks_killing)),
@@ -98,8 +101,6 @@ Metrics::Metrics(const Master& master)
         "master/tasks_error"),
     tasks_dropped(
         "master/tasks_dropped"),
-    tasks_unreachable(
-        "master/tasks_unreachable"),
     tasks_gone(
         "master/tasks_gone"),
     tasks_gone_by_operator(

http://git-wip-us.apache.org/repos/asf/mesos/blob/95aa7ff8/src/master/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/master/metrics.hpp b/src/master/metrics.hpp
index 62c3620..f701efe 100644
--- a/src/master/metrics.hpp
+++ b/src/master/metrics.hpp
@@ -61,6 +61,7 @@ struct Metrics
   process::metrics::Gauge tasks_staging;
   process::metrics::Gauge tasks_starting;
   process::metrics::Gauge tasks_running;
+  process::metrics::Gauge tasks_unreachable;
   process::metrics::Gauge tasks_killing;
   process::metrics::Counter tasks_finished;
   process::metrics::Counter tasks_failed;
@@ -68,7 +69,6 @@ struct Metrics
   process::metrics::Counter tasks_lost;
   process::metrics::Counter tasks_error;
   process::metrics::Counter tasks_dropped;
-  process::metrics::Counter tasks_unreachable;
   process::metrics::Counter tasks_gone;
   process::metrics::Counter tasks_gone_by_operator;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/95aa7ff8/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index 3bcb463..f03c5bd 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -267,13 +267,15 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
 
   AWAIT_READY(slaveLost);
 
-  JSON::Object stats = Metrics();
-  EXPECT_EQ(0, stats.values["master/tasks_lost"]);
-  EXPECT_EQ(1, stats.values["master/tasks_unreachable"]);
-  EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
-  EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
-  EXPECT_EQ(1, stats.values["master/slave_removals"]);
-  EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  {
+    JSON::Object stats = Metrics();
+    EXPECT_EQ(0, stats.values["master/tasks_lost"]);
+    EXPECT_EQ(1, stats.values["master/tasks_unreachable"]);
+    EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
+    EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
+    EXPECT_EQ(1, stats.values["master/slave_removals"]);
+    EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  }
 
   // Check the master's "/state" endpoint. There should be a single
   // unreachable agent. The "tasks" and "completed_tasks" fields
@@ -506,6 +508,17 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlavePartitionAware)
     EXPECT_EQ(1, framework.values["TASK_RUNNING"].as<JSON::Number>());
   }
 
+  {
+    JSON::Object stats = Metrics();
+    EXPECT_EQ(1, stats.values["master/tasks_running"]);
+    EXPECT_EQ(0, stats.values["master/tasks_unreachable"]);
+    EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
+    EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
+    EXPECT_EQ(1, stats.values["master/slave_removals"]);
+    EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+    EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
+  }
+
   driver.stop();
   driver.join();
 }
@@ -621,14 +634,16 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlaveNotPartitionAware)
 
   AWAIT_READY(slaveLost);
 
-  JSON::Object stats = Metrics();
-  EXPECT_EQ(1, stats.values["master/tasks_lost"]);
-  EXPECT_EQ(0, stats.values["master/tasks_unreachable"]);
-  EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
-  EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
-  EXPECT_EQ(1, stats.values["master/slave_removals"]);
-  EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
-  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
+  {
+    JSON::Object stats = Metrics();
+    EXPECT_EQ(1, stats.values["master/tasks_lost"]);
+    EXPECT_EQ(0, stats.values["master/tasks_unreachable"]);
+    EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
+    EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
+    EXPECT_EQ(1, stats.values["master/slave_removals"]);
+    EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+    EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
+  }
 
   // Check the master's "/state" endpoint. The "tasks" and
   // "unreachable_tasks" fields should be empty; there should be a
@@ -783,6 +798,18 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, ReregisterSlaveNotPartitionAware)
     EXPECT_EQ(1, framework.values["TASK_LOST"].as<JSON::Number>());
   }
 
+  {
+    JSON::Object stats = Metrics();
+    EXPECT_EQ(1, stats.values["master/tasks_lost"]);
+    EXPECT_EQ(0, stats.values["master/tasks_unreachable"]);
+    EXPECT_EQ(0, stats.values["master/tasks_killed"]);
+    EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
+    EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
+    EXPECT_EQ(1, stats.values["master/slave_removals"]);
+    EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+    EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
+  }
+
   driver.stop();
   driver.join();
 }
@@ -1038,6 +1065,14 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(
 
   Clock::resume();
 
+  {
+    JSON::Object stats = Metrics();
+    EXPECT_EQ(2, stats.values["master/tasks_running"]);
+    EXPECT_EQ(0, stats.values["master/tasks_lost"]);
+    EXPECT_EQ(0, stats.values["master/tasks_unreachable"]);
+    EXPECT_EQ(0, stats.values["master/slave_removals"]);
+  }
+
   driver1.stop();
   driver1.join();
 


[22/25] mesos git commit: Fixed whitespace in mesos-execute.

Posted by vi...@apache.org.
Fixed whitespace in mesos-execute.

Review: https://reviews.apache.org/r/55802/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ce9e00fa
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ce9e00fa
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ce9e00fa

Branch: refs/heads/master
Commit: ce9e00fae80c9be81c365a307e3a102f0c612ac1
Parents: f361b08
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:55 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:27 2017 -0800

----------------------------------------------------------------------
 src/cli/execute.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/ce9e00fa/src/cli/execute.cpp
----------------------------------------------------------------------
diff --git a/src/cli/execute.cpp b/src/cli/execute.cpp
index aee3d80..16da356 100644
--- a/src/cli/execute.cpp
+++ b/src/cli/execute.cpp
@@ -501,21 +501,21 @@ protected:
           _task.mutable_resources()->CopyFrom(resources.get());
         } else {
           foreach (TaskInfo _task, taskGroup->tasks()) {
-              _task.mutable_agent_id()->MergeFrom(offer.agent_id());
+            _task.mutable_agent_id()->MergeFrom(offer.agent_id());
 
-              // Takes resources first from the specified role, then from '*'.
-              Try<Resources> flattened =
-                Resources(_task.resources()).flatten(frameworkInfo.role());
+            // Takes resources first from the specified role, then from '*'.
+            Try<Resources> flattened =
+              Resources(_task.resources()).flatten(frameworkInfo.role());
 
-              // `frameworkInfo.role()` must be valid as it's allowed to
-              // register.
-              CHECK_SOME(flattened);
-              Option<Resources> resources = offered.find(flattened.get());
+            // `frameworkInfo.role()` must be valid as it's allowed to
+            // register.
+            CHECK_SOME(flattened);
+            Option<Resources> resources = offered.find(flattened.get());
 
-              CHECK_SOME(resources);
+            CHECK_SOME(resources);
 
-              _task.mutable_resources()->CopyFrom(resources.get());
-              _taskGroup.add_tasks()->CopyFrom(_task);
+            _task.mutable_resources()->CopyFrom(resources.get());
+            _taskGroup.add_tasks()->CopyFrom(_task);
           }
        }
        Call call;


[02/25] mesos git commit: Cleaned up `Master::updateTask` slightly.

Posted by vi...@apache.org.
Cleaned up `Master::updateTask` slightly.

Review: https://reviews.apache.org/r/55395/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/1ba2dce6
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/1ba2dce6
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/1ba2dce6

Branch: refs/heads/master
Commit: 1ba2dce6318cef3bd0fab1b9be0f8af6a03fba3b
Parents: 44fc29f
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:03:11 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:03:11 2017 -0800

----------------------------------------------------------------------
 src/master/master.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/1ba2dce6/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 731bb5b..3e2b29a 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -7993,7 +7993,7 @@ void Master::updateTask(Task* task, const StatusUpdate& update)
     // its state. Note that we are being defensive here because this should not
     // happen unless there is a bug in the master code.
     if (!protobuf::isTerminalState(task->state())) {
-      if (task->state() != status.state()) {
+      if (status.state() != task->state()) {
         sendSubscribersUpdate = true;
       }
 


[19/25] mesos git commit: Reordered function definitions to match declaration order.

Posted by vi...@apache.org.
Reordered function definitions to match declaration order.

Review: https://reviews.apache.org/r/55803/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4332ffac
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4332ffac
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4332ffac

Branch: refs/heads/master
Commit: 4332ffac40b4b0a1a4573e50c49190ae0018ac54
Parents: ce9e00f
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:05:00 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:27 2017 -0800

----------------------------------------------------------------------
 src/master/master.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/4332ffac/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 30e401c..5e6eeb9 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8414,11 +8414,11 @@ SlaveID Master::newSlaveId()
 }
 
 
-double Master::_slaves_active()
+double Master::_slaves_connected()
 {
   double count = 0.0;
   foreachvalue (Slave* slave, slaves.registered) {
-    if (slave->active) {
+    if (slave->connected) {
       count++;
     }
   }
@@ -8426,11 +8426,11 @@ double Master::_slaves_active()
 }
 
 
-double Master::_slaves_inactive()
+double Master::_slaves_disconnected()
 {
   double count = 0.0;
   foreachvalue (Slave* slave, slaves.registered) {
-    if (!slave->active) {
+    if (!slave->connected) {
       count++;
     }
   }
@@ -8438,11 +8438,11 @@ double Master::_slaves_inactive()
 }
 
 
-double Master::_slaves_connected()
+double Master::_slaves_active()
 {
   double count = 0.0;
   foreachvalue (Slave* slave, slaves.registered) {
-    if (slave->connected) {
+    if (slave->active) {
       count++;
     }
   }
@@ -8450,11 +8450,11 @@ double Master::_slaves_connected()
 }
 
 
-double Master::_slaves_disconnected()
+double Master::_slaves_inactive()
 {
   double count = 0.0;
   foreachvalue (Slave* slave, slaves.registered) {
-    if (!slave->connected) {
+    if (!slave->active) {
       count++;
     }
   }


[07/25] mesos git commit: Changed TASK_UNREACHABLE to be a non-terminal state.

Posted by vi...@apache.org.
Changed TASK_UNREACHABLE to be a non-terminal state.

This task state was always conceptually non-terminal, but previously
`isTerminalState` returned true for it. The master now distinguishes
between "removable" and "terminal" task states, so we can correctly
classify TASK_UNREACHABLE as a removable but non-terminal task state.

Review: https://reviews.apache.org/r/55476/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/6fd373b2
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/6fd373b2
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/6fd373b2

Branch: refs/heads/master
Commit: 6fd373b2643dd792c9141c2b4833d5945a50562e
Parents: d4acb10
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:03:49 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:06:29 2017 -0800

----------------------------------------------------------------------
 src/common/protobuf_utils.cpp | 3 ---
 1 file changed, 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/6fd373b2/src/common/protobuf_utils.cpp
----------------------------------------------------------------------
diff --git a/src/common/protobuf_utils.cpp b/src/common/protobuf_utils.cpp
index dd20759..3ccda1a 100644
--- a/src/common/protobuf_utils.cpp
+++ b/src/common/protobuf_utils.cpp
@@ -79,14 +79,11 @@ bool frameworkHasCapability(
 
 bool isTerminalState(const TaskState& state)
 {
-  // TODO(neilc): Revise/rename this function. LOST, UNREACHABLE, and
-  // GONE_BY_OPERATOR are not truly "terminal".
   return (state == TASK_FINISHED ||
           state == TASK_FAILED ||
           state == TASK_KILLED ||
           state == TASK_LOST ||
           state == TASK_ERROR ||
-          state == TASK_UNREACHABLE ||
           state == TASK_DROPPED ||
           state == TASK_GONE ||
           state == TASK_GONE_BY_OPERATOR);


[03/25] mesos git commit: Moved `Slave` definitions out-of-line to master.cpp.

Posted by vi...@apache.org.
Moved `Slave` definitions out-of-line to master.cpp.

Previously, one of the `Slave` member functions was defined out-of-line,
but the rest were defined inline; make them all defined out-of-line for
consistency, and also to allow the function implementations to access
members of `Master` in the future.

Review: https://reviews.apache.org/r/55472/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/bf7fb1ef
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/bf7fb1ef
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/bf7fb1ef

Branch: refs/heads/master
Commit: bf7fb1ef50672e3d82cc5ccc16f2990d14a8dc58
Parents: 1ba2dce
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:03:19 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:03:19 2017 -0800

----------------------------------------------------------------------
 src/master/master.cpp | 227 ++++++++++++++++++++++++++++++++++++++++-----
 src/master/master.hpp | 172 +++++-----------------------------
 2 files changed, 227 insertions(+), 172 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/bf7fb1ef/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 3e2b29a..3e98c0b 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -8631,6 +8631,106 @@ static bool isValidFailoverTimeout(const FrameworkInfo& frameworkInfo)
 }
 
 
+void Master::Subscribers::send(const mesos::master::Event& event)
+{
+  VLOG(1) << "Notifying all active subscribers about " << event.type() << " "
+          << "event";
+
+  foreachvalue (const Owned<Subscriber>& subscriber, subscribed) {
+    subscriber->http.send<mesos::master::Event, v1::master::Event>(event);
+  }
+}
+
+
+void Master::exited(const UUID& id)
+{
+  if (!subscribers.subscribed.contains(id)) {
+    LOG(WARNING) << "Unknown subscriber" << id << " disconnected";
+    return;
+  }
+
+  subscribers.subscribed.erase(id);
+}
+
+
+void Master::subscribe(const HttpConnection& http)
+{
+  LOG(INFO) << "Added subscriber: " << http.streamId << " to the "
+            << "list of active subscribers";
+
+  http.closed()
+    .onAny(defer(self(),
+           [this, http](const Future<Nothing>&) {
+             exited(http.streamId);
+           }));
+
+  subscribers.subscribed.put(
+      http.streamId,
+      Owned<Subscribers::Subscriber>(new Subscribers::Subscriber{http}));
+}
+
+
+Slave::Slave(
+    Master* const _master,
+    const SlaveInfo& _info,
+    const UPID& _pid,
+    const MachineID& _machineId,
+    const string& _version,
+    const Time& _registeredTime,
+    const Resources& _checkpointedResources,
+    const vector<ExecutorInfo> executorInfos,
+    const vector<Task> tasks)
+  : master(_master),
+    id(_info.id()),
+    info(_info),
+    machineId(_machineId),
+    pid(_pid),
+    version(_version),
+    registeredTime(_registeredTime),
+    connected(true),
+    active(true),
+    checkpointedResources(_checkpointedResources),
+    observer(nullptr),
+    capabilities(_info.capabilities())
+{
+  CHECK(_info.has_id());
+
+  Try<Resources> resources = applyCheckpointedResources(
+      info.resources(),
+      _checkpointedResources);
+
+  // NOTE: This should be validated during slave recovery.
+  CHECK_SOME(resources);
+  totalResources = resources.get();
+
+  foreach (const ExecutorInfo& executorInfo, executorInfos) {
+    CHECK(executorInfo.has_framework_id());
+    addExecutor(executorInfo.framework_id(), executorInfo);
+  }
+
+  foreach (const Task& task, tasks) {
+    addTask(new Task(task));
+  }
+}
+
+
+Slave::~Slave()
+{
+  if (reregistrationTimer.isSome()) {
+    process::Clock::cancel(reregistrationTimer.get());
+  }
+}
+
+
+Task* Slave::getTask(const FrameworkID& frameworkId, const TaskID& taskId)
+{
+  if (tasks.contains(frameworkId) && tasks[frameworkId].contains(taskId)) {
+    return tasks[frameworkId][taskId];
+  }
+  return nullptr;
+}
+
+
 void Slave::addTask(Task* task)
 {
   const TaskID& taskId = task->task_id();
@@ -8655,42 +8755,127 @@ void Slave::addTask(Task* task)
 }
 
 
-void Master::Subscribers::send(const mesos::master::Event& event)
+void Slave::taskTerminated(Task* task)
 {
-  VLOG(1) << "Notifying all active subscribers about " << event.type() << " "
-          << "event";
+  const TaskID& taskId = task->task_id();
+  const FrameworkID& frameworkId = task->framework_id();
 
-  foreachvalue (const Owned<Subscriber>& subscriber, subscribed) {
-    subscriber->http.send<mesos::master::Event, v1::master::Event>(event);
+  CHECK(protobuf::isTerminalState(task->state()));
+  CHECK(tasks.at(frameworkId).contains(taskId))
+    << "Unknown task " << taskId << " of framework " << frameworkId;
+
+  usedResources[frameworkId] -= task->resources();
+  if (usedResources[frameworkId].empty()) {
+    usedResources.erase(frameworkId);
   }
 }
 
 
-void Master::exited(const UUID& id)
+void Slave::removeTask(Task* task)
 {
-  if (!subscribers.subscribed.contains(id)) {
-    LOG(WARNING) << "Unknown subscriber" << id << " disconnected";
-    return;
+  const TaskID& taskId = task->task_id();
+  const FrameworkID& frameworkId = task->framework_id();
+
+  CHECK(tasks.at(frameworkId).contains(taskId))
+    << "Unknown task " << taskId << " of framework " << frameworkId;
+
+  if (!protobuf::isTerminalState(task->state())) {
+    usedResources[frameworkId] -= task->resources();
+    if (usedResources[frameworkId].empty()) {
+      usedResources.erase(frameworkId);
+    }
   }
 
-  subscribers.subscribed.erase(id);
+  tasks[frameworkId].erase(taskId);
+  if (tasks[frameworkId].empty()) {
+    tasks.erase(frameworkId);
+  }
+
+  killedTasks.remove(frameworkId, taskId);
 }
 
 
-void Master::subscribe(const HttpConnection& http)
+void Slave::addOffer(Offer* offer)
 {
-  LOG(INFO) << "Added subscriber: " << http.streamId << " to the "
-            << "list of active subscribers";
+  CHECK(!offers.contains(offer)) << "Duplicate offer " << offer->id();
 
-  http.closed()
-    .onAny(defer(self(),
-           [this, http](const Future<Nothing>&) {
-             exited(http.streamId);
-           }));
+  offers.insert(offer);
+  offeredResources += offer->resources();
+}
 
-  subscribers.subscribed.put(
-      http.streamId,
-      Owned<Subscribers::Subscriber>(new Subscribers::Subscriber{http}));
+
+void Slave::removeOffer(Offer* offer)
+{
+  CHECK(offers.contains(offer)) << "Unknown offer " << offer->id();
+
+  offeredResources -= offer->resources();
+  offers.erase(offer);
+}
+
+
+void Slave::addInverseOffer(InverseOffer* inverseOffer)
+{
+  CHECK(!inverseOffers.contains(inverseOffer))
+    << "Duplicate inverse offer " << inverseOffer->id();
+
+  inverseOffers.insert(inverseOffer);
+}
+
+
+void Slave::removeInverseOffer(InverseOffer* inverseOffer)
+{
+  CHECK(inverseOffers.contains(inverseOffer))
+    << "Unknown inverse offer " << inverseOffer->id();
+
+  inverseOffers.erase(inverseOffer);
+}
+
+
+bool Slave::hasExecutor(const FrameworkID& frameworkId,
+                        const ExecutorID& executorId) const
+{
+  return executors.contains(frameworkId) &&
+    executors.get(frameworkId).get().contains(executorId);
+}
+
+
+void Slave::addExecutor(const FrameworkID& frameworkId,
+                        const ExecutorInfo& executorInfo)
+{
+  CHECK(!hasExecutor(frameworkId, executorInfo.executor_id()))
+    << "Duplicate executor '" << executorInfo.executor_id()
+    << "' of framework " << frameworkId;
+
+  executors[frameworkId][executorInfo.executor_id()] = executorInfo;
+  usedResources[frameworkId] += executorInfo.resources();
+}
+
+void Slave::removeExecutor(const FrameworkID& frameworkId,
+                           const ExecutorID& executorId)
+{
+  CHECK(hasExecutor(frameworkId, executorId))
+    << "Unknown executor '" << executorId << "' of framework " << frameworkId;
+
+  usedResources[frameworkId] -=
+    executors[frameworkId][executorId].resources();
+  if (usedResources[frameworkId].empty()) {
+    usedResources.erase(frameworkId);
+  }
+
+  executors[frameworkId].erase(executorId);
+  if (executors[frameworkId].empty()) {
+    executors.erase(frameworkId);
+  }
+}
+
+
+void Slave::apply(const Offer::Operation& operation)
+{
+  Try<Resources> resources = totalResources.apply(operation);
+  CHECK_SOME(resources);
+
+  totalResources = resources.get();
+  checkpointedResources = totalResources.filter(needCheckpointing);
 }
 
 } // namespace master {

http://git-wip-us.apache.org/repos/asf/mesos/blob/bf7fb1ef/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index fe0590f..478c2ab 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -121,54 +121,13 @@ struct Slave
         const std::vector<ExecutorInfo> executorInfos =
           std::vector<ExecutorInfo>(),
         const std::vector<Task> tasks =
-          std::vector<Task>())
-    : master(_master),
-      id(_info.id()),
-      info(_info),
-      machineId(_machineId),
-      pid(_pid),
-      version(_version),
-      registeredTime(_registeredTime),
-      connected(true),
-      active(true),
-      checkpointedResources(_checkpointedResources),
-      observer(nullptr),
-      capabilities(_info.capabilities())
-  {
-    CHECK(_info.has_id());
-
-    Try<Resources> resources = applyCheckpointedResources(
-        info.resources(),
-        _checkpointedResources);
-
-    // NOTE: This should be validated during slave recovery.
-    CHECK_SOME(resources);
-    totalResources = resources.get();
-
-    foreach (const ExecutorInfo& executorInfo, executorInfos) {
-      CHECK(executorInfo.has_framework_id());
-      addExecutor(executorInfo.framework_id(), executorInfo);
-    }
-
-    foreach (const Task& task, tasks) {
-      addTask(new Task(task));
-    }
-  }
+          std::vector<Task>());
 
-  ~Slave()
-  {
-    if (reregistrationTimer.isSome()) {
-      process::Clock::cancel(reregistrationTimer.get());
-    }
-  }
+  ~Slave();
 
-  Task* getTask(const FrameworkID& frameworkId, const TaskID& taskId)
-  {
-    if (tasks.contains(frameworkId) && tasks[frameworkId].contains(taskId)) {
-      return tasks[frameworkId][taskId];
-    }
-    return nullptr;
-  }
+  Task* getTask(
+      const FrameworkID& frameworkId,
+      const TaskID& taskId);
 
   void addTask(Task* task);
 
@@ -176,120 +135,31 @@ struct Slave
   // TODO(bmahler): This is a hack for performance. We need to
   // maintain resource counters because computing task resources
   // functionally for all tasks is expensive, for now.
-  void taskTerminated(Task* task)
-  {
-    const TaskID& taskId = task->task_id();
-    const FrameworkID& frameworkId = task->framework_id();
-
-    CHECK(protobuf::isTerminalState(task->state()));
-    CHECK(tasks.at(frameworkId).contains(taskId))
-      << "Unknown task " << taskId << " of framework " << frameworkId;
-
-    usedResources[frameworkId] -= task->resources();
-    if (usedResources[frameworkId].empty()) {
-      usedResources.erase(frameworkId);
-    }
-  }
-
-  void removeTask(Task* task)
-  {
-    const TaskID& taskId = task->task_id();
-    const FrameworkID& frameworkId = task->framework_id();
-
-    CHECK(tasks.at(frameworkId).contains(taskId))
-      << "Unknown task " << taskId << " of framework " << frameworkId;
-
-    if (!protobuf::isTerminalState(task->state())) {
-      usedResources[frameworkId] -= task->resources();
-      if (usedResources[frameworkId].empty()) {
-        usedResources.erase(frameworkId);
-      }
-    }
-
-    tasks[frameworkId].erase(taskId);
-    if (tasks[frameworkId].empty()) {
-      tasks.erase(frameworkId);
-    }
-
-    killedTasks.remove(frameworkId, taskId);
-  }
-
-  void addOffer(Offer* offer)
-  {
-    CHECK(!offers.contains(offer)) << "Duplicate offer " << offer->id();
+  void taskTerminated(Task* task);
 
-    offers.insert(offer);
-    offeredResources += offer->resources();
-  }
-
-  void removeOffer(Offer* offer)
-  {
-    CHECK(offers.contains(offer)) << "Unknown offer " << offer->id();
-
-    offeredResources -= offer->resources();
-    offers.erase(offer);
-  }
-
-  void addInverseOffer(InverseOffer* inverseOffer)
-  {
-    CHECK(!inverseOffers.contains(inverseOffer))
-      << "Duplicate inverse offer " << inverseOffer->id();
-
-    inverseOffers.insert(inverseOffer);
-  }
-
-  void removeInverseOffer(InverseOffer* inverseOffer)
-  {
-    CHECK(inverseOffers.contains(inverseOffer))
-      << "Unknown inverse offer " << inverseOffer->id();
-
-    inverseOffers.erase(inverseOffer);
-  }
+  void removeTask(Task* task);
 
-  bool hasExecutor(const FrameworkID& frameworkId,
-                   const ExecutorID& executorId) const
-  {
-    return executors.contains(frameworkId) &&
-      executors.get(frameworkId).get().contains(executorId);
-  }
+  void addOffer(Offer* offer);
 
-  void addExecutor(const FrameworkID& frameworkId,
-                   const ExecutorInfo& executorInfo)
-  {
-    CHECK(!hasExecutor(frameworkId, executorInfo.executor_id()))
-      << "Duplicate executor '" << executorInfo.executor_id()
-      << "' of framework " << frameworkId;
+  void removeOffer(Offer* offer);
 
-    executors[frameworkId][executorInfo.executor_id()] = executorInfo;
-    usedResources[frameworkId] += executorInfo.resources();
-  }
+  void addInverseOffer(InverseOffer* inverseOffer);
 
-  void removeExecutor(const FrameworkID& frameworkId,
-                      const ExecutorID& executorId)
-  {
-    CHECK(hasExecutor(frameworkId, executorId))
-      << "Unknown executor '" << executorId << "' of framework " << frameworkId;
+  void removeInverseOffer(InverseOffer* inverseOffer);
 
-    usedResources[frameworkId] -=
-      executors[frameworkId][executorId].resources();
-    if (usedResources[frameworkId].empty()) {
-      usedResources.erase(frameworkId);
-    }
+  bool hasExecutor(
+      const FrameworkID& frameworkId,
+      const ExecutorID& executorId) const;
 
-    executors[frameworkId].erase(executorId);
-    if (executors[frameworkId].empty()) {
-      executors.erase(frameworkId);
-    }
-  }
+  void addExecutor(
+      const FrameworkID& frameworkId,
+      const ExecutorInfo& executorInfo);
 
-  void apply(const Offer::Operation& operation)
-  {
-    Try<Resources> resources = totalResources.apply(operation);
-    CHECK_SOME(resources);
+  void removeExecutor(
+      const FrameworkID& frameworkId,
+      const ExecutorID& executorId);
 
-    totalResources = resources.get();
-    checkpointedResources = totalResources.filter(needCheckpointing);
-  }
+  void apply(const Offer::Operation& operation);
 
   Master* const master;
   const SlaveID id;


[08/25] mesos git commit: Shutdown tasks of completed frameworks on agent re-registration.

Posted by vi...@apache.org.
Shutdown tasks of completed frameworks on agent re-registration.

Previously, if a framework completed (e.g., due to a teardown operation
or framework shutdown), any framework tasks running on partitioned
agents would not be shutdown when the agent re-registered. For tasks
that are not partition-aware, the task would be shutdown on agent
re-registration anyway. But for partition-aware tasks, this could lead
to orphan tasks.

Fix this by changing the master to shutdown such tasks when the agent
reregisters.

Note that if the master fails over between the time the framework
completes and a partitioned agent re-registers, any framework tasks
running on the agent will NOT be shutdown. This is a known bug; fixing
it requires persisting the framework shutdown operation to the registry
(MESOS-1719).

Review: https://reviews.apache.org/r/54232/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/482796bb
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/482796bb
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/482796bb

Branch: refs/heads/master
Commit: 482796bb1fb3f65269b0fd40143eccb8ced2027a
Parents: 6fd373b
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:03:53 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:06:33 2017 -0800

----------------------------------------------------------------------
 src/master/master.cpp         | 109 ++++++++++++++++-----
 src/tests/partition_tests.cpp | 196 ++++++++++++++++++++++++++-----------
 2 files changed, 225 insertions(+), 80 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/482796bb/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 062b210..c28ca25 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -5562,6 +5562,10 @@ void Master::_reregisterSlave(
   // such tasks "completed" when the agent was marked unreachable, so
   // no further cleanup for non-partition-aware tasks is required.
   //
+  // In addition, we also filter any tasks whose frameworks have
+  // completed. As in case (a), such frameworks will be shutdown and
+  // their tasks have already been marked "completed".
+  //
   // (b) If the master has failed over, all tasks are re-added to the
   // master. The master shouldn't have any record of the tasks running
   // on the agent, so no further cleanup is required.
@@ -5570,6 +5574,12 @@ void Master::_reregisterSlave(
     const FrameworkID& frameworkId = task.framework_id();
     Framework* framework = getFramework(frameworkId);
 
+    // Don't re-add tasks whose framework has been shutdown at the
+    // master. Such frameworks will be shutdown on the agent below.
+    if (isCompletedFramework(task.framework_id())) {
+      continue;
+    }
+
     // Always re-add partition-aware tasks.
     if (partitionAwareFrameworks.contains(frameworkId)) {
       tasks_.push_back(task);
@@ -5617,28 +5627,42 @@ void Master::_reregisterSlave(
   LOG(INFO) << "Re-registered agent " << *slave
             << " with " << slave->info.resources();
 
-  // Shutdown any frameworks running on the slave that don't have the
-  // PARTITION_AWARE capability, provided that this instance of the
-  // master previously added the slave to the `slaves.removed`
-  // collection. This matches the Mesos 1.0 "non-strict" semantics for
-  // frameworks that are not partition-aware: when a previously
-  // unreachable slave reregisters, its tasks are only shutdown if the
-  // master has not failed over.
-  if (slaveWasRemoved) {
-    foreach (const FrameworkInfo& framework, frameworks) {
-      if (!partitionAwareFrameworks.contains(framework.id())) {
-        LOG(INFO) << "Shutting down framework " << framework.id()
-                  << " at reregistered agent " << *slave
-                  << " because the framework is not partition-aware";
-
-        ShutdownFrameworkMessage message;
-        message.mutable_framework_id()->MergeFrom(framework.id());
-        send(slave->pid, message);
+  // Determine which frameworks on the slave to shutdown, if any. This
+  // happens in two cases:
+  //
+  // (1) If this master marked the slave unreachable (i.e., master has
+  // not failed over), we shutdown any non-partition-aware frameworks
+  // running on the slave. This matches the Mesos <= 1.0 "non-strict"
+  // registry semantics.
+  //
+  // (2) Any framework that is completed at the master but still
+  // running at the slave is shutdown. This can occur if the framework
+  // was removed when the slave was partitioned. NOTE: This is just a
+  // short-term hack because information about completed frameworks is
+  // lost when the master fails over. Also, we only store a limited
+  // number of completed frameworks. A proper fix likely involves
+  // storing framework information in the registry (MESOS-1719).
+  foreach (const FrameworkInfo& framework, frameworks) {
+    if (slaveWasRemoved && !partitionAwareFrameworks.contains(framework.id())) {
+      LOG(INFO) << "Shutting down framework " << framework.id()
+                << " at re-registered agent " << *slave
+                << " because the framework is not partition-aware";
 
-        // The framework's tasks should not be stored in the master's
-        // in-memory state, because they were not re-added above.
-        CHECK(!slave->tasks.contains(framework.id()));
-      }
+      ShutdownFrameworkMessage message;
+      message.mutable_framework_id()->MergeFrom(framework.id());
+      send(slave->pid, message);
+
+      // The framework's tasks should not be stored in the master's
+      // in-memory state, because they were not re-added filtered above.
+      CHECK(!slave->tasks.contains(framework.id()));
+    } else if (isCompletedFramework(framework.id())) {
+      LOG(INFO) << "Shutting down framework " << framework.id()
+                << " at re-registered agent " << *slave
+                << " because the framework has been shutdown at the master";
+
+      ShutdownFrameworkMessage message;
+      message.mutable_framework_id()->MergeFrom(framework.id());
+      send(slave->pid, message);
     }
   }
 
@@ -7479,7 +7503,8 @@ void Master::removeFramework(Framework* framework)
   // Remove the pending tasks from the framework.
   framework->pendingTasks.clear();
 
-  // Remove pointers to the framework's tasks in slaves.
+  // Remove pointers to the framework's tasks in slaves and mark those
+  // tasks as completed.
   foreachvalue (Task* task, utils::copy(framework->tasks)) {
     Slave* slave = slaves.registered.get(task->slave_id());
 
@@ -7497,7 +7522,15 @@ void Master::removeFramework(Framework* framework)
     // collect the possible finished status. We tolerate this,
     // because we expect that if the framework has been asked to shut
     // down, its user is not interested in results anymore.
+    //
     // TODO(alex): Consider a more descriptive state, e.g. TASK_ABANDONED.
+    //
+    // TODO(neilc): Marking the task KILLED before it has actually
+    // terminated is misleading. Instead, we should consider leaving
+    // the task in its current state at the master; if/when the agent
+    // shuts down the framework, we should arrange for a terminal
+    // status update to be delivered to the master and update the
+    // state of the task at that time (MESOS-6608).
     const StatusUpdate& update = protobuf::createStatusUpdate(
         task->framework_id(),
         task->slave_id(),
@@ -7515,6 +7548,38 @@ void Master::removeFramework(Framework* framework)
     removeTask(task);
   }
 
+  // Mark the framework's unreachable tasks as completed.
+  foreach (const TaskID& taskId, framework->unreachableTasks.keys()) {
+    const Owned<Task>& task = framework->unreachableTasks.at(taskId);
+
+    // TODO(neilc): Per comment above, using TASK_KILLED here is not
+    // ideal. It would be better to use TASK_UNREACHABLE here and only
+    // transition it to a terminal state when the agent re-registers
+    // and the task is shutdown (MESOS-6608).
+    const StatusUpdate& update = protobuf::createStatusUpdate(
+        task->framework_id(),
+        task->slave_id(),
+        task->task_id(),
+        TASK_KILLED,
+        TaskStatus::SOURCE_MASTER,
+        None(),
+        "Framework " + framework->id().value() + " removed",
+        TaskStatus::REASON_FRAMEWORK_REMOVED,
+        (task->has_executor_id()
+         ? Option<ExecutorID>(task->executor_id())
+         : None()));
+
+    updateTask(task.get(), update);
+
+    // We don't need to remove the task from the slave, because the
+    // task was removed when the agent was marked unreachable.
+    CHECK(!slaves.registered.contains(task->slave_id()));
+
+    // Move task from unreachable map to completed map.
+    framework->addCompletedTask(*task.get());
+    framework->unreachableTasks.erase(taskId);
+  }
+
   // Remove the framework's offers (if they weren't removed before).
   foreach (Offer* offer, utils::copy(framework->offers)) {
     allocator->recoverResources(

http://git-wip-us.apache.org/repos/asf/mesos/blob/482796bb/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index a6919c5..47f6dc9 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -973,9 +973,8 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(
 
 // This test causes a slave to be partitioned while it is running a
 // task for a PARTITION_AWARE framework. The scheduler is shutdown
-// before the partition heals. Right now, the task is left running as
-// an orphan; when MESOS-6602 is fixed, the task will be shutdown when
-// the agent re-registers.
+// before the partition heals; the task should be shutdown after the
+// agent re-registers.
 TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, PartitionedSlaveOrphanedTask)
 {
   Clock::pause();
@@ -1039,10 +1038,10 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, PartitionedSlaveOrphanedTask)
   driver.launchTasks(offer.id(), {task});
 
   AWAIT_READY(runningStatus);
-  EXPECT_EQ(TASK_RUNNING, runningStatus.get().state());
-  EXPECT_EQ(task.task_id(), runningStatus.get().task_id());
+  EXPECT_EQ(TASK_RUNNING, runningStatus->state());
+  EXPECT_EQ(task.task_id(), runningStatus->task_id());
 
-  const SlaveID slaveId = runningStatus.get().slave_id();
+  const SlaveID slaveId = runningStatus->slave_id();
 
   AWAIT_READY(statusUpdateAck);
 
@@ -1070,20 +1069,12 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, PartitionedSlaveOrphanedTask)
   Clock::advance(masterFlags.agent_ping_timeout);
   Clock::settle();
 
-  // Record the time at which we expect the master to have marked the
-  // agent as unhealthy. We then advance the clock -- this shouldn't
-  // do anything, but it ensures that the `unreachable_time` we check
-  // below is computed at the right time.
-  TimeInfo partitionTime = protobuf::getCurrentTime();
-
-  Clock::advance(Milliseconds(100));
-
   AWAIT_READY(unreachableStatus);
-  EXPECT_EQ(TASK_UNREACHABLE, unreachableStatus.get().state());
-  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, unreachableStatus.get().reason());
-  EXPECT_EQ(task.task_id(), unreachableStatus.get().task_id());
-  EXPECT_EQ(slaveId, unreachableStatus.get().slave_id());
-  EXPECT_EQ(partitionTime, unreachableStatus.get().unreachable_time());
+  EXPECT_EQ(TASK_UNREACHABLE, unreachableStatus->state());
+  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, unreachableStatus->reason());
+  EXPECT_EQ(task.task_id(), unreachableStatus->task_id());
+  EXPECT_EQ(slaveId, unreachableStatus->slave_id());
+  EXPECT_TRUE(unreachableStatus->has_unreachable_time());
 
   AWAIT_READY(slaveLost);
 
@@ -1092,69 +1083,158 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(PartitionTest, PartitionedSlaveOrphanedTask)
   driver.stop();
   driver.join();
 
-  // Cause the slave to re-register with the master.
+  // Before the agent re-registers, check how `task` is displayed by
+  // the master's "/state" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    JSON::Array unregisteredFrameworks =
+      parse->values["unregistered_frameworks"].as<JSON::Array>();
+
+    EXPECT_TRUE(unregisteredFrameworks.values.empty());
+
+    EXPECT_TRUE(parse->values["frameworks"].as<JSON::Array>().values.empty());
+    EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
+
+    JSON::Array completedFrameworks =
+      parse->values["completed_frameworks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, completedFrameworks.values.size());
+
+    JSON::Object framework =
+      completedFrameworks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        frameworkId.get(),
+        framework.values["id"].as<JSON::String>().value);
+
+    EXPECT_TRUE(framework.values["tasks"].as<JSON::Array>().values.empty());
+    EXPECT_TRUE(
+        framework.values["unreachable_tasks"].as<JSON::Array>().values.empty());
+
+    JSON::Array completedTasks =
+      framework.values["completed_tasks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, completedTasks.values.size());
+
+    JSON::Object completedTask =
+      completedTasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(),
+        completedTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_KILLED",
+        completedTask.values["state"].as<JSON::String>().value);
+  }
+
+  // Cause the slave to re-register with the master; the master should
+  // send a `ShutdownFrameworkMessage` to the slave.
   Future<SlaveReregisteredMessage> slaveReregistered = FUTURE_PROTOBUF(
       SlaveReregisteredMessage(), master.get()->pid, slave.get()->pid);
 
+  Future<ShutdownFrameworkMessage> shutdownFramework = FUTURE_PROTOBUF(
+      ShutdownFrameworkMessage(), master.get()->pid, slave.get()->pid);
+
   detector.appoint(master.get()->pid);
 
   Clock::advance(agentFlags.registration_backoff_factor);
   AWAIT_READY(slaveReregistered);
+  AWAIT_READY(shutdownFramework);
 
   Clock::resume();
 
-  // Check if `task` is still running by querying master's state endpoint.
-  Future<Response> response = process::http::get(
-      master.get()->pid,
-      "state",
-      None(),
-      createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+  // After the agent re-registers, check how `task` is displayed by
+  // the master's "/state" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "state",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
 
-  AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
-  AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
 
-  Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
-  ASSERT_SOME(parse);
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
 
-  JSON::Array unregisteredFrameworks =
-    parse->values["unregistered_frameworks"].as<JSON::Array>();
+    JSON::Array unregisteredFrameworks =
+      parse->values["unregistered_frameworks"].as<JSON::Array>();
 
-  // TODO(neilc): When MESOS-6602 is fixed, this should be empty.
-  EXPECT_EQ(1u, unregisteredFrameworks.values.size());
+    EXPECT_TRUE(unregisteredFrameworks.values.empty());
 
-  EXPECT_TRUE(parse->values["frameworks"].as<JSON::Array>().values.empty());
-  EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
+    EXPECT_TRUE(parse->values["frameworks"].as<JSON::Array>().values.empty());
+    EXPECT_TRUE(parse->values["orphan_tasks"].as<JSON::Array>().values.empty());
 
-  JSON::Array completedFrameworks =
-    parse->values["completed_frameworks"].as<JSON::Array>();
+    JSON::Array completedFrameworks =
+      parse->values["completed_frameworks"].as<JSON::Array>();
 
-  ASSERT_EQ(1u, completedFrameworks.values.size());
+    ASSERT_EQ(1u, completedFrameworks.values.size());
 
-  JSON::Object jsonFramework =
-    completedFrameworks.values.front().as<JSON::Object>();
+    JSON::Object framework =
+      completedFrameworks.values.front().as<JSON::Object>();
 
-  EXPECT_EQ(
-      frameworkId.get(),
-      jsonFramework.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        frameworkId.get(),
+        framework.values["id"].as<JSON::String>().value);
 
-  EXPECT_TRUE(jsonFramework.values["tasks"].as<JSON::Array>().values.empty());
-  EXPECT_TRUE(
-      jsonFramework.values["completed_tasks"].as<JSON::Array>().values.empty());
+    EXPECT_TRUE(framework.values["tasks"].as<JSON::Array>().values.empty());
+    EXPECT_TRUE(
+        framework.values["unreachable_tasks"].as<JSON::Array>().values.empty());
 
-  JSON::Array unreachableTasks =
-    jsonFramework.values["unreachable_tasks"].as<JSON::Array>();
+    JSON::Array completedTasks =
+      framework.values["completed_tasks"].as<JSON::Array>();
 
-  ASSERT_EQ(1u, unreachableTasks.values.size());
+    ASSERT_EQ(1u, completedTasks.values.size());
 
-  JSON::Object unreachableTask =
-    unreachableTasks.values.front().as<JSON::Object>();
+    JSON::Object completedTask =
+      completedTasks.values.front().as<JSON::Object>();
 
-  EXPECT_EQ(
-      task.task_id(),
-      unreachableTask.values["id"].as<JSON::String>().value);
-  EXPECT_EQ(
-      "TASK_UNREACHABLE",
-      unreachableTask.values["state"].as<JSON::String>().value);
+    EXPECT_EQ(
+        task.task_id(),
+        completedTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_KILLED",
+        completedTask.values["state"].as<JSON::String>().value);
+  }
+
+  // Also check the master's "/tasks" endpoint.
+  {
+    Future<Response> response = process::http::get(
+        master.get()->pid,
+        "tasks",
+        None(),
+        createBasicAuthHeaders(DEFAULT_CREDENTIAL));
+
+    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+    AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
+
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
+    ASSERT_SOME(parse);
+
+    JSON::Array tasks = parse->values["tasks"].as<JSON::Array>();
+
+    ASSERT_EQ(1u, tasks.values.size());
+
+    JSON::Object jsonTask = tasks.values.front().as<JSON::Object>();
+
+    EXPECT_EQ(
+        task.task_id(), jsonTask.values["id"].as<JSON::String>().value);
+    EXPECT_EQ(
+        "TASK_KILLED",
+        jsonTask.values["state"].as<JSON::String>().value);
+  }
 }
 
 


[10/25] mesos git commit: Prevented task launches that reuse unreachable task IDs.

Posted by vi...@apache.org.
Prevented task launches that reuse unreachable task IDs.

The master keeps an in-memory cache of task IDs that have recently been
marked unreachable. The master now consults this cache to reject task
launch attempts that reuse one of these recently unreachable task IDs
(such tasks are not terminal and may resume running in the future). This
check is not complete (we won't detect all cases in which unreachable
task IDs are reused), but preventing this from happening in the common
case seems worth doing. See MESOS-6785 for details.

Review: https://reviews.apache.org/r/54793/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/b8562b8f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/b8562b8f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/b8562b8f

Branch: refs/heads/master
Commit: b8562b8fc578f8d62f4a934d315516161d9e1720
Parents: e65fa03
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:22 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:26 2017 -0800

----------------------------------------------------------------------
 src/master/validation.cpp             |  12 ++-
 src/tests/master_validation_tests.cpp | 143 +++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/b8562b8f/src/master/validation.cpp
----------------------------------------------------------------------
diff --git a/src/master/validation.cpp b/src/master/validation.cpp
index 0920bd0..2a79fbb 100644
--- a/src/master/validation.cpp
+++ b/src/master/validation.cpp
@@ -788,8 +788,8 @@ Option<Error> validateTaskID(const TaskInfo& task)
 }
 
 
-// Validates that the TaskID does not collide with any existing tasks
-// for the framework.
+// Validates that the TaskID does not collide with the ID of a running
+// or unreachable task for this framework.
 Option<Error> validateUniqueTaskID(const TaskInfo& task, Framework* framework)
 {
   const TaskID& taskId = task.task_id();
@@ -798,6 +798,14 @@ Option<Error> validateUniqueTaskID(const TaskInfo& task, Framework* framework)
     return Error("Task has duplicate ID: " + taskId.value());
   }
 
+  // TODO(neilc): `unreachableTasks` is a fixed-size cache and is not
+  // preserved across master failover, so we cannot avoid all possible
+  // task ID collisions (MESOS-6785).
+  if (framework->unreachableTasks.contains(taskId)) {
+    return Error("Task reuses the ID of an unreachable task: " +
+                 taskId.value());
+  }
+
   return None();
 }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/b8562b8f/src/tests/master_validation_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp
index a631781..ce10ea4 100644
--- a/src/tests/master_validation_tests.cpp
+++ b/src/tests/master_validation_tests.cpp
@@ -40,6 +40,8 @@
 #include "master/master.hpp"
 #include "master/validation.hpp"
 
+#include "master/detector/standalone.hpp"
+
 #include "slave/slave.hpp"
 
 #include "tests/containerizer.hpp"
@@ -54,9 +56,11 @@ using mesos::internal::master::Master;
 using mesos::internal::slave::Slave;
 
 using mesos::master::detector::MasterDetector;
+using mesos::master::detector::StandaloneMasterDetector;
 
 using process::Clock;
 using process::Future;
+using process::Message;
 using process::Owned;
 using process::PID;
 
@@ -65,6 +69,7 @@ using std::vector;
 
 using testing::_;
 using testing::AtMost;
+using testing::Eq;
 using testing::Return;
 
 namespace mesos {
@@ -1429,6 +1434,144 @@ TEST_F(TaskValidationTest, ExecutorInfoDiffersOnDifferentSlaves)
 }
 
 
+// This test checks that if a task is launched with the same task ID
+// as an unreachable task, the second task will be rejected. The
+// master does not store all unreachable task IDs so we cannot prevent
+// all task ID collisions, but we try to prevent the common case.
+TEST_F(TaskValidationTest, TaskReusesUnreachableTaskID)
+{
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Allow the master to PING the slave, but drop all PONG messages
+  // from the slave. Note that we don't match on the master / slave
+  // PIDs because it's actually the `SlaveObserver` process that sends
+  // the pings.
+  Future<Message> ping = FUTURE_MESSAGE(
+      Eq(PingSlaveMessage().GetTypeName()), _, _);
+
+  DROP_PROTOBUFS(PongSlaveMessage(), _, _);
+
+  StandaloneMasterDetector detector1(master.get()->pid);
+  Try<Owned<cluster::Slave>> slave1 = StartSlave(&detector1);
+  ASSERT_SOME(slave1);
+
+  // Start a partition-aware scheduler.
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::PARTITION_AWARE);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer>> offers1;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers1));
+
+  driver.start();
+
+  AWAIT_READY(offers1);
+  ASSERT_FALSE(offers1->empty());
+
+  Offer offer1 = offers1.get()[0];
+  TaskInfo task1 = createTask(offer1, "sleep 60");
+
+  Future<TaskStatus> runningStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&runningStatus));
+
+  Future<Nothing> statusUpdateAck = FUTURE_DISPATCH(
+      slave1.get()->pid, &Slave::_statusUpdateAcknowledgement);
+
+  driver.launchTasks(offer1.id(), {task1});
+
+  AWAIT_READY(runningStatus);
+  EXPECT_EQ(TASK_RUNNING, runningStatus->state());
+  EXPECT_EQ(task1.task_id(), runningStatus->task_id());
+
+  const SlaveID slaveId1 = runningStatus->slave_id();
+
+  AWAIT_READY(statusUpdateAck);
+
+  // Now, induce a partition of the slave by having the master
+  // timeout the slave.
+  Clock::pause();
+
+  Future<TaskStatus> unreachableStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&unreachableStatus));
+
+  Future<Nothing> slaveLost;
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .WillOnce(FutureSatisfy(&slaveLost));
+
+  size_t pings = 0;
+  while (true) {
+    AWAIT_READY(ping);
+    pings++;
+    if (pings == masterFlags.max_agent_ping_timeouts) {
+      break;
+    }
+    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
+    Clock::advance(masterFlags.agent_ping_timeout);
+  }
+
+  Clock::advance(masterFlags.agent_ping_timeout);
+
+  AWAIT_READY(unreachableStatus);
+  EXPECT_EQ(TASK_UNREACHABLE, unreachableStatus->state());
+  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, unreachableStatus->reason());
+  EXPECT_EQ(task1.task_id(), unreachableStatus->task_id());
+  EXPECT_EQ(slaveId1, unreachableStatus->slave_id());
+
+  AWAIT_READY(slaveLost);
+
+  // Start a second agent (the first agent remains partitioned).
+  StandaloneMasterDetector detector2(master.get()->pid);
+  slave::Flags agentFlags2 = CreateSlaveFlags();
+  Try<Owned<cluster::Slave>> slave2 = StartSlave(&detector2, agentFlags2);
+  ASSERT_SOME(slave2);
+
+  Future<vector<Offer>> offers2;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers2));
+
+  Clock::advance(agentFlags2.registration_backoff_factor);
+  AWAIT_READY(offers2);
+  ASSERT_FALSE(offers2->empty());
+
+  // Attempt to launch a new task that reuses the ID of the first
+  // (unreachable) task. This should result in TASK_ERROR.
+
+  Offer offer2 = offers2.get()[0];
+  TaskInfo task2 = createTask(
+      offer2,
+      "sleep 60",
+      None(),
+      "test-task-2",
+      task1.task_id().value());
+
+  Future<TaskStatus> errorStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&errorStatus));
+
+  driver.launchTasks(offer2.id(), {task2});
+
+  AWAIT_READY(errorStatus);
+  EXPECT_EQ(TASK_ERROR, errorStatus->state());
+  EXPECT_EQ(task2.task_id(), errorStatus->task_id());
+
+  driver.stop();
+  driver.join();
+
+  Clock::resume();
+}
+
+
 // This test verifies that a task is not allowed to mix revocable and
 // non-revocable resources.
 TEST_F(TaskValidationTest, TaskUsesRevocableResources)


[12/25] mesos git commit: Cleaned up master.proto slightly.

Posted by vi...@apache.org.
Cleaned up master.proto slightly.

Review: https://reviews.apache.org/r/55798/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/6848084b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/6848084b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/6848084b

Branch: refs/heads/master
Commit: 6848084b8dcfe5989736e769c0357c991b9119b6
Parents: 2692535
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:32 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:26 2017 -0800

----------------------------------------------------------------------
 include/mesos/master/master.proto    | 4 ++--
 include/mesos/v1/master/master.proto | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/6848084b/include/mesos/master/master.proto
----------------------------------------------------------------------
diff --git a/include/mesos/master/master.proto b/include/mesos/master/master.proto
index 7eaae22..a2228db 100644
--- a/include/mesos/master/master.proto
+++ b/include/mesos/master/master.proto
@@ -66,7 +66,7 @@ message Call {
 
     GET_MASTER = 17;        // Retrieves the master's information.
 
-    SUBSCRIBE = 18;          // Subscribes the master to receive events.
+    SUBSCRIBE = 18;         // Subscribes the master to receive events.
 
     RESERVE_RESOURCES = 19;
     UNRESERVE_RESOURCES = 20;
@@ -84,7 +84,7 @@ message Call {
 
     GET_QUOTA = 28;
     SET_QUOTA = 29;          // See 'SetQuota' below.
-    REMOVE_QUOTA = 30;
+    REMOVE_QUOTA = 30;       // See 'RemoveQuota' below.
   }
 
   // Provides a snapshot of the current metrics tracked by the master.

http://git-wip-us.apache.org/repos/asf/mesos/blob/6848084b/include/mesos/v1/master/master.proto
----------------------------------------------------------------------
diff --git a/include/mesos/v1/master/master.proto b/include/mesos/v1/master/master.proto
index 5de3a93..cfdca74 100644
--- a/include/mesos/v1/master/master.proto
+++ b/include/mesos/v1/master/master.proto
@@ -66,7 +66,7 @@ message Call {
 
     GET_MASTER = 17;        // Retrieves the master's information.
 
-    SUBSCRIBE = 18;          // Subscribes the master to receive events.
+    SUBSCRIBE = 18;         // Subscribes the master to receive events.
 
     RESERVE_RESOURCES = 19;
     UNRESERVE_RESOURCES = 20;
@@ -84,7 +84,7 @@ message Call {
 
     GET_QUOTA = 28;
     SET_QUOTA = 29;          // See 'SetQuota' below.
-    REMOVE_QUOTA = 30;
+    REMOVE_QUOTA = 30;       // See 'RemoveQuota' below.
   }
 
   // Provides a snapshot of the current metrics tracked by the master.


[14/25] mesos git commit: Cleaned up SUBSCRIBE code slightly.

Posted by vi...@apache.org.
Cleaned up SUBSCRIBE code slightly.

Ensure `SUBSCRIBE` code is ordered consistently with respect to the
other operator call operations.

Review: https://reviews.apache.org/r/55799/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4c775749
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4c775749
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4c775749

Branch: refs/heads/master
Commit: 4c77574929a4a57dcb755f11b76c42fdd1df0787
Parents: 6848084
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:37 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:26 2017 -0800

----------------------------------------------------------------------
 src/master/http.cpp       | 6 +++---
 src/master/validation.cpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/4c775749/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index 95cb6c6..10382f4 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -590,6 +590,9 @@ Future<Response> Master::Http::api(
     case mesos::master::Call::GET_MASTER:
       return getMaster(call, principal, acceptType);
 
+    case mesos::master::Call::SUBSCRIBE:
+      return subscribe(call, principal, acceptType);
+
     case mesos::master::Call::RESERVE_RESOURCES:
       return reserveResources(call, principal, acceptType);
 
@@ -625,9 +628,6 @@ Future<Response> Master::Http::api(
 
     case mesos::master::Call::REMOVE_QUOTA:
       return quotaHandler.remove(call, principal);
-
-    case mesos::master::Call::SUBSCRIBE:
-      return subscribe(call, principal, acceptType);
   }
 
   UNREACHABLE();

http://git-wip-us.apache.org/repos/asf/mesos/blob/4c775749/src/master/validation.cpp
----------------------------------------------------------------------
diff --git a/src/master/validation.cpp b/src/master/validation.cpp
index 2a79fbb..e3f71be 100644
--- a/src/master/validation.cpp
+++ b/src/master/validation.cpp
@@ -133,6 +133,9 @@ Option<Error> validate(
     case mesos::master::Call::GET_MASTER:
       return None();
 
+    case mesos::master::Call::SUBSCRIBE:
+      return None();
+
     case mesos::master::Call::RESERVE_RESOURCES: {
       if (!call.has_reserve_resources()) {
         return Error("Expecting 'reserve_resources' to be present");
@@ -213,9 +216,6 @@ Option<Error> validate(
         return Error("Expecting 'remove_quota' to be present");
       }
       return None();
-
-    case mesos::master::Call::SUBSCRIBE:
-      return None();
   }
 
   UNREACHABLE();


[13/25] mesos git commit: Cleaned up teardown tests slightly.

Posted by vi...@apache.org.
Cleaned up teardown tests slightly.

Review: https://reviews.apache.org/r/54794/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2692535b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2692535b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2692535b

Branch: refs/heads/master
Commit: 2692535bcbe4634dca0a58598093ddcf1887ca88
Parents: b8562b8
Author: Neil Conway <ne...@gmail.com>
Authored: Mon Jan 23 17:04:27 2017 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Jan 23 17:07:26 2017 -0800

----------------------------------------------------------------------
 src/tests/teardown_tests.cpp | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2692535b/src/tests/teardown_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/teardown_tests.cpp b/src/tests/teardown_tests.cpp
index ce51d20..239a87e 100644
--- a/src/tests/teardown_tests.cpp
+++ b/src/tests/teardown_tests.cpp
@@ -91,7 +91,7 @@ TEST_F(TeardownTest, Success)
         master.get()->pid,
         "teardown",
         createBasicAuthHeaders(DEFAULT_CREDENTIAL),
-        "frameworkId=" + frameworkId.get().value());
+        "frameworkId=" + frameworkId->value());
 
     AWAIT_READY(response);
     AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
@@ -109,7 +109,7 @@ TEST_F(TeardownTest, Success)
     AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
     AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
 
-    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response.get().body);
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
     ASSERT_SOME(parse);
 
     JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
@@ -161,7 +161,7 @@ TEST_F(TeardownTest, BadCredentials)
       master.get()->pid,
       "teardown",
       createBasicAuthHeaders(badCredential),
-      "frameworkId=" + frameworkId.get().value());
+      "frameworkId=" + frameworkId->value());
 
   AWAIT_READY(response);
   AWAIT_EXPECT_RESPONSE_STATUS_EQ(Unauthorized({}).status, response);
@@ -204,7 +204,7 @@ TEST_F(TeardownTest, GoodACLs)
       master.get()->pid,
       "teardown",
       createBasicAuthHeaders(DEFAULT_CREDENTIAL),
-      "frameworkId=" + frameworkId.get().value());
+      "frameworkId=" + frameworkId->value());
 
   AWAIT_READY(response);
   AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
@@ -248,7 +248,7 @@ TEST_F(TeardownTest, GoodDeprecatedACLs)
       master.get()->pid,
       "teardown",
       createBasicAuthHeaders(DEFAULT_CREDENTIAL),
-      "frameworkId=" + frameworkId.get().value());
+      "frameworkId=" + frameworkId->value());
 
   AWAIT_READY(response);
   AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
@@ -290,7 +290,7 @@ TEST_F(TeardownTest, BadACLs)
       master.get()->pid,
       "teardown",
       createBasicAuthHeaders(DEFAULT_CREDENTIAL),
-      "frameworkId=" + frameworkId.get().value());
+      "frameworkId=" + frameworkId->value());
 
   AWAIT_READY(response);
   AWAIT_EXPECT_RESPONSE_STATUS_EQ(Forbidden().status, response);
@@ -354,7 +354,7 @@ TEST_F(TeardownTest, NoHeader)
       master.get()->pid,
       "teardown",
       None(),
-      "frameworkId=" + frameworkId.get().value());
+      "frameworkId=" + frameworkId->value());
 
   AWAIT_READY(response);
   AWAIT_EXPECT_RESPONSE_STATUS_EQ(Unauthorized({}).status, response);
@@ -364,7 +364,9 @@ TEST_F(TeardownTest, NoHeader)
 }
 
 
-TEST_F(TeardownTest, DisconnectedFrameworkAfterFailover)
+// This test checks that the teardown operation can be used on a
+// framework that has not re-registered after master failover.
+TEST_F(TeardownTest, RecoveredFrameworkAfterMasterFailover)
 {
   master::Flags masterFlags = CreateMasterFlags();
   Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
@@ -384,8 +386,7 @@ TEST_F(TeardownTest, DisconnectedFrameworkAfterFailover)
 
   Future<vector<Offer>> offers;
   EXPECT_CALL(sched, resourceOffers(&driver, _))
-    .WillOnce(FutureArg<1>(&offers))
-    .WillRepeatedly(Return()); // Ignore subsequent offers.
+    .WillOnce(FutureArg<1>(&offers));
 
   ASSERT_EQ(DRIVER_RUNNING, driver.start());
 
@@ -406,8 +407,8 @@ TEST_F(TeardownTest, DisconnectedFrameworkAfterFailover)
   driver.launchTasks(offers.get()[0].id(), {task});
 
   AWAIT_READY(runningStatus);
-  EXPECT_EQ(TASK_RUNNING, runningStatus.get().state());
-  EXPECT_EQ(task.task_id(), runningStatus.get().task_id());
+  EXPECT_EQ(TASK_RUNNING, runningStatus->state());
+  EXPECT_EQ(task.task_id(), runningStatus->task_id());
 
   AWAIT_READY(statusUpdateAck);
 
@@ -429,13 +430,14 @@ TEST_F(TeardownTest, DisconnectedFrameworkAfterFailover)
 
   AWAIT_READY(slaveReregisteredMessage);
 
-  // Teardown the framework.
+  // Teardown the framework, which has not yet re-registered with the
+  // new master.
   {
     Future<Response> response = process::http::post(
         master.get()->pid,
         "teardown",
         createBasicAuthHeaders(DEFAULT_CREDENTIAL),
-        "frameworkId=" + frameworkId.get().value());
+        "frameworkId=" + frameworkId->value());
 
     AWAIT_READY(response);
     AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
@@ -455,7 +457,7 @@ TEST_F(TeardownTest, DisconnectedFrameworkAfterFailover)
     AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
     AWAIT_EXPECT_RESPONSE_HEADER_EQ(APPLICATION_JSON, "Content-Type", response);
 
-    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response.get().body);
+    Try<JSON::Object> parse = JSON::parse<JSON::Object>(response->body);
     ASSERT_SOME(parse);
 
     JSON::Array frameworks = parse->values["frameworks"].as<JSON::Array>();
@@ -472,12 +474,12 @@ TEST_F(TeardownTest, DisconnectedFrameworkAfterFailover)
 
     ASSERT_EQ(1u, completedFrameworks.values.size());
 
-    JSON::Object firstCompletedFramework =
+    JSON::Object completedFramework =
       completedFrameworks.values.front().as<JSON::Object>();
 
     EXPECT_EQ(
         frameworkId.get(),
-        firstCompletedFramework.values["id"].as<JSON::String>().value);
+        completedFramework.values["id"].as<JSON::String>().value);
   }
 
   driver.stop();