You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by mp...@apache.org on 2017/08/24 23:30:57 UTC

mesos git commit: Fixed agent downgrades for reservation refinement.

Repository: mesos
Updated Branches:
  refs/heads/master 09b1625b5 -> a955458df


Fixed agent downgrades for reservation refinement.

Previously, `checkpoint(path, resources)` was overloaded such that it
would automatically downgrade the resources before being checkpointed
on the agent. However, `checkpoint(path, protobuf_containing_resources)`
did not work correctly since we didn't recursively look within
the messages to downgrade the resources. Ideally, we would use
protobuf reflection to ensure that these are handled automatically.
For now, we attempt to get all of the places where resources are
present within a message.

Review: https://reviews.apache.org/r/61880/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a955458d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a955458d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a955458d

Branch: refs/heads/master
Commit: a955458dfec77967df8437805f15f5ed81c16b5c
Parents: 09b1625
Author: Michael Park <mp...@apache.org>
Authored: Thu Aug 24 15:25:49 2017 -0700
Committer: Michael Park <mp...@apache.org>
Committed: Thu Aug 24 15:25:49 2017 -0700

----------------------------------------------------------------------
 src/slave/slave.cpp                | 73 +++++++++++++++++++++++++++++----
 src/slave/state.hpp                | 16 --------
 src/tests/slave_recovery_tests.cpp |  4 +-
 3 files changed, 67 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a955458d/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index eac896c..3e880b3 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -1208,7 +1208,18 @@ void Slave::registered(
       const string path = paths::getSlaveInfoPath(metaDir, slaveId);
 
       VLOG(1) << "Checkpointing SlaveInfo to '" << path << "'";
-      CHECK_SOME(state::checkpoint(path, info));
+
+      {
+        // The `SlaveInfo.resources` does not include dynamic reservations,
+        // which means it cannot contain reservation refinements, so
+        // `downgradeResources` should always succeed.
+        SlaveInfo info_ = info;
+
+        Try<Nothing> result = downgradeResources(info_.mutable_resources());
+        CHECK_SOME(result);
+
+        CHECK_SOME(state::checkpoint(path, info_));
+      }
 
       // Setup a timer so that the agent attempts to re-register if it
       // doesn't receive a ping from the master for an extended period
@@ -3433,10 +3444,28 @@ void Slave::checkpointResources(vector<Resource> _checkpointedResources)
   // Since we commit the checkpoint after all operations are successful,
   // we avoid a case of inconsistency between the master and the agent if
   // the agent restarts during handling of CheckpointResourcesMessage.
-  CHECK_SOME(state::checkpoint(
-      paths::getResourcesTargetPath(metaDir),
-      newCheckpointedResources))
-    << "Failed to checkpoint resources target " << newCheckpointedResources;
+
+  {
+    // If the checkpointed resources don't have reservation refinements,
+    // checkpoint them on the agent in "pre-reservation-refinement" format
+    // for backward compatibility with old agents. If downgrading is
+    // not possible without losing information, checkpoint the resources in
+    // the "post-reservation-refinement" format. We ignore the return
+    // value of `downgradeResources` because for now, we checkpoint the result
+    // either way.
+    //
+    // TODO(mpark): Do something smarter with the result once something
+    // like agent capability requirements is introduced.
+    RepeatedPtrField<Resource> newCheckpointedResources_ =
+      newCheckpointedResources;
+
+    downgradeResources(&newCheckpointedResources_);
+
+    CHECK_SOME(state::checkpoint(
+        paths::getResourcesTargetPath(metaDir),
+        newCheckpointedResources_))
+      << "Failed to checkpoint resources target " << newCheckpointedResources_;
+  }
 
   Try<Nothing> syncResult = syncCheckpointedResources(
       newCheckpointedResources);
@@ -7748,7 +7777,22 @@ void Executor::checkpointExecutor()
       slave->metaDir, slave->info.id(), frameworkId, id);
 
   VLOG(1) << "Checkpointing ExecutorInfo to '" << path << "'";
-  CHECK_SOME(state::checkpoint(path, info));
+
+  {
+    // If the checkpointed resources don't have reservation refinements,
+    // checkpoint them on the agent in "pre-reservation-refinement" format
+    // for backward compatibility with old agents. If downgrading is
+    // not possible without losing information, checkpoint the resources in
+    // the "post-reservation-refinement" format. We ignore the return
+    // value of `downgradeResources` because for now, we checkpoint the result
+    // either way.
+    //
+    // TODO(mpark): Do something smarter with the result once something
+    // like agent capability requirements is introduced.
+    ExecutorInfo info_ = info;
+    downgradeResources(info_.mutable_resources());
+    CHECK_SOME(state::checkpoint(path, info_));
+  }
 
   // Create the meta executor directory.
   // NOTE: This creates the 'latest' symlink in the meta directory.
@@ -7776,7 +7820,22 @@ void Executor::checkpointTask(const Task& task)
       task.task_id());
 
   VLOG(1) << "Checkpointing TaskInfo to '" << path << "'";
-  CHECK_SOME(state::checkpoint(path, task));
+
+  {
+    // If the checkpointed resources don't have reservation refinements,
+    // checkpoint them on the agent in "pre-reservation-refinement" format
+    // for backward compatibility with old agents. If downgrading is
+    // not possible without losing information, checkpoint the resources in
+    // the "post-reservation-refinement" format. We ignore the return
+    // value of `downgradeResources` because for now, we checkpoint the result
+    // either way.
+    //
+    // TODO(mpark): Do something smarter with the result once something
+    // like agent capability requirements is introduced.
+    Task task_ = task;
+    downgradeResources(task_.mutable_resources());
+    CHECK_SOME(state::checkpoint(path, task_));
+  }
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/a955458d/src/slave/state.hpp
----------------------------------------------------------------------
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 18c4319..8d088d0 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -100,22 +100,6 @@ Try<Nothing> checkpoint(
   return ::protobuf::write(path, messages);
 }
 
-
-inline Try<Nothing> checkpoint(
-    const std::string& path,
-    const Resources& resources_)
-{
-  google::protobuf::RepeatedPtrField<Resource> resources = resources_;
-
-  // We ignore the `Try` from `downgradeResources` here because for now,
-  // we checkpoint the result either way.
-  // TODO(mpark): Do something smarter with the result once something like
-  // an agent recovery capability is introduced.
-  downgradeResources(&resources);
-
-  return checkpoint(path, resources);
-}
-
 }  // namespace internal {
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/a955458d/src/tests/slave_recovery_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_recovery_tests.cpp b/src/tests/slave_recovery_tests.cpp
index 9aa0a51..c2e9e86 100644
--- a/src/tests/slave_recovery_tests.cpp
+++ b/src/tests/slave_recovery_tests.cpp
@@ -138,7 +138,7 @@ TEST_F(SlaveStateTest, CheckpointProtobufMessage)
 TEST_F(SlaveStateTest, CheckpointRepeatedProtobufMessages)
 {
   // Checkpoint resources.
-  const Resources expected =
+  const google::protobuf::RepeatedPtrField<Resource> expected =
     Resources::parse("cpus:2;mem:512;cpus(role):4;mem(role):1024").get();
 
   const string file = "resources-file";
@@ -149,8 +149,6 @@ TEST_F(SlaveStateTest, CheckpointRepeatedProtobufMessages)
 
   ASSERT_SOME(actual);
 
-  convertResourceFormat(&actual.get(), POST_RESERVATION_REFINEMENT);
-
   EXPECT_SOME_EQ(expected, actual);
 }