You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ch...@apache.org on 2018/07/02 20:54:45 UTC

[1/4] mesos git commit: Added MESOS-8830 to the 1.6.1 CHANGELOG.

Repository: mesos
Updated Branches:
  refs/heads/1.6.x 131d88788 -> ccc1c128d


Added MESOS-8830 to the 1.6.1 CHANGELOG.


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ccc1c128
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ccc1c128
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ccc1c128

Branch: refs/heads/1.6.x
Commit: ccc1c128d1339c529e7ef0a3888bb351db2f8894
Parents: a3452eb
Author: Chun-Hung Hsiao <ch...@mesosphere.io>
Authored: Fri Jun 29 12:09:26 2018 -0700
Committer: Chun-Hung Hsiao <ch...@mesosphere.io>
Committed: Mon Jul 2 13:54:17 2018 -0700

----------------------------------------------------------------------
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/ccc1c128/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index af74f26..d3ebbda 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,6 +11,7 @@ Release Notes - Mesos - Version 1.6.1
   * [MESOS-3790] - ZooKeeper connection should retry on `EAI_NONAME`.
   * [MESOS-8106] - Docker fetcher plugin unsupported scheme failure message is not accurate.
   * [MESOS-8786] - CgroupIsolatorProcess accesses subsystem processes directly.
+  * [MESOS-8830] - Agent gc on old slave sandboxes could empty persistent volume data
   * [MESOS-8871] - Agent may fail to recover if the agent dies before image store cache checkpointed.
   * [MESOS-8904] - Master crash when removing quota.
   * [MESOS-8906] - `UriDiskProfileAdaptor` fails to update profile selectors.


[4/4] mesos git commit: Unmounted any mount points in gc paths.

Posted by ch...@apache.org.
Unmounted any mount points in gc paths.

In various corner cases, agent may not get chance to properly unmount
persistent volumes mounted inside an executor's sandbox. When GC later
gets to these sandbox directories, permanent data loss can happen (see
MESOS-8830).

Currently, the only mounts in the host mount namespace under the sandbox
directories are persistent volumes, so this diff added protection to
unmount any dangling mount points before calling `rmdir` on the
directory.

NOTE: this means agent will not garbage collect any path if it cannot
read its own `mountinfo` table.

Review: https://reviews.apache.org/r/67264


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/fbdb8009
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/fbdb8009
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/fbdb8009

Branch: refs/heads/1.6.x
Commit: fbdb8009fe1fb33c16ac35b9f3d13c350c85fe0d
Parents: 131d887
Author: Zhitao Li <zh...@gmail.com>
Authored: Tue May 22 10:55:50 2018 -0700
Committer: Chun-Hung Hsiao <ch...@mesosphere.io>
Committed: Mon Jul 2 13:54:17 2018 -0700

----------------------------------------------------------------------
 src/local/local.cpp       |  2 +-
 src/slave/gc.cpp          | 74 ++++++++++++++++++++++++++++++++++++++----
 src/slave/gc.hpp          |  2 +-
 src/slave/gc_process.hpp  |  7 ++--
 src/slave/main.cpp        |  2 +-
 src/tests/cluster.cpp     |  2 +-
 src/tests/gc_tests.cpp    |  6 ++--
 src/tests/mesos.cpp       |  3 +-
 src/tests/mesos.hpp       |  2 +-
 src/tests/slave_tests.cpp |  6 ++--
 10 files changed, 85 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/local/local.cpp
----------------------------------------------------------------------
diff --git a/src/local/local.cpp b/src/local/local.cpp
index afff546..5b7bb59 100644
--- a/src/local/local.cpp
+++ b/src/local/local.cpp
@@ -427,7 +427,7 @@ PID<Master> launch(const Flags& flags, Allocator* _allocator)
         << slaveFlags.runtime_dir << "': " << mkdir.error();
     }
 
-    garbageCollectors->push_back(new GarbageCollector());
+    garbageCollectors->push_back(new GarbageCollector(slaveFlags.work_dir));
     taskStatusUpdateManagers->push_back(
         new TaskStatusUpdateManager(slaveFlags));
     fetchers->push_back(new Fetcher(slaveFlags));

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/slave/gc.cpp
----------------------------------------------------------------------
diff --git a/src/slave/gc.cpp b/src/slave/gc.cpp
index 390b35e..8770d88 100644
--- a/src/slave/gc.cpp
+++ b/src/slave/gc.cpp
@@ -25,6 +25,7 @@
 
 #include <process/metrics/metrics.hpp>
 
+#include <stout/adaptor.hpp>
 #include <stout/foreach.hpp>
 #include <stout/lambda.hpp>
 
@@ -32,6 +33,10 @@
 
 #include "logging/logging.hpp"
 
+#ifdef __linux__
+#include "linux/fs.hpp"
+#endif
+
 #include "slave/gc_process.hpp"
 
 using namespace process;
@@ -193,12 +198,69 @@ void GarbageCollectorProcess::remove(const Timeout& removalTime)
 
     Counter _succeeded = metrics.path_removals_succeeded;
     Counter _failed = metrics.path_removals_failed;
+    const string _workDir = workDir;
 
-    auto rmdirs = [_succeeded, _failed, infos]() {
+    auto rmdirs =
+      [_succeeded, _failed, _workDir, infos]() mutable -> Future<Nothing> {
       // Make mutable copies of the counters to work around MESOS-7907.
       Counter succeeded = _succeeded;
       Counter failed = _failed;
 
+#ifdef __linux__
+      // Clear any possible persistent volume mount points in `infos`. See
+      // MESOS-8830.
+      Try<fs::MountInfoTable> mountTable = fs::MountInfoTable::read();
+      if (mountTable.isError()) {
+        LOG(ERROR) << "Skipping any path deletion because of failure on read "
+                      "MountInfoTable for agent process: "
+                   << mountTable.error();
+
+        foreach (const Owned<PathInfo>& info, infos) {
+          info->promise.fail(mountTable.error());
+          ++failed;
+        }
+
+        return Failure(mountTable.error());
+      }
+
+      foreach (const fs::MountInfoTable::Entry& entry,
+               adaptor::reverse(mountTable->entries)) {
+        // Ignore mounts whose targets are not under `workDir`.
+        if (!strings::startsWith(
+                path::join(entry.target, ""),
+                path::join(_workDir, ""))) {
+                continue;
+        }
+
+        for (auto it = infos.begin(); it != infos.end(); ) {
+          const Owned<PathInfo>& info = *it;
+          // TODO(zhitao): Validate that both `info->path` and `workDir` are
+          // real paths.
+          if (strings::startsWith(
+                path::join(entry.target, ""), path::join(info->path, ""))) {
+            LOG(WARNING)
+                << "Unmounting dangling mount point '" << entry.target
+                << "' of persistent volume '" << entry.root
+                << "' inside garbage collected path '" << info->path << "'";
+
+            Try<Nothing> unmount = fs::unmount(entry.target);
+            if (unmount.isError()) {
+              LOG(WARNING) << "Skipping deletion of '"
+                           << info->path << "' because unmount failed on '"
+                           << entry.target << "': " << unmount.error();
+
+              info->promise.fail(unmount.error());
+              ++failed;
+              it = infos.erase(it);
+              continue;
+            }
+          }
+
+          it++;
+        }
+      }
+#endif // __linux__
+
       foreach (const Owned<PathInfo>& info, infos) {
         // Run the removal operation with 'continueOnError = true'.
         // It's possible for tasks and isolators to lay down files
@@ -241,11 +303,9 @@ void GarbageCollectorProcess::remove(const Timeout& removalTime)
 }
 
 
-void  GarbageCollectorProcess::_remove(const Future<Nothing>& result,
-                                       const list<Owned<PathInfo>> infos)
+void GarbageCollectorProcess::_remove(const Future<Nothing>& result,
+                                      const list<Owned<PathInfo>> infos)
 {
-  CHECK_READY(result);
-
   // Remove path records from `paths` and `timeouts` data structures.
   foreach (const Owned<PathInfo>& info, infos) {
     CHECK(paths.remove(timeouts[info->path], info));
@@ -268,9 +328,9 @@ void GarbageCollectorProcess::prune(const Duration& d)
 }
 
 
-GarbageCollector::GarbageCollector()
+GarbageCollector::GarbageCollector(const string& workDir)
 {
-  process = new GarbageCollectorProcess();
+  process = new GarbageCollectorProcess(workDir);
   spawn(process);
 }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/slave/gc.hpp
----------------------------------------------------------------------
diff --git a/src/slave/gc.hpp b/src/slave/gc.hpp
index df40165..0275c73 100644
--- a/src/slave/gc.hpp
+++ b/src/slave/gc.hpp
@@ -42,7 +42,7 @@ class GarbageCollectorProcess;
 class GarbageCollector
 {
 public:
-  GarbageCollector();
+  explicit GarbageCollector(const std::string& workDir);
   virtual ~GarbageCollector();
 
   // Schedules the specified path for removal after the specified

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/slave/gc_process.hpp
----------------------------------------------------------------------
diff --git a/src/slave/gc_process.hpp b/src/slave/gc_process.hpp
index 20374ad..84c83d3 100644
--- a/src/slave/gc_process.hpp
+++ b/src/slave/gc_process.hpp
@@ -45,9 +45,10 @@ class GarbageCollectorProcess :
     public process::Process<GarbageCollectorProcess>
 {
 public:
-  GarbageCollectorProcess()
+  explicit GarbageCollectorProcess(const std::string& _workDir)
     : ProcessBase(process::ID::generate("agent-garbage-collector")),
-      metrics(this) {}
+      metrics(this),
+      workDir(_workDir) {}
 
   virtual ~GarbageCollectorProcess();
 
@@ -97,6 +98,8 @@ private:
     process::metrics::PullGauge path_removals_pending;
   } metrics;
 
+  const std::string workDir;
+
   // Store all the timeouts and corresponding paths to delete.
   // NOTE: We are using Multimap here instead of Multihashmap, because
   // we need the keys of the map (deletion time) to be sorted.

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/slave/main.cpp
----------------------------------------------------------------------
diff --git a/src/slave/main.cpp b/src/slave/main.cpp
index ca35668..7179389 100644
--- a/src/slave/main.cpp
+++ b/src/slave/main.cpp
@@ -551,7 +551,7 @@ int main(int argc, char** argv)
   }
 
   Files* files = new Files(READONLY_HTTP_AUTHENTICATION_REALM, authorizer_);
-  GarbageCollector* gc = new GarbageCollector();
+  GarbageCollector* gc = new GarbageCollector(flags.work_dir);
   TaskStatusUpdateManager* taskStatusUpdateManager =
     new TaskStatusUpdateManager(flags);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/tests/cluster.cpp
----------------------------------------------------------------------
diff --git a/src/tests/cluster.cpp b/src/tests/cluster.cpp
index c071da6..3f2d25a 100644
--- a/src/tests/cluster.cpp
+++ b/src/tests/cluster.cpp
@@ -486,7 +486,7 @@ Try<process::Owned<Slave>> Slave::create(
 
   // If the garbage collector is not provided, create a default one.
   if (gc.isNone()) {
-    slave->gc.reset(new slave::GarbageCollector());
+    slave->gc.reset(new slave::GarbageCollector(flags.work_dir));
   }
 
   // If the resource estimator is not provided, create a default one.

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/tests/gc_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/gc_tests.cpp b/src/tests/gc_tests.cpp
index 619ed22..ed0fef3 100644
--- a/src/tests/gc_tests.cpp
+++ b/src/tests/gc_tests.cpp
@@ -96,7 +96,7 @@ class GarbageCollectorTest : public TemporaryDirectoryTest {};
 
 TEST_F(GarbageCollectorTest, Schedule)
 {
-  GarbageCollector gc;
+  GarbageCollector gc("work_dir");
 
   // Make some temporary files to gc.
   const string& file1 = "file1";
@@ -180,7 +180,7 @@ TEST_F(GarbageCollectorTest, Schedule)
 
 TEST_F(GarbageCollectorTest, Unschedule)
 {
-  GarbageCollector gc;
+  GarbageCollector gc("work_dir");
 
   // Attempt to unschedule a file that is not scheduled.
   AWAIT_ASSERT_FALSE(gc.unschedule("bogus"));
@@ -229,7 +229,7 @@ TEST_F(GarbageCollectorTest, Unschedule)
 
 TEST_F(GarbageCollectorTest, Prune)
 {
-  GarbageCollector gc;
+  GarbageCollector gc("work_dir");
 
   // Make some temporary files to prune.
   const string& file1 = "file1";

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/tests/mesos.cpp
----------------------------------------------------------------------
diff --git a/src/tests/mesos.cpp b/src/tests/mesos.cpp
index d3c87c2..c0ab2f7 100644
--- a/src/tests/mesos.cpp
+++ b/src/tests/mesos.cpp
@@ -691,7 +691,8 @@ MockAuthorizer::MockAuthorizer()
 MockAuthorizer::~MockAuthorizer() {}
 
 
-MockGarbageCollector::MockGarbageCollector()
+MockGarbageCollector::MockGarbageCollector(const string& workDir)
+    : slave::GarbageCollector(workDir)
 {
   EXPECT_CALL(*this, unschedule(_)).WillRepeatedly(Return(true));
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/tests/mesos.hpp
----------------------------------------------------------------------
diff --git a/src/tests/mesos.hpp b/src/tests/mesos.hpp
index b945edf..8545190 100644
--- a/src/tests/mesos.hpp
+++ b/src/tests/mesos.hpp
@@ -3331,7 +3331,7 @@ public:
 class MockGarbageCollector : public slave::GarbageCollector
 {
 public:
-  MockGarbageCollector();
+  explicit MockGarbageCollector(const std::string& workDir);
   virtual ~MockGarbageCollector();
 
   // The default action is to always return `true`.

http://git-wip-us.apache.org/repos/asf/mesos/blob/fbdb8009/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 9e39884..8e5c31c 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -4878,10 +4878,10 @@ TEST_F(SlaveTest, RemoveExecutorUponFailedTaskGroupLaunch)
 
   Owned<MasterDetector> detector = master.get()->createDetector();
 
-  MockGarbageCollector mockGarbageCollector;
-
   slave::Flags slaveFlags = CreateSlaveFlags();
 
+  MockGarbageCollector mockGarbageCollector(slaveFlags.work_dir);
+
   // Start a mock slave.
   Try<Owned<cluster::Slave>> slave =
     StartSlave(detector.get(), &mockGarbageCollector, slaveFlags, true);
@@ -5036,8 +5036,8 @@ TEST_F(SlaveTest, LaunchTasksReorderUnscheduleGC)
   ASSERT_SOME(master);
 
   Owned<MasterDetector> detector = master.get()->createDetector();
-  MockGarbageCollector mockGarbageCollector;
   slave::Flags slaveFlags = CreateSlaveFlags();
+  MockGarbageCollector mockGarbageCollector(slaveFlags.work_dir);
 
   // Start a mock slave.
   Try<Owned<cluster::Slave>> slave =


[3/4] mesos git commit: Rewrote the `ROOT_BusyMountPoint` test to reflect updated behavior.

Posted by ch...@apache.org.
Rewrote the `ROOT_BusyMountPoint` test to reflect updated behavior.

The current `ROOT_BusyMountPoint` test would fail because we added
support for unmounting dangling mount points in directory to gc. This
patch rewrote this test to reflect that after unmounting, the gc
succeeded, directory was gone and metrics were correctly reported.

Review: https://reviews.apache.org/r/67421


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a3452eb0
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a3452eb0
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a3452eb0

Branch: refs/heads/1.6.x
Commit: a3452eb001ed6870f6e8e1e7c69bce9623f005cf
Parents: 2e309ab
Author: Zhitao Li <zh...@gmail.com>
Authored: Fri Jun 1 16:58:34 2018 -0700
Committer: Chun-Hung Hsiao <ch...@mesosphere.io>
Committed: Mon Jul 2 13:54:17 2018 -0700

----------------------------------------------------------------------
 src/tests/gc_tests.cpp | 90 ++++++++++++++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a3452eb0/src/tests/gc_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/gc_tests.cpp b/src/tests/gc_tests.cpp
index ed0fef3..4f288cf 100644
--- a/src/tests/gc_tests.cpp
+++ b/src/tests/gc_tests.cpp
@@ -899,25 +899,29 @@ TEST_F(GarbageCollectorIntegrationTest, Unschedule)
 
 
 #ifdef __linux__
-// In Mesos it's possible for tasks and isolators to lay down files
-// that are not deletable by GC. This test runs a task that creates a busy
-// mount point which is not directly deletable by GC. We verify that
-// GC deletes all files that it's able to delete in the face of such errors.
-TEST_F(GarbageCollectorIntegrationTest, ROOT_BusyMountPoint)
+// This test creates a persistent volume and runs a task which mounts the volume
+// inside the sandbox, to simulate a dangling mount which agent failed to
+// clean up (see MESOS-8830). We verify that GC process will unmount the
+// dangling mount point successfully and report success in metrics.
+TEST_F(GarbageCollectorIntegrationTest, ROOT_DanglingMount)
 {
   Try<Owned<cluster::Master>> master = StartMaster();
   ASSERT_SOME(master);
 
   slave::Flags flags = CreateSlaveFlags();
+  flags.resources = strings::format("disk(%s):1024", DEFAULT_TEST_ROLE).get();
+
   Owned<MasterDetector> detector = master.get()->createDetector();
   Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
 
   ASSERT_SOME(slave);
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_roles(0, DEFAULT_TEST_ROLE);
 
   MockScheduler sched;
   MesosSchedulerDriver driver(
       &sched,
-      DEFAULT_FRAMEWORK_INFO,
+      frameworkInfo,
       master.get()->pid,
       DEFAULT_CREDENTIAL);
 
@@ -936,21 +940,33 @@ TEST_F(GarbageCollectorIntegrationTest, ROOT_BusyMountPoint)
   AWAIT_READY(offers);
   ASSERT_FALSE(offers->empty());
 
-  const Offer& offer = offers.get()[0];
-  const SlaveID& slaveId = offer.slave_id();
+  const Offer& offer = offers->at(0);
+
+  string persistenceId = "persistence-id";
+  string containerPath = "path";
+
+  Resource volume = createPersistentVolume(
+      Megabytes(1024),
+      DEFAULT_TEST_ROLE,
+      persistenceId,
+      containerPath,
+      None(),
+      None(),
+      frameworkInfo.principal());
+
+  string mountPoint = "dangling";
+
+  string hostPath = slave::paths::getPersistentVolumePath(
+      flags.work_dir, DEFAULT_TEST_ROLE, persistenceId);
 
-  // The busy mount point goes before the regular file in GC's
-  // directory traversal due to their names. This makes sure that
-  // an error occurs before all deletable files are GCed.
-  string mountPoint = "test1";
-  string regularFile = "test2.txt";
+  string fileInVolume = "foo.txt";
 
   TaskInfo task = createTask(
-      slaveId,
-      Resources::parse("cpus:1;mem:128;disk:1").get(),
-      "touch "+ regularFile + "; "
+      offer.slave_id(),
+      Resources(volume),
+      "touch "+ path::join(containerPath, fileInVolume) + "; "
       "mkdir " + mountPoint + "; "
-      "mount --bind " + mountPoint + " " + mountPoint,
+      "mount --bind " + hostPath + " " + mountPoint,
       None(),
       "test-task123",
       "test-task123");
@@ -966,7 +982,15 @@ TEST_F(GarbageCollectorIntegrationTest, ROOT_BusyMountPoint)
   Future<Nothing> schedule = FUTURE_DISPATCH(
       _, &GarbageCollectorProcess::schedule);
 
-  driver.launchTasks(offer.id(), {task});
+  Future<Nothing> ack1 =
+    FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement);
+
+  Future<Nothing> ack2 =
+    FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement);
+
+  driver.acceptOffers(
+      {offer.id()},
+      {CREATE(volume), LAUNCH({task})});
 
   AWAIT_READY(status0);
   EXPECT_EQ(task.task_id(), status0->task_id());
@@ -980,23 +1004,21 @@ TEST_F(GarbageCollectorIntegrationTest, ROOT_BusyMountPoint)
   executorId.set_value("test-task123");
   Result<string> _sandbox = os::realpath(slave::paths::getExecutorLatestRunPath(
       flags.work_dir,
-      slaveId,
+      offer.slave_id(),
       frameworkId.get(),
       executorId));
   ASSERT_SOME(_sandbox);
   string sandbox = _sandbox.get();
   EXPECT_TRUE(os::exists(sandbox));
 
-  // Wait for the task to create these paths.
+  // Wait for the task to create the dangling mount point.
   Timeout timeout = Timeout::in(process::TEST_AWAIT_TIMEOUT);
   while (!os::exists(path::join(sandbox, mountPoint)) ||
-         !os::exists(path::join(sandbox, regularFile)) ||
          !timeout.expired()) {
     os::sleep(Milliseconds(10));
   }
 
   ASSERT_TRUE(os::exists(path::join(sandbox, mountPoint)));
-  ASSERT_TRUE(os::exists(path::join(sandbox, regularFile)));
 
   AWAIT_READY(status2);
   ASSERT_EQ(task.task_id(), status2->task_id());
@@ -1004,15 +1026,16 @@ TEST_F(GarbageCollectorIntegrationTest, ROOT_BusyMountPoint)
 
   AWAIT_READY(schedule);
 
+  ASSERT_TRUE(os::exists(path::join(sandbox, mountPoint)));
+  ASSERT_TRUE(os::exists(path::join(hostPath, fileInVolume)));
+
+  AWAIT_READY(schedule);
+
   Clock::pause();
   Clock::advance(flags.gc_delay);
   Clock::settle();
 
-  EXPECT_TRUE(os::exists(sandbox));
-  EXPECT_TRUE(os::exists(path::join(sandbox, mountPoint)));
-  EXPECT_FALSE(os::exists(path::join(sandbox, regularFile)));
-
-  // Verify that GC metrics show that a path removal failed.
+  // Verify that GC metrics showes no failure.
   JSON::Object metrics = Metrics();
 
   ASSERT_EQ(1u, metrics.values.count("gc/path_removals_pending"));
@@ -1022,18 +1045,19 @@ TEST_F(GarbageCollectorIntegrationTest, ROOT_BusyMountPoint)
   EXPECT_SOME_EQ(
       0u,
       metrics.at<JSON::Number>("gc/path_removals_pending"));
+
   EXPECT_SOME_EQ(
       0u,
-      metrics.at<JSON::Number>("gc/path_removals_succeeded"));
+      metrics.at<JSON::Number>("gc/path_removals_failed"));
 
-  // The sandbox path removal failure will cascade to cause failures to
-  // remove the executor and framework directories. For testing purposes
-  // it is sufficient to verify that some failure was detected.
-  ASSERT_SOME(metrics.at<JSON::Number>("gc/path_removals_failed"));
+  ASSERT_SOME(metrics.at<JSON::Number>("gc/path_removals_succeeded"));
   EXPECT_GT(
-      metrics.at<JSON::Number>("gc/path_removals_failed")->as<unsigned>(),
+      metrics.at<JSON::Number>("gc/path_removals_succeeded")->as<unsigned>(),
       0u);
 
+  ASSERT_FALSE(os::exists(path::join(sandbox, mountPoint)));
+  ASSERT_TRUE(os::exists(path::join(hostPath, fileInVolume)));
+
   Clock::resume();
   driver.stop();
   driver.join();


[2/4] mesos git commit: Skipped metric for non existing paths in gc.

Posted by ch...@apache.org.
Skipped metric for non existing paths in gc.

Previously, agent gc would increment the "failed" counter if the path
does not exist, but this should not be an issue. This patch skipped such
paths in both "failed" and "succeeded" counters.

Review: https://reviews.apache.org/r/67423


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2e309ab9
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2e309ab9
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2e309ab9

Branch: refs/heads/1.6.x
Commit: 2e309ab9ea258f7814e41bf7747bf012f83db9f7
Parents: fbdb800
Author: Zhitao Li <zh...@gmail.com>
Authored: Fri Jun 1 22:09:11 2018 -0700
Committer: Chun-Hung Hsiao <ch...@mesosphere.io>
Committed: Mon Jul 2 13:54:17 2018 -0700

----------------------------------------------------------------------
 src/slave/gc.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2e309ab9/src/slave/gc.cpp
----------------------------------------------------------------------
diff --git a/src/slave/gc.cpp b/src/slave/gc.cpp
index 8770d88..407f6b2 100644
--- a/src/slave/gc.cpp
+++ b/src/slave/gc.cpp
@@ -271,11 +271,17 @@ void GarbageCollectorProcess::remove(const Timeout& removalTime)
         Try<Nothing> rmdir = os::rmdir(info->path, true, true, true);
 
         if (rmdir.isError()) {
-          LOG(WARNING) << "Failed to delete '" << info->path << "': "
-                       << rmdir.error();
-          info->promise.fail(rmdir.error());
-
-          ++failed;
+          // TODO(zhitao): Change return value type of `rmdir` to
+          // `Try<Nothing, ErrnoError>` and check error type instead.
+          if (rmdir.error() == ErrnoError(ENOENT).message) {
+            LOG(INFO) << "Skipped '" << info->path << "' which does not exist";
+          } else {
+            LOG(WARNING) << "Failed to delete '" << info->path << "': "
+                         << rmdir.error();
+            info->promise.fail(rmdir.error());
+
+            ++failed;
+          }
         } else {
           LOG(INFO) << "Deleted '" << info->path << "'";
           info->promise.set(rmdir.get());