You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ya...@apache.org on 2016/04/09 01:52:05 UTC

[1/6] mesos git commit: Make tests::cluster::Slave more tolerant of start failures.

Repository: mesos
Updated Branches:
  refs/heads/master 6a04e4603 -> b900abff1


Make tests::cluster::Slave more tolerant of start failures.

If cluster::Slave::start() fails, make sure we don't crash in the
destructor.

Review: https://reviews.apache.org/r/45689/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/548da8ff
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/548da8ff
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/548da8ff

Branch: refs/heads/master
Commit: 548da8ff3597935c618b43a82bd432482e5e5fed
Parents: 6a04e46
Author: James Peach <jp...@apache.org>
Authored: Fri Apr 8 14:00:10 2016 -0700
Committer: Jiang Yan Xu <ya...@jxu.me>
Committed: Fri Apr 8 16:27:03 2016 -0700

----------------------------------------------------------------------
 src/tests/cluster.cpp | 5 +++++
 src/tests/cluster.hpp | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/548da8ff/src/tests/cluster.cpp
----------------------------------------------------------------------
diff --git a/src/tests/cluster.cpp b/src/tests/cluster.cpp
index 7e488d2..b4d6910 100644
--- a/src/tests/cluster.cpp
+++ b/src/tests/cluster.cpp
@@ -442,6 +442,11 @@ Slave::~Slave()
     return;
   }
 
+  // Startup didn't complete so don't try to do the full shutdown.
+  if (!containerizer) {
+    return;
+  }
+
   // This extra closure is necessary in order to use `AWAIT` and `ASSERT_*`,
   // as these macros require a void return type.
   [this]() {

http://git-wip-us.apache.org/repos/asf/mesos/blob/548da8ff/src/tests/cluster.hpp
----------------------------------------------------------------------
diff --git a/src/tests/cluster.hpp b/src/tests/cluster.hpp
index 39ca15e..887342a 100644
--- a/src/tests/cluster.hpp
+++ b/src/tests/cluster.hpp
@@ -185,13 +185,13 @@ private:
   bool cleanUpContainersInDestructor = true;
 
   // Master detector that is not managed by this object.
-  mesos::master::detector::MasterDetector* detector;
+  mesos::master::detector::MasterDetector* detector = nullptr;
 
   // Containerizer that is either owned outside of this `Slave` object
   // or by `ownedContainerizer`.  We keep a copy of this pointer
   // because the cleanup logic acts upon the containerizer (regardless
   // of who created it).
-  slave::Containerizer* containerizer;
+  slave::Containerizer* containerizer = nullptr;
 
   // Dependencies that are created by the factory method.
   process::Owned<slave::Containerizer> ownedContainerizer;


[3/6] mesos git commit: Add tests for XFS project quota utilities.

Posted by ya...@apache.org.
Add tests for XFS project quota utilities.

Review: https://reviews.apache.org/r/44947/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/04be1d03
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/04be1d03
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/04be1d03

Branch: refs/heads/master
Commit: 04be1d03ca71513cc966a17f87cd10611d959ac9
Parents: a0e96bd
Author: James Peach <jp...@apache.org>
Authored: Fri Apr 8 14:07:03 2016 -0700
Committer: Jiang Yan Xu <ya...@jxu.me>
Committed: Fri Apr 8 16:46:08 2016 -0700

----------------------------------------------------------------------
 src/Makefile.am                             |   5 +
 src/tests/containerizer/xfs_quota_tests.cpp | 337 +++++++++++++++++++++++
 src/tests/environment.cpp                   |  61 ++--
 3 files changed, 387 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/04be1d03/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index f235a6a..a16c2da 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1967,6 +1967,11 @@ mesos_tests_SOURCES =						\
   tests/containerizer/provisioner_backend_tests.cpp		\
   tests/containerizer/provisioner_docker_tests.cpp
 
+if ENABLE_XFS_DISK_ISOLATOR
+mesos_tests_SOURCES +=						\
+  tests/containerizer/xfs_quota_tests.cpp
+endif
+
 mesos_tests_CPPFLAGS = $(MESOS_CPPFLAGS)
 mesos_tests_CPPFLAGS += -DSOURCE_DIR=\"$(abs_top_srcdir)\"
 mesos_tests_CPPFLAGS += -DBUILD_DIR=\"$(abs_top_builddir)\"

http://git-wip-us.apache.org/repos/asf/mesos/blob/04be1d03/src/tests/containerizer/xfs_quota_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/xfs_quota_tests.cpp b/src/tests/containerizer/xfs_quota_tests.cpp
new file mode 100644
index 0000000..8b0322b
--- /dev/null
+++ b/src/tests/containerizer/xfs_quota_tests.cpp
@@ -0,0 +1,337 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/loop.h>
+
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+
+#include <mesos/mesos.hpp>
+#include <mesos/resources.hpp>
+
+#include <process/gtest.hpp>
+#include <process/pid.hpp>
+
+#include <stout/fs.hpp>
+#include <stout/gtest.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+
+#include "linux/fs.hpp"
+
+#include "master/master.hpp"
+
+#include "slave/constants.hpp"
+#include "slave/flags.hpp"
+#include "slave/slave.hpp"
+
+#include "slave/containerizer/fetcher.hpp"
+#include "slave/containerizer/mesos/containerizer.hpp"
+#include "slave/containerizer/mesos/isolators/xfs/utils.hpp"
+
+#include "tests/environment.hpp"
+#include "tests/mesos.hpp"
+#include "tests/utils.hpp"
+
+using namespace mesos::internal::xfs;
+
+using namespace process;
+
+using std::string;
+using std::vector;
+
+using testing::_;
+using testing::Return;
+
+using mesos::internal::master::Master;
+
+using mesos::internal::slave::Fetcher;
+using mesos::internal::slave::MesosContainerizer;
+using mesos::internal::slave::Slave;
+
+namespace mesos {
+namespace internal {
+namespace tests {
+
+// Builds the `QuotaInfo` a test expects to read back: `limit` is the
+// configured quota and `used` is the space accounted against it.
+static QuotaInfo makeQuotaInfo(Bytes limit, Bytes used)
+{
+  QuotaInfo expected;
+  expected.limit = limit;
+  expected.used = used;
+
+  return expected;
+}
+
+
+// Test fixture providing a scratch XFS filesystem with project quotas
+// enabled. `SetUp` builds the filesystem on a loop device backed by a
+// temporary file, mounts it and makes the mount point the working
+// directory; `TearDown` unmounts it and detaches the loop device.
+class ROOT_XFS_QuotaTest : public MesosTest
+{
+public:
+  virtual void SetUp()
+  {
+    MesosTest::SetUp();
+
+    Try<string> base = environment->mkdtemp();
+    ASSERT_SOME(base) << "Failed to mkdtemp";
+
+    string devPath = path::join(base.get(), "device");
+    string mntPath = path::join(base.get(), "mnt");
+
+    ASSERT_SOME(os::mkdir(mntPath));
+    ASSERT_SOME(mkfile(devPath, Megabytes(40)));
+
+    // Get an unused loop device.
+    Try<string> loop = mkloop();
+    ASSERT_SOME(loop);
+
+    // Attach the loop to a backing file.
+    Try<Subprocess> losetup = subprocess(
+        "losetup " + loop.get() + " " + devPath,
+        Subprocess::PATH("/dev/null"));
+
+    ASSERT_SOME(losetup);
+    AWAIT_READY(losetup->status());
+    ASSERT_SOME_EQ(0, losetup->status().get());
+
+    // Only record the loop device once it is attached, so that
+    // `TearDown` does not try to detach a device we never set up.
+    loopDevice = loop.get();
+    ASSERT_SOME(loopDevice);
+
+    // Make an XFS filesystem (using the force flag). The defaults
+    // should be good enough for tests.
+    Try<Subprocess> mkfs = subprocess(
+        "mkfs.xfs -f " + loopDevice.get(),
+        Subprocess::PATH("/dev/null"));
+
+    ASSERT_SOME(mkfs);
+    AWAIT_READY(mkfs->status());
+    ASSERT_SOME_EQ(0, mkfs->status().get());
+
+    // Mount with project quotas enabled.
+    ASSERT_SOME(fs::mount(
+        loopDevice.get(),
+        mntPath,
+        "xfs",
+        0, // Flags.
+        "prjquota"));
+    mountPoint = mntPath;
+
+    // The tests use paths relative to the XFS mount point.
+    ASSERT_SOME(os::chdir(mountPoint.get()))
+      << "Failed to chdir into '" << mountPoint.get() << "'";
+  }
+
+  virtual void TearDown()
+  {
+    // Make a best effort to tear everything down. We don't make any assertions
+    // here because even if something goes wrong we still want to clean up as
+    // much as we can.
+    if (mountPoint.isSome()) {
+      fs::unmount(mountPoint.get(), MNT_FORCE | MNT_DETACH);
+    }
+
+    if (loopDevice.isSome()) {
+      Try<Subprocess> cmdProcess = subprocess(
+          "losetup -d " + loopDevice.get(),
+          Subprocess::PATH("/dev/null"));
+
+      if (cmdProcess.isSome()) {
+        cmdProcess->status().await(Seconds(15));
+      }
+    }
+
+    MesosTest::TearDown();
+  }
+
+  // Returns the default test slave flags with the work directory moved
+  // onto the XFS mount point.
+  slave::Flags CreateSlaveFlags()
+  {
+    slave::Flags flags = MesosTest::CreateSlaveFlags();
+
+    // We only need an XFS-specific directory for the work directory. We
+    // don't mind that other flags refer to a different temp directory.
+    flags.work_dir = mountPoint.get();
+    return flags;
+  }
+
+  // Creates a new file at `path` (failing if it already exists) and
+  // allocates `size` bytes of storage for it.
+  static Try<Nothing> mkfile(const string& path, Bytes size)
+  {
+    Try<int> fd = os::open(path, O_CREAT | O_RDWR | O_EXCL);
+
+    if (fd.isError()) {
+      return Error(fd.error());
+    }
+
+    // XFS supports posix_fallocate(3), and we depend on it actually
+    // allocating storage in the quota tests. Note that it returns the
+    // error number directly instead of setting errno.
+    if (int error = ::posix_fallocate(fd.get(), 0, size.bytes())) {
+      os::close(fd.get());
+      return Error("posix_fallocate failed: " + os::strerror(error));
+    }
+
+    os::close(fd.get());
+    return Nothing();
+  }
+
+  // Asks the loop control device for a free loop device and returns
+  // its path ("/dev/loop<N>").
+  static Try<string> mkloop()
+  {
+    Try<int> fd = os::open("/dev/loop-control", O_RDWR);
+
+    if (fd.isError()) {
+      return Error(fd.error());
+    }
+
+    // All failure cases here are reported in errno with a -1 return value.
+    int devno = ::ioctl(fd.get(), LOOP_CTL_GET_FREE);
+    if (devno == -1) {
+      // Capture errno before closing the descriptor can overwrite it.
+      ErrnoError error("ioctl(LOOP_CTL_GET_FREE) failed");
+      os::close(fd.get());
+      return error;
+    }
+
+    os::close(fd.get());
+
+    return string("/dev/loop") + stringify(devno);
+  }
+
+  Option<string> loopDevice; // The loop device we attached.
+  Option<string> mountPoint; // XFS filesystem mountpoint.
+};
+
+
+// Verifies that a project quota set on a directory can be read back
+// (with no space used yet) and then cleared.
+TEST_F(ROOT_XFS_QuotaTest, QuotaGetSet)
+{
+  const string directory = "project";
+  const prid_t id = 44;
+  const Bytes quota = Megabytes(44);
+
+  ASSERT_SOME(os::mkdir(directory));
+
+  EXPECT_SOME(setProjectQuota(directory, id, quota));
+
+  Result<QuotaInfo> current = getProjectQuota(directory, id);
+  ASSERT_SOME(current);
+
+  EXPECT_EQ(quota, current.get().limit);
+  EXPECT_EQ(Bytes(0), current.get().used);
+
+  EXPECT_SOME(clearProjectQuota(directory, id));
+}
+
+
+// Verifies that usage is accounted against a project quota and that
+// allocating past the limit fails.
+TEST_F(ROOT_XFS_QuotaTest, QuotaLimit)
+{
+  const string directory = "project";
+  const prid_t id = 55;
+  const Bytes quota = Megabytes(11);
+  const Bytes allocated = Megabytes(10);
+
+  ASSERT_SOME(os::mkdir(directory));
+
+  // Give the project a quota, then move the directory into the project.
+  EXPECT_SOME(setProjectQuota(directory, id, quota));
+  EXPECT_SOME(setProjectId(directory, id));
+
+  // Allocate some storage inside the project directory ...
+  EXPECT_SOME(mkfile(path::join(directory, "file"), allocated));
+
+  // ... and check that the quota record reports exactly that usage.
+  EXPECT_SOME_EQ(
+      makeQuotaInfo(quota, allocated),
+      getProjectQuota(directory, id));
+
+  // Only 1MB of the quota is left, so allocating 2MB more must fail.
+  EXPECT_ERROR(mkfile(path::join(directory, "file2"), Megabytes(2)));
+
+  EXPECT_SOME(clearProjectQuota(directory, id));
+}
+
+
+// Verifies that project IDs can only be set on, or cleared from,
+// existing directories: symlinks, regular files and missing paths
+// must all be rejected.
+TEST_F(ROOT_XFS_QuotaTest, ProjectIdErrors)
+{
+  // Setting project IDs should not work for non-directories.
+  //
+  // `fs::symlink` takes the target first and the link name second, so
+  // this creates a (dangling) link named "symlink". With the arguments
+  // the other way around the link would be called "nowhere" and the
+  // checks below would merely hit a missing path.
+  EXPECT_SOME(::fs::symlink("nowhere", "symlink"));
+  EXPECT_ERROR(setProjectId("symlink", 99));
+  EXPECT_ERROR(clearProjectId("symlink"));
+
+  EXPECT_SOME(mkfile("file", Bytes(1)));
+  EXPECT_ERROR(setProjectId("file", 99));
+  EXPECT_ERROR(clearProjectId("file"));
+
+  // Setting on a missing file should error.
+  EXPECT_ERROR(setProjectId("none", 99));
+  EXPECT_ERROR(clearProjectId("none"));
+}
+
+
+TEST_F(ROOT_XFS_QuotaTest, DirectoryTree)
+{
+  Bytes limit = Megabytes(100);
+  prid_t projectA = 200;
+  prid_t projectB = 400;
+  string rootA = "projectA";
+  string rootB = "projectB";
+
+  // Create rootA with 2MB of data (two 1MB files at different depths).
+  ASSERT_SOME(os::mkdir(path::join(rootA, "depth1/depth2/depth3"), true));
+  EXPECT_SOME(mkfile(path::join(rootA, "depth1/file1"), Megabytes(1)));
+  EXPECT_SOME(mkfile(path::join(rootA, "depth1/depth2/file2"), Megabytes(1)));
+
+  // Create rootB with 1MB of data (a single file).
+  ASSERT_SOME(os::mkdir(rootB));
+  EXPECT_SOME(mkfile(path::join(rootB, "file1"), Megabytes(1)));
+
+  // Create symlinks inside rootA whose targets name rootB and its file.
+  // These should have no effect on the measured quota.
+  EXPECT_SOME(::fs::symlink(
+      path::join(rootB, "file1"), path::join(rootA, "depth1/file1.A")));
+  EXPECT_SOME(::fs::symlink(
+      path::join(rootB, "file1"), path::join(rootA, "depth1/depth2/file2.A")));
+  EXPECT_SOME(::fs::symlink(rootB,
+      path::join(rootA, "depth1/depth2/depth3.A")));
+
+  // Now we want to verify that assigning and removing project IDs is recursive
+  // and does not follow symlinks. For each directory, assign the project ID and
+  // verify the expected quota usage. Then clear it and expect zero usage.
+
+  EXPECT_SOME(setProjectId(rootA, projectA));
+  EXPECT_SOME(setProjectQuota(rootA, projectA, limit));
+
+  EXPECT_SOME_EQ(
+      makeQuotaInfo(limit, Megabytes(2)),
+      getProjectQuota(rootA, projectA));
+
+  EXPECT_SOME(setProjectId(rootB, projectB));
+  EXPECT_SOME(setProjectQuota(rootB, projectB, limit));
+
+  EXPECT_SOME_EQ(
+      makeQuotaInfo(limit, Megabytes(1)),
+      getProjectQuota(rootB, projectB));
+
+  EXPECT_SOME(clearProjectId(rootA));
+
+  EXPECT_SOME_EQ(
+      makeQuotaInfo(limit, Megabytes(0)),
+      getProjectQuota(rootA, projectA));
+
+  EXPECT_SOME(clearProjectId(rootB));
+
+  EXPECT_SOME_EQ(
+      makeQuotaInfo(limit, Megabytes(0)),
+      getProjectQuota(rootB, projectB));
+}
+
+} // namespace tests {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/04be1d03/src/tests/environment.cpp
----------------------------------------------------------------------
diff --git a/src/tests/environment.cpp b/src/tests/environment.cpp
index acadb5b..45ed8f2 100644
--- a/src/tests/environment.cpp
+++ b/src/tests/environment.cpp
@@ -484,38 +484,60 @@ private:
 };
 
 
-class OverlayFSTestFilter : public TestFilter
+class SupportedFilesystemTestFilter : public TestFilter
 {
 public:
-  OverlayFSTestFilter()
+  explicit SupportedFilesystemTestFilter(const string fsname)
   {
 #ifdef __linux__
-    Try<bool> check = fs::overlay::supported();
+    Try<bool> check = (fsname == "overlayfs")
+      ? fs::overlay::supported()
+      : fs::supported(fsname);
+
     if (check.isError()) {
-      overlayfsError = check.error();
+      fsSupportError = check.error();
     } else if (!check.get()) {
-      overlayfsError = Error("Overlayfs is not supported on your systems");
+      fsSupportError = Error(fsname + " is not supported on your systems");
     }
 #else
-    overlayfsError =
-      Error("Overlayfs tests not supported on non-Linux systems");
-#endif // __linux__
-    if (overlayfsError.isSome()) {
+    fsSupportError =
+      Error(fsname + " tests not supported on non-Linux systems");
+#endif
+
+    if (fsSupportError.isSome()) {
       std::cerr
         << "-------------------------------------------------------------\n"
-        << "We cannot run any overlayfs tests because:\n"
-        << overlayfsError.get().message << "\n"
+        << "We cannot run any " << fsname << " tests because:\n"
+        << fsSupportError.get().message << "\n"
         << "-------------------------------------------------------------\n";
     }
   }
 
+  Option<Error> fsSupportError;
+};
+
+
+class OverlayFSFilter : public SupportedFilesystemTestFilter
+{
+public:
+  OverlayFSFilter() : SupportedFilesystemTestFilter("overlayfs") {}
+
   bool disable(const ::testing::TestInfo* test) const
   {
-    return overlayfsError.isSome() && matches(test, "OVERLAYFS_");
+    return fsSupportError.isSome() && matches(test, "OVERLAYFS_");
   }
+};
 
-private:
-  Option<Error> overlayfsError;
+
+class XfsFilter : public SupportedFilesystemTestFilter
+{
+public:
+  XfsFilter() : SupportedFilesystemTestFilter("xfs") {}
+
+  bool disable(const ::testing::TestInfo* test) const
+  {
+    return fsSupportError.isSome() && matches(test, "XFS_");
+  }
 };
 
 
@@ -727,11 +749,12 @@ Environment::Environment(const Flags& _flags) : flags(_flags)
   filters.push_back(Owned<TestFilter>(new NetClsCgroupsFilter()));
   filters.push_back(Owned<TestFilter>(new NetworkIsolatorTestFilter()));
   filters.push_back(Owned<TestFilter>(new NvidiaGpuFilter()));
-  filters.push_back(Owned<TestFilter>(new OverlayFSTestFilter()));
+  filters.push_back(Owned<TestFilter>(new OverlayFSFilter()));
   filters.push_back(Owned<TestFilter>(new PerfCPUCyclesFilter()));
   filters.push_back(Owned<TestFilter>(new PerfFilter()));
   filters.push_back(Owned<TestFilter>(new RootFilter()));
   filters.push_back(Owned<TestFilter>(new UnzipFilter()));
+  filters.push_back(Owned<TestFilter>(new XfsFilter()));
 
   // Construct the filter string to handle system or platform specific tests.
   ::testing::UnitTest* unitTest = ::testing::UnitTest::GetInstance();
@@ -862,8 +885,14 @@ Try<string> Environment::TemporaryDirectoryEventListener::mkdtemp()
     testName = strings::remove(testName, "DISABLED_", strings::PREFIX);
   }
 
+  Option<string> tmpdir = os::getenv("TMPDIR");
+
+  if (tmpdir.isNone()) {
+    tmpdir = "/tmp";
+  }
+
   const string& path =
-    path::join("/tmp", strings::join("_", testCase, testName, "XXXXXX"));
+    path::join(tmpdir.get(), strings::join("_", testCase, testName, "XXXXXX"));
 
   Try<string> mkdtemp = os::mkdtemp(path);
   if (mkdtemp.isSome()) {


[5/6] mesos git commit: Add XFS disk isolator documentation.

Posted by ya...@apache.org.
Add XFS disk isolator documentation.

Review: https://reviews.apache.org/r/44950/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/b900abff
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/b900abff
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/b900abff

Branch: refs/heads/master
Commit: b900abff1648ae397d9819322de95ad99737ce4d
Parents: 255710b
Author: James Peach <jp...@apache.org>
Authored: Fri Apr 8 14:56:12 2016 -0700
Committer: Jiang Yan Xu <ya...@jxu.me>
Committed: Fri Apr 8 16:46:08 2016 -0700

----------------------------------------------------------------------
 CHANGELOG                   |  4 ++++
 docs/configuration.md       | 25 +++++++++++++++++++++++
 docs/mesos-containerizer.md | 43 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/b900abff/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 4337490..1f0527e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -27,6 +27,10 @@ This release contains the following new features:
     `mesos.scheduler`. `mesos.native` still exists, combining both modules for
     backwards compatibility with existing code.
 
+  * [MESOS-4828] - **Experimental** support for a new `xfs/disk` isolator
+    has been added to isolate disk resources more efficiently. Please refer to
+    docs/mesos-containerizer.md for more details.
+
 Deprecations:
   * [MESOS-2281] - Deprecated the plain text format for credentials in favor of
     the JSON format.

http://git-wip-us.apache.org/repos/asf/mesos/blob/b900abff/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index 309a5a0..ba00ec5 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1725,6 +1725,31 @@ isolator. (default: false)
 </tr>
 </table>
 
+*XFS disk isolator flags available when configured with
+`--enable-xfs-disk-isolator`*
+
+<table class="table table-striped">
+  <thead>
+    <tr>
+      <th width="30%">
+        Flag
+      </th>
+      <th>
+        Explanation
+      </th>
+    </tr>
+  </thead>
+<tr>
+  <td>
+    --xfs_project_range=VALUE
+  </td>
+<td>
+The ranges of XFS project IDs that the isolator can use to track disk
+quotas for container sandbox directories. Valid project IDs range from
+1 to max(uint32). (default `[5000-10000]`)
+</td>
+</tr>
+</table>
 
 ## Libprocess Options
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/b900abff/docs/mesos-containerizer.md
----------------------------------------------------------------------
diff --git a/docs/mesos-containerizer.md b/docs/mesos-containerizer.md
index 2fde743..6f40c57 100644
--- a/docs/mesos-containerizer.md
+++ b/docs/mesos-containerizer.md
@@ -85,6 +85,49 @@ The interval between two `du`s can be controlled by the slave flag
 minute. The default interval is 15 seconds.
 
 
+### XFS Disk Isolator
+
+The XFS Disk isolator uses XFS project quotas to track the disk
+space used by each container sandbox and to enforce the corresponding
+disk space allocation. Write operations performed by tasks exceeding
+their disk allocation will fail with an `EDQUOT` error. The task
+will not be terminated by the containerizer.
+
+The XFS Disk isolator is functionally similar to the Posix Disk isolator
+but avoids the cost of repeatedly running `du`.  Though they will
+not interfere with each other, it is not recommended to use them together.
+
+To enable the XFS Disk isolator, append `xfs/disk` to the
+`--isolation` flag when starting the slave.
+
+The XFS Disk isolator requires the sandbox directory to be located
+on an XFS filesystem that is mounted with the `pquota` option. There
+is no need to configure
+[projects](http://man7.org/linux/man-pages/man5/projects.5.html)
+or [projid](http://man7.org/linux/man-pages/man5/projid.5.html)
+files. The range of project IDs given to the `--xfs_project_range`
+flag must not overlap any project IDs allocated for other uses.
+
+The XFS disk isolator does not natively support an accounting-only mode
+like that of the Posix Disk isolator. Quota enforcement can be disabled
+by mounting the filesystem with the `pqnoenforce` mount option.
+
+The [xfs_quota](http://man7.org/linux/man-pages/man8/xfs_quota.8.html)
+command can be used to show the current allocation of project IDs
+and quota. For example:
+
+    $ xfs_quota -x -c "report -a -n -L 5000 -U 1000"
+
+To show which project a file belongs to, use the
+[xfs_io](http://man7.org/linux/man-pages/man8/xfs_io.8.html) command
+to display the `fsxattr.projid` field. For example:
+
+    $ xfs_io -r -c stat /mnt/mesos/
+
+Note that the Posix Disk isolator flags `--enforce_container_disk_quota`
+and `--container_disk_watch_interval` do not apply to the XFS Disk
+isolator.
+
 ### Docker Runtime Isolator
 
 The Docker Runtime isolator is used for supporting runtime


[6/6] mesos git commit: Add utility functions to manipulate XFS project quotas.

Posted by ya...@apache.org.
Add utility functions to manipulate XFS project quotas.

Review: https://reviews.apache.org/r/44946/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a0e96bd2
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a0e96bd2
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a0e96bd2

Branch: refs/heads/master
Commit: a0e96bd22da7a39086600c3186fbad61c554e262
Parents: 0313707
Author: James Peach <jp...@apache.org>
Authored: Fri Apr 8 13:49:16 2016 -0700
Committer: Jiang Yan Xu <ya...@jxu.me>
Committed: Fri Apr 8 16:46:08 2016 -0700

----------------------------------------------------------------------
 src/Makefile.am                                 |   6 +
 .../containerizer/mesos/isolators/xfs/utils.cpp | 384 +++++++++++++++++++
 .../containerizer/mesos/isolators/xfs/utils.hpp |  81 ++++
 3 files changed, 471 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a0e96bd2/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 4375b03..f235a6a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -896,6 +896,12 @@ MESOS_LINUX_FILES +=							\
   slave/containerizer/mesos/provisioner/backends/bind.hpp		\
   slave/containerizer/mesos/provisioner/backends/overlay.hpp
 
+if ENABLE_XFS_DISK_ISOLATOR
+MESOS_LINUX_FILES +=                                                    \
+  slave/containerizer/mesos/isolators/xfs/utils.cpp                     \
+  slave/containerizer/mesos/isolators/xfs/utils.hpp
+endif
+
 MESOS_NETWORK_ISOLATOR_FILES =						\
   linux/routing/handle.cpp						\
   linux/routing/route.cpp						\

http://git-wip-us.apache.org/repos/asf/mesos/blob/a0e96bd2/src/slave/containerizer/mesos/isolators/xfs/utils.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/xfs/utils.cpp b/src/slave/containerizer/mesos/isolators/xfs/utils.cpp
new file mode 100644
index 0000000..9285183
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/xfs/utils.cpp
@@ -0,0 +1,384 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The XFS API headers come from the xfsprogs package. xfsprogs versions
+// earlier than 4.5 contain various internal macros that conflict with
+// libstdc++.
+
+// If ENABLE_GETTEXT is not defined, then the XFS headers will define
+// textdomain() to a while(0) loop. When C++ standard headers try to
+// use textdomain(), compilation errors ensue.
+#define ENABLE_GETTEXT
+#include <xfs/xfs.h>
+#include <xfs/xqm.h>
+#undef ENABLE_GETTEXT
+
+// xfs/platform_defs-x86_64.h defines min() and max() macros which conflict
+// with various min() and max() function definitions.
+#undef min
+#undef max
+
+#include <fts.h>
+
+#include <blkid/blkid.h>
+#include <linux/quota.h>
+#include <sys/quota.h>
+
+#include <stout/check.hpp>
+#include <stout/error.hpp>
+#include <stout/numify.hpp>
+#include <stout/path.hpp>
+
+#include <stout/fs.hpp>
+#include <stout/os.hpp>
+
+#include "slave/containerizer/mesos/isolators/xfs/utils.hpp"
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace xfs {
+
+// The quota API defines space limits in terms of basic
+// blocks (512 bytes).
+static constexpr Bytes BASIC_BLOCK_SIZE = Bytes(512u);
+
+
+// Although XFS itself doesn't define any invalid project IDs,
+// we need a way to know whether or not a project ID was assigned
+// so we use 0 as our sentinel value.
+static constexpr prid_t NON_PROJECT_ID = 0u;
+
+
+static Error nonProjectError()
+{
+  return Error("Invalid project ID '0'");
+}
+
+
+// Opens `path` read-only, close-on-exec and without following a
+// symlink. `stat` is the caller's stat result for `path`; it decides
+// whether O_DIRECTORY has to be added.
+static Try<int> openPath(
+    const string& path,
+    const struct stat& stat)
+{
+  const int baseFlags = O_NOFOLLOW | O_RDONLY | O_CLOEXEC;
+
+  // Directories require O_DIRECTORY.
+  if (S_ISDIR(stat.st_mode)) {
+    return os::open(path, baseFlags | O_DIRECTORY);
+  }
+
+  return os::open(path, baseFlags);
+}
+
+
+// Writes the extended attributes in `attr` to the open file `fd` via
+// XFS_IOC_FSSETXATTR, reporting errno on failure.
+static Try<Nothing> setAttributes(
+    int fd,
+    struct fsxattr& attr)
+{
+  const int result = ::xfsctl(nullptr, fd, XFS_IOC_FSSETXATTR, &attr);
+
+  if (result != -1) {
+    return Nothing();
+  }
+
+  return ErrnoError();
+}
+
+
+// Reads the extended attributes of the open file `fd` via
+// XFS_IOC_FSGETXATTR, reporting errno on failure.
+static Try<struct fsxattr> getAttributes(int fd)
+{
+  struct fsxattr current;
+
+  const int result = ::xfsctl(nullptr, fd, XFS_IOC_FSGETXATTR, &current);
+  if (result == -1) {
+    return ErrnoError();
+  }
+
+  return current;
+}
+
+
+// Looks up the device backing the filesystem on which `path` resides
+// and returns that device's path.
+static Try<string> getDeviceForPath(const string& path)
+{
+  struct stat info;
+  if (::lstat(path.c_str(), &info) == -1) {
+    return ErrnoError("Unable to access '" + path + "'");
+  }
+
+  char* device = blkid_devno_to_devname(info.st_dev);
+  if (device == nullptr) {
+    return ErrnoError("Unable to get device for '" + path + "'");
+  }
+
+  // Copy the name into a string before releasing the C buffer.
+  string result(device);
+  free(device);
+
+  return result;
+}
+
+
+namespace internal {
+
+// Sets both the hard and the soft block limit of `projectId` to
+// `limit` on the device backing `path`. No validation of `projectId`
+// or `limit` happens here.
+static Try<Nothing> setProjectQuota(
+    const string& path,
+    prid_t projectId,
+    Bytes limit)
+{
+  Try<string> device = getDeviceForPath(path);
+  if (device.isError()) {
+    return Error(device.error());
+  }
+
+  // Limits are given in basic blocks; a trailing partial block is
+  // dropped by the integer division.
+  const auto blocks = limit.bytes() / BASIC_BLOCK_SIZE.bytes();
+
+  fs_disk_quota_t record = {0};
+  record.d_version = FS_DQUOT_VERSION;
+
+  // Specify that we are setting a project quota for this ID.
+  record.d_flags = XFS_PROJ_QUOTA;
+  record.d_id = projectId;
+
+  // Functionally only the hard limit is needed; the soft limit is set
+  // to the same value just for consistency.
+  record.d_fieldmask = FS_DQ_BSOFT | FS_DQ_BHARD;
+  record.d_blk_hardlimit = blocks;
+  record.d_blk_softlimit = blocks;
+
+  const int status = ::quotactl(
+      QCMD(Q_XSETQLIM, PRJQUOTA),
+      device.get().c_str(),
+      projectId,
+      reinterpret_cast<caddr_t>(&record));
+
+  if (status == -1) {
+    return ErrnoError("Failed to set quota for project ID " +
+                      stringify(projectId));
+  }
+
+  return Nothing();
+}
+
+
+// Stores `projectId` in the XFS attributes of the file at `path`. The
+// XFS_XFLAG_PROJINHERIT flag is set for a real project ID and cleared
+// for the sentinel ID 0. `stat` is the caller's stat result for `path`.
+static Try<Nothing> setProjectId(
+    const string& path,
+    const struct stat& stat,
+    prid_t projectId)
+{
+  Try<int> fd = openPath(path, stat);
+  if (fd.isError()) {
+    return Error("Failed to open '" + path + "': " + fd.error());
+  }
+
+  Try<struct fsxattr> attr = getAttributes(fd.get());
+  if (attr.isError()) {
+    os::close(fd.get());
+    return Error("Failed to get XFS attributes for '" + path + "': " +
+                 attr.error());
+  }
+
+  struct fsxattr& attributes = attr.get();
+  attributes.fsx_projid = projectId;
+
+  const bool inherit = (projectId != NON_PROJECT_ID);
+  if (inherit) {
+    attributes.fsx_xflags |= XFS_XFLAG_PROJINHERIT;
+  } else {
+    attributes.fsx_xflags &= ~XFS_XFLAG_PROJINHERIT;
+  }
+
+  Try<Nothing> updated = setAttributes(fd.get(), attributes);
+
+  // The descriptor is not needed any more, whether or not the update
+  // succeeded.
+  os::close(fd.get());
+
+  if (updated.isError()) {
+    return Error("Failed to set XFS attributes for '" + path + "': " +
+                 updated.error());
+  }
+
+  return Nothing();
+}
+
+} // namespace internal {
+
+
+// Reads the quota record of `projectId` from the device backing
+// `path`. Returns None when no quota is assigned, and an error for the
+// sentinel project ID 0 or when the lookup fails.
+Result<QuotaInfo> getProjectQuota(
+    const string& path,
+    prid_t projectId)
+{
+  if (projectId == NON_PROJECT_ID) {
+    return nonProjectError();
+  }
+
+  Try<string> device = getDeviceForPath(path);
+  if (device.isError()) {
+    return Error(device.error());
+  }
+
+  fs_disk_quota_t record = {0};
+  record.d_version = FS_DQUOT_VERSION;
+  record.d_flags = XFS_PROJ_QUOTA;
+  record.d_id = projectId;
+
+  // Strictly speaking a Q_XQUOTASYNC should be issued first to get an
+  // accurate accounting. Continually syncing the disks would hurt
+  // performance though, so we accept slightly stale quota information.
+  const int status = ::quotactl(
+      QCMD(Q_XGETQUOTA, PRJQUOTA),
+      device.get().c_str(),
+      projectId,
+      reinterpret_cast<caddr_t>(&record));
+
+  if (status == -1) {
+    return ErrnoError("Failed to get quota for project ID " +
+                      stringify(projectId));
+  }
+
+  // A record with neither a hard limit nor any usage means that no
+  // quota is assigned.
+  const bool unassigned =
+    record.d_blk_hardlimit == 0 && record.d_bcount == 0;
+
+  if (unassigned) {
+    return None();
+  }
+
+  // The kernel reports basic blocks; convert them back to bytes.
+  QuotaInfo info;
+  info.limit = BASIC_BLOCK_SIZE * record.d_blk_hardlimit;
+  info.used = BASIC_BLOCK_SIZE * record.d_bcount;
+
+  return info;
+}
+
+
+// Sets the block quota for `projectId` on the filesystem containing `path`.
+// Rejects NON_PROJECT_ID and any limit smaller than one basic block.
+Try<Nothing> setProjectQuota(
+    const string& path,
+    prid_t projectId,
+    Bytes limit)
+{
+  if (projectId == NON_PROJECT_ID) {
+    return nonProjectError();
+  }
+
+  // A 0 limit deletes the quota record. The limit is converted to whole
+  // basic blocks, so anything below one basic block would become 0 and
+  // delete the record; require at least BASIC_BLOCK_SIZE bytes.
+  if (limit < BASIC_BLOCK_SIZE) {
+    return Error("Quota limit must be >= " + stringify(BASIC_BLOCK_SIZE));
+  }
+
+  return internal::setProjectQuota(path, projectId, limit);
+}
+
+
+// Clears the quota for `projectId` on the filesystem containing `path` by
+// setting a limit of zero bytes. Rejects NON_PROJECT_ID.
+Try<Nothing> clearProjectQuota(
+    const string& path,
+    prid_t projectId)
+{
+  if (projectId == NON_PROJECT_ID) {
+    return nonProjectError();
+  }
+
+  return internal::setProjectQuota(path, projectId, Bytes(0));
+}
+
+
+// Reads the XFS project ID assigned to `directory`.
+//
+// Returns the project ID, None if the path carries NON_PROJECT_ID (i.e. no
+// project has been assigned), or an error if the path cannot be accessed,
+// opened, or queried for its XFS attributes.
+Result<prid_t> getProjectId(
+    const string& directory)
+{
+  struct stat stat;
+
+  // Use lstat(2) so that a symlink is examined itself, not its target.
+  if (::lstat(directory.c_str(), &stat) == -1) {
+    return ErrnoError("Failed to access '" + directory + "'");
+  }
+
+  Try<int> fd = openPath(directory, stat);
+  if (fd.isError()) {
+    return Error("Failed to open '" + directory + "': " + fd.error());
+  }
+
+  // The descriptor is only needed for the attribute query, so close it
+  // before inspecting the result.
+  Try<struct fsxattr> attr = getAttributes(fd.get());
+  os::close(fd.get());
+
+  if (attr.isError()) {
+    return Error("Failed to get XFS attributes for '" + directory + "': " +
+                 attr.error());
+  }
+
+  if (attr->fsx_projid == NON_PROJECT_ID) {
+    return None();
+  }
+
+  return attr->fsx_projid;
+}
+
+
+// Applies `projectId` to `directory` and to every directory and regular
+// file beneath it. The walk does not follow symbolic links (FTS_PHYSICAL)
+// and does not descend into other devices (FTS_XDEV).
+//
+// Returns an error if `directory` is a symlink or not a directory, if the
+// traversal fails, or if the project ID cannot be set on any entry. The
+// walk stops at the first entry that fails.
+static Try<Nothing> setProjectIdRecursively(
+    const string& directory,
+    prid_t projectId)
+{
+  if (os::stat::islink(directory) || !os::stat::isdir(directory)) {
+    return Error(directory + " is not a directory");
+  }
+
+  // fts_open(3) takes a null-terminated array of non-const paths.
+  char* directory_[] = {const_cast<char*>(directory.c_str()), nullptr};
+
+  FTS* tree = ::fts_open(
+      directory_, FTS_NOCHDIR | FTS_PHYSICAL | FTS_XDEV, nullptr);
+  if (tree == nullptr) {
+    return ErrnoError("Failed to open '" + directory + "'");
+  }
+
+  for (FTSENT *node = ::fts_read(tree);
+       node != nullptr; node = ::fts_read(tree)) {
+    // Only pre-order directories and regular files get the project ID.
+    if (node->fts_info == FTS_D || node->fts_info == FTS_F) {
+      Try<Nothing> status = internal::setProjectId(
+          node->fts_path, *node->fts_statp, projectId);
+      if (status.isError()) {
+        ::fts_close(tree);
+        return Error(status.error());
+      }
+    }
+  }
+
+  // fts_read(3) returns NULL with errno set to 0 once the hierarchy is
+  // exhausted, so a non-zero errno here means the traversal itself failed.
+  if (errno != 0) {
+    Error error = ErrnoError();
+    ::fts_close(tree);
+    return error;
+  }
+
+  // Release the traversal handle on the success path as well; otherwise
+  // every successful walk leaks the FTS stream.
+  if (::fts_close(tree) != 0) {
+    return ErrnoError("Failed to close traversal of '" + directory + "'");
+  }
+
+  return Nothing();
+}
+
+
+// Assigns `projectId` to `directory` and everything beneath it. Rejects
+// NON_PROJECT_ID; use clearProjectId() to remove an assignment.
+Try<Nothing> setProjectId(
+    const string& directory,
+    prid_t projectId)
+{
+  if (projectId == NON_PROJECT_ID) {
+    return nonProjectError();
+  }
+
+  return setProjectIdRecursively(directory, projectId);
+}
+
+
+// Resets `directory` and everything beneath it to NON_PROJECT_ID.
+Try<Nothing> clearProjectId(
+    const string& directory)
+{
+  return setProjectIdRecursively(directory, NON_PROJECT_ID);
+}
+
+
+// Checks that `projectRange` is usable for allocation. Returns an Error if
+// the range includes NON_PROJECT_ID, and None otherwise.
+Option<Error> validateProjectIds(const IntervalSet<prid_t>& projectRange)
+{
+  if (projectRange.contains(NON_PROJECT_ID)) {
+    return Error("XFS project ID range contains illegal " +
+                 stringify(NON_PROJECT_ID) + " value");
+  }
+
+  return None();
+}
+
+} // namespace xfs {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/a0e96bd2/src/slave/containerizer/mesos/isolators/xfs/utils.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/xfs/utils.hpp b/src/slave/containerizer/mesos/isolators/xfs/utils.hpp
new file mode 100644
index 0000000..654dc73
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/xfs/utils.hpp
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __XFS_UTILS_HPP__
+#define __XFS_UTILS_HPP__
+
+#include <string>
+
+#include <stout/bytes.hpp>
+#include <stout/interval.hpp>
+#include <stout/nothing.hpp>
+#include <stout/try.hpp>
+
+#include <xfs/xfs_types.h>
+
+namespace mesos {
+namespace internal {
+namespace xfs {
+
+// Snapshot of a project's block quota as reported by getProjectQuota().
+struct QuotaInfo
+{
+  Bytes limit; // Hard block limit, converted to bytes.
+  Bytes used;  // Blocks currently in use, converted to bytes.
+};
+
+
+// Two QuotaInfo values are equal when both the limit and the usage match.
+inline bool operator==(const QuotaInfo& left, const QuotaInfo& right)
+{
+  return left.limit == right.limit && left.used == right.used;
+}
+
+
+// Returns an Error if `projectRange` contains the reserved "no project"
+// ID, and None if the range is acceptable.
+Option<Error> validateProjectIds(const IntervalSet<prid_t>& projectRange);
+
+
+// Returns the quota limit and usage for `projectId` on the filesystem
+// holding `path`, or None if no quota is recorded for the project.
+Result<QuotaInfo> getProjectQuota(
+    const std::string& path,
+    prid_t projectId);
+
+
+// Sets the quota for `projectId` on the filesystem holding `path`. The
+// limit must be at least one basic block.
+Try<Nothing> setProjectQuota(
+    const std::string& path,
+    prid_t projectId,
+    Bytes limit);
+
+
+// Removes the quota for `projectId` on the filesystem holding `path`.
+Try<Nothing> clearProjectQuota(
+    const std::string& path,
+    prid_t projectId);
+
+
+// Returns the project ID assigned to `directory`, or None if it has no
+// project assigned.
+Result<prid_t> getProjectId(
+    const std::string& directory);
+
+
+// Recursively assigns `projectId` to `directory` and its contents.
+Try<Nothing> setProjectId(
+    const std::string& directory,
+    prid_t projectId);
+
+
+// Recursively removes the project assignment from `directory` and its
+// contents.
+Try<Nothing> clearProjectId(
+    const std::string& directory);
+
+} // namespace xfs {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __XFS_UTILS_HPP__


[2/6] mesos git commit: Add autoconf tests for XFS project quotas.

Posted by ya...@apache.org.
Add autoconf tests for XFS project quotas.

Review: https://reviews.apache.org/r/44945/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/03137072
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/03137072
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/03137072

Branch: refs/heads/master
Commit: 031370725d05866f98016dfdba8ebf5448067a22
Parents: 548da8f
Author: James Peach <jp...@apache.org>
Authored: Fri Apr 8 13:48:36 2016 -0700
Committer: Jiang Yan Xu <ya...@jxu.me>
Committed: Fri Apr 8 16:46:01 2016 -0700

----------------------------------------------------------------------
 configure.ac | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/03137072/configure.ac
----------------------------------------------------------------------
diff --git a/configure.ac b/configure.ac
index c693b82..4392909 100644
--- a/configure.ac
+++ b/configure.ac
@@ -258,12 +258,19 @@ AC_ARG_ENABLE([tests-install],
                              [build and install tests and their helper tools
                               default: no]),
               [], [enable_tests_install=no])
-
+# TODO(MESOS-4991): Since network-isolator is an optional feature, it should
+# be enabled with --enable-network-isolator.
 AC_ARG_WITH([network-isolator],
             AS_HELP_STRING([--with-network-isolator],
                            [builds the network isolator]),
             [], [with_network_isolator=no])
 
+AC_ARG_ENABLE([xfs-disk-isolator],
+              AS_HELP_STRING([--enable-xfs-disk-isolator],
+                             [builds the XFS disk isolator
+                             default: no]),
+              [], [enable_xfs_disk_isolator=no])
+
 AC_ARG_ENABLE([libevent],
               AS_HELP_STRING([--enable-libevent],
                              [use libevent instead of libev default: no]),
@@ -938,6 +945,52 @@ AM_CONDITIONAL([WITH_NETWORK_ISOLATOR],
                [test "x$with_network_isolator" = "xyes"])
 
 
+AC_MSG_CHECKING([whether to enable the XFS disk isolator])
+AS_IF([test "x$enable_xfs_disk_isolator" = "xyes"],
+      [AC_MSG_RESULT([yes])],
+      [AC_MSG_RESULT([no])])
+
+AS_IF([test "x$enable_xfs_disk_isolator" = "xyes"], [
+  # We only support XFS on Linux.
+  AS_IF([test "$OS_NAME" = "linux"],
+        [],
+        [AC_MSG_ERROR([no XFS support on $OS_NAME
+-------------------------------------------------------------------
+The XFS disk isolator is only supported on Linux.
+-------------------------------------------------------------------
+  ])])
+
+  # Check for build dependencies for the XFS disk isolator. We only
+  # enable this if all the needed headers and libraries are present.
+  AC_CHECK_HEADERS([xfs/xfs.h xfs/xqm.h linux/quota.h sys/quota.h],
+                   [], [AC_MSG_ERROR([missing XFS quota headers
+-------------------------------------------------------------------
+Please install the Linux kernel headers and xfsprogs development
+packages for XFS disk isolator support.
+-------------------------------------------------------------------
+  ])])
+
+  AC_CHECK_HEADERS([blkid/blkid.h], [], [AC_MSG_ERROR([missing libblkid headers
+-------------------------------------------------------------------
+Please install the libblkid development package for XFS disk
+isolator support.
+-------------------------------------------------------------------
+  ])])
+
+  # Note that AC_SEARCH_LIBS causes libblkid to be added to each binary. In
+  # this case, that is what we want, since the dependency will be in libmesos.
+  AC_SEARCH_LIBS(blkid_devno_to_devname, blkid, [], [AC_MSG_ERROR([missing libblkid
+-------------------------------------------------------------------
+Please install the libblkid package for XFS disk isolator support.
+-------------------------------------------------------------------
+  ])])
+
+  AC_DEFINE([ENABLE_XFS_DISK_ISOLATOR])
+])
+
+AM_CONDITIONAL([ENABLE_XFS_DISK_ISOLATOR], [test "x$enable_xfs_disk_isolator" = "xyes"])
+
+
 # Check if Nvidia GPU support is enabled, and if so, verify we can
 # access the NVML header files and libs.
 if test x"$enable_nvidia_gpu_support" = "xyes"; then


[4/6] mesos git commit: Add XFS disk isolator tests.

Posted by ya...@apache.org.
Add XFS disk isolator tests.

Review: https://reviews.apache.org/r/44949/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/255710b7
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/255710b7
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/255710b7

Branch: refs/heads/master
Commit: 255710b7c95e578c873e1317e3705a55e81b1f61
Parents: 04be1d0
Author: James Peach <jp...@apache.org>
Authored: Fri Apr 8 14:53:56 2016 -0700
Committer: Jiang Yan Xu <ya...@jxu.me>
Committed: Fri Apr 8 16:46:08 2016 -0700

----------------------------------------------------------------------
 src/Makefile.am                                 |   4 +-
 src/slave/containerizer/mesos/containerizer.cpp |   7 +
 .../containerizer/mesos/isolators/xfs/disk.cpp  | 437 +++++++++++++++++++
 .../containerizer/mesos/isolators/xfs/disk.hpp  | 107 +++++
 .../containerizer/mesos/isolators/xfs/utils.cpp |   6 +
 .../containerizer/mesos/isolators/xfs/utils.hpp |   3 +
 src/slave/flags.cpp                             |   7 +
 src/slave/flags.hpp                             |   3 +
 src/tests/containerizer/xfs_quota_tests.cpp     | 425 +++++++++++++++++-
 9 files changed, 997 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index a16c2da..dc8f8e3 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -899,7 +899,9 @@ MESOS_LINUX_FILES +=							\
 if ENABLE_XFS_DISK_ISOLATOR
 MESOS_LINUX_FILES +=                                                    \
   slave/containerizer/mesos/isolators/xfs/utils.cpp                     \
-  slave/containerizer/mesos/isolators/xfs/utils.hpp
+  slave/containerizer/mesos/isolators/xfs/utils.hpp                     \
+  slave/containerizer/mesos/isolators/xfs/disk.cpp                      \
+  slave/containerizer/mesos/isolators/xfs/disk.hpp
 endif
 
 MESOS_NETWORK_ISOLATOR_FILES =						\

http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index a5dd223..c25fa92 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -55,6 +55,10 @@
 
 #include "slave/containerizer/mesos/isolators/posix/disk.hpp"
 
+#if ENABLE_XFS_DISK_ISOLATOR
+#include "slave/containerizer/mesos/isolators/xfs/disk.hpp"
+#endif
+
 #ifdef __linux__
 #include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
 #include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
@@ -215,6 +219,9 @@ Try<MesosContainerizer*> MesosContainerizer::create(
     {"posix/cpu", &PosixCpuIsolatorProcess::create},
     {"posix/mem", &PosixMemIsolatorProcess::create},
     {"posix/disk", &PosixDiskIsolatorProcess::create},
+#if ENABLE_XFS_DISK_ISOLATOR
+    {"xfs/disk", &XfsDiskIsolatorProcess::create},
+#endif
 #ifdef __linux__
     {"cgroups/cpu", &CgroupsCpushareIsolatorProcess::create},
     {"cgroups/mem", &CgroupsMemIsolatorProcess::create},

http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/slave/containerizer/mesos/isolators/xfs/disk.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/xfs/disk.cpp b/src/slave/containerizer/mesos/isolators/xfs/disk.cpp
new file mode 100644
index 0000000..2f65f0a
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/xfs/disk.cpp
@@ -0,0 +1,437 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "slave/containerizer/mesos/isolators/xfs/disk.hpp"
+
+#include <glog/logging.h>
+
+#include <stout/check.hpp>
+#include <stout/foreach.hpp>
+#include <stout/os.hpp>
+
+#include <stout/os/stat.hpp>
+
+#include "slave/paths.hpp"
+
+using std::list;
+using std::string;
+
+using process::Failure;
+using process::Future;
+using process::Owned;
+using process::PID;
+using process::Process;
+using process::Promise;
+
+using mesos::slave::ContainerConfig;
+using mesos::slave::ContainerLaunchInfo;
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Converts the protobuf `ranges` into an interval set of XFS project IDs.
+//
+// Returns an error if either end of any range does not fit in a prid_t.
+// Both ends are checked because silently narrowing an oversized value
+// would select project IDs that the operator never configured.
+static Try<IntervalSet<prid_t>> getIntervalSet(
+    const Value::Ranges& ranges)
+{
+  IntervalSet<prid_t> set;
+
+  for (int i = 0; i < ranges.range_size(); i++) {
+    const Value::Range& range = ranges.range(i);
+
+    if (range.begin() > std::numeric_limits<prid_t>::max()) {
+      return Error("Project ID " + stringify(range.begin()) +
+                   " is out of range");
+    }
+
+    if (range.end() > std::numeric_limits<prid_t>::max()) {
+      return Error("Project ID " + stringify(range.end()) +
+                   " is out of range");
+    }
+
+    // The comma operator on two bounds builds the closed interval
+    // [begin, end].
+    set += (Bound<prid_t>::closed(range.begin()),
+            Bound<prid_t>::closed(range.end()));
+  }
+
+  return set;
+}
+
+
+// Adds up the "disk" resources that this isolator enforces and returns the
+// total, or None if `resources` contains no such resource. Scalar values
+// are interpreted as megabytes.
+static Option<Bytes> getDiskResource(
+    const Resources& resources)
+{
+  Option<Bytes> total = None();
+
+  foreach (const Resource& resource, resources) {
+    const bool isDisk = (resource.name() == "disk");
+
+    // TODO(jpeach): Ignore persistent volume resources. The problem here is
+    // that we need to guarantee that we can track the removal of every
+    // directory for which we assign a project ID. Since destruction of
+    // persistent volumes is not visible to the isolator, we don't want to
+    // risk leaking the project ID, or spuriously reusing it.
+    //
+    // Disk resources that carry any volume are skipped as well.
+    if (!isDisk ||
+        Resources::isPersistentVolume(resource) ||
+        (resource.has_disk() && resource.disk().has_volume())) {
+      continue;
+    }
+
+    const Bytes size = Megabytes(resource.scalar().value());
+
+    if (total.isNone()) {
+      total = size;
+    } else {
+      total.get() += size;
+    }
+  }
+
+  return total;
+}
+
+
+// Factory for the XFS disk isolator. Fails unless the agent work directory
+// is on XFS, the process runs as root, and --xfs_project_range parses into
+// a ranges resource whose project IDs are all valid.
+Try<Isolator*> XfsDiskIsolatorProcess::create(const Flags& flags)
+{
+  const bool onXfs = xfs::pathIsXfs(flags.work_dir);
+  if (!onXfs) {
+    return Error("'" + flags.work_dir + "' is not an XFS filesystem");
+  }
+
+  Result<uid_t> uid = os::getuid();
+  CHECK_SOME(uid) << "getuid(2) doesn't fail";
+
+  const bool isRoot = (uid.get() == 0);
+  if (!isRoot) {
+    return Error("The XFS disk isolator requires running as root.");
+  }
+
+  // Reuse the resource parser to interpret the configured range text.
+  Try<Resource> parsed =
+    Resources::parse("projects", flags.xfs_project_range, "*");
+
+  if (parsed.isError()) {
+    return Error(
+        "Failed to parse XFS project range '" +
+        flags.xfs_project_range +
+        "'");
+  }
+
+  const Resource& projects = parsed.get();
+
+  if (projects.type() != Value::RANGES) {
+    return Error(
+        "Invalid XFS project resource type " +
+        mesos::Value_Type_Name(projects.type()) +
+        ", expecting " +
+        mesos::Value_Type_Name(Value::RANGES));
+  }
+
+  Try<IntervalSet<prid_t>> projectIds = getIntervalSet(projects.ranges());
+  if (projectIds.isError()) {
+    return Error(projectIds.error());
+  }
+
+  Option<Error> invalid = xfs::validateProjectIds(projectIds.get());
+  if (invalid.isSome()) {
+    return Error(invalid.get().message);
+  }
+
+  Owned<MesosIsolatorProcess> process(
+      new XfsDiskIsolatorProcess(flags, projectIds.get()));
+
+  return new MesosIsolator(process);
+}
+
+
+// Stores the agent flags and seeds both the total and the free project ID
+// pools from `projectIds`.
+XfsDiskIsolatorProcess::XfsDiskIsolatorProcess(
+    const Flags& _flags,
+    const IntervalSet<prid_t>& projectIds)
+  : flags(_flags),
+    totalProjectIds(projectIds),
+    freeProjectIds(projectIds)
+{
+  // At the beginning, the free project range is the same as the
+  // configured project range.
+
+  LOG(INFO) << "Allocating XFS project IDs from the range " << totalProjectIds;
+}
+
+
+// Nothing to release explicitly; members clean up after themselves.
+XfsDiskIsolatorProcess::~XfsDiskIsolatorProcess() {}
+
+
+// Rebuilds the isolator's bookkeeping from the sandboxes found on disk.
+//
+// Every sandbox directory that carries a project ID is recorded in `infos`
+// and its ID is removed from the free pool. Sandboxes that belong to
+// neither a recovered container (`states`) nor a known orphan (`orphans`)
+// get a cleanup dispatched. Fails if the sandboxes cannot be enumerated or
+// if any project ID cannot be read.
+Future<Nothing> XfsDiskIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // We don't need to explicitly deal with orphans since we are primarily
+  // concerned with the on-disk state. We scan all the sandbox directories
+  // for project IDs that we have not recovered and make a best effort to
+  // remove all the corresponding on-disk state.
+  Try<std::list<std::string>> sandboxes = os::glob(path::join(
+      paths::getSandboxRootDir(flags.work_dir),
+      "*",
+      "frameworks",
+      "*",
+      "executors",
+      "*",
+      "runs",
+      "*"));
+
+  if (sandboxes.isError()) {
+    return Failure("Failed to scan sandbox directories: " + sandboxes.error());
+  }
+
+  // IDs of the containers the containerizer recovered as running.
+  hashset<ContainerID> alive;
+
+  foreach (const ContainerState& state, states) {
+    alive.insert(state.container_id());
+  }
+
+  foreach (const string& sandbox, sandboxes.get()) {
+    // Skip the "latest" symlink.
+    if (os::stat::islink(sandbox)) {
+      continue;
+    }
+
+    // The last path component of a run directory is the container ID.
+    ContainerID containerId;
+    containerId.set_value(Path(sandbox).basename());
+
+    CHECK(!infos.contains(containerId)) << "ContainerIDs should never collide";
+
+    // We fail the isolator recovery upon failure in any container because
+    // failing to get the project ID usually suggests some fatal issue on the
+    // host.
+    Result<prid_t> projectId = xfs::getProjectId(sandbox);
+    if (projectId.isError()) {
+      return Failure(projectId.error());
+    }
+
+    // If there is no project ID, don't worry about it. This can happen the
+    // first time an operator enables the XFS disk isolator and we recover a
+    // set of containers that we did not isolate.
+    if (projectId.isNone()) {
+      continue;
+    }
+
+    // Track the sandbox and mark its project ID as in use.
+    infos.put(containerId, Owned<Info>(new Info(sandbox, projectId.get())));
+    freeProjectIds -= projectId.get();
+
+    // If this is a known orphan, the containerizer will send a cleanup call
+    // later. If this is a live container, we will manage it. Otherwise, we have
+    // to dispatch a cleanup ourselves.  Note that we don't wait for the result
+    // of the cleanups as we don't want to block agent recovery for unknown
+    // orphans.
+    if (!orphans.contains(containerId) && !alive.contains(containerId)) {
+      dispatch(self(), &XfsDiskIsolatorProcess::cleanup, containerId);
+    }
+  }
+
+  return Nothing();
+}
+
+
+// Allocates a project ID for the container, tags its sandbox directory
+// with it, and then applies the initial quota from the executor resources.
+//
+// The project ID is assigned as early as possible: XFS propagates it to
+// inodes created afterwards, which saves tagging many files by hand later.
+Future<Option<ContainerLaunchInfo>> XfsDiskIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ContainerConfig& containerConfig)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  Option<prid_t> projectId = nextProjectId();
+  if (projectId.isNone()) {
+    return Failure("Failed to assign project ID, range exhausted");
+  }
+
+  const string& sandbox = containerConfig.directory();
+
+  // Record the container before touching the disk so that cleanup() can
+  // still release the project ID if the assignment below fails.
+  infos.put(containerId, Owned<Info>(new Info(sandbox, projectId.get())));
+
+  Try<Nothing> assigned = xfs::setProjectId(sandbox, projectId.get());
+  if (assigned.isError()) {
+    return Failure(
+        "Failed to assign project " + stringify(projectId.get()) + ": " +
+        assigned.error());
+  }
+
+  LOG(INFO) << "Assigned project " << stringify(projectId.get()) << " to '"
+            << sandbox << "'";
+
+  // This isolator has no launch information to contribute, so the quota
+  // update result is mapped to None.
+  return update(containerId, containerConfig.executor_info().resources())
+    .then([]() -> Future<Option<ContainerLaunchInfo>> {
+      return None();
+    });
+}
+
+
+// Nothing is done per process here; `pid` is unused. The call only
+// verifies that the container is one this isolator knows about.
+Future<Nothing> XfsDiskIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (infos.contains(containerId)) {
+    return Nothing();
+  }
+
+  return Failure("Unknown container");
+}
+
+
+// Returns a default-constructed future (presumably one that stays pending,
+// i.e. no limitation is ever reported -- confirm against libprocess).
+Future<ContainerLimitation> XfsDiskIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  // We have nothing to do here, since the XFS quota is enforcing
+  // the limitation.
+  return Future<ContainerLimitation>();
+}
+
+
+// Applies the disk quota implied by `resources` to the container's XFS
+// project.
+//
+// Containers this isolator does not track are ignored rather than treated
+// as a fatal invariant violation: recover() deliberately skips sandboxes
+// that have no project ID (e.g. containers launched before the isolator
+// was enabled), and such containers can still receive resource updates.
+// This mirrors how cleanup() handles unknown containers.
+//
+// Returns a Failure only if the quota could not be written.
+Future<Nothing> XfsDiskIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  if (!infos.contains(containerId)) {
+    LOG(INFO) << "Ignoring update for unknown container " << containerId;
+    return Nothing();
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  Option<Bytes> needed = getDiskResource(resources);
+  if (needed.isNone()) {
+    // TODO(jpeach) If there's no disk resource attached, we should set the
+    // minimum quota (1 block), since a zero quota would be unconstrained.
+    LOG(WARNING) << "Ignoring quota update with no disk resources";
+    return Nothing();
+  }
+
+  // Only update the disk quota if it has changed.
+  if (needed.get() != info->quota) {
+    Try<Nothing> status =
+      xfs::setProjectQuota(info->directory, info->projectId, needed.get());
+
+    if (status.isError()) {
+      return Failure("Failed to update quota for project " +
+                     stringify(info->projectId) + ": " + status.error());
+    }
+
+    // Remember the applied value so that identical updates are skipped.
+    info->quota = needed.get();
+
+    LOG(INFO) << "Set quota on container " << containerId
+              << " for project " << info->projectId
+              << " to " << info->quota;
+  }
+
+  return Nothing();
+}
+
+
+// Reports the disk limit and disk usage of the container's XFS project.
+// The statistics are left empty when no quota record exists, and the call
+// fails for unknown containers or when the quota cannot be read.
+Future<ResourceStatistics> XfsDiskIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  Result<xfs::QuotaInfo> quota =
+    xfs::getProjectQuota(info->directory, info->projectId);
+
+  if (quota.isError()) {
+    return Failure(quota.error());
+  }
+
+  ResourceStatistics statistics;
+
+  if (quota.isNone()) {
+    return statistics;
+  }
+
+  const xfs::QuotaInfo& current = quota.get();
+  statistics.set_disk_limit_bytes(current.limit.bytes());
+  statistics.set_disk_used_bytes(current.used.bytes());
+
+  return statistics;
+}
+
+
+// Remove all the quota state that was created for this container. We
+// make a best effort to remove all the state we can, so we keep going
+// even if one operation fails so that we can remove subsequent state.
+//
+// Unknown containers are ignored. The project ID goes back to the free
+// pool only when both the quota and the on-disk project ID were cleared;
+// otherwise the call fails and the ID stays out of circulation.
+Future<Nothing> XfsDiskIsolatorProcess::cleanup(const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    LOG(INFO) << "Ignoring cleanup for unknown container " << containerId;
+    return Nothing();
+  }
+
+  // Take a copy of the Info we are removing so that we can use it
+  // to construct the Failure message if necessary.
+  const Info info = *infos[containerId];
+
+  infos.erase(containerId);
+
+  LOG(INFO) << "Removing project ID " << info.projectId
+            << " from '" << info.directory << "'";
+
+  // Failures below are logged and not returned immediately, so that the
+  // remaining cleanup steps still run.
+  Try<Nothing> quotaStatus = xfs::clearProjectQuota(
+      info.directory, info.projectId);
+
+  if (quotaStatus.isError()) {
+    LOG(ERROR) << "Failed to clear quota for '"
+               << info.directory << "': " << quotaStatus.error();
+  }
+
+  Try<Nothing> projectStatus = xfs::clearProjectId(info.directory);
+  if (projectStatus.isError()) {
+    LOG(ERROR) << "Failed to remove project ID "
+               << info.projectId
+               << " from '" << info.directory << "': "
+               << projectStatus.error();
+  }
+
+  // If we failed to remove the on-disk project ID we can't reclaim it
+  // because the quota would then be applied across two containers. This
+  // would be a project ID leak, but we could recover it at GC time if
+  // that was visible to isolators.
+  if (quotaStatus.isError() || projectStatus.isError()) {
+    // Make sure the ID is not handed out again.
+    freeProjectIds -= info.projectId;
+    return Failure("Failed to cleanup '" + info.directory + "'");
+  } else {
+    returnProjectId(info.projectId);
+    return Nothing();
+  }
+}
+
+
+// Hands out the lowest project ID of the first free interval and removes
+// it from the free pool. Returns None once the pool is exhausted.
+Option<prid_t> XfsDiskIsolatorProcess::nextProjectId()
+{
+  if (!freeProjectIds.empty()) {
+    const prid_t lowest = freeProjectIds.begin()->lower();
+    freeProjectIds -= lowest;
+
+    return lowest;
+  }
+
+  return None();
+}
+
+// Puts `projectId` back into the free pool, provided it belongs to the
+// configured range.
+void XfsDiskIsolatorProcess::returnProjectId(
+    prid_t projectId)
+{
+  // An ID outside the total range must not be recycled. That situation
+  // arises when the operator changed the configured range and a container
+  // from the old range was recovered.
+  if (!totalProjectIds.contains(projectId)) {
+    return;
+  }
+
+  freeProjectIds += projectId;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/slave/containerizer/mesos/isolators/xfs/disk.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/xfs/disk.hpp b/src/slave/containerizer/mesos/isolators/xfs/disk.hpp
new file mode 100644
index 0000000..822de65
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/xfs/disk.hpp
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __XFS_DISK_ISOLATOR_HPP__
+#define __XFS_DISK_ISOLATOR_HPP__
+
+#include <string>
+
+#include <process/owned.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/duration.hpp>
+#include <stout/hashmap.hpp>
+
+#include "slave/flags.hpp"
+#include "slave/state.hpp"
+
+#include "slave/containerizer/mesos/isolator.hpp"
+
+#include "slave/containerizer/mesos/isolators/xfs/utils.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Isolator that enforces container disk limits with XFS project quotas.
+// Each container sandbox is tagged with a project ID drawn from a
+// configured range, and the quota is set on that project.
+class XfsDiskIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  // Validates the environment and flags, then builds the isolator.
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~XfsDiskIsolatorProcess();
+
+  // Typed PID of this process, used when dispatching to ourselves.
+  process::PID<XfsDiskIsolatorProcess> self() const
+  {
+    return process::PID<XfsDiskIsolatorProcess>(this);
+  }
+
+  // Rebuilds the project ID bookkeeping from the sandboxes on disk.
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  // Assigns a project ID to the sandbox and applies the initial quota.
+  virtual process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare(
+      const ContainerID& containerId,
+      const mesos::slave::ContainerConfig& containerConfig);
+
+  // Only checks that the container is known; `pid` is not used.
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  // Returns a default-constructed future; enforcement is left to XFS.
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  // Updates the project quota to match the disk resources.
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  // Reports the disk limit and usage from the project quota.
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  // Clears the quota and project ID and recycles the ID on success.
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  // Private: instances are created through create().
+  XfsDiskIsolatorProcess(
+      const Flags& flags,
+      const IntervalSet<prid_t>& projectIds);
+
+  // Take the next project ID from the unallocated pool.
+  Option<prid_t> nextProjectId();
+
+  // Return this project ID to the unallocated pool.
+  void returnProjectId(prid_t projectId);
+
+  // Per-container state tracked by the isolator.
+  struct Info
+  {
+    explicit Info(const std::string& _directory, prid_t _projectId)
+      : directory(_directory), quota(0),  projectId(_projectId) {}
+
+    const std::string directory; // Sandbox directory carrying the project.
+    Bytes quota;                 // Last quota applied; starts at 0.
+    const prid_t projectId;      // Project ID assigned to the sandbox.
+  };
+
+  const Flags flags;
+
+  // The full configured range of project IDs.
+  const IntervalSet<prid_t> totalProjectIds;
+
+  // The project IDs that are currently available for allocation.
+  IntervalSet<prid_t> freeProjectIds;
+
+  // Tracked containers, keyed by container ID.
+  hashmap<ContainerID, process::Owned<Info>> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __XFS_DISK_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/slave/containerizer/mesos/isolators/xfs/utils.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/xfs/utils.cpp b/src/slave/containerizer/mesos/isolators/xfs/utils.cpp
index 9285183..92914af 100644
--- a/src/slave/containerizer/mesos/isolators/xfs/utils.cpp
+++ b/src/slave/containerizer/mesos/isolators/xfs/utils.cpp
@@ -379,6 +379,12 @@ Option<Error> validateProjectIds(const IntervalSet<prid_t>& projectRange)
   return None();
 }
 
+
+bool pathIsXfs(const std::string& path)
+{
+  return ::platform_test_xfs_path(path.c_str()) == 1;
+}
+
 } // namespace xfs {
 } // namespace internal {
 } // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/slave/containerizer/mesos/isolators/xfs/utils.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/xfs/utils.hpp b/src/slave/containerizer/mesos/isolators/xfs/utils.hpp
index 654dc73..7602fe3 100644
--- a/src/slave/containerizer/mesos/isolators/xfs/utils.hpp
+++ b/src/slave/containerizer/mesos/isolators/xfs/utils.hpp
@@ -46,6 +46,9 @@ inline bool operator==(const QuotaInfo& left, const QuotaInfo& right)
 Option<Error> validateProjectIds(const IntervalSet<prid_t>& projectRange);
 
 
+bool pathIsXfs(const std::string& path);
+
+
 Result<QuotaInfo> getProjectQuota(
     const std::string& path,
     prid_t projectId);

http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/slave/flags.cpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index 7164afe..dd7bc9a 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -776,4 +776,11 @@ mesos::internal::slave::Flags::Flags()
       "The symbol name of the master detector to use. This symbol\n"
       "should exist in a module specified through the --modules flag.\n"
       "Cannot be used in conjunction with --master.");
+
+#if ENABLE_XFS_DISK_ISOLATOR
+  add(&Flags::xfs_project_range,
+      "xfs_project_range",
+      "The ranges of XFS project IDs to use for tracking directory quotas",
+      "[5000-10000]");
+#endif
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index 4236b7f..300db49 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -144,6 +144,9 @@ public:
   Duration qos_correction_interval_min;
   Duration oversubscribed_resources_interval;
   Option<std::string> master_detector;
+#if ENABLE_XFS_DISK_ISOLATOR
+  std::string xfs_project_range;
+#endif
 };
 
 } // namespace slave {

http://git-wip-us.apache.org/repos/asf/mesos/blob/255710b7/src/tests/containerizer/xfs_quota_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/xfs_quota_tests.cpp b/src/tests/containerizer/xfs_quota_tests.cpp
index 8b0322b..61ea2e5 100644
--- a/src/tests/containerizer/xfs_quota_tests.cpp
+++ b/src/tests/containerizer/xfs_quota_tests.cpp
@@ -36,8 +36,8 @@
 
 #include "master/master.hpp"
 
-#include "slave/constants.hpp"
 #include "slave/flags.hpp"
+#include "slave/paths.hpp"
 #include "slave/slave.hpp"
 
 #include "slave/containerizer/fetcher.hpp"
@@ -62,8 +62,11 @@ using mesos::internal::master::Master;
 
 using mesos::internal::slave::Fetcher;
 using mesos::internal::slave::MesosContainerizer;
+using mesos::internal::slave::MesosContainerizerProcess;
 using mesos::internal::slave::Slave;
 
+using mesos::master::detector::MasterDetector;
+
 namespace mesos {
 namespace internal {
 namespace tests {
@@ -159,6 +162,7 @@ public:
     // We only need an XFS-specific directory for the work directory. We
     // don't mind that other flags refer to a different temp directory.
     flags.work_dir = mountPoint.get();
+    flags.isolation = "xfs/disk";
     return flags;
   }
 
@@ -275,6 +279,10 @@ TEST_F(ROOT_XFS_QuotaTest, ProjectIdErrors)
 }
 
 
+// Verify that directories are isolated with respect to XFS quotas. We
+// create two trees which have symlinks into each other. If we followed
+// the symlinks when applying the project IDs to the directories, then the
+// quotas would end up being incorrect.
 TEST_F(ROOT_XFS_QuotaTest, DirectoryTree)
 {
   Bytes limit = Megabytes(100);
@@ -332,6 +340,421 @@ TEST_F(ROOT_XFS_QuotaTest, DirectoryTree)
       getProjectQuota(rootB, projectB));
 }
 
+
+// Verify that a task that tries to consume more space than it has requested
+// is only allowed to consume exactly the assigned resources. We tell dd
+// to write 2MB but only give it 1MB of resources and (roughly) verify that
+// it exits with a failure (that should be a write error).
+TEST_F(ROOT_XFS_QuotaTest, DiskUsageExceedsQuota)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave =
+    StartSlave(detector.get(), CreateSlaveFlags());
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_FALSE(offers.get().empty());
+
+  const Offer& offer = offers.get()[0];
+
+  // Create a task which requests 1MB disk, but actually tries to
+  // write 2MB to disk.
+  TaskInfo task = createTask(
+      offer.slave_id(),
+      Resources::parse("cpus:1;mem:128;disk:1").get(),
+      "dd if=/dev/zero of=file bs=1048576 count=2");
+
+  Future<TaskStatus> status1;
+  Future<TaskStatus> status2;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status1))
+    .WillOnce(FutureArg<1>(&status2));
+
+  driver.launchTasks(offer.id(), {task});
+
+  AWAIT_READY(status1);
+  EXPECT_EQ(task.task_id(), status1.get().task_id());
+  EXPECT_EQ(TASK_RUNNING, status1.get().state());
+
+  AWAIT_READY(status2);
+  EXPECT_EQ(task.task_id(), status2.get().task_id());
+  EXPECT_EQ(TASK_FAILED, status2.get().state());
+
+  // Unlike the posix/disk isolator, the reason for task failure
+  // should be that dd got an IO error.
+  EXPECT_EQ(TaskStatus::SOURCE_EXECUTOR, status2.get().source());
+  EXPECT_EQ("Command exited with status 1", status2.get().message());
+
+  driver.stop();
+  driver.join();
+}
+
+
+// Verify that we can get accurate resource statistics from the XFS
+// disk isolator.
+TEST_F(ROOT_XFS_QuotaTest, ResourceStatistics)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Fetcher fetcher;
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  slave::Flags flags = CreateSlaveFlags();
+
+  Try<MesosContainerizer*> _containerizer =
+    MesosContainerizer::create(flags, true, &fetcher);
+
+  ASSERT_SOME(_containerizer);
+  Owned<MesosContainerizer> containerizer(_containerizer.get());
+
+  Try<Owned<cluster::Slave>> slave =
+    StartSlave(detector.get(), containerizer.get(), flags);
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(_, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(_, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return());      // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_FALSE(offers.get().empty());
+
+  Offer offer = offers.get()[0];
+
+  // Create a task that tries to write 4MB with a 3MB disk allocation but
+  // sleeps rather than failing. We verify that the allocated disk is filled.
+  TaskInfo task = createTask(
+      offer.slave_id(),
+      Resources::parse("cpus:1;mem:128;disk:3").get(),
+      "dd if=/dev/zero of=file bs=1048576 count=4 || sleep 1000");
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status))
+    .WillRepeatedly(Return()); // Ignore subsequent updates.
+
+  driver.launchTasks(offers.get()[0].id(), {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(task.task_id(), status.get().task_id());
+  EXPECT_EQ(TASK_RUNNING, status.get().state());
+
+  Future<hashset<ContainerID>> containers = containerizer.get()->containers();
+  AWAIT_READY(containers);
+  ASSERT_EQ(1u, containers.get().size());
+
+  ContainerID containerId = *(containers.get().begin());
+  Timeout timeout = Timeout::in(Seconds(5));
+
+  while (true) {
+    Future<ResourceStatistics> usage = containerizer.get()->usage(containerId);
+    AWAIT_READY(usage);
+
+    ASSERT_TRUE(usage.get().has_disk_limit_bytes());
+    EXPECT_EQ(Megabytes(3), Bytes(usage.get().disk_limit_bytes()));
+
+    if (usage.get().has_disk_used_bytes()) {
+      // Usage must always be <= the limit.
+      EXPECT_LE(usage.get().disk_used_bytes(), usage.get().disk_limit_bytes());
+
+      // Usage may lag behind the limit at first, but it must eventually
+      // reach the limit and never exceed it.
+      if (usage.get().disk_used_bytes() >= usage.get().disk_limit_bytes()) {
+        EXPECT_EQ(
+            usage.get().disk_used_bytes(), usage.get().disk_limit_bytes());
+        EXPECT_EQ(Megabytes(3), Bytes(usage.get().disk_used_bytes()));
+        break;
+      }
+    }
+
+    ASSERT_FALSE(timeout.expired());
+    os::sleep(Milliseconds(1));
+  }
+
+  driver.stop();
+  driver.join();
+}
+
+
+// In this test, the framework is not checkpointed. This ensures that when we
+// stop the slave, the executor is killed and we will need to recover the
+// working directories without getting any checkpointed recovery state.
+TEST_F(ROOT_XFS_QuotaTest, NoCheckpointRecovery)
+{
+  slave::Flags flags = CreateSlaveFlags();
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(_, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(_, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_FALSE(offers.get().empty());
+
+  Offer offer = offers.get()[0];
+
+  TaskInfo task = createTask(
+      offer.slave_id(),
+      Resources::parse("cpus:1;mem:128;disk:1").get(),
+      "dd if=/dev/zero of=file bs=1048576 count=1; sleep 1000");
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status))
+    .WillOnce(Return());
+
+  driver.launchTasks(offer.id(), {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(task.task_id(), status.get().task_id());
+  EXPECT_EQ(TASK_RUNNING, status.get().state());
+
+  Future<ResourceUsage> usage1 =
+    process::dispatch(slave.get()->pid, &Slave::usage);
+  AWAIT_READY(usage1);
+
+  // We should have 1 executor using resources.
+  ASSERT_EQ(1, usage1.get().executors().size());
+  EXPECT_EQ(Megabytes(1), usage1->executors(0).statistics().disk_limit_bytes());
+  EXPECT_EQ(Megabytes(1), usage1->executors(0).statistics().disk_used_bytes());
+
+  // Restart the slave.
+  slave.get()->terminate();
+
+  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
+    FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
+
+  // As in the filesystem isolator tests, wait until the containerizer
+  // has cleaned up the orphans before expecting the project IDs to be
+  // removed. Intercept the dispatch before the slave can issue it.
+  Future<Nothing> _recover =
+    FUTURE_DISPATCH(_, &MesosContainerizerProcess::___recover);
+
+  slave = StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(_recover);
+  AWAIT_READY(slaveReregisteredMessage);
+
+  Future<ResourceUsage> usage2 =
+    process::dispatch(slave.get()->pid, &Slave::usage);
+  AWAIT_READY(usage2);
+
+  // We should have no executors left because we didn't checkpoint.
+  ASSERT_EQ(0, usage2.get().executors().size());
+
+  Try<std::list<std::string>> sandboxes = os::glob(path::join(
+      slave::paths::getSandboxRootDir(mountPoint.get()),
+      "*",
+      "frameworks",
+      "*",
+      "executors",
+      "*",
+      "runs",
+      "*"));
+
+  ASSERT_SOME(sandboxes);
+
+  // One sandbox and one symlink.
+  ASSERT_EQ(2u, sandboxes->size());
+
+  // Scan the remaining sandboxes and make sure that no projects are assigned.
+  foreach (const string& sandbox, sandboxes.get()) {
+    // Skip the "latest" symlink.
+    if (os::stat::islink(sandbox)) {
+      continue;
+    }
+
+    EXPECT_NONE(xfs::getProjectId(sandbox));
+  }
+
+  driver.stop();
+  driver.join();
+}
+
+
+// In this test, the framework is checkpointed so we expect the executor to
+// persist across the slave restart and to have the same resource usage before
+// and after.
+TEST_F(ROOT_XFS_QuotaTest, CheckpointRecovery)
+{
+  slave::Flags flags = CreateSlaveFlags();
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave =
+    StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_checkpoint(true);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(_, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(_, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_FALSE(offers.get().empty());
+
+  Offer offer = offers.get()[0];
+
+  TaskInfo task = createTask(
+      offer.slave_id(),
+      Resources::parse("cpus:1;mem:128;disk:1").get(),
+      "dd if=/dev/zero of=file bs=1048576 count=1; sleep 1000");
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status));
+
+  driver.launchTasks(offer.id(), {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(task.task_id(), status.get().task_id());
+  EXPECT_EQ(TASK_RUNNING, status.get().state());
+
+  Future<ResourceUsage> usage1 =
+    process::dispatch(slave.get()->pid, &Slave::usage);
+  AWAIT_READY(usage1);
+
+  // We should have 1 executor using resources.
+  ASSERT_EQ(1, usage1.get().executors().size());
+
+  // Restart the slave.
+  slave.get()->terminate();
+
+  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
+    FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
+
+  slave = StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  // Wait for the slave to re-register.
+  AWAIT_READY(slaveReregisteredMessage);
+
+  Future<ResourceUsage> usage2 =
+    process::dispatch(slave.get()->pid, &Slave::usage);
+  AWAIT_READY(usage2);
+
+  // We should still have 1 executor using resources after the restart.
+  ASSERT_EQ(1, usage2.get().executors().size());
+
+  Try<std::list<std::string>> sandboxes = os::glob(path::join(
+      slave::paths::getSandboxRootDir(mountPoint.get()),
+      "*",
+      "frameworks",
+      "*",
+      "executors",
+      "*",
+      "runs",
+      "*"));
+
+  ASSERT_SOME(sandboxes);
+
+  // One sandbox and one symlink.
+  ASSERT_EQ(2u, sandboxes->size());
+
+  // Scan the remaining sandboxes. We ought to still have project IDs
+  // assigned to them all.
+  foreach (const string& sandbox, sandboxes.get()) {
+    // Skip the "latest" symlink.
+    if (os::stat::islink(sandbox)) {
+      continue;
+    }
+
+    EXPECT_SOME(xfs::getProjectId(sandbox));
+  }
+
+  driver.stop();
+  driver.join();
+}
+
+
+TEST_F(ROOT_XFS_QuotaTest, IsolatorFlags)
+{
+  slave::Flags flags;
+
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  // work_dir must be an XFS filesystem.
+  flags = CreateSlaveFlags();
+  flags.work_dir = "/proc";
+  ASSERT_ERROR(StartSlave(detector.get(), flags));
+
+  // 0 is an invalid project ID.
+  flags = CreateSlaveFlags();
+  flags.xfs_project_range = "[0-10]";
+  ASSERT_ERROR(StartSlave(detector.get(), flags));
+
+  // Project IDs are 32 bit.
+  flags = CreateSlaveFlags();
+  flags.xfs_project_range = "[100-1099511627776]";
+  ASSERT_ERROR(StartSlave(detector.get(), flags));
+
+  // Project IDs must be a range; a non-numeric string is rejected.
+  flags = CreateSlaveFlags();
+  flags.xfs_project_range = "foo";
+  ASSERT_ERROR(StartSlave(detector.get(), flags));
+
+  // Project IDs must be a range; a bare single value is rejected.
+  flags = CreateSlaveFlags();
+  flags.xfs_project_range = "100";
+  ASSERT_ERROR(StartSlave(detector.get(), flags));
+}
+
 } // namespace tests {
 } // namespace internal {
 } // namespace mesos {