You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2016/03/30 02:34:19 UTC

[1/2] mesos git commit: Added flag to specify which Nvidia GPU devices to make available.

Repository: mesos
Updated Branches:
  refs/heads/master e79e690cc -> 77fae968f


Added flag to specify which Nvidia GPU devices to make available.

Review: https://reviews.apache.org/r/44365/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a04b8b6a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a04b8b6a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a04b8b6a

Branch: refs/heads/master
Commit: a04b8b6a5efa2661d5e8f45944b0e121d508f0fa
Parents: e79e690
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Mar 29 17:19:11 2016 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue Mar 29 17:19:11 2016 -0700

----------------------------------------------------------------------
 docs/configuration.md | 15 +++++++++++++++
 src/slave/flags.cpp   | 12 ++++++++++++
 src/slave/flags.hpp   |  3 +++
 3 files changed, 30 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a04b8b6a/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index 6fc1875..75c9a0a 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1322,6 +1322,21 @@ directory. (default: /usr/local/libexec/mesos)
 </tr>
 <tr>
   <td>
+    --nvidia_gpu_devices=VALUE
+  </td>
+  <td>
+A comma-separated list of Nvidia GPU devices. When `gpus` is specified
+in the `--resources` flag, this flag determines which GPU devices will
+be made available. The devices should be listed as numbers that
+correspond to Nvidia's NVML device enumeration (as seen by running the
+command `nvidia-smi` on an Nvidia GPU equipped system). The GPUs
+listed will only be isolated if the `--isolation` flag contains the
+string `cgroups/devices/gpus/nvidia`. This flag will only work if
+Mesos has been configured with `--enable-nvidia-gpu-support`.
+  </td>
+</tr>
+<tr>
+  <td>
     --network_cni_plugins_dir=VALUE
   </td>
   <td>

http://git-wip-us.apache.org/repos/asf/mesos/blob/a04b8b6a/src/slave/flags.cpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index 0c13ab6..8868e1e 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -530,6 +530,18 @@ mesos::internal::slave::Flags::Flags()
       "before it kills that instance.",
       Seconds(0));
 
+#ifdef ENABLE_NVIDIA_GPU_SUPPORT
+  add(&Flags::nvidia_gpu_devices,
+      "nvidia_gpu_devices",
+      "A comma-separated list of Nvidia GPU devices. When `gpus` is\n"
+      "specified in the `--resources` flag, this flag determines which GPU\n"
+      "devices will be made available. The devices should be listed as\n"
+      "numbers that correspond to Nvidia's NVML device enumeration (as\n"
+      "seen by running the command `nvidia-smi` on an Nvidia GPU\n"
+      "equipped system).  The GPUs listed will only be isolated if the\n"
+      "`--isolation` flag contains the string `cgroups/devices/gpus/nvidia`.");
+#endif // ENABLE_NVIDIA_GPU_SUPPORT
+
 #ifdef WITH_NETWORK_ISOLATOR
   add(&Flags::ephemeral_ports_per_container,
       "ephemeral_ports_per_container",

http://git-wip-us.apache.org/repos/asf/mesos/blob/a04b8b6a/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index 9ee7f34..345a225 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -110,6 +110,9 @@ public:
   Duration docker_stop_timeout;
   bool docker_kill_orphans;
   std::string docker_socket;
+#ifdef ENABLE_NVIDIA_GPU_SUPPORT
+  Option<std::vector<unsigned int>> nvidia_gpu_devices;
+#endif
 #ifdef WITH_NETWORK_ISOLATOR
   uint16_t ephemeral_ports_per_container;
   Option<std::string> eth0_name;


[2/2] mesos git commit: Added GPUs as an explicit resource in the agent.

Posted by bm...@apache.org.
Added GPUs as an explicit resource in the agent.

Currently, we enforce that the number of GPUs specified in the 'gpus'
resource parameter equal the number of GPUs passed in via the
--nvidia_gpu_devices flag. In the future, we will generalize this via
autodiscovery of GPUs and support for GPU types other than Nvidia.

Review: https://reviews.apache.org/r/44366/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/77fae968
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/77fae968
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/77fae968

Branch: refs/heads/master
Commit: 77fae968ff80e8516d42ead384956952d5132d46
Parents: a04b8b6
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Mar 29 17:19:49 2016 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue Mar 29 17:33:44 2016 -0700

----------------------------------------------------------------------
 include/mesos/resources.hpp               |  1 +
 include/mesos/v1/resources.hpp            |  1 +
 src/common/resources.cpp                  | 14 ++++++++++-
 src/slave/containerizer/containerizer.cpp | 32 ++++++++++++++++++++++++++
 src/v1/resources.cpp                      | 14 ++++++++++-
 5 files changed, 60 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/77fae968/include/mesos/resources.hpp
----------------------------------------------------------------------
diff --git a/include/mesos/resources.hpp b/include/mesos/resources.hpp
index bb343ad..a557e97 100644
--- a/include/mesos/resources.hpp
+++ b/include/mesos/resources.hpp
@@ -324,6 +324,7 @@ public:
   // TODO(vinod): Fix this when we make these types as first class
   // protobufs.
   Option<double> cpus() const;
+  Option<double> gpus() const;
   Option<Bytes> mem() const;
   Option<Bytes> disk() const;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/77fae968/include/mesos/v1/resources.hpp
----------------------------------------------------------------------
diff --git a/include/mesos/v1/resources.hpp b/include/mesos/v1/resources.hpp
index 719110f..a5ba8fe 100644
--- a/include/mesos/v1/resources.hpp
+++ b/include/mesos/v1/resources.hpp
@@ -324,6 +324,7 @@ public:
   // TODO(vinod): Fix this when we make these types as first class
   // protobufs.
   Option<double> cpus() const;
+  Option<double> gpus() const;
   Option<Bytes> mem() const;
   Option<Bytes> disk() const;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/77fae968/src/common/resources.cpp
----------------------------------------------------------------------
diff --git a/src/common/resources.cpp b/src/common/resources.cpp
index 818eb8b..f6ff92b 100644
--- a/src/common/resources.cpp
+++ b/src/common/resources.cpp
@@ -1101,9 +1101,10 @@ Try<Resources> Resources::apply(const Offer::Operation& operation) const
   // The following are sanity checks to ensure the amount of each type of
   // resource does not change.
   // TODO(jieyu): Currently, we only check known resource types like
-  // cpus, mem, disk, ports, etc. We should generalize this.
+  // cpus, gpus, mem, disk, ports, etc. We should generalize this.
 
   CHECK(result.cpus() == cpus());
+  CHECK(result.gpus() == gpus());
   CHECK(result.mem() == mem());
   CHECK(result.disk() == disk());
   CHECK(result.ports() == ports());
@@ -1227,6 +1228,17 @@ Option<double> Resources::cpus() const
 }
 
 
+Option<double> Resources::gpus() const
+{
+  Option<Value::Scalar> value = get<Value::Scalar>("gpus");
+  if (value.isSome()) {
+    return value->value();
+  } else {
+    return None();
+  }
+}
+
+
 Option<Bytes> Resources::mem() const
 {
   Option<Value::Scalar> value = get<Value::Scalar>("mem");

http://git-wip-us.apache.org/repos/asf/mesos/blob/77fae968/src/slave/containerizer/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/containerizer.cpp b/src/slave/containerizer/containerizer.cpp
index 3556040..d0cae79 100644
--- a/src/slave/containerizer/containerizer.cpp
+++ b/src/slave/containerizer/containerizer.cpp
@@ -22,6 +22,7 @@
 
 #include <stout/fs.hpp>
 #include <stout/hashmap.hpp>
+#include <stout/numify.hpp>
 #include <stout/os.hpp>
 #include <stout/stringify.hpp>
 #include <stout/strings.hpp>
@@ -92,6 +93,37 @@ Try<Resources> Containerizer::resources(const Flags& flags)
         flags.default_role).get();
   }
 
+  // GPU resource.
+  // We currently do not support GPU discovery, so we require that
+  // GPUs are explicitly specified in `--resources`. When Nvidia GPU
+  // support is enabled, we also require the GPU devices to be
+  // specified in `--nvidia_gpu_devices`.
+  if (strings::contains(flags.resources.getOrElse(""), "gpus")) {
+    // Make sure that the value of `gpus` is actually an integer and
+    // not a fractional amount. We take advantage of the fact that we
+    // know the value of `gpus` is only precise up to 3 decimals.
+    long long millis = static_cast<long long>(resources.gpus().get() * 1000);
+    if ((millis % 1000) != 0) {
+      return Error("The `gpus` resource must specified as an unsigned integer");
+    }
+
+#ifdef ENABLE_NVIDIA_GPU_SUPPORT
+    // Verify that the number of GPUs in `--nvidia_gpu_devices`
+    // matches the number of GPUs specified as a resource. In the
+    // future we will do discovery of GPUs, which will make the
+    // `--nvidia_gpu_devices` flag optional.
+    if (!flags.nvidia_gpu_devices.isSome()) {
+      return Error("When specifying the `gpus` resource, you must also specify"
+                   " a list of GPUs via the `--nvidia_gpu_devices` flag");
+    }
+
+    if (flags.nvidia_gpu_devices->size() != resources.gpus().get())
+      return Error("The number of GPUs passed in the '--nvidia_gpu_devices'"
+                   " flag must match the number of GPUs specified in the 'gpus'"
+                   " resource");
+#endif // ENABLE_NVIDIA_GPU_SUPPORT
+  }
+
   // Memory resource.
   if (!strings::contains(flags.resources.getOrElse(""), "mem")) {
     // No memory specified so probe OS or resort to DEFAULT_MEM.

http://git-wip-us.apache.org/repos/asf/mesos/blob/77fae968/src/v1/resources.cpp
----------------------------------------------------------------------
diff --git a/src/v1/resources.cpp b/src/v1/resources.cpp
index 4907040..8c3f2d1 100644
--- a/src/v1/resources.cpp
+++ b/src/v1/resources.cpp
@@ -1104,9 +1104,10 @@ Try<Resources> Resources::apply(const Offer::Operation& operation) const
   // The following are sanity checks to ensure the amount of each type of
   // resource does not change.
   // TODO(jieyu): Currently, we only check known resource types like
-  // cpus, mem, disk, ports, etc. We should generalize this.
+  // cpus, gpus, mem, disk, ports, etc. We should generalize this.
 
   CHECK(result.cpus() == cpus());
+  CHECK(result.gpus() == gpus());
   CHECK(result.mem() == mem());
   CHECK(result.disk() == disk());
   CHECK(result.ports() == ports());
@@ -1230,6 +1231,17 @@ Option<double> Resources::cpus() const
 }
 
 
+Option<double> Resources::gpus() const
+{
+  Option<Value::Scalar> value = get<Value::Scalar>("gpus");
+  if (value.isSome()) {
+    return value->value();
+  } else {
+    return None();
+  }
+}
+
+
 Option<Bytes> Resources::mem() const
 {
   Option<Value::Scalar> value = get<Value::Scalar>("mem");