You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2016/03/30 01:24:27 UTC

[1/2] mesos git commit: Added stubs for the Nvidia GPU device isolator.

Repository: mesos
Updated Branches:
  refs/heads/master 250af60cf -> 9174faf37


Added stubs for the Nvidia GPU device isolator.

This is the first isolator to fall under the category of cgroup
devices. However, we do not yet have a generic cgroup device isolator
(nor will we in the very near future). As such, I have preemptively
created the nvidia gpu isolator in the directory hierarchy under the
path:

src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/

in order to easily allow them to fall under the category of gpu
devices later on.

In this stub implementation, initialization of the agent will
fail if the nvidia gpu isolator is enabled via the agent --isolation
flag, that is, --isolation="cgroups/devices/gpus/nvidia". In a
subsequent commit we will fill in the guts to actually enable the
proper isolation.

The flags documentation has been updated accordingly.

Review: https://reviews.apache.org/r/44363/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9174faf3
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9174faf3
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9174faf3

Branch: refs/heads/master
Commit: 9174faf37038cadd2d6261754215c3228cb07840
Parents: 2bb19aa
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Mar 29 16:11:10 2016 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue Mar 29 16:23:50 2016 -0700

----------------------------------------------------------------------
 docs/configuration.md                           |   2 +
 src/Makefile.am                                 |  17 +++
 src/slave/containerizer/mesos/containerizer.cpp |   9 ++
 .../isolators/cgroups/devices/gpus/nvidia.cpp   | 114 +++++++++++++++++++
 .../isolators/cgroups/devices/gpus/nvidia.hpp   |  77 +++++++++++++
 src/slave/flags.cpp                             |   2 +
 6 files changed, 221 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/9174faf3/docs/configuration.md
----------------------------------------------------------------------
diff --git a/docs/configuration.md b/docs/configuration.md
index 9ad0c2a..6fc1875 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1291,6 +1291,8 @@ e.g., <code>bind</code>, <code>copy</code>. (default: copy)
 Isolation mechanisms to use, e.g., <code>posix/cpu,posix/mem</code>, or
 <code>cgroups/cpu,cgroups/mem</code>, or network/port_mapping
 (configure with flag: <code>--with-network-isolator</code> to enable),
+or `cgroups/devices/gpus/nvidia` for nvidia specific gpu isolation
+(configure with flag: `--enable-nvidia-gpu-support` to enable),
 or <code>external</code>, or load an alternate isolator module using
 the <code>--modules</code> flag. Note that this flag is only relevant
 for the Mesos Containerizer. (default: posix/cpu,posix/mem)

http://git-wip-us.apache.org/repos/asf/mesos/blob/9174faf3/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 21e2965..f22ae5b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -924,6 +924,12 @@ MESOS_NETWORK_ISOLATOR_FILES +=						\
   linux/routing/queueing/statistics.hpp					\
   slave/containerizer/mesos/isolators/network/port_mapping.hpp
 
+MESOS_NVIDIA_GPU_ISOLATOR_FILES =					\
+  slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.cpp
+
+MESOS_NVIDIA_GPU_ISOLATOR_FILES +=					\
+  slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.hpp
+
 if OS_LINUX
 libmesos_no_3rdparty_la_SOURCES += $(MESOS_LINUX_FILES)
 else
@@ -936,6 +942,12 @@ else
 EXTRA_DIST += $(MESOS_NETWORK_ISOLATOR_FILES)
 endif
 
+if ENABLE_NVIDIA_GPU_SUPPORT
+libmesos_no_3rdparty_la_SOURCES += $(MESOS_NVIDIA_GPU_ISOLATOR_FILES)
+else
+EXTRA_DIST += $(MESOS_NVIDIA_GPU_ISOLATOR_FILES)
+endif
+
 libmesos_no_3rdparty_la_CPPFLAGS = $(MESOS_CPPFLAGS)
 
 libmesos_no_3rdparty_la_LIBADD = # Initialized to enable using +=.
@@ -1119,6 +1131,11 @@ libmesos_la_LIBADD += -lprotobuf
 LDADD += -lprotobuf
 endif
 
+if ENABLE_NVIDIA_GPU_SUPPORT
+libmesos_la_LIBADD += -lnvidia-ml
+LDADD += -lnvidia-ml
+endif
+
 
 # Binaries.
 sbin_PROGRAMS += mesos-master

http://git-wip-us.apache.org/repos/asf/mesos/blob/9174faf3/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index 605c52e..ba713f0 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -62,6 +62,12 @@
 #include "slave/containerizer/mesos/isolators/cgroups/perf_event.hpp"
 #endif
 
+#ifdef ENABLE_NVIDIA_GPU_SUPPORT
+#ifdef __linux__
+#include "slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.hpp"
+#endif
+#endif
+
 #ifdef __linux__
 #include "slave/containerizer/mesos/isolators/docker/runtime.hpp"
 #endif
@@ -214,6 +220,9 @@ Try<MesosContainerizer*> MesosContainerizer::create(
     {"cgroups/mem", &CgroupsMemIsolatorProcess::create},
     {"cgroups/net_cls", &CgroupsNetClsIsolatorProcess::create},
     {"cgroups/perf_event", &CgroupsPerfEventIsolatorProcess::create},
+#ifdef ENABLE_NVIDIA_GPU_SUPPORT
+    {"cgroups/devices/gpus/nvidia", &CgroupsNvidiaGpuIsolatorProcess::create},
+#endif
     {"docker/runtime", &DockerRuntimeIsolatorProcess::create},
     {"namespaces/pid", &NamespacesPidIsolatorProcess::create},
 #endif

http://git-wip-us.apache.org/repos/asf/mesos/blob/9174faf3/src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.cpp b/src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.cpp
new file mode 100644
index 0000000..c2cdc8f
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.cpp
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+
+#include <list>
+#include <string>
+
+#include <process/future.hpp>
+
+#include <stout/error.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/option.hpp>
+#include <stout/try.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/mesos/isolator.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+
+using mesos::slave::ContainerConfig;
+using mesos::slave::ContainerLaunchInfo;
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+CgroupsNvidiaGpuIsolatorProcess::~CgroupsNvidiaGpuIsolatorProcess() {}
+
+
+Try<Isolator*> CgroupsNvidiaGpuIsolatorProcess::create(const Flags& flags)
+{
+  return Error("Cgroups Nvidia GPU isolation currently not supported");
+}
+
+
+Future<Nothing> CgroupsNvidiaGpuIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  return Failure("Cgroups Nvidia GPU isolation currently not supported");
+}
+
+
+Future<Option<ContainerLaunchInfo>> CgroupsNvidiaGpuIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const mesos::slave::ContainerConfig& containerConfig)
+{
+  return Failure("Cgroups Nvidia GPU isolation currently not supported");
+}
+
+
+Future<Nothing> CgroupsNvidiaGpuIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  return Failure("Cgroups Nvidia GPU isolation currently not supported");
+}
+
+
+Future<ContainerLimitation> CgroupsNvidiaGpuIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  return Failure("Cgroups Nvidia GPU isolation currently not supported");
+}
+
+
+Future<Nothing> CgroupsNvidiaGpuIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  return Failure("Cgroups Nvidia GPU isolation currently not supported");
+}
+
+
+Future<ResourceStatistics> CgroupsNvidiaGpuIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  return Failure("Cgroups Nvidia GPU isolation currently not supported");
+}
+
+
+Future<Nothing> CgroupsNvidiaGpuIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  return Failure("Cgroups Nvidia GPU isolation currently not supported");
+}
+
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9174faf3/src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.hpp b/src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.hpp
new file mode 100644
index 0000000..1e17df1
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/devices/gpus/nvidia.hpp
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __NVIDIA_GPU_ISOLATOR_HPP__
+#define __NVIDIA_GPU_ISOLATOR_HPP__
+
+#include <sys/types.h>
+
+#include <list>
+
+#include <nvidia/gdk/nvml.h>
+
+#include <process/future.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/option.hpp>
+#include <stout/try.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/mesos/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class CgroupsNvidiaGpuIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~CgroupsNvidiaGpuIsolatorProcess();
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare(
+      const ContainerID& containerId,
+      const mesos::slave::ContainerConfig& containerConfig);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __NVIDIA_GPU_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9174faf3/src/slave/flags.cpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index fd9fbba..0c13ab6 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -90,6 +90,8 @@ mesos::internal::slave::Flags::Flags()
       "Isolation mechanisms to use, e.g., `posix/cpu,posix/mem`, or\n"
       "`cgroups/cpu,cgroups/mem`, or network/port_mapping\n"
       "(configure with flag: `--with-network-isolator` to enable),\n"
+      "or `cgroups/devices/gpus/nvidia` for nvidia specific gpu isolation\n"
+      "(configure with flag: `--enable-nvidia-gpu-support` to enable),\n"
       "or `external`, or load an alternate isolator module using\n"
       "the `--modules` flag. Note that this flag is only relevant\n"
       "for the Mesos Containerizer.",


[2/2] mesos git commit: Added configure flags to build with Nvidia GPU support.

Posted by bm...@apache.org.
Added configure flags to build with Nvidia GPU support.

This is the initial commit to begin adding native support for GPUs in
Mesos. This initial version will only include support for Nvidia GPUs
that can be managed by the Nvidia Management Library (NVML).

The configure flags added in this commit can be used to enable Nvidia
GPU support, as well as specify the installation directories of the
NVML header and library files if not already installed in standard
include/library paths on the system.

In a subsequent commit, we will use these configure flags to
conditionally build support for Nvidia GPUs into Mesos.

Review: https://reviews.apache.org/r/44361/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2bb19aa3
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2bb19aa3
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2bb19aa3

Branch: refs/heads/master
Commit: 2bb19aa3eb52b68ebfe9dd4f5efbee2ac2e377d3
Parents: 250af60
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Mar 29 16:10:37 2016 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue Mar 29 16:23:50 2016 -0700

----------------------------------------------------------------------
 configure.ac | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2bb19aa3/configure.ac
----------------------------------------------------------------------
diff --git a/configure.ac b/configure.ac
index 9ec4bc1..812c92a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -210,6 +210,18 @@ AC_ARG_WITH([nl],
                             (required for network-isolator). [default: /usr]]),
             [], [])
 
+AC_ARG_WITH([nvml-include],
+            AS_HELP_STRING([--with-nvml-include@<:@=DIR@:>@],
+                           [specify where to locate the Nvidia NVML headers
+                            (required for Nvidia GPU support)]),
+            [], [])
+
+AC_ARG_WITH([nvml-lib],
+            AS_HELP_STRING([--with-nvml-lib@<:@=DIR@:>@],
+                           [specify where to locate the Nvidia NVML libraries
+                            (required for Nvidia GPU support)]),
+            [], [])
+
 AC_ARG_ENABLE([bundled-distribute],
               AS_HELP_STRING([--disable-bundled-distribute],
                              [excludes building and using the bundled distribute
@@ -257,6 +269,11 @@ AC_ARG_ENABLE([libevent],
                              [use libevent instead of libev default: no]),
               [enable_libevent=yes], [])
 
+AC_ARG_ENABLE([nvidia-gpu-support],
+              AS_HELP_STRING([--enable-nvidia-gpu-support],
+                             [build with Nvidia GPU support default: no]),
+              [enable_nvidia_gpu_support=yes], [])
+
 AC_ARG_ENABLE([ssl],
               AS_HELP_STRING([--enable-ssl],
                              [use ssl for libprocess communication
@@ -921,6 +938,51 @@ AM_CONDITIONAL([WITH_NETWORK_ISOLATOR],
                [test "x$with_network_isolator" = "xyes"])
 
 
+# Check if Nvidia GPU support is enabled, and if so, verify we can
+# access the NVML header files and libs.
+if test x"$enable_nvidia_gpu_support" = "xyes"; then
+  # If the paths to the NVML headers and/or NVML libraries have been
+  # specified, make sure that those paths are absolute. If everything
+  # is in order, add these paths to the CPPFLAGS and LDFLAGS
+  # respectively.
+  if test -n "`echo $with_nvml_include`"; then
+    if test "$with_nvml_include" = "${with_nvml_include#/}"; then
+      AC_MSG_ERROR([The path passed to --with-nvml-include must be absolute.])
+    fi
+    CPPFLAGS="-I${with_nvml_include} $CPPFLAGS"
+  fi
+  if test -n "`echo $with_nvml_lib`"; then
+    if test "$with_nvml_lib" = "${with_nvml_lib#/}"; then
+      AC_MSG_ERROR([The path passed to --with-nvml-lib must be absolute.])
+    fi
+    LDFLAGS="-L${with_nvml_lib} $LDFLAGS"
+  fi
+
+  AC_CHECK_HEADERS([nvidia/gdk/nvml.h], [],
+                   [AC_MSG_ERROR([Cannot find the Nvidia NVML headers
+-------------------------------------------------------------------
+The Nvidia NVML headers are required to build Mesos with Nvidia
+GPU support. Make sure these headers are either installed on the
+system or the path passed via --with-nvml-include is correct.
+-------------------------------------------------------------------
+  ])])
+
+  AC_CHECK_LIB([nvidia-ml], [nvmlInit], [],
+               [AC_MSG_ERROR([Cannot find the Nvidia NVML libraries
+-------------------------------------------------------------------
+The Nvidia NVML libraries are required to build Mesos with Nvidia
+GPU support. Make sure these libraries are either installed on the
+system or the path passed via --with-nvml-lib is correct.
+-------------------------------------------------------------------
+  ])])
+
+  AC_DEFINE([ENABLE_NVIDIA_GPU_SUPPORT])
+fi
+
+AM_CONDITIONAL([ENABLE_NVIDIA_GPU_SUPPORT],
+  [test x"$enable_nvidia_gpu_support" = "xyes"])
+
+
 # TODO(benh): Consider using AS_IF instead of just shell 'if'
 # statements for better autoconf style (the AS_IF macros also make
 # sure variable dependencies are handled appropriately).