Posted to commits@mesos.apache.org by bm...@apache.org on 2016/07/06 04:11:25 UTC

mesos git commit: Added test for using an Nvidia Docker image.

Repository: mesos
Updated Branches:
  refs/heads/master 6fd7f8125 -> 516d8ef07


Added test for using an Nvidia Docker image.

This test ensures that when using one of Nvidia's Docker images
(which carry a special label), we mount a volume containing the
Nvidia libraries and binaries into the container.
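
For context, the "special label" is metadata that Nvidia's images ship
with to signal that the driver volume must be injected. Below is a
minimal standalone C++ sketch of such a label check (not the isolator's
actual code; the label name `com.nvidia.volumes.needed` and value
`nvidia_driver` are assumptions based on nvidia-docker conventions):

  #include <iostream>
  #include <map>
  #include <string>

  // Hypothetical helper: returns true if an image's Docker labels
  // request injection of the Nvidia driver volume (the libraries and
  // binaries, e.g. `nvidia-smi`).
  bool needsNvidiaVolume(const std::map<std::string, std::string>& labels)
  {
    auto it = labels.find("com.nvidia.volumes.needed");
    return it != labels.end() && it->second == "nvidia_driver";
  }

  int main()
  {
    // An Nvidia CUDA image would typically carry the label; a plain
    // image would not, so no volume would be mounted for it.
    std::map<std::string, std::string> labels =
      {{"com.nvidia.volumes.needed", "nvidia_driver"}};

    std::cout << std::boolalpha << needsNvidiaVolume(labels) << std::endl;
    return 0;
  }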

Review: https://reviews.apache.org/r/49678/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/516d8ef0
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/516d8ef0
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/516d8ef0

Branch: refs/heads/master
Commit: 516d8ef0759937eb327893a01e2bdef8c7438b1c
Parents: 6fd7f81
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Jul 5 20:55:36 2016 -0700
Committer: Benjamin Mahler <bm...@apache.org>
Committed: Tue Jul 5 21:09:59 2016 -0700

----------------------------------------------------------------------
 .../containerizer/nvidia_gpu_isolator_tests.cpp | 116 +++++++++++++++++++
 1 file changed, 116 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/516d8ef0/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
index b551213..7944c0d 100644
--- a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
+++ b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
@@ -174,6 +174,122 @@ TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_VerifyDeviceAccess)
 }
 
 
+// This test verifies that we can enable the Nvidia GPU isolator
+// and launch tasks with restricted access to GPUs while running
+// inside one of Nvidia's images. These images have a special
+// label that indicates that we need to mount a volume containing
+// the Nvidia libraries and binaries. We first launch a task with
+// 1 GPU and verify that a call to `nvidia-smi` both succeeds and
+// reports exactly 1 GPU available. We then launch a task with
+// access to 0 GPUs and verify that a call to `nvidia-smi` fails.
+TEST_F(NvidiaGpuTest, ROOT_INTERNET_CURL_CGROUPS_NVIDIA_GPU_NvidiaDockerImage)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  slave::Flags flags = CreateSlaveFlags();
+  flags.isolation = "docker/runtime,filesystem/linux,"
+                    "cgroups/devices,gpu/nvidia";
+  flags.image_providers = "docker";
+  flags.nvidia_gpu_devices = vector<unsigned int>({0u});
+  flags.resources = "cpus:1;mem:128;gpus:1";
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::GPU_RESOURCES);
+
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  Future<Nothing> schedRegistered;
+  EXPECT_CALL(sched, registered(_, _, _))
+    .WillOnce(FutureSatisfy(&schedRegistered));
+
+  Future<vector<Offer>> offers1, offers2;
+  EXPECT_CALL(sched, resourceOffers(_, _))
+    .WillOnce(FutureArg<1>(&offers1))
+    .WillOnce(FutureArg<1>(&offers2))
+    .WillRepeatedly(Return());      // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(schedRegistered);
+
+  Image image;
+  image.set_type(Image::DOCKER);
+  image.mutable_docker()->set_name("nvidia/cuda");
+
+  // Launch a task requesting 1 GPU and verify
+  // that `nvidia-smi` lists exactly one GPU.
+  AWAIT_READY(offers1);
+  ASSERT_EQ(1u, offers1->size());
+
+  TaskInfo task1 = createTask(
+      offers1->at(0).slave_id(),
+      Resources::parse("cpus:1;mem:128;gpus:1").get(),
+      "NUM_GPUS=`nvidia-smi --list-gpus | wc -l`;\n"
+      "if [ \"$NUM_GPUS\" != \"1\" ]; then\n"
+      "  exit 1;\n"
+      "fi");
+
+  ContainerInfo* container = task1.mutable_container();
+  container->set_type(ContainerInfo::MESOS);
+  container->mutable_mesos()->mutable_image()->CopyFrom(image);
+
+  Future<TaskStatus> statusRunning1, statusFinished1;
+  EXPECT_CALL(sched, statusUpdate(_, _))
+    .WillOnce(FutureArg<1>(&statusRunning1))
+    .WillOnce(FutureArg<1>(&statusFinished1));
+
+  driver.launchTasks(offers1->at(0).id(), {task1});
+
+  // We wait up to 120 seconds for the Docker image to download.
+  AWAIT_READY_FOR(statusRunning1, Seconds(120));
+  ASSERT_EQ(TASK_RUNNING, statusRunning1->state());
+
+  AWAIT_READY(statusFinished1);
+  ASSERT_EQ(TASK_FINISHED, statusFinished1->state());
+
+  // Launch a task requesting no GPUs and
+  // verify that running `nvidia-smi` fails.
+  AWAIT_READY(offers2);
+  EXPECT_EQ(1u, offers2->size());
+
+  TaskInfo task2 = createTask(
+      offers2->at(0).slave_id(),
+      Resources::parse("cpus:1;mem:128").get(),
+      "nvidia-smi");
+
+  container = task2.mutable_container();
+  container->set_type(ContainerInfo::MESOS);
+  container->mutable_mesos()->mutable_image()->CopyFrom(image);
+
+  Future<TaskStatus> statusRunning2, statusFailed2;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&statusRunning2))
+    .WillOnce(FutureArg<1>(&statusFailed2));
+
+  driver.launchTasks(offers2->at(0).id(), {task2});
+
+  AWAIT_READY_FOR(statusRunning2, Seconds(120));
+  ASSERT_EQ(TASK_RUNNING, statusRunning2->state());
+
+  AWAIT_READY(statusFailed2);
+  ASSERT_EQ(TASK_FAILED, statusFailed2->state());
+
+  driver.stop();
+  driver.join();
+}
+
+
 // This test verifies correct failure semantics when
 // a task requests a fractional number of GPUs.
 TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_FractionalResources)