You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2017/08/25 18:40:41 UTC

[1/3] mesos git commit: Added a test verifying that DefaultExecutor tasks can use nvidia GPUs.

Repository: mesos
Updated Branches:
  refs/heads/master e17be1925 -> 53a79b229


Added a test verifying that DefaultExecutor tasks can use nvidia GPUs.

Review: https://reviews.apache.org/r/61282/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/dd06684d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/dd06684d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/dd06684d

Branch: refs/heads/master
Commit: dd06684dbcefe98a6ba2b46b8bdf4eda718cee44
Parents: e17be19
Author: Gastón Kleiman <ga...@mesosphere.io>
Authored: Fri Aug 25 11:40:21 2017 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 25 11:40:21 2017 -0700

----------------------------------------------------------------------
 .../containerizer/nvidia_gpu_isolator_tests.cpp | 92 ++++++++++++++++++++
 1 file changed, 92 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/dd06684d/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
index 9a78ae6..f9b26bc 100644
--- a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
+++ b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
@@ -649,6 +649,98 @@ TEST_F(NvidiaGpuTest, ROOT_NVIDIA_GPU_VolumeShouldInject)
   ASSERT_FALSE(volume->shouldInject(manifest.get()));
 }
 
+
+// This test verifies that the DefaultExecutor is able to launch tasks
+// with restricted access to GPUs.
+// It launches a task with 1 GPU and verifies that a call to
+// `nvidia-smi` both succeeds and reports exactly 1 GPU available.
+TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_DefaultExecutorVerifyDeviceAccess)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  // Turn on Nvidia GPU isolation.
+  // Assume at least one GPU is available for isolation.
+  slave::Flags flags = CreateSlaveFlags();
+  flags.isolation = "filesystem/linux,cgroups/devices,gpu/nvidia";
+  flags.resources = "cpus:1"; // To override the default with gpus:0.
+
+#ifndef USE_SSL_SOCKET
+  // Disable operator API authentication for the default executor. Executor
+  // authentication currently has SSL as a dependency, so we cannot require
+  // executors to authenticate with the agent operator API if Mesos was not
+  // built with SSL support.
+  flags.authenticate_http_readwrite = false;
+#endif // USE_SSL_SOCKET
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::GPU_RESOURCES);
+
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(_, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return());      // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(frameworkId);
+
+  Resources resources = Resources::parse("cpus:0.1;mem:32;disk:32").get();
+
+  ExecutorInfo executorInfo;
+  executorInfo.set_type(ExecutorInfo::DEFAULT);
+  executorInfo.mutable_executor_id()->CopyFrom(DEFAULT_EXECUTOR_ID);
+  executorInfo.mutable_framework_id()->CopyFrom(frameworkId.get());
+  executorInfo.mutable_resources()->CopyFrom(resources);
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers->size());
+
+  const Offer& offer = offers->front();
+  const SlaveID& slaveId = offer.slave_id();
+
+  TaskInfo taskInfo = createTask(
+      slaveId,
+      Resources::parse("cpus:0.1;mem:128;gpus:1").get(),
+      "NUM_GPUS=`nvidia-smi --list-gpus | wc -l`;\n"
+      "if [ \"$NUM_GPUS\" != \"1\" ]; then\n"
+      "  exit 1;\n"
+      "fi");
+
+  TaskGroupInfo taskGroup = createTaskGroupInfo({taskInfo});
+
+  Future<TaskStatus> statusRunning, statusFinished;
+
+  EXPECT_CALL(sched, statusUpdate(_, _))
+    .WillOnce(FutureArg<1>(&statusRunning))
+    .WillOnce(FutureArg<1>(&statusFinished));
+
+  driver.acceptOffers({offer.id()}, {LAUNCH_GROUP(executorInfo, taskGroup)});
+
+  AWAIT_READY(statusRunning);
+  ASSERT_EQ(TASK_RUNNING, statusRunning->state());
+
+  AWAIT_READY(statusFinished);
+  ASSERT_EQ(TASK_FINISHED, statusFinished->state());
+
+  driver.stop();
+  driver.join();
+}
+
 } // namespace tests {
 } // namespace internal {
 } // namespace mesos {


[3/3] mesos git commit: Added a test using CMD health checks + DefaultExecutor w/ Docker image.

Posted by vi...@apache.org.
Added a test using CMD health checks + DefaultExecutor w/ Docker image.

Review: https://reviews.apache.org/r/61483/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/53a79b22
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/53a79b22
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/53a79b22

Branch: refs/heads/master
Commit: 53a79b229c7b0d493831855dec5afb21c1c78407
Parents: 1e304a0
Author: Gastón Kleiman <ga...@mesosphere.io>
Authored: Fri Aug 25 11:40:31 2017 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 25 11:40:31 2017 -0700

----------------------------------------------------------------------
 src/tests/health_check_tests.cpp | 136 ++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/53a79b22/src/tests/health_check_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/health_check_tests.cpp b/src/tests/health_check_tests.cpp
index 2d7aefd..2c43241 100644
--- a/src/tests/health_check_tests.cpp
+++ b/src/tests/health_check_tests.cpp
@@ -2383,6 +2383,142 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(
   }
 }
 
+
+// Tests a healthy docker task via CMD health checks using the
+// DefaultExecutor.
+TEST_F_TEMP_DISABLED_ON_WINDOWS(
+    HealthCheckTest, DefaultExecutorWithDockerImageCommandHealthCheck)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  slave::Flags flags = CreateSlaveFlags();
+#ifndef USE_SSL_SOCKET
+  // Disable operator API authentication for the default executor. Executor
+  // authentication currently has SSL as a dependency, so we cannot require
+  // executors to authenticate with the agent operator API if Mesos was not
+  // built with SSL support.
+  flags.authenticate_http_readwrite = false;
+
+  // Set permissive ACLs in the agent so that the local authorizer will be
+  // loaded and implicit executor authorization will be tested.
+  ACLs acls;
+  acls.set_permissive(true);
+
+  flags.acls = acls;
+#endif // USE_SSL_SOCKET
+
+  Fetcher fetcher(flags);
+
+  // We have to explicitly create a `Containerizer` in non-local mode,
+  // because `LaunchNestedContainerSession` (used by command health
+  // checks) tries to start a IO switchboard, which doesn't work in
+  // local mode yet.
+  Try<MesosContainerizer*> _containerizer =
+    MesosContainerizer::create(flags, false, &fetcher);
+
+  ASSERT_SOME(_containerizer);
+
+  Owned<slave::Containerizer> containerizer(_containerizer.get());
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  Try<Owned<cluster::Slave>> agent =
+    StartSlave(detector.get(), containerizer.get(), flags);
+  ASSERT_SOME(agent);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+    &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(frameworkId);
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+
+  Future<TaskStatus> statusRunning;
+  Future<TaskStatus> statusHealthy;
+
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&statusRunning))
+    .WillOnce(FutureArg<1>(&statusHealthy));
+
+  TaskInfo task = createTask(offers->front(), "sleep 120");
+
+  // TODO(tnachen): Use local image to test if possible.
+  ContainerInfo containerInfo;
+  containerInfo.set_type(ContainerInfo::MESOS);
+  containerInfo.mutable_docker()->set_image("alpine");
+
+  task.mutable_container()->CopyFrom(containerInfo);
+
+  HealthCheck healthCheck;
+
+  healthCheck.set_type(HealthCheck::COMMAND);
+  healthCheck.mutable_command()->set_value("exit $STATUS");
+  healthCheck.set_delay_seconds(0);
+  healthCheck.set_interval_seconds(0);
+  healthCheck.set_grace_period_seconds(0);
+
+  Environment::Variable* variable = healthCheck.mutable_command()->
+    mutable_environment()->mutable_variables()->Add();
+  variable->set_name("STATUS");
+  variable->set_value("0");
+
+  task.mutable_health_check()->CopyFrom(healthCheck);
+
+  Resources executorResources =
+    allocatedResources(Resources::parse("cpus:0.1;mem:32;disk:32").get(), "*");
+
+  task.mutable_resources()->CopyFrom(task.resources() - executorResources);
+
+  TaskGroupInfo taskGroup;
+  taskGroup.add_tasks()->CopyFrom(task);
+
+  ExecutorInfo executor;
+  executor.mutable_executor_id()->set_value("default");
+  executor.set_type(ExecutorInfo::DEFAULT);
+  executor.mutable_framework_id()->CopyFrom(frameworkId.get());
+  executor.mutable_resources()->CopyFrom(executorResources);
+  executor.mutable_shutdown_grace_period()->set_nanoseconds(Seconds(10).ns());
+
+  driver.acceptOffers(
+      {offers->front().id()}, {LAUNCH_GROUP(executor, taskGroup)});
+
+  AWAIT_READY(statusRunning);
+  EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
+
+  AWAIT_READY(statusHealthy);
+  EXPECT_EQ(TASK_RUNNING, statusHealthy.get().state());
+  EXPECT_EQ(
+      TaskStatus::REASON_TASK_HEALTH_CHECK_STATUS_UPDATED,
+      statusHealthy->reason());
+  EXPECT_TRUE(statusHealthy.get().has_healthy());
+  EXPECT_TRUE(statusHealthy.get().healthy());
+
+  Future<hashset<ContainerID>> containerIds = containerizer->containers();
+
+  AWAIT_READY(containerIds);
+
+  driver.stop();
+  driver.join();
+
+  // Cleanup all mesos launched containers.
+  foreach (const ContainerID& containerId, containerIds.get()) {
+    AWAIT_READY(containerizer->wait(containerId));
+  }
+}
+
 } // namespace tests {
 } // namespace internal {
 } // namespace mesos {


[2/3] mesos git commit: Improved `NvidiaGpuTest.ROOT_CGROUPS_NVIDIA_GPU_VerifyDeviceAccess`.

Posted by vi...@apache.org.
Improved `NvidiaGpuTest.ROOT_CGROUPS_NVIDIA_GPU_VerifyDeviceAccess`.

Change the test so that the agent offesr as many GPUs as available on
the box instead of restricting it to 1. This way the test will fail if
there's a bug that makes the isolator give a task access to more GPUs
than what it was allocated.

Review: https://reviews.apache.org/r/61438/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/1e304a0b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/1e304a0b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/1e304a0b

Branch: refs/heads/master
Commit: 1e304a0b73f2ab43e9b59a2648446d1e8d7defa0
Parents: dd06684
Author: Gastón Kleiman <ga...@mesosphere.io>
Authored: Fri Aug 25 11:40:27 2017 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 25 11:40:27 2017 -0700

----------------------------------------------------------------------
 src/tests/containerizer/nvidia_gpu_isolator_tests.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/1e304a0b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
index f9b26bc..f7a7158 100644
--- a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
+++ b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
@@ -91,8 +91,7 @@ TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_VerifyDeviceAccess)
   // Assume at least one GPU is available for isolation.
   slave::Flags flags = CreateSlaveFlags();
   flags.isolation = "filesystem/linux,cgroups/devices,gpu/nvidia";
-  flags.nvidia_gpu_devices = vector<unsigned int>({0u});
-  flags.resources = "gpus:1";
+  flags.resources = "cpus:1"; // To override the default with gpus:0.
 
   Owned<MasterDetector> detector = master.get()->createDetector();