You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2017/08/25 18:40:41 UTC
[1/3] mesos git commit: Added a test verifying that DefaultExecutor
tasks can use nvidia GPUs.
Repository: mesos
Updated Branches:
refs/heads/master e17be1925 -> 53a79b229
Added a test verifying that DefaultExecutor tasks can use nvidia GPUs.
Review: https://reviews.apache.org/r/61282/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/dd06684d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/dd06684d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/dd06684d
Branch: refs/heads/master
Commit: dd06684dbcefe98a6ba2b46b8bdf4eda718cee44
Parents: e17be19
Author: Gastón Kleiman <ga...@mesosphere.io>
Authored: Fri Aug 25 11:40:21 2017 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 25 11:40:21 2017 -0700
----------------------------------------------------------------------
.../containerizer/nvidia_gpu_isolator_tests.cpp | 92 ++++++++++++++++++++
1 file changed, 92 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/dd06684d/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
index 9a78ae6..f9b26bc 100644
--- a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
+++ b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
@@ -649,6 +649,98 @@ TEST_F(NvidiaGpuTest, ROOT_NVIDIA_GPU_VolumeShouldInject)
ASSERT_FALSE(volume->shouldInject(manifest.get()));
}
+
+// This test verifies that the DefaultExecutor is able to launch tasks
+// with restricted access to GPUs.
+// It launches a task with 1 GPU and verifies that a call to
+// `nvidia-smi` both succeeds and reports exactly 1 GPU available.
+TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_DefaultExecutorVerifyDeviceAccess)
+{
+ Try<Owned<cluster::Master>> master = StartMaster();
+ ASSERT_SOME(master);
+
+ // Turn on Nvidia GPU isolation.
+ // Assume at least one GPU is available for isolation.
+ slave::Flags flags = CreateSlaveFlags();
+ flags.isolation = "filesystem/linux,cgroups/devices,gpu/nvidia";
+ flags.resources = "cpus:1"; // To override the default with gpus:0.
+
+#ifndef USE_SSL_SOCKET
+ // Disable operator API authentication for the default executor. Executor
+ // authentication currently has SSL as a dependency, so we cannot require
+ // executors to authenticate with the agent operator API if Mesos was not
+ // built with SSL support.
+ flags.authenticate_http_readwrite = false;
+#endif // USE_SSL_SOCKET
+
+ Owned<MasterDetector> detector = master.get()->createDetector();
+ Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
+ ASSERT_SOME(slave);
+
+ MockScheduler sched;
+
+ FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+ frameworkInfo.add_capabilities()->set_type(
+ FrameworkInfo::Capability::GPU_RESOURCES);
+
+ MesosSchedulerDriver driver(
+ &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+ Future<FrameworkID> frameworkId;
+ EXPECT_CALL(sched, registered(&driver, _, _))
+ .WillOnce(FutureArg<1>(&frameworkId));
+
+ Future<vector<Offer>> offers;
+ EXPECT_CALL(sched, resourceOffers(_, _))
+ .WillOnce(FutureArg<1>(&offers))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ driver.start();
+
+ AWAIT_READY(frameworkId);
+
+ Resources resources = Resources::parse("cpus:0.1;mem:32;disk:32").get();
+
+ ExecutorInfo executorInfo;
+ executorInfo.set_type(ExecutorInfo::DEFAULT);
+ executorInfo.mutable_executor_id()->CopyFrom(DEFAULT_EXECUTOR_ID);
+ executorInfo.mutable_framework_id()->CopyFrom(frameworkId.get());
+ executorInfo.mutable_resources()->CopyFrom(resources);
+
+ AWAIT_READY(offers);
+ EXPECT_NE(0u, offers->size());
+
+ const Offer& offer = offers->front();
+ const SlaveID& slaveId = offer.slave_id();
+
+ TaskInfo taskInfo = createTask(
+ slaveId,
+ Resources::parse("cpus:0.1;mem:128;gpus:1").get(),
+ "NUM_GPUS=`nvidia-smi --list-gpus | wc -l`;\n"
+ "if [ \"$NUM_GPUS\" != \"1\" ]; then\n"
+ " exit 1;\n"
+ "fi");
+
+ TaskGroupInfo taskGroup = createTaskGroupInfo({taskInfo});
+
+ Future<TaskStatus> statusRunning, statusFinished;
+
+ EXPECT_CALL(sched, statusUpdate(_, _))
+ .WillOnce(FutureArg<1>(&statusRunning))
+ .WillOnce(FutureArg<1>(&statusFinished));
+
+ driver.acceptOffers({offer.id()}, {LAUNCH_GROUP(executorInfo, taskGroup)});
+
+ AWAIT_READY(statusRunning);
+ ASSERT_EQ(TASK_RUNNING, statusRunning->state());
+
+ AWAIT_READY(statusFinished);
+ ASSERT_EQ(TASK_FINISHED, statusFinished->state());
+
+ driver.stop();
+ driver.join();
+}
+
} // namespace tests {
} // namespace internal {
} // namespace mesos {
[3/3] mesos git commit: Added a test using CMD health checks +
DefaultExecutor w/ Docker image.
Posted by vi...@apache.org.
Added a test using CMD health checks + DefaultExecutor w/ Docker image.
Review: https://reviews.apache.org/r/61483/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/53a79b22
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/53a79b22
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/53a79b22
Branch: refs/heads/master
Commit: 53a79b229c7b0d493831855dec5afb21c1c78407
Parents: 1e304a0
Author: Gastón Kleiman <ga...@mesosphere.io>
Authored: Fri Aug 25 11:40:31 2017 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 25 11:40:31 2017 -0700
----------------------------------------------------------------------
src/tests/health_check_tests.cpp | 136 ++++++++++++++++++++++++++++++++++
1 file changed, 136 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/53a79b22/src/tests/health_check_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/health_check_tests.cpp b/src/tests/health_check_tests.cpp
index 2d7aefd..2c43241 100644
--- a/src/tests/health_check_tests.cpp
+++ b/src/tests/health_check_tests.cpp
@@ -2383,6 +2383,142 @@ TEST_F_TEMP_DISABLED_ON_WINDOWS(
}
}
+
+// Tests a healthy docker task via CMD health checks using the
+// DefaultExecutor.
+TEST_F_TEMP_DISABLED_ON_WINDOWS(
+ HealthCheckTest, DefaultExecutorWithDockerImageCommandHealthCheck)
+{
+ Try<Owned<cluster::Master>> master = StartMaster();
+ ASSERT_SOME(master);
+
+ slave::Flags flags = CreateSlaveFlags();
+#ifndef USE_SSL_SOCKET
+ // Disable operator API authentication for the default executor. Executor
+ // authentication currently has SSL as a dependency, so we cannot require
+ // executors to authenticate with the agent operator API if Mesos was not
+ // built with SSL support.
+ flags.authenticate_http_readwrite = false;
+
+ // Set permissive ACLs in the agent so that the local authorizer will be
+ // loaded and implicit executor authorization will be tested.
+ ACLs acls;
+ acls.set_permissive(true);
+
+ flags.acls = acls;
+#endif // USE_SSL_SOCKET
+
+ Fetcher fetcher(flags);
+
+ // We have to explicitly create a `Containerizer` in non-local mode,
+ // because `LaunchNestedContainerSession` (used by command health
+ // checks) tries to start a IO switchboard, which doesn't work in
+ // local mode yet.
+ Try<MesosContainerizer*> _containerizer =
+ MesosContainerizer::create(flags, false, &fetcher);
+
+ ASSERT_SOME(_containerizer);
+
+ Owned<slave::Containerizer> containerizer(_containerizer.get());
+ Owned<MasterDetector> detector = master.get()->createDetector();
+
+ Try<Owned<cluster::Slave>> agent =
+ StartSlave(detector.get(), containerizer.get(), flags);
+ ASSERT_SOME(agent);
+
+ MockScheduler sched;
+ MesosSchedulerDriver driver(
+ &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+ Future<FrameworkID> frameworkId;
+ EXPECT_CALL(sched, registered(&driver, _, _))
+ .WillOnce(FutureArg<1>(&frameworkId));
+
+ Future<vector<Offer>> offers;
+ EXPECT_CALL(sched, resourceOffers(&driver, _))
+ .WillOnce(FutureArg<1>(&offers))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ driver.start();
+
+ AWAIT_READY(frameworkId);
+
+ AWAIT_READY(offers);
+ EXPECT_NE(0u, offers.get().size());
+
+ Future<TaskStatus> statusRunning;
+ Future<TaskStatus> statusHealthy;
+
+ EXPECT_CALL(sched, statusUpdate(&driver, _))
+ .WillOnce(FutureArg<1>(&statusRunning))
+ .WillOnce(FutureArg<1>(&statusHealthy));
+
+ TaskInfo task = createTask(offers->front(), "sleep 120");
+
+ // TODO(tnachen): Use local image to test if possible.
+ ContainerInfo containerInfo;
+ containerInfo.set_type(ContainerInfo::MESOS);
+ containerInfo.mutable_docker()->set_image("alpine");
+
+ task.mutable_container()->CopyFrom(containerInfo);
+
+ HealthCheck healthCheck;
+
+ healthCheck.set_type(HealthCheck::COMMAND);
+ healthCheck.mutable_command()->set_value("exit $STATUS");
+ healthCheck.set_delay_seconds(0);
+ healthCheck.set_interval_seconds(0);
+ healthCheck.set_grace_period_seconds(0);
+
+ Environment::Variable* variable = healthCheck.mutable_command()->
+ mutable_environment()->mutable_variables()->Add();
+ variable->set_name("STATUS");
+ variable->set_value("0");
+
+ task.mutable_health_check()->CopyFrom(healthCheck);
+
+ Resources executorResources =
+ allocatedResources(Resources::parse("cpus:0.1;mem:32;disk:32").get(), "*");
+
+ task.mutable_resources()->CopyFrom(task.resources() - executorResources);
+
+ TaskGroupInfo taskGroup;
+ taskGroup.add_tasks()->CopyFrom(task);
+
+ ExecutorInfo executor;
+ executor.mutable_executor_id()->set_value("default");
+ executor.set_type(ExecutorInfo::DEFAULT);
+ executor.mutable_framework_id()->CopyFrom(frameworkId.get());
+ executor.mutable_resources()->CopyFrom(executorResources);
+ executor.mutable_shutdown_grace_period()->set_nanoseconds(Seconds(10).ns());
+
+ driver.acceptOffers(
+ {offers->front().id()}, {LAUNCH_GROUP(executor, taskGroup)});
+
+ AWAIT_READY(statusRunning);
+ EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
+
+ AWAIT_READY(statusHealthy);
+ EXPECT_EQ(TASK_RUNNING, statusHealthy.get().state());
+ EXPECT_EQ(
+ TaskStatus::REASON_TASK_HEALTH_CHECK_STATUS_UPDATED,
+ statusHealthy->reason());
+ EXPECT_TRUE(statusHealthy.get().has_healthy());
+ EXPECT_TRUE(statusHealthy.get().healthy());
+
+ Future<hashset<ContainerID>> containerIds = containerizer->containers();
+
+ AWAIT_READY(containerIds);
+
+ driver.stop();
+ driver.join();
+
+ // Cleanup all mesos launched containers.
+ foreach (const ContainerID& containerId, containerIds.get()) {
+ AWAIT_READY(containerizer->wait(containerId));
+ }
+}
+
} // namespace tests {
} // namespace internal {
} // namespace mesos {
[2/3] mesos git commit: Improved
`NvidiaGpuTest.ROOT_CGROUPS_NVIDIA_GPU_VerifyDeviceAccess`.
Posted by vi...@apache.org.
Improved `NvidiaGpuTest.ROOT_CGROUPS_NVIDIA_GPU_VerifyDeviceAccess`.
Change the test so that the agent offers as many GPUs as available on
the box instead of restricting it to 1. This way the test will fail if
there's a bug that makes the isolator give a task access to more GPUs
than what it was allocated.
Review: https://reviews.apache.org/r/61438/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/1e304a0b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/1e304a0b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/1e304a0b
Branch: refs/heads/master
Commit: 1e304a0b73f2ab43e9b59a2648446d1e8d7defa0
Parents: dd06684
Author: Gastón Kleiman <ga...@mesosphere.io>
Authored: Fri Aug 25 11:40:27 2017 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 25 11:40:27 2017 -0700
----------------------------------------------------------------------
src/tests/containerizer/nvidia_gpu_isolator_tests.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/1e304a0b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
index f9b26bc..f7a7158 100644
--- a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
+++ b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
@@ -91,8 +91,7 @@ TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_VerifyDeviceAccess)
// Assume at least one GPU is available for isolation.
slave::Flags flags = CreateSlaveFlags();
flags.isolation = "filesystem/linux,cgroups/devices,gpu/nvidia";
- flags.nvidia_gpu_devices = vector<unsigned int>({0u});
- flags.resources = "gpus:1";
+ flags.resources = "cpus:1"; // To override the default with gpus:0.
Owned<MasterDetector> detector = master.get()->createDetector();