You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@mesos.apache.org by Benjamin Mahler <be...@gmail.com> on 2014/07/30 03:36:30 UTC
Re: git commit: Task health status change notifications
Not sure if it's related to this commit, but seems the GracePeriod test is
flaky now:
https://issues.apache.org/jira/browse/MESOS-1653
Could you help triage this ticket?
On Tue, Jul 29, 2014 at 3:47 PM, <nn...@apache.org> wrote:
> Repository: mesos
> Updated Branches:
> refs/heads/master 98557a7cf -> f66289831
>
>
> Task health status change notifications
>
> The reusable health check program added in #22579 emits TaskStatus
> messages when the task under supervision first becomes viable (when the
> task passes its first health check). It also emits a message when a
> task changes state from healthy to unhealthy.
>
> However, the scheduler should be notified for _every_ observed change in
> health status. It's easy to imagine cases where the scheduler wants to
> wait a while before killing an unhealthy task, but still be notified of
> status changes so that load balancers may be updated, etc. This patch
> therefore causes the scheduler to also be notified when an unhealthy
> task becomes healthy again.
>
> Review: https://reviews.apache.org/r/23966
>
>
> Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
> Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f6628983
> Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f6628983
> Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f6628983
>
> Branch: refs/heads/master
> Commit: f6628983165e4b0a2f44bb288ff87041f9e5e1bb
> Parents: 98557a7
> Author: Connor Doyle <co...@mesosphere.io>
> Authored: Tue Jul 29 15:46:45 2014 -0700
> Committer: Niklas Q. Nielsen <ni...@mesosphere.io>
> Committed: Tue Jul 29 15:46:45 2014 -0700
>
> ----------------------------------------------------------------------
> src/health-check/main.cpp | 5 +-
> src/tests/health_check_tests.cpp | 87 +++++++++++++++++++++++++++++++++++
> 2 files changed, 91 insertions(+), 1 deletion(-)
> ----------------------------------------------------------------------
>
>
>
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/health-check/main.cpp
> ----------------------------------------------------------------------
> diff --git a/src/health-check/main.cpp b/src/health-check/main.cpp
> index 95d881e..10d57a0 100644
> --- a/src/health-check/main.cpp
> +++ b/src/health-check/main.cpp
> @@ -121,7 +121,10 @@ private:
> void success()
> {
> VLOG(1) << "Check passed";
> - if (initializing) {
> +
> + // Send a healthy status update on the first success,
> + // and on the first success following failure(s).
> + if (initializing || consecutiveFailures > 0) {
> TaskHealthStatus taskHealthStatus;
> taskHealthStatus.set_healthy(true);
> taskHealthStatus.mutable_task_id()->CopyFrom(taskID);
>
>
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/tests/health_check_tests.cpp
> ----------------------------------------------------------------------
> diff --git a/src/tests/health_check_tests.cpp
> b/src/tests/health_check_tests.cpp
> index aa5b78b..6c54ea8 100644
> --- a/src/tests/health_check_tests.cpp
> +++ b/src/tests/health_check_tests.cpp
> @@ -174,6 +174,93 @@ TEST_F(HealthCheckTest, HealthyTask)
> Shutdown();
> }
>
> +// Testing health status change reporting to scheduler.
> +TEST_F(HealthCheckTest, HealthStatusChange)
> +{
> + Try<PID<Master> > master = StartMaster();
> + ASSERT_SOME(master);
> +
> + slave::Flags flags = CreateSlaveFlags();
> + flags.isolation = "posix/cpu,posix/mem";
> +
> + Try<MesosContainerizer*> containerizer =
> + MesosContainerizer::create(flags, false);
> + CHECK_SOME(containerizer);
> +
> + Try<PID<Slave> > slave = StartSlave(containerizer.get());
> + ASSERT_SOME(slave);
> +
> + MockScheduler sched;
> + MesosSchedulerDriver driver(
> + &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
> +
> + EXPECT_CALL(sched, registered(&driver, _, _));
> +
> + Future<vector<Offer> > offers;
> + EXPECT_CALL(sched, resourceOffers(&driver, _))
> + .WillOnce(FutureArg<1>(&offers))
> + .WillRepeatedly(Return()); // Ignore subsequent offers.
> +
> + driver.start();
> +
> + AWAIT_READY(offers);
> + EXPECT_NE(0u, offers.get().size());
> +
> + // Create a temporary file.
> + Try<string> temporaryPath = os::mktemp();
> + ASSERT_SOME(temporaryPath);
> + string tmpPath = temporaryPath.get();
> +
> + // This command fails every other invocation.
> + // For all runs i in Nat0, the following case i % 2 applies:
> + //
> + // Case 0:
> + // - Remove the temporary file.
> + //
> + // Case 1:
> + // - Attempt to remove the nonexistent temporary file.
> + // - Create the temporary file.
> + // - Exit with a non-zero status.
> + string alt = "rm " + tmpPath + " || (touch " + tmpPath + " && exit 1)";
> +
> + vector<TaskInfo> tasks = populateTasks(
> + "sleep 20", alt, offers.get()[0], 0, 3);
> +
> + Future<TaskStatus> statusRunning;
> + Future<TaskStatus> statusHealth1;
> + Future<TaskStatus> statusHealth2;
> + Future<TaskStatus> statusHealth3;
> +
> + EXPECT_CALL(sched, statusUpdate(&driver, _))
> + .WillOnce(FutureArg<1>(&statusRunning))
> + .WillOnce(FutureArg<1>(&statusHealth1))
> + .WillOnce(FutureArg<1>(&statusHealth2))
> + .WillOnce(FutureArg<1>(&statusHealth3));
> +
> + driver.launchTasks(offers.get()[0].id(), tasks);
> +
> + AWAIT_READY(statusRunning);
> + EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
> +
> + AWAIT_READY(statusHealth1);
> + EXPECT_EQ(TASK_RUNNING, statusHealth1.get().state());
> + EXPECT_TRUE(statusHealth1.get().healthy());
> +
> + AWAIT_READY(statusHealth2);
> + EXPECT_EQ(TASK_RUNNING, statusHealth2.get().state());
> + EXPECT_FALSE(statusHealth2.get().healthy());
> +
> + AWAIT_READY(statusHealth3);
> + EXPECT_EQ(TASK_RUNNING, statusHealth3.get().state());
> + EXPECT_TRUE(statusHealth3.get().healthy());
> +
> + os::rm(tmpPath); // Clean up the temporary file.
> +
> + driver.stop();
> + driver.join();
> +
> + Shutdown();
> +}
>
> // Testing killing task after number of consecutive failures.
> // Temporarily disabled due to MESOS-1613.
>
>
Re: git commit: Task health status change notifications
Posted by Niklas Nielsen <ni...@mesosphere.io>.
Hi Ben,
Sorry about that; I am on it with Connor and followed up with a couple of
questions in the ticket.
Niklas
On 29 July 2014 18:36, Benjamin Mahler <be...@gmail.com> wrote:
> Not sure if it's related to this commit, but seems the GracePeriod test is
> flaky now:
>
> https://issues.apache.org/jira/browse/MESOS-1653
>
> Could you help triage this ticket?
>
>
> On Tue, Jul 29, 2014 at 3:47 PM, <nn...@apache.org> wrote:
>
> > Repository: mesos
> > Updated Branches:
> > refs/heads/master 98557a7cf -> f66289831
> >
> >
> > Task health status change notifications
> >
> > The reusable health check program added in #22579 emits TaskStatus
> > messages when the task under supervision first becomes viable (when the
> > task passes its first health check). It also emits a message when a
> > task changes state from healthy to unhealthy.
> >
> > However, the scheduler should be notified for _every_ observed change in
> > health status. It's easy to imagine cases where the scheduler wants to
> > wait a while before killing an unhealthy task, but still be notified of
> > status changes so that load balancers may be updated, etc. This patch
> > therefore causes the scheduler to also be notified when an unhealthy
> > task becomes healthy again.
> >
> > Review: https://reviews.apache.org/r/23966
> >
> >
> > Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
> > Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f6628983
> > Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f6628983
> > Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f6628983
> >
> > Branch: refs/heads/master
> > Commit: f6628983165e4b0a2f44bb288ff87041f9e5e1bb
> > Parents: 98557a7
> > Author: Connor Doyle <co...@mesosphere.io>
> > Authored: Tue Jul 29 15:46:45 2014 -0700
> > Committer: Niklas Q. Nielsen <ni...@mesosphere.io>
> > Committed: Tue Jul 29 15:46:45 2014 -0700
> >
> > ----------------------------------------------------------------------
> > src/health-check/main.cpp | 5 +-
> > src/tests/health_check_tests.cpp | 87
> +++++++++++++++++++++++++++++++++++
> > 2 files changed, 91 insertions(+), 1 deletion(-)
> > ----------------------------------------------------------------------
> >
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/health-check/main.cpp
> > ----------------------------------------------------------------------
> > diff --git a/src/health-check/main.cpp b/src/health-check/main.cpp
> > index 95d881e..10d57a0 100644
> > --- a/src/health-check/main.cpp
> > +++ b/src/health-check/main.cpp
> > @@ -121,7 +121,10 @@ private:
> > void success()
> > {
> > VLOG(1) << "Check passed";
> > - if (initializing) {
> > +
> > + // Send a healthy status update on the first success,
> > + // and on the first success following failure(s).
> > + if (initializing || consecutiveFailures > 0) {
> > TaskHealthStatus taskHealthStatus;
> > taskHealthStatus.set_healthy(true);
> > taskHealthStatus.mutable_task_id()->CopyFrom(taskID);
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/tests/health_check_tests.cpp
> > ----------------------------------------------------------------------
> > diff --git a/src/tests/health_check_tests.cpp
> > b/src/tests/health_check_tests.cpp
> > index aa5b78b..6c54ea8 100644
> > --- a/src/tests/health_check_tests.cpp
> > +++ b/src/tests/health_check_tests.cpp
> > @@ -174,6 +174,93 @@ TEST_F(HealthCheckTest, HealthyTask)
> > Shutdown();
> > }
> >
> > +// Testing health status change reporting to scheduler.
> > +TEST_F(HealthCheckTest, HealthStatusChange)
> > +{
> > + Try<PID<Master> > master = StartMaster();
> > + ASSERT_SOME(master);
> > +
> > + slave::Flags flags = CreateSlaveFlags();
> > + flags.isolation = "posix/cpu,posix/mem";
> > +
> > + Try<MesosContainerizer*> containerizer =
> > + MesosContainerizer::create(flags, false);
> > + CHECK_SOME(containerizer);
> > +
> > + Try<PID<Slave> > slave = StartSlave(containerizer.get());
> > + ASSERT_SOME(slave);
> > +
> > + MockScheduler sched;
> > + MesosSchedulerDriver driver(
> > + &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
> > +
> > + EXPECT_CALL(sched, registered(&driver, _, _));
> > +
> > + Future<vector<Offer> > offers;
> > + EXPECT_CALL(sched, resourceOffers(&driver, _))
> > + .WillOnce(FutureArg<1>(&offers))
> > + .WillRepeatedly(Return()); // Ignore subsequent offers.
> > +
> > + driver.start();
> > +
> > + AWAIT_READY(offers);
> > + EXPECT_NE(0u, offers.get().size());
> > +
> > + // Create a temporary file.
> > + Try<string> temporaryPath = os::mktemp();
> > + ASSERT_SOME(temporaryPath);
> > + string tmpPath = temporaryPath.get();
> > +
> > + // This command fails every other invocation.
> > + // For all runs i in Nat0, the following case i % 2 applies:
> > + //
> > + // Case 0:
> > + // - Remove the temporary file.
> > + //
> > + // Case 1:
> > + // - Attempt to remove the nonexistent temporary file.
> > + // - Create the temporary file.
> > + // - Exit with a non-zero status.
> > + string alt = "rm " + tmpPath + " || (touch " + tmpPath + " && exit
> 1)";
> > +
> > + vector<TaskInfo> tasks = populateTasks(
> > + "sleep 20", alt, offers.get()[0], 0, 3);
> > +
> > + Future<TaskStatus> statusRunning;
> > + Future<TaskStatus> statusHealth1;
> > + Future<TaskStatus> statusHealth2;
> > + Future<TaskStatus> statusHealth3;
> > +
> > + EXPECT_CALL(sched, statusUpdate(&driver, _))
> > + .WillOnce(FutureArg<1>(&statusRunning))
> > + .WillOnce(FutureArg<1>(&statusHealth1))
> > + .WillOnce(FutureArg<1>(&statusHealth2))
> > + .WillOnce(FutureArg<1>(&statusHealth3));
> > +
> > + driver.launchTasks(offers.get()[0].id(), tasks);
> > +
> > + AWAIT_READY(statusRunning);
> > + EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
> > +
> > + AWAIT_READY(statusHealth1);
> > + EXPECT_EQ(TASK_RUNNING, statusHealth1.get().state());
> > + EXPECT_TRUE(statusHealth1.get().healthy());
> > +
> > + AWAIT_READY(statusHealth2);
> > + EXPECT_EQ(TASK_RUNNING, statusHealth2.get().state());
> > + EXPECT_FALSE(statusHealth2.get().healthy());
> > +
> > + AWAIT_READY(statusHealth3);
> > + EXPECT_EQ(TASK_RUNNING, statusHealth3.get().state());
> > + EXPECT_TRUE(statusHealth3.get().healthy());
> > +
> > + os::rm(tmpPath); // Clean up the temporary file.
> > +
> > + driver.stop();
> > + driver.join();
> > +
> > + Shutdown();
> > +}
> >
> > // Testing killing task after number of consecutive failures.
> > // Temporarily disabled due to MESOS-1613.
> >
> >
>