You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@mesos.apache.org by Benjamin Mahler <be...@gmail.com> on 2014/07/30 03:36:30 UTC

Re: git commit: Task health status change notifications

Not sure if it's related to this commit, but seems the GracePeriod test is
flaky now:

https://issues.apache.org/jira/browse/MESOS-1653

Could you help triage this ticket?


On Tue, Jul 29, 2014 at 3:47 PM, <nn...@apache.org> wrote:

> Repository: mesos
> Updated Branches:
>   refs/heads/master 98557a7cf -> f66289831
>
>
> Task health status change notifications
>
> The reusable health check program added in #22579 emits TaskStatus
> messages when the task under supervision first becomes viable (when the
> task passes its first health check).  It also emits a message when a
> task changes state from healthy to unhealthy.
>
> However, the scheduler should be notified for _every_ observed change in
> health status.  It's easy to imagine cases where the scheduler wants to
> wait a while before killing an unhealthy task, but still be notified of
> status changes so that load balancers may be updated, etc.  This patch
> therefore causes the scheduler to also be notified when an unhealthy
> task becomes healthy again.
>
> Review: https://reviews.apache.org/r/23966
>
>
> Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
> Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f6628983
> Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f6628983
> Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f6628983
>
> Branch: refs/heads/master
> Commit: f6628983165e4b0a2f44bb288ff87041f9e5e1bb
> Parents: 98557a7
> Author: Connor Doyle <co...@mesosphere.io>
> Authored: Tue Jul 29 15:46:45 2014 -0700
> Committer: Niklas Q. Nielsen <ni...@mesosphere.io>
> Committed: Tue Jul 29 15:46:45 2014 -0700
>
> ----------------------------------------------------------------------
>  src/health-check/main.cpp        |  5 +-
>  src/tests/health_check_tests.cpp | 87 +++++++++++++++++++++++++++++++++++
>  2 files changed, 91 insertions(+), 1 deletion(-)
> ----------------------------------------------------------------------
>
>
>
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/health-check/main.cpp
> ----------------------------------------------------------------------
> diff --git a/src/health-check/main.cpp b/src/health-check/main.cpp
> index 95d881e..10d57a0 100644
> --- a/src/health-check/main.cpp
> +++ b/src/health-check/main.cpp
> @@ -121,7 +121,10 @@ private:
>    void success()
>    {
>      VLOG(1) << "Check passed";
> -    if (initializing) {
> +
> +    // Send a healthy status update on the first success,
> +    // and on the first success following failure(s).
> +    if (initializing || consecutiveFailures > 0) {
>        TaskHealthStatus taskHealthStatus;
>        taskHealthStatus.set_healthy(true);
>        taskHealthStatus.mutable_task_id()->CopyFrom(taskID);
>
>
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/tests/health_check_tests.cpp
> ----------------------------------------------------------------------
> diff --git a/src/tests/health_check_tests.cpp
> b/src/tests/health_check_tests.cpp
> index aa5b78b..6c54ea8 100644
> --- a/src/tests/health_check_tests.cpp
> +++ b/src/tests/health_check_tests.cpp
> @@ -174,6 +174,93 @@ TEST_F(HealthCheckTest, HealthyTask)
>    Shutdown();
>  }
>
> +// Testing health status change reporting to scheduler.
> +TEST_F(HealthCheckTest, HealthStatusChange)
> +{
> +  Try<PID<Master> > master = StartMaster();
> +  ASSERT_SOME(master);
> +
> +  slave::Flags flags = CreateSlaveFlags();
> +  flags.isolation = "posix/cpu,posix/mem";
> +
> +  Try<MesosContainerizer*> containerizer =
> +    MesosContainerizer::create(flags, false);
> +  CHECK_SOME(containerizer);
> +
> +  Try<PID<Slave> > slave = StartSlave(containerizer.get());
> +  ASSERT_SOME(slave);
> +
> +  MockScheduler sched;
> +  MesosSchedulerDriver driver(
> +      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
> +
> +  EXPECT_CALL(sched, registered(&driver, _, _));
> +
> +  Future<vector<Offer> > offers;
> +  EXPECT_CALL(sched, resourceOffers(&driver, _))
> +    .WillOnce(FutureArg<1>(&offers))
> +    .WillRepeatedly(Return()); // Ignore subsequent offers.
> +
> +  driver.start();
> +
> +  AWAIT_READY(offers);
> +  EXPECT_NE(0u, offers.get().size());
> +
> +  // Create a temporary file.
> +  Try<string> temporaryPath = os::mktemp();
> +  ASSERT_SOME(temporaryPath);
> +  string tmpPath = temporaryPath.get();
> +
> +  // This command fails every other invocation.
> +  // For all runs i in Nat0, the following case i % 2 applies:
> +  //
> +  // Case 0:
> +  //   - Remove the temporary file.
> +  //
> +  // Case 1:
> +  //   - Attempt to remove the nonexistent temporary file.
> +  //   - Create the temporary file.
> +  //   - Exit with a non-zero status.
> +  string alt = "rm " + tmpPath + " || (touch " + tmpPath + " && exit 1)";
> +
> +  vector<TaskInfo> tasks = populateTasks(
> +      "sleep 20", alt, offers.get()[0], 0, 3);
> +
> +  Future<TaskStatus> statusRunning;
> +  Future<TaskStatus> statusHealth1;
> +  Future<TaskStatus> statusHealth2;
> +  Future<TaskStatus> statusHealth3;
> +
> +  EXPECT_CALL(sched, statusUpdate(&driver, _))
> +    .WillOnce(FutureArg<1>(&statusRunning))
> +    .WillOnce(FutureArg<1>(&statusHealth1))
> +    .WillOnce(FutureArg<1>(&statusHealth2))
> +    .WillOnce(FutureArg<1>(&statusHealth3));
> +
> +  driver.launchTasks(offers.get()[0].id(), tasks);
> +
> +  AWAIT_READY(statusRunning);
> +  EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
> +
> +  AWAIT_READY(statusHealth1);
> +  EXPECT_EQ(TASK_RUNNING, statusHealth1.get().state());
> +  EXPECT_TRUE(statusHealth1.get().healthy());
> +
> +  AWAIT_READY(statusHealth2);
> +  EXPECT_EQ(TASK_RUNNING, statusHealth2.get().state());
> +  EXPECT_FALSE(statusHealth2.get().healthy());
> +
> +  AWAIT_READY(statusHealth3);
> +  EXPECT_EQ(TASK_RUNNING, statusHealth3.get().state());
> +  EXPECT_TRUE(statusHealth3.get().healthy());
> +
> +  os::rm(tmpPath); // Clean up the temporary file.
> +
> +  driver.stop();
> +  driver.join();
> +
> +  Shutdown();
> +}
>
>  // Testing killing task after number of consecutive failures.
>  // Temporarily disabled due to MESOS-1613.
>
>

Re: git commit: Task health status change notifications

Posted by Niklas Nielsen <ni...@mesosphere.io>.
Hi Ben,

Sorry about that; I am on it with Connor and followed up with a couple of
questions in the ticket.

Niklas


On 29 July 2014 18:36, Benjamin Mahler <be...@gmail.com> wrote:

> Not sure if it's related to this commit, but seems the GracePeriod test is
> flaky now:
>
> https://issues.apache.org/jira/browse/MESOS-1653
>
> Could you help triage this ticket?
>
>
> On Tue, Jul 29, 2014 at 3:47 PM, <nn...@apache.org> wrote:
>
> > Repository: mesos
> > Updated Branches:
> >   refs/heads/master 98557a7cf -> f66289831
> >
> >
> > Task health status change notifications
> >
> > The reusable health check program added in #22579 emits TaskStatus
> > messages when the task under supervision first becomes viable (when the
> > task passes its first health check).  It also emits a message when a
> > task changes state from healthy to unhealthy.
> >
> > However, the scheduler should be notified for _every_ observed change in
> > health status.  It's easy to imagine cases where the scheduler wants to
> > wait a while before killing an unhealthy task, but still be notified of
> > status changes so that load balancers may be updated, etc.  This patch
> > therefore causes the scheduler to also be notified when an unhealthy
> > task becomes healthy again.
> >
> > Review: https://reviews.apache.org/r/23966
> >
> >
> > Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
> > Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f6628983
> > Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f6628983
> > Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f6628983
> >
> > Branch: refs/heads/master
> > Commit: f6628983165e4b0a2f44bb288ff87041f9e5e1bb
> > Parents: 98557a7
> > Author: Connor Doyle <co...@mesosphere.io>
> > Authored: Tue Jul 29 15:46:45 2014 -0700
> > Committer: Niklas Q. Nielsen <ni...@mesosphere.io>
> > Committed: Tue Jul 29 15:46:45 2014 -0700
> >
> > ----------------------------------------------------------------------
> >  src/health-check/main.cpp        |  5 +-
> >  src/tests/health_check_tests.cpp | 87
> +++++++++++++++++++++++++++++++++++
> >  2 files changed, 91 insertions(+), 1 deletion(-)
> > ----------------------------------------------------------------------
> >
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/health-check/main.cpp
> > ----------------------------------------------------------------------
> > diff --git a/src/health-check/main.cpp b/src/health-check/main.cpp
> > index 95d881e..10d57a0 100644
> > --- a/src/health-check/main.cpp
> > +++ b/src/health-check/main.cpp
> > @@ -121,7 +121,10 @@ private:
> >    void success()
> >    {
> >      VLOG(1) << "Check passed";
> > -    if (initializing) {
> > +
> > +    // Send a healthy status update on the first success,
> > +    // and on the first success following failure(s).
> > +    if (initializing || consecutiveFailures > 0) {
> >        TaskHealthStatus taskHealthStatus;
> >        taskHealthStatus.set_healthy(true);
> >        taskHealthStatus.mutable_task_id()->CopyFrom(taskID);
> >
> >
> >
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/tests/health_check_tests.cpp
> > ----------------------------------------------------------------------
> > diff --git a/src/tests/health_check_tests.cpp
> > b/src/tests/health_check_tests.cpp
> > index aa5b78b..6c54ea8 100644
> > --- a/src/tests/health_check_tests.cpp
> > +++ b/src/tests/health_check_tests.cpp
> > @@ -174,6 +174,93 @@ TEST_F(HealthCheckTest, HealthyTask)
> >    Shutdown();
> >  }
> >
> > +// Testing health status change reporting to scheduler.
> > +TEST_F(HealthCheckTest, HealthStatusChange)
> > +{
> > +  Try<PID<Master> > master = StartMaster();
> > +  ASSERT_SOME(master);
> > +
> > +  slave::Flags flags = CreateSlaveFlags();
> > +  flags.isolation = "posix/cpu,posix/mem";
> > +
> > +  Try<MesosContainerizer*> containerizer =
> > +    MesosContainerizer::create(flags, false);
> > +  CHECK_SOME(containerizer);
> > +
> > +  Try<PID<Slave> > slave = StartSlave(containerizer.get());
> > +  ASSERT_SOME(slave);
> > +
> > +  MockScheduler sched;
> > +  MesosSchedulerDriver driver(
> > +      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
> > +
> > +  EXPECT_CALL(sched, registered(&driver, _, _));
> > +
> > +  Future<vector<Offer> > offers;
> > +  EXPECT_CALL(sched, resourceOffers(&driver, _))
> > +    .WillOnce(FutureArg<1>(&offers))
> > +    .WillRepeatedly(Return()); // Ignore subsequent offers.
> > +
> > +  driver.start();
> > +
> > +  AWAIT_READY(offers);
> > +  EXPECT_NE(0u, offers.get().size());
> > +
> > +  // Create a temporary file.
> > +  Try<string> temporaryPath = os::mktemp();
> > +  ASSERT_SOME(temporaryPath);
> > +  string tmpPath = temporaryPath.get();
> > +
> > +  // This command fails every other invocation.
> > +  // For all runs i in Nat0, the following case i % 2 applies:
> > +  //
> > +  // Case 0:
> > +  //   - Remove the temporary file.
> > +  //
> > +  // Case 1:
> > +  //   - Attempt to remove the nonexistent temporary file.
> > +  //   - Create the temporary file.
> > +  //   - Exit with a non-zero status.
> > +  string alt = "rm " + tmpPath + " || (touch " + tmpPath + " && exit
> 1)";
> > +
> > +  vector<TaskInfo> tasks = populateTasks(
> > +      "sleep 20", alt, offers.get()[0], 0, 3);
> > +
> > +  Future<TaskStatus> statusRunning;
> > +  Future<TaskStatus> statusHealth1;
> > +  Future<TaskStatus> statusHealth2;
> > +  Future<TaskStatus> statusHealth3;
> > +
> > +  EXPECT_CALL(sched, statusUpdate(&driver, _))
> > +    .WillOnce(FutureArg<1>(&statusRunning))
> > +    .WillOnce(FutureArg<1>(&statusHealth1))
> > +    .WillOnce(FutureArg<1>(&statusHealth2))
> > +    .WillOnce(FutureArg<1>(&statusHealth3));
> > +
> > +  driver.launchTasks(offers.get()[0].id(), tasks);
> > +
> > +  AWAIT_READY(statusRunning);
> > +  EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
> > +
> > +  AWAIT_READY(statusHealth1);
> > +  EXPECT_EQ(TASK_RUNNING, statusHealth1.get().state());
> > +  EXPECT_TRUE(statusHealth1.get().healthy());
> > +
> > +  AWAIT_READY(statusHealth2);
> > +  EXPECT_EQ(TASK_RUNNING, statusHealth2.get().state());
> > +  EXPECT_FALSE(statusHealth2.get().healthy());
> > +
> > +  AWAIT_READY(statusHealth3);
> > +  EXPECT_EQ(TASK_RUNNING, statusHealth3.get().state());
> > +  EXPECT_TRUE(statusHealth3.get().healthy());
> > +
> > +  os::rm(tmpPath); // Clean up the temporary file.
> > +
> > +  driver.stop();
> > +  driver.join();
> > +
> > +  Shutdown();
> > +}
> >
> >  // Testing killing task after number of consecutive failures.
> >  // Temporarily disabled due to MESOS-1613.
> >
> >
>