You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@mesos.apache.org by "Jian Qiu (JIRA)" <ji...@apache.org> on 2016/11/17 08:37:58 UTC
[jira] [Created] (MESOS-6599) The disordered status update message from executor may cause agent exit

Jian Qiu created MESOS-6599:
-------------------------------

             Summary: The disordered status update message from executor may cause agent exit
                 Key: MESOS-6599
                 URL: https://issues.apache.org/jira/browse/MESOS-6599
             Project: Mesos
          Issue Type: Bug
          Components: slave
         Environment: CentOS 7.2/Ubuntu 16.04
            Reporter: Jian Qiu


The framework enables checkpointing, and the executor sends TASK_KILLED to the agent. After the agent acknowledges that status update, the executor sends TASK_LOST, which causes the agent to exit. This is due to the CHECK_READY(future) in Slave::___statusUpdate. It is not clear why a CHECK is needed here.

The test code is as follows:

{code}
Try<Owned<cluster::Master>> master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  Owned<MasterDetector> detector = master.get()->createDetector();
  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), &containerizer);
  ASSERT_SOME(slave);

  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
  frameworkInfo.set_checkpoint(true); // Enable checkpointing.

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);

  FrameworkID frameworkId;
  EXPECT_CALL(sched, registered(_, _, _))
    .WillOnce(SaveArg<1>(&frameworkId));

  Future<vector<Offer>> offers;
  EXPECT_CALL(sched, resourceOffers(_, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(_, _))
    .WillOnce(FutureArg<1>(&status));

  driver.start();

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());

  ExecutorDriver* execDriver;
  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  Future<StatusUpdateMessage> statusUpdateMessage =
  FUTURE_PROTOBUF(StatusUpdateMessage(), master.get()->pid, _);

  Future<Nothing> _statusUpdateAcknowledgement =
    FUTURE_DISPATCH(slave.get()->pid, &Slave::_statusUpdateAcknowledgement);

  vector<TaskInfo> tasks = createTasks(offers.get()[0]);
  driver.launchTasks(offers.get()[0].id(), tasks);

  AWAIT_READY(statusUpdateMessage);
  StatusUpdate update = statusUpdateMessage.get().update();

  AWAIT_READY(status);

  EXPECT_EQ(TASK_RUNNING, status.get().state());
  AWAIT_READY(_statusUpdateAcknowledgement);

  // driver.killTask(tasks[0].task_id());

  Future<Nothing> _statusUpdateAcknowledgement2 =
    FUTURE_DISPATCH(slave.get()->pid, &Slave::_statusUpdateAcknowledgement);

  TaskStatus status3 = status.get();
  status3.set_state(TASK_KILLED);

  execDriver->sendStatusUpdate(status3);
  AWAIT_READY(_statusUpdateAcknowledgement2);

  Future<Nothing> _statusUpdate =
    FUTURE_DISPATCH(slave.get()->pid, &Slave::___statusUpdate);

  TaskStatus status2 = status.get();
  status2.set_state(TASK_LOST);
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)