You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@aurora.apache.org by dm...@apache.org on 2018/01/30 22:05:45 UTC

aurora git commit: Fix infinite loop in Task State Machine due to TASK_UNKNOWN handling

Repository: aurora
Updated Branches:
  refs/heads/master 007212d80 -> 8cf3e3b84


Fix infinite loop in Task State Machine due to TASK_UNKNOWN handling

This patch cleans up the logic. The two main changes:

1) Do not allow ASSIGNED -> PARTITIONED. This is not really related to this bug, but I found this logic error during debugging. ASSIGNED is a transient state and is subject to the transient task timeout in the Scheduler, so we should not attempt to move to PARTITIONED during that window.
2) Do not try to kill tasks we think are terminal when Mesos tells us they are unknown. Originally we did this because "manageTerminalTasks" is also used for restarting tasks - but in both cases it never makes sense to respond  to "I don't know about that task" with a request to kill it.

Bugs closed: AURORA-1966

Reviewed at https://reviews.apache.org/r/65339/


Project: http://git-wip-us.apache.org/repos/asf/aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/8cf3e3b8
Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/8cf3e3b8
Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/8cf3e3b8

Branch: refs/heads/master
Commit: 8cf3e3b840463257c198d98697b04c5fd99b3d21
Parents: 007212d
Author: David McLaughlin <da...@dmclaughlin.com>
Authored: Tue Jan 30 14:02:09 2018 -0800
Committer: David McLaughlin <da...@dmclaughlin.com>
Committed: Tue Jan 30 14:02:09 2018 -0800

----------------------------------------------------------------------
 .../scheduler/state/TaskStateMachine.java       | 10 +++-------
 .../scheduler/state/TaskStateMachineTest.java   | 21 --------------------
 2 files changed, 3 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/aurora/blob/8cf3e3b8/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java b/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java
index b8ba5da..f325bf4 100644
--- a/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java
+++ b/src/main/java/org/apache/aurora/scheduler/state/TaskStateMachine.java
@@ -178,12 +178,8 @@ class TaskStateMachine {
               // order to mark the task as terminal and unblock any operations that depend on
               // kills (e.g. job updates). Just sending a KILL signal alone is insufficient,
               // as any partitioned task will be on an agent that is unreachable.
-              if (transition.getTo() == PARTITIONED) {
-                if (transition.getFrom() == KILLING) {
-                  addFollowup(TRANSITION_TO_LOST);
-                } else {
-                  addFollowup(KILL);
-                }
+              if (transition.getFrom() == KILLING && transition.getTo() == PARTITIONED) {
+                addFollowup(TRANSITION_TO_LOST);
               }
             })
             // Kill a task that we believe to be terminated when an attempt is made to revive.
@@ -312,7 +308,7 @@ class TaskStateMachine {
         .addState(
             Rule.from(ASSIGNED)
                 .to(STARTING, RUNNING, FINISHED, FAILED, RESTARTING, DRAINING,
-                    KILLED, KILLING, LOST, PREEMPTING, PARTITIONED)
+                    KILLED, KILLING, LOST, PREEMPTING)
                 .withCallback(
                     transition -> {
                       switch (transition.getTo()) {

http://git-wip-us.apache.org/repos/asf/aurora/blob/8cf3e3b8/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java b/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java
index 3d98fe6..050a46a 100644
--- a/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java
+++ b/src/test/java/org/apache/aurora/scheduler/state/TaskStateMachineTest.java
@@ -404,18 +404,15 @@ public class TaskStateMachineTest {
           .put(new TestCase(false, INIT, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, INIT, STARTING), ILLEGAL_KILL)
           .put(new TestCase(false, INIT, RUNNING), ILLEGAL_KILL)
-          .put(new TestCase(false, INIT, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, THROTTLED, PENDING), SAVE)
           .put(new TestCase(true, THROTTLED, KILLING), DELETE_TASK)
           .put(new TestCase(false, THROTTLED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, THROTTLED, STARTING), ILLEGAL_KILL)
           .put(new TestCase(false, THROTTLED, RUNNING), ILLEGAL_KILL)
-          .put(new TestCase(false, THROTTLED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, PENDING, ASSIGNED), SAVE)
           .put(new TestCase(false, PENDING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, PENDING, STARTING), ILLEGAL_KILL)
           .put(new TestCase(false, PENDING, RUNNING), ILLEGAL_KILL)
-          .put(new TestCase(false, PENDING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, PENDING, KILLING), DELETE_TASK)
           .put(new TestCase(false, ASSIGNED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, ASSIGNED, STARTING), SAVE)
@@ -430,8 +427,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, ASSIGNED, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, ASSIGNED, KILLING), SAVE_AND_KILL)
           .put(new TestCase(true, ASSIGNED, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, ASSIGNED, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, ASSIGNED, PARTITIONED), SAVE)
           .put(new TestCase(false, STARTING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, STARTING, STARTING), ILLEGAL_KILL)
           .put(new TestCase(true, STARTING, RUNNING), SAVE)
@@ -444,7 +439,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, STARTING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, STARTING, KILLING), SAVE_AND_KILL)
           .put(new TestCase(true, STARTING, LOST), SAVE_AND_RESCHEDULE)
-          .put(new TestCase(false, STARTING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, STARTING, PARTITIONED), SAVE)
           .put(new TestCase(false, RUNNING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, RUNNING, STARTING), ILLEGAL_KILL)
@@ -457,7 +451,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, RUNNING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, RUNNING, KILLING), SAVE_AND_KILL)
           .put(new TestCase(true, RUNNING, LOST), SAVE_AND_RESCHEDULE)
-          .put(new TestCase(false, RUNNING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, RUNNING, PARTITIONED), SAVE)
           .put(new TestCase(true, FINISHED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, FINISHED, ASSIGNED), ILLEGAL_KILL)
@@ -466,8 +459,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, FINISHED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(false, FINISHED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(true, FINISHED, DELETED), DELETE_TASK)
-          .put(new TestCase(false, FINISHED, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, FINISHED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, PREEMPTING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, PREEMPTING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, PREEMPTING, STARTING), ILLEGAL_KILL)
@@ -479,7 +470,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, PREEMPTING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, PREEMPTING, KILLING), SAVE)
           .put(new TestCase(true, PREEMPTING, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, PREEMPTING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, PREEMPTING, PARTITIONED), TRANSITION_TO_LOST)
           .put(new TestCase(true, RESTARTING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, RESTARTING, ASSIGNED), ILLEGAL_KILL)
@@ -492,7 +482,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, RESTARTING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, RESTARTING, KILLING), SAVE)
           .put(new TestCase(true, RESTARTING, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, RESTARTING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, RESTARTING, PARTITIONED), TRANSITION_TO_LOST)
           .put(new TestCase(true, DRAINING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, DRAINING, ASSIGNED), ILLEGAL_KILL)
@@ -505,7 +494,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, DRAINING, KILLED), SAVE_AND_RESCHEDULE)
           .put(new TestCase(true, DRAINING, KILLING), SAVE)
           .put(new TestCase(true, DRAINING, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, DRAINING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, DRAINING, PARTITIONED), TRANSITION_TO_LOST)
           .put(new TestCase(true, FAILED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, FAILED, ASSIGNED), ILLEGAL_KILL)
@@ -514,8 +502,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, FAILED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(false, FAILED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(true, FAILED, DELETED), DELETE_TASK)
-          .put(new TestCase(false, FAILED, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, FAILED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, KILLED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLED, STARTING), ILLEGAL_KILL)
@@ -523,8 +509,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, KILLED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(false, KILLED, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(true, KILLED, DELETED), DELETE_TASK)
-          .put(new TestCase(false, KILLED, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, KILLED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, KILLING, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLING, STARTING), ILLEGAL_KILL)
@@ -536,7 +520,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, KILLING, KILLED), SAVE)
           .put(new TestCase(true, KILLING, LOST), SAVE)
           .put(new TestCase(true, KILLING, DELETED), DELETE_TASK)
-          .put(new TestCase(false, KILLING, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, KILLING, PARTITIONED), TRANSITION_TO_LOST)
           .put(new TestCase(false, PARTITIONED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, PARTITIONED, ASSIGNED), SAVE)
@@ -551,7 +534,6 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, PARTITIONED, FAILED), RECORD_FAILURE)
           .put(new TestCase(true, PARTITIONED, FINISHED), SAVE)
           .put(new TestCase(true, PARTITIONED, LOST), SAVE_KILL_AND_RESCHEDULE)
-          .put(new TestCase(false, PARTITIONED, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(true, LOST, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, LOST, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(true, LOST, STARTING), ILLEGAL_KILL)
@@ -559,12 +541,9 @@ public class TaskStateMachineTest {
           .put(new TestCase(true, LOST, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(false, LOST, RUNNING), ILLEGAL_KILL)
           .put(new TestCase(true, LOST, DELETED), DELETE_TASK)
-          .put(new TestCase(false, LOST, PARTITIONED), ILLEGAL_KILL)
-          .put(new TestCase(true, LOST, PARTITIONED), ILLEGAL_KILL)
           .put(new TestCase(false, DELETED, ASSIGNED), ILLEGAL_KILL)
           .put(new TestCase(false, DELETED, STARTING), ILLEGAL_KILL)
           .put(new TestCase(false, DELETED, RUNNING), ILLEGAL_KILL)
-          .put(new TestCase(false, DELETED, PARTITIONED), ILLEGAL_KILL)
           .build();
 
   @Test