You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@helix.apache.org by jx...@apache.org on 2018/10/29 17:50:40 UTC

[2/5] helix git commit: [HELIX-763] Task:Ignore tasks whose workflow and job are inactive

[HELIX-763] Task:Ignore tasks whose workflow and job are inactive

Manual testing revealed that some tasks in the INIT and RUNNING states were occupying a thread count even though their parent job or workflow was in an inactive state (terminal or stopped). This happened when the capacities were being rebuilt from scratch, and it could have caused a thread leak.
Changelist:
1. Add a check in buildAssignableInstances() so that it ignores workflows and jobs that are in an inactive state (that is, whose tasks cannot be occupying a thread on Participants)


Project: http://git-wip-us.apache.org/repos/asf/helix/repo
Commit: http://git-wip-us.apache.org/repos/asf/helix/commit/e492d9f6
Tree: http://git-wip-us.apache.org/repos/asf/helix/tree/e492d9f6
Diff: http://git-wip-us.apache.org/repos/asf/helix/diff/e492d9f6

Branch: refs/heads/master
Commit: e492d9f663d8edad0f344208cc8affc6828708a3
Parents: e7b960c
Author: Hunter Lee <hu...@linkedin.com>
Authored: Fri Oct 26 18:49:52 2018 -0700
Committer: Hunter Lee <hu...@linkedin.com>
Committed: Fri Oct 26 18:49:52 2018 -0700

----------------------------------------------------------------------
 .../helix/task/AssignableInstanceManager.java   | 45 +++++++++++++++++++-
 1 file changed, 43 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/helix/blob/e492d9f6/helix-core/src/main/java/org/apache/helix/task/AssignableInstanceManager.java
----------------------------------------------------------------------
diff --git a/helix-core/src/main/java/org/apache/helix/task/AssignableInstanceManager.java b/helix-core/src/main/java/org/apache/helix/task/AssignableInstanceManager.java
index 2693005..1c1ed69 100644
--- a/helix-core/src/main/java/org/apache/helix/task/AssignableInstanceManager.java
+++ b/helix-core/src/main/java/org/apache/helix/task/AssignableInstanceManager.java
@@ -21,7 +21,6 @@ package org.apache.helix.task;
 
 import java.util.Collection;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
@@ -94,6 +93,24 @@ public class AssignableInstanceManager {
             jobName, jobConfig, jobContext);
         continue; // Ignore this job if either the config or context is null
       }
+
+      // First, check that the workflow and job are in valid states. This is important because
+      // sometimes aborted jobs do not get a proper update of their task states, meaning there could
+      // be INIT and RUNNING tasks we want to ignore
+      String workflowName = jobConfig.getWorkflow();
+      WorkflowConfig workflowConfig = taskDataCache.getWorkflowConfig(workflowName);
+      WorkflowContext workflowContext = taskDataCache.getWorkflowContext(workflowName);
+      if (workflowConfig == null || workflowContext == null) {
+        // There is no workflow config or context - meaning no tasks are currently scheduled and
+        // invalid, so skip this job
+        continue;
+      }
+      TaskState workflowState = workflowContext.getWorkflowState();
+      TaskState jobState = workflowContext.getJobState(jobName);
+      if (isResourceTerminalOrStopped(workflowState) || isResourceTerminalOrStopped(jobState)) {
+        continue;
+      }
+
       String quotaType = jobConfig.getJobType();
       if (quotaType == null) {
         quotaType = AssignableInstance.DEFAULT_QUOTA_TYPE;
@@ -236,4 +253,28 @@ public class AssignableInstanceManager {
   public Map<String, TaskAssignResult> getTaskAssignResultMap() {
     return _taskAssignResultMap;
   }
-}
\ No newline at end of file
+
+  /**
+   * Determines whether it's possible for a given workflow or a job to have any running tasks. In
+   * other words, rule out all resources that are in terminal states or have been stopped.
+   * @param state
+   * @return
+   */
+  private boolean isResourceTerminalOrStopped(TaskState state) {
+    if (state == null) {
+      // If the state is null, it cannot have currently-running tasks either, so consider it
+      // inactive
+      return true;
+    }
+    switch (state) {
+      case ABORTED:
+      case FAILED:
+      case STOPPED:
+      case COMPLETED:
+      case TIMED_OUT:
+      case NOT_STARTED:
+        return true;
+    }
+    return false;
+  }
+}