You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by st...@apache.org on 2014/10/06 04:57:16 UTC

[19/24] git commit: SLIDER-467 cleanly shut down AMs should say "succeeded" in app reports. Easier said than done

SLIDER-467 cleanly shut down AMs should say "succeeded" in app reports. Easier said than done


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/155262bf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/155262bf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/155262bf

Branch: refs/heads/feature/SLIDER-149_Support_a_YARN_service_registry
Commit: 155262bf7555139cc54d553e6ef34e612d075a6f
Parents: 4a53f55
Author: Steve Loughran <st...@apache.org>
Authored: Sat Oct 4 14:02:48 2014 -0700
Committer: Steve Loughran <st...@apache.org>
Committed: Sat Oct 4 14:02:48 2014 -0700

----------------------------------------------------------------------
 .../providers/AbstractProviderService.java      | 21 ++++++++++-
 .../providers/agent/AgentProviderService.java   | 22 ------------
 .../slideram/SliderAMProviderService.java       |  8 -----
 .../apache/slider/server/appmaster/AMUtils.java |  2 +-
 .../server/appmaster/SliderAppMaster.java       | 38 +++++++++-----------
 .../appmaster/actions/ActionStopSlider.java     | 20 +++++++++++
 .../workflow/WorkflowSequenceService.java       |  8 ++++-
 .../standalone/TestStandaloneAMDestroy.groovy   |  3 +-
 .../standalone/TestStandaloneAMRestart.groovy   | 36 +++++++++++++------
 .../slider/test/YarnMiniClusterTestBase.groovy  | 13 ++++---
 .../providers/hbase/HBaseProviderService.java   | 23 ------------
 11 files changed, 99 insertions(+), 95 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java b/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java
index baddb56..b3cf527 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/AbstractProviderService.java
@@ -77,6 +77,7 @@ public abstract class AbstractProviderService
 
   public AbstractProviderService(String name) {
     super(name);
+    setStopIfNoChildServicesAtStartup(false);
   }
 
   @Override
@@ -184,7 +185,25 @@ public abstract class AbstractProviderService
     }
     return false;
   }
-  
+
+  /**
+   * override point to allow a process to start executing in this container
+   * @param instanceDefinition cluster description
+   * @param confDir configuration directory
+   * @param env environment
+   * @param execInProgress the callback for the exec events
+   * @return false always; this base implementation starts no process
+   * @throws IOException
+   * @throws SliderException
+   */
+  @Override
+  public boolean exec(AggregateConf instanceDefinition,
+      File confDir,
+      Map<String, String> env,
+      ProviderCompleted execInProgress) throws IOException, SliderException {
+    return false;
+  }
+
   @SuppressWarnings("ThrowableResultOfMethodCallIgnored")
   @Override // ExitCodeProvider
   public int getExitCode() {

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java b/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java
index d7943b2..67a268e 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/agent/AgentProviderService.java
@@ -395,28 +395,6 @@ public class AgentProviderService extends AbstractProviderService implements
     }
   }
 
-  /**
-   * Run this service
-   *
-   * @param instanceDefinition component description
-   * @param confDir            local dir with the config
-   * @param env                environment variables above those generated by
-   * @param execInProgress     callback for the event notification
-   *
-   * @throws IOException     IO problems
-   * @throws SliderException anything internal
-   */
-  @Override
-  public boolean exec(AggregateConf instanceDefinition,
-                      File confDir,
-                      Map<String, String> env,
-                      ProviderCompleted execInProgress) throws
-      IOException,
-      SliderException {
-
-    return false;
-  }
-
   @Override
   public boolean isSupportedRole(String role) {
     return true;

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java
index 863ea7e..2b2d1c7 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMProviderService.java
@@ -91,14 +91,6 @@ public class SliderAMProviderService extends AbstractProviderService implements
   }
 
   @Override
-  public boolean exec(AggregateConf instanceDefinition,
-      File confDir,
-      Map<String, String> env,
-      ProviderCompleted execInProgress) throws IOException, SliderException {
-    return false;
-  }
-
-  @Override
   public List<ProviderRole> getRoles() {
     return new ArrayList<ProviderRole>(0);
   }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java
index 39f511a..32684c6 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/AMUtils.java
@@ -33,7 +33,7 @@ public class AMUtils {
         return LauncherExitCodes.EXIT_SUCCESS;
       //remap from a planned shutdown to a failure
       case LauncherExitCodes.EXIT_CLIENT_INITIATED_SHUTDOWN:
-        return SliderExitCodes.EXIT_PROCESS_FAILED;
+        return SliderExitCodes.EXIT_SUCCESS;
       default:
         return exitCode;
     }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index aac8106..7fbea86 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -20,6 +20,7 @@ package org.apache.slider.server.appmaster;
 
 import com.codahale.metrics.MetricRegistry;
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
 import com.google.protobuf.BlockingService;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
@@ -846,15 +847,14 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
       // launch the real provider; this is expected to trigger a callback that
       // starts the node review process
       launchProviderService(instanceDefinition, confDir);
-      
+
       //now block waiting to be told to exit the process
       waitForAMCompletionSignal();
-      //shutdown time
-    } finally {
-      finish();
+    } catch(Exception e) {
+      stopAction = new ActionStopSlider(e);
     }
-
-    return amExitCode;
+    //shutdown time
+    return finish();
   }
 
   private void startAgentWebApp(MapOperations appInformation,
@@ -1064,14 +1064,6 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
     } finally {
       AMExecutionStateLock.unlock();
     }
-    //add a sleep here for about a second. Why? it
-    //stops RPC calls breaking so dramatically when the cluster
-    //is torn down mid-RPC
-    try {
-      Thread.sleep(TERMINATION_SIGNAL_PROPAGATION_DELAY);
-    } catch (InterruptedException ignored) {
-      //ignored
-    }
   }
 
   /**
@@ -1083,6 +1075,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
     // this is a queued action: schedule it through the queues
     schedule(stopActionRequest);
   }
+
   /**
    * Signal that the AM is complete
    *
@@ -1105,8 +1098,10 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
   
   /**
    * trigger the YARN cluster termination process
+   * @return the exit code
    */
-  private synchronized void finish() {
+  private synchronized int finish() {
+    Preconditions.checkNotNull(stopAction, "null stop action");
     FinalApplicationStatus appStatus;
     log.info("Triggering shutdown of the AM: {}", stopAction);
 
@@ -1145,6 +1140,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
     } catch (YarnException e) {
       log.info("Failed to unregister application: " + e, e);
     }
+    return exitCode;
   }
 
     /**
@@ -1377,7 +1373,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
   public void onShutdownRequest() {
     LOG_YARN.info("Shutdown Request received");
     signalAMComplete(new ActionStopSlider("stop",
-        EXIT_CLIENT_INITIATED_SHUTDOWN,
+        EXIT_SUCCESS,
         FinalApplicationStatus.SUCCEEDED,
         "Shutdown requested from RM"));
   }
@@ -1627,7 +1623,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
     return Messages.AMSuicideResponseProto.getDefaultInstance();
   }
 
-  /* =================================================================== */
+/* =================================================================== */
 /* END */
 /* =================================================================== */
 
@@ -1664,7 +1660,6 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
     }
   }
 
-
   /* =================================================================== */
   /* EventCallback  from the child or ourselves directly */
   /* =================================================================== */
@@ -1680,9 +1675,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
       //this may happen in a separate thread, so the ability to act is limited
       log.error("Failed to flex cluster nodes: {}", e, e);
       //declare a failure
-      queue(new ActionStopSlider("stop",
-          EXIT_DEPLOYMENT_FAILED, FinalApplicationStatus.FAILED,
-          "Failed to create application:" + e.toString()));
+      queue(new ActionStopSlider(e));
     }
   }
 
@@ -1728,7 +1721,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
 
       if (shouldTriggerFailure) {
         String reason =
-            "Spawned master exited with raw " + exitCode + " mapped to " +
+            "Spawned process failed with raw " + exitCode + " mapped to " +
             mappedProcessExitCode;
         ActionStopSlider stop = new ActionStopSlider("stop",
             mappedProcessExitCode,
@@ -1916,6 +1909,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
             InternalKeys.DEFAULT_CHAOS_MONKEY_ENABLED);
     if (!enabled) {
       log.info("Chaos monkey disabled");
+      return false;
     }
     
     long monkeyInterval = internals.getTimeRange(

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java
index 39ff761..d2f23a2 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/ActionStopSlider.java
@@ -20,6 +20,8 @@ package org.apache.slider.server.appmaster.actions;
 
 import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
 import org.apache.slider.core.exceptions.TriggerClusterTeardownException;
+import org.apache.slider.core.main.ExitCodeProvider;
+import org.apache.slider.core.main.LauncherExitCodes;
 import org.apache.slider.server.appmaster.SliderAppMaster;
 import org.apache.slider.server.appmaster.state.AppState;
 
@@ -91,6 +93,24 @@ public class ActionStopSlider extends AsyncAction {
         ex.getMessage());
   }
   
+  /**
+   * Build from an exception.
+   * <p>
+   * If the exception implements {@link ExitCodeProvider} its exit code is used;
+   * otherwise the exit code is {@link LauncherExitCodes#EXIT_EXCEPTION_THROWN}.
+   * @param ex exception.
+   */
+  public ActionStopSlider(Exception ex) {
+    super("stop");
+    if (ex instanceof ExitCodeProvider) {
+      setExitCode(((ExitCodeProvider)ex).getExitCode());
+    } else {
+      setExitCode(LauncherExitCodes.EXIT_EXCEPTION_THROWN);
+    }
+    setFinalApplicationStatus(FinalApplicationStatus.FAILED);
+    setMessage(ex.getMessage());
+  }
+  
   @Override
   public void execute(SliderAppMaster appMaster,
       QueueAccess queueService,

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java b/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java
index ca07f99..e584e63 100644
--- a/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java
+++ b/slider-core/src/main/java/org/apache/slider/server/services/workflow/WorkflowSequenceService.java
@@ -79,6 +79,8 @@ public class WorkflowSequenceService extends AbstractService implements
   null if one did not finish yet
    */
   private volatile Service previousService;
+  
+  private boolean stopIfNoChildServicesAtStartup = true;
 
   /**
    * Construct an instance
@@ -133,13 +135,17 @@ public class WorkflowSequenceService extends AbstractService implements
     return previousService;
   }
 
+  protected void setStopIfNoChildServicesAtStartup(boolean stopIfNoChildServicesAtStartup) {
+    this.stopIfNoChildServicesAtStartup = stopIfNoChildServicesAtStartup;
+  }
+
   /**
    * When started
    * @throws Exception
    */
   @Override
   protected void serviceStart() throws Exception {
-    if (!startNextService()) {
+    if (!startNextService() && stopIfNoChildServicesAtStartup) {
         //nothing to start -so stop
         stop();
     }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy
index fa48b70..463c4c0 100644
--- a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMDestroy.groovy
@@ -94,7 +94,6 @@ class TestStandaloneAMDestroy extends AgentMiniClusterTestBase {
 
     describe "END EXPECTED WARNINGS"
 
-
     describe "destroying $clustername"
     //now: destroy it
     
@@ -153,6 +152,8 @@ class TestStandaloneAMDestroy extends AgentMiniClusterTestBase {
     
     //and try to destroy a completely different cluster just for the fun of it
     assert 0 == sliderClient.actionDestroy("no-cluster-of-this-name")
+
+    maybeStopCluster(cluster2, "", "Teardown at end of test case", false);
   }
 
 

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
index 1073309..947529c 100644
--- a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
@@ -47,11 +47,13 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
     // patch the configuration for AM restart
     YarnConfiguration conf = getRestartableConfiguration(5)
 
+    int restartLimit = 3;
     String clustername = createMiniCluster("", conf, 1, true)
     ServiceLauncher<SliderClient> launcher =
         createStandaloneAMWithArgs(clustername,
             [
-                Arguments.ARG_OPTION, SliderXmlConfKeys.KEY_AM_RESTART_LIMIT, "4"
+                Arguments.ARG_OPTION, SliderXmlConfKeys.KEY_AM_RESTART_LIMIT, 
+                "$restartLimit".toString()
             ],
             true,
             false)
@@ -68,21 +70,17 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
     diagnosticArgs.yarn = true
     sliderClient.actionDiagnostic(diagnosticArgs)
 
-    ActionAMSuicideArgs args = new ActionAMSuicideArgs()
-    args.message = "test AM iteration"
-    args.waittime = 100
-    args.exitcode = 1
-    sliderClient.actionAmSuicide(clustername, args)
-    waitWhileClusterLive(sliderClient);
-    //give yarn some time to notice
-    sleep(20000)
-    waitUntilClusterLive(sliderClient, 20000)
+    int iteration = 1;
+    killAM(iteration, sliderClient, clustername)
 
 
+    killAM(iteration++, sliderClient, clustername)
     // app should be running here
     assert 0 == sliderClient.actionExists(clustername, true)
+
     // kill again & expect it to be considered a failure
-    sliderClient.actionAmSuicide(clustername, args)
+    killAM(iteration++, sliderClient, clustername)
+
     report = sliderClient.applicationReport
     assert report.finalApplicationStatus == FinalApplicationStatus.FAILED
 
@@ -95,6 +93,22 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
     assert 0 == clusterActionFreeze(sliderClient, clustername, "force", true)
   }
 
+  public ActionAMSuicideArgs killAM(
+      int iteration,
+      SliderClient sliderClient,
+      String clustername) {
+    ActionAMSuicideArgs args = new ActionAMSuicideArgs()
+    args.waittime = 100
+    args.exitcode = 1
+    args.message = "kill AM iteration #$iteration"
+    sliderClient.actionAmSuicide(clustername, args)
+    waitWhileClusterLive(sliderClient);
+    //give yarn some time to notice
+    sleep(20000)
+    waitUntilClusterLive(sliderClient, 20000)
+    return args
+  }
+
   /**
    * Get a restartable configuration
    * @param restarts

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy b/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy
index aa82bdb..9595a32 100644
--- a/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/test/YarnMiniClusterTestBase.groovy
@@ -193,8 +193,10 @@ public abstract class YarnMiniClusterTestBase extends ServiceLauncherBaseTest {
     clustersToTeardown << client;
   }
   protected void addToTeardown(ServiceLauncher<SliderClient> launcher) {
-    SliderClient sliderClient = launcher.service
-    if (sliderClient) addToTeardown(sliderClient)
+    SliderClient sliderClient = launcher?.service
+    if (sliderClient) {
+      addToTeardown(sliderClient)
+    }
   }
 
 
@@ -208,7 +210,7 @@ public abstract class YarnMiniClusterTestBase extends ServiceLauncherBaseTest {
   public void stopRunningClusters() {
     clustersToTeardown.each { SliderClient client ->
       try {
-        maybeStopCluster(client, "", "Teardown at end of test case");
+        maybeStopCluster(client, "", "Teardown at end of test case", true);
       } catch (Exception e) {
         log.warn("While stopping cluster " + e, e);
       }
@@ -748,14 +750,15 @@ public abstract class YarnMiniClusterTestBase extends ServiceLauncherBaseTest {
   public int maybeStopCluster(
       SliderClient sliderClient,
       String clustername,
-      String message) {
+      String message,
+      boolean force = false) {
     if (sliderClient != null) {
       if (!clustername) {
         clustername = sliderClient.deployedClusterName;
       }
       //only stop a cluster that exists
       if (clustername) {
-        return clusterActionFreeze(sliderClient, clustername, message);
+        return clusterActionFreeze(sliderClient, clustername, message, force);
       }
     }
     return 0;

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/155262bf/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java
----------------------------------------------------------------------
diff --git a/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java b/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java
index dc11050..f75a6c7 100644
--- a/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java
+++ b/slider-providers/hbase/slider-hbase-provider/src/main/java/org/apache/slider/providers/hbase/HBaseProviderService.java
@@ -281,29 +281,6 @@ public class HBaseProviderService extends AbstractProviderService
   }
 
   /**
-   * Run this service
-   *
-   *
-   * @param instanceDefinition component description
-   * @param confDir local dir with the config
-   * @param env environment variables above those generated by
-   * @param execInProgress callback for the event notification
-   * @throws IOException IO problems
-   * @throws SliderException anything internal
-   */
-  @Override
-  public boolean exec(AggregateConf instanceDefinition,
-                      File confDir,
-                      Map<String, String> env,
-                      ProviderCompleted execInProgress) throws
-                                                 IOException,
-      SliderException {
-
-    return false;
-  }
-
-
-  /**
    * This is a validation of the application configuration on the AM.
    * Here is where things like the existence of keytabs and other
    * not-seen-client-side properties can be tested, before