You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by st...@apache.org on 2014/10/06 04:57:18 UTC

[21/24] git commit: SLIDER-467 final status to be "succeeded"; reworked how exit codes are propagated, and fixed the tests that these changes showed to be incomplete/broken

SLIDER-467 final status to be "succeeded"; reworked how exit codes are propagated, and fixed the tests that these changes showed to be incomplete/broken


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/a3944b1c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/a3944b1c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/a3944b1c

Branch: refs/heads/feature/SLIDER-149_Support_a_YARN_service_registry
Commit: a3944b1cd44024acd4578a7e49f8abf0e8497f92
Parents: 155262b
Author: Steve Loughran <st...@apache.org>
Authored: Sat Oct 4 19:48:56 2014 -0700
Committer: Steve Loughran <st...@apache.org>
Committed: Sat Oct 4 19:48:56 2014 -0700

----------------------------------------------------------------------
 .../slider/core/exceptions/SliderException.java |  6 +--
 .../apache/slider/providers/ProviderUtils.java  |  6 ++-
 .../server/appmaster/SliderAppMaster.java       | 40 ++++++++++++++------
 .../agent/actions/TestActionStatus.groovy       |  9 ++---
 .../standalone/TestStandaloneAMRestart.groovy   | 38 ++++++++++++++-----
 5 files changed, 67 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java b/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java
index 18e3157..7f3134a 100644
--- a/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java
+++ b/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java
@@ -41,9 +41,9 @@ public class SliderException extends ServiceLaunchException implements
 
   /**
    * Format the exception as you create it
-   * @param code
-   * @param message
-   * @param args
+   * @param code exit code
+   * @param message exception message -sprintf formatted
+   * @param args arguments for the formatting
    */
   public SliderException(int code, String message, Object... args) {
     super(code, String.format(message, args));

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java
index 464aba5..8e77a9c 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java
@@ -204,11 +204,13 @@ public class ProviderUtils implements RoleKeys {
   
   /**
    * Validate the node count and heap size values of a node class 
-   *
+   * <p>
+   * If max &lt;= 0:  min &lt;= count
+   * If max &gt; 0:  min &lt;= count &lt;= max
    * @param name node class name
    * @param count requested node count
    * @param min requested heap size
-   * @param max
+   * @param max maximum value. 
    * @throws BadCommandArgumentsException if the values are out of range
    */
   public void validateNodeCount(String name,

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 7fbea86..5676f3f 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -51,6 +51,7 @@ import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
 import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
 import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.ipc.YarnRPC;
 import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
@@ -86,6 +87,7 @@ import org.apache.slider.core.exceptions.SliderException;
 import org.apache.slider.core.exceptions.SliderInternalStateException;
 import org.apache.slider.core.exceptions.TriggerClusterTeardownException;
 import org.apache.slider.core.main.ExitCodeProvider;
+import org.apache.slider.core.main.LauncherExitCodes;
 import org.apache.slider.core.main.RunService;
 import org.apache.slider.core.main.ServiceLauncher;
 import org.apache.slider.core.persist.ConfTreeSerDeser;
@@ -335,7 +337,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
    * The stop request received...the exit details are extracted
    * from this
    */
-  private ActionStopSlider stopAction;
+  private volatile ActionStopSlider stopAction;
 
   @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized")
   private RoleLaunchService launchService;
@@ -357,6 +359,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
   private String agentOpsUrl;
   private String agentStatusUrl;
   private FsDelegationTokenManager fsDelegationTokenManager;
+  private RegisterApplicationMasterResponse amRegistrationData;
 
   /**
    * Service Constructor
@@ -690,12 +693,12 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
       // address = SliderUtils.getRmSchedulerAddress(asyncRMClient.getConfig());
       log.info("Connecting to RM at {},address tracking URL={}",
                appMasterRpcPort, appMasterTrackingUrl);
-      RegisterApplicationMasterResponse response = asyncRMClient
+      amRegistrationData = asyncRMClient
         .registerApplicationMaster(appMasterHostname,
                                    appMasterRpcPort,
                                    appMasterTrackingUrl);
       Resource maxResources =
-        response.getMaximumResourceCapability();
+        amRegistrationData.getMaximumResourceCapability();
       containerMaxMemory = maxResources.getMemory();
       containerMaxCores = maxResources.getVirtualCores();
       appState.setContainerLimits(maxResources.getMemory(),
@@ -707,8 +710,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
       boolean securityEnabled = UserGroupInformation.isSecurityEnabled();
       if (securityEnabled) {
         secretManager.setMasterKey(
-          response.getClientToAMTokenMasterKey().array());
-        applicationACLs = response.getApplicationACLs();
+            amRegistrationData.getClientToAMTokenMasterKey().array());
+        applicationACLs = amRegistrationData.getApplicationACLs();
 
         //tell the server what the ACLs are 
         rpcService.getServer().refreshServiceAcl(serviceConf,
@@ -717,7 +720,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
 
       // extract container list
 
-      liveContainers = response.getContainersFromPreviousAttempts();
+      liveContainers = amRegistrationData.getContainersFromPreviousAttempts();
 
       //now validate the installation
       Configuration providerConf =
@@ -851,7 +854,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
       //now block waiting to be told to exit the process
       waitForAMCompletionSignal();
     } catch(Exception e) {
-      stopAction = new ActionStopSlider(e);
+      log.error("Exception : {}", e, e);
+      onAMStop(new ActionStopSlider(e));
     }
     //shutdown time
     return finish();
@@ -1117,6 +1121,13 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
       log.debug("Stopped forked process: exit code={}", exitCode);
     }
 
+    // make sure the AM is actually registered. If not, there's no point
+    // trying to unregister it
+    if (amRegistrationData == null) {
+      log.info("Application attempt not yet registered; skipping unregistration");
+      return exitCode;
+    }
+    
     //stop any launches in progress
     launchService.stop();
 
@@ -1137,6 +1148,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
 */
     } catch (IOException e) {
       log.info("Failed to unregister application: " + e, e);
+    } catch (InvalidApplicationMasterRequestException e) {
+      log.info("Application not found in YARN application list;" +
+               " it may have been terminated/YARN shutdown in progress: " + e, e);
     } catch (YarnException e) {
       log.info("Failed to unregister application: " + e, e);
     }
@@ -1451,10 +1465,13 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
                                                                                                  YarnException {
     onRpcCall("stopCluster()");
     String message = request.getMessage();
+    if (message == null) {
+      message = "application frozen by client";
+    }
     ActionStopSlider stopSlider =
         new ActionStopSlider(message,
             1000, TimeUnit.MILLISECONDS,
-            0,
+            LauncherExitCodes.EXIT_SUCCESS,
             FinalApplicationStatus.SUCCEEDED,
             message);
     log.info("SliderAppMasterApi.stopCluster: {}", stopSlider);
@@ -1471,7 +1488,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
     ConfTreeSerDeser confTreeSerDeser = new ConfTreeSerDeser();
     ConfTree updatedResources = confTreeSerDeser.fromJson(payload);
     flexCluster(updatedResources);
-    return Messages.FlexClusterResponseProto.newBuilder().setResponse(true).build();
+    return Messages.FlexClusterResponseProto.newBuilder().setResponse(
+        true).build();
   }
 
   @Override //SliderClusterProtocol
@@ -1672,9 +1690,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
     try {
       flexCluster(getInstanceDefinition().getResources());
     } catch (Exception e) {
-      //this may happen in a separate thread, so the ability to act is limited
+      // cluster flex failure: log
       log.error("Failed to flex cluster nodes: {}", e, e);
-      //declare a failure
+      // then what? exit
       queue(new ActionStopSlider(e));
     }
   }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy
index 9fcdb17..fea07af 100644
--- a/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy
@@ -44,11 +44,6 @@ import org.junit.Test
 @Slf4j
 class TestActionStatus extends AgentMiniClusterTestBase {
 
-  @Before
-  public void setup() {
-    super.setup()
-    createMiniCluster("", configuration, 1, false)
-  }
 
   /**
    * This is a test suite to run the tests against a single cluster instance
@@ -58,6 +53,8 @@ class TestActionStatus extends AgentMiniClusterTestBase {
 
   @Test
   public void testSuite() throws Throwable {
+    super.setup()
+    createMiniCluster("testactionstatus", configuration, 1, true)
     testStatusLiveCluster()
     testStatusMissingCluster()
   }
@@ -116,7 +113,7 @@ class TestActionStatus extends AgentMiniClusterTestBase {
     assert statusLauncher.serviceExitCode == 0
 
     //status to a file
-    File tfile = new File("target/" + clustername + "/status.json")
+    File tfile = new File("target/$clustername-status.json")
     statusArgs.output = tfile.absolutePath
     sliderClient.actionStatus(clustername, statusArgs)
     def text = tfile.text

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
index 947529c..d8f7141 100644
--- a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
@@ -52,8 +52,8 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
     ServiceLauncher<SliderClient> launcher =
         createStandaloneAMWithArgs(clustername,
             [
-                Arguments.ARG_OPTION, SliderXmlConfKeys.KEY_AM_RESTART_LIMIT, 
-                "$restartLimit".toString()
+                Arguments.ARG_DEFINE,
+                SliderXmlConfKeys.KEY_AM_RESTART_LIMIT + "=" + restartLimit
             ],
             true,
             false)
@@ -71,15 +71,16 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
     sliderClient.actionDiagnostic(diagnosticArgs)
 
     int iteration = 1;
-    killAM(iteration, sliderClient, clustername)
+    killAMAndWaitForRestart(sliderClient, iteration, clustername)
 
 
-    killAM(iteration++, sliderClient, clustername)
+    killAMAndWaitForRestart(sliderClient, iteration++, clustername)
     // app should be running here
     assert 0 == sliderClient.actionExists(clustername, true)
 
     // kill again & expect it to be considered a failure
-    killAM(iteration++, sliderClient, clustername)
+    killAmAndWaitForDeath(sliderClient, iteration++, clustername)
+    sleep(20000)
 
     report = sliderClient.applicationReport
     assert report.finalApplicationStatus == FinalApplicationStatus.FAILED
@@ -93,9 +94,29 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
     assert 0 == clusterActionFreeze(sliderClient, clustername, "force", true)
   }
 
-  public ActionAMSuicideArgs killAM(
-      int iteration,
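+  /**
+   * Kill an AM and wait for it to restart. Takes an iteration count
+   * for the message sent to the AM (hence its logs).
+   * @param sliderClient client used to issue the kill request
+   * @param iteration iteration number included in the kill message
+   * @param clustername name of the cluster whose AM is killed
+   * @return the arguments used for the AM suicide action
+   */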
+  public ActionAMSuicideArgs killAMAndWaitForRestart(
+      SliderClient sliderClient, int iteration, String clustername) {
+    ActionAMSuicideArgs args = killAmAndWaitForDeath(
+        sliderClient,
+        iteration,
+        clustername)
+    //give yarn some time to notice
+    sleep(20000)
+    waitUntilClusterLive(sliderClient, 20000)
+    return args
+  }
+
+  public ActionAMSuicideArgs killAmAndWaitForDeath(
       SliderClient sliderClient,
+      int iteration,
       String clustername) {
     ActionAMSuicideArgs args = new ActionAMSuicideArgs()
     args.waittime = 100
@@ -103,9 +124,6 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
     args.message = "kill AM iteration #$iteration"
     sliderClient.actionAmSuicide(clustername, args)
     waitWhileClusterLive(sliderClient);
-    //give yarn some time to notice
-    sleep(20000)
-    waitUntilClusterLive(sliderClient, 20000)
     return args
   }