You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by st...@apache.org on 2014/10/06 04:57:18 UTC
[21/24] git commit: SLIDER-467 final status to be
"succeeded"...reworking of how exit codes are propagated with tests fixed
where the changes showed they were incomplete/broken
SLIDER-467 final status to be "succeeded"...reworking of how exit codes are propagated with tests fixed where the changes showed they were incomplete/broken
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/a3944b1c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/a3944b1c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/a3944b1c
Branch: refs/heads/feature/SLIDER-149_Support_a_YARN_service_registry
Commit: a3944b1cd44024acd4578a7e49f8abf0e8497f92
Parents: 155262b
Author: Steve Loughran <st...@apache.org>
Authored: Sat Oct 4 19:48:56 2014 -0700
Committer: Steve Loughran <st...@apache.org>
Committed: Sat Oct 4 19:48:56 2014 -0700
----------------------------------------------------------------------
.../slider/core/exceptions/SliderException.java | 6 +--
.../apache/slider/providers/ProviderUtils.java | 6 ++-
.../server/appmaster/SliderAppMaster.java | 40 ++++++++++++++------
.../agent/actions/TestActionStatus.groovy | 9 ++---
.../standalone/TestStandaloneAMRestart.groovy | 38 ++++++++++++++-----
5 files changed, 67 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java b/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java
index 18e3157..7f3134a 100644
--- a/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java
+++ b/slider-core/src/main/java/org/apache/slider/core/exceptions/SliderException.java
@@ -41,9 +41,9 @@ public class SliderException extends ServiceLaunchException implements
/**
* Format the exception as you create it
- * @param code
- * @param message
- * @param args
+ * @param code exit code
+ * @param message exception message: a {@link String#format(String, Object...)} format string
+ * @param args arguments for the formatting
*/
public SliderException(int code, String message, Object... args) {
super(code, String.format(message, args));
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java
index 464aba5..8e77a9c 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderUtils.java
@@ -204,11 +204,13 @@ public class ProviderUtils implements RoleKeys {
/**
* Validate the node count and heap size values of a node class
- *
+ * <p>
+ * If {@code max <= 0}: {@code min <= count}<br>
+ * If {@code max > 0}: {@code min <= count <= max}
* @param name node class name
* @param count requested node count
* @param min minimum permitted node count
- * @param max
+ * @param max maximum permitted node count; a value <= 0 means no upper bound is checked
* @throws BadCommandArgumentsException if the values are out of range
*/
public void validateNodeCount(String name,
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 7fbea86..5676f3f 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -51,6 +51,7 @@ import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
@@ -86,6 +87,7 @@ import org.apache.slider.core.exceptions.SliderException;
import org.apache.slider.core.exceptions.SliderInternalStateException;
import org.apache.slider.core.exceptions.TriggerClusterTeardownException;
import org.apache.slider.core.main.ExitCodeProvider;
+import org.apache.slider.core.main.LauncherExitCodes;
import org.apache.slider.core.main.RunService;
import org.apache.slider.core.main.ServiceLauncher;
import org.apache.slider.core.persist.ConfTreeSerDeser;
@@ -335,7 +337,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
* The stop request received...the exit details are extracted
* from this
*/
- private ActionStopSlider stopAction;
+ private volatile ActionStopSlider stopAction;
@SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized")
private RoleLaunchService launchService;
@@ -357,6 +359,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
private String agentOpsUrl;
private String agentStatusUrl;
private FsDelegationTokenManager fsDelegationTokenManager;
+ private RegisterApplicationMasterResponse amRegistrationData;
/**
* Service Constructor
@@ -690,12 +693,12 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
// address = SliderUtils.getRmSchedulerAddress(asyncRMClient.getConfig());
log.info("Connecting to RM at {},address tracking URL={}",
appMasterRpcPort, appMasterTrackingUrl);
- RegisterApplicationMasterResponse response = asyncRMClient
+ amRegistrationData = asyncRMClient
.registerApplicationMaster(appMasterHostname,
appMasterRpcPort,
appMasterTrackingUrl);
Resource maxResources =
- response.getMaximumResourceCapability();
+ amRegistrationData.getMaximumResourceCapability();
containerMaxMemory = maxResources.getMemory();
containerMaxCores = maxResources.getVirtualCores();
appState.setContainerLimits(maxResources.getMemory(),
@@ -707,8 +710,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
boolean securityEnabled = UserGroupInformation.isSecurityEnabled();
if (securityEnabled) {
secretManager.setMasterKey(
- response.getClientToAMTokenMasterKey().array());
- applicationACLs = response.getApplicationACLs();
+ amRegistrationData.getClientToAMTokenMasterKey().array());
+ applicationACLs = amRegistrationData.getApplicationACLs();
//tell the server what the ACLs are
rpcService.getServer().refreshServiceAcl(serviceConf,
@@ -717,7 +720,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
// extract container list
- liveContainers = response.getContainersFromPreviousAttempts();
+ liveContainers = amRegistrationData.getContainersFromPreviousAttempts();
//now validate the installation
Configuration providerConf =
@@ -851,7 +854,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
//now block waiting to be told to exit the process
waitForAMCompletionSignal();
} catch(Exception e) {
- stopAction = new ActionStopSlider(e);
+ log.error("Exception : {}", e, e);
+ onAMStop(new ActionStopSlider(e));
}
//shutdown time
return finish();
@@ -1117,6 +1121,13 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
log.debug("Stopped forked process: exit code={}", exitCode);
}
+ // make sure the AM is actually registered. If not, there's no point
+ // trying to unregister it
+ if (amRegistrationData == null) {
+ log.info("Application attempt not yet registered; skipping unregistration");
+ return exitCode;
+ }
+
//stop any launches in progress
launchService.stop();
@@ -1137,6 +1148,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
*/
} catch (IOException e) {
log.info("Failed to unregister application: " + e, e);
+ } catch (InvalidApplicationMasterRequestException e) {
+ log.info("Application not found in YARN application list;" +
+ " it may have been terminated/YARN shutdown in progress: " + e, e);
} catch (YarnException e) {
log.info("Failed to unregister application: " + e, e);
}
@@ -1451,10 +1465,13 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
YarnException {
onRpcCall("stopCluster()");
String message = request.getMessage();
+ if (message == null) {
+ message = "application frozen by client";
+ }
ActionStopSlider stopSlider =
new ActionStopSlider(message,
1000, TimeUnit.MILLISECONDS,
- 0,
+ LauncherExitCodes.EXIT_SUCCESS,
FinalApplicationStatus.SUCCEEDED,
message);
log.info("SliderAppMasterApi.stopCluster: {}", stopSlider);
@@ -1471,7 +1488,8 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
ConfTreeSerDeser confTreeSerDeser = new ConfTreeSerDeser();
ConfTree updatedResources = confTreeSerDeser.fromJson(payload);
flexCluster(updatedResources);
- return Messages.FlexClusterResponseProto.newBuilder().setResponse(true).build();
+ return Messages.FlexClusterResponseProto.newBuilder().setResponse(
+ true).build();
}
@Override //SliderClusterProtocol
@@ -1672,9 +1690,9 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
try {
flexCluster(getInstanceDefinition().getResources());
} catch (Exception e) {
- //this may happen in a separate thread, so the ability to act is limited
+ // cluster flex failure: log
log.error("Failed to flex cluster nodes: {}", e, e);
- //declare a failure
+ // then queue a stop action built from the exception
queue(new ActionStopSlider(e));
}
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy
index 9fcdb17..fea07af 100644
--- a/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/agent/actions/TestActionStatus.groovy
@@ -44,11 +44,6 @@ import org.junit.Test
@Slf4j
class TestActionStatus extends AgentMiniClusterTestBase {
- @Before
- public void setup() {
- super.setup()
- createMiniCluster("", configuration, 1, false)
- }
/**
* This is a test suite to run the tests against a single cluster instance
@@ -58,6 +53,8 @@ class TestActionStatus extends AgentMiniClusterTestBase {
@Test
public void testSuite() throws Throwable {
+ super.setup()
+ createMiniCluster("testactionstatus", configuration, 1, true)
testStatusLiveCluster()
testStatusMissingCluster()
}
@@ -116,7 +113,7 @@ class TestActionStatus extends AgentMiniClusterTestBase {
assert statusLauncher.serviceExitCode == 0
//status to a file
- File tfile = new File("target/" + clustername + "/status.json")
+ File tfile = new File("target/$clustername-status.json")
statusArgs.output = tfile.absolutePath
sliderClient.actionStatus(clustername, statusArgs)
def text = tfile.text
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/a3944b1c/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
index 947529c..d8f7141 100644
--- a/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/agent/standalone/TestStandaloneAMRestart.groovy
@@ -52,8 +52,8 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
ServiceLauncher<SliderClient> launcher =
createStandaloneAMWithArgs(clustername,
[
- Arguments.ARG_OPTION, SliderXmlConfKeys.KEY_AM_RESTART_LIMIT,
- "$restartLimit".toString()
+ Arguments.ARG_DEFINE,
+ SliderXmlConfKeys.KEY_AM_RESTART_LIMIT + "=" + restartLimit
],
true,
false)
@@ -71,15 +71,16 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
sliderClient.actionDiagnostic(diagnosticArgs)
int iteration = 1;
- killAM(iteration, sliderClient, clustername)
+ killAMAndWaitForRestart(sliderClient, iteration, clustername)
- killAM(iteration++, sliderClient, clustername)
+ killAMAndWaitForRestart(sliderClient, iteration++, clustername)
// app should be running here
assert 0 == sliderClient.actionExists(clustername, true)
// kill again & expect it to be considered a failure
- killAM(iteration++, sliderClient, clustername)
+ killAmAndWaitForDeath(sliderClient, iteration++, clustername)
+ sleep(20000)
report = sliderClient.applicationReport
assert report.finalApplicationStatus == FinalApplicationStatus.FAILED
@@ -93,9 +94,29 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
assert 0 == clusterActionFreeze(sliderClient, clustername, "force", true)
}
- public ActionAMSuicideArgs killAM(
- int iteration,
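+ /**
+ * Kill an AM, then wait for the cluster to become live again. The iteration
+ * count is included in the message sent to the AM (hence in its logs).
+ * @param sliderClient client used to issue the AM suicide request
+ * @param iteration iteration number to include in the kill message
+ * @param clustername name of the cluster whose AM is to be killed
+ * @return the arguments that were passed to the AM suicide action
+ */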
+ public ActionAMSuicideArgs killAMAndWaitForRestart(
+ SliderClient sliderClient, int iteration, String clustername) {
+ ActionAMSuicideArgs args = killAmAndWaitForDeath(
+ sliderClient,
+ iteration,
+ clustername)
+ //give yarn some time to notice
+ sleep(20000)
+ waitUntilClusterLive(sliderClient, 20000)
+ return args
+ }
+
+ public ActionAMSuicideArgs killAmAndWaitForDeath(
SliderClient sliderClient,
+ int iteration,
String clustername) {
ActionAMSuicideArgs args = new ActionAMSuicideArgs()
args.waittime = 100
@@ -103,9 +124,6 @@ class TestStandaloneAMRestart extends AgentMiniClusterTestBase {
args.message = "kill AM iteration #$iteration"
sliderClient.actionAmSuicide(clustername, args)
waitWhileClusterLive(sliderClient);
- //give yarn some time to notice
- sleep(20000)
- waitUntilClusterLive(sliderClient, 20000)
return args
}