You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by go...@apache.org on 2017/10/04 07:54:15 UTC
incubator-slider git commit: SLIDER-1250 Tests for Health Threshold
Monitoring Feature (SLIDER-1246)
Repository: incubator-slider
Updated Branches:
refs/heads/develop 0f436c865 -> cc7a644ea
SLIDER-1250 Tests for Health Threshold Monitoring Feature (SLIDER-1246)
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/cc7a644e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/cc7a644e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/cc7a644e
Branch: refs/heads/develop
Commit: cc7a644ea967e86be267c7cf7519d1d7402c60b2
Parents: 0f436c8
Author: Gour Saha <go...@apache.org>
Authored: Wed Oct 4 00:52:39 2017 -0700
Committer: Gour Saha <go...@apache.org>
Committed: Wed Oct 4 00:53:34 2017 -0700
----------------------------------------------------------------------
.../resources_health_monitor_60.json | 22 +
.../resources_health_monitor_80.json | 22 +
...urces_health_monitor_lots_of_containers.json | 18 +
.../resources_health_monitor_uniq_names_60.json | 23 ++
.../resources_health_monitor_uniq_names_80.json | 23 ++
.../funtest/framework/CommandTestBase.groovy | 1 +
.../apache/slider/funtest/ResourcePaths.groovy | 10 +
.../lifecycle/AppsHealthMonitorIT.groovy | 408 +++++++++++++++++++
8 files changed, 527 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_60.json
----------------------------------------------------------------------
diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_60.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_60.json
new file mode 100644
index 0000000..53c1feb
--- /dev/null
+++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_60.json
@@ -0,0 +1,22 @@
+{
+ "schema": "http://example.org/specification/v2.0.0",
+ "metadata": {
+ },
+ "global": {
+ },
+ "components": {
+ "COMMAND_LOGGER": {
+ "yarn.memory": "128",
+ "yarn.role.priority": "1",
+ "yarn.component.instances": "3",
+ "yarn.container.health.threshold.percent" : "60",
+ "yarn.container.health.threshold.window.secs" : "5",
+ "yarn.container.health.threshold.init.delay.secs" : "1",
+ "yarn.container.health.threshold.poll.frequency.secs" : "2",
+ "yarn.node.failure.threshold" : "1000"
+ },
+ "slider-appmaster": {
+ "yarn.memory": "512"
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_80.json
----------------------------------------------------------------------
diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_80.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_80.json
new file mode 100644
index 0000000..b65bd23
--- /dev/null
+++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_80.json
@@ -0,0 +1,22 @@
+{
+ "schema": "http://example.org/specification/v2.0.0",
+ "metadata": {
+ },
+ "global": {
+ },
+ "components": {
+ "COMMAND_LOGGER": {
+ "yarn.memory": "128",
+ "yarn.role.priority": "1",
+ "yarn.component.instances": "3",
+ "yarn.container.health.threshold.percent" : "80",
+ "yarn.container.health.threshold.window.secs" : "5",
+ "yarn.container.health.threshold.init.delay.secs" : "1",
+ "yarn.container.health.threshold.poll.frequency.secs" : "2",
+ "yarn.node.failure.threshold" : "1000"
+ },
+ "slider-appmaster": {
+ "yarn.memory": "512"
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_lots_of_containers.json
----------------------------------------------------------------------
diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_lots_of_containers.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_lots_of_containers.json
new file mode 100644
index 0000000..0e22b25
--- /dev/null
+++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_lots_of_containers.json
@@ -0,0 +1,18 @@
+{
+ "schema": "http://example.org/specification/v2.0.0",
+ "metadata": {
+ },
+ "global": {
+ },
+ "components": {
+ "COMMAND_LOGGER": {
+ "yarn.memory": "128",
+ "yarn.role.priority": "1",
+ "yarn.component.instances": "10000",
+ "yarn.node.failure.threshold" : "1000"
+ },
+ "slider-appmaster": {
+ "yarn.memory": "512"
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_60.json
----------------------------------------------------------------------
diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_60.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_60.json
new file mode 100644
index 0000000..c42fbec
--- /dev/null
+++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_60.json
@@ -0,0 +1,23 @@
+{
+ "schema": "http://example.org/specification/v2.0.0",
+ "metadata": {
+ },
+ "global": {
+ },
+ "components": {
+ "COMMAND_LOGGER": {
+ "yarn.memory": "128",
+ "yarn.role.priority": "1",
+ "component.unique.names" : "true",
+ "yarn.component.instances": "3",
+ "yarn.container.health.threshold.percent" : "60",
+ "yarn.container.health.threshold.window.secs" : "5",
+ "yarn.container.health.threshold.init.delay.secs" : "1",
+ "yarn.container.health.threshold.poll.frequency.secs" : "2",
+ "yarn.node.failure.threshold" : "1000"
+ },
+ "slider-appmaster": {
+ "yarn.memory": "512"
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_80.json
----------------------------------------------------------------------
diff --git a/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_80.json b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_80.json
new file mode 100644
index 0000000..f7b37b1
--- /dev/null
+++ b/slider-core/src/test/app_packages/test_command_log/resources_health_monitor_uniq_names_80.json
@@ -0,0 +1,23 @@
+{
+ "schema": "http://example.org/specification/v2.0.0",
+ "metadata": {
+ },
+ "global": {
+ },
+ "components": {
+ "COMMAND_LOGGER": {
+ "yarn.memory": "128",
+ "yarn.role.priority": "1",
+ "component.unique.names" : "true",
+ "yarn.component.instances": "3",
+ "yarn.container.health.threshold.percent" : "80",
+ "yarn.container.health.threshold.window.secs" : "5",
+ "yarn.container.health.threshold.init.delay.secs" : "1",
+ "yarn.container.health.threshold.poll.frequency.secs" : "2",
+ "yarn.node.failure.threshold" : "1000"
+ },
+ "slider-appmaster": {
+ "yarn.memory": "512"
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy
----------------------------------------------------------------------
diff --git a/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy b/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy
index 0eb7541..81dba29 100644
--- a/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy
+++ b/slider-funtest/src/main/groovy/org/apache/slider/funtest/framework/CommandTestBase.groovy
@@ -375,6 +375,7 @@ abstract class CommandTestBase extends SliderTestUtils {
[
ACTION_KILL_CONTAINER,
name,
+ ARG_ID,
containerID
])
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy
----------------------------------------------------------------------
diff --git a/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy b/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy
index 37503d9..e128cd6 100644
--- a/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy
+++ b/slider-funtest/src/test/groovy/org/apache/slider/funtest/ResourcePaths.groovy
@@ -27,6 +27,16 @@ interface ResourcePaths {
String COMMAND_LOG_RESOURCES = "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources.json"
String COMMAND_LOG_RESOURCES_QUEUE_LABELS = "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_queue_labels.json"
String COMMAND_LOG_RESOURCES_NO_ROLE = "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_no_role.json"
+ String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_60 =
+ "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_60.json"
+ String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_80 =
+ "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_80.json"
+ String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_UNIQUE_NAMES_60 =
+ "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_uniq_names_60.json"
+ String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_UNIQUE_NAMES_80 =
+ "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_uniq_names_80.json"
+ String COMMAND_LOG_RESOURCES_HEALTH_MONITOR_LOTS_OF_CONTAINERS =
+ "$SLIDER_CORE_APP_PACKAGES/test_command_log/resources_health_monitor_lots_of_containers.json"
String COMMAND_LOG_APPCONFIG_NO_HB = "$SLIDER_CORE_APP_PACKAGES/test_command_log/appConfig_no_hb.json"
String COMMAND_LOG_APPCONFIG_FAST_NO_REG = "$SLIDER_CORE_APP_PACKAGES/test_command_log/appConfig_fast_no_reg.json"
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/cc7a644e/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AppsHealthMonitorIT.groovy
----------------------------------------------------------------------
diff --git a/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AppsHealthMonitorIT.groovy b/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AppsHealthMonitorIT.groovy
new file mode 100644
index 0000000..1c072b2
--- /dev/null
+++ b/slider-funtest/src/test/groovy/org/apache/slider/funtest/lifecycle/AppsHealthMonitorIT.groovy
@@ -0,0 +1,408 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.slider.funtest.lifecycle
+
+import groovy.transform.CompileStatic
+import groovy.util.logging.Slf4j
+
+import org.apache.curator.utils.EnsurePath
+import org.apache.hadoop.yarn.api.records.YarnApplicationState
+import org.apache.slider.api.ClusterDescription
+import org.apache.slider.api.StatusKeys
+import org.apache.slider.common.SliderExitCodes
+import org.apache.slider.common.params.Arguments
+import org.apache.slider.common.params.SliderActions
+import org.apache.slider.funtest.ResourcePaths
+import org.apache.slider.funtest.framework.AgentCommandTestBase
+import org.apache.slider.funtest.framework.FuntestProperties
+import org.apache.slider.funtest.framework.SliderShell
+import org.junit.After
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.junit.runners.Parameterized
+import org.junit.runners.Parameterized.Parameter
+import org.junit.runners.Parameterized.Parameters
+
+import java.util.Arrays
+import java.util.Collection
+
+/**
+ * These are the steps required for the Health Monitor tests -
+ * - Install an app package
+ * - Create an app A with 3 containers, 60% health threshold, 5 sec health
+ * window, 2 secs poll frequency, and 1 secs init delay. Node failure
+ * threshold is kept high at 1000 to prevent it from interfering with these tests.
+ * - Create another app B with lots of containers (10K say), which will
+ * potentially eat up all the remaining resource in the default queue. Note,
+ * the idea is, that YARN will not be able to fulfil all the 10K container
+ * requests and hence a bunch of requests will be in Outstanding state.
+ * - Then test the following scenarios:
+ * > Kill one container of the app A. YARN will immediately allocate a
+ * container to app B since it had Outstanding container requests ahead of
+ * app A. So YARN will not be able to satisfy the one container request for
+ * app A. Health of app A will come down to 66.67%, but it should continue
+ * to run beyond the health window (5 secs) expiry, since it is above the
+ * threshold of 60%.
+ *
+ * - Create an app A with 3 containers, 80% health threshold, 5 sec health
+ * window, 2 secs poll frequency, and 1 secs init delay. Node failure
+ * threshold is kept high at 1000 to prevent it from interfering with these tests.
+ * - Create app B with same specs as the previous test
+ * - Then test the following scenarios:
+ * > Kill one container of the app A. YARN will immediately allocate a
+ * container to app B since it had Outstanding container requests ahead of
+ * app A. So YARN will not be able to satisfy the one container request for
+ * app A. Health of app A will come down to 66.67%, so after the health
+ * window (5 secs) expiry it should be killed, since it is below threshold
+ * of 80%.
+ *
+ * - Create an app A with 3 containers, 80% health threshold, 5 sec health
+ * window, 2 secs poll frequency, and 1 secs init delay. Node failure
+ * threshold is kept high at 1000 to prevent it from interfering with these tests.
+ * - Create app B with same specs as the previous test
+ * - Then test the following scenarios:
+ * > Kill one container of the app A. YARN will immediately allocate a
+ * container to app B since it had Outstanding container requests ahead of
+ * app A. So YARN will not be able to satisfy the one container request for
+ * app A. Health of app A will come down to 66.67%, so if the health
+ * window (5 secs) expires it will be killed as proven by the previous test.
+ * However in this test before the window expires we do a flex down of the
+ * role which brings the total containers required to 2 and hence current
+ * health goes back to 100%. As a result app A is not killed and continues
+ * to run even beyond the health window expiry.
+ *
+ * - Repeat all the above 3 test scenarios but this time with app A having
+ * unique component names enabled. So a total of 6 unique tests are run in
+ * this suite.
+ *
+ * - Note: This is a lengthy test-suite. Each test takes approx 2-3 mins, so all
+ * 6 tests in this suite take approx 12-15 mins to run. The health monitor
+ * needs multiple success and failure simulations, each with an appropriate
+ * window that has to expire before the step can be validated against the
+ * expected results.
+ */
+@RunWith(Parameterized.class)
+@CompileStatic
+@Slf4j
+public class AppsHealthMonitorIT extends AgentCommandTestBase
+ implements FuntestProperties, Arguments, SliderExitCodes, SliderActions {
+ // Component name used in the health monitor resource files. In the runs
+ // with unique component names the tests address the instances as
+ // COMMAND_LOGGER + "1", "2" and "3".
+ private static final String COMMAND_LOGGER = "COMMAND_LOGGER"
+
+ // Base names of the applications created by the tests. The tests append
+ // "-uniq-comp" to the first two when isUniqueComp is set.
+ private static final String APPLICATION_NAME_60 = "app-health-monitor-60"
+ private static final String APPLICATION_NAME_80 = "app-health-monitor-80"
+ private static final String APPLICATION_NAME_LOTS_OF_CONTAINERS =
+     "app-health-monitor-lots-of-containers"
+
+ // Resource definitions for each flavour of the app under test.
+ private static final String APP_RESOURCE_60 =
+     ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_60
+ private static final String APP_RESOURCE_80 =
+     ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_80
+ private static final String APP_RESOURCE_UNIQUE_NAMES_60 =
+     ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_UNIQUE_NAMES_60
+ private static final String APP_RESOURCE_UNIQUE_NAMES_80 =
+     ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_UNIQUE_NAMES_80
+ // Resource definition of the second app that is started to use up the
+ // remaining queue capacity.
+ private static final String APP_RESOURCE_LOTS_OF_CONTAINERS =
+     ResourcePaths.COMMAND_LOG_RESOURCES_HEALTH_MONITOR_LOTS_OF_CONTAINERS
+
+ // The fields below are filled in by the Parameterized runner from the rows
+ // returned by data(); they have to stay public and non-final for that.
+
+ // Row element 0: true when the run uses the unique component name resources.
+ @Parameter
+ public Boolean isUniqueComp
+ // Row element 1: resource file with the 60% health threshold.
+ @Parameter(1)
+ public String appResourceFor60
+ // Row element 2: resource file with the 80% health threshold.
+ @Parameter(2)
+ public String appResourceFor80
+
+ /**
+  * Supplies the parameter rows for the suite: one row for plain component
+  * names and one for unique component names.
+  *
+  * @return rows of the form [isUniqueComp, resources for 60%, resources for 80%]
+  */
+ @Parameters
+ public static Collection<Object[]> data() {
+   Object[] plainNamesRun = [
+       Boolean.FALSE,
+       APP_RESOURCE_60,
+       APP_RESOURCE_80
+   ]
+   Object[] uniqueNamesRun = [
+       Boolean.TRUE,
+       APP_RESOURCE_UNIQUE_NAMES_60,
+       APP_RESOURCE_UNIQUE_NAMES_80
+   ]
+   Object[][] rows = [plainNamesRun, uniqueNamesRun]
+   return Arrays.asList(rows)
+ }
+
+ /**
+  * Cleans up every application a test of the current parameter run may have
+  * created: both apps under test and the app with lots of containers.
+  */
+ @After
+ public void destroyCluster() {
+   // the apps under test carry a suffix in the unique component name runs
+   String suffix = isUniqueComp ? "-uniq-comp" : ""
+   cleanup(APPLICATION_NAME_60 + suffix)
+   cleanup(APPLICATION_NAME_80 + suffix)
+   cleanup(APPLICATION_NAME_LOTS_OF_CONTAINERS)
+ }
+
+ /**
+  * Starts the app under test with the 60% threshold resources, starts the
+  * app with lots of containers, kills one container of the app under test
+  * and verifies that the app is still up, with 2 live containers, after
+  * waiting past the health window.
+  */
+ @Test
+ public void testHealthMonitorAppRunning() throws Throwable {
+   describe("Running testHealthMonitorAppRunning for apps with resources " +
+       appResourceFor60 + " and " + appResourceFor80 + " with unique comp = " +
+       isUniqueComp)
+   assumeAgentTestsEnabled()
+   String appName = isUniqueComp ?
+       APPLICATION_NAME_60 + "-uniq-comp" : APPLICATION_NAME_60
+   cleanup(appName)
+   cleanup(APPLICATION_NAME_LOTS_OF_CONTAINERS)
+
+   File launchReportFile = createTempJsonFile()
+   SliderShell shell = createTemplatedSliderApplication(
+       appName,
+       APP_TEMPLATE,
+       appResourceFor60,
+       [],
+       launchReportFile)
+   logShell(shell)
+
+   def appId = ensureYarnApplicationIsUp(launchReportFile)
+   if (isUniqueComp) {
+     // Wait until a container has been requested for each uniquely named
+     // component, not just the first one, before asserting on liveness.
+     for (String index : ["1", "2", "3"]) {
+       expectContainerRequestedCountReached(appName, COMMAND_LOGGER + index, 1,
+           CONTAINER_LAUNCH_TIMEOUT)
+     }
+     for (String index : ["1", "2", "3"]) {
+       assertContainersLive(appName, COMMAND_LOGGER + index, 1)
+     }
+   } else {
+     expectContainerRequestedCountReached(appName, COMMAND_LOGGER, 3,
+         CONTAINER_LAUNCH_TIMEOUT)
+     assertContainersLive(appName, COMMAND_LOGGER, 3)
+   }
+
+   // Wait for 2 secs to get past the init delay so that the health monitor
+   // polling has started
+   describe("Wait for 2 secs to let the health monitor polling to start")
+   sleep(1000 * 2)
+
+   // Now bring up an app which will eat up all the remaining resources of the
+   // default queue of the cluster and ensure it is up and running. Currently
+   // it has 10,000 containers which is about 1.28TB of memory. Note, if this
+   // test is executed in a queue with more than 1.28TB of memory (very slim
+   // chance), then it will very likely fail.
+   File launchReportFileLotsOfContainers = createTempJsonFile()
+   shell = createTemplatedSliderApplication(
+       APPLICATION_NAME_LOTS_OF_CONTAINERS,
+       APP_TEMPLATE,
+       APP_RESOURCE_LOTS_OF_CONTAINERS,
+       [],
+       launchReportFileLotsOfContainers)
+   logShell(shell)
+
+   ensureYarnApplicationIsUp(launchReportFileLotsOfContainers)
+   // Wait for 10 secs to let the containers come up (until no more resource is
+   // left in the default queue)
+   describe("Wait 10 secs to let containers come up and eat up all the memory")
+   sleep(1000 * 10)
+
+   // kill one container which will bring health down to about 66.67% but app
+   // should continue to run, since threshold is 60%
+   ClusterDescription cd = execStatus(appName)
+   String victimComponent = isUniqueComp ? COMMAND_LOGGER + "3" : COMMAND_LOGGER
+   String containerId = cd.instances.get(victimComponent).get(0)
+   describe("Killing container " + containerId)
+   killContainer(appName, containerId)
+
+   describe("Wait for 10 secs to ensure no container was allocated even after " +
+       "expiry of health window, but then the app should continue to run")
+   sleep(1000 * 10)
+   ensureYarnApplicationIsUp(appId)
+   // Also assert that only 2 containers are running
+   if (isUniqueComp) {
+     assertContainersLive(appName, COMMAND_LOGGER + "1", 1)
+     assertContainersLive(appName, COMMAND_LOGGER + "2", 1)
+     assertContainersLive(appName, COMMAND_LOGGER + "3", 0)
+   } else {
+     assertContainersLive(appName, COMMAND_LOGGER, 2)
+   }
+ }
+
+ /**
+  * Starts the app under test with the 80% threshold resources, starts the
+  * app with lots of containers, kills one container of the app under test
+  * and verifies that the app is no longer up after waiting past the health
+  * window.
+  */
+ @Test
+ public void testHealthMonitorAppStopped() throws Throwable {
+   describe("Running testHealthMonitorAppStopped for apps with resources " +
+       appResourceFor60 + " and " + appResourceFor80 + " with unique comp = " +
+       isUniqueComp)
+   assumeAgentTestsEnabled()
+   String appName = isUniqueComp ?
+       APPLICATION_NAME_80 + "-uniq-comp" : APPLICATION_NAME_80
+   cleanup(appName)
+   cleanup(APPLICATION_NAME_LOTS_OF_CONTAINERS)
+
+   File launchReportFile = createTempJsonFile()
+   SliderShell shell = createTemplatedSliderApplication(
+       appName,
+       APP_TEMPLATE,
+       appResourceFor80,
+       [],
+       launchReportFile)
+   logShell(shell)
+
+   ensureYarnApplicationIsUp(launchReportFile)
+   if (isUniqueComp) {
+     // Wait until a container has been requested for each uniquely named
+     // component, not just the first one, before asserting on liveness.
+     for (String index : ["1", "2", "3"]) {
+       expectContainerRequestedCountReached(appName, COMMAND_LOGGER + index, 1,
+           CONTAINER_LAUNCH_TIMEOUT)
+     }
+     for (String index : ["1", "2", "3"]) {
+       assertContainersLive(appName, COMMAND_LOGGER + index, 1)
+     }
+   } else {
+     expectContainerRequestedCountReached(appName, COMMAND_LOGGER, 3,
+         CONTAINER_LAUNCH_TIMEOUT)
+     assertContainersLive(appName, COMMAND_LOGGER, 3)
+   }
+
+   // Wait for 2 secs to get past the init delay so that the health monitor
+   // polling has started
+   describe("Wait for 2 secs to let the health monitor polling to start")
+   sleep(1000 * 2)
+
+   // Now bring up app B
+   File launchReportFileLotsOfContainers = createTempJsonFile()
+   shell = createTemplatedSliderApplication(
+       APPLICATION_NAME_LOTS_OF_CONTAINERS,
+       APP_TEMPLATE,
+       APP_RESOURCE_LOTS_OF_CONTAINERS,
+       [],
+       launchReportFileLotsOfContainers)
+   logShell(shell)
+
+   ensureYarnApplicationIsUp(launchReportFileLotsOfContainers)
+   // Wait for 10 secs to let the containers come up (until no more resource is
+   // left in the default queue)
+   describe("Wait 10 secs to let containers come up and eat up all the memory")
+   sleep(1000 * 10)
+
+   // kill one container which will bring health down to about 66.67% and app
+   // should be shutdown after health window expires, since threshold is 80%
+   ClusterDescription cd = execStatus(appName)
+   String victimComponent = isUniqueComp ? COMMAND_LOGGER + "3" : COMMAND_LOGGER
+   String containerId = cd.instances.get(victimComponent).get(0)
+   describe("Killing container " + containerId)
+   killContainer(appName, containerId)
+
+   describe("Wait 10 secs to give sufficient time for the app to be stopped")
+   sleep(1000 * 10)
+   if (isApplicationUp(appName)) {
+     fail("Application should have been shutdown, but is still running")
+   }
+ }
+
+ /**
+  * Starts the app under test with the 80% threshold resources, starts the
+  * app with lots of containers, kills one container of the app under test
+  * and then immediately flexes COMMAND_LOGGER down by one. Verifies that the
+  * app is still up, with 2 live containers, after waiting past the health
+  * window.
+  */
+ @Test
+ public void testHealthMonitorAppSavedByFlex() throws Throwable {
+   describe("Running testHealthMonitorAppSavedByFlex for apps with resources " +
+       appResourceFor60 + " and " + appResourceFor80 + " with unique comp = " +
+       isUniqueComp)
+   assumeAgentTestsEnabled()
+   String appName = isUniqueComp ?
+       APPLICATION_NAME_80 + "-uniq-comp" : APPLICATION_NAME_80
+   cleanup(appName)
+   cleanup(APPLICATION_NAME_LOTS_OF_CONTAINERS)
+
+   File launchReportFile = createTempJsonFile()
+   SliderShell shell = createTemplatedSliderApplication(
+       appName,
+       APP_TEMPLATE,
+       appResourceFor80,
+       [],
+       launchReportFile)
+   logShell(shell)
+
+   def appId = ensureYarnApplicationIsUp(launchReportFile)
+   if (isUniqueComp) {
+     // Wait until a container has been requested for each uniquely named
+     // component, not just the first one, before asserting on liveness.
+     for (String index : ["1", "2", "3"]) {
+       expectContainerRequestedCountReached(appName, COMMAND_LOGGER + index, 1,
+           CONTAINER_LAUNCH_TIMEOUT)
+     }
+     for (String index : ["1", "2", "3"]) {
+       assertContainersLive(appName, COMMAND_LOGGER + index, 1)
+     }
+   } else {
+     expectContainerRequestedCountReached(appName, COMMAND_LOGGER, 3,
+         CONTAINER_LAUNCH_TIMEOUT)
+     assertContainersLive(appName, COMMAND_LOGGER, 3)
+   }
+
+   // Wait for 2 secs to get past the init delay so that the health monitor
+   // polling has started
+   describe("Wait for 2 secs to let the health monitor polling to start")
+   sleep(1000 * 2)
+
+   // Now bring up app B
+   File launchReportFileLotsOfContainers = createTempJsonFile()
+   shell = createTemplatedSliderApplication(
+       APPLICATION_NAME_LOTS_OF_CONTAINERS,
+       APP_TEMPLATE,
+       APP_RESOURCE_LOTS_OF_CONTAINERS,
+       [],
+       launchReportFileLotsOfContainers)
+   logShell(shell)
+
+   ensureYarnApplicationIsUp(launchReportFileLotsOfContainers)
+   // Wait for 10 secs to let the containers come up (until no more resource is
+   // left in the default queue)
+   describe("Wait 10 secs to let containers come up and eat up all the memory")
+   sleep(1000 * 10)
+
+   // kill one container which will bring health down to about 66.67% and app
+   // could be shutdown if health window expires, since threshold is 80%
+   ClusterDescription cd = execStatus(appName)
+   String victimComponent = isUniqueComp ? COMMAND_LOGGER + "3" : COMMAND_LOGGER
+   String containerId = cd.instances.get(victimComponent).get(0)
+   describe("Killing container " + containerId)
+   killContainer(appName, containerId)
+
+   // Before the health window expires, let's do a flex down to bring the
+   // health above threshold and prevent the app from being killed. Let's not
+   // do any additional checks after the kill container.
+   describe("Flexing down by 1 container")
+   slider(EXIT_SUCCESS,
+       [
+           ACTION_FLEX,
+           appName,
+           ARG_COMPONENT,
+           COMMAND_LOGGER,
+           "-1"
+       ])
+
+   describe("Wait for 10 secs to give sufficient time for the health window " +
+       "to expire, and the app should continue to run")
+   sleep(1000 * 10)
+   ensureYarnApplicationIsUp(appId)
+   // Now assert that only 2 containers are running
+   if (isUniqueComp) {
+     // note, after flex down the role 3 does not even exist
+     assertContainersLive(appName, COMMAND_LOGGER + "1", 1)
+     assertContainersLive(appName, COMMAND_LOGGER + "2", 1)
+   } else {
+     assertContainersLive(appName, COMMAND_LOGGER, 2)
+   }
+ }
+}