You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by go...@apache.org on 2017/10/02 05:17:02 UTC
incubator-slider git commit: SLIDER-1246 Application health should
not be affected by faulty nodes (health monitor based on percent threshold)
Repository: incubator-slider
Updated Branches:
refs/heads/develop 5696c7de3 -> 0f436c865
SLIDER-1246 Application health should not be affected by faulty nodes (health monitor based on percent threshold)
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/0f436c86
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/0f436c86
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/0f436c86
Branch: refs/heads/develop
Commit: 0f436c865a90aba5b427d1c0571183c6fcbded1e
Parents: 5696c7d
Author: Gour Saha <go...@apache.org>
Authored: Sun Oct 1 22:15:07 2017 -0700
Committer: Gour Saha <go...@apache.org>
Committed: Sun Oct 1 22:15:07 2017 -0700
----------------------------------------------------------------------
.../org/apache/slider/api/ResourceKeys.java | 46 ++++++
.../slider/core/conf/ConfTreeOperations.java | 15 ++
.../apache/slider/providers/ProviderRole.java | 4 +-
.../server/appmaster/SliderAppMaster.java | 77 +++++++++-
.../actions/MonitorHealthThreshold.java | 146 +++++++++++++++++++
.../slider/server/appmaster/state/AppState.java | 103 ++++++++++++-
.../server/appmaster/state/RoleStatus.java | 12 ++
7 files changed, 398 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
index 295f7cd..29ef8ea 100644
--- a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
@@ -159,6 +159,52 @@ public interface ResourceKeys {
int DEFAULT_CONTAINER_FAILURE_THRESHOLD = 5;
/**
+ * The container health threshold when explicitly set for a specific role or
+ * globally for all roles, will schedule a health check monitor to
+ * periodically check for the percentage of healthy containers. It runs the
+ * check at a specified/default poll frequency. It allows a role to be below
+ * the health threshold for a specified/default window after which it
+ * considers the application to be unhealthy and triggers an app stop.
+ * Accepted values are 1 to 100 inclusive; any other value (apart from the
+ * disabled marker) is rejected and no monitor is scheduled for the role.
+ */
+ String CONTAINER_HEALTH_THRESHOLD_PERCENT =
+ "yarn.container.health.threshold.percent";
+ /**
+ * Health check monitor poll frequency, in seconds. It is an advanced setting
+ * and does not need to be set unless the app owner understands the
+ * implication and does not want the default.
+ */
+ String CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC =
+ "yarn.container.health.threshold.poll.frequency.secs";
+ /**
+ * The amount of time, in seconds, the health check monitor allows a specific
+ * role to be below the health threshold after which it considers the app to
+ * be unhealthy.
+ */
+ String CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC =
+ "yarn.container.health.threshold.window.secs";
+ /**
+ * The amount of initial time, in seconds, the health check monitor waits
+ * before the first check kicks in. It gives a lead time for the app
+ * containers to come up for the first time.
+ */
+ String CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC =
+ "yarn.container.health.threshold.init.delay.secs";
+ /**
+ * By default the health threshold percent does not come into play until it is
+ * explicitly set in resource config for a specific role or globally for all
+ * roles. -1 signifies disabled.
+ */
+ int CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED = -1;
+
+ int DEFAULT_CONTAINER_HEALTH_THRESHOLD_PERCENT =
+ CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED;
+ long DEFAULT_CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC = 10; // seconds between two health checks
+ long DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC = 600; // seconds, i.e. a 10 minute window
+ // the default for init delay is same as default health window
+ long DEFAULT_CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC =
+ DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC;
+
+ /**
* Default node failure threshold for a component instance: {@value}
* Should to be lower than default component failure threshold to allow
* the component to start elsewhere
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java b/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java
index 526e17d..c8a7720 100644
--- a/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java
+++ b/slider-core/src/main/java/org/apache/slider/core/conf/ConfTreeOperations.java
@@ -459,6 +459,21 @@ public class ConfTreeOperations {
}
/**
+ * Get a component opt as a long; uses {@link Long#decode(String)} so that
+ * decimal, hex and octal values are accepted. Binary literals are not
+ * understood by {@code Long.decode}.
+ *
+ * @param name component name
+ * @param option option name
+ * @param defVal default value
+ * @return parsed value
+ * @throws NumberFormatException if the option value could not be parsed.
+ */
+ public long getComponentOptLong(String name, String option, long defVal) {
+ String val = getComponentOpt(name, option, Long.toString(defVal));
+ return Long.decode(val);
+ }
+
+ /**
* Get a component opt as a boolean using {@link Boolean#valueOf(String)}.
*
* @param name component name
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
index 4f6be52..4105b67 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
@@ -76,8 +76,8 @@ public final class ProviderRole {
* @param group role/component group
* @param id ID. This becomes the YARN priority
* @param policy placement policy
- * @param nodeFailureThreshold threshold for node failures (within a reset interval)
- * after which a node failure is considered an app failure
+ * @param nodeFailureThreshold threshold for node failures (within a reset
+ * interval) after which a node failure is considered an app failure
* @param placementTimeoutSeconds for lax placement, timeout in seconds before
* @param labelExpression label expression for requests; may be null
*/
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 3f47b98..c12fae8 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -138,6 +138,7 @@ import org.apache.slider.server.appmaster.actions.QueueService;
import org.apache.slider.server.appmaster.actions.ActionStopSlider;
import org.apache.slider.server.appmaster.actions.ActionUpgradeContainers;
import org.apache.slider.server.appmaster.actions.AsyncAction;
+import org.apache.slider.server.appmaster.actions.MonitorHealthThreshold;
import org.apache.slider.server.appmaster.actions.RenewingAction;
import org.apache.slider.server.appmaster.actions.ResetFailureWindow;
import org.apache.slider.server.appmaster.actions.ReviewAndFlexApplicationSize;
@@ -991,6 +992,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
scheduleFailureWindowResets(instanceDefinition.getResources());
scheduleEscalation(instanceDefinition.getInternal());
+ scheduleHealthThresholdMonitor(instanceDefinition.getResources());
try {
// schedule YARN Registry registration
@@ -1902,6 +1904,79 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
}
/**
+ * Schedule the health threshold monitor for all roles (except AM).
+ * <p>
+ * A monitor is scheduled for a role only when its health threshold percent
+ * is set (i.e. is not
+ * {@link ResourceKeys#CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED}) and the
+ * percent, window, init delay and poll frequency all pass validation.
+ * A role that fails validation is skipped and the reason is logged.
+ *
+ * @param resources
+ * the resource tree
+ */
+ private void scheduleHealthThresholdMonitor(ConfTree resources) {
+ ConfTreeOperations ops = new ConfTreeOperations(resources);
+ for (String roleGroup : ops.getComponentNames()) {
+ if (roleGroup.equals(SliderKeys.COMPONENT_AM)) {
+ continue;
+ }
+ // determine health threshold percent
+ int healthThresholdPercent = appState
+ .getHealthThresholdPercentForRole(roleGroup);
+ // validations
+ if (healthThresholdPercent ==
+ ResourceKeys.CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED) {
+ log.info("No health threshold monitor enabled for role {}", roleGroup);
+ continue;
+ }
+ // if threshold set to outside acceptable range then don't enable monitor
+ if (healthThresholdPercent <= 0 || healthThresholdPercent > 100) {
+ log.error(
+ "Invalid health threshold percent {}% for role {}. Monitor not "
+ + "enabled.",
+ healthThresholdPercent, roleGroup);
+ continue;
+ }
+ // determine the threshold properties
+ long window = ops.getComponentOptLong(roleGroup,
+ ResourceKeys.CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC,
+ ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC);
+ long initDelay = ops.getComponentOptLong(roleGroup,
+ ResourceKeys.CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC,
+ ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC);
+ long pollFrequency = ops.getComponentOptLong(roleGroup,
+ ResourceKeys.CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC,
+ ResourceKeys.DEFAULT_CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC);
+ // validations
+ if (window <= 0) {
+ log.error(
+ "Invalid health monitor window {} secs for role {}. Monitor not "
+ + "enabled.",
+ window, roleGroup);
+ continue;
+ }
+ if (initDelay < 0) {
+ log.error("Invalid health monitor init delay {} secs for role {}. "
+ + "Monitor not enabled.", initDelay, roleGroup);
+ continue;
+ }
+ if (pollFrequency <= 0) {
+ log.error("Invalid health monitor poll frequency {} secs for role {}. "
+ + "Monitor not enabled.", pollFrequency, roleGroup);
+ continue;
+ }
+ log.info(
+ "Scheduling the health threshold monitor for role {} with percent = "
+ + "{}%, window = {} secs, poll freq = {} secs, init-delay = {} "
+ + "secs",
+ roleGroup, healthThresholdPercent, window, pollFrequency, initDelay);
+ MonitorHealthThreshold monitor = new MonitorHealthThreshold(roleGroup,
+ healthThresholdPercent, window);
+ RenewingAction<MonitorHealthThreshold> renew = new RenewingAction<>(
+ monitor, initDelay, pollFrequency, TimeUnit.SECONDS, 0);
+ // Register each role's monitor under its own name. With one shared name
+ // the monitor of a later role would take the place of the monitor of an
+ // earlier role, leaving that role marked as monitored but never checked.
+ // NOTE(review): assumes renewing actions are keyed by name - confirm
+ // against the action queue implementation.
+ actionQueues.renewing("healthThresholdMonitor-" + roleGroup, renew);
+ // Mark that health threshold monitor is enabled for this role. Can be
+ // used to disable the failure threshold check.
+ appState.setHealthThresholdMonitorEnabled(roleGroup, true);
+ }
+ }
+
+ /**
* Schedule the escalation action
* @param internal
* @throws BadConfigException
@@ -2078,7 +2153,7 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
public void onError(Throwable e) {
if (e instanceof InvalidResourceRequestException) {
// stop the cluster
- LOG_YARN.error("AMRMClientAsync.onError() received {}", e, e);
+ LOG_YARN.error("AMRMClientAsync.onError() received {}", e);
ActionStopSlider stopSlider = new ActionStopSlider("stop",
EXIT_EXCEPTION_THROWN, FinalApplicationStatus.FAILED,
SliderUtils.extractFirstLine(e.getLocalizedMessage()));
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java
new file mode 100644
index 0000000..cc6f13c
--- /dev/null
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/actions/MonitorHealthThreshold.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.slider.server.appmaster.actions;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
+import org.apache.slider.api.SliderExitReason;
+import org.apache.slider.core.main.LauncherExitCodes;
+import org.apache.slider.server.appmaster.SliderAppMaster;
+import org.apache.slider.server.appmaster.state.AppState;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Monitors at a regular interval if the container health for a specific role
+ * has dropped below a desired threshold.
+ * <p>
+ * Health is the ratio of live containers to desired containers of the role.
+ * When it stays below the threshold for longer than the configured window an
+ * {@link ActionStopSlider} is queued to stop the application.
+ */
+public class MonitorHealthThreshold extends AsyncAction {
+ protected static final Logger log = LoggerFactory
+ .getLogger(MonitorHealthThreshold.class);
+
+ private final String roleGroup;
+ private final int healthThresholdPercent;
+ private final long healthThresholdWindowSecs;
+ private final long healthThresholdWindowNanos;
+ // value of now() at which the role first dropped below the threshold; 0
+ // means that the role is not currently below the threshold
+ private long firstOccurrenceTimestamp = 0;
+ // Sufficient logging happens when role health is below threshold. However,
+ // there has to be some logging when it is above threshold, otherwise app
+ // owners have no idea how the health is fluctuating. So let's log whenever
+ // there is a change in role health, thereby preventing excessive logging on
+ // every poll.
+ private float prevRunningContainerFraction = 0;
+
+ /**
+ * @param roleGroup role group to monitor
+ * @param healthThresholdPercent percent of desired containers that must be
+ * live for the role to count as healthy
+ * @param healthThresholdWindowSecs time in seconds the role may stay below
+ * the threshold before the app is stopped
+ */
+ public MonitorHealthThreshold(String roleGroup, int healthThresholdPercent,
+ long healthThresholdWindowSecs) {
+ super("MonitorHealthThreshold");
+ this.roleGroup = roleGroup;
+ this.healthThresholdPercent = healthThresholdPercent;
+ this.healthThresholdWindowSecs = healthThresholdWindowSecs;
+ this.healthThresholdWindowNanos = TimeUnit.NANOSECONDS
+ .convert(healthThresholdWindowSecs, TimeUnit.SECONDS);
+ }
+
+ /**
+ * Run one health check for the role and queue an app stop if the role has
+ * been below the threshold for longer than the window.
+ */
+ @Override
+ public void execute(SliderAppMaster appMaster, QueueAccess queueService,
+ AppState appState) throws Exception {
+ log.debug("MonitorHealthThreshold execute method");
+ // Perform container health checks against desired threshold
+ synchronized (appMaster) {
+ long desiredContainerCount = appState.getDesiredContainerCount(roleGroup);
+ // if desired container count for this role is 0 then nothing to do
+ if (desiredContainerCount == 0) {
+ // A role with no desired containers cannot be below threshold. Forget
+ // any breach that was in progress, otherwise a later increase of the
+ // desired count would be measured against the stale timestamp and
+ // could stop the app on the very first poll.
+ if (firstOccurrenceTimestamp != 0) {
+ log.info("Role = {}, resetting first occurrence to 0, since its "
+ + "desired container count is 0", roleGroup);
+ firstOccurrenceTimestamp = 0;
+ }
+ return;
+ }
+ long runningContainerCount = appState.getLiveContainerCount(roleGroup);
+ float thresholdFraction = (float) healthThresholdPercent / 100;
+ // no possibility of div by 0 since desiredContainerCount won't be 0 here
+ float runningContainerFraction = (float) runningContainerCount
+ / desiredContainerCount;
+ boolean healthChanged = false;
+ if (runningContainerFraction != prevRunningContainerFraction) {
+ prevRunningContainerFraction = runningContainerFraction;
+ healthChanged = true;
+ }
+ String runningContainerPercentStr = String.format("%.2f",
+ runningContainerFraction * 100);
+ // Check if the current running container percent is less than the
+ // threshold percent
+ if (runningContainerFraction < thresholdFraction) {
+ // Check if it is the first occurrence and if yes set the timestamp
+ long currentTimestamp = now();
+ if (firstOccurrenceTimestamp == 0) {
+ firstOccurrenceTimestamp = currentTimestamp;
+ log.info("Role {} is going below health threshold for the first time "
+ + "at ts = {}", roleGroup, firstOccurrenceTimestamp);
+ }
+ // the difference of two now() values is treated as nanoseconds
+ long elapsedTime = currentTimestamp - firstOccurrenceTimestamp;
+ long elapsedTimeSecs = TimeUnit.SECONDS.convert(elapsedTime,
+ TimeUnit.NANOSECONDS);
+ log.warn(
+ "Role = {}, Current health = {}%, is below Health threshold of {}% "
+ + "for {} secs (window = {} secs)",
+ roleGroup, runningContainerPercentStr, healthThresholdPercent,
+ elapsedTimeSecs, healthThresholdWindowSecs);
+ if (elapsedTime > healthThresholdWindowNanos) {
+ log.error(
+ "Role = {}, Current health = {}%, has been below health "
+ + "threshold of {}% for {} secs (threshold window = {} secs)",
+ roleGroup, runningContainerPercentStr, healthThresholdPercent,
+ elapsedTimeSecs, healthThresholdWindowSecs);
+ // Trigger an app stop
+ ActionStopSlider stopSlider = new ActionStopSlider("stop",
+ LauncherExitCodes.EXIT_EXCEPTION_THROWN,
+ FinalApplicationStatus.FAILED,
+ String.format(
+ "Application was killed because container health for role %s "
+ + "was %s%% (threshold = %d%%) for %d secs (threshold "
+ + "window = %d secs)",
+ roleGroup, runningContainerPercentStr, healthThresholdPercent,
+ elapsedTimeSecs, healthThresholdWindowSecs));
+ stopSlider.setExitReason(SliderExitReason.APP_ERROR);
+ appMaster.queue(stopSlider);
+ }
+ } else {
+ String logMsg = "Role = {}, Health threshold = {}%, Current health = "
+ + "{}% (Current Running count = {}, Desired count = {})";
+ if (healthChanged) {
+ log.info(logMsg, roleGroup, healthThresholdPercent,
+ runningContainerPercentStr, runningContainerCount,
+ desiredContainerCount);
+ } else {
+ log.debug(logMsg, roleGroup, healthThresholdPercent,
+ runningContainerPercentStr, runningContainerCount,
+ desiredContainerCount);
+ }
+ // The container health might have recovered above threshold after being
+ // below for less than the threshold window amount of time. So we need
+ // to reset firstOccurrenceTimestamp to 0.
+ if (firstOccurrenceTimestamp != 0) {
+ log.info(
+ "Role = {}, resetting first occurence to 0, since it recovered "
+ + "above health threshold of {}%",
+ roleGroup, healthThresholdPercent);
+ firstOccurrenceTimestamp = 0;
+ }
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index 9e56870..d33e92e 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -72,6 +72,7 @@ import org.apache.slider.core.persist.AggregateConfSerDeser;
import org.apache.slider.core.persist.ConfTreeSerDeser;
import org.apache.slider.providers.PlacementPolicy;
import org.apache.slider.providers.ProviderRole;
+import org.apache.slider.server.appmaster.actions.MonitorHealthThreshold;
import org.apache.slider.server.appmaster.management.LongGauge;
import org.apache.slider.server.appmaster.management.MetricsAndMonitoring;
import org.apache.slider.server.appmaster.management.MetricsConstants;
@@ -274,6 +275,12 @@ public class AppState {
private final AtomicInteger completionOfUnknownContainerEvent =
new AtomicInteger();
+ /**
+ * This simply keeps track of the current set of live container ids for all
+ * roles and is primarily used by the {@link MonitorHealthThreshold} class.
+ * The map is keyed by role group name; an entry for a role group is created
+ * the first time a container is added for it.
+ */
+ private final Map<String, Set<ContainerId>> currentLiveContainers =
+ new ConcurrentHashMap<>();
/**
* limits of container core numbers in this queue
@@ -293,6 +300,9 @@ public class AppState {
private int failureThreshold = 10;
private int nodeFailureThreshold = 3;
+ // health threshold is disabled by default
+ private int healthThresholdPercent =
+ CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED;
private static String logServerURL = "";
@@ -337,6 +347,69 @@ public class AppState {
}
/**
+ * Get the no of containers running for a specific role at the time of this
+ * API call. It includes owned containers, meaning containers which have been
+ * allocated even if the app is not completely deployed and/or started in the
+ * container.
+ */
+ public int getLiveContainerCount(String roleGroup) {
+ if (roleGroup == null) {
+ return 0;
+ }
+ Set<ContainerId> containers = currentLiveContainers.get(roleGroup);
+ log.debug("Current live containers = {} for role {}", containers, roleGroup);
+ return containers == null ? 0 : containers.size();
+ }
+
+ public long getDesiredContainerCount(String roleGroup)
+ throws BadConfigException {
+ return getDesiredInstanceCount(getResourcesSnapshot(), roleGroup);
+ }
+
+ /**
+ * Set the health threshold monitor flag on every role status whose group
+ * matches the given role group.
+ *
+ * @param roleGroup role group to match
+ * @param enabled new value of the flag
+ */
+ public void setHealthThresholdMonitorEnabled(String roleGroup,
+ boolean enabled) {
+ for (RoleStatus status : getRoleStatusMap().values()) {
+ if (!status.getGroup().equals(roleGroup)) {
+ continue;
+ }
+ status.setHealthThresholdMonitorEnabled(enabled);
+ }
+ }
+
+ /**
+ * Add a container to the live container set of a role group. Adding a
+ * container that is already in the set has no effect.
+ *
+ * @param roleGroup role group of the container; a null group is ignored
+ * @param cId container id
+ */
+ public void addLiveContainer(String roleGroup, ContainerId cId) {
+ if (roleGroup == null) {
+ // nothing is tracked for a null group, so do not report an add
+ return;
+ }
+ log.info("Adding live container {} to role {}", cId, roleGroup);
+ Set<ContainerId> containers = currentLiveContainers.get(roleGroup);
+ if (containers == null) {
+ // Create the role entry under a lock and re-check inside it, so that two
+ // first-time adds for the same role cannot replace each other's set.
+ synchronized (currentLiveContainers) {
+ containers = currentLiveContainers.get(roleGroup);
+ if (containers == null) {
+ // new role entry
+ containers = Collections
+ .newSetFromMap(new ConcurrentHashMap<ContainerId, Boolean>());
+ currentLiveContainers.put(roleGroup, containers);
+ }
+ }
+ }
+ containers.add(cId);
+ }
+
+ /**
+ * Drop a container from the live container set of its role group. A null
+ * role group is ignored; an unknown role group only produces a warning.
+ *
+ * @param roleGroup role group of the container
+ * @param cId container id
+ */
+ public void removeLiveContainer(String roleGroup, ContainerId cId) {
+ if (roleGroup == null) {
+ return;
+ }
+ Set<ContainerId> liveIds = currentLiveContainers.get(roleGroup);
+ if (liveIds == null) {
+ log.warn(
+ "Nothing to remove as role {} does not exist in currentLiveContainers",
+ roleGroup);
+ return;
+ }
+ log.info("Removing live container {} from role {}", cId, roleGroup);
+ liveIds.remove(cId);
+ }
+
+ /**
* Increment the count
*/
public void incFailedCountainerCount() {
@@ -586,6 +659,9 @@ public class AppState {
nodeFailureThreshold = globalResOpts.getOptionInt(
NODE_FAILURE_THRESHOLD,
DEFAULT_NODE_FAILURE_THRESHOLD);
+ healthThresholdPercent = globalResOpts.getOptionInt(
+ CONTAINER_HEALTH_THRESHOLD_PERCENT,
+ DEFAULT_CONTAINER_HEALTH_THRESHOLD_PERCENT);
initClusterStatus();
@@ -1091,7 +1167,14 @@ public class AppState {
* @return the instance removed
*/
private RoleInstance removeOwnedContainer(ContainerId id) {
- return ownedContainers.remove(id);
+ RoleInstance ri = ownedContainers.remove(id);
+ if (ri == null) {
+ log.warn("RoleInstance is null for container {}", id); // id was not in ownedContainers
+ } else {
+ log.debug("RoleInstance = {}", ri);
+ removeLiveContainer(ri.group, id); // keep the per-role live set in step with ownedContainers
+ }
+ return ri;
}
/**
@@ -1102,6 +1185,7 @@ public class AppState {
*/
private RoleInstance putOwnedContainer(ContainerId id,
RoleInstance instance) {
+ addLiveContainer(instance.group, id); // track the container in the per-role live set as well
return ownedContainers.put(id, instance);
}
@@ -2076,6 +2160,18 @@ public class AppState {
}
/**
+ * Get the health threshold percent for a specific role, falling back to
+ * the global one if the role does not set its own.
+ * @param roleGroup role group
+ * @return the threshold percent for health
+ */
+ public int getHealthThresholdPercentForRole(String roleGroup) {
+ ConfTreeOperations resources = instanceDefinition.getResourceOperations();
+ return resources.getComponentOptInt(roleGroup,
+ CONTAINER_HEALTH_THRESHOLD_PERCENT, healthThresholdPercent);
+ }
+
+ /**
* Get the node failure threshold for a specific role, falling back to
* the global one if not
* @param roleGroup role group
@@ -2154,7 +2250,10 @@ public class AppState {
log.info("Reviewing {} : ", role);
log.debug("Expected {}, Delta: {}", expected, delta);
- checkFailureThreshold(role);
+ // If health threshold monitor is disabled then check for failure threshold
+ if (!role.isHealthThresholdMonitorEnabled()) {
+ checkFailureThreshold(role);
+ }
if (expected < 0 ) {
// negative value: fail
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0f436c86/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
index 694f5cf..f46ed74 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
@@ -82,6 +82,8 @@ public final class RoleStatus implements Cloneable, MetricSet {
private String failureMessage = "";
private final Set<ContainerId> failedContainers = new HashSet<>();
+ private boolean healthThresholdMonitorEnabled = false;
+
public RoleStatus(ProviderRole providerRole) {
this.providerRole = providerRole;
this.name = providerRole.name;
@@ -458,6 +460,8 @@ public final class RoleStatus implements Cloneable, MetricSet {
sb.append(", failureMessage='").append(failureMessage).append('\'');
sb.append(", providerRole=").append(providerRole);
sb.append(", failedContainers=").append(failedContainers);
+ sb.append(", healthThresholdMonitorEnabled=")
+ .append(healthThresholdMonitorEnabled);
sb.append('}');
return sb.toString();
}
@@ -577,4 +581,12 @@ public final class RoleStatus implements Cloneable, MetricSet {
return stats;
}
+ /**
+ * @return true if the health threshold monitor has been marked as enabled
+ * for this role
+ */
+ public boolean isHealthThresholdMonitorEnabled() {
+ return this.healthThresholdMonitorEnabled;
+ }
+
+ /**
+ * Record whether the health threshold monitor is enabled for this role.
+ *
+ * @param healthThresholdMonitorEnabled new value of the flag
+ */
+ public void setHealthThresholdMonitorEnabled(
+ final boolean healthThresholdMonitorEnabled) {
+ this.healthThresholdMonitorEnabled = healthThresholdMonitorEnabled;
+ }
}