You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@storm.apache.org by ag...@apache.org on 2019/11/20 18:57:42 UTC
[storm] branch master updated: STORM-3539 Add metric for worker start time out
This is an automated email from the ASF dual-hosted git repository.
agresch pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/storm.git
View the commit online:
https://github.com/apache/storm/commit/bd42fe98a6e56ac17467f3903d41f93539d234f5
The following commit(s) were added to refs/heads/master by this push:
new bd42fe9 STORM-3539 Add metric for worker start time out
new 0c6648a Merge pull request #3168 from dandsager1/STORM-3539
bd42fe9 is described below
commit bd42fe98a6e56ac17467f3903d41f93539d234f5
Author: david <da...@verizonmedia.com>
AuthorDate: Fri Nov 15 11:25:35 2019 -0600
STORM-3539 Add metric for worker start time out
---
docs/ClusterMetrics.md | 1 +
storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java | 1 +
.../src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java | 2 ++
3 files changed, 4 insertions(+)
diff --git a/docs/ClusterMetrics.md b/docs/ClusterMetrics.md
index 4e4d0f1..f7f7b4f 100644
--- a/docs/ClusterMetrics.md
+++ b/docs/ClusterMetrics.md
@@ -185,6 +185,7 @@ Metrics associated with the supervisor, which launches the workers for a topolog
| supervisor:num-launched | meter | number of times the supervisor is launched. |
| supervisor:num-shell-exceptions | meter | number of exceptions calling shell commands. |
| supervisor:num-slots-used-gauge | gauge | number of slots used on the supervisor. |
+| supervisor:num-worker-start-timed-out | meter | number of times a worker failed to start within the first-heartbeat timeout. |
| supervisor:num-worker-transitions-into-empty | meter | number of transitions into empty state. |
| supervisor:num-worker-transitions-into-kill | meter | number of transitions into kill state. |
| supervisor:num-worker-transitions-into-kill-and-relaunch | meter | number of transitions into kill-and-relaunch state |
diff --git a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
index 7575a91..df419b9 100644
--- a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
+++ b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
@@ -684,6 +684,7 @@ public class Slot extends Thread implements AutoCloseable, BlobChangingCallback
long timeDiffms = (Time.currentTimeMillis() - dynamicState.startTime);
long hbFirstTimeoutMs = getFirstHbTimeoutMs(staticState, dynamicState);
if (timeDiffms > hbFirstTimeoutMs) {
+ staticState.slotMetrics.numWorkerStartTimedOut.mark();
LOG.warn("SLOT {}: Container {} failed to launch in {} ms.", staticState.port, dynamicState.container,
hbFirstTimeoutMs);
return killContainerFor(KillReason.HB_TIMEOUT, dynamicState, staticState);
diff --git a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
index f8e13fd..8b2f5f1 100644
--- a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
+++ b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
@@ -26,6 +26,7 @@ import org.apache.storm.utils.EnumUtil;
class SlotMetrics {
final Meter numWorkersLaunched;
+ final Meter numWorkerStartTimedOut;
final Map<Slot.KillReason, Meter> numWorkersKilledFor;
final Timer workerLaunchDuration;
final Map<Slot.MachineState, Meter> transitionIntoState;
@@ -34,6 +35,7 @@ class SlotMetrics {
SlotMetrics(StormMetricsRegistry metricsRegistry) {
numWorkersLaunched = metricsRegistry.registerMeter("supervisor:num-workers-launched");
+ numWorkerStartTimedOut = metricsRegistry.registerMeter("supervisor:num-worker-start-timed-out");
numWorkersKilledFor = Collections.unmodifiableMap(EnumUtil.toEnumMap(Slot.KillReason.class,
killReason -> metricsRegistry.registerMeter("supervisor:num-workers-killed-" + killReason.toString())));
workerLaunchDuration = metricsRegistry.registerTimer("supervisor:worker-launch-duration");