You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@storm.apache.org by ag...@apache.org on 2019/11/20 18:57:42 UTC

[storm] branch master updated: STORM-3539 Add metric for worker start time out

This is an automated email from the ASF dual-hosted git repository.

agresch pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/storm.git


View the commit online:
https://github.com/apache/storm/commit/bd42fe98a6e56ac17467f3903d41f93539d234f5

The following commit(s) were added to refs/heads/master by this push:
     new bd42fe9  STORM-3539 Add metric for worker start time out
     new 0c6648a  Merge pull request #3168 from dandsager1/STORM-3539
bd42fe9 is described below

commit bd42fe98a6e56ac17467f3903d41f93539d234f5
Author: david <da...@verizonmedia.com>
AuthorDate: Fri Nov 15 11:25:35 2019 -0600

    STORM-3539 Add metric for worker start time out
---
 docs/ClusterMetrics.md                                                  | 1 +
 storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java | 1 +
 .../src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java   | 2 ++
 3 files changed, 4 insertions(+)

diff --git a/docs/ClusterMetrics.md b/docs/ClusterMetrics.md
index 4e4d0f1..f7f7b4f 100644
--- a/docs/ClusterMetrics.md
+++ b/docs/ClusterMetrics.md
@@ -185,6 +185,7 @@ Metrics associated with the supervisor, which launches the workers for a topolog
 | supervisor:num-launched | meter | number of times the supervisor is launched. |
 | supervisor:num-shell-exceptions | meter | number of exceptions calling shell commands. |
 | supervisor:num-slots-used-gauge | gauge | number of slots used on the supervisor. |
+| supervisor:num-worker-start-timed-out | meter | number of times a worker start timed out. |
 | supervisor:num-worker-transitions-into-empty | meter | number of transitions into empty state. |
 | supervisor:num-worker-transitions-into-kill | meter | number of transitions into kill state. |
 | supervisor:num-worker-transitions-into-kill-and-relaunch | meter | number of transitions into kill-and-relaunch state |
diff --git a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
index 7575a91..df419b9 100644
--- a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
+++ b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/Slot.java
@@ -684,6 +684,7 @@ public class Slot extends Thread implements AutoCloseable, BlobChangingCallback
         long timeDiffms = (Time.currentTimeMillis() - dynamicState.startTime);
         long hbFirstTimeoutMs = getFirstHbTimeoutMs(staticState, dynamicState);
         if (timeDiffms > hbFirstTimeoutMs) {
+            staticState.slotMetrics.numWorkerStartTimedOut.mark();
             LOG.warn("SLOT {}: Container {} failed to launch in {} ms.", staticState.port, dynamicState.container,
                     hbFirstTimeoutMs);
             return killContainerFor(KillReason.HB_TIMEOUT, dynamicState, staticState);
diff --git a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
index f8e13fd..8b2f5f1 100644
--- a/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
+++ b/storm-server/src/main/java/org/apache/storm/daemon/supervisor/SlotMetrics.java
@@ -26,6 +26,7 @@ import org.apache.storm.utils.EnumUtil;
 class SlotMetrics {
 
     final Meter numWorkersLaunched;
+    final Meter numWorkerStartTimedOut;
     final Map<Slot.KillReason, Meter> numWorkersKilledFor;
     final Timer workerLaunchDuration;
     final Map<Slot.MachineState, Meter> transitionIntoState;
@@ -34,6 +35,7 @@ class SlotMetrics {
 
     SlotMetrics(StormMetricsRegistry metricsRegistry) {
         numWorkersLaunched = metricsRegistry.registerMeter("supervisor:num-workers-launched");
+        numWorkerStartTimedOut = metricsRegistry.registerMeter("supervisor:num-worker-start-timed-out");
         numWorkersKilledFor = Collections.unmodifiableMap(EnumUtil.toEnumMap(Slot.KillReason.class,
             killReason -> metricsRegistry.registerMeter("supervisor:num-workers-killed-" + killReason.toString())));
         workerLaunchDuration = metricsRegistry.registerTimer("supervisor:worker-launch-duration");