You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@storm.apache.org by et...@apache.org on 2020/04/09 20:32:06 UTC

[storm] branch master updated: STORM-3618 add meter to track scheduling errors

This is an automated email from the ASF dual-hosted git repository.

ethanli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/storm.git


The following commit(s) were added to refs/heads/master by this push:
     new 15d5872  STORM-3618 add meter to track scheduling errors
     new 3eca57d  Merge pull request #3246 from agresch/agresch_storm_3618
15d5872 is described below

commit 15d58729ef14c45c85d19faa5d409bb8ceae5006
Author: Aaron Gresch <ag...@yahoo-inc.com>
AuthorDate: Wed Apr 8 15:18:56 2020 -0500

    STORM-3618 add meter to track scheduling errors
---
 docs/ClusterMetrics.md                                                 | 3 ++-
 .../org/apache/storm/scheduler/resource/ResourceAwareScheduler.java    | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/ClusterMetrics.md b/docs/ClusterMetrics.md
index 7760f51..c7e9b69 100644
--- a/docs/ClusterMetrics.md
+++ b/docs/ClusterMetrics.md
@@ -58,6 +58,7 @@ These are metrics that are specific to a nimbus instance.  In many instances onl
 |-------------|------|-------------|
 | nimbus:files-upload-duration-ms | timer | Time it takes to upload a file from start to finish (Not Blobs, but this may change) |
 | nimbus:longest-scheduling-time-ms | gauge | Longest time ever taken so far to schedule. This includes the current scheduling run, which is intended to detect if scheduling is stuck for some reason. |
+| nimbus:mkAssignments-Errors | meter | tracks exceptions from mkAssignments |
 | nimbus:num-activate-calls | meter | calls to the activate thrift method. |
 | nimbus:num-added-executors-per-scheduling | histogram | number of executors added after a scheduling run. |
 | nimbus:num-added-slots-per-scheduling | histogram |  number of slots added after a scheduling run. |
@@ -102,7 +103,7 @@ These are metrics that are specific to a nimbus instance.  In many instances onl
 | nimbus:num-uploadChunk-calls | meter | calls to uploadChunk thrift method. |
 | nimbus:num-uploadNewCredentials-calls | meter | calls to uploadNewCredentials thrift method. |
 | nimbus:process-worker-metric-calls | meter | calls to processWorkerMetrics thrift method. |
-| nimbus:mkAssignments-Errors | meter | tracks exceptions from mkAssignments |
+| nimbus:scheduler-internal-errors | meter | tracks exceptions thrown while the ResourceAwareScheduler is scheduling a topology (reported to the user as an internal error) |
 | nimbus:topology-scheduling-duration-ms | timer | time it takes to do a scheduling run. |
 | nimbus:total-available-memory-non-negative | gauge | available memory on the cluster MB |
 | nimbuses:uptime-secs | histogram | uptime of nimbuses |
diff --git a/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java b/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
index f7e34ec..a26246a 100644
--- a/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
+++ b/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
@@ -58,6 +58,7 @@ public class ResourceAwareScheduler implements IScheduler {
     private int schedulingTimeoutSeconds;
     private ExecutorService backgroundScheduling;
     private Meter schedulingTimeoutMeter;
+    private Meter internalErrorMeter;
 
     private static void markFailedTopology(User u, Cluster c, TopologyDetails td, String message) {
         markFailedTopology(u, c, td, message, null);
@@ -78,6 +79,7 @@ public class ResourceAwareScheduler implements IScheduler {
     public void prepare(Map<String, Object> conf, StormMetricsRegistry metricsRegistry) {
         this.conf = conf;
         schedulingTimeoutMeter = metricsRegistry.registerMeter("nimbus:num-scheduling-timeouts");
+        internalErrorMeter = metricsRegistry.registerMeter("nimbus:scheduler-internal-errors");
         schedulingPriorityStrategy = ReflectionUtils.newInstance(
             (String) conf.get(DaemonConfig.RESOURCE_AWARE_SCHEDULER_PRIORITY_STRATEGY));
         configLoader = ConfigLoaderFactoryService.createConfigLoader(conf);
@@ -235,6 +237,7 @@ public class ResourceAwareScheduler implements IScheduler {
                     }
                 }
             } catch (Exception ex) {
+                internalErrorMeter.mark();
                 markFailedTopology(topologySubmitter, cluster, td,
                         "Internal Error - Exception thrown when scheduling. Please check logs for details", ex);
                 return;