You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@storm.apache.org by et...@apache.org on 2020/04/09 20:32:06 UTC
[storm] branch master updated: STORM-3618 add meter to track scheduling errors
This is an automated email from the ASF dual-hosted git repository.
ethanli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/storm.git
The following commit(s) were added to refs/heads/master by this push:
new 15d5872 STORM-3618 add meter to track scheduling errors
new 3eca57d Merge pull request #3246 from agresch/agresch_storm_3618
15d5872 is described below
commit 15d58729ef14c45c85d19faa5d409bb8ceae5006
Author: Aaron Gresch <ag...@yahoo-inc.com>
AuthorDate: Wed Apr 8 15:18:56 2020 -0500
STORM-3618 add meter to track scheduling errors
---
docs/ClusterMetrics.md | 3 ++-
.../org/apache/storm/scheduler/resource/ResourceAwareScheduler.java | 3 +++
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/docs/ClusterMetrics.md b/docs/ClusterMetrics.md
index 7760f51..c7e9b69 100644
--- a/docs/ClusterMetrics.md
+++ b/docs/ClusterMetrics.md
@@ -58,6 +58,7 @@ These are metrics that are specific to a nimbus instance. In many instances onl
|-------------|------|-------------|
| nimbus:files-upload-duration-ms | timer | Time it takes to upload a file from start to finish (Not Blobs, but this may change) |
| nimbus:longest-scheduling-time-ms | gauge | Longest time ever taken so far to schedule. This includes the current scheduling run, which is intended to detect if scheduling is stuck for some reason. |
+| nimbus:mkAssignments-Errors | meter | tracks exceptions from mkAssignments |
| nimbus:num-activate-calls | meter | calls to the activate thrift method. |
| nimbus:num-added-executors-per-scheduling | histogram | number of executors added after a scheduling run. |
| nimbus:num-added-slots-per-scheduling | histogram | number of slots added after a scheduling run. |
@@ -102,7 +103,7 @@ These are metrics that are specific to a nimbus instance. In many instances onl
| nimbus:num-uploadChunk-calls | meter | calls to uploadChunk thrift method. |
| nimbus:num-uploadNewCredentials-calls | meter | calls to uploadNewCredentials thrift method. |
| nimbus:process-worker-metric-calls | meter | calls to processWorkerMetrics thrift method. |
-| nimbus:mkAssignments-Errors | meter | tracks exceptions from mkAssignments |
+| nimbus:scheduler-internal-errors | meter | tracks internal scheduling errors |
| nimbus:topology-scheduling-duration-ms | timer | time it takes to do a scheduling run. |
| nimbus:total-available-memory-non-negative | gauge | available memory on the cluster MB |
| nimbuses:uptime-secs | histogram | uptime of nimbuses |
diff --git a/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java b/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
index f7e34ec..a26246a 100644
--- a/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
+++ b/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
@@ -58,6 +58,7 @@ public class ResourceAwareScheduler implements IScheduler {
private int schedulingTimeoutSeconds;
private ExecutorService backgroundScheduling;
private Meter schedulingTimeoutMeter;
+ private Meter internalErrorMeter;
private static void markFailedTopology(User u, Cluster c, TopologyDetails td, String message) {
markFailedTopology(u, c, td, message, null);
@@ -78,6 +79,7 @@ public class ResourceAwareScheduler implements IScheduler {
public void prepare(Map<String, Object> conf, StormMetricsRegistry metricsRegistry) {
this.conf = conf;
schedulingTimeoutMeter = metricsRegistry.registerMeter("nimbus:num-scheduling-timeouts");
+ internalErrorMeter = metricsRegistry.registerMeter("nimbus:scheduler-internal-errors");
schedulingPriorityStrategy = ReflectionUtils.newInstance(
(String) conf.get(DaemonConfig.RESOURCE_AWARE_SCHEDULER_PRIORITY_STRATEGY));
configLoader = ConfigLoaderFactoryService.createConfigLoader(conf);
@@ -235,6 +237,7 @@ public class ResourceAwareScheduler implements IScheduler {
}
}
} catch (Exception ex) {
+ internalErrorMeter.mark();
markFailedTopology(topologySubmitter, cluster, td,
"Internal Error - Exception thrown when scheduling. Please check logs for details", ex);
return;