Merge pull request apache#3246 from agresch/agresch_storm_3618

STORM-3618 add meter to track scheduling errors
cheng1234 · Apr 9, 2020 · 3eca57d · 3eca57d
2 parents 00f48d6 + 15d5872
commit 3eca57d
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 1 deletion.
diff --git a/docs/ClusterMetrics.md b/docs/ClusterMetrics.md
@@ -58,6 +58,7 @@ These are metrics that are specific to a nimbus instance.  In many instances onl
 |-------------|------|-------------|
 | nimbus:files-upload-duration-ms | timer | Time it takes to upload a file from start to finish (Not Blobs, but this may change) |
 | nimbus:longest-scheduling-time-ms | gauge | Longest time taken so far by a scheduling run, including the run currently in progress. Including the current run is intended to make it possible to detect scheduling that is stuck. |
+| nimbus:mkAssignments-Errors | meter | tracks exceptions from mkAssignments |
 | nimbus:num-activate-calls | meter | calls to the activate thrift method. |
 | nimbus:num-added-executors-per-scheduling | histogram | number of executors added after a scheduling run. |
 | nimbus:num-added-slots-per-scheduling | histogram |  number of slots added after a scheduling run. |
@@ -102,7 +103,7 @@ These are metrics that are specific to a nimbus instance.  In many instances onl
 | nimbus:num-uploadChunk-calls | meter | calls to uploadChunk thrift method. |
 | nimbus:num-uploadNewCredentials-calls | meter | calls to uploadNewCredentials thrift method. |
 | nimbus:process-worker-metric-calls | meter | calls to processWorkerMetrics thrift method. |
-| nimbus:mkAssignments-Errors | meter | tracks exceptions from mkAssignments |
+| nimbus:scheduler-internal-errors | meter | tracks exceptions thrown by the ResourceAwareScheduler while scheduling a topology (reported to the user as an internal error) |
 | nimbus:topology-scheduling-duration-ms | timer | time it takes to do a scheduling run. |
 | nimbus:total-available-memory-non-negative | gauge | available memory on the cluster MB |
 | nimbuses:uptime-secs | histogram | uptime of nimbuses |

diff --git a/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java b/storm-server/src/main/java/org/apache/storm/scheduler/resource/ResourceAwareScheduler.java
@@ -58,6 +58,7 @@ public class ResourceAwareScheduler implements IScheduler {
     private int schedulingTimeoutSeconds;
     private ExecutorService backgroundScheduling;
     private Meter schedulingTimeoutMeter;
+    private Meter internalErrorMeter;
 
     private static void markFailedTopology(User u, Cluster c, TopologyDetails td, String message) {
         markFailedTopology(u, c, td, message, null);
@@ -78,6 +79,7 @@ private static void markFailedTopology(User u, Cluster c, TopologyDetails td, St
     public void prepare(Map<String, Object> conf, StormMetricsRegistry metricsRegistry) {
         this.conf = conf;
         schedulingTimeoutMeter = metricsRegistry.registerMeter("nimbus:num-scheduling-timeouts");
+        internalErrorMeter = metricsRegistry.registerMeter("nimbus:scheduler-internal-errors");
         schedulingPriorityStrategy = ReflectionUtils.newInstance(
             (String) conf.get(DaemonConfig.RESOURCE_AWARE_SCHEDULER_PRIORITY_STRATEGY));
         configLoader = ConfigLoaderFactoryService.createConfigLoader(conf);
@@ -235,6 +237,7 @@ private void scheduleTopology(TopologyDetails td, Cluster cluster, final User to
                     }
                 }
             } catch (Exception ex) {
+                internalErrorMeter.mark();
                 markFailedTopology(topologySubmitter, cluster, td,
                         "Internal Error - Exception thrown when scheduling. Please check logs for details", ex);
                 return;