You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@dolphinscheduler.apache.org by ch...@apache.org on 2022/07/12 03:44:40 UTC

[dolphinscheduler] branch dev updated: [Feature][Metrics] Add resource download related metrics for workers (#10749)

This is an automated email from the ASF dual-hosted git repository.

chufenggao pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/dolphinscheduler.git


The following commit(s) were added to refs/heads/dev by this push:
     new 2f7281c2d2 [Feature][Metrics] Add resource download related metrics for workers (#10749)
2f7281c2d2 is described below

commit 2f7281c2d2d3ceb60bae166932baa04d7703bf26
Author: Eric Gao <er...@gmail.com>
AuthorDate: Tue Jul 12 11:44:34 2022 +0800

    [Feature][Metrics] Add resource download related metrics for workers (#10749)
    
    * [Feature][Metrics] Add resource download related metrics for workers (#9324)
    
    * [Feature][Metrics] Fix bugs and add grafana demos for worker resource download metrics (#9324)
    
    * [Feature][Metrics] Add docs to resource related metrics (#9324)
    
    * [Feature][Metrics] Use tags to indicate status in metrics (#9324)
    
    * [Feature][Metrics] Fix demos, docs and remove redundant code (#9324)
    
    * [Feature][Metrics] Remove .pnpm-debug.log (#9324)
    
    * [Feature][Metrics] Fix style check (#9324)
    
    * [Feature][Metrics] Replace KB with bytes for the unit of resource file size in metrics (#9324)
    
    * [Feature][Metrics] Make code neat (#9324)
---
 docs/docs/en/guide/metrics/metrics.md              |   5 +-
 docs/docs/zh/guide/metrics/metrics.md              |   3 +
 .../resources/grafana/DolphinSchedulerWorker.json  | 408 ++++++++++++++++++++-
 .../server/worker/metrics/WorkerServerMetrics.java |  49 ++-
 .../server/worker/runner/TaskExecuteThread.java    |   9 +
 5 files changed, 460 insertions(+), 14 deletions(-)

diff --git a/docs/docs/en/guide/metrics/metrics.md b/docs/docs/en/guide/metrics/metrics.md
index 2ba4f639d6..9ecdc3619f 100644
--- a/docs/docs/en/guide/metrics/metrics.md
+++ b/docs/docs/en/guide/metrics/metrics.md
@@ -74,7 +74,7 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua
 - ds.task.execution.count.by.type: (counter) the number of task executions grouped by tag `task_type`
 - ds.task.running: (gauge) the number of running tasks 
 - ds.task.prepared: (gauge) the number of tasks prepared for task queue 
-- ds.task.execution.count: (histogram) the number of executed tasks  
+- ds.task.execution.count: (counter) the number of executed tasks  
 - ds.task.execution.duration: (histogram) duration of task executions
 
 
@@ -103,6 +103,9 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua
 
 - ds.worker.overload.count: (counter) the number of times the worker overloaded
 - ds.worker.full.submit.queue.count: (counter) the number of times the worker's submit queue being full
+- ds.worker.resource.download.count: (counter) the number of downloaded resource files on workers, sliced by tag `status`
+- ds.worker.resource.download.duration: (histogram) the time cost of resource download on workers
+- ds.worker.resource.download.size: (histogram) the sizes of downloaded resource files on workers (bytes)
 
 ### Api Server Metrics
 
diff --git a/docs/docs/zh/guide/metrics/metrics.md b/docs/docs/zh/guide/metrics/metrics.md
index 9b3805c613..3116f5445b 100644
--- a/docs/docs/zh/guide/metrics/metrics.md
+++ b/docs/docs/zh/guide/metrics/metrics.md
@@ -104,6 +104,9 @@ metrics exporter端口`server.port`是在application.yaml里定义的: master: `
 
 - ds.worker.overload.count: (counter) worker过载次数
 - ds.worker.full.submit.queue.count: (counter) worker提交队列全满次数
+- ds.worker.resource.download.count: (counter) worker下载资源文件的次数,可由`status`标签切分
+- ds.worker.resource.download.duration: (histogram) worker下载资源文件时花费的时间分布
+- ds.worker.resource.download.size: (histogram) worker下载资源文件大小的分布(bytes)
 
 ### Api Server指标
 
diff --git a/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json b/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json
index 01cd09baad..c13e030259 100644
--- a/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json
+++ b/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json
@@ -764,13 +764,398 @@
         "align": false
       }
     },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 33
+      },
+      "id": 47,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "expr": "sum(increase(ds_worker_resource_download_count_total{}[5m]))",
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "expr": "increase(ds_worker_resource_download_count_total{status=\"success\"}[5m])",
+          "hide": false,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "expr": "increase(ds_worker_resource_download_count_total{status=\"fail\"}[5m])",
+          "hide": false,
+          "refId": "C"
+        }
+      ],
+      "title": "Worker Resource Download Count/5m",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 41
+      },
+      "id": 44,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "expr": "increase(ds_worker_resource_download_duration_seconds{}[5m])",
+          "refId": "A"
+        }
+      ],
+      "title": "Worker Resource Download Time/5m",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 41
+      },
+      "id": 45,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "expr": "increase(ds_worker_resource_download_size_bytes[5m])",
+          "refId": "A"
+        }
+      ],
+      "title": "Worker Resource Download Size/5m",
+      "type": "timeseries"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#b4ff00",
+        "colorScale": "sqrt",
+        "colorScheme": "interpolateOranges",
+        "exponent": 0.5,
+        "mode": "spectrum"
+      },
+      "dataFormat": "timeseries",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 49
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 46,
+      "legend": {
+        "show": false
+      },
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "expr": "histogram_quantile(0.75, sum(increase(ds_worker_resource_download_size_bytes_bucket[5m])) by (le))",
+          "refId": "A"
+        }
+      ],
+      "title": "Worker Resource Download Size Distribution/5m",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "format": "short",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#b4ff00",
+        "colorScale": "sqrt",
+        "colorScheme": "interpolateOranges",
+        "exponent": 0.5,
+        "mode": "spectrum"
+      },
+      "dataFormat": "timeseries",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 49
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 48,
+      "legend": {
+        "show": false
+      },
+      "pluginVersion": "8.5.3",
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "expr": "histogram_quantile(0.95, sum(rate(ds_worker_resource_download_duration_seconds_bucket[5m])) by (le))",
+          "refId": "A"
+        }
+      ],
+      "title": "Worker Resource Download Time Distribution/5m",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "format": "short",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    },
     {
       "collapsed": false,
       "gridPos": {
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 33
+        "y": 57
       },
       "id": 26,
       "panels": [],
@@ -836,7 +1221,7 @@
         "h": 8,
         "w": 8,
         "x": 0,
-        "y": 34
+        "y": 58
       },
       "id": 14,
       "options": {
@@ -925,7 +1310,7 @@
         "h": 8,
         "w": 9,
         "x": 8,
-        "y": 34
+        "y": 58
       },
       "id": 20,
       "options": {
@@ -1014,7 +1399,7 @@
         "h": 8,
         "w": 7,
         "x": 17,
-        "y": 34
+        "y": 58
       },
       "id": 22,
       "options": {
@@ -1103,7 +1488,7 @@
         "h": 8,
         "w": 8,
         "x": 0,
-        "y": 42
+        "y": 66
       },
       "id": 32,
       "options": {
@@ -1192,7 +1577,7 @@
         "h": 8,
         "w": 9,
         "x": 8,
-        "y": 42
+        "y": 66
       },
       "id": 34,
       "options": {
@@ -1281,7 +1666,7 @@
         "h": 8,
         "w": 7,
         "x": 17,
-        "y": 42
+        "y": 66
       },
       "id": 36,
       "options": {
@@ -1370,7 +1755,7 @@
         "h": 8,
         "w": 8,
         "x": 0,
-        "y": 50
+        "y": 74
       },
       "id": 38,
       "options": {
@@ -1459,7 +1844,7 @@
         "h": 8,
         "w": 9,
         "x": 8,
-        "y": 50
+        "y": 74
       },
       "id": 40,
       "options": {
@@ -1548,7 +1933,7 @@
         "h": 8,
         "w": 7,
         "x": 17,
-        "y": 50
+        "y": 74
       },
       "id": 42,
       "options": {
@@ -1579,6 +1964,7 @@
       "type": "timeseries"
     }
   ],
+  "refresh": "30s",
   "schemaVersion": 36,
   "style": "dark",
   "tags": [],
@@ -1593,6 +1979,6 @@
   "timezone": "",
   "title": "Worker",
   "uid": "6wXtd3r7k",
-  "version": 2,
+  "version": 11,
   "weekStart": ""
 }
\ No newline at end of file
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java
index 6c68cc00ec..c0ae07269b 100644
--- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java
+++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java
@@ -17,11 +17,14 @@
 
 package org.apache.dolphinscheduler.server.worker.metrics;
 
+import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
 
 import io.micrometer.core.instrument.Counter;
+import io.micrometer.core.instrument.DistributionSummary;
 import io.micrometer.core.instrument.Gauge;
 import io.micrometer.core.instrument.Metrics;
+import io.micrometer.core.instrument.Timer;
 import lombok.experimental.UtilityClass;
 
 @UtilityClass
@@ -37,6 +40,33 @@ public class WorkerServerMetrics {
             .description("full worker submit queues count")
             .register(Metrics.globalRegistry);
 
+    private static final Counter WORKER_RESOURCE_DOWNLOAD_SUCCESS_COUNTER =
+            Counter.builder("ds.worker.resource.download.count")
+                    .tag("status", "success")
+                    .description("worker resource download success count")
+                    .register(Metrics.globalRegistry);
+
+    private static final Counter WORKER_RESOURCE_DOWNLOAD_FAILURE_COUNTER =
+            Counter.builder("ds.worker.resource.download.count")
+                    .tag("status", "fail")
+                    .description("worker resource download failure count")
+                    .register(Metrics.globalRegistry);
+
+    private static final Timer WORKER_RESOURCE_DOWNLOAD_DURATION_TIMER =
+            Timer.builder("ds.worker.resource.download.duration")
+                    .publishPercentiles(0.5, 0.75, 0.95, 0.99)
+                    .publishPercentileHistogram()
+                    .description("time cost of resource download on workers")
+                    .register(Metrics.globalRegistry);
+
+    private static final DistributionSummary WORKER_RESOURCE_DOWNLOAD_SIZE_DISTRIBUTION =
+            DistributionSummary.builder("ds.worker.resource.download.size")
+            .baseUnit("bytes")
+            .publishPercentiles(0.5, 0.75, 0.95, 0.99)
+            .publishPercentileHistogram()
+            .description("size of downloaded resource files on worker")
+            .register(Metrics.globalRegistry);
+
     public static void incWorkerOverloadCount() {
         WORKER_OVERLOAD_COUNTER.increment();
     }
@@ -45,11 +75,26 @@ public class WorkerServerMetrics {
         WORKER_SUBMIT_QUEUE_IS_FULL_COUNTER.increment();
     }
 
-    public static void registerWorkerRunningTaskGauge(Supplier<Number> supplier) {
+    public static void incWorkerResourceDownloadSuccessCount() {
+        WORKER_RESOURCE_DOWNLOAD_SUCCESS_COUNTER.increment();
+    }
+
+    public static void incWorkerResourceDownloadFailureCount() {
+        WORKER_RESOURCE_DOWNLOAD_FAILURE_COUNTER.increment();
+    }
+
+    public static void recordWorkerResourceDownloadTime(final long milliseconds) {
+        WORKER_RESOURCE_DOWNLOAD_DURATION_TIMER.record(milliseconds, TimeUnit.MILLISECONDS);
+    }
+
+    public static void recordWorkerResourceDownloadSize(final long size) {
+        WORKER_RESOURCE_DOWNLOAD_SIZE_DISTRIBUTION.record(size);
+    }
+
+    public static void registerWorkerRunningTaskGauge(final Supplier<Number> supplier) {
         Gauge.builder("ds.task.running", supplier)
             .description("number of running tasks on workers")
             .register(Metrics.globalRegistry);
-
     }
 
 }
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java
index 2aa34a21fd..7e03f83844 100644
--- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java
+++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java
@@ -36,6 +36,7 @@ import org.apache.dolphinscheduler.plugin.task.api.enums.ExecutionStatus;
 import org.apache.dolphinscheduler.plugin.task.api.model.Property;
 import org.apache.dolphinscheduler.plugin.task.api.model.TaskAlertInfo;
 import org.apache.dolphinscheduler.server.utils.ProcessUtils;
+import org.apache.dolphinscheduler.server.worker.metrics.WorkerServerMetrics;
 import org.apache.dolphinscheduler.server.worker.processor.TaskCallbackService;
 import org.apache.dolphinscheduler.service.alert.AlertClientService;
 import org.apache.dolphinscheduler.service.exceptions.ServiceException;
@@ -46,7 +47,9 @@ import org.apache.commons.lang3.tuple.Pair;
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
 import java.nio.file.NoSuchFileException;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Date;
@@ -278,8 +281,14 @@ public class TaskExecuteThread implements Runnable, Delayed {
                 String tenantCode = fileDownload.getRight();
                 String resHdfsPath = storageOperate.getResourceFileName(tenantCode, fullName);
                 logger.info("get resource file from hdfs :{}", resHdfsPath);
+                long resourceDownloadStartTime = System.currentTimeMillis();
                 storageOperate.download(tenantCode, resHdfsPath, execLocalPath + File.separator + fullName, false, true);
+                WorkerServerMetrics.recordWorkerResourceDownloadTime(System.currentTimeMillis() - resourceDownloadStartTime);
+                WorkerServerMetrics.recordWorkerResourceDownloadSize(
+                        Files.size(Paths.get(execLocalPath, fullName)));
+                WorkerServerMetrics.incWorkerResourceDownloadSuccessCount();
             } catch (Exception e) {
+                WorkerServerMetrics.incWorkerResourceDownloadFailureCount();
                 logger.error(e.getMessage(), e);
                 throw new ServiceException(e.getMessage());
             }