You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@dolphinscheduler.apache.org by ch...@apache.org on 2022/07/12 03:44:40 UTC
[dolphinscheduler] branch dev updated: [Feature][Metrics] Add resource download related metrics for workers (#10749)
This is an automated email from the ASF dual-hosted git repository.
chufenggao pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/dolphinscheduler.git
The following commit(s) were added to refs/heads/dev by this push:
new 2f7281c2d2 [Feature][Metrics] Add resource download related metrics for workers (#10749)
2f7281c2d2 is described below
commit 2f7281c2d2d3ceb60bae166932baa04d7703bf26
Author: Eric Gao <er...@gmail.com>
AuthorDate: Tue Jul 12 11:44:34 2022 +0800
[Feature][Metrics] Add resource download related metrics for workers (#10749)
* [Feature][Metrics] Add resource download related metrics for workers (#9324)
* [Feature][Metrics] Fix bugs and add grafana demos for worker resource download metrics (#9324)
* [Feature][Metrics] Add docs to resource related metrics (#9324)
* [Feature][Metrics] Use tags to indicate status in metrics (#9324)
* [Feature][Metrics] Fix demos, docs and remove redundant code (#9324)
* [Feature][Metrics] Remove .pnpm-debug.log (#9324)
* [Feature][Metrics] Fix style check (#9324)
* [Feature][Metrics] Replace KB with bytes for the unit of resource file size in metrics (#9324)
* [Feature][Metrics] Make code neat (#9324)
---
docs/docs/en/guide/metrics/metrics.md | 5 +-
docs/docs/zh/guide/metrics/metrics.md | 3 +
.../resources/grafana/DolphinSchedulerWorker.json | 408 ++++++++++++++++++++-
.../server/worker/metrics/WorkerServerMetrics.java | 49 ++-
.../server/worker/runner/TaskExecuteThread.java | 9 +
5 files changed, 460 insertions(+), 14 deletions(-)
diff --git a/docs/docs/en/guide/metrics/metrics.md b/docs/docs/en/guide/metrics/metrics.md
index 2ba4f639d6..9ecdc3619f 100644
--- a/docs/docs/en/guide/metrics/metrics.md
+++ b/docs/docs/en/guide/metrics/metrics.md
@@ -74,7 +74,7 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua
- ds.task.execution.count.by.type: (counter) the number of task executions grouped by tag `task_type`
- ds.task.running: (gauge) the number of running tasks
- ds.task.prepared: (gauge) the number of tasks prepared for task queue
-- ds.task.execution.count: (histogram) the number of executed tasks
+- ds.task.execution.count: (counter) the number of executed tasks
- ds.task.execution.duration: (histogram) duration of task executions
@@ -103,6 +103,9 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua
- ds.worker.overload.count: (counter) the number of times the worker overloaded
- ds.worker.full.submit.queue.count: (counter) the number of times the worker's submit queue being full
+- ds.worker.resource.download.count: (counter) the number of resource file download attempts on workers, sliced by tag `status` (`success` or `fail`)
+- ds.worker.resource.download.duration: (histogram) the time taken by successful resource file downloads on workers
+- ds.worker.resource.download.size: (histogram) the sizes of successfully downloaded resource files on workers, in bytes
### Api Server Metrics
diff --git a/docs/docs/zh/guide/metrics/metrics.md b/docs/docs/zh/guide/metrics/metrics.md
index 9b3805c613..3116f5445b 100644
--- a/docs/docs/zh/guide/metrics/metrics.md
+++ b/docs/docs/zh/guide/metrics/metrics.md
@@ -104,6 +104,9 @@ metrics exporter端口`server.port`是在application.yaml里定义的: master: `
- ds.worker.overload.count: (counter) worker过载次数
- ds.worker.full.submit.queue.count: (counter) worker提交队列全满次数
+- ds.worker.resource.download.count: (counter) worker下载资源文件的次数,可由`status`标签切分
+- ds.worker.resource.download.duration: (histogram) worker下载资源文件时花费的时间分布
+- ds.worker.resource.download.size: (histogram) worker下载资源文件大小的分布(bytes)
### Api Server指标
diff --git a/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json b/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json
index 01cd09baad..c13e030259 100644
--- a/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json
+++ b/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json
@@ -764,13 +764,398 @@
"align": false
}
},
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 33
+ },
+ "id": 47,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "expr": "sum(increase(ds_worker_resource_download_count_total{}[5m]))",
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "expr": "increase(ds_worker_resource_download_count_total{status=\"success\"}[5m])",
+ "hide": false,
+ "refId": "B"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "expr": "increase(ds_worker_resource_download_count_total{status=\"fail\"}[5m])",
+ "hide": false,
+ "refId": "C"
+ }
+ ],
+ "title": "Worker Resource Download Count/5m",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 41
+ },
+ "id": 44,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "expr": "increase(ds_worker_resource_download_duration_seconds{}[5m])",
+ "refId": "A"
+ }
+ ],
+ "title": "Worker Resource Download Time/5m",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 41
+ },
+ "id": 45,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom"
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "expr": "increase(ds_worker_resource_download_size_bytes[5m])",
+ "refId": "A"
+ }
+ ],
+ "title": "Worker Resource Download Size/5m",
+ "type": "timeseries"
+ },
+ {
+ "cards": {},
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateOranges",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 49
+ },
+ "heatmap": {},
+ "hideZeroBuckets": false,
+ "highlightCards": true,
+ "id": 46,
+ "legend": {
+ "show": false
+ },
+ "reverseYBuckets": false,
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "expr": "histogram_quantile(0.75, sum(increase(ds_worker_resource_download_size_bytes_bucket[5m])) by (le))",
+ "refId": "A"
+ }
+ ],
+ "title": "Worker Resource Download Size Distribution/5m",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "yAxis": {
+ "format": "short",
+ "logBase": 1,
+ "show": true
+ },
+ "yBucketBound": "auto"
+ },
+ {
+ "cards": {},
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateOranges",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 49
+ },
+ "heatmap": {},
+ "hideZeroBuckets": false,
+ "highlightCards": true,
+ "id": 48,
+ "legend": {
+ "show": false
+ },
+ "pluginVersion": "8.5.3",
+ "reverseYBuckets": false,
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "PBFA97CFB590B2093"
+ },
+ "expr": "histogram_quantile(0.95, sum(rate(ds_worker_resource_download_duration_seconds_bucket[5m])) by (le))",
+ "refId": "A"
+ }
+ ],
+ "title": "Worker Resource Download Time Distribution/5m",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "yAxis": {
+ "format": "short",
+ "logBase": 1,
+ "show": true
+ },
+ "yBucketBound": "auto"
+ },
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
- "y": 33
+ "y": 57
},
"id": 26,
"panels": [],
@@ -836,7 +1221,7 @@
"h": 8,
"w": 8,
"x": 0,
- "y": 34
+ "y": 58
},
"id": 14,
"options": {
@@ -925,7 +1310,7 @@
"h": 8,
"w": 9,
"x": 8,
- "y": 34
+ "y": 58
},
"id": 20,
"options": {
@@ -1014,7 +1399,7 @@
"h": 8,
"w": 7,
"x": 17,
- "y": 34
+ "y": 58
},
"id": 22,
"options": {
@@ -1103,7 +1488,7 @@
"h": 8,
"w": 8,
"x": 0,
- "y": 42
+ "y": 66
},
"id": 32,
"options": {
@@ -1192,7 +1577,7 @@
"h": 8,
"w": 9,
"x": 8,
- "y": 42
+ "y": 66
},
"id": 34,
"options": {
@@ -1281,7 +1666,7 @@
"h": 8,
"w": 7,
"x": 17,
- "y": 42
+ "y": 66
},
"id": 36,
"options": {
@@ -1370,7 +1755,7 @@
"h": 8,
"w": 8,
"x": 0,
- "y": 50
+ "y": 74
},
"id": 38,
"options": {
@@ -1459,7 +1844,7 @@
"h": 8,
"w": 9,
"x": 8,
- "y": 50
+ "y": 74
},
"id": 40,
"options": {
@@ -1548,7 +1933,7 @@
"h": 8,
"w": 7,
"x": 17,
- "y": 50
+ "y": 74
},
"id": 42,
"options": {
@@ -1579,6 +1964,7 @@
"type": "timeseries"
}
],
+ "refresh": "30s",
"schemaVersion": 36,
"style": "dark",
"tags": [],
@@ -1593,6 +1979,6 @@
"timezone": "",
"title": "Worker",
"uid": "6wXtd3r7k",
- "version": 2,
+ "version": 11,
"weekStart": ""
}
\ No newline at end of file
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java
index 6c68cc00ec..c0ae07269b 100644
--- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java
+++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java
@@ -17,11 +17,14 @@
package org.apache.dolphinscheduler.server.worker.metrics;
+import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import io.micrometer.core.instrument.Counter;
+import io.micrometer.core.instrument.DistributionSummary;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.Metrics;
+import io.micrometer.core.instrument.Timer;
import lombok.experimental.UtilityClass;
@UtilityClass
@@ -37,6 +40,33 @@ public class WorkerServerMetrics {
.description("full worker submit queues count")
.register(Metrics.globalRegistry);
+ private static final Counter WORKER_RESOURCE_DOWNLOAD_SUCCESS_COUNTER =
+ Counter.builder("ds.worker.resource.download.count")
+ .tag("status", "success")
+ .description("worker resource download success count")
+ .register(Metrics.globalRegistry);
+
+ private static final Counter WORKER_RESOURCE_DOWNLOAD_FAILURE_COUNTER =
+ Counter.builder("ds.worker.resource.download.count")
+ .tag("status", "fail")
+ .description("worker resource download failure count")
+ .register(Metrics.globalRegistry);
+
+ private static final Timer WORKER_RESOURCE_DOWNLOAD_DURATION_TIMER =
+ Timer.builder("ds.worker.resource.download.duration")
+ .publishPercentiles(0.5, 0.75, 0.95, 0.99)
+ .publishPercentileHistogram()
+ .description("time cost of resource download on workers")
+ .register(Metrics.globalRegistry);
+
+ private static final DistributionSummary WORKER_RESOURCE_DOWNLOAD_SIZE_DISTRIBUTION =
+ DistributionSummary.builder("ds.worker.resource.download.size")
+ .baseUnit("bytes")
+ .publishPercentiles(0.5, 0.75, 0.95, 0.99)
+ .publishPercentileHistogram()
+ .description("size of downloaded resource files on worker")
+ .register(Metrics.globalRegistry);
+
public static void incWorkerOverloadCount() {
WORKER_OVERLOAD_COUNTER.increment();
}
@@ -45,11 +75,26 @@ public class WorkerServerMetrics {
WORKER_SUBMIT_QUEUE_IS_FULL_COUNTER.increment();
}
- public static void registerWorkerRunningTaskGauge(Supplier<Number> supplier) {
+ public static void incWorkerResourceDownloadSuccessCount() {
+ WORKER_RESOURCE_DOWNLOAD_SUCCESS_COUNTER.increment();
+ }
+
+ public static void incWorkerResourceDownloadFailureCount() {
+ WORKER_RESOURCE_DOWNLOAD_FAILURE_COUNTER.increment();
+ }
+
+ public static void recordWorkerResourceDownloadTime(final long milliseconds) {
+ WORKER_RESOURCE_DOWNLOAD_DURATION_TIMER.record(milliseconds, TimeUnit.MILLISECONDS);
+ }
+
+ public static void recordWorkerResourceDownloadSize(final long size) {
+ WORKER_RESOURCE_DOWNLOAD_SIZE_DISTRIBUTION.record(size);
+ }
+
+ public static void registerWorkerRunningTaskGauge(final Supplier<Number> supplier) {
Gauge.builder("ds.task.running", supplier)
.description("number of running tasks on workers")
.register(Metrics.globalRegistry);
-
}
}
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java
index 2aa34a21fd..7e03f83844 100644
--- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java
+++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java
@@ -36,6 +36,7 @@ import org.apache.dolphinscheduler.plugin.task.api.enums.ExecutionStatus;
import org.apache.dolphinscheduler.plugin.task.api.model.Property;
import org.apache.dolphinscheduler.plugin.task.api.model.TaskAlertInfo;
import org.apache.dolphinscheduler.server.utils.ProcessUtils;
+import org.apache.dolphinscheduler.server.worker.metrics.WorkerServerMetrics;
import org.apache.dolphinscheduler.server.worker.processor.TaskCallbackService;
import org.apache.dolphinscheduler.service.alert.AlertClientService;
import org.apache.dolphinscheduler.service.exceptions.ServiceException;
@@ -46,7 +47,9 @@ import org.apache.commons.lang3.tuple.Pair;
import java.io.File;
import java.io.IOException;
+import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
@@ -278,8 +281,14 @@ public class TaskExecuteThread implements Runnable, Delayed {
String tenantCode = fileDownload.getRight();
String resHdfsPath = storageOperate.getResourceFileName(tenantCode, fullName);
logger.info("get resource file from hdfs :{}", resHdfsPath);
+ long resourceDownloadStartTime = System.currentTimeMillis();
storageOperate.download(tenantCode, resHdfsPath, execLocalPath + File.separator + fullName, false, true);
+ WorkerServerMetrics.recordWorkerResourceDownloadTime(System.currentTimeMillis() - resourceDownloadStartTime);
+ WorkerServerMetrics.recordWorkerResourceDownloadSize(
+ Files.size(Paths.get(execLocalPath, fullName)));
+ WorkerServerMetrics.incWorkerResourceDownloadSuccessCount();
} catch (Exception e) {
+ WorkerServerMetrics.incWorkerResourceDownloadFailureCount();
logger.error(e.getMessage(), e);
throw new ServiceException(e.getMessage());
}