You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@celeborn.apache.org by et...@apache.org on 2024/01/09 13:55:33 UTC
(incubator-celeborn) branch branch-0.4 updated: [CELEBORN-1214] Introduce WriteDataHardSplitCount metric to record HARD_SPLIT partitions of PushData and PushMergedData
This is an automated email from the ASF dual-hosted git repository.
ethanfeng pushed a commit to branch branch-0.4
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git
The following commit(s) were added to refs/heads/branch-0.4 by this push:
new 3e3f6cb16 [CELEBORN-1214] Introduce WriteDataHardSplitCount metric to record HARD_SPLIT partitions of PushData and PushMergedData
3e3f6cb16 is described below
commit 3e3f6cb16fd7968cfcc5118c3435f19c1e9da908
Author: SteNicholas <pr...@163.com>
AuthorDate: Tue Jan 9 21:54:53 2024 +0800
[CELEBORN-1214] Introduce WriteDataHardSplitCount metric to record HARD_SPLIT partitions of PushData and PushMergedData
### What changes were proposed in this pull request?
Introduce `WriteDataHardSplitCount` metric to record `HARD_SPLIT` partitions of PushData and PushMergedData.
### Why are the changes needed?
As `PushDataHandler#handlePushData` and `PushDataHandler#handlePushMergedData` log at the DEBUG level, the `WriteDataHardSplitCount` metric should be introduced to record HARD_SPLIT partitions of PushData and PushMergedData for `PushDataHandler`.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
[Celeborn Dashboard](https://stenicholas.grafana.net/d/U_qgru_7z/celeborn?orgId=1&refresh=5s)
Closes #2217 from SteNicholas/CELEBORN-1214.
Authored-by: SteNicholas <pr...@163.com>
Signed-off-by: mingji <fe...@alibaba-inc.com>
(cherry picked from commit 0cd1291f6cba5c6b56c2ad337353a42f09c1f895)
Signed-off-by: mingji <fe...@alibaba-inc.com>
---
METRICS.md | 1 +
assets/grafana/celeborn-dashboard.json | 157 ++++++++++++++++-----
docs/monitoring.md | 1 +
.../service/deploy/worker/PushDataHandler.scala | 4 +
.../service/deploy/worker/WorkerSource.scala | 2 +
5 files changed, 132 insertions(+), 33 deletions(-)
diff --git a/METRICS.md b/METRICS.md
index 518d83189..2da833924 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -98,6 +98,7 @@ Here is an example of Grafana dashboard importing.
| FetchChunkFailCount | worker | The count of fetching chunk failed in current worker. |
| PrimaryPushDataTime | worker | PrimaryPushData means handle pushdata of primary partition location. |
| ReplicaPushDataTime | worker | ReplicaPushData means handle pushdata of replica partition location. |
+| WriteDataHardSplitCount | worker | The count of writing PushData or PushMergedData to HARD_SPLIT partition in current worker. |
| WriteDataFailCount | worker | The count of writing PushData or PushMergedData failed in current worker. |
| ReplicateDataFailCount | worker | The count of replicating PushData or PushMergedData failed in current worker. |
| ReplicateDataWriteFailCount | worker | The count of replicating PushData or PushMergedData failed caused by write failure in peer worker. |
diff --git a/assets/grafana/celeborn-dashboard.json b/assets/grafana/celeborn-dashboard.json
index 575c2aa15..3f0de33c9 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -1927,7 +1927,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 46
+ "y": 3
},
"id": 134,
"panels": [
@@ -1992,7 +1992,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 47
+ "y": 4
},
"id": 68,
"options": {
@@ -2082,7 +2082,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 47
+ "y": 4
},
"id": 70,
"options": {
@@ -2172,7 +2172,7 @@
"h": 9,
"w": 12,
"x": 0,
- "y": 56
+ "y": 13
},
"id": 72,
"options": {
@@ -2262,7 +2262,7 @@
"h": 9,
"w": 12,
"x": 12,
- "y": 56
+ "y": 13
},
"id": 74,
"options": {
@@ -2351,9 +2351,9 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 65
+ "y": 22
},
- "id": 76,
+ "id": 79,
"options": {
"legend": {
"calcs": [],
@@ -2373,13 +2373,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_WriteDataFailCount_Count",
+ "expr": "metrics_WriteDataHardSplitCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_WriteDataFailCount_Count",
+ "title": "metrics_WriteDataHardSplitCount_Count",
"type": "timeseries"
},
{
@@ -2442,9 +2442,9 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 65
+ "y": 22
},
- "id": 129,
+ "id": 76,
"options": {
"legend": {
"calcs": [],
@@ -2464,13 +2464,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataWriteFailCount_Count",
+ "expr": "metrics_WriteDataFailCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_ReplicateDataWriteFailCount_Count",
+ "title": "metrics_WriteDataFailCount_Count",
"type": "timeseries"
},
{
@@ -2533,7 +2533,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 73
+ "y": 30
},
"id": 128,
"options": {
@@ -2624,9 +2624,9 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 73
+ "y": 30
},
- "id": 131,
+ "id": 129,
"options": {
"legend": {
"calcs": [],
@@ -2646,13 +2646,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataTimeoutCount_Count",
+ "expr": "metrics_ReplicateDataWriteFailCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_ReplicateDataTimeoutCount_Count",
+ "title": "metrics_ReplicateDataWriteFailCount_Count",
"type": "timeseries"
},
{
@@ -2715,7 +2715,98 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 81
+ "y": 38
+ },
+ "id": 130,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+ "legendFormat": "${baseLegend}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 38
},
"id": 132,
"options": {
@@ -2805,10 +2896,10 @@
"gridPos": {
"h": 8,
"w": 12,
- "x": 12,
- "y": 81
+ "x": 0,
+ "y": 46
},
- "id": 130,
+ "id": 131,
"options": {
"legend": {
"calcs": [],
@@ -2828,13 +2919,13 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
- "expr": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+ "expr": "metrics_ReplicateDataTimeoutCount_Count",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
- "title": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+ "title": "metrics_ReplicateDataTimeoutCount_Count",
"type": "timeseries"
}
],
@@ -2847,7 +2938,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 89
+ "y": 54
},
"id": 12,
"panels": [
@@ -3494,7 +3585,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 122
+ "y": 55
},
"id": 10,
"panels": [
@@ -4048,7 +4139,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 148
+ "y": 56
},
"id": 8,
"panels": [
@@ -5070,7 +5161,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 197
+ "y": 57
},
"id": 50,
"panels": [
@@ -5626,7 +5717,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 222
+ "y": 58
},
"id": 157,
"panels": [
@@ -5919,7 +6010,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 239
+ "y": 59
},
"id": 137,
"panels": [
@@ -7310,7 +7401,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 310
+ "y": 60
},
"id": 110,
"panels": [
@@ -7506,7 +7597,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 319
+ "y": 61
},
"id": 123,
"panels": [
@@ -7984,7 +8075,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 344
+ "y": 62
},
"id": 172,
"panels": [
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 531b703e3..1ada46e2c 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -148,6 +148,7 @@ These metrics are exposed by Celeborn worker.
- The time for a worker to handle a pushData RPC sent from a celeborn client.
- ReplicaPushDataTime
- The time for a worker to handle a pushData RPC sent from a celeborn worker by replicating.
+ - WriteDataHardSplitCount
- WriteDataFailCount
- ReplicateDataFailCount
- ReplicateDataWriteFailCount
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala
index edd8bfe03..6705325e0 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala
@@ -210,6 +210,7 @@ class PushDataHandler(val workerSource: WorkerSource) extends BaseMessageHandler
logDebug(
s"Receive push data for committed hard split partition of (shuffle $shuffleKey, " +
s"map $mapId attempt $attemptId)")
+ workerSource.incCounter(WorkerSource.WRITE_DATA_HARD_SPLIT_COUNT)
callbackWithTimer.onSuccess(ByteBuffer.wrap(Array[Byte](StatusCode.HARD_SPLIT.getValue)))
}
} else {
@@ -474,6 +475,7 @@ class PushDataHandler(val workerSource: WorkerSource) extends BaseMessageHandler
} else {
logDebug(s"[Case1] Receive push merged data for committed hard split partition of " +
s"(shuffle $shuffleKey, map $mapId attempt $attemptId)")
+ workerSource.incCounter(WorkerSource.WRITE_DATA_HARD_SPLIT_COUNT)
callbackWithTimer.onSuccess(
ByteBuffer.wrap(Array[Byte](StatusCode.HARD_SPLIT.getValue)))
}
@@ -484,6 +486,7 @@ class PushDataHandler(val workerSource: WorkerSource) extends BaseMessageHandler
// after worker restart, some tasks still push data to this HARD_SPLIT partition.
logDebug(s"[Case2] Receive push merged data for committed hard split partition of " +
s"(shuffle $shuffleKey, map $mapId attempt $attemptId)")
+ workerSource.incCounter(WorkerSource.WRITE_DATA_HARD_SPLIT_COUNT)
callbackWithTimer.onSuccess(
ByteBuffer.wrap(Array[Byte](StatusCode.HARD_SPLIT.getValue)))
} else {
@@ -1196,6 +1199,7 @@ class PushDataHandler(val workerSource: WorkerSource) extends BaseMessageHandler
(fileWriter.getFileInfo.getFileLength < partitionSplitMaximumSize)) {
softSplit.set(true)
} else {
+ workerSource.incCounter(WorkerSource.WRITE_DATA_HARD_SPLIT_COUNT)
callback.onSuccess(ByteBuffer.wrap(Array[Byte](StatusCode.HARD_SPLIT.getValue)))
logTrace(
s"""
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index e65b83fa6..911db859e 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -28,6 +28,7 @@ class WorkerSource(conf: CelebornConf) extends AbstractSource(conf, MetricsSyste
// add counters
addCounter(OPEN_STREAM_FAIL_COUNT)
addCounter(FETCH_CHUNK_FAIL_COUNT)
+ addCounter(WRITE_DATA_HARD_SPLIT_COUNT)
addCounter(WRITE_DATA_FAIL_COUNT)
addCounter(REPLICATE_DATA_FAIL_COUNT)
addCounter(REPLICATE_DATA_WRITE_FAIL_COUNT)
@@ -84,6 +85,7 @@ object WorkerSource {
// push data
val PRIMARY_PUSH_DATA_TIME = "PrimaryPushDataTime"
val REPLICA_PUSH_DATA_TIME = "ReplicaPushDataTime"
+ val WRITE_DATA_HARD_SPLIT_COUNT = "WriteDataHardSplitCount"
val WRITE_DATA_FAIL_COUNT = "WriteDataFailCount"
val REPLICATE_DATA_FAIL_COUNT = "ReplicateDataFailCount"
val REPLICATE_DATA_WRITE_FAIL_COUNT = "ReplicateDataWriteFailCount"