You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@celeborn.apache.org by et...@apache.org on 2024/01/09 13:55:09 UTC

(incubator-celeborn) branch main updated: [CELEBORN-1214] Introduce WriteDataHardSplitCount metric to record HARD_SPLIT partitions of PushData and PushMergedData

This is an automated email from the ASF dual-hosted git repository.

ethanfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new 0cd1291f6 [CELEBORN-1214] Introduce WriteDataHardSplitCount metric to record HARD_SPLIT partitions of PushData and PushMergedData
0cd1291f6 is described below

commit 0cd1291f6cba5c6b56c2ad337353a42f09c1f895
Author: SteNicholas <pr...@163.com>
AuthorDate: Tue Jan 9 21:54:53 2024 +0800

    [CELEBORN-1214] Introduce WriteDataHardSplitCount metric to record HARD_SPLIT partitions of PushData and PushMergedData
    
    ### What changes were proposed in this pull request?
    
    Introduce `WriteDataHardSplitCount` metric to record `HARD_SPLIT` partitions of PushData and PushMergedData.
    
    ### Why are the changes needed?
    
    As the logging in `PushDataHandler#handlePushData` and `PushDataHandler#handlePushMergedData` uses the DEBUG level, the `WriteDataHardSplitCount` metric should be introduced to record HARD_SPLIT partitions of PushData and PushMergedData for `PushDataHandler`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    [Celeborn Dashboard](https://stenicholas.grafana.net/d/U_qgru_7z/celeborn?orgId=1&refresh=5s)
    
    Closes #2217 from SteNicholas/CELEBORN-1214.
    
    Authored-by: SteNicholas <pr...@163.com>
    Signed-off-by: mingji <fe...@alibaba-inc.com>
---
 METRICS.md                                         |   1 +
 assets/grafana/celeborn-dashboard.json             | 157 ++++++++++++++++-----
 docs/monitoring.md                                 |   1 +
 .../service/deploy/worker/PushDataHandler.scala    |   4 +
 .../service/deploy/worker/WorkerSource.scala       |   2 +
 5 files changed, 132 insertions(+), 33 deletions(-)

diff --git a/METRICS.md b/METRICS.md
index 518d83189..2da833924 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -98,6 +98,7 @@ Here is an example of Grafana dashboard importing.
 |          FetchChunkFailCount           |      worker       |                              The count of fetching chunk failed in current worker.                              |
 |          PrimaryPushDataTime           |      worker       |                      PrimaryPushData means handle pushdata of primary partition location.                       |
 |          ReplicaPushDataTime           |      worker       |                      ReplicaPushData means handle pushdata of replica partition location.                       |
+|        WriteDataHardSplitCount         |      worker       |           The count of writing PushData or PushMergedData to HARD_SPLIT partition in current worker.            |
 |           WriteDataFailCount           |      worker       |                    The count of writing PushData or PushMergedData failed in current worker.                    |
 |         ReplicateDataFailCount         |      worker       |                  The count of replicating PushData or PushMergedData failed in current worker.                  |
 |      ReplicateDataWriteFailCount       |      worker       |       The count of replicating PushData or PushMergedData failed caused by write failure in peer worker.        |
diff --git a/assets/grafana/celeborn-dashboard.json b/assets/grafana/celeborn-dashboard.json
index 575c2aa15..3f0de33c9 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -1927,7 +1927,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 46
+        "y": 3
       },
       "id": 134,
       "panels": [
@@ -1992,7 +1992,7 @@
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 47
+            "y": 4
           },
           "id": 68,
           "options": {
@@ -2082,7 +2082,7 @@
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 47
+            "y": 4
           },
           "id": 70,
           "options": {
@@ -2172,7 +2172,7 @@
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 56
+            "y": 13
           },
           "id": 72,
           "options": {
@@ -2262,7 +2262,7 @@
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 56
+            "y": 13
           },
           "id": 74,
           "options": {
@@ -2351,9 +2351,9 @@
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 65
+            "y": 22
           },
-          "id": 76,
+          "id": 79,
           "options": {
             "legend": {
               "calcs": [],
@@ -2373,13 +2373,13 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "editorMode": "code",
-              "expr": "metrics_WriteDataFailCount_Count",
+              "expr": "metrics_WriteDataHardSplitCount_Count",
               "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A"
             }
           ],
-          "title": "metrics_WriteDataFailCount_Count",
+          "title": "metrics_WriteDataHardSplitCount_Count",
           "type": "timeseries"
         },
         {
@@ -2442,9 +2442,9 @@
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 65
+            "y": 22
           },
-          "id": 129,
+          "id": 76,
           "options": {
             "legend": {
               "calcs": [],
@@ -2464,13 +2464,13 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "editorMode": "code",
-              "expr": "metrics_ReplicateDataWriteFailCount_Count",
+              "expr": "metrics_WriteDataFailCount_Count",
               "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A"
             }
           ],
-          "title": "metrics_ReplicateDataWriteFailCount_Count",
+          "title": "metrics_WriteDataFailCount_Count",
           "type": "timeseries"
         },
         {
@@ -2533,7 +2533,7 @@
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 73
+            "y": 30
           },
           "id": 128,
           "options": {
@@ -2624,9 +2624,9 @@
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 73
+            "y": 30
           },
-          "id": 131,
+          "id": 129,
           "options": {
             "legend": {
               "calcs": [],
@@ -2646,13 +2646,13 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "editorMode": "code",
-              "expr": "metrics_ReplicateDataTimeoutCount_Count",
+              "expr": "metrics_ReplicateDataWriteFailCount_Count",
               "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A"
             }
           ],
-          "title": "metrics_ReplicateDataTimeoutCount_Count",
+          "title": "metrics_ReplicateDataWriteFailCount_Count",
           "type": "timeseries"
         },
         {
@@ -2715,7 +2715,98 @@
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 81
+            "y": 38
+          },
+          "id": 130,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "editorMode": "code",
+              "expr": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+              "legendFormat": "${baseLegend}",
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green"
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 38
           },
           "id": 132,
           "options": {
@@ -2805,10 +2896,10 @@
           "gridPos": {
             "h": 8,
             "w": 12,
-            "x": 12,
-            "y": 81
+            "x": 0,
+            "y": 46
           },
-          "id": 130,
+          "id": 131,
           "options": {
             "legend": {
               "calcs": [],
@@ -2828,13 +2919,13 @@
                 "uid": "${DS_PROMETHEUS}"
               },
               "editorMode": "code",
-              "expr": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+              "expr": "metrics_ReplicateDataTimeoutCount_Count",
               "legendFormat": "${baseLegend}",
               "range": true,
               "refId": "A"
             }
           ],
-          "title": "metrics_ReplicateDataCreateConnectionFailCount_Count",
+          "title": "metrics_ReplicateDataTimeoutCount_Count",
           "type": "timeseries"
         }
       ],
@@ -2847,7 +2938,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 89
+        "y": 54
       },
       "id": 12,
       "panels": [
@@ -3494,7 +3585,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 122
+        "y": 55
       },
       "id": 10,
       "panels": [
@@ -4048,7 +4139,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 148
+        "y": 56
       },
       "id": 8,
       "panels": [
@@ -5070,7 +5161,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 197
+        "y": 57
       },
       "id": 50,
       "panels": [
@@ -5626,7 +5717,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 222
+        "y": 58
       },
       "id": 157,
       "panels": [
@@ -5919,7 +6010,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 239
+        "y": 59
       },
       "id": 137,
       "panels": [
@@ -7310,7 +7401,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 310
+        "y": 60
       },
       "id": 110,
       "panels": [
@@ -7506,7 +7597,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 319
+        "y": 61
       },
       "id": 123,
       "panels": [
@@ -7984,7 +8075,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 344
+        "y": 62
       },
       "id": 172,
       "panels": [
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 531b703e3..1ada46e2c 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -148,6 +148,7 @@ These metrics are exposed by Celeborn worker.
         - The time for a worker to handle a pushData RPC sent from a celeborn client.
     - ReplicaPushDataTime
         - The time for a worker to handle a pushData RPC sent from a celeborn worker by replicating.
+    - WriteDataHardSplitCount
     - WriteDataFailCount
     - ReplicateDataFailCount
     - ReplicateDataWriteFailCount
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala
index 553d4d25d..edc3dc75f 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala
@@ -210,6 +210,7 @@ class PushDataHandler(val workerSource: WorkerSource) extends BaseMessageHandler
           logDebug(
             s"Receive push data for committed hard split partition of (shuffle $shuffleKey, " +
               s"map $mapId attempt $attemptId)")
+          workerSource.incCounter(WorkerSource.WRITE_DATA_HARD_SPLIT_COUNT)
           callbackWithTimer.onSuccess(ByteBuffer.wrap(Array[Byte](StatusCode.HARD_SPLIT.getValue)))
         }
       } else {
@@ -474,6 +475,7 @@ class PushDataHandler(val workerSource: WorkerSource) extends BaseMessageHandler
           } else {
             logDebug(s"[Case1] Receive push merged data for committed hard split partition of " +
               s"(shuffle $shuffleKey, map $mapId attempt $attemptId)")
+            workerSource.incCounter(WorkerSource.WRITE_DATA_HARD_SPLIT_COUNT)
             callbackWithTimer.onSuccess(
               ByteBuffer.wrap(Array[Byte](StatusCode.HARD_SPLIT.getValue)))
           }
@@ -484,6 +486,7 @@ class PushDataHandler(val workerSource: WorkerSource) extends BaseMessageHandler
             // after worker restart, some tasks still push data to this HARD_SPLIT partition.
             logDebug(s"[Case2] Receive push merged data for committed hard split partition of " +
               s"(shuffle $shuffleKey, map $mapId attempt $attemptId)")
+            workerSource.incCounter(WorkerSource.WRITE_DATA_HARD_SPLIT_COUNT)
             callbackWithTimer.onSuccess(
               ByteBuffer.wrap(Array[Byte](StatusCode.HARD_SPLIT.getValue)))
           } else {
@@ -1196,6 +1199,7 @@ class PushDataHandler(val workerSource: WorkerSource) extends BaseMessageHandler
         (fileWriter.getDiskFileInfo.getFileLength < partitionSplitMaximumSize)) {
         softSplit.set(true)
       } else {
+        workerSource.incCounter(WorkerSource.WRITE_DATA_HARD_SPLIT_COUNT)
         callback.onSuccess(ByteBuffer.wrap(Array[Byte](StatusCode.HARD_SPLIT.getValue)))
         logTrace(
           s"""
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index e65b83fa6..911db859e 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -28,6 +28,7 @@ class WorkerSource(conf: CelebornConf) extends AbstractSource(conf, MetricsSyste
   // add counters
   addCounter(OPEN_STREAM_FAIL_COUNT)
   addCounter(FETCH_CHUNK_FAIL_COUNT)
+  addCounter(WRITE_DATA_HARD_SPLIT_COUNT)
   addCounter(WRITE_DATA_FAIL_COUNT)
   addCounter(REPLICATE_DATA_FAIL_COUNT)
   addCounter(REPLICATE_DATA_WRITE_FAIL_COUNT)
@@ -84,6 +85,7 @@ object WorkerSource {
   // push data
   val PRIMARY_PUSH_DATA_TIME = "PrimaryPushDataTime"
   val REPLICA_PUSH_DATA_TIME = "ReplicaPushDataTime"
+  val WRITE_DATA_HARD_SPLIT_COUNT = "WriteDataHardSplitCount"
   val WRITE_DATA_FAIL_COUNT = "WriteDataFailCount"
   val REPLICATE_DATA_FAIL_COUNT = "ReplicateDataFailCount"
   val REPLICATE_DATA_WRITE_FAIL_COUNT = "ReplicateDataWriteFailCount"