You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@celeborn.apache.org by ch...@apache.org on 2023/10/23 03:16:18 UTC

[incubator-celeborn] 07/08: [CELEBORN-916] Add new metric about active shuffle file count in worker

This is an automated email from the ASF dual-hosted git repository.

chengpan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-celeborn.git

commit 11c90d8e726a50477a55a0338c37a0d76ec55226
Author: SteNicholas <pr...@163.com>
AuthorDate: Mon Oct 23 09:56:04 2023 +0800

    [CELEBORN-916] Add new metric about active shuffle file count in worker
    
    ### What changes were proposed in this pull request?
    
    Adds new metric `ActiveShuffleFileCount` about active shuffle file count of Celeborn Worker.
    
    ### Why are the changes needed?
    
    `ActiveShuffleSize` metric report the active shuffle size of peer worker at present. Therefore, it's better to introduce `ActiveShuffleFileCount` to report the active shuffle file count of Celeborn Worker.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Internal tests.
    
    Closes #2009 from SteNicholas/CELEBORN-916.
    
    Authored-by: SteNicholas <pr...@163.com>
    Signed-off-by: mingji <fe...@alibaba-inc.com>
---
 METRICS.md                                         |   2 +
 assets/grafana/celeborn-dashboard.json             | 123 ++++++++++++++++++++-
 docs/monitoring.md                                 |   8 +-
 .../celeborn/service/deploy/worker/Worker.scala    |   3 +
 .../service/deploy/worker/WorkerSource.scala       |   4 +-
 .../deploy/worker/storage/StorageManager.scala     |   3 +
 6 files changed, 138 insertions(+), 5 deletions(-)

diff --git a/METRICS.md b/METRICS.md
index 7b5081a9d..42a40fe29 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -95,6 +95,8 @@ Here is an example of grafana dashboard importing.
 |               DiskBuffer               |      worker       | Disk buffers are part of netty used memory, means data need to write to disk but haven't been written to disk.  |
 |             PausePushData              |      worker       |                   PausePushData means the count of worker stopped receiving data from client.                   |
 |       PausePushDataAndReplicate        |      worker       |    PausePushDataAndReplicate means the count of worker stopped receiving data from client and other workers.    |
+|           ActiveShuffleSize            |      worker       |                 The active shuffle size of a worker including master replica and slave replica.                 |
+|         ActiveShuffleFileCount         |      worker       |              The active shuffle file count of a worker including master replica and slave replica.              |
 |         OutstandingFetchCount          |      worker       |                         The count of outstanding fetch request received in peer worker.                         |
 |          OutstandingRpcCount           |      worker       |                          The count of outstanding rpc request received in peer worker.                          |
 |          OutstandingPushCount          |      worker       |                         The count of outstanding push request received in peer worker.                          |
diff --git a/assets/grafana/celeborn-dashboard.json b/assets/grafana/celeborn-dashboard.json
index cb676a140..aa1f7fd4a 100644
--- a/assets/grafana/celeborn-dashboard.json
+++ b/assets/grafana/celeborn-dashboard.json
@@ -542,7 +542,7 @@
             "type": "prometheus",
             "uid": "${DS_PROMETHEUS}"
           },
-          "description": "The active shuffle size.",
+          "description": "The active shuffle size of workers.",
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -634,7 +634,7 @@
             "type": "prometheus",
             "uid": "${DS_PROMETHEUS}"
           },
-          "description": "The active shuffle partition count.",
+          "description": "The active shuffle file count of workers.",
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -1722,6 +1722,7 @@
             "type": "prometheus",
             "uid": "${DS_PROMETHEUS}"
           },
+          "description": "The active shuffle size of a worker including master replica and slave replica.",
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -1834,6 +1835,124 @@
           ],
           "title": "metrics_ActiveShuffleSize_Value",
           "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "description": "The active shuffle file count of a worker including master replica and slave replica.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisCenteredZero": false,
+                "axisColorMode": "text",
+                "axisLabel": "",
+                "axisPlacement": "auto",
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false,
+                "stacking": {
+                  "group": "A",
+                  "mode": "none"
+                },
+                "thresholdsStyle": {
+                  "mode": "off"
+                }
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": [
+              {
+                "__systemRef": "hideSeriesFrom",
+                "matcher": {
+                  "id": "byNames",
+                  "options": {
+                    "mode": "exclude",
+                    "names": [
+                      "metrics_ActiveShuffleFileCount_Value{instance=\"core-1-1:9096\", job=\"RSS\", role=\"Worker\"}"
+                    ],
+                    "prefix": "All except:",
+                    "readOnly": true
+                  }
+                },
+                "properties": [
+                  {
+                    "id": "custom.hideFrom",
+                    "value": {
+                      "legend": false,
+                      "tooltip": false,
+                      "viz": true
+                    }
+                  }
+                ]
+              }
+            ]
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 30
+          },
+          "id": 181,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${DS_PROMETHEUS}"
+              },
+              "editorMode": "builder",
+              "expr": "metrics_ActiveShuffleFileCount_Value",
+              "instant": false,
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "metrics_ActiveShuffleFileCount_Value",
+          "type": "timeseries"
         }
       ],
       "title": "Worker",
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 440b5e4de..228d47a3d 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -100,9 +100,9 @@ These metrics are exposed by Celeborn master.
     - PartitionSize
         - The size of estimated shuffle partition.
     - PartitionWritten
-        - The active shuffle size.
+        - The active shuffle size of workers.
     - PartitionFileCount
-        - The active shuffle partition count.
+        - The active shuffle file count of workers.
     - OfferSlotsTime
         - The time for masters to handle `RequestSlots` request when registering shuffle.
 
@@ -194,6 +194,10 @@ These metrics are exposed by Celeborn worker.
     - PotentialConsumeSpeed
     - UserProduceSpeed
     - WorkerConsumeSpeed
+    - ActiveShuffleSize
+        - The active shuffle size of a worker including master replica and slave replica.
+    - ActiveShuffleFileCount
+        - The active shuffle file count of a worker including master replica and slave replica.
     - OutstandingFetchCount
         - The count of outstanding fetch request.
     - OutstandingRpcCount
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
index 90e5e58ff..1cc85db3d 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala
@@ -304,6 +304,9 @@ private[celeborn] class Worker(
   workerSource.addGauge(WorkerSource.ACTIVE_SHUFFLE_SIZE) { () =>
     storageManager.getActiveShuffleSize()
   }
+  workerSource.addGauge(WorkerSource.ACTIVE_SHUFFLE_FILE_COUNT) { () =>
+    storageManager.getActiveShuffleFileCount()
+  }
   workerSource.addGauge(WorkerSource.PAUSE_PUSH_DATA_TIME) { () =>
     memoryManager.getPausePushDataTime
   }
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
index 20a138f9a..df07970ab 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala
@@ -100,6 +100,7 @@ object WorkerSource {
   // slots
   val SLOTS_ALLOCATED = "SlotsAllocated"
 
+  // connection
   val ACTIVE_CONNECTION_COUNT = "ActiveConnectionCount"
 
   // memory
@@ -124,11 +125,12 @@ object WorkerSource {
   val DEVICE_CELEBORN_FREE_CAPACITY = "DeviceCelebornFreeBytes"
   val DEVICE_CELEBORN_TOTAL_CAPACITY = "DeviceCelebornTotalBytes"
 
-  // Congestion control
+  // congestion control
   val POTENTIAL_CONSUME_SPEED = "PotentialConsumeSpeed"
   val USER_PRODUCE_SPEED = "UserProduceSpeed"
   val WORKER_CONSUME_SPEED = "WorkerConsumeSpeed"
 
   // active shuffle size
   val ACTIVE_SHUFFLE_SIZE = "ActiveShuffleSize"
+  val ACTIVE_SHUFFLE_FILE_COUNT = "ActiveShuffleFileCount"
 }
diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StorageManager.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StorageManager.scala
index c836e98e6..9f370a6bc 100644
--- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StorageManager.scala
+++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StorageManager.scala
@@ -845,6 +845,9 @@ final private[worker] class StorageManager(conf: CelebornConf, workerSource: Abs
     fileInfos.values().asScala.map(_.values().asScala.map(_.getBytesFlushed).sum).sum
   }
 
+  def getActiveShuffleFileCount(): Long = {
+    fileInfos.asScala.values.map(_.size()).sum
+  }
 }
 
 object StorageManager {