You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uniffle.apache.org by ro...@apache.org on 2022/09/21 02:22:56 UTC

[incubator-uniffle] branch master updated: Add more metrics about requiring read memory (#231)

This is an automated email from the ASF dual-hosted git repository.

roryqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new 42e2605d Add more metrics about requiring read memory (#231)
42e2605d is described below

commit 42e2605dad47a923500c2088af9eac49ad2329c6
Author: Junfan Zhang <ju...@outlook.com>
AuthorDate: Wed Sep 21 10:22:50 2022 +0800

    Add more metrics about requiring read memory (#231)
    
    ### What changes were proposed in this pull request?
    Add more metrics about requiring read memory
    1. total_require_read_memory_num
    2. total_require_read_memory_retry_num
    3. total_require_read_memory_failed_num
    
    ### Why are the changes needed?
    These metrics can serve as indicators of high-watermark pressure.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    No need.
---
 .../java/org/apache/uniffle/server/ShuffleServerMetrics.java     | 9 +++++++++
 .../org/apache/uniffle/server/buffer/ShuffleBufferManager.java   | 3 +++
 .../java/org/apache/uniffle/server/ShuffleServerMetricsTest.java | 2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index 88f921b2..0bc8d903 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -52,6 +52,9 @@ public class ShuffleServerMetrics {
   private static final String TOTAL_READ_LOCAL_INDEX_FILE = "total_read_local_index_file";
   private static final String TOTAL_READ_MEMORY_DATA = "total_read_memory_data";
   private static final String TOTAL_READ_TIME = "total_read_time";
+  private static final String TOTAL_REQUIRE_READ_MEMORY = "total_require_read_memory_num";
+  private static final String TOTAL_REQUIRE_READ_MEMORY_RETRY = "total_require_read_memory_retry_num";
+  private static final String TOTAL_REQUIRE_READ_MEMORY_FAILED = "total_require_read_memory_failed_num";
 
   private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM = "local_storage_total_dirs_num";
   private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM = "local_storage_corrupted_dirs_num";
@@ -107,6 +110,9 @@ public class ShuffleServerMetrics {
   public static Counter counterLocalStorageRetryWrite;
   public static Counter counterLocalStorageFailedWrite;
   public static Counter counterLocalStorageSuccessWrite;
+  public static Counter counterTotalRequireReadMemoryNum;
+  public static Counter counterTotalRequireReadMemoryRetryNum;
+  public static Counter counterTotalRequireReadMemoryFailedNum;
 
   public static Gauge gaugeLocalStorageTotalDirsNum;
   public static Gauge gaugeLocalStorageCorruptedDirsNum;
@@ -252,6 +258,9 @@ public class ShuffleServerMetrics {
     counterLocalStorageRetryWrite = metricsManager.addCounter(STORAGE_RETRY_WRITE_LOCAL);
     counterLocalStorageFailedWrite = metricsManager.addCounter(STORAGE_FAILED_WRITE_LOCAL);
     counterLocalStorageSuccessWrite = metricsManager.addCounter(STORAGE_SUCCESS_WRITE_LOCAL);
+    counterTotalRequireReadMemoryNum = metricsManager.addCounter(TOTAL_REQUIRE_READ_MEMORY);
+    counterTotalRequireReadMemoryRetryNum = metricsManager.addCounter(TOTAL_REQUIRE_READ_MEMORY_RETRY);
+    counterTotalRequireReadMemoryFailedNum = metricsManager.addCounter(TOTAL_REQUIRE_READ_MEMORY_FAILED);
 
     gaugeLocalStorageTotalDirsNum = metricsManager.addGauge(LOCAL_STORAGE_TOTAL_DIRS_NUM);
     gaugeLocalStorageCorruptedDirsNum = metricsManager.addGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM);
diff --git a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferManager.java b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferManager.java
index 2e3d09d4..c0ce2f4f 100644
--- a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferManager.java
+++ b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferManager.java
@@ -275,6 +275,7 @@ public class ShuffleBufferManager {
   }
 
   public boolean requireReadMemoryWithRetry(long size) {
+    ShuffleServerMetrics.counterTotalRequireReadMemoryNum.inc();
     for (int i = 0; i < retryNum; i++) {
       synchronized (this) {
         if (readDataMemory.get() + size < readCapacity) {
@@ -284,12 +285,14 @@ public class ShuffleBufferManager {
       }
       LOG.info("Can't require[" + size + "] for read data, current[" + readDataMemory.get()
           + "], capacity[" + readCapacity + "], re-try " + i + " times");
+      ShuffleServerMetrics.counterTotalRequireReadMemoryRetryNum.inc();
       try {
         Thread.sleep(1000);
       } catch (Exception e) {
         LOG.warn("Error happened when require memory", e);
       }
     }
+    ShuffleServerMetrics.counterTotalRequireReadMemoryFailedNum.inc();
     return false;
   }
 
diff --git a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
index cd6b33ec..965a567b 100644
--- a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
+++ b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
@@ -90,7 +90,7 @@ public class ShuffleServerMetricsTest {
     JsonNode actualObj = mapper.readTree(content);
     assertEquals(2, actualObj.size());
     JsonNode metricsNode = actualObj.get("metrics");
-    assertEquals(46, metricsNode.size());
+    assertEquals(49, metricsNode.size());
 
     List<String> expectedMetricNames = Lists.newArrayList(
         ShuffleServerMetrics.STORAGE_TOTAL_WRITE_REMOTE_PREFIX + STORAGE_HOST,