You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uniffle.apache.org by ro...@apache.org on 2022/09/21 02:22:56 UTC
[incubator-uniffle] branch master updated: Add more metrics about requiring read memory (#231)
This is an automated email from the ASF dual-hosted git repository.
roryqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new 42e2605d Add more metrics about requiring read memory (#231)
42e2605d is described below
commit 42e2605dad47a923500c2088af9eac49ad2329c6
Author: Junfan Zhang <ju...@outlook.com>
AuthorDate: Wed Sep 21 10:22:50 2022 +0800
Add more metrics about requiring read memory (#231)
### What changes were proposed in this pull request?
Add more metrics about requiring read memory
1. total_require_read_memory_num
2. total_require_read_memory_retry_num
3. total_require_read_memory_failed_num
### Why are the changes needed?
These metrics can serve as indicators of high-watermark pressure.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
No need.
---
.../java/org/apache/uniffle/server/ShuffleServerMetrics.java | 9 +++++++++
.../org/apache/uniffle/server/buffer/ShuffleBufferManager.java | 3 +++
.../java/org/apache/uniffle/server/ShuffleServerMetricsTest.java | 2 +-
3 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
index 88f921b2..0bc8d903 100644
--- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
+++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java
@@ -52,6 +52,9 @@ public class ShuffleServerMetrics {
private static final String TOTAL_READ_LOCAL_INDEX_FILE = "total_read_local_index_file";
private static final String TOTAL_READ_MEMORY_DATA = "total_read_memory_data";
private static final String TOTAL_READ_TIME = "total_read_time";
+ private static final String TOTAL_REQUIRE_READ_MEMORY = "total_require_read_memory_num";
+ private static final String TOTAL_REQUIRE_READ_MEMORY_RETRY = "total_require_read_memory_retry_num";
+ private static final String TOTAL_REQUIRE_READ_MEMORY_FAILED = "total_require_read_memory_failed_num";
private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM = "local_storage_total_dirs_num";
private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM = "local_storage_corrupted_dirs_num";
@@ -107,6 +110,9 @@ public class ShuffleServerMetrics {
public static Counter counterLocalStorageRetryWrite;
public static Counter counterLocalStorageFailedWrite;
public static Counter counterLocalStorageSuccessWrite;
+ public static Counter counterTotalRequireReadMemoryNum;
+ public static Counter counterTotalRequireReadMemoryRetryNum;
+ public static Counter counterTotalRequireReadMemoryFailedNum;
public static Gauge gaugeLocalStorageTotalDirsNum;
public static Gauge gaugeLocalStorageCorruptedDirsNum;
@@ -252,6 +258,9 @@ public class ShuffleServerMetrics {
counterLocalStorageRetryWrite = metricsManager.addCounter(STORAGE_RETRY_WRITE_LOCAL);
counterLocalStorageFailedWrite = metricsManager.addCounter(STORAGE_FAILED_WRITE_LOCAL);
counterLocalStorageSuccessWrite = metricsManager.addCounter(STORAGE_SUCCESS_WRITE_LOCAL);
+ counterTotalRequireReadMemoryNum = metricsManager.addCounter(TOTAL_REQUIRE_READ_MEMORY);
+ counterTotalRequireReadMemoryRetryNum = metricsManager.addCounter(TOTAL_REQUIRE_READ_MEMORY_RETRY);
+ counterTotalRequireReadMemoryFailedNum = metricsManager.addCounter(TOTAL_REQUIRE_READ_MEMORY_FAILED);
gaugeLocalStorageTotalDirsNum = metricsManager.addGauge(LOCAL_STORAGE_TOTAL_DIRS_NUM);
gaugeLocalStorageCorruptedDirsNum = metricsManager.addGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM);
diff --git a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferManager.java b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferManager.java
index 2e3d09d4..c0ce2f4f 100644
--- a/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferManager.java
+++ b/server/src/main/java/org/apache/uniffle/server/buffer/ShuffleBufferManager.java
@@ -275,6 +275,7 @@ public class ShuffleBufferManager {
}
public boolean requireReadMemoryWithRetry(long size) {
+ ShuffleServerMetrics.counterTotalRequireReadMemoryNum.inc();
for (int i = 0; i < retryNum; i++) {
synchronized (this) {
if (readDataMemory.get() + size < readCapacity) {
@@ -284,12 +285,14 @@ public class ShuffleBufferManager {
}
LOG.info("Can't require[" + size + "] for read data, current[" + readDataMemory.get()
+ "], capacity[" + readCapacity + "], re-try " + i + " times");
+ ShuffleServerMetrics.counterTotalRequireReadMemoryRetryNum.inc();
try {
Thread.sleep(1000);
} catch (Exception e) {
LOG.warn("Error happened when require memory", e);
}
}
+ ShuffleServerMetrics.counterTotalRequireReadMemoryFailedNum.inc();
return false;
}
diff --git a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
index cd6b33ec..965a567b 100644
--- a/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
+++ b/server/src/test/java/org/apache/uniffle/server/ShuffleServerMetricsTest.java
@@ -90,7 +90,7 @@ public class ShuffleServerMetricsTest {
JsonNode actualObj = mapper.readTree(content);
assertEquals(2, actualObj.size());
JsonNode metricsNode = actualObj.get("metrics");
- assertEquals(46, metricsNode.size());
+ assertEquals(49, metricsNode.size());
List<String> expectedMetricNames = Lists.newArrayList(
ShuffleServerMetrics.STORAGE_TOTAL_WRITE_REMOTE_PREFIX + STORAGE_HOST,