You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iotdb.apache.org by ta...@apache.org on 2023/07/28 07:53:37 UTC
[iotdb] branch master updated: Fixing partially concurrent GC's duration statistics and adding a new GC monitor (#10623)
This is an automated email from the ASF dual-hosted git repository.
tanxinyu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/master by this push:
new 7b02fea4a2f Fixing partially concurrent GC's duration statistics and adding a new GC monitor (#10623)
7b02fea4a2f is described below
commit 7b02fea4a2f8ac3c1765a3c13af4112b7d6c5ecf
Author: Peng Junzhi <78...@users.noreply.github.com>
AuthorDate: Fri Jul 28 15:53:30 2023 +0800
Fixing partially concurrent GC's duration statistics and adding a new GC monitor (#10623)
---
.../iotdb/confignode/service/ConfigNode.java | 2 +
.../db/service/metrics/DataNodeMetricsHelper.java | 4 +
.../iotdb/metrics/metricsets/jvm/JvmGcMetrics.java | 201 ++++++++++----
.../apache/iotdb/metrics/utils/SystemMetric.java | 6 +
.../iotdb/commons/concurrent/ThreadName.java | 2 +
.../commons/service/metric/GcTimeAlerter.java | 54 ++++
.../service/metric/JvmGcMonitorMetrics.java | 290 +++++++++++++++++++++
7 files changed, 514 insertions(+), 45 deletions(-)
diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java
index 39a93cdb2bb..40314a5a856 100644
--- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java
+++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java
@@ -30,6 +30,7 @@ import org.apache.iotdb.commons.conf.IoTDBConstant;
import org.apache.iotdb.commons.exception.StartupException;
import org.apache.iotdb.commons.service.JMXService;
import org.apache.iotdb.commons.service.RegisterManager;
+import org.apache.iotdb.commons.service.metric.JvmGcMonitorMetrics;
import org.apache.iotdb.commons.service.metric.MetricService;
import org.apache.iotdb.commons.utils.TestOnly;
import org.apache.iotdb.confignode.client.ConfigNodeRequestType;
@@ -255,6 +256,7 @@ public class ConfigNode implements ConfigNodeMBean {
MetricService.getInstance().addMetricSet(new ProcessMetrics());
MetricService.getInstance().addMetricSet(new DiskMetrics(IoTDBConstant.CN_ROLE));
MetricService.getInstance().addMetricSet(new NetMetrics(IoTDBConstant.CN_ROLE));
+ MetricService.getInstance().addMetricSet(JvmGcMonitorMetrics.getInstance());
MetricService.getInstance().addMetricSet(ClientManagerMetrics.getInstance());
MetricService.getInstance().addMetricSet(ThreadPoolMetrics.getInstance());
initCpuMetrics();
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java
index cbcba846174..ea188be3cf8 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/DataNodeMetricsHelper.java
@@ -25,6 +25,7 @@ import org.apache.iotdb.commons.concurrent.ThreadName;
import org.apache.iotdb.commons.concurrent.ThreadPoolMetrics;
import org.apache.iotdb.commons.conf.CommonDescriptor;
import org.apache.iotdb.commons.conf.IoTDBConstant;
+import org.apache.iotdb.commons.service.metric.JvmGcMonitorMetrics;
import org.apache.iotdb.commons.service.metric.MetricService;
import org.apache.iotdb.commons.service.metric.PerformanceOverviewMetrics;
import org.apache.iotdb.db.conf.IoTDBDescriptor;
@@ -77,6 +78,9 @@ public class DataNodeMetricsHelper {
// bind performance overview related metrics
MetricService.getInstance().addMetricSet(PerformanceOverviewMetrics.getInstance());
+
+ // bind gc metrics
+ MetricService.getInstance().addMetricSet(JvmGcMonitorMetrics.getInstance());
}
private static void initSystemMetrics() {
diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java
index b0d175a0cc0..413ffba13a8 100644
--- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java
+++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/jvm/JvmGcMetrics.java
@@ -45,6 +45,7 @@ import java.lang.management.MemoryUsage;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
@@ -52,10 +53,11 @@ import java.util.concurrent.atomic.AtomicLong;
/** This file is modified from io.micrometer.core.instrument.binder.jvm.JvmGcMetrics */
public class JvmGcMetrics implements IMetricSet, AutoCloseable {
private static final Logger logger = LoggerFactory.getLogger(JvmGcMetrics.class);
+ private final List<Runnable> notificationListenerCleanUpRunnables = new CopyOnWriteArrayList<>();
private String youngGenPoolName;
private String oldGenPoolName;
private String nonGenerationalMemoryPool;
- private final List<Runnable> notificationListenerCleanUpRunnables = new CopyOnWriteArrayList<>();
+ private final Map<String, AtomicLong> lastGcTotalDurationMap = new ConcurrentHashMap<>();
public JvmGcMetrics() {
for (MemoryPoolMXBean mbean : ManagementFactory.getMemoryPoolMXBeans()) {
@@ -70,6 +72,48 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable {
}
}
+ private static boolean isPartiallyConcurrentGC(GarbageCollectorMXBean gc) {
+ switch (gc.getName()) {
+ // First two are from the 'serial' collector which are not concurrent, obviously.
+ case "Copy":
+ case "MarkSweepCompact":
+ // The following 4 GCs do not contain concurrent execution phase.
+ case "PS MarkSweep":
+ case "PS Scavenge":
+ case "G1 Young Generation":
+ case "ParNew":
+ return false;
+
+ // The following 2 GCs' execution process consists of concurrent phase, which means they can
+ // run simultaneously with the user thread in some phases.
+
+ // Concurrent mark and concurrent sweep
+ case "ConcurrentMarkSweep":
+ // Concurrent mark
+ case "G1 Old Generation":
+ return true;
+ default:
+ // Assume possibly concurrent if unsure
+ return true;
+ }
+ }
+
+ private static boolean isConcurrentPhase(String cause, String name) {
+ return "No GC".equals(cause) || "Shenandoah Cycles".equals(name);
+ }
+
+ private static boolean isYoungGenPool(String name) {
+ return name != null && (name.endsWith("Eden Space") || name.endsWith("Survivor Space"));
+ }
+
+ private static boolean isOldGenPool(String name) {
+ return name != null && (name.endsWith("Old Gen") || name.endsWith("Tenured Gen"));
+ }
+
+ private static boolean isNonGenerationalHeapPool(String name) {
+ return "Shenandoah".equals(name) || "ZHeap".equals(name);
+ }
+
@Override
public void bindTo(AbstractMetricService metricService) {
if (!preCheck()) {
@@ -98,6 +142,13 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable {
liveDataSize,
AtomicLong::get);
+ AtomicLong heapMemUsedPercentage = new AtomicLong(calculateMemoryUsagePercentage());
+ metricService.createAutoGauge(
+ SystemMetric.JVM_GC_MEMORY_USED_PERCENT.toString(),
+ MetricLevel.CORE,
+ heapMemUsedPercentage,
+ AtomicLong::get);
+
Counter allocatedBytes =
metricService.getOrCreateCounter(
SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString(), MetricLevel.CORE);
@@ -124,18 +175,69 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable {
String gcCause = notificationInfo.getGcCause();
String gcAction = notificationInfo.getGcAction();
GcInfo gcInfo = notificationInfo.getGcInfo();
+
+ // The duration supplied in the notification info includes more than just
+ // application stopped time for concurrent GCs (since the concurrent phase is not
+ // stop-the-world).
+ // E.g. For mixed GC or full GC in collector 'G1 old generation', the duration collected
+ // here is more than the actual pause time (the latter can be accessed by GC
+ // log/-XX:PrintGCDetails)
long duration = gcInfo.getDuration();
+
+ // Try and do a better job coming up with a good stopped time
+ // value by asking for and tracking cumulative time spent blocked in GC.
+ if (isPartiallyConcurrentGC(mbean)) {
+ AtomicLong previousTotal =
+ lastGcTotalDurationMap.computeIfAbsent(mbean.getName(), k -> new AtomicLong());
+ long total = mbean.getCollectionTime();
+ duration = total - previousTotal.get(); // may be zero for a really fast collection
+ previousTotal.set(total);
+ }
+
String timerName;
if (isConcurrentPhase(gcCause, notificationInfo.getGcName())) {
timerName = "jvm_gc_concurrent_phase_time";
} else {
timerName = "jvm_gc_pause";
}
+ // create a timer with tags named by gcCause, which binds gcCause with gcDuration
Timer timer =
metricService.getOrCreateTimer(
timerName, MetricLevel.CORE, "action", gcAction, "cause", gcCause);
timer.update(duration, TimeUnit.MILLISECONDS);
+ // add support for ZGC
+ if (mbean.getName().equals("ZGC Cycles")) {
+ Counter cyclesCount =
+ metricService.getOrCreateCounter(
+ SystemMetric.JVM_ZGC_CYCLES_COUNT.toString(), MetricLevel.CORE);
+ cyclesCount.inc();
+ } else if (mbean.getName().equals("ZGC Pauses")) {
+ Counter pausesCount =
+ metricService.getOrCreateCounter(
+ SystemMetric.JVM_ZGC_PAUSES_COUNT.toString(), MetricLevel.CORE);
+ pausesCount.inc();
+ }
+
+ // monitoring old/young GC count, which is helpful for users to locate GC exception.
+ // Unfortunately, the JMX doesn't seem to provide an api for monitoring mixed gc in G1.
+ // In fact, JMX may treat mixed GCs as minor GCs.
+ if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) {
+ Counter oldGcCounter =
+ metricService.getOrCreateCounter(
+ SystemMetric.JVM_GC_YOUNG_GC_COUNT.toString(), MetricLevel.CORE);
+ oldGcCounter.inc();
+ } else if (GcGenerationAge.fromName(notificationInfo.getGcName())
+ == GcGenerationAge.YOUNG) {
+ Counter youngGcCounter =
+ metricService.getOrCreateCounter(
+ SystemMetric.JVM_GC_OLD_GC_COUNT.toString(), MetricLevel.CORE);
+ youngGcCounter.inc();
+ }
+
+ // update memory usage percentage
+ heapMemUsedPercentage.set(calculateMemoryUsagePercentage());
+
// Update promotion and allocation counters
final Map<String, MemoryUsage> before = gcInfo.getMemoryUsageBeforeGc();
final Map<String, MemoryUsage> after = gcInfo.getMemoryUsageAfterGc();
@@ -153,39 +255,43 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable {
final long longLivedMaxAfter = after.get(nonGenerationalMemoryPool).getMax();
maxDataSize.set(longLivedMaxAfter);
}
- return;
}
- if (oldGenPoolName != null) {
- final long oldBefore = before.get(oldGenPoolName).getUsed();
- final long oldAfter = after.get(oldGenPoolName).getUsed();
- final long delta = oldAfter - oldBefore;
- if (delta > 0L && promotedBytes != null) {
- promotedBytes.inc(delta);
+ // should add `else` here, since there are only two
+ // cases: generational and non-generational
+ else {
+ if (oldGenPoolName != null) {
+ final long oldBefore = before.get(oldGenPoolName).getUsed();
+ final long oldAfter = after.get(oldGenPoolName).getUsed();
+ final long delta = oldAfter - oldBefore;
+ if (delta > 0L && promotedBytes != null) {
+ promotedBytes.inc(delta);
+ }
+
+ // Some GC implementations such as G1 can reduce the old gen size as part of a minor
+ // GC (since in JMX, a minor GC of G1 may actually represent mixed GC, which collect
+ // some obj in old gen region). To track the
+ // live data size we record the value if we see a reduction in the old gen heap size
+ // or
+ // after a major GC.
+ if (oldAfter < oldBefore
+ || GcGenerationAge.fromName(notificationInfo.getGcName())
+ == GcGenerationAge.OLD) {
+ liveDataSize.set(oldAfter);
+ final long oldMaxAfter = after.get(oldGenPoolName).getMax();
+ maxDataSize.set(oldMaxAfter);
+ }
}
- // Some GC implementations such as G1 can reduce the old gen size as part of a minor
- // GC. To track the
- // live data size we record the value if we see a reduction in the old gen heap size
- // or
- // after a major GC.
- if (oldAfter < oldBefore
- || GcGenerationAge.fromName(notificationInfo.getGcName())
- == GcGenerationAge.OLD) {
- liveDataSize.set(oldAfter);
- final long oldMaxAfter = after.get(oldGenPoolName).getMax();
- maxDataSize.set(oldMaxAfter);
+ if (youngGenPoolName != null) {
+ countPoolSizeDelta(
+ gcInfo.getMemoryUsageBeforeGc(),
+ gcInfo.getMemoryUsageAfterGc(),
+ allocatedBytes,
+ heapPoolSizeAfterGc,
+ youngGenPoolName);
}
}
-
- if (youngGenPoolName != null) {
- countPoolSizeDelta(
- gcInfo.getMemoryUsageBeforeGc(),
- gcInfo.getMemoryUsageAfterGc(),
- allocatedBytes,
- heapPoolSizeAfterGc,
- youngGenPoolName);
- }
};
NotificationEmitter notificationEmitter = (NotificationEmitter) mbean;
notificationEmitter.addNotificationListener(
@@ -216,6 +322,7 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable {
metricService.remove(
MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_LIVE_DATA_SIZE_BYTES.toString());
metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_MEMORY_ALLOCATED_BYTES.toString());
+ metricService.remove(MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_MEMORY_USED_PERCENT.toString());
if (oldGenPoolName != null) {
metricService.remove(
@@ -242,6 +349,20 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable {
timerName = "jvm_gc_pause";
}
metricService.remove(MetricType.TIMER, timerName, "action", gcAction, "cause", gcCause);
+ if (mbean.getName().equals("ZGC Cycles")) {
+ metricService.remove(
+ MetricType.COUNTER, SystemMetric.JVM_ZGC_CYCLES_COUNT.toString());
+ } else if (mbean.getName().equals("ZGC Pauses")) {
+ metricService.remove(
+ MetricType.COUNTER, SystemMetric.JVM_ZGC_PAUSES_COUNT.toString());
+ }
+ if (GcGenerationAge.fromName(notificationInfo.getGcName()) == GcGenerationAge.OLD) {
+ metricService.remove(
+ MetricType.COUNTER, SystemMetric.JVM_GC_YOUNG_GC_COUNT.toString());
+ } else if (GcGenerationAge.fromName(notificationInfo.getGcName())
+ == GcGenerationAge.YOUNG) {
+ metricService.remove(MetricType.COUNTER, SystemMetric.JVM_GC_OLD_GC_COUNT.toString());
+ }
};
NotificationEmitter notificationEmitter = (NotificationEmitter) mbean;
notificationEmitter.addNotificationListener(
@@ -305,12 +426,18 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable {
notificationListenerCleanUpRunnables.forEach(Runnable::run);
}
+ public long calculateMemoryUsagePercentage() {
+ return (ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getUsed()
+ * 100
+ / Runtime.getRuntime().maxMemory());
+ }
+
enum GcGenerationAge {
OLD,
YOUNG,
UNKNOWN;
- private static Map<String, GcGenerationAge> knownCollectors = new HashMap<>();
+ private static final Map<String, GcGenerationAge> knownCollectors = new HashMap<>();
static {
knownCollectors.put("ConcurrentMarkSweep", OLD);
@@ -327,20 +454,4 @@ public class JvmGcMetrics implements IMetricSet, AutoCloseable {
return knownCollectors.getOrDefault(name, UNKNOWN);
}
}
-
- private static boolean isConcurrentPhase(String cause, String name) {
- return "No GC".equals(cause) || "Shenandoah Cycles".equals(name);
- }
-
- private static boolean isYoungGenPool(String name) {
- return name != null && name.endsWith("Eden Space");
- }
-
- private static boolean isOldGenPool(String name) {
- return name != null && (name.endsWith("Old Gen") || name.endsWith("Tenured Gen"));
- }
-
- private static boolean isNonGenerationalHeapPool(String name) {
- return "Shenandoah".equals(name) || "ZHeap".equals(name);
- }
}
diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java
index 6898a1dab83..645c2307eea 100644
--- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java
+++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/utils/SystemMetric.java
@@ -81,6 +81,12 @@ public enum SystemMetric {
JVM_GC_LIVE_DATA_SIZE_BYTES("jvm_gc_live_data_size_bytes"),
JVM_GC_MEMORY_ALLOCATED_BYTES("jvm_gc_memory_allocated_bytes"),
JVM_GC_MEMORY_PROMOTED_BYTES("jvm_gc_memory_promoted_bytes"),
+ JVM_GC_MEMORY_USED_PERCENT("jvm_gc_memory_used_percent"),
+ JVM_ZGC_CYCLES_COUNT("jvm_zgc_cycles_count"),
+ JVM_ZGC_PAUSES_COUNT("jvm_zgc_pauses_count"),
+ JVM_GC_YOUNG_GC_COUNT("jvm_gc_old_gc_count"),
+ JVM_GC_OLD_GC_COUNT("jvm_gc_young_gc_count"),
+ JVM_GC_ACCUMULATED_TIME_PERCENTAGE("jvm_gc_accumulated_time_percentage"),
// net related
RECEIVED_BYTES("received_bytes"),
RECEIVED_PACKETS("received_packets"),
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java
index a4ee9044800..aff32b22695 100644
--- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java
@@ -138,6 +138,7 @@ public enum ThreadName {
// NOTICE: The thread name of jvm cannot be edited here!
// We list the thread name here just for distinguishing what module the thread belongs to.
JVM_PAUSE_MONITOR("JvmPauseMonitor"),
+ JVM_GC_STATISTICS_MONITOR("JVM-GC-Statistics-Monitor"),
PARALLEL_GC("GC task thread"),
G1_GC("GC Thread"),
G1_MAIN_MARKER("G1 Main Marker"),
@@ -273,6 +274,7 @@ public enum ThreadName {
new HashSet<>(
Arrays.asList(
JVM_PAUSE_MONITOR,
+ JVM_GC_STATISTICS_MONITOR,
PARALLEL_GC,
G1_GC,
G1_MAIN_MARKER,
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java
new file mode 100644
index 00000000000..e4396c09b11
--- /dev/null
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/GcTimeAlerter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.commons.service.metric;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+public class GcTimeAlerter implements JvmGcMonitorMetrics.GcTimeAlertHandler {
+ private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ private static final Logger logger = LoggerFactory.getLogger(GcTimeAlerter.class);
+
+ /**
+ * Alert handler func User can tailor their handle logic here
+ *
+ * @param gcData
+ */
+ @Override
+ public void alert(JvmGcMonitorMetrics.GcData gcData) {
+ logger.warn(
+ "Error metrics taken time: "
+ + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp())))));
+ logger.warn("Gc Time Percentage: " + gcData.getGcTimePercentage() + "%");
+ logger.warn(
+ "Accumulated GC time within current observation window: "
+ + gcData.getGcTimeWithinObsWindow()
+ + " ms");
+ logger.warn(
+ "The observation window is from: "
+ + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getStartObsWindowTs()))))
+ + " to: "
+ + sdf.format(new Date(Long.parseLong(String.valueOf(gcData.getTimestamp())))));
+ logger.warn("The observation window time is: " + gcData.getCurrentObsWindowTs() + " ms.");
+ }
+}
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java
new file mode 100644
index 00000000000..6624c3047de
--- /dev/null
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/JvmGcMonitorMetrics.java
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.commons.service.metric;
+
+import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory;
+import org.apache.iotdb.commons.concurrent.ThreadName;
+import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil;
+import org.apache.iotdb.metrics.AbstractMetricService;
+import org.apache.iotdb.metrics.metricsets.IMetricSet;
+import org.apache.iotdb.metrics.utils.MetricLevel;
+import org.apache.iotdb.metrics.utils.MetricType;
+import org.apache.iotdb.metrics.utils.SystemMetric;
+
+import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.concurrent.Future;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+
+public class JvmGcMonitorMetrics implements IMetricSet {
+ // Duration of observation window
+ public static final long OBSERVATION_WINDOW_MS = TimeUnit.MINUTES.toMillis(1);
+ // Interval for data collection
+ public static final long SLEEP_INTERVAL_MS = TimeUnit.SECONDS.toMillis(5);
+ // Max GC time threshold
+ public static final long MAX_GC_TIME_PERCENTAGE = 40L;
+ // The time when IoTDB start running
+ private static long startTime;
+ private static final Logger logger = LoggerFactory.getLogger(JvmGcMonitorMetrics.class);
+ private final ScheduledExecutorService scheduledGCInfoMonitor =
+ IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor(
+ ThreadName.JVM_GC_STATISTICS_MONITOR.getName());
+ private Future<?> scheduledGcMonitorFuture;
+ // Ring buffers containing GC timings and timestamps when timings were taken
+ private final TsAndData[] gcDataBuf;
+ // Buffer size
+ private final int bufSize;
+ // Buffer start index
+ private int startIdx;
+ // Buffer end index
+ private int endIdx;
+ // Container to hold collected GC data
+ private final GcData curData = new GcData();
+ // Hook function called with GC exception
+ private final GcTimeAlertHandler alertHandler;
+
+ public JvmGcMonitorMetrics() {
+ bufSize = (int) (OBSERVATION_WINDOW_MS / SLEEP_INTERVAL_MS + 2);
+ // Prevent the user from accidentally creating an abnormally big buffer, which will result in
+ // slow calculations and likely inaccuracy.
+ Preconditions.checkArgument(bufSize <= 128 * 1024);
+ gcDataBuf = new TsAndData[bufSize];
+ for (int i = 0; i < bufSize; i++) {
+ gcDataBuf[i] = new TsAndData();
+ }
+
+ alertHandler = new GcTimeAlerter();
+ }
+
+ @Override
+ public void bindTo(AbstractMetricService metricService) {
+ metricService.createAutoGauge(
+ SystemMetric.JVM_GC_ACCUMULATED_TIME_PERCENTAGE.toString(),
+ MetricLevel.CORE,
+ curData,
+ GcData::getGcTimePercentage);
+
+ startTime = System.currentTimeMillis();
+ // current collect time: startTime + start delay(50ms)
+ gcDataBuf[startIdx].setValues(startTime + TimeUnit.MILLISECONDS.toMillis(50), 0);
+ scheduledGcMonitorFuture =
+ ScheduledExecutorUtil.safelyScheduleWithFixedDelay(
+ scheduledGCInfoMonitor,
+ this::scheduledMonitoring,
+ TimeUnit.MILLISECONDS.toMillis(50), // to prevent / ZERO exception
+ SLEEP_INTERVAL_MS,
+ TimeUnit.MILLISECONDS);
+ }
+
+ @Override
+ public void unbindFrom(AbstractMetricService metricService) {
+ metricService.remove(
+ MetricType.AUTO_GAUGE, SystemMetric.JVM_GC_ACCUMULATED_TIME_PERCENTAGE.toString());
+ if (scheduledGcMonitorFuture != null) {
+ scheduledGcMonitorFuture.cancel(false);
+ scheduledGcMonitorFuture = null;
+ logger.info("JVM GC scheduled monitor is stopped successfully.");
+ }
+ }
+
+ private void scheduledMonitoring() {
+ calculateGCTimePercentageWithinObservedInterval();
+ if (alertHandler != null && curData.gcTimePercentage.get() > MAX_GC_TIME_PERCENTAGE) {
+ alertHandler.alert(curData.clone());
+ }
+ }
+
+ private void calculateGCTimePercentageWithinObservedInterval() {
+ long prevTotalGcTime = curData.getAccumulatedGcTime();
+ long totalGcTime = 0;
+ for (GarbageCollectorMXBean gcBean : ManagementFactory.getGarbageCollectorMXBeans()) {
+ totalGcTime += gcBean.getCollectionTime();
+ }
+ long gcTimeWithinSleepInterval = totalGcTime - prevTotalGcTime;
+
+ long curTime = System.currentTimeMillis();
+ long gcMonitorRunTime = curTime - startTime;
+
+ endIdx = (endIdx + 1) % bufSize;
+ gcDataBuf[endIdx].setValues(curTime, gcTimeWithinSleepInterval);
+
+ // Move startIdx forward until we reach the first buffer entry with
+ // timestamp within the observation window.
+ long startObsWindowTs = curTime - OBSERVATION_WINDOW_MS;
+ while (gcDataBuf[startIdx].ts < startObsWindowTs && startIdx != endIdx) {
+ startIdx = (startIdx + 1) % bufSize;
+ }
+
+ // Calculate total GC time within observationWindowMs.
+ // We should be careful about GC time that passed before the first timestamp
+ // in our observation window.
+ long gcTimeWithinObservationWindow =
+ Math.min(gcDataBuf[startIdx].gcPause, gcDataBuf[startIdx].ts - startObsWindowTs);
+ if (startIdx != endIdx) {
+ for (int i = (startIdx + 1) % bufSize; i != endIdx; i = (i + 1) % bufSize) {
+ gcTimeWithinObservationWindow += gcDataBuf[i].gcPause;
+ }
+ }
+
+ curData.update(
+ curTime,
+ startObsWindowTs,
+ totalGcTime,
+ gcTimeWithinObservationWindow,
+ (int)
+ (gcTimeWithinObservationWindow
+ * 100
+ / Math.min(OBSERVATION_WINDOW_MS, gcMonitorRunTime)));
+ }
+
+ /** Encapsulates data about GC pauses measured at the specific timestamp. */
+ public static class GcData implements Cloneable {
+ // The time when this object get updated.
+ private final AtomicLong timestamp = new AtomicLong();
+ // The theoretical start time of the observation window, usually equal to `timestamp -
+ // OBSERVATION_WINDOW_MS`
+ private final AtomicLong startObsWindowTs = new AtomicLong();
+ // Accumulated GC time since the start of IoTDB.
+ private final AtomicLong accumulatedGcTime = new AtomicLong();
+ // The percentage (0..100) of time that the JVM spent in GC pauses within the observation window
+ private final AtomicLong gcTimePercentage = new AtomicLong();
+ // Accumulated GC time within the latest observation window.
+ private final AtomicLong gcTimeWithinObsWindow = new AtomicLong();
+
+ /**
+ * Returns the length of current observation window, usually equal to OBSERVATION_WINDOW_MS. If
+ * IoTDB is started after the start of the theoretical time window, then IoTDB startup time is
+ * returned.
+ *
+ * @return current observation window time, millisecond.
+ */
+ public long getCurrentObsWindowTs() {
+ return Math.min(timestamp.get() - startTime, timestamp.get() - startObsWindowTs.get());
+ }
+
+ /**
+ * Returns the absolute timestamp when this measurement was taken.
+ *
+ * @return timestamp.
+ */
+ public long getTimestamp() {
+ return timestamp.get();
+ }
+
+ /**
+ * Returns the start timestamp of the latest observation window.
+ *
+ * @return the actual start timestamp of the obs window.
+ */
+ public long getStartObsWindowTs() {
+ return Math.max(startObsWindowTs.get(), startTime);
+ }
+
+ /**
+ * Returns accumulated GC time since the start of IoTDB.
+ *
+ * @return AccumulatedGcTime.
+ */
+ public long getAccumulatedGcTime() {
+ return accumulatedGcTime.get();
+ }
+
+ /**
+ * Returns accumulated GC time within the latest observation window.
+ *
+ * @return gcTimeWithinObsWindow.
+ */
+ public long getGcTimeWithinObsWindow() {
+ return gcTimeWithinObsWindow.get();
+ }
+
+ /**
+ * Returns the percentage (0..100) of time that the JVM spent in GC pauses within the
+ * observation window of the associated GCTimeMonitor.
+ *
+ * @return GcTimePercentage.
+ */
+ public long getGcTimePercentage() {
+ return gcTimePercentage.get();
+ }
+
+ private synchronized void update(
+ long inTimestamp,
+ long inStartObsWindowTs,
+ long inTotalGcTime,
+ long inGcTimeWithinObsWindow,
+ int inGcTimePercentage) {
+ this.timestamp.set(inTimestamp);
+ this.startObsWindowTs.set(inStartObsWindowTs);
+ this.accumulatedGcTime.set(inTotalGcTime);
+ this.gcTimeWithinObsWindow.set(inGcTimeWithinObsWindow);
+ this.gcTimePercentage.set(inGcTimePercentage);
+ }
+
+ @Override
+ public synchronized GcData clone() {
+ try {
+ return (GcData) super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ private static class TsAndData {
+ // Timestamp when this measurement was taken
+ private long ts;
+ // Total GC pause time within the interval between ts
+ // and the timestamp of the previous measurement.
+ private long gcPause;
+
+ void setValues(long inTs, long inGcPause) {
+ this.ts = inTs;
+ this.gcPause = inGcPause;
+ }
+ }
+
+ /**
+ * The user can provide an instance of a class implementing this interface when initializing a
+ * GCTimeMonitor to receive alerts when GC time percentage exceeds the specified threshold.
+ */
+ public interface GcTimeAlertHandler {
+ void alert(GcData gcData);
+ }
+
+ private static class JvmGcMetricsHolder {
+
+ private static final JvmGcMonitorMetrics INSTANCE = new JvmGcMonitorMetrics();
+
+ private JvmGcMetricsHolder() {
+ // empty constructor
+ }
+ }
+
+ public static JvmGcMonitorMetrics getInstance() {
+ return JvmGcMetricsHolder.INSTANCE;
+ }
+}