You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@ratis.apache.org by GitBox <gi...@apache.org> on 2020/11/13 15:35:54 UTC

[GitHub] [incubator-ratis] szetszwo commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

szetszwo commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r523023721



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;

Review comment:
       Need license header.

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());

Review comment:
       change `diff.toString()` to `diff`

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {
+        Stopwatch sw = Stopwatch.createUnstarted();
+        Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+        LOG.info("Starting Ratis JVM pause monitor");
+        while (shouldRun) {
+          sw.reset().start();
+          try {
+            Thread.sleep(SLEEP_INTERVAL_MS);
+          } catch (InterruptedException ie) {
+            return;
+          }
+          long extraSleepTime =
+            sw.elapsed(TimeUnit.MILLISECONDS) - SLEEP_INTERVAL_MS;
+          Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+          if (extraSleepTime > 0) {
+            LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep,
+              gcTimesBeforeSleep));
+          }
+          totalGcExtraSleepTime += extraSleepTime;
+          gcTimesBeforeSleep = gcTimesAfterSleep;
+          int leaderStepDownWaitTime =
+            RaftServerConfigKeys.LeaderElection.leaderStepDownWaitTime(
+              proxy.getProperties()).toIntExact(TimeUnit.MILLISECONDS);
+          int rpcSlownessTimeoutMs = RaftServerConfigKeys.Rpc.slownessTimeout(
+            proxy.getProperties()).toIntExact(TimeUnit.MILLISECONDS);
+          try {
+            if (totalGcExtraSleepTime > leaderStepDownWaitTime) {
+              proxy.getImpls().forEach(RaftServerImpl::stepDownOnJvmPause);
+            }
+            if (totalGcExtraSleepTime > rpcSlownessTimeoutMs) {

Review comment:
       They should use extraSleepTime instead of totalGcExtraSleepTime.

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {

Review comment:
       This while-loop can be removed.  There is another while loop inside.

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/impl/RaftServerImpl.java
##########
@@ -633,6 +633,13 @@ void assertGroup(Object requestorId, RaftGroupId requestorGroupId) throws GroupM
     return pending.getFuture();
   }
 
+  public void stepDownOnJvmPause() {
+    if (isLeader()) {
+      final LeaderState leaderState = role.getLeaderStateNonNull();
+      leaderState.submitStepDownEvent(LeaderState.StepDownReason.JVM_PAUSE);
+    }

Review comment:
       It should use `getLeaderState()` instead of `getLeaderStateNonNull()`
   ```suggestion
       role.getLeaderState().ifPresent(leader -> leader.submitStepDownEvent(LeaderState.StepDownReason.JVM_PAUSE));
   ```

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;

Review comment:
       Remove totalGcExtraSleepTime.  It is used incorrectly in the code below.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org