You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@ratis.apache.org by GitBox <gi...@apache.org> on 2020/11/13 09:35:42 UTC

[GitHub] [incubator-ratis] bshashikant opened a new pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

bshashikant opened a new pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275


   
   
   ## What changes were proposed in this pull request?
   
   Idea of JvmpauseMonitor taken from hadoop with two additional conditions added:
   
   1) If the total jvm pause time exceeds maxRpcTimeout in Ratis, ratis leader will stepdown
   2) If the total pause time excceds rpcSlownesstimeout, kill all the ratis pipeline on the node
   
   ## What is the link to the Apache JIRA
   https://issues.apache.org/jira/browse/RATIS-1149
   
   ## How was this patch tested?
   Will be adding init tests
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524969253



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.ratis.server;
+
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff);
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      int leaderStepDownWaitTime =
+        RaftServerConfigKeys.LeaderElection.leaderStepDownWaitTime(proxy.getProperties()).toIntExact(TimeUnit
+                                                                                                       .MILLISECONDS);
+      int rpcSlownessTimeoutMs = RaftServerConfigKeys.Rpc.slownessTimeout(proxy.getProperties()).toIntExact(
+        TimeUnit.MILLISECONDS);
+      Stopwatch sw = Stopwatch.createUnstarted();
+      Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+      LOG.info("Starting Ratis JVM pause monitor");
+      while (shouldRun) {
+        sw.reset().start();
+        try {
+          Thread.sleep(SLEEP_INTERVAL_MS);
+        } catch (InterruptedException ie) {
+          return;
+        }
+        long extraSleepTime = sw.elapsed(TimeUnit.MILLISECONDS) - SLEEP_INTERVAL_MS;
+        Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+        if (extraSleepTime > 0) {
+          LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));

Review comment:
       Do we need `gcTimesBeforeSleep = gcTimesAfterSleep` after warn log?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524846351



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {
+        Stopwatch sw = Stopwatch.createUnstarted();
+        Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+        LOG.info("Starting Ratis JVM pause monitor");
+        while (shouldRun) {
+          sw.reset().start();
+          try {
+            Thread.sleep(SLEEP_INTERVAL_MS);
+          } catch (InterruptedException ie) {
+            return;
+          }
+          long extraSleepTime =
+            sw.elapsed(TimeUnit.MILLISECONDS) - SLEEP_INTERVAL_MS;
+          Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+          if (extraSleepTime > 0) {
+            LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep,
+              gcTimesBeforeSleep));
+          }
+          totalGcExtraSleepTime += extraSleepTime;
+          gcTimesBeforeSleep = gcTimesAfterSleep;
+          int leaderStepDownWaitTime =
+            RaftServerConfigKeys.LeaderElection.leaderStepDownWaitTime(
+              proxy.getProperties()).toIntExact(TimeUnit.MILLISECONDS);
+          int rpcSlownessTimeoutMs = RaftServerConfigKeys.Rpc.slownessTimeout(

Review comment:
       We can get these const variable out of while loop.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#issuecomment-728809901


   @bshashikant Thanks the patch. @szetszwo @amaliujia Thanks for review. I have merged it.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] amaliujia commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
amaliujia commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r523234860



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/impl/RaftServerProxy.java
##########
@@ -292,7 +295,7 @@ public RaftServerImpl getImpl(RaftGroupId groupId) throws IOException {
     return IOUtils.getFromFuture(getImplFuture(groupId), this::getId);
   }
 
-  List<RaftServerImpl> getImpls() throws IOException {
+  public List<RaftServerImpl> getImpls() throws IOException {

Review comment:
       Instead of making this function public, will it better to expose this function to `RaftServer`(though I am not sure whether it is good to expose RaftServerImpl to RaftServer)

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {
+        Stopwatch sw = Stopwatch.createUnstarted();
+        Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+        LOG.info("Starting Ratis JVM pause monitor");
+        while (shouldRun) {
+          sw.reset().start();
+          try {
+            Thread.sleep(SLEEP_INTERVAL_MS);
+          } catch (InterruptedException ie) {
+            return;

Review comment:
       Will call `stop()` instead of return better?

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {

Review comment:
       +1

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {
+        Stopwatch sw = Stopwatch.createUnstarted();
+        Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+        LOG.info("Starting Ratis JVM pause monitor");
+        while (shouldRun) {
+          sw.reset().start();
+          try {
+            Thread.sleep(SLEEP_INTERVAL_MS);
+          } catch (InterruptedException ie) {
+            return;

Review comment:
       Instead of return, will call `stop()`  better?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang merged pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang merged pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] bshashikant commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
bshashikant commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524974076



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.ratis.server;
+
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff);
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      int leaderStepDownWaitTime =
+        RaftServerConfigKeys.LeaderElection.leaderStepDownWaitTime(proxy.getProperties()).toIntExact(TimeUnit
+                                                                                                       .MILLISECONDS);
+      int rpcSlownessTimeoutMs = RaftServerConfigKeys.Rpc.slownessTimeout(proxy.getProperties()).toIntExact(
+        TimeUnit.MILLISECONDS);
+      Stopwatch sw = Stopwatch.createUnstarted();
+      Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+      LOG.info("Starting Ratis JVM pause monitor");
+      while (shouldRun) {
+        sw.reset().start();
+        try {
+          Thread.sleep(SLEEP_INTERVAL_MS);
+        } catch (InterruptedException ie) {
+          return;
+        }
+        long extraSleepTime = sw.elapsed(TimeUnit.MILLISECONDS) - SLEEP_INTERVAL_MS;
+        Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+        if (extraSleepTime > 0) {
+          LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));

Review comment:
       This is addressed in the latest patch.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#issuecomment-726663367


   @bshashikant Thanks the patch. It looks good to me.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524845988



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {
+        Stopwatch sw = Stopwatch.createUnstarted();
+        Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+        LOG.info("Starting Ratis JVM pause monitor");
+        while (shouldRun) {
+          sw.reset().start();
+          try {
+            Thread.sleep(SLEEP_INTERVAL_MS);
+          } catch (InterruptedException ie) {
+            return;
+          }
+          long extraSleepTime =
+            sw.elapsed(TimeUnit.MILLISECONDS) - SLEEP_INTERVAL_MS;
+          Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+          if (extraSleepTime > 0) {
+            LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep,
+              gcTimesBeforeSleep));
+          }
+          totalGcExtraSleepTime += extraSleepTime;
+          gcTimesBeforeSleep = gcTimesAfterSleep;
+          int leaderStepDownWaitTime =
+            RaftServerConfigKeys.LeaderElection.leaderStepDownWaitTime(
+              proxy.getProperties()).toIntExact(TimeUnit.MILLISECONDS);
+          int rpcSlownessTimeoutMs = RaftServerConfigKeys.Rpc.slownessTimeout(
+            proxy.getProperties()).toIntExact(TimeUnit.MILLISECONDS);
+          try {
+            if (totalGcExtraSleepTime > leaderStepDownWaitTime) {
+              proxy.getImpls().forEach(RaftServerImpl::stepDownOnJvmPause);
+            }
+            if (totalGcExtraSleepTime > rpcSlownessTimeoutMs) {

Review comment:
        If close proxy,  we do not need to step down. So
    ```
   if (extraSleepTime > rpcSlownessTimeoutMs) {
      proxy.close();
    } else if (extraSleepTime > leaderStepDownWaitTime) {
     proxy.getImpls().forEach(RaftServerImpl::stepDownOnJvmPause);
    }
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] bshashikant commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
bshashikant commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524904940



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/impl/RaftServerProxy.java
##########
@@ -346,6 +350,7 @@ public void close() {
         LOG.warn(getId() + ": Failed to close " + SupportedDataStreamType.NETTY + " server", ignored);
       }
     });
+    pauseMonitor.stop();

Review comment:
       I think, this is the last step in the proxy.close() call. The exception should be thrown.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524836727



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),

Review comment:
       The max line length is 120 in ratis. do not need to change line. The same to other places.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524837751



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {

Review comment:
       Seems useless.

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {

Review comment:
       Seems useless.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] szetszwo commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
szetszwo commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r523023721



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;

Review comment:
       Need license header.

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());

Review comment:
       change `diff.toString()` to `diff`

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {
+        Stopwatch sw = Stopwatch.createUnstarted();
+        Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+        LOG.info("Starting Ratis JVM pause monitor");
+        while (shouldRun) {
+          sw.reset().start();
+          try {
+            Thread.sleep(SLEEP_INTERVAL_MS);
+          } catch (InterruptedException ie) {
+            return;
+          }
+          long extraSleepTime =
+            sw.elapsed(TimeUnit.MILLISECONDS) - SLEEP_INTERVAL_MS;
+          Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+          if (extraSleepTime > 0) {
+            LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep,
+              gcTimesBeforeSleep));
+          }
+          totalGcExtraSleepTime += extraSleepTime;
+          gcTimesBeforeSleep = gcTimesAfterSleep;
+          int leaderStepDownWaitTime =
+            RaftServerConfigKeys.LeaderElection.leaderStepDownWaitTime(
+              proxy.getProperties()).toIntExact(TimeUnit.MILLISECONDS);
+          int rpcSlownessTimeoutMs = RaftServerConfigKeys.Rpc.slownessTimeout(
+            proxy.getProperties()).toIntExact(TimeUnit.MILLISECONDS);
+          try {
+            if (totalGcExtraSleepTime > leaderStepDownWaitTime) {
+              proxy.getImpls().forEach(RaftServerImpl::stepDownOnJvmPause);
+            }
+            if (totalGcExtraSleepTime > rpcSlownessTimeoutMs) {

Review comment:
       They should use extraSleepTime instead of totalGcExtraSleepTime.

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {

Review comment:
       This while-loop can be removed.  There is another while loop inside.

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/impl/RaftServerImpl.java
##########
@@ -633,6 +633,13 @@ void assertGroup(Object requestorId, RaftGroupId requestorGroupId) throws GroupM
     return pending.getFuture();
   }
 
+  public void stepDownOnJvmPause() {
+    if (isLeader()) {
+      final LeaderState leaderState = role.getLeaderStateNonNull();
+      leaderState.submitStepDownEvent(LeaderState.StepDownReason.JVM_PAUSE);
+    }

Review comment:
       It should use `getLeaderState()` instead of `getLeaderStateNonNull()`
   ```suggestion
       role.getLeaderState().ifPresent(leader -> leader.submitStepDownEvent(LeaderState.StepDownReason.JVM_PAUSE));
   ```

##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;

Review comment:
       Remove totalGcExtraSleepTime.  It is used incorrectly in the code below.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524849196



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/impl/RaftServerProxy.java
##########
@@ -346,6 +350,7 @@ public void close() {
         LOG.warn(getId() + ": Failed to close " + SupportedDataStreamType.NETTY + " server", ignored);
       }
     });
+    pauseMonitor.stop();

Review comment:
       Do we need try catch ?  so that close can continue when pauseMonitor.stop() throw exception.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] bshashikant commented on pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
bshashikant commented on pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#issuecomment-726658388


   @szetszwo / @runzhiwang , can you please have a look at the idea. This is a patch in progress and will add unit tests once the idea seems ok to you.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] szetszwo commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
szetszwo commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r523366237



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/impl/RaftServerProxy.java
##########
@@ -292,7 +295,7 @@ public RaftServerImpl getImpl(RaftGroupId groupId) throws IOException {
     return IOUtils.getFromFuture(getImplFuture(groupId), this::getId);
   }
 
-  List<RaftServerImpl> getImpls() throws IOException {
+  public List<RaftServerImpl> getImpls() throws IOException {

Review comment:
       We should not expose RaftServerImpl to RaftServer.  Thanks for asking.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] runzhiwang commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
runzhiwang commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524836727



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),

Review comment:
       The max line length is 120 in ratis. do not need to change line.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [incubator-ratis] bshashikant commented on a change in pull request #275: RATIS-1149. Leader should step down and initiate a stateMachine action in case of jvm pauses.

Posted by GitBox <gi...@apache.org>.
bshashikant commented on a change in pull request #275:
URL: https://github.com/apache/incubator-ratis/pull/275#discussion_r524903696



##########
File path: ratis-server/src/main/java/org/apache/ratis/server/JvmPauseMonitor.java
##########
@@ -0,0 +1,181 @@
+package org.apache.ratis.server;
+
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.server.impl.RaftServerImpl;
+import org.apache.ratis.server.impl.RaftServerProxy;
+import org.apache.ratis.thirdparty.com.google.common.base.Joiner;
+import org.apache.ratis.thirdparty.com.google.common.base.Stopwatch;
+import org.apache.ratis.thirdparty.com.google.common.collect.Lists;
+import org.apache.ratis.thirdparty.com.google.common.collect.Maps;
+import org.apache.ratis.thirdparty.com.google.common.collect.Sets;
+import org.apache.ratis.util.Daemon;
+import org.apache.ratis.util.JavaUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+
+public class JvmPauseMonitor {
+  public static final Logger LOG =
+    LoggerFactory.getLogger(JvmPauseMonitor.class);
+  /**
+   * The target sleep time
+   */
+  private static final long SLEEP_INTERVAL_MS = 500;
+  private long totalGcExtraSleepTime = 0;
+
+  private Thread monitorThread;
+  private volatile boolean shouldRun = true;
+  private final RaftServerProxy proxy;
+
+  public boolean isStarted() {
+    return monitorThread != null;
+  }
+
+  public long getTotalGcExtraSleepTime() {
+    return totalGcExtraSleepTime;
+  }
+
+  private String formatMessage(long extraSleepTime,
+                               Map<String, GcTimes> gcTimesAfterSleep,
+                               Map<String, GcTimes> gcTimesBeforeSleep) {
+
+    Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
+      gcTimesBeforeSleep.keySet());
+    List<String> gcDiffs = Lists.newArrayList();
+    for (String name : gcBeanNames) {
+      GcTimes diff = gcTimesAfterSleep.get(name)
+                       .subtract(gcTimesBeforeSleep.get(name));
+      if (diff.gcCount != 0) {
+        gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff
+                                                                     .toString());
+      }
+    }
+
+    String ret = "Detected pause in JVM or host machine (eg GC): "
+                   + "pause of approximately " + extraSleepTime + "ms\n";
+    if (gcDiffs.isEmpty()) {
+      ret += "No GCs detected";
+    } else {
+      ret += Joiner.on("\n").join(gcDiffs);
+    }
+    return ret;
+  }
+
+  public JvmPauseMonitor(RaftServerProxy proxy) {
+    this.proxy = proxy;
+  }
+
+  private Map<String, GcTimes> getGcTimes() {
+    Map<String, GcTimes> map = Maps.newHashMap();
+    List<GarbageCollectorMXBean> gcBeans =
+      ManagementFactory.getGarbageCollectorMXBeans();
+    for (GarbageCollectorMXBean gcBean : gcBeans) {
+      map.put(gcBean.getName(), new GcTimes(gcBean));
+    }
+    return map;
+  }
+
+  private static class GcTimes {
+    private GcTimes(GarbageCollectorMXBean gcBean) {
+      gcCount = gcBean.getCollectionCount();
+      gcTimeMillis = gcBean.getCollectionTime();
+    }
+
+    private GcTimes(long count, long time) {
+      this.gcCount = count;
+      this.gcTimeMillis = time;
+    }
+
+    private GcTimes subtract(GcTimes other) {
+      return new GcTimes(this.gcCount - other.gcCount,
+        this.gcTimeMillis - other.gcTimeMillis);
+    }
+
+    @Override
+    public String toString() {
+      return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+    }
+
+    private long gcCount;
+    private long gcTimeMillis;
+  }
+
+  class Monitor implements Runnable {
+    private final String name = JvmPauseMonitor.this + "-" + JavaUtils
+                                                               .getClassSimpleName(getClass());
+
+    public void run() {
+      while (shouldRun) {
+        Stopwatch sw = Stopwatch.createUnstarted();
+        Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+        LOG.info("Starting Ratis JVM pause monitor");
+        while (shouldRun) {
+          sw.reset().start();
+          try {
+            Thread.sleep(SLEEP_INTERVAL_MS);
+          } catch (InterruptedException ie) {
+            return;

Review comment:
       I think return is better , as the monitor will be stopped by RaftServerProxy on close.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org