You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/05/15 15:03:24 UTC

[geode] 01/01: GEODE-6583 Integrate phi-accrual failure detection into Geode

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6583b
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 5bdfeda7e3b4a583fe1afe585c0e4729ab4c9ae2
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed May 15 08:01:36 2019 -0700

    GEODE-6583 Integrate phi-accrual failure detection into Geode
    
    give all members a heartbeat if we notice a JVM pause
---
 .../membership/gms/fd/GMSHealthMonitor.java        | 38 ++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 43da869..63b30b4 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -59,6 +59,7 @@ import org.apache.geode.GemFireConfigException;
 import org.apache.geode.SystemConnectException;
 import org.apache.geode.distributed.DistributedMember;
 import org.apache.geode.distributed.internal.DMStats;
+import org.apache.geode.distributed.internal.DistributionConfig;
 import org.apache.geode.distributed.internal.DistributionMessage;
 import org.apache.geode.distributed.internal.membership.InternalDistributedMember;
 import org.apache.geode.distributed.internal.membership.NetView;
@@ -198,6 +199,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
   private DMStats stats;
 
   /**
+   * Interval, in milliseconds, at which the Monitor task is scheduled to run
+   */
+  private long monitorInterval;
+
+  /**
+   *
    * this class is to avoid garbage
    */
   private static class TimeStamp {
@@ -226,6 +233,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
    * member in last interval(member-timeout)
    */
   private class Monitor implements Runnable {
+    /**
+     * Here we use the same threshold for detecting JVM pauses as the StatSampler
+     */
+    private final long MONITOR_DELAY_THRESHOLD =
+        Long.getLong(DistributionConfig.GEMFIRE_PREFIX + "statSamplerDelayThreshold", 3000);
+
 
     final long memberTimeoutInMillis;
 
@@ -246,6 +259,26 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       // this is the start of interval to record member activity
       GMSHealthMonitor.this.currentTimeStamp = currentTime;
 
+
+      long oldTimeStamp = currentTimeStamp;
+      currentTimeStamp = System.currentTimeMillis();
+
+      NetView myView = GMSHealthMonitor.this.currentView;
+      if (myView == null) {
+        return;
+      }
+
+      if (currentTimeStamp - oldTimeStamp > monitorInterval + MONITOR_DELAY_THRESHOLD) {
+        // delay in running this task - don't suspect anyone for a while
+        logger.info(
+            "Failure detector has noticed a JVM pause and is giving all members a heartbeat in view {}",
+            currentView);
+        for (InternalDistributedMember member : myView.getMembers()) {
+          contactedBy(member);
+        }
+        return;
+      }
+
       if (neighbour != null) {
         TimeStamp nextNeighborTS;
         synchronized (GMSHealthMonitor.this) {
@@ -629,8 +662,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
     scheduler = LoggingExecutors.newScheduledThreadPool("Geode Failure Detection Scheduler", 1);
     checkExecutor = LoggingExecutors.newCachedThreadPool("Geode Failure Detection thread ", true);
     Monitor m = this.new Monitor(memberTimeout);
-    long delay = memberTimeout / LOGICAL_INTERVAL;
-    monitorFuture = scheduler.scheduleAtFixedRate(m, delay, delay, TimeUnit.MILLISECONDS);
+    monitorInterval = memberTimeout / LOGICAL_INTERVAL;
+    monitorFuture =
+        scheduler.scheduleAtFixedRate(m, monitorInterval, monitorInterval, TimeUnit.MILLISECONDS);
     serverSocketExecutor =
         LoggingExecutors.newCachedThreadPool("Geode Failure Detection Server thread ", true);
   }