You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/05/15 15:03:23 UTC

[geode] branch feature/GEODE-6583b created (now 5bdfeda)

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a change to branch feature/GEODE-6583b
in repository https://gitbox.apache.org/repos/asf/geode.git.


      at 5bdfeda  GEODE-6583 Integrate phi-accrual failure detection into Geode

This branch includes the following new commits:

     new 5bdfeda  GEODE-6583 Integrate phi-accrual failure detection into Geode

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[geode] 01/01: GEODE-6583 Integrate phi-accrual failure detection into Geode

Posted by bs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6583b
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 5bdfeda7e3b4a583fe1afe585c0e4729ab4c9ae2
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed May 15 08:01:36 2019 -0700

    GEODE-6583 Integrate phi-accrual failure detection into Geode
    
    give all members a heartbeat if we notice a JVM pause
---
 .../membership/gms/fd/GMSHealthMonitor.java        | 38 ++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 43da869..63b30b4 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -59,6 +59,7 @@ import org.apache.geode.GemFireConfigException;
 import org.apache.geode.SystemConnectException;
 import org.apache.geode.distributed.DistributedMember;
 import org.apache.geode.distributed.internal.DMStats;
+import org.apache.geode.distributed.internal.DistributionConfig;
 import org.apache.geode.distributed.internal.DistributionMessage;
 import org.apache.geode.distributed.internal.membership.InternalDistributedMember;
 import org.apache.geode.distributed.internal.membership.NetView;
@@ -198,6 +199,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
   private DMStats stats;
 
   /**
+   * Interval to run the Monitor task
+   */
+  private long monitorInterval;
+
+  /**
+   * /**
    * this class is to avoid garbage
    */
   private static class TimeStamp {
@@ -226,6 +233,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
    * member in last interval(member-timeout)
    */
   private class Monitor implements Runnable {
+    /**
+     * Here we use the same threshold for detecting JVM pauses as the StatSampler
+     */
+    private final long MONITOR_DELAY_THRESHOLD =
+        Long.getLong(DistributionConfig.GEMFIRE_PREFIX + "statSamplerDelayThreshold", 3000);
+
 
     final long memberTimeoutInMillis;
 
@@ -246,6 +259,26 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       // this is the start of interval to record member activity
       GMSHealthMonitor.this.currentTimeStamp = currentTime;
 
+
+      long oldTimeStamp = currentTimeStamp;
+      currentTimeStamp = System.currentTimeMillis();
+
+      NetView myView = GMSHealthMonitor.this.currentView;
+      if (myView == null) {
+        return;
+      }
+
+      if (currentTimeStamp - oldTimeStamp > monitorInterval + MONITOR_DELAY_THRESHOLD) {
+        // delay in running this task - don't suspect anyone for a while
+        logger.info(
+            "Failure detector has noticed a JVM pause and is giving all members a heartbeat in view {}",
+            currentView);
+        for (InternalDistributedMember member : myView.getMembers()) {
+          contactedBy(member);
+        }
+        return;
+      }
+
       if (neighbour != null) {
         TimeStamp nextNeighborTS;
         synchronized (GMSHealthMonitor.this) {
@@ -629,8 +662,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
     scheduler = LoggingExecutors.newScheduledThreadPool("Geode Failure Detection Scheduler", 1);
     checkExecutor = LoggingExecutors.newCachedThreadPool("Geode Failure Detection thread ", true);
     Monitor m = this.new Monitor(memberTimeout);
-    long delay = memberTimeout / LOGICAL_INTERVAL;
-    monitorFuture = scheduler.scheduleAtFixedRate(m, delay, delay, TimeUnit.MILLISECONDS);
+    monitorInterval = memberTimeout / LOGICAL_INTERVAL;
+    monitorFuture =
+        scheduler.scheduleAtFixedRate(m, monitorInterval, monitorInterval, TimeUnit.MILLISECONDS);
     serverSocketExecutor =
         LoggingExecutors.newCachedThreadPool("Geode Failure Detection Server thread ", true);
   }