You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2015/09/09 23:19:36 UTC

incubator-geode git commit: GEODE-77 reducing aggressiveness of the health monitor

Repository: incubator-geode
Updated Branches:
  refs/heads/feature/GEODE-77 299966d7a -> 36b81da23


GEODE-77 reducing aggressiveness of the health monitor

The health monitor was timing out a peer after not seeing a message from
it in 100ms.  HA testing has shown that this is far too aggressive, with
most test runs dying a quick and tragic death with all members trying
to kick each other out of the distributed system.

This commit reverts to our old way of waiting member-timeout ms before
giving up on a peer, and adds a final check of the member activity map
before allowing a suspect message to be sent to the coordinator.


Project: http://git-wip-us.apache.org/repos/asf/incubator-geode/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-geode/commit/36b81da2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-geode/tree/36b81da2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-geode/diff/36b81da2

Branch: refs/heads/feature/GEODE-77
Commit: 36b81da23e8e6ceeedd6b5189f54a9c3af226695
Parents: 299966d
Author: Bruce Schuchardt <bs...@pivotal.io>
Authored: Wed Sep 9 14:19:02 2015 -0700
Committer: Bruce Schuchardt <bs...@pivotal.io>
Committed: Wed Sep 9 14:19:02 2015 -0700

----------------------------------------------------------------------
 .../internal/InternalDistributedSystem.java         |  9 ++++-----
 .../membership/gms/fd/GMSHealthMonitor.java         | 16 ++++++++--------
 2 files changed, 12 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/36b81da2/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java
index 8a4e20a..3ed9607 100644
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java
@@ -2573,12 +2573,11 @@ public final class InternalDistributedSystem
     
 //    logger.info("reconnecting IDS@"+System.identityHashCode(this));
 
-    if (Thread.currentThread().getName().equals("CloserThread")) {
-      if (isDebugEnabled) {
-        logger.debug("changing thread name to ReconnectThread"); // wha?! really?
-      }
-      Thread.currentThread().setName("ReconnectThread");
+    if (isDebugEnabled) {
+      logger.debug("changing thread name to ReconnectThread");
     }
+    Thread.currentThread().setName("ReconnectThread");
+    Thread.currentThread().setDaemon(false);
     
     // get the membership manager for quorum checks
     MembershipManager mbrMgr = this.dm.getMembershipManager();

http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/36b81da2/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index c6f1e02..d0822d0 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -81,13 +81,6 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
   /** stall time to wait for members leaving concurrently */
   public static final long MEMBER_SUSPECT_COLLECTION_INTERVAL = Long.getLong("geode.suspect-member-collection-interval", 200);
 
-  /**
-   * If member don't see any activity from particular member then it sends check request
-   * to that member. And then it waits for "geode.member-check-timeout" time for response
-   * from it. If that member doesn't respond then it issues suspect request for it. 
-   */
-  private static final long MEMBER_CHECK_TIMEOUT = Long.getLong("geode.member-check-timeout", 100);
-
   volatile long currentTimeStamp;
 
   final private Map<InternalDistributedMember, CustomTimeStamp> memberVsLastMsgTS = new ConcurrentHashMap<>();
@@ -270,9 +263,16 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       } else {
         synchronized (pingResp) {
           if (pingResp.getResponseMsg() == null) {
-            pingResp.wait(MEMBER_CHECK_TIMEOUT);
+            pingResp.wait(services.getConfig().getMemberTimeout());
           }
           if (pingResp.getResponseMsg() == null) {
+            // double check the activity log
+            CustomTimeStamp ts = memberVsLastMsgTS.get(pingMember);
+            if (ts != null &&
+                ts.getTimeStamp()
+                  > (System.currentTimeMillis() - services.getConfig().getMemberTimeout())) {
+              return true;
+            }
             return false;
           } else {
             return true;