You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2015/09/09 23:19:36 UTC
incubator-geode git commit: GEODE-77 reducing aggressiveness of the
health monitor
Repository: incubator-geode
Updated Branches:
refs/heads/feature/GEODE-77 299966d7a -> 36b81da23
GEODE-77 reducing aggressiveness of the health monitor
The health monitor was timing out a peer after not seeing a message from
it in 100ms. HA testing has shown that this is far too aggressive, with
most test runs dying a quick and tragic death with all members trying
to kick each other out of the distributed system.
This commit reverts to our old way of waiting member-timeout ms before
giving up on a peer, and adds a final check of the member activity map
before allowing a suspect message to be sent to the coordinator.
Project: http://git-wip-us.apache.org/repos/asf/incubator-geode/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-geode/commit/36b81da2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-geode/tree/36b81da2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-geode/diff/36b81da2
Branch: refs/heads/feature/GEODE-77
Commit: 36b81da23e8e6ceeedd6b5189f54a9c3af226695
Parents: 299966d
Author: Bruce Schuchardt <bs...@pivotal.io>
Authored: Wed Sep 9 14:19:02 2015 -0700
Committer: Bruce Schuchardt <bs...@pivotal.io>
Committed: Wed Sep 9 14:19:02 2015 -0700
----------------------------------------------------------------------
.../internal/InternalDistributedSystem.java | 9 ++++-----
.../membership/gms/fd/GMSHealthMonitor.java | 16 ++++++++--------
2 files changed, 12 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/36b81da2/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java
index 8a4e20a..3ed9607 100644
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java
@@ -2573,12 +2573,11 @@ public final class InternalDistributedSystem
// logger.info("reconnecting IDS@"+System.identityHashCode(this));
- if (Thread.currentThread().getName().equals("CloserThread")) {
- if (isDebugEnabled) {
- logger.debug("changing thread name to ReconnectThread"); // wha?! really?
- }
- Thread.currentThread().setName("ReconnectThread");
+ if (isDebugEnabled) {
+ logger.debug("changing thread name to ReconnectThread");
}
+ Thread.currentThread().setName("ReconnectThread");
+ Thread.currentThread().setDaemon(false);
// get the membership manager for quorum checks
MembershipManager mbrMgr = this.dm.getMembershipManager();
http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/36b81da2/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
----------------------------------------------------------------------
diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index c6f1e02..d0822d0 100755
--- a/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/gemfire-core/src/main/java/com/gemstone/gemfire/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -81,13 +81,6 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
/** stall time to wait for members leaving concurrently */
public static final long MEMBER_SUSPECT_COLLECTION_INTERVAL = Long.getLong("geode.suspect-member-collection-interval", 200);
- /**
- * If member don't see any activity from particular member then it sends check request
- * to that member. And then it waits for "geode.member-check-timeout" time for response
- * from it. If that member doesn't respond then it issues suspect request for it.
- */
- private static final long MEMBER_CHECK_TIMEOUT = Long.getLong("geode.member-check-timeout", 100);
-
volatile long currentTimeStamp;
final private Map<InternalDistributedMember, CustomTimeStamp> memberVsLastMsgTS = new ConcurrentHashMap<>();
@@ -270,9 +263,16 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
} else {
synchronized (pingResp) {
if (pingResp.getResponseMsg() == null) {
- pingResp.wait(MEMBER_CHECK_TIMEOUT);
+ pingResp.wait(services.getConfig().getMemberTimeout());
}
if (pingResp.getResponseMsg() == null) {
+ // double check the activity log
+ CustomTimeStamp ts = memberVsLastMsgTS.get(pingMember);
+ if (ts != null &&
+ ts.getTimeStamp()
+ > (System.currentTimeMillis() - services.getConfig().getMemberTimeout())) {
+ return true;
+ }
return false;
} else {
return true;