You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/05/15 15:03:24 UTC
[geode] 01/01: GEODE-6583 Integrate phi-accrual failure detection into Geode
This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch feature/GEODE-6583b
in repository https://gitbox.apache.org/repos/asf/geode.git
commit 5bdfeda7e3b4a583fe1afe585c0e4729ab4c9ae2
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed May 15 08:01:36 2019 -0700
GEODE-6583 Integrate phi-accrual failure detection into Geode
Give all members a heartbeat when the failure detector notices a JVM pause, so that no member is suspected because of the local delay.
---
.../membership/gms/fd/GMSHealthMonitor.java | 38 ++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 43da869..63b30b4 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -59,6 +59,7 @@ import org.apache.geode.GemFireConfigException;
import org.apache.geode.SystemConnectException;
import org.apache.geode.distributed.DistributedMember;
import org.apache.geode.distributed.internal.DMStats;
+import org.apache.geode.distributed.internal.DistributionConfig;
import org.apache.geode.distributed.internal.DistributionMessage;
import org.apache.geode.distributed.internal.membership.InternalDistributedMember;
import org.apache.geode.distributed.internal.membership.NetView;
@@ -198,6 +199,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
private DMStats stats;
/**
+ * Interval to run the Monitor task
+ */
+ private long monitorInterval;
+
+ /**
+ * /**
* this class is to avoid garbage
*/
private static class TimeStamp {
@@ -226,6 +233,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
* member in last interval(member-timeout)
*/
private class Monitor implements Runnable {
+ /**
+ * Here we use the same threshold for detecting JVM pauses as the StatSampler
+ */
+ private final long MONITOR_DELAY_THRESHOLD =
+ Long.getLong(DistributionConfig.GEMFIRE_PREFIX + "statSamplerDelayThreshold", 3000);
+
final long memberTimeoutInMillis;
@@ -246,6 +259,26 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
// this is the start of interval to record member activity
GMSHealthMonitor.this.currentTimeStamp = currentTime;
+
+ long oldTimeStamp = currentTimeStamp;
+ currentTimeStamp = System.currentTimeMillis();
+
+ NetView myView = GMSHealthMonitor.this.currentView;
+ if (myView == null) {
+ return;
+ }
+
+ if (currentTimeStamp - oldTimeStamp > monitorInterval + MONITOR_DELAY_THRESHOLD) {
+ // delay in running this task - don't suspect anyone for a while
+ logger.info(
+ "Failure detector has noticed a JVM pause and is giving all members a heartbeat in view {}",
+ currentView);
+ for (InternalDistributedMember member : myView.getMembers()) {
+ contactedBy(member);
+ }
+ return;
+ }
+
if (neighbour != null) {
TimeStamp nextNeighborTS;
synchronized (GMSHealthMonitor.this) {
@@ -629,8 +662,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
scheduler = LoggingExecutors.newScheduledThreadPool("Geode Failure Detection Scheduler", 1);
checkExecutor = LoggingExecutors.newCachedThreadPool("Geode Failure Detection thread ", true);
Monitor m = this.new Monitor(memberTimeout);
- long delay = memberTimeout / LOGICAL_INTERVAL;
- monitorFuture = scheduler.scheduleAtFixedRate(m, delay, delay, TimeUnit.MILLISECONDS);
+ monitorInterval = memberTimeout / LOGICAL_INTERVAL;
+ monitorFuture =
+ scheduler.scheduleAtFixedRate(m, monitorInterval, monitorInterval, TimeUnit.MILLISECONDS);
serverSocketExecutor =
LoggingExecutors.newCachedThreadPool("Geode Failure Detection Server thread ", true);
}