You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/05/15 15:03:23 UTC
[geode] branch feature/GEODE-6583b created (now 5bdfeda)
This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a change to branch feature/GEODE-6583b
in repository https://gitbox.apache.org/repos/asf/geode.git.
at 5bdfeda GEODE-6583 Integrate phi-accrual failure detection into Geode
This branch includes the following new commits:
new 5bdfeda GEODE-6583 Integrate phi-accrual failure detection into Geode
The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
[geode] 01/01: GEODE-6583 Integrate phi-accrual failure detection into Geode
Posted by bs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch feature/GEODE-6583b
in repository https://gitbox.apache.org/repos/asf/geode.git
commit 5bdfeda7e3b4a583fe1afe585c0e4729ab4c9ae2
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed May 15 08:01:36 2019 -0700
GEODE-6583 Integrate phi-accrual failure detection into Geode
give all members a heartbeat if we notice a JVM pause
---
.../membership/gms/fd/GMSHealthMonitor.java | 38 ++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 43da869..63b30b4 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -59,6 +59,7 @@ import org.apache.geode.GemFireConfigException;
import org.apache.geode.SystemConnectException;
import org.apache.geode.distributed.DistributedMember;
import org.apache.geode.distributed.internal.DMStats;
+import org.apache.geode.distributed.internal.DistributionConfig;
import org.apache.geode.distributed.internal.DistributionMessage;
import org.apache.geode.distributed.internal.membership.InternalDistributedMember;
import org.apache.geode.distributed.internal.membership.NetView;
@@ -198,6 +199,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
private DMStats stats;
/**
+ * Interval to run the Monitor task
+ */
+ private long monitorInterval;
+
+ /**
+ * /**
* this class is to avoid garbage
*/
private static class TimeStamp {
@@ -226,6 +233,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
* member in last interval(member-timeout)
*/
private class Monitor implements Runnable {
+ /**
+ * Here we use the same threshold for detecting JVM pauses as the StatSampler
+ */
+ private final long MONITOR_DELAY_THRESHOLD =
+ Long.getLong(DistributionConfig.GEMFIRE_PREFIX + "statSamplerDelayThreshold", 3000);
+
final long memberTimeoutInMillis;
@@ -246,6 +259,26 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
// this is the start of interval to record member activity
GMSHealthMonitor.this.currentTimeStamp = currentTime;
+
+ long oldTimeStamp = currentTimeStamp;
+ currentTimeStamp = System.currentTimeMillis();
+
+ NetView myView = GMSHealthMonitor.this.currentView;
+ if (myView == null) {
+ return;
+ }
+
+ if (currentTimeStamp - oldTimeStamp > monitorInterval + MONITOR_DELAY_THRESHOLD) {
+ // delay in running this task - don't suspect anyone for a while
+ logger.info(
+ "Failure detector has noticed a JVM pause and is giving all members a heartbeat in view {}",
+ currentView);
+ for (InternalDistributedMember member : myView.getMembers()) {
+ contactedBy(member);
+ }
+ return;
+ }
+
if (neighbour != null) {
TimeStamp nextNeighborTS;
synchronized (GMSHealthMonitor.this) {
@@ -629,8 +662,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
scheduler = LoggingExecutors.newScheduledThreadPool("Geode Failure Detection Scheduler", 1);
checkExecutor = LoggingExecutors.newCachedThreadPool("Geode Failure Detection thread ", true);
Monitor m = this.new Monitor(memberTimeout);
- long delay = memberTimeout / LOGICAL_INTERVAL;
- monitorFuture = scheduler.scheduleAtFixedRate(m, delay, delay, TimeUnit.MILLISECONDS);
+ monitorInterval = memberTimeout / LOGICAL_INTERVAL;
+ monitorFuture =
+ scheduler.scheduleAtFixedRate(m, monitorInterval, monitorInterval, TimeUnit.MILLISECONDS);
serverSocketExecutor =
LoggingExecutors.newCachedThreadPool("Geode Failure Detection Server thread ", true);
}