You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2020/12/01 23:32:33 UTC
[geode] branch support/1.13 updated: GEODE-8721: member that should
become coordinator never detects loss of current coordinator (#5758)
This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch support/1.13
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/support/1.13 by this push:
new 956c9aa GEODE-8721: member that should become coordinator never detects loss of current coordinator (#5758)
956c9aa is described below
commit 956c9aa66328a8bed14892f16d230f8d4f6c8105
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Mon Nov 30 08:08:58 2020 -0800
GEODE-8721: member that should become coordinator never detects loss of current coordinator (#5758)
* GEODE-8721: member that should become coordinator never detects loss of current coordinator
If a server is in the process of performing an availability check on
another server we shouldn't update the contact timestamp for
the suspected server based on gossip from another server. Doing so
will make the availability check pass and send out another gossip
message that would likewise update their timestamps for the suspected
server, perpetuating the notion that the suspect is still around.
* added VisibleForTesting
(cherry picked from commit b7afc604b9c2fafe4388dcdcf05fc7ec49c0ce86)
---
.../gms/fd/GMSHealthMonitorJUnitTest.java | 51 ++++++++++++++++++++++
.../membership/gms/fd/GMSHealthMonitor.java | 19 ++++++--
2 files changed, 66 insertions(+), 4 deletions(-)
diff --git a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index f734e81..2aaf2f5 100644
--- a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -634,6 +634,29 @@ public class GMSHealthMonitorJUnitTest {
}
@Test
+ public void testFinalCheckInProgressPreemptsLivenessGossip() throws Exception {
+ // if a member is undergoing a final check we should not accept another member's
+ // gossip about the suspect being alive and should not update the contact timestamp
+ // because that interferes with the final check
+ useGMSHealthMonitorTestClass = true;
+ simulateHeartbeatInGMSHealthMonitorTestClass = false;
+
+ GMSMembershipView v = installAView();
+ setFailureDetectionPorts(v);
+
+ MemberIdentifier memberToCheck = gmsHealthMonitor.getNextNeighbor();
+ GMSHealthMonitorTest testMonitor = (GMSHealthMonitorTest) gmsHealthMonitor;
+
+ // set an old contact timestamp for the suspect, tell the monitor that an availability
+ // check succeeded and then make sure it didn't update the timestamp for the suspect
+ final long timestamp = testMonitor.establishCurrentTime() - 2000;
+ testMonitor.setContactTimestamp(memberToCheck, timestamp);
+ testMonitor.addMemberInFinalCheck(memberToCheck);
+ testMonitor.processMessage(new FinalCheckPassedMessage(null, memberToCheck));
+ assertThat(testMonitor.getContactTimestamp(memberToCheck)).isEqualTo(timestamp);
+ }
+
+ @Test
public void testFailedSelfCheckRemovesMemberAsSuspect() throws Exception {
useGMSHealthMonitorTestClass = true;
simulateHeartbeatInGMSHealthMonitorTestClass = false;
@@ -1019,6 +1042,34 @@ public class GMSHealthMonitorJUnitTest {
return serverSocket;
}
}
+
+ /**
+ * when a suspect is undergoing an availability check its identifier will
+ * be in the membersInFinalCheck collection
+ */
+ public void addMemberInFinalCheck(MemberIdentifier memberToCheck) {
+ membersInFinalCheck.add(memberToCheck);
+ }
+
+ public void setContactTimestamp(MemberIdentifier memberToCheck, long timestamp) {
+ memberTimeStamps.put(memberToCheck, new TimeStamp(timestamp));
+ }
+
+ public long getContactTimestamp(MemberIdentifier memberIdentifier) {
+ return ((TimeStamp) memberTimeStamps.get(memberIdentifier)).getTime();
+ }
+
+ /**
+ * Establish the currentTimeStamp for the health monitor. This is the timestamp
+ * used in contactedBy() updates and is usually established by the Monitor thread
+ * in GMSHealthMonitor but is initially zero.
+ *
+ * @return the timestamp
+ */
+ public long establishCurrentTime() {
+ currentTimeStamp = System.currentTimeMillis();
+ return currentTimeStamp;
+ }
}
public class TrickySocket extends ServerSocket {
diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index bd95e57..2590e23 100644
--- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -52,6 +52,7 @@ import java.util.stream.Collectors;
import org.apache.logging.log4j.Logger;
import org.jgroups.util.UUID;
+import org.apache.geode.annotations.VisibleForTesting;
import org.apache.geode.distributed.internal.membership.api.MemberIdentifier;
import org.apache.geode.distributed.internal.membership.api.MemberStartupException;
import org.apache.geode.distributed.internal.membership.api.MembershipClosedException;
@@ -130,7 +131,11 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
public static final long MEMBER_SUSPECT_COLLECTION_INTERVAL =
Long.getLong("geode.suspect-member-collection-interval", 200);
- private volatile long currentTimeStamp;
+ /**
+ * A millisecond clock reading used to mark the last time a peer made contact.
+ */
+ @VisibleForTesting
+ volatile long currentTimeStamp;
/**
* this member's ID
@@ -152,7 +157,8 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
/**
* Members undergoing final checks
*/
- private final List<ID> membersInFinalCheck =
+ @VisibleForTesting
+ final List<ID> membersInFinalCheck =
Collections.synchronizedList(new ArrayList<>(30));
/**
@@ -206,7 +212,7 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
* /**
* this class is to avoid garbage
*/
- private static class TimeStamp {
+ static class TimeStamp {
private volatile long timeStamp;
@@ -257,6 +263,7 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
return;
}
+ // TODO - why are we taking two clock readings and setting currentTimeStamp twice?
long currentTime = System.currentTimeMillis();
// this is the start of interval to record member activity
GMSHealthMonitor.this.currentTimeStamp = currentTime;
@@ -1240,7 +1247,11 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
if (isStopping) {
return;
}
- contactedBy(m.getSuspect());
+ // if we're currently processing a final-check for this member don't artificially update the
+ // timestamp of the member or the final-check will be invalid
+ if (!membersInFinalCheck.contains(m.getSuspect())) {
+ contactedBy(m.getSuspect());
+ }
}