You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/05/22 21:45:20 UTC
[geode] 01/01: GEODE-6732 GMSHealthMonitor reports member is not
available when self-health check fails
This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch feature/GEODE-6732
in repository https://gitbox.apache.org/repos/asf/geode.git
commit c65b63303d297c502d9ae52d1cace1c5dceae15f
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed May 22 14:42:37 2019 -0700
GEODE-6732 GMSHealthMonitor reports member is not available when self-health check fails
If a self-health check fails we should give a member that's under
suspicion a break and stop suspecting it for the moment. If it's really
not there anymore a subsequent check will happen and that will resolve
the issue.
---
.../gms/fd/GMSHealthMonitorJUnitTest.java | 23 ++++++++++++++++++++++
.../membership/gms/fd/GMSHealthMonitor.java | 13 +++++++++++-
2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index 071b750..5dd04ab 100644
--- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -50,8 +50,10 @@ import java.net.Socket;
import java.net.SocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
import java.util.Properties;
+import java.util.Set;
import java.util.Timer;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
@@ -617,6 +619,25 @@ public class GMSHealthMonitorJUnitTest {
assertTrue(gmsHealthMonitor.isSuspectMember(memberToCheck));
}
+ @Test
+ public void testFailedSelfCheckRemovesMemberAsSuspect() {
+ useGMSHealthMonitorTestClass = true;
+ simulateHeartbeatInGMSHealthMonitorTestClass = false;
+ NetView v = installAView();
+
+ setFailureDetectionPorts(v);
+
+ InternalDistributedMember memberToCheck = gmsHealthMonitor.getNextNeighbor();
+ gmsHealthMonitor.stopServer();
+ boolean available = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not responding", false);
+ assertTrue(available);
+ verify(joinLeave, never()).remove(isA(InternalDistributedMember.class), isA(String.class));
+ assertTrue(((GMSHealthMonitorTest) gmsHealthMonitor).availabilityCheckedMembers
+ .contains(memberToCheck));
+ assertTrue(((GMSHealthMonitorTest) gmsHealthMonitor).availabilityCheckedMembers
+ .contains(joinLeave.getMemberID()));
+ }
+
/**
* a failed availablility check should initiate suspect processing
*/
@@ -901,10 +922,12 @@ public class GMSHealthMonitorJUnitTest {
public class GMSHealthMonitorTest extends GMSHealthMonitor {
public boolean useBlockingSocket = false;
+ public Set<InternalDistributedMember> availabilityCheckedMembers = new HashSet<>();
@Override
boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
boolean retryIfConnectFails) {
+ availabilityCheckedMembers.add(suspectMember);
if (useGMSHealthMonitorTestClass) {
if (simulateHeartbeatInGMSHealthMonitorTestClass) {
HeartbeatMessage fakeHeartbeat = new HeartbeatMessage();
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 43da869..4976d5d 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -925,6 +925,10 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
checkExecutor.shutdown();
}
+ stopServer();
+ }
+
+ void stopServer() {
if (serverSocketExecutor != null) {
if (serverSocket != null && !serverSocket.isClosed()) {
try {
@@ -1321,9 +1325,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
// make sure it is still suspected
memberSuspected(localAddress, mbr, reason);
} else {
+ failed = true;
// if this node can survive an availability check then initiate suspicion about
// the node that failed the availability check
+ logger.info("BRUCE: invoking self-check on {}", localAddress);
if (doTCPCheckMember(localAddress, this.socketPort, false)) {
+ logger.info("BRUCE: self-check passed");
membersInFinalCheck.remove(mbr);
// tell peers about this member and then perform another availability check
memberSuspected(localAddress, mbr, reason);
@@ -1335,9 +1342,13 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
suspectMembersMessage.setSender(localAddress);
logger.debug("Performing local processing on suspect request");
processSuspectMembersRequest(suspectMembersMessage);
+ } else {
+ logger.info(
+ "Self-check for availability failed - will not continue to suspect {} for now",
+ mbr);
+ failed = false;
}
}
- failed = true;
} else {
logger.info(
"Availability check failed but detected recent message traffic for suspect member "