You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/05/22 21:45:20 UTC

[geode] 01/01: GEODE-6732 GMSHealthMonitor reports member is not available when self-health check fails

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6732
in repository https://gitbox.apache.org/repos/asf/geode.git

commit c65b63303d297c502d9ae52d1cace1c5dceae15f
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed May 22 14:42:37 2019 -0700

    GEODE-6732 GMSHealthMonitor reports member is not available when self-health check fails
    
    If a self-health check fails we should give a member that's under
    suspicion a break and stop suspecting it for the moment.  If it's really
    not there anymore a subsequent check will happen and that will resolve
    the issue.
---
 .../gms/fd/GMSHealthMonitorJUnitTest.java          | 23 ++++++++++++++++++++++
 .../membership/gms/fd/GMSHealthMonitor.java        | 13 +++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index 071b750..5dd04ab 100644
--- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -50,8 +50,10 @@ import java.net.Socket;
 import java.net.SocketAddress;
 import java.net.UnknownHostException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Properties;
+import java.util.Set;
 import java.util.Timer;
 import java.util.concurrent.locks.Condition;
 import java.util.concurrent.locks.Lock;
@@ -617,6 +619,25 @@ public class GMSHealthMonitorJUnitTest {
     assertTrue(gmsHealthMonitor.isSuspectMember(memberToCheck));
   }
 
+  @Test
+  public void testFailedSelfCheckRemovesMemberAsSuspect() {
+    useGMSHealthMonitorTestClass = true;
+    simulateHeartbeatInGMSHealthMonitorTestClass = false;
+    NetView v = installAView();
+
+    setFailureDetectionPorts(v);
+
+    InternalDistributedMember memberToCheck = gmsHealthMonitor.getNextNeighbor();
+    gmsHealthMonitor.stopServer();
+    boolean available = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not responding", false);
+    assertTrue(available);
+    verify(joinLeave, never()).remove(isA(InternalDistributedMember.class), isA(String.class));
+    assertTrue(((GMSHealthMonitorTest) gmsHealthMonitor).availabilityCheckedMembers
+        .contains(memberToCheck));
+    assertTrue(((GMSHealthMonitorTest) gmsHealthMonitor).availabilityCheckedMembers
+        .contains(joinLeave.getMemberID()));
+  }
+
   /**
    * a failed availablility check should initiate suspect processing
    */
@@ -901,10 +922,12 @@ public class GMSHealthMonitorJUnitTest {
 
   public class GMSHealthMonitorTest extends GMSHealthMonitor {
     public boolean useBlockingSocket = false;
+    public Set<InternalDistributedMember> availabilityCheckedMembers = new HashSet<>();
 
     @Override
     boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
         boolean retryIfConnectFails) {
+      availabilityCheckedMembers.add(suspectMember);
       if (useGMSHealthMonitorTestClass) {
         if (simulateHeartbeatInGMSHealthMonitorTestClass) {
           HeartbeatMessage fakeHeartbeat = new HeartbeatMessage();
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 43da869..4976d5d 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -925,6 +925,10 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       checkExecutor.shutdown();
     }
 
+    stopServer();
+  }
+
+  void stopServer() {
     if (serverSocketExecutor != null) {
       if (serverSocket != null && !serverSocket.isClosed()) {
         try {
@@ -1321,9 +1325,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
             // make sure it is still suspected
             memberSuspected(localAddress, mbr, reason);
           } else {
+            failed = true;
             // if this node can survive an availability check then initiate suspicion about
             // the node that failed the availability check
+            logger.info("BRUCE: invoking self-check on {}", localAddress);
             if (doTCPCheckMember(localAddress, this.socketPort, false)) {
+              logger.info("BRUCE: self-check passed");
               membersInFinalCheck.remove(mbr);
               // tell peers about this member and then perform another availability check
               memberSuspected(localAddress, mbr, reason);
@@ -1335,9 +1342,13 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
               suspectMembersMessage.setSender(localAddress);
               logger.debug("Performing local processing on suspect request");
               processSuspectMembersRequest(suspectMembersMessage);
+            } else {
+              logger.info(
+                  "Self-check for availability failed - will not continue to suspect {} for now",
+                  mbr);
+              failed = false;
             }
           }
-          failed = true;
         } else {
           logger.info(
               "Availability check failed but detected recent message traffic for suspect member "