You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/05/22 21:45:19 UTC

[geode] branch feature/GEODE-6732 created (now c65b633)

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a change to branch feature/GEODE-6732
in repository https://gitbox.apache.org/repos/asf/geode.git.


      at c65b633  GEODE-6732 GMSHealthMonitor reports member is not available when self-health check fails

This branch includes the following new commits:

     new c65b633  GEODE-6732 GMSHealthMonitor reports member is not available when self-health check fails

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[geode] 01/01: GEODE-6732 GMSHealthMonitor reports member is not available when self-health check fails

Posted by bs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6732
in repository https://gitbox.apache.org/repos/asf/geode.git

commit c65b63303d297c502d9ae52d1cace1c5dceae15f
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed May 22 14:42:37 2019 -0700

    GEODE-6732 GMSHealthMonitor reports member is not available when self-health check fails
    
    If a self-health check fails, we should give a member that's under
    suspicion a break and stop suspecting it for the moment.  If it's really
    not there anymore, a subsequent check will happen and that will resolve
    the issue.
---
 .../gms/fd/GMSHealthMonitorJUnitTest.java          | 23 ++++++++++++++++++++++
 .../membership/gms/fd/GMSHealthMonitor.java        | 13 +++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index 071b750..5dd04ab 100644
--- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -50,8 +50,10 @@ import java.net.Socket;
 import java.net.SocketAddress;
 import java.net.UnknownHostException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Properties;
+import java.util.Set;
 import java.util.Timer;
 import java.util.concurrent.locks.Condition;
 import java.util.concurrent.locks.Lock;
@@ -617,6 +619,25 @@ public class GMSHealthMonitorJUnitTest {
     assertTrue(gmsHealthMonitor.isSuspectMember(memberToCheck));
   }
 
+  @Test
+  public void testFailedSelfCheckRemovesMemberAsSuspect() {
+    useGMSHealthMonitorTestClass = true;
+    simulateHeartbeatInGMSHealthMonitorTestClass = false;
+    NetView v = installAView();
+
+    setFailureDetectionPorts(v);
+
+    InternalDistributedMember memberToCheck = gmsHealthMonitor.getNextNeighbor();
+    gmsHealthMonitor.stopServer();
+    boolean available = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not responding", false);
+    assertTrue(available);
+    verify(joinLeave, never()).remove(isA(InternalDistributedMember.class), isA(String.class));
+    assertTrue(((GMSHealthMonitorTest) gmsHealthMonitor).availabilityCheckedMembers
+        .contains(memberToCheck));
+    assertTrue(((GMSHealthMonitorTest) gmsHealthMonitor).availabilityCheckedMembers
+        .contains(joinLeave.getMemberID()));
+  }
+
   /**
    * a failed availablility check should initiate suspect processing
    */
@@ -901,10 +922,12 @@ public class GMSHealthMonitorJUnitTest {
 
   public class GMSHealthMonitorTest extends GMSHealthMonitor {
     public boolean useBlockingSocket = false;
+    public Set<InternalDistributedMember> availabilityCheckedMembers = new HashSet<>();
 
     @Override
     boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
         boolean retryIfConnectFails) {
+      availabilityCheckedMembers.add(suspectMember);
       if (useGMSHealthMonitorTestClass) {
         if (simulateHeartbeatInGMSHealthMonitorTestClass) {
           HeartbeatMessage fakeHeartbeat = new HeartbeatMessage();
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 43da869..4976d5d 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -925,6 +925,10 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       checkExecutor.shutdown();
     }
 
+    stopServer();
+  }
+
+  void stopServer() {
     if (serverSocketExecutor != null) {
       if (serverSocket != null && !serverSocket.isClosed()) {
         try {
@@ -1321,9 +1325,12 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
             // make sure it is still suspected
             memberSuspected(localAddress, mbr, reason);
           } else {
+            failed = true;
             // if this node can survive an availability check then initiate suspicion about
             // the node that failed the availability check
+            logger.info("BRUCE: invoking self-check on {}", localAddress);
             if (doTCPCheckMember(localAddress, this.socketPort, false)) {
+              logger.info("BRUCE: self-check passed");
               membersInFinalCheck.remove(mbr);
               // tell peers about this member and then perform another availability check
               memberSuspected(localAddress, mbr, reason);
@@ -1335,9 +1342,13 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
               suspectMembersMessage.setSender(localAddress);
               logger.debug("Performing local processing on suspect request");
               processSuspectMembersRequest(suspectMembersMessage);
+            } else {
+              logger.info(
+                  "Self-check for availability failed - will not continue to suspect {} for now",
+                  mbr);
+              failed = false;
             }
           }
-          failed = true;
         } else {
           logger.info(
               "Availability check failed but detected recent message traffic for suspect member "