You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/04/02 20:50:03 UTC

[geode] branch feature/GEODE-6423b created (now 251436c)

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a change to branch feature/GEODE-6423b
in repository https://gitbox.apache.org/repos/asf/geode.git.


      at 251436c  GEODE-6423 availability checks sometimes immediately initiate removal

This branch includes the following new commits:

     new 251436c  GEODE-6423 availability checks sometimes immediately initiate removal

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[geode] 01/01: GEODE-6423 availability checks sometimes immediately initiate removal

Posted by bs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch feature/GEODE-6423b
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 251436cfb8618580a331841c8dd7938e48b56a8c
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Tue Apr 2 13:46:37 2019 -0700

    GEODE-6423 availability checks sometimes immediately initiate removal
    
    Do not loop in trying to form a tcp/ip connection to a suspect unless
    the next step is to remove the suspect from membership.  In this case
    there will be another invocation of the same method that will take the
    removal step next.
---
 .../gms/fd/GMSHealthMonitorJUnitTest.java          |  7 +++--
 .../membership/gms/fd/GMSHealthMonitor.java        | 34 ++++++++++------------
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index d80b424..071b750 100644
--- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -817,7 +817,7 @@ public class GMSHealthMonitorJUnitTest {
     InternalDistributedMember otherMember =
         createInternalDistributedMember(Version.CURRENT_ORDINAL, 0, 1, 1);
     long startTime = System.currentTimeMillis();
-    gmsHealthMonitor.doTCPCheckMember(otherMember, mySocket.getLocalPort());
+    gmsHealthMonitor.doTCPCheckMember(otherMember, mySocket.getLocalPort(), true);
     mySocket.close();
     serverThread.interrupt();
     serverThread.join(getTimeout().getValueInMS());
@@ -903,7 +903,8 @@ public class GMSHealthMonitorJUnitTest {
     public boolean useBlockingSocket = false;
 
     @Override
-    boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
+    boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
+        boolean retryIfConnectFails) {
       if (useGMSHealthMonitorTestClass) {
         if (simulateHeartbeatInGMSHealthMonitorTestClass) {
           HeartbeatMessage fakeHeartbeat = new HeartbeatMessage();
@@ -912,7 +913,7 @@ public class GMSHealthMonitorJUnitTest {
         }
         return false;
       }
-      return super.doTCPCheckMember(suspectMember, port);
+      return super.doTCPCheckMember(suspectMember, port, retryIfConnectFails);
     }
 
     @Override
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index d1d19e7..c054782 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -511,7 +511,8 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
    * @param suspectMember member that does not respond to HeartbeatRequestMessage
    * @return true if successfully exchanged PING/PONG with TCP connection, otherwise false.
    */
-  boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
+  boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
+      boolean retryIfConnectFails) {
     Socket clientSocket = null;
     // make sure we try to check on the member for the contracted memberTimeout period
     // in case a timed socket.connect() returns immediately
@@ -555,7 +556,8 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
           // expected
         }
       }
-    } while (!passed && !this.isShutdown() && System.nanoTime() < giveupTime);
+    } while (retryIfConnectFails && !passed && !this.isShutdown()
+        && System.nanoTime() < giveupTime);
     return passed;
   }
 
@@ -690,7 +692,6 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
         if (!ssocket.isClosed()) {
           try {
             ssocket.close();
-            logger.info("GMSHealthMonitor server socket closed.");
           } catch (IOException e) {
             logger.debug("Unexpected exception", e);
           }
@@ -857,7 +858,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       }
       InternalDistributedMember oldNeighbor = nextNeighbor;
       if (oldNeighbor != newNeighbor) {
-        logger.info("Failure detection is now watching " + newNeighbor);
+        logger.debug("Failure detection is now watching " + newNeighbor);
         nextNeighbor = newNeighbor;
       }
     }
@@ -928,7 +929,6 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       if (serverSocket != null && !serverSocket.isClosed()) {
         try {
           serverSocket.close();
-          logger.info("GMSHealthMonitor server socket is closed in stopServices().");
         } catch (IOException e) {
           logger.trace("Unexpected exception", e);
         }
@@ -939,8 +939,6 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       } catch (InterruptedException e) {
         Thread.currentThread().interrupt();
       }
-      logger.info("GMSHealthMonitor serverSocketExecutor is "
-          + (serverSocketExecutor.isTerminated() ? "terminated" : "not terminated"));
     }
   }
 
@@ -1122,9 +1120,6 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
 
     NetView cv = currentView;
 
-    logger.info("Received suspect message {} with current view {}", incomingRequest,
-        cv == null ? "<no view>" : cv.getViewId());
-
     if (cv == null) {
       return;
     }
@@ -1176,7 +1171,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       synchronized (suspectRequestsInView) {
         recordSuspectRequests(suspectRequests, cv);
         Set<SuspectRequest> suspectsInView = suspectRequestsInView.get(cv);
-        logger.info("Current suspects are {}", suspectsInView);
+        logger.debug("Current suspects are {}", suspectsInView);
         for (final SuspectRequest sr : suspectsInView) {
           check.remove(sr.getSuspectMember());
           membersToCheck.add(sr);
@@ -1189,10 +1184,10 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
         }
       }
       if (!membersLeaving.isEmpty()) {
-        logger.info("Current leave requests are {}", membersLeaving);
+        logger.debug("Current leave requests are {}", membersLeaving);
         check.removeAll(membersLeaving);
       }
-      logger.info(
+      logger.trace(
           "Proposed view with suspects & leaving members removed is {}\nwith coordinator {}\nmy address is {}",
           check,
           check.getCoordinator(), localAddress);
@@ -1308,7 +1303,10 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
         // if we will get heartbeat then it will change the timestamp, which we are
         // checking below in case of tcp check failure..
         doCheckMember(mbr, false);
-        pinged = doTCPCheckMember(mbr, port);
+        // now, while waiting for a heartbeat, try connecting to the suspect's failure detection
+        // port
+        final boolean retryIfConnectFailes = forceRemovalIfCheckFails;
+        pinged = doTCPCheckMember(mbr, port, retryIfConnectFailes);
       }
 
       if (!pinged && !isStopping) {
@@ -1325,7 +1323,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
           } else {
             // if this node can survive an availability check then initiate suspicion about
             // the node that failed the availability check
-            if (doTCPCheckMember(localAddress, this.socketPort)) {
+            if (doTCPCheckMember(localAddress, this.socketPort, false)) {
               membersInFinalCheck.remove(mbr);
               // tell peers about this member and then perform another availability check
               memberSuspected(localAddress, mbr, reason);
@@ -1335,7 +1333,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
                       Collections
                           .singletonList(new SuspectRequest(mbr, "failed availability check")));
               suspectMembersMessage.setSender(localAddress);
-              logger.info("Performing local processing on suspect request");
+              logger.debug("Performing local processing on suspect request");
               processSuspectMembersRequest(suspectMembersMessage);
             }
           }
@@ -1398,7 +1396,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       recipients = currentView.getMembers();
     }
 
-    logger.info("Sending suspect messages to {}", recipients);
+    logger.trace("Sending suspect messages to {}", recipients);
     SuspectMembersMessage smm = new SuspectMembersMessage(recipients, requests);
     Set<InternalDistributedMember> failedRecipients;
     try {
@@ -1409,7 +1407,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
     }
 
     if (failedRecipients != null && failedRecipients.size() > 0) {
-      logger.info("Unable to send suspect message to {}", failedRecipients);
+      logger.trace("Unable to send suspect message to {}", failedRecipients);
     }
   }