You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2019/01/17 22:56:42 UTC

[geode] branch develop updated: GEODE-6244 Healthy member kicked out by sick member

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/develop by this push:
     new f4b8cf2  GEODE-6244 Healthy member kicked out by sick member
f4b8cf2 is described below

commit f4b8cf2f8dbcb98b541b24238b50b4066ff136a8
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Thu Jan 17 14:52:23 2019 -0800

    GEODE-6244 Healthy member kicked out by sick member
    
    - do not allow membership manager suspect initiation to kick out a
    member on the first failed check
    - perform a self-health check before sending SuspectRequest messages
    - consider members who have sent shutdown messages as gone when
    performing "should I become coordinator" checks in GMSHealthMonitor
    - modified the membership view installed by GMSJoinLeave to be immutable
    so it isn't inadvertently changes
    
    Squashed commit of the following:
    
    commit 44f37c38d3b42f1ec7b1c440cae234b3fc123955
    Author: Bruce Schuchardt <bs...@pivotal.io>
    Date:   Thu Jan 17 14:28:25 2019 -0800
    
        fixes for regression failures
    
    commit 320e98f85f2e16ea5a48dc316aeb81094b7cfd8d
    Author: Bruce Schuchardt <bs...@pivotal.io>
    Date:   Fri Jan 4 14:42:03 2019 -0800
    
        fix for failing unit tests & a lgtm warning
    
    commit 144f94335042fa8d879413edefe48aa02abb7cb3
    Author: Bruce Schuchardt <bs...@pivotal.io>
    Date:   Fri Jan 4 10:36:22 2019 -0800
    
        fixes for unit test hang
    
        - remove suspect from members-in-final-check collection and initiate
        both remote and local suspect processing
        - renamed "final check" to "availability check" since it isn't
        necessarily a "final" check
        - perform a self-check before telling others to check a suspect
    
    commit fb3dfd00477cc48fb2d4dd85fe1ec532ed68f82b
    Author: Bruce Schuchardt <bs...@pivotal.io>
    Date:   Thu Jan 3 14:22:51 2019 -0800
    
        leave the member unsuspected after final check fails
    
        If the final check fails and we're not going to remove the suspect from
        the distributed system we need to leave it in an "unsuspected" state
        locally so that the background monitoring thread will look at it again.
    
        Also, if the final check failed in the membership coordinator there's no
        point in doing another check so we move directly to removing the
        suspect.
    
    commit 25134b19e2a324ff04c3a3d1139bafe641031729
    Author: Bruce Schuchardt <bs...@pivotal.io>
    Date:   Thu Jan 3 12:49:56 2019 -0800
    
        GEODE-6244 Healthy member kicked out by Sick member
    
        GMSMembershipManager.verifyMember() should not initiate direct removal
        of the target member if an availability check fails.  Instead it should
        initiate suspect processing.
    
        This adds new unit tests for GMSHealthMonitor.checkIfAvailable() and
        changes the availability check to initiate suspect processing if the
        check fails.
---
 .../cache/query/dunit/QueryUsingPoolDUnitTest.java |  1 +
 .../gms/fd/GMSHealthMonitorJUnitTest.java          | 35 ++++++++++-
 .../distributed/internal/membership/NetView.java   | 34 +++++-----
 .../membership/gms/fd/GMSHealthMonitor.java        | 72 ++++++++++++++++------
 .../membership/gms/membership/GMSJoinLeave.java    |  3 +-
 .../membership/gms/mgr/GMSMembershipManager.java   |  2 +-
 .../tier/sockets/command/ExecuteFunction66.java    |  2 +-
 7 files changed, 109 insertions(+), 40 deletions(-)

diff --git a/geode-core/src/distributedTest/java/org/apache/geode/cache/query/dunit/QueryUsingPoolDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/cache/query/dunit/QueryUsingPoolDUnitTest.java
index ab2423a..f1e3ace 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/cache/query/dunit/QueryUsingPoolDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/cache/query/dunit/QueryUsingPoolDUnitTest.java
@@ -105,6 +105,7 @@ public class QueryUsingPoolDUnitTest extends JUnit4CacheTestCase {
     disconnectAllFromDS();
     IgnoredException.addIgnoredException("Connection reset");
     IgnoredException.addIgnoredException("Socket input is shutdown");
+    IgnoredException.addIgnoredException("Connection refused");
   }
 
   @Override
diff --git a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index 8001aed..f8f8f7f 100644
--- a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -150,7 +150,7 @@ public class GMSHealthMonitorJUnitTest {
     when(stopper.isCancelInProgress()).thenReturn(false);
 
     if (mockMembers == null) {
-      mockMembers = new ArrayList<InternalDistributedMember>();
+      mockMembers = new ArrayList<>();
       for (int i = 0; i < 7; i++) {
         InternalDistributedMember mbr = new InternalDistributedMember("localhost", 8888 + i);
 
@@ -615,6 +615,39 @@ public class GMSHealthMonitorJUnitTest {
     assertTrue(gmsHealthMonitor.isSuspectMember(memberToCheck));
   }
 
+  /**
+   * a failed availablility check should initiate suspect processing
+   */
+  @Test
+  public void testFailedCheckIfAvailableDoesNotRemoveMember() {
+    NetView v = installAView();
+
+    setFailureDetectionPorts(v);
+
+    InternalDistributedMember memberToCheck = gmsHealthMonitor.getNextNeighbor();
+    boolean available = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not responding", false);
+    assertFalse(available);
+    verify(joinLeave, never()).remove(isA(InternalDistributedMember.class), isA(String.class));
+    assertFalse(gmsHealthMonitor.isSuspectMember(memberToCheck));
+  }
+
+
+  /**
+   * Same test as above but with request to initiate removal
+   */
+  @Test
+  public void testFailedCheckIfAvailableRemovesMember() {
+    NetView v = installAView();
+
+    setFailureDetectionPorts(v);
+
+    InternalDistributedMember memberToCheck = gmsHealthMonitor.getNextNeighbor();
+    boolean available = gmsHealthMonitor.checkIfAvailable(memberToCheck, "Not responding", true);
+    assertFalse(available);
+    verify(joinLeave).remove(isA(InternalDistributedMember.class), isA(String.class));
+  }
+
+
 
   @Test
   public void testShutdown() {
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/NetView.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/NetView.java
index c9ed332..6e304e7 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/NetView.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/NetView.java
@@ -512,35 +512,23 @@ public class NetView implements DataSerializableFixedID {
     InternalDistributedMember lead = getLeadMember();
 
     StringBuilder sb = new StringBuilder(200);
-    sb.append("View[").append(creator).append('|').append(viewId).append("] members: [");
-    boolean first = true;
+    sb.append("View[").append(creator).append('|').append(viewId).append("]\nmembers: [");
     for (InternalDistributedMember mbr : this.members) {
-      if (!first)
-        sb.append(", ");
-      sb.append(mbr);
+      sb.append("\n").append(mbr);
       if (mbr == lead) {
         sb.append("{lead}");
       }
-      first = false;
     }
     if (!this.shutdownMembers.isEmpty()) {
-      sb.append("]  shutdown: [");
-      first = true;
+      sb.append("]\nshutdown: [");
       for (InternalDistributedMember mbr : this.shutdownMembers) {
-        if (!first)
-          sb.append(", ");
-        sb.append(mbr);
-        first = false;
+        sb.append("\n").append(mbr);
       }
     }
     if (!this.crashedMembers.isEmpty()) {
-      sb.append("]  crashed: [");
-      first = true;
+      sb.append("]\ncrashed: [");
       for (InternalDistributedMember mbr : this.crashedMembers) {
-        if (!first)
-          sb.append(", ");
-        sb.append(mbr);
-        first = false;
+        sb.append("\n").append(mbr);
       }
     }
     // sb.append("] fd ports: [");
@@ -640,4 +628,14 @@ public class NetView implements DataSerializableFixedID {
   public int getDSFID() {
     return NETVIEW;
   }
+
+
+  /**
+   * This will alter the NetView to throw exceptions if an attempt is made to
+   * add or remove members from the view. Do this to avoid concurrent
+   * modification exceptions.
+   */
+  public void makeImmutable() {
+    this.members = Collections.unmodifiableList(members);
+  }
 }
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index 45bd9c1..cf6e9e5 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -437,10 +437,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
     if (services.getJoinLeave().isMemberLeaving(mbr)) {
       return;
     }
-    SuspectRequest sr = new SuspectRequest(mbr, reason);
-    List<SuspectRequest> sl = new ArrayList<>();
-    sl.add(sr);
-    sendSuspectRequest(sl);
+    sendSuspectRequest(Collections.singletonList(new SuspectRequest(mbr, reason)));
   }
 
   /**
@@ -582,7 +579,8 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
         return false;
       }
     } catch (SocketTimeoutException e) {
-      logger.debug("Final check TCP/IP connection timed out for suspect member {}", suspectMember);
+      logger.debug("Availability check TCP/IP connection timed out for suspect member {}",
+          suspectMember);
       return false;
     } catch (IOException e) {
       logger.trace("Unexpected exception", e);
@@ -1104,6 +1102,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
 
     NetView cv = currentView;
 
+    logger.info("Received suspect message {} with current view {}", incomingRequest,
+        cv == null ? "<no view>" : cv.getViewId());
+
     if (cv == null) {
       return;
     }
@@ -1138,7 +1139,9 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       }
     }
 
-    logger.debug("Processing suspect requests {}", suspectRequests);
+    logger.debug(
+        "Processing suspect requests {}\nproposed view is currently {}\nwith coordinator {}",
+        suspectRequests, cv, cv.getCoordinator());
     if (cv.getCoordinator().equals(localAddress)) {
       // This process is the membership coordinator and should perform a final check
       logSuspectRequests(incomingRequest, sender);
@@ -1153,13 +1156,26 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       synchronized (suspectRequestsInView) {
         recordSuspectRequests(suspectRequests, cv);
         Set<SuspectRequest> suspectsInView = suspectRequestsInView.get(cv);
-        logger.debug("Current suspects for view #{} are {}", cv.getViewId(), suspectsInView);
+        logger.info("Current suspects are {}", suspectsInView);
         for (final SuspectRequest sr : suspectsInView) {
           check.remove(sr.getSuspectMember());
           membersToCheck.add(sr);
         }
       }
-      logger.trace("Trial view with suspects removed is {}\nmy address is {}", check, localAddress);
+      List membersLeaving = new ArrayList();
+      for (InternalDistributedMember member : cv.getMembers()) {
+        if (services.getJoinLeave().isMemberLeaving(member)) {
+          membersLeaving.add(member);
+        }
+      }
+      if (!membersLeaving.isEmpty()) {
+        logger.info("Current leave requests are {}", membersLeaving);
+        check.removeAll(membersLeaving);
+      }
+      logger.info(
+          "Proposed view with suspects & leaving members removed is {}\nwith coordinator {}\nmy address is {}",
+          check,
+          check.getCoordinator(), localAddress);
 
       InternalDistributedMember coordinator = check.getCoordinator();
       if (coordinator != null && coordinator.equals(localAddress)) {
@@ -1217,7 +1233,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       }
 
       final String reason = sr.getReason();
-      logger.debug("Scheduling final check for member {}; reason={}", mbr, reason);
+      logger.debug("Scheduling availability check for member {}; reason={}", mbr, reason);
       // its a coordinator
       checkExecutor.execute(() -> {
         try {
@@ -1232,7 +1248,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
   }
 
   protected boolean inlineCheckIfAvailable(final InternalDistributedMember initiator,
-      final NetView cv, boolean initiateRemoval, final InternalDistributedMember mbr,
+      final NetView cv, boolean forceRemovalIfCheckFails, final InternalDistributedMember mbr,
       final String reason) {
 
     if (services.getJoinLeave().isMemberLeaving(mbr)) {
@@ -1241,7 +1257,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
 
     boolean failed = false;
 
-    logger.info("Performing final check for suspect member {} reason={}", mbr, reason);
+    logger.info("Performing availability check for suspect member {} reason={}", mbr, reason);
     membersInFinalCheck.add(mbr);
     setNextNeighbor(currentView, mbr);
 
@@ -1278,17 +1294,36 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       if (!pinged && !isStopping) {
         TimeStamp ts = memberTimeStamps.get(mbr);
         if (ts == null || ts.getTime() < startTime) {
-          logger.info("Final check failed for member {}", mbr);
-          if (initiateRemoval) {
+          logger.info("Availability check failed for member {}", mbr);
+          // if the final check fails & this VM is the coordinator we don't need to do another final
+          // check
+          if (forceRemovalIfCheckFails) {
             logger.info("Requesting removal of suspect member {}", mbr);
             services.getJoinLeave().remove(mbr, reason);
+            // make sure it is still suspected
+            memberSuspected(localAddress, mbr, reason);
+          } else {
+            // if this node can survive an availability check then initiate suspicion about
+            // the node that failed the availability check
+            if (doTCPCheckMember(localAddress, this.socketPort)) {
+              membersInFinalCheck.remove(mbr);
+              // tell peers about this member and then perform another availability check
+              memberSuspected(localAddress, mbr, reason);
+              initiateSuspicion(mbr, reason);
+              SuspectMembersMessage suspectMembersMessage =
+                  new SuspectMembersMessage(Collections.singletonList(localAddress),
+                      Collections
+                          .singletonList(new SuspectRequest(mbr, "failed availability check")));
+              suspectMembersMessage.setSender(localAddress);
+              logger.info("Performing local processing on suspect request");
+              processSuspectMembersRequest(suspectMembersMessage);
+            }
           }
-          // make sure it is still suspected
-          memberSuspected(localAddress, mbr, reason);
           failed = true;
         } else {
           logger.info(
-              "Final check failed but detected recent message traffic for suspect member " + mbr);
+              "Availability check failed but detected recent message traffic for suspect member "
+                  + mbr);
         }
       }
 
@@ -1300,7 +1335,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
           services.getMessenger().send(message);
         }
 
-        logger.info("Final check passed for suspect member " + mbr);
+        logger.info("Availability check passed for suspect member " + mbr);
       }
     } finally {
       if (!failed) {
@@ -1338,6 +1373,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
       recipients = currentView.getMembers();
     }
 
+    logger.info("Sending suspect messages to {}", recipients);
     SuspectMembersMessage smm = new SuspectMembersMessage(recipients, requests);
     Set<InternalDistributedMember> failedRecipients;
     try {
@@ -1348,7 +1384,7 @@ public class GMSHealthMonitor implements HealthMonitor, MessageHandler {
     }
 
     if (failedRecipients != null && failedRecipients.size() > 0) {
-      logger.info("Unable to send suspect message to {}", recipients);
+      logger.info("Unable to send suspect message to {}", failedRecipients);
     }
   }
 
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 5693fbb..39c03cb 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -923,7 +923,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
         getViewCreator().markViewCreatorForShutdown();
         this.isCoordinator = false;
       }
-      installView(view);
+      installView(new NetView(view, view.getViewId()));
     }
 
     if (recips.isEmpty()) {
@@ -1397,6 +1397,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       }
 
       logger.info("received new view: {}\nold view is: {}", newView, currentView);
+      newView.makeImmutable();
 
       if (currentView == null && !this.isJoined) {
         boolean found = false;
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
index 6ccfcb8..fedbe58 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/mgr/GMSMembershipManager.java
@@ -1671,7 +1671,7 @@ public class GMSMembershipManager implements MembershipManager, Manager {
   @Override
   public boolean verifyMember(DistributedMember mbr, String reason) {
     return mbr != null && memberExists(mbr)
-        && this.services.getHealthMonitor().checkIfAvailable(mbr, reason, true);
+        && this.services.getHealthMonitor().checkIfAvailable(mbr, reason, false);
   }
 
   /**
diff --git a/geode-core/src/main/java/org/apache/geode/internal/cache/tier/sockets/command/ExecuteFunction66.java b/geode-core/src/main/java/org/apache/geode/internal/cache/tier/sockets/command/ExecuteFunction66.java
index 20d95bd..9a04143 100644
--- a/geode-core/src/main/java/org/apache/geode/internal/cache/tier/sockets/command/ExecuteFunction66.java
+++ b/geode-core/src/main/java/org/apache/geode/internal/cache/tier/sockets/command/ExecuteFunction66.java
@@ -164,7 +164,7 @@ public class ExecuteFunction66 extends BaseCommand {
         functionObject = internalFunctionExecutionService.getFunction((String) function);
         if (functionObject == null) {
           String message = String
-              .format("Function named %s is not registered to FunctionService for %s", function);
+              .format("Function named %s is not registered to FunctionService", function);
           logger.warn("{}: {}", serverConnection.getName(), message);
           sendError(hasResult, clientMessage, message, serverConnection);
           return;