You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2020/12/01 23:32:33 UTC

[geode] branch support/1.13 updated: GEODE-8721: member that should become coordinator never detects loss of current coordinator (#5758)

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch support/1.13
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/support/1.13 by this push:
     new 956c9aa  GEODE-8721: member that should become coordinator never detects loss of current coordinator (#5758)
956c9aa is described below

commit 956c9aa66328a8bed14892f16d230f8d4f6c8105
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Mon Nov 30 08:08:58 2020 -0800

    GEODE-8721: member that should become coordinator never detects loss of current coordinator (#5758)
    
    * GEODE-8721: member that should become coordinator never detects loss of current coordinator
    
    If a server is in the process of performing an availability check on
    another server we shouldn't update the contact timestamp for
    the suspected server based on gossip from another server.  Doing so
    will make the availability check pass and send out another gossip
    message that would likewise update their timestamps for the suspected
    server, perpetuating the notion that the suspect is still around.
    
    * added VisibleForTesting
    
    (cherry picked from commit b7afc604b9c2fafe4388dcdcf05fc7ec49c0ce86)
---
 .../gms/fd/GMSHealthMonitorJUnitTest.java          | 51 ++++++++++++++++++++++
 .../membership/gms/fd/GMSHealthMonitor.java        | 19 ++++++--
 2 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index f734e81..2aaf2f5 100644
--- a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -634,6 +634,29 @@ public class GMSHealthMonitorJUnitTest {
   }
 
   @Test
+  public void testFinalCheckInProgressPreemptsLivenessGossip() throws Exception {
+    // if a member is undergoing a final check we should not accept another member's
+    // gossip about the suspect being alive and should not update the contact timestamp
+    // because that interferes with the final check
+    useGMSHealthMonitorTestClass = true;
+    simulateHeartbeatInGMSHealthMonitorTestClass = false;
+
+    GMSMembershipView v = installAView();
+    setFailureDetectionPorts(v);
+
+    MemberIdentifier memberToCheck = gmsHealthMonitor.getNextNeighbor();
+    GMSHealthMonitorTest testMonitor = (GMSHealthMonitorTest) gmsHealthMonitor;
+
+    // set an old contact timestamp for the suspect, tell the monitor that an availability
+    // check succeeded and then make sure it didn't update the timestamp for the suspect
+    final long timestamp = testMonitor.establishCurrentTime() - 2000;
+    testMonitor.setContactTimestamp(memberToCheck, timestamp);
+    testMonitor.addMemberInFinalCheck(memberToCheck);
+    testMonitor.processMessage(new FinalCheckPassedMessage(null, memberToCheck));
+    assertThat(testMonitor.getContactTimestamp(memberToCheck)).isEqualTo(timestamp);
+  }
+
+  @Test
   public void testFailedSelfCheckRemovesMemberAsSuspect() throws Exception {
     useGMSHealthMonitorTestClass = true;
     simulateHeartbeatInGMSHealthMonitorTestClass = false;
@@ -1019,6 +1042,34 @@ public class GMSHealthMonitorJUnitTest {
         return serverSocket;
       }
     }
+
+    /**
+     * when a suspect is undergoing an availability check its identifier will
+     * be in the membersInFinalCheck collection
+     */
+    public void addMemberInFinalCheck(MemberIdentifier memberToCheck) {
+      membersInFinalCheck.add(memberToCheck);
+    }
+
+    public void setContactTimestamp(MemberIdentifier memberToCheck, long timestamp) {
+      memberTimeStamps.put(memberToCheck, new TimeStamp(timestamp));
+    }
+
+    public long getContactTimestamp(MemberIdentifier memberIdentifier) {
+      return ((TimeStamp) memberTimeStamps.get(memberIdentifier)).getTime();
+    }
+
+    /**
+     * Establish the currentTimeStamp for the health monitor. This is the timestamp
+     * used in contactedBy() updates and is usually established by the Monitor thread
+     * in GMSHealthMonitor but is initially zero.
+     *
+     * @return the timestamp
+     */
+    public long establishCurrentTime() {
+      currentTimeStamp = System.currentTimeMillis();
+      return currentTimeStamp;
+    }
   }
 
   public class TrickySocket extends ServerSocket {
diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index bd95e57..2590e23 100644
--- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -52,6 +52,7 @@ import java.util.stream.Collectors;
 import org.apache.logging.log4j.Logger;
 import org.jgroups.util.UUID;
 
+import org.apache.geode.annotations.VisibleForTesting;
 import org.apache.geode.distributed.internal.membership.api.MemberIdentifier;
 import org.apache.geode.distributed.internal.membership.api.MemberStartupException;
 import org.apache.geode.distributed.internal.membership.api.MembershipClosedException;
@@ -130,7 +131,11 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
   public static final long MEMBER_SUSPECT_COLLECTION_INTERVAL =
       Long.getLong("geode.suspect-member-collection-interval", 200);
 
-  private volatile long currentTimeStamp;
+  /**
+   * A millisecond clock reading used to mark the last time a peer made contact.
+   */
+  @VisibleForTesting
+  volatile long currentTimeStamp;
 
   /**
    * this member's ID
@@ -152,7 +157,8 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
   /**
    * Members undergoing final checks
    */
-  private final List<ID> membersInFinalCheck =
+  @VisibleForTesting
+  final List<ID> membersInFinalCheck =
       Collections.synchronizedList(new ArrayList<>(30));
 
   /**
@@ -206,7 +212,7 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
    * /**
    * this class is to avoid garbage
    */
-  private static class TimeStamp {
+  static class TimeStamp {
 
     private volatile long timeStamp;
 
@@ -257,6 +263,7 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
           return;
         }
 
+        // TODO - why are we taking two clock readings and setting currentTimeStamp twice?
         long currentTime = System.currentTimeMillis();
         // this is the start of interval to record member activity
         GMSHealthMonitor.this.currentTimeStamp = currentTime;
@@ -1240,7 +1247,11 @@ public class GMSHealthMonitor<ID extends MemberIdentifier> implements HealthMoni
     if (isStopping) {
       return;
     }
-    contactedBy(m.getSuspect());
+    // if we're currently processing a final-check for this member don't artificially update the
+    // timestamp of the member or the final-check will be invalid
+    if (!membersInFinalCheck.contains(m.getSuspect())) {
+      contactedBy(m.getSuspect());
+    }
   }